Upgrade to version 3.4 Change-Id: I8d852ea8b76e31dafa2b0efe8f15bc90c09d6961

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f6ab76f
--- /dev/null
+++ b/.gitignore

@@ -0,0 +1,38 @@
+qrc_*cxx
+*.orig
+*.pyc
+*.diff
+diff
+*.save
+save
+*.old
+*.gmo
+*.qm
+core
+core.*
+*.bak
+*~
+*build*
+*.moc.*
+*.moc
+ui_*
+CMakeCache.txt
+tags
+.*.swp
+activity.png
+*.out
+*.php*
+*.log
+*.orig
+*.rej
+log
+patch
+*.patch
+a
+a.*
+lapack/testing
+lapack/reference
+.*project
+.settings
+Makefile
+!ci/build.gitlab-ci.yml

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..e5a3c00
--- /dev/null
+++ b/.gitlab-ci.yml

@@ -0,0 +1,23 @@
+# This file is part of Eigen, a lightweight C++ template library
+# for linear algebra.
+#
+# Copyright (C) 2020 Arm Ltd. and Contributors
+#
+# This Source Code Form is subject to the terms of the Mozilla
+# Public License v. 2.0. If a copy of the MPL was not distributed
+# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+stages:
+  - buildsmoketests
+  - smoketests
+  - build
+  - test
+
+variables:
+  BUILDDIR: builddir
+  EIGEN_CI_CMAKE_GENEATOR: "Ninja"
+
+include:
+  - "/ci/smoketests.gitlab-ci.yml"
+  - "/ci/build.gitlab-ci.yml"
+  - "/ci/test.gitlab-ci.yml"

diff --git a/.gitlab/issue_templates/Bug Report.md b/.gitlab/issue_templates/Bug Report.md
new file mode 100644
index 0000000..0c49b0f
--- /dev/null
+++ b/.gitlab/issue_templates/Bug Report.md

@@ -0,0 +1,69 @@
+<!--
+Please read this!
+
+Before opening a new issue, make sure to search for keywords in the issues
+filtered by "bug::confirmed" or "bug::unconfirmed" and "bugzilla" label:
+
+- https://gitlab.com/libeigen/eigen/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=bug%3A%3Aconfirmed
+- https://gitlab.com/libeigen/eigen/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=bug%3A%3Aunconfirmed
+- https://gitlab.com/libeigen/eigen/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=bugzilla
+
+and verify the issue you're about to submit isn't a duplicate. -->
+
+### Summary
+<!-- Summarize the bug encountered concisely. -->
+
+### Environment
+<!-- Please provide your development environment here -->
+- **Operating System** : Windows/Linux
+- **Architecture** : x64/Arm64/PowerPC ...
+- **Eigen Version** : 3.3.9
+- **Compiler Version** : Gcc7.0
+- **Compile Flags** : -O3 -march=native
+- **Vector Extension** : SSE/AVX/NEON ...
+
+### Minimal Example
+<!-- If possible, please create a minimal example here that exhibits the problematic behavior.
+You can also link to [godbolt](https://godbolt.org). But please note that you need to click 
+the "Share" button in the top right-hand corner of the godbolt page where you reproduce the sample 
+code to get the share link instead of in your browser address bar. 
+
+You can read [the guidelines on stackoverflow](https://stackoverflow.com/help/minimal-reproducible-example)
+on how to create a good minimal example. -->
+
+```cpp
+//show your code here
+```
+
+### Steps to reproduce
+<!-- Describe how one can reproduce the issue - this is very important. Please use an ordered list. -->
+
+1. first step
+2. second step
+3. ... 
+
+### What is the current *bug* behavior?
+<!-- Describe what actually happens. -->
+
+### What is the expected *correct* behavior?
+<!-- Describe what you should see instead. -->
+
+### Relevant logs
+<!-- Add relevant code snippets or program output within blocks marked by " ``` " -->
+
+<!-- OPTIONAL: remove this section if you are not reporting a compilation warning issue.-->
+### Warning Messages
+<!-- Show us the warning messages you got! -->
+
+<!-- OPTIONAL: remove this section if you are not reporting a performance issue. -->
+### Benchmark scripts and results
+<!-- Please share any benchmark scripts - either standalone, or using [Google Benchmark](https://github.com/google/benchmark). -->
+
+### Anything else that might help
+<!-- It will be better to provide us more information to help narrow down the cause. 
+Including but not limited to the following: 
+- lines of code that might help us diagnose the problem. 
+- potential ways to address the issue.
+- last known working/first broken version (release number or commit hash). --> 
+
+- [ ] Have a plan to fix this issue.

diff --git a/.gitlab/issue_templates/Feature Request.md b/.gitlab/issue_templates/Feature Request.md
new file mode 100644
index 0000000..2c6f908
--- /dev/null
+++ b/.gitlab/issue_templates/Feature Request.md

@@ -0,0 +1,7 @@
+### Describe the feature you would like to be implemented.
+
+### Would such a feature be useful for other users? Why?
+
+### Any hints on how to implement the requested feature?
+
+### Additional resources

diff --git a/.gitlab/merge_request_templates/Merge Request Template.md b/.gitlab/merge_request_templates/Merge Request Template.md
new file mode 100644
index 0000000..3fe963a
--- /dev/null
+++ b/.gitlab/merge_request_templates/Merge Request Template.md

@@ -0,0 +1,26 @@
+<!-- 
+Thanks for contributing a merge request! Please name and fully describe your MR as you would for a commit message.
+If the MR fixes an issue, please include "Fixes #issue" in the commit message and the MR description.
+
+In addition, we recommend that first-time contributors read our [contribution guidelines](https://eigen.tuxfamily.org/index.php?title=Contributing_to_Eigen) and [git page](https://eigen.tuxfamily.org/index.php?title=Git), which will help you submit a more standardized MR.
+
+Before submitting the MR, you also need to complete the following checks:
+- Make one PR per feature/bugfix (don't mix multiple changes into one PR). Avoid committing unrelated changes.
+- Rebase before committing
+- For code changes, run the test suite (at least the tests that are likely affected by the change).
+  See our [test guidelines](https://eigen.tuxfamily.org/index.php?title=Tests).
+- If possible, add a test (both for bug-fixes as well as new features)
+- Make sure new features are documented
+
+Note that we are a team of volunteers; we appreciate your patience during the review process.
+
+Again, thanks for contributing! -->
+
+### Reference issue
+<!-- You can link to a specific issue using the gitlab syntax #<issue number>  -->
+
+### What does this implement/fix?
+<!--Please explain your changes.-->
+
+### Additional information
+<!--Any additional information you think is important.-->

diff --git a/.hgeol b/.hgeol
new file mode 100644
index 0000000..5327df1
--- /dev/null
+++ b/.hgeol

@@ -0,0 +1,11 @@
+[patterns]
+*.sh = LF
+*.MINPACK = CRLF
+scripts/*.in = LF
+debug/msvc/*.dat = CRLF
+debug/msvc/*.natvis = CRLF
+unsupported/test/mpreal/*.* = CRLF
+** = native
+
+[repository]
+native = LF

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f40cf77
--- /dev/null
+++ b/CMakeLists.txt

@@ -0,0 +1,647 @@
+# cmake_minimum_require must be the first command of the file
+cmake_minimum_required(VERSION 3.5.0)
+
+project(Eigen3)
+
+# guard against in-source builds
+
+if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
+  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
+endif()
+
+
+# Alias Eigen_*_DIR to Eigen3_*_DIR:
+
+set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
+set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
+
+# guard against bad build-type strings
+
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+
+#############################################################################
+# retrieve version information                                               #
+#############################################################################
+
+# automatically parse the version number
+file(READ "${PROJECT_SOURCE_DIR}/Eigen/src/Core/util/Macros.h" _eigen_version_header)
+string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen_world_version_match "${_eigen_version_header}")
+set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}")
+string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen_major_version_match "${_eigen_version_header}")
+set(EIGEN_MAJOR_VERSION "${CMAKE_MATCH_1}")
+string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_version_match "${_eigen_version_header}")
+set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
+set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
+
+# if we are not in a git clone
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
+  # if the git program is absent or this will leave the EIGEN_GIT_REVNUM string empty,
+  # but won't stop CMake.
+  execute_process(COMMAND git ls-remote --refs -q ${CMAKE_SOURCE_DIR} HEAD OUTPUT_VARIABLE EIGEN_GIT_OUTPUT)
+endif()
+
+# extract the git rev number from the git output...
+if(EIGEN_GIT_OUTPUT)
+string(REGEX MATCH "^([0-9;a-f]+).*" EIGEN_GIT_CHANGESET_MATCH "${EIGEN_GIT_OUTPUT}")
+set(EIGEN_GIT_REVNUM "${CMAKE_MATCH_1}")
+endif()
+#...and show it next to the version number
+if(EIGEN_GIT_REVNUM)
+  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (git rev ${EIGEN_GIT_REVNUM})")
+else()
+  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")
+endif()
+
+include(CheckCXXCompilerFlag)
+include(GNUInstallDirs)
+include(CMakeDependentOption)
+
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+
+
+option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)
+
+
+macro(ei_add_cxx_compiler_flag FLAG)
+  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
+  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
+  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
+  if(COMPILER_SUPPORT_${SFLAG})
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
+  endif()
+endmacro()
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+
+if(EIGEN_TEST_CXX11)
+  set(CMAKE_CXX_STANDARD 11)
+  set(CMAKE_CXX_EXTENSIONS OFF)
+  if(EIGEN_COMPILER_SUPPORT_CPP11)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  endif()
+else()
+  #set(CMAKE_CXX_STANDARD 03)
+  #set(CMAKE_CXX_EXTENSIONS OFF)
+  ei_add_cxx_compiler_flag("-std=c++03")
+endif()
+
+# Determine if we should build shared libraries on this platform.
+get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS)
+
+#############################################################################
+# find how to link to the standard libraries                                #
+#############################################################################
+
+find_package(StandardMathLibrary)
+
+
+set(EIGEN_TEST_CUSTOM_LINKER_FLAGS  "" CACHE STRING "Additional linker flags when linking unit tests.")
+set(EIGEN_TEST_CUSTOM_CXX_FLAGS     "" CACHE STRING "Additional compiler flags when compiling unit tests.")
+
+set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
+
+if(NOT STANDARD_MATH_LIBRARY_FOUND)
+
+  message(FATAL_ERROR
+    "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
+
+else()
+
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
+  else()
+    set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
+  endif()
+
+endif()
+
+if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+  message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
+else()
+  message(STATUS "Standard libraries to link to explicitly: none")
+endif()
+
+option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
+
+# Disable pkgconfig only for native Windows builds
+if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
+  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
+endif()
+
+set(CMAKE_INCLUDE_CURRENT_DIR OFF)
+
+option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
+
+option(EIGEN_DEFAULT_TO_ROW_MAJOR "Use row-major as default matrix storage order" OFF)
+if(EIGEN_DEFAULT_TO_ROW_MAJOR)
+  add_definitions("-DEIGEN_DEFAULT_TO_ROW_MAJOR")
+endif()
+
+set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")
+
+if(NOT MSVC)
+  # We assume that other compilers are partly compatible with GNUCC
+
+  # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag
+  # adding -Werror turns such warnings into errors
+  check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
+  if(COMPILER_SUPPORT_WERROR)
+    set(CMAKE_REQUIRED_FLAGS "-Werror")
+  endif()
+  ei_add_cxx_compiler_flag("-pedantic")
+  ei_add_cxx_compiler_flag("-Wall")
+  ei_add_cxx_compiler_flag("-Wextra")
+  #ei_add_cxx_compiler_flag("-Weverything")              # clang
+
+  ei_add_cxx_compiler_flag("-Wundef")
+  ei_add_cxx_compiler_flag("-Wcast-align")
+  ei_add_cxx_compiler_flag("-Wchar-subscripts")
+  ei_add_cxx_compiler_flag("-Wnon-virtual-dtor")
+  ei_add_cxx_compiler_flag("-Wunused-local-typedefs")
+  ei_add_cxx_compiler_flag("-Wpointer-arith")
+  ei_add_cxx_compiler_flag("-Wwrite-strings")
+  ei_add_cxx_compiler_flag("-Wformat-security")
+  ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
+  ei_add_cxx_compiler_flag("-Wlogical-op")
+  ei_add_cxx_compiler_flag("-Wenum-conversion")
+  ei_add_cxx_compiler_flag("-Wc++11-extensions")
+  ei_add_cxx_compiler_flag("-Wdouble-promotion")
+#  ei_add_cxx_compiler_flag("-Wconversion")
+
+  ei_add_cxx_compiler_flag("-Wshadow")
+
+  ei_add_cxx_compiler_flag("-Wno-psabi")
+  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
+  ei_add_cxx_compiler_flag("-Wno-long-long")
+
+  ei_add_cxx_compiler_flag("-fno-check-new")
+  ei_add_cxx_compiler_flag("-fno-common")
+  ei_add_cxx_compiler_flag("-fstrict-aliasing")
+  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
+  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+
+
+  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
+  # Moreover we should not set both -strict-ansi and -ansi
+  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
+  ei_add_cxx_compiler_flag("-Qunused-arguments")        # disable clang warning: argument unused during compilation: '-ansi'
+
+  if(COMPILER_SUPPORT_STRICTANSI)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
+  else()
+    ei_add_cxx_compiler_flag("-ansi")
+  endif()
+
+  if(ANDROID_NDK)
+    ei_add_cxx_compiler_flag("-pie")
+    ei_add_cxx_compiler_flag("-fPIE")
+  endif()
+
+  set(CMAKE_REQUIRED_FLAGS "")
+
+  option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
+  if(EIGEN_TEST_SSE2)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
+    message(STATUS "Enabling SSE2 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_SSE3 "Enable/Disable SSE3 in tests/examples" OFF)
+  if(EIGEN_TEST_SSE3)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
+    message(STATUS "Enabling SSE3 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_SSSE3 "Enable/Disable SSSE3 in tests/examples" OFF)
+  if(EIGEN_TEST_SSSE3)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mssse3")
+    message(STATUS "Enabling SSSE3 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_SSE4_1 "Enable/Disable SSE4.1 in tests/examples" OFF)
+  if(EIGEN_TEST_SSE4_1)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+    message(STATUS "Enabling SSE4.1 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_SSE4_2 "Enable/Disable SSE4.2 in tests/examples" OFF)
+  if(EIGEN_TEST_SSE4_2)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+    message(STATUS "Enabling SSE4.2 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_AVX2 "Enable/Disable AVX2 in tests/examples" OFF)
+  if(EIGEN_TEST_AVX2)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma")
+    message(STATUS "Enabling AVX2 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
+  if(EIGEN_TEST_AVX512)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma")
+    message(STATUS "Enabling AVX512 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF)
+  if(EIGEN_TEST_AVX512DQ)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq -mfma")
+    message(STATUS "Enabling AVX512DQ in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
+  if(EIGEN_TEST_F16C)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
+    message(STATUS "Enabling F16C in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
+  if(EIGEN_TEST_ALTIVEC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
+    message(STATUS "Enabling AltiVec in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_VSX "Enable/Disable VSX in tests/examples" OFF)
+  if(EIGEN_TEST_VSX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -mvsx")
+    message(STATUS "Enabling VSX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
+  if(EIGEN_TEST_MSA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
+    message(STATUS "Enabling MSA in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
+  if(EIGEN_TEST_NEON)
+    if(EIGEN_TEST_FMA)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4")
+    else()
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
+    endif()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard")
+    message(STATUS "Enabling NEON in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_NEON64 "Enable/Disable Neon in tests/examples" OFF)
+  if(EIGEN_TEST_NEON64)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    message(STATUS "Enabling NEON in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_Z13)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
+    message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_Z14)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
+    message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
+  endif()
+
+  check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
+  if(COMPILER_SUPPORT_OPENMP)
+    option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
+    if(EIGEN_TEST_OPENMP)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+      message(STATUS "Enabling OpenMP in tests/examples")
+    endif()
+  endif()
+
+else()
+
+  # C4127 - conditional expression is constant
+  # C4714 - marked as __forceinline not inlined (I failed to deactivate it selectively)
+  #         We can disable this warning in the unit tests since it is clear that it occurs
+  #         because we are oftentimes returning objects that have a destructor or may
+  #         throw exceptions - in particular in the unit tests we are throwing extra many
+  #         exceptions to cover indexing errors.
+  # C4505 - unreferenced local function has been removed (impossible to deactivate selectively)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /wd4127 /wd4505 /wd4714")
+
+  # replace all /Wx by /W4
+  string(REGEX REPLACE "/W[0-9]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+  check_cxx_compiler_flag("/openmp" COMPILER_SUPPORT_OPENMP)
+  if(COMPILER_SUPPORT_OPENMP)
+    option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
+    if(EIGEN_TEST_OPENMP)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+      message(STATUS "Enabling OpenMP in tests/examples")
+    endif()
+  endif()
+
+  option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
+  if(EIGEN_TEST_SSE2)
+    if(NOT CMAKE_CL_64)
+      # arch is not supported on 64 bit systems, SSE is enabled automatically.
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
+    endif()
+    message(STATUS "Enabling SSE2 in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+    message(STATUS "Enabling FMA/AVX2 in tests/examples")
+  endif()
+
+endif()
+
+option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF)
+option(EIGEN_TEST_X87 "Force using X87 instructions. Implies no vectorization." OFF)
+option(EIGEN_TEST_32BIT "Force generating 32bit code." OFF)
+
+if(EIGEN_TEST_X87)
+  set(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION ON)
+  if(CMAKE_COMPILER_IS_GNUCXX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpmath=387")
+    message(STATUS "Forcing use of x87 instructions in tests/examples")
+  else()
+    message(STATUS "EIGEN_TEST_X87 ignored on your compiler")
+  endif()
+endif()
+
+if(EIGEN_TEST_32BIT)
+  if(CMAKE_COMPILER_IS_GNUCXX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
+    message(STATUS "Forcing generation of 32-bit code in tests/examples")
+  else()
+    message(STATUS "EIGEN_TEST_32BIT ignored on your compiler")
+  endif()
+endif()
+
+if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
+  add_definitions(-DEIGEN_DONT_VECTORIZE=1)
+  message(STATUS "Disabling vectorization in tests/examples")
+endif()
+
+option(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT "Disable explicit alignment (hence vectorization) in tests/examples" OFF)
+if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
+  add_definitions(-DEIGEN_DONT_ALIGN=1)
+  message(STATUS "Disabling alignment in tests/examples")
+endif()
+
+option(EIGEN_TEST_NO_EXCEPTIONS "Disables C++ exceptions" OFF)
+if(EIGEN_TEST_NO_EXCEPTIONS)
+  ei_add_cxx_compiler_flag("-fno-exceptions")
+  message(STATUS "Disabling exceptions in tests/examples")
+endif()
+
+set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
+if(EIGEN_INCLUDE_INSTALL_DIR)
+  message(WARNING "EIGEN_INCLUDE_INSTALL_DIR is deprecated. Use INCLUDE_INSTALL_DIR instead.")
+endif()
+
+if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
+  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
+      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed")
+else()
+  set(INCLUDE_INSTALL_DIR
+      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
+      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed"
+      )
+endif()
+set(CMAKEPACKAGE_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
+    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed"
+    )
+set(PKGCONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
+    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed"
+    )
+
+foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR)
+  # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}".
+  if(IS_ABSOLUTE "${${var}}")
+    file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}")
+  endif()
+endforeach()
+
+# similar to set_target_properties but append the property instead of overwriting it
+macro(ei_add_target_property target prop value)
+
+  get_target_property(previous ${target} ${prop})
+  # if the property wasn't previously set, ${previous} is now "previous-NOTFOUND" which cmake allows catching with plain if()
+  if(NOT previous)
+    set(previous "")
+  endif()
+  set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}")
+endmacro()
+
+install(FILES
+  signature_of_eigen3_matrix_library
+  DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel
+  )
+
+if(EIGEN_BUILD_PKGCONFIG)
+    configure_file(eigen3.pc.in eigen3.pc @ONLY)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
+        DESTINATION ${PKGCONFIG_INSTALL_DIR}
+        )
+endif()
+
+install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)
+
+
+option(EIGEN_BUILD_DOC "Enable creation of Eigen documentation" ON)
+if(EIGEN_BUILD_DOC)
+  add_subdirectory(doc EXCLUDE_FROM_ALL)
+endif()
+
+
+option(BUILD_TESTING "Enable creation of Eigen tests." ON)
+if(BUILD_TESTING)
+  include(EigenConfigureTesting)
+
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(test EXCLUDE_FROM_ALL)
+  endif()
+
+  add_subdirectory(failtest)
+endif()
+
+if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+  add_subdirectory(blas)
+  add_subdirectory(lapack)
+else()
+  add_subdirectory(blas EXCLUDE_FROM_ALL)
+  add_subdirectory(lapack EXCLUDE_FROM_ALL)
+endif()
+
+# add SYCL
+option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
+option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
+if(EIGEN_TEST_SYCL)
+  set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
+  find_package(Threads REQUIRED)
+  if(EIGEN_SYCL_TRISYCL)
+    message(STATUS "Using triSYCL")
+    include(FindTriSYCL)
+  else()
+    message(STATUS "Using ComputeCPP SYCL")
+    include(FindComputeCpp)
+    set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
+    if (NOT MSVC)
+      set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON)
+    endif()
+    option(COMPUTECPP_USE_COMPILER_DRIVER
+      "Use ComputeCpp driver instead of a 2 steps compilation"
+      ${COMPUTECPP_DRIVER_DEFAULT_VALUE}
+    )
+  endif(EIGEN_SYCL_TRISYCL)
+  option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
+  if(EIGEN_DONT_VECTORIZE_SYCL)
+    message(STATUS "Disabling SYCL vectorization in tests/examples")
+    # When disabling SYCL vectorization, also disable Eigen default vectorization
+    add_definitions(-DEIGEN_DONT_VECTORIZE=1)
+    add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1)
+  endif()
+endif()
+
+add_subdirectory(unsupported)
+
+add_subdirectory(demos EXCLUDE_FROM_ALL)
+
+# must be after test and unsupported, for configuring buildtests.in
+add_subdirectory(scripts EXCLUDE_FROM_ALL)
+
+# TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"?
+if(EIGEN_BUILD_BTL)
+  add_subdirectory(bench/btl EXCLUDE_FROM_ALL)
+endif()
+
+if(NOT WIN32)
+  add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
+endif()
+
+configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)
+
+if(BUILD_TESTING)
+  ei_testing_print_summary()
+endif()
+
+message(STATUS "")
+message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
+message(STATUS "")
+
+string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
+if(cmake_generator_tolower MATCHES "makefile")
+  message(STATUS "Available targets (use: make TARGET):")
+else()
+  message(STATUS "Available targets (use: cmake --build . --target TARGET):")
+endif()
+message(STATUS "---------+--------------------------------------------------------------")
+message(STATUS "Target   |   Description")
+message(STATUS "---------+--------------------------------------------------------------")
+message(STATUS "install  | Install Eigen. Headers will be installed to:")
+message(STATUS "         |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
+message(STATUS "         |   Using the following values:")
+message(STATUS "         |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+message(STATUS "         |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
+message(STATUS "         |   Change the install location of Eigen headers using:")
+message(STATUS "         |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
+message(STATUS "         |   Or:")
+message(STATUS "         |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
+message(STATUS "doc      | Generate the API documentation, requires Doxygen & LaTeX")
+if(BUILD_TESTING)
+  message(STATUS "check    | Build and run the unit-tests. Read this page:")
+  message(STATUS "         |   http://eigen.tuxfamily.org/index.php?title=Tests")
+endif()
+message(STATUS "blas     | Build BLAS library (not the same thing as Eigen)")
+message(STATUS "uninstall| Remove files installed by the install target")
+message(STATUS "---------+--------------------------------------------------------------")
+message(STATUS "")
+
+
+set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
+set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
+set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
+set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
+set ( EIGEN_DEFINITIONS "")
+set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
+set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
+
+include (CMakePackageConfigHelpers)
+
+# Imported target support
+add_library (eigen INTERFACE)
+add_library (Eigen3::Eigen ALIAS eigen)
+target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
+target_include_directories (eigen INTERFACE
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
+)
+
+# Export as title case Eigen
+set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
+
+install (TARGETS eigen EXPORT Eigen3Targets)
+
+configure_package_config_file (
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+  PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+  INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+)
+# Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
+# not depend on architecture specific settings or libraries. More
+# specifically, an Eigen3Config.cmake generated from a 64 bit target can be
+# used for 32 bit targets as well (and vice versa).
+set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+unset (CMAKE_SIZEOF_VOID_P)
+write_basic_package_version_file (Eigen3ConfigVersion.cmake
+                                  VERSION ${EIGEN_VERSION_NUMBER}
+                                  COMPATIBILITY SameMajorVersion)
+set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+
+# The Eigen target will be located in the Eigen3 namespace. Other CMake
+# targets can refer to it using Eigen3::Eigen.
+export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
+# Export Eigen3 package to CMake registry such that it can be easily found by
+# CMake even if it has not been installed to a standard directory.
+export (PACKAGE Eigen3)
+
+install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
+
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
+          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )
+
+# Add uninstall target
+add_custom_target ( uninstall
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
+
+if (EIGEN_SPLIT_TESTSUITE)
+  ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}")
+endif()

diff --git a/COPYING.APACHE b/COPYING.APACHE
new file mode 100644
index 0000000..61e948d
--- /dev/null
+++ b/COPYING.APACHE

@@ -0,0 +1,203 @@
+/*
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
\ No newline at end of file

diff --git a/COPYING.BSD b/COPYING.BSD
index 11971ff..8964ddf 100644
--- a/COPYING.BSD
+++ b/COPYING.BSD

@@ -23,4 +23,4 @@
  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
\ No newline at end of file
+*/

diff --git a/COPYING.MINPACK b/COPYING.MINPACK
index 11d8a9a..132cc3f 100644
--- a/COPYING.MINPACK
+++ b/COPYING.MINPACK

@@ -1,52 +1,51 @@
-Minpack Copyright Notice (1999) University of Chicago.  All rights reserved

-

-Redistribution and use in source and binary forms, with or

-without modification, are permitted provided that the

-following conditions are met:

-

-1. Redistributions of source code must retain the above

-copyright notice, this list of conditions and the following

-disclaimer.

-

-2. Redistributions in binary form must reproduce the above

-copyright notice, this list of conditions and the following

-disclaimer in the documentation and/or other materials

-provided with the distribution.

-

-3. The end-user documentation included with the

-redistribution, if any, must include the following

-acknowledgment:

-

-   "This product includes software developed by the

-   University of Chicago, as Operator of Argonne National

-   Laboratory.

-

-Alternately, this acknowledgment may appear in the software

-itself, if and wherever such third-party acknowledgments

-normally appear.

-

-4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"

-WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE

-UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND

-THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR

-IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES

-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE

-OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY

-OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR

-USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF

-THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)

-DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION

-UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL

-BE CORRECTED.

-

-5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT

-HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF

-ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,

-INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF

-ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF

-PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER

-SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT

-(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,

-EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE

-POSSIBILITY OF SUCH LOSS OR DAMAGES.

-

+Minpack Copyright Notice (1999) University of Chicago.  All rights reserved
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions of source code must retain the above
+copyright notice, this list of conditions and the following
+disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials
+provided with the distribution.
+
+3. The end-user documentation included with the
+redistribution, if any, must include the following
+acknowledgment:
+
+   "This product includes software developed by the
+   University of Chicago, as Operator of Argonne National
+   Laboratory.
+
+Alternately, this acknowledgment may appear in the software
+itself, if and wherever such third-party acknowledgments
+normally appear.
+
+4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
+WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
+UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
+THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
+OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
+OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
+USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
+THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
+DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
+UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
+BE CORRECTED.
+
+5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
+HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
+ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
+INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
+ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
+PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
+SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
+(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
+EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
+POSSIBILITY OF SUCH LOSS OR DAMAGES.

diff --git a/CTestConfig.cmake b/CTestConfig.cmake
new file mode 100644
index 0000000..0ea24b8
--- /dev/null
+++ b/CTestConfig.cmake

@@ -0,0 +1,17 @@
+## This file should be placed in the root directory of your project.
+## Then modify the CMakeLists.txt file in the root directory of your
+## project to incorporate the testing dashboard.
+## # The following are required to uses Dart and the Cdash dashboard
+##   enable_testing()
+##   include(CTest)
+set(CTEST_PROJECT_NAME "Eigen")
+set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
+
+set(CTEST_DROP_METHOD "http")
+set(CTEST_DROP_SITE "my.cdash.org")
+set(CTEST_DROP_LOCATION "/submit.php?project=Eigen")
+set(CTEST_DROP_SITE_CDASH TRUE)
+#set(CTEST_PROJECT_SUBPROJECTS
+#Official
+#Unsupported
+#)

diff --git a/CTestCustom.cmake.in b/CTestCustom.cmake.in
new file mode 100644
index 0000000..89e487f
--- /dev/null
+++ b/CTestCustom.cmake.in

@@ -0,0 +1,4 @@
+
+set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
+set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS   "2000")
+list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION    @EIGEN_CTEST_ERROR_EXCEPTION@)

diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index 7314d32..a318ceb 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky

@@ -1,7 +1,15 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLESKY_MODULE_H
 #define EIGEN_CHOLESKY_MODULE_H
 
 #include "Core"
+#include "Jacobi"
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
@@ -21,14 +29,17 @@
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/Cholesky/LLT_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CHOLESKY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index b056180..bed8924 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
 #define EIGEN_CHOLMODSUPPORT_MODULE_H
 
@@ -12,7 +19,7 @@
 /** \ingroup Support_modules
   * \defgroup CholmodSupport_Module CholmodSupport module
   *
-  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   * It provides the two following main factorization classes:
   * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
   * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
@@ -33,12 +40,9 @@
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/CholmodSupport/CholmodSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CHOLMODSUPPORT_MODULE_H
+

diff --git a/Eigen/Core b/Eigen/Core
index 19de854..3c03519 100644
--- a/Eigen/Core
+++ b/Eigen/Core

@@ -11,223 +11,55 @@
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H
 
-// first thing Eigen does: stop the compiler from committing suicide
+// first thing Eigen does: stop the compiler from reporting useless warnings.
 #include "src/Core/util/DisableStupidWarnings.h"
 
-// versions of Eigen without conflict. Do not use outside of :eigen3_restricted.
-#ifdef GOOGLE3_EIGEN_MPL2_ONLY_OVERRIDE
-#undef EIGEN_MPL2_ONLY
-#endif
-
-#ifdef __CUDACC__
-  // Do not try asserts on CUDA!
-  #ifndef EIGEN_NO_DEBUG
-  #define EIGEN_NO_DEBUG
-  #endif
-
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-  #undef EIGEN_INTERNAL_DEBUGGING
-  #endif
-
-  // Do not try to vectorize on CUDA!
-  #define EIGEN_DONT_VECTORIZE
-
-  // All functions callable from CUDA code must be qualified with __device__
-  #define EIGEN_DEVICE_FUNC __host__ __device__
-
-#else
-  #define EIGEN_DEVICE_FUNC
-
-#endif
-
-// CUDA before C++11 support does not have std::max or std::min
-#if defined(__CUDA_ARCH__) && (__cplusplus < 201103L)
-  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
-  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-#endif
-
 // then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the alignment settings (platform detection and honoring the user's will if he
-// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
+// it's where we do all the compiler/OS/arch detections and define most defaults.
 #include "src/Core/util/Macros.h"
 
+// This detects SSE/AVX/NEON/etc. and configure alignment settings
+#include "src/Core/util/ConfigureVectorization.h"
+
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+  #include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+  #include <hip/hip_runtime.h>
+#endif
+
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5)
   #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif
 
+// Prevent ICC from specializing std::complex operators that silently fail
+// on device. This allows us to use our own device-compatible specializations
+// instead.
+#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \
+    && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
+#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
+#endif
 #include <complex>
 
 // this include file manages BLAS and MKL related macros
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 
-// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
-// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
-#if !EIGEN_ALIGN
-  #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-  #endif
+
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+  #define EIGEN_HAS_GPU_FP16
 #endif
 
-#if EIGEN_COMP_MSVC
-  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
-    // Remember that usage of defined() in a #define is undefined by the standard.
-    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
-      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
-    #endif
-  #endif
-#else
-  // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
-    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
-  #endif
-#endif
-
-#ifndef EIGEN_DONT_VECTORIZE
-
-  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
-    // Defines symbols for compile-time detection of which instructions are
-    // used.
-    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_SSE
-    #define EIGEN_VECTORIZE_SSE2
-
-    // Detect sse3/ssse3/sse4:
-    // gcc and icc defines __SSE3__, ...
-    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
-    // want to force the use of those instructions with msvc.
-    #ifdef __SSE3__
-      #define EIGEN_VECTORIZE_SSE3
-    #endif
-    #ifdef __SSSE3__
-      #define EIGEN_VECTORIZE_SSSE3
-    #endif
-    #ifdef __SSE4_1__
-      #define EIGEN_VECTORIZE_SSE4_1
-    #endif
-    #ifdef __SSE4_2__
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX__
-      #define EIGEN_VECTORIZE_AVX
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX2__
-      #define EIGEN_VECTORIZE_AVX2
-    #endif
-    #ifdef __AVX512F__
-      #define EIGEN_VECTORIZE_AVX512
-      #define EIGEN_VECTORIZE_AVX
-      #define EIGEN_VECTORIZE_FMA
-      #ifdef __AVX512DQ__
-       #define EIGEN_VECTORIZE_AVX512DQ
-      #endif
-      #ifdef __AVX512BW__
-       #define EIGEN_VECTORIZE_AVX512BW
-      #endif
-    #endif
-    #ifdef __FMA__
-      #define EIGEN_VECTORIZE_FMA
-    #endif
-    // include files
-
-    // This extern "C" works around a MINGW-w64 compilation issue
-    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
-    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
-    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
-    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
-    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
-    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
-    extern "C" {
-      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
-      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if EIGEN_COMP_ICC >= 1110
-        #include <immintrin.h>
-      #else
-        #include <mmintrin.h>
-        #include <emmintrin.h>
-        #include <xmmintrin.h>
-        #ifdef  EIGEN_VECTORIZE_SSE3
-        #include <pmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSSE3
-        #include <tmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_1
-        #include <smmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_2
-        #include <nmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_AVX
-        #include <immintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_AVX512
-        #include <immintrin.h>
-        #endif
-      #endif
-    } // end extern "C"
-  #elif defined __VSX__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_VSX
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif defined __ALTIVEC__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ALTIVEC
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_NEON
-    #include <arm_neon.h>
-  #endif
-#endif
-
-#include <float.h>
-#include <limits.h>
-#include <math.h>
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#if defined(__F16C__) && !EIGEN_COMP_CLANG
-  // We can use the optimized fp16 to float and float to fp16 conversion routines. Beware the intrinsic is missing from third_party/gpu/crosstool (b/28078281).
-  #define EIGEN_HAS_FP16_C
-#endif
-
-#if defined(__NVCC__)
-  #define EIGEN_VECTORIZE_CUDA
-  #include <vector_types.h>
-  #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-    #define EIGEN_HAS_CUDA_FP16
-  #endif
-#elif defined(__clang__) && defined(__CUDA__)
-  #define EIGEN_VECTORIZE_CUDA
-  #include <vector_types.h>
-#endif
-
-#if defined EIGEN_HAS_CUDA_FP16
-  #include <host_defines.h>
-  #include <cuda_fp16.h>
+#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
+  #define EIGEN_HAS_GPU_BF16
 #endif
 
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -249,138 +81,74 @@
 #include <cstddef>
 #include <cstdlib>
 #include <cmath>
-#if !defined HEXAGON
-using ::std::isfinite;
-using ::std::fpclassify;
-#endif
-#if defined(ANDROID) || defined(HEXAGON)
-#include <math.h>
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#endif
 #include <cassert>
 #include <functional>
-using ::std::binary_function;
-using ::std::equal_to;
-using ::std::greater;
-#include <iosfwd>
-using ::std::ios;
-using ::std::iostream;
-using ::std::ios_base;
-using ::std::ostream;
+#include <sstream>
+#ifndef EIGEN_NO_IO
+  #include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
-using ::std::string;
-using ::std::basic_string;
 #include <limits>
-using ::std::numeric_limits;
 #include <climits> // for CHAR_BIT
 // for min/max:
 #include <algorithm>
-using ::std::min;
-using ::std::max;
-using ::std::fill;
-using ::std::fill_n;
-using ::std::lower_bound;
-using ::std::equal;
-using ::std::sort;
+
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
+// for std::is_nothrow_move_assignable
+#ifdef EIGEN_INCLUDE_TYPE_TRAITS
+#include <type_traits>
+#endif
 
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
-using ::std::cout;
-using ::std::cin;
-using ::std::cerr;
-using ::std::ios;
-using ::std::endl;
-using ::std::iostream;
-using ::std::ios_base;
-using ::std::ostream;
-using ::std::istream;
 #endif
 
 // required for __cpuid, needs to be included after cmath
-#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64))
+// also required for _BitScanReverse on Windows on ARM
+#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE
   #include <intrin.h>
 #endif
 
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
-  #define EIGEN_EXCEPTIONS
+#if defined(EIGEN_USE_SYCL)
+  #undef min
+  #undef max
+  #undef isnan
+  #undef isinf
+  #undef isfinite
+  #include <CL/sycl.hpp>
+  #include <map>
+  #include <memory>
+  #include <utility>
+  #include <thread>
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
+  #endif
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
+  #endif
 #endif
 
-#ifdef EIGEN_EXCEPTIONS
-  #include <new>
+
+#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
+// This will generate an error message:
+#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif
 
-/** \brief Namespace containing all symbols from the %Eigen library. */
 namespace Eigen {
 
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX512)
-  return "AVX512, AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_AVX)
-  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
-  return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
-  return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
-  return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
-  return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
-  return "VSX";
-#elif defined(EIGEN_VECTORIZE_NEON)
-  return "ARM NEON";
-#else
-  return "None";
-#endif
-}
-
-} // end namespace Eigen
-
-#define STAGE10_FULL_EIGEN2_API             10
-#define STAGE20_RESOLVE_API_CONFLICTS       20
-#define STAGE30_FULL_EIGEN3_API             30
-#define STAGE40_FULL_EIGEN3_STRICTNESS      40
-#define STAGE99_NO_EIGEN2_SUPPORT           99
-
-#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
-#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
-#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
-#elif defined EIGEN2_SUPPORT
-  // default to stage 3, that's what it's always meant
-  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#else
-  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#undef minor
-#endif
-
-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;
 
+}
+
 /** \defgroup Core_Module Core module
   * This is the main module of Eigen providing dense matrix and vector support
   * (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -392,76 +160,123 @@
   */
 
 #include "src/Core/util/Constants.h"
-#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/Meta.h"
+#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
+#include "src/Core/util/IntegralConstant.h"
+#include "src/Core/util/SymbolicIndex.h"
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
+#include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
+// Generic half float support
+#include "src/Core/arch/Default/Half.h"
+#include "src/Core/arch/Default/BFloat16.h"
+#include "src/Core/arch/Default/TypeCasting.h"
+#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/SSE/Complex.h"
   #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
   #include "src/Core/arch/AVX512/PacketMath.h"
+  #include "src/Core/arch/AVX512/TypeCasting.h"
+  #include "src/Core/arch/AVX512/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX
   // Use AVX for floats and doubles, SSE for integers
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
   #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/Complex.h"
-  #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
   #include "src/Core/arch/AltiVec/PacketMath.h"
   #include "src/Core/arch/AltiVec/MathFunctions.h"
   #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/TypeCasting.h"
   #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_SVE
+  #include "src/Core/arch/SVE/PacketMath.h"
+  #include "src/Core/arch/SVE/TypeCasting.h"
+  #include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_ZVECTOR
+  #include "src/Core/arch/ZVector/PacketMath.h"
+  #include "src/Core/arch/ZVector/MathFunctions.h"
+  #include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+  #include "src/Core/arch/MSA/PacketMath.h"
+  #include "src/Core/arch/MSA/MathFunctions.h"
+  #include "src/Core/arch/MSA/Complex.h"
 #endif
 
-#include "src/Core/arch/CUDA/Complex.h"
-#include "src/Core/arch/CUDA/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/Core/arch/GPU/PacketMath.h"
+  #include "src/Core/arch/GPU/MathFunctions.h"
+  #include "src/Core/arch/GPU/TypeCasting.h"
+#endif
 
-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/Core/arch/CUDA/PacketMath.h"
-  #include "src/Core/arch/CUDA/MathFunctions.h"
+#if defined(EIGEN_USE_SYCL)
+  #include "src/Core/arch/SYCL/SyclMemoryModel.h"
+  #include "src/Core/arch/SYCL/InteropHeaders.h"
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+  #include "src/Core/arch/SYCL/PacketMath.h"
+  #include "src/Core/arch/SYCL/MathFunctions.h"
+  #include "src/Core/arch/SYCL/TypeCasting.h"
+#endif
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
+// This file provides generic implementations valid for scalar as well
+#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
 
 #include "src/Core/functors/TernaryFunctors.h"
 #include "src/Core/functors/BinaryFunctors.h"
 #include "src/Core/functors/UnaryFunctors.h"
 #include "src/Core/functors/NullaryFunctors.h"
 #include "src/Core/functors/StlFunctors.h"
+#include "src/Core/functors/AssignmentFunctors.h"
 
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#ifdef EIGEN_CUDACC
+#include "src/Core/arch/CUDA/Complex.h"
+#endif
+
+#include "src/Core/util/IndexedViewHelper.h"
+#include "src/Core/util/ReshapedHelper.h"
+#include "src/Core/ArithmeticSequence.h"
+#ifndef EIGEN_NO_IO
+  #include "src/Core/IO.h"
+#endif
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/EigenBase.h"
 
-#ifdef EIGEN_ENABLE_EVALUATORS
-#include "src/Core/functors/AssignmentFunctors.h"
 #include "src/Core/Product.h"
 #include "src/Core/CoreEvaluators.h"
 #include "src/Core/AssignEvaluator.h"
-#include "src/Core/ProductEvaluators.h"
-#endif
-#include "src/Core/NullaryWrapper.h"
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
                                 // at least confirmed with Doxygen 1.5.5 and 1.5.6
@@ -470,10 +285,11 @@
 
 #include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
-#include "src/Core/util/MatrixMapper.h"
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
-#include "src/Core/ForceAlignedAccess.h"
+
+// #include "src/Core/ForceAlignedAccess.h"
+
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/PlainObjectBase.h"
@@ -487,32 +303,34 @@
 #include "src/Core/SelfCwiseBinaryOp.h"
 #include "src/Core/Dot.h"
 #include "src/Core/StableNorm.h"
-#include "src/Core/MapBase.h"
 #include "src/Core/Stride.h"
+#include "src/Core/MapBase.h"
 #include "src/Core/Map.h"
+#include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
-#include "src/Core/Ref.h"
+#include "src/Core/IndexedView.h"
+#include "src/Core/Reshaped.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
 #include "src/Core/DiagonalProduct.h"
-#include "src/Core/PermutationMatrix.h"
-#include "src/Core/Transpositions.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
-#include "src/Core/Flagged.h"
-#include "src/Core/ProductBase.h"
 #include "src/Core/GeneralProduct.h"
+#include "src/Core/Solve.h"
+#include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
 #include "src/Core/products/Parallelizer.h"
-#include "src/Core/products/CoeffBasedProduct.h"
+#include "src/Core/ProductEvaluators.h"
 #include "src/Core/products/GeneralMatrixVector.h"
 #include "src/Core/products/GeneralMatrixMatrix.h"
 #include "src/Core/SolveTriangular.h"
@@ -527,24 +345,33 @@
 #include "src/Core/products/TriangularSolverVector.h"
 #include "src/Core/BandMatrix.h"
 #include "src/Core/CoreIterators.h"
+#include "src/Core/ConditionEstimator.h"
+
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+  #include "src/Core/arch/AltiVec/MatrixProduct.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
+#endif
 
 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
+#include "src/Core/PartialReduxEvaluator.h"
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
 #include "src/Core/ArrayWrapper.h"
+#include "src/Core/StlIterators.h"
 
 #ifdef EIGEN_USE_BLAS
-#include "src/Core/products/GeneralMatrixMatrix_MKL.h"
-#include "src/Core/products/GeneralMatrixVector_MKL.h"
-#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h"
-#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h"
-#include "src/Core/products/SelfadjointMatrixVector_MKL.h"
-#include "src/Core/products/TriangularMatrixMatrix_MKL.h"
-#include "src/Core/products/TriangularMatrixVector_MKL.h"
-#include "src/Core/products/TriangularSolverMatrix_MKL.h"
+#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
+#include "src/Core/products/GeneralMatrixVector_BLAS.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularMatrixMatrix_BLAS.h"
+#include "src/Core/products/TriangularMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularSolverMatrix_BLAS.h"
 #endif // EIGEN_USE_BLAS
 
 #ifdef EIGEN_USE_MKL_VML
@@ -555,8 +382,4 @@
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigen2Support"
-#endif
-
 #endif // EIGEN_CORE_H

diff --git a/Eigen/Eigen b/Eigen/Eigen
new file mode 100644
index 0000000..654c8dc
--- /dev/null
+++ b/Eigen/Eigen

@@ -0,0 +1,2 @@
+#include "Dense"
+#include "Sparse"

diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 53c5a73..5467a2e 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues

@@ -1,16 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_EIGENVALUES_MODULE_H
 #define EIGEN_EIGENVALUES_MODULE_H
 
 #include "Core"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup Eigenvalues_Module Eigenvalues module
   *
   *
@@ -25,6 +32,7 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@@ -37,12 +45,16 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_EIGENVALUES_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/Geometry b/Eigen/Geometry
index a7c2fd9..bc78110 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry

@@ -1,30 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_GEOMETRY_MODULE_H
 #define EIGEN_GEOMETRY_MODULE_H
 
 #include "Core"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "SVD"
 #include "LU"
 #include <limits>
-using ::std::numeric_limits;
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
+#include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Geometry_Module Geometry module
   *
-  *
-  *
   * This module provides support for:
   *  - fixed-size homogeneous transformations
   *  - translation, scaling, 2D and 3D rotations
-  *  - quaternions
-  *  - \ref MatrixBase::cross() "cross product"
-  *  - \ref MatrixBase::unitOrthogonal() "orthognal vector generation"
-  *  - some linear components: parametrized-lines and hyperplanes
+  *  - \link Quaternion quaternions \endlink
+  *  - cross products (\ref MatrixBase::cross, \ref MatrixBase::cross3)
+  *  - orthognal vector generation (\ref MatrixBase::unitOrthogonal)
+  *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink
+  *  - \link AlignedBox axis aligned bounding boxes \endlink
+  *  - \link umeyama least-square transformation fitting \endlink
   *
   * \code
   * #include <Eigen/Geometry>
@@ -34,32 +36,24 @@
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-  #include "src/Geometry/Homogeneous.h"
-  #include "src/Geometry/RotationBase.h"
-  #include "src/Geometry/Rotation2D.h"
-  #include "src/Geometry/Quaternion.h"
-  #include "src/Geometry/AngleAxis.h"
-  #include "src/Geometry/Transform.h"
-  #include "src/Geometry/Translation.h"
-  #include "src/Geometry/Scaling.h"
-  #include "src/Geometry/Hyperplane.h"
-  #include "src/Geometry/ParametrizedLine.h"
-  #include "src/Geometry/AlignedBox.h"
-  #include "src/Geometry/Umeyama.h"
+#include "src/Geometry/Homogeneous.h"
+#include "src/Geometry/RotationBase.h"
+#include "src/Geometry/Rotation2D.h"
+#include "src/Geometry/Quaternion.h"
+#include "src/Geometry/AngleAxis.h"
+#include "src/Geometry/Transform.h"
+#include "src/Geometry/Translation.h"
+#include "src/Geometry/Scaling.h"
+#include "src/Geometry/Hyperplane.h"
+#include "src/Geometry/ParametrizedLine.h"
+#include "src/Geometry/AlignedBox.h"
+#include "src/Geometry/Umeyama.h"
 
-  // Use the SSE optimized version whenever possible. At the moment the
-  // SSE version doesn't compile when AVX is enabled
-  #if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-    #include "src/Geometry/arch/Geometry_SSE.h"
-  #endif
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/Geometry/All.h"
+// Use the SSE optimized version whenever possible.
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
+#include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_GEOMETRY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/Householder b/Eigen/Householder
index 6e348db..f2fa799 100644
--- a/Eigen/Householder
+++ b/Eigen/Householder

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_HOUSEHOLDER_MODULE_H
 #define EIGEN_HOUSEHOLDER_MODULE_H
 
@@ -20,4 +27,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_HOUSEHOLDER_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers
new file mode 100644
index 0000000..957d575
--- /dev/null
+++ b/Eigen/IterativeLinearSolvers

@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+#define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+
+#include "SparseCore"
+#include "OrderingMethods"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup IterativeLinearSolvers_Module IterativeLinearSolvers module
+  *
+  * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
+  * Those solvers are accessible via the following classes:
+  *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-square problems,
+  *  - BiCGSTAB for general square matrices.
+  *
+  * These iterative solvers are associated with some preconditioners:
+  *  - IdentityPreconditioner - not really useful
+  *  - DiagonalPreconditioner - also called Jacobi preconditioner, work very well on diagonal dominant matrices.
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
+  *
+  * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
+  *
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
+  */
+
+#include "src/IterativeLinearSolvers/SolveWithGuess.h"
+#include "src/IterativeLinearSolvers/IterativeSolverBase.h"
+#include "src/IterativeLinearSolvers/BasicPreconditioners.h"
+#include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
+#include "src/IterativeLinearSolvers/BiCGSTAB.h"
+#include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ITERATIVELINEARSOLVERS_MODULE_H

diff --git a/Eigen/Jacobi b/Eigen/Jacobi
index cbebb53..43edc7a 100644
--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_JACOBI_MODULE_H
 #define EIGEN_JACOBI_MODULE_H
 
@@ -22,4 +29,4 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_JACOBI_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+

diff --git a/Eigen/KLUSupport b/Eigen/KLUSupport
new file mode 100644
index 0000000..b23d905
--- /dev/null
+++ b/Eigen/KLUSupport

@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#define EIGEN_KLUSUPPORT_MODULE_H
+
+#include <Eigen/SparseCore>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+extern "C" {
+#include <btf.h>
+#include <klu.h>
+   }
+
+/** \ingroup Support_modules
+  * \defgroup KLUSupport_Module KLUSupport module
+  *
+  * This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
+  * It provides the following factorization class:
+  * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
+  *
+  * \code
+  * #include <Eigen/KLUSupport>
+  * \endcode
+  *
+  * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
+  * The dependencies depend on how umfpack has been compiled.
+  * For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
+  *
+  */
+
+#include "src/KLUSupport/KLUSupport.h"
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_KLUSUPPORT_MODULE_H

diff --git a/Eigen/LU b/Eigen/LU
index e5c3f32..1236ceb 100644
--- a/Eigen/LU
+++ b/Eigen/LU

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_LU_MODULE_H
 #define EIGEN_LU_MODULE_H
 
@@ -16,28 +23,25 @@
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
-#include "src/LU/Inverse.h"
+#include "src/LU/InverseImpl.h"
 
-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-  #include "src/LU/arch/Inverse_SSE.h"
-#endif
-
-#ifdef EIGEN2_SUPPORT
-  #include "src/Eigen2Support/LU.h"
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
+  #include "src/LU/arch/InverseSize4.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LU_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/MetisSupport b/Eigen/MetisSupport
new file mode 100644
index 0000000..85c41bf
--- /dev/null
+++ b/Eigen/MetisSupport

@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_METISSUPPORT_MODULE_H
+#define EIGEN_METISSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+#include <metis.h>
+}
+
+
+/** \ingroup Support_modules
+  * \defgroup MetisSupport_Module MetisSupport module
+  *
+  * \code
+  * #include <Eigen/MetisSupport>
+  * \endcode
+  * This module defines an interface to the METIS reordering package (http://glaros.dtc.umn.edu/gkhome/views/metis). 
+  * It can be used just as any other built-in method as explained in \link OrderingMethods_Module here. \endlink
+  */
+
+
+#include "src/MetisSupport/MetisSupport.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_METISSUPPORT_MODULE_H

diff --git a/Eigen/OrderingMethods b/Eigen/OrderingMethods
index 7c0f1ff..29691a6 100644
--- a/Eigen/OrderingMethods
+++ b/Eigen/OrderingMethods

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ORDERINGMETHODS_MODULE_H
 #define EIGEN_ORDERINGMETHODS_MODULE_H
 
@@ -56,10 +63,7 @@
   * \endcode
   */
 
-#ifndef EIGEN_MPL2_ONLY
 #include "src/OrderingMethods/Amd.h"
-#endif
-
 #include "src/OrderingMethods/Ordering.h"
 #include "src/Core/util/ReenableStupidWarnings.h"
 

diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport
index 7c616ee..234619a 100644
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PASTIXSUPPORT_MODULE_H
 #define EIGEN_PASTIXSUPPORT_MODULE_H
 
@@ -5,7 +12,6 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-#include <complex.h>
 extern "C" {
 #include <pastix_nompi.h>
 #include <pastix.h>
@@ -30,17 +36,14 @@
   * \endcode
   *
   * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
+  * This wrapper resuires PaStiX version 5.x compiled without MPI support.
   * The dependencies depend on how PaSTiX has been compiled.
   * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/PaStiXSupport/PaStiXSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_PASTIXSUPPORT_MODULE_H

diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
index 99330ce..340edf5 100644
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PARDISOSUPPORT_MODULE_H
 #define EIGEN_PARDISOSUPPORT_MODULE_H
 
@@ -7,8 +14,6 @@
 
 #include <mkl_pardiso.h>
 
-#include <unsupported/Eigen/SparseExtra>
-
 /** \ingroup Support_modules
   * \defgroup PardisoSupport_Module PardisoSupport module
   *

diff --git a/Eigen/QR b/Eigen/QR
index 592ba19..8465b62 100644
--- a/Eigen/QR
+++ b/Eigen/QR

@@ -1,14 +1,21 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_QR_MODULE_H
 #define EIGEN_QR_MODULE_H
 
 #include "Core"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup QR_Module QR module
   *
   *
@@ -24,25 +31,20 @@
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
 #endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/QR.h"
+#include "src/QR/HouseholderQR_LAPACKE.h"
+#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigenvalues"
-#endif
-
 #endif // EIGEN_QR_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index 6717e9b..6fe8237 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc

@@ -1,12 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #ifndef EIGEN_QTMALLOC_MODULE_H
 #define EIGEN_QTMALLOC_MODULE_H
 
 #include "Core"
 
+#if (!EIGEN_MALLOC_ALREADY_ALIGNED)
+
 #include "src/Core/util/DisableStupidWarnings.h"
 
-void *qMalloc(size_t size)
+void *qMalloc(std::size_t size)
 {
   return Eigen::internal::aligned_malloc(size);
 }
@@ -16,14 +24,16 @@
   Eigen::internal::aligned_free(ptr);
 }
 
-void *qRealloc(void *ptr, size_t size)
+void *qRealloc(void *ptr, std::size_t size)
 {
   void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
   return newPtr;
 }
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
+#endif
+
 #endif // EIGEN_QTMALLOC_MODULE_H

diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index 7701644..f70390c 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPQRSUPPORT_MODULE_H
 #define EIGEN_SPQRSUPPORT_MODULE_H
 
@@ -10,7 +17,7 @@
 /** \ingroup Support_modules
   * \defgroup SPQRSupport_Module SuiteSparseQR module
   * 
-  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   *
   * \code
   * #include <Eigen/SPQRSupport>
@@ -21,8 +28,6 @@
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
 #include "src/CholmodSupport/CholmodSupport.h"
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"
 

diff --git a/Eigen/SVD b/Eigen/SVD
index 8bfd1c3..3451794 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD

@@ -31,16 +31,20 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SVD_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/Eigen/Sparse b/Eigen/Sparse
new file mode 100644
index 0000000..a2ef7a6
--- /dev/null
+++ b/Eigen/Sparse

@@ -0,0 +1,34 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MODULE_H
+#define EIGEN_SPARSE_MODULE_H
+
+/** \defgroup Sparse_Module Sparse meta-module
+  *
+  * Meta-module including all related modules:
+  * - \ref SparseCore_Module
+  * - \ref OrderingMethods_Module
+  * - \ref SparseCholesky_Module
+  * - \ref SparseLU_Module
+  * - \ref SparseQR_Module
+  * - \ref IterativeLinearSolvers_Module
+  *
+    \code
+    #include <Eigen/Sparse>
+    \endcode
+  */
+
+#include "SparseCore"
+#include "OrderingMethods"
+#include "SparseCholesky"
+#include "SparseLU"
+#include "SparseQR"
+#include "IterativeLinearSolvers"
+
+#endif // EIGEN_SPARSE_MODULE_H
+

diff --git a/Eigen/SparseCholesky b/Eigen/SparseCholesky
new file mode 100644
index 0000000..d2b1f12
--- /dev/null
+++ b/Eigen/SparseCholesky

@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSECHOLESKY_MODULE_H
+#define EIGEN_SPARSECHOLESKY_MODULE_H
+
+#include "SparseCore"
+#include "OrderingMethods"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup SparseCholesky_Module SparseCholesky module
+  *
+  * This module currently provides two variants of the direct sparse Cholesky decomposition for selfadjoint (hermitian) matrices.
+  * Those decompositions are accessible via the following classes:
+  *  - SimplicialLLt,
+  *  - SimplicialLDLt
+  *
+  * Such problems can also be solved using the ConjugateGradient solver from the IterativeLinearSolvers module.
+  *
+  * \code
+  * #include <Eigen/SparseCholesky>
+  * \endcode
+  */
+
+#include "src/SparseCholesky/SimplicialCholesky.h"
+#include "src/SparseCholesky/SimplicialCholesky_impl.h"
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSECHOLESKY_MODULE_H

diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index d2b319f..76966c4 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSECORE_MODULE_H
 #define EIGEN_SPARSECORE_MODULE_H
 
@@ -6,25 +13,15 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 #include <vector>
-using ::std::vector;
 #include <map>
-using ::std::map;
-using ::std::multimap;
 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
-using ::std::min;
-using ::std::max;
-using ::std::fill;
-using ::std::fill_n;
-using ::std::lower_bound;
-using ::std::equal;
-using ::std::sort;
 
 /** 
   * \defgroup SparseCore_Module SparseCore module
   *
-  * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+  * This module provides a sparse matrix representation, and basic associated matrix manipulations
   * and operations.
   *
   * See the \ref TutorialSparse "Sparse tutorial"
@@ -36,38 +33,37 @@
   * This module depends on: Core.
   */
 
-namespace Eigen {
-
-/** The type used to identify a general sparse storage. */
-struct Sparse {};
-
-}
-
 #include "src/SparseCore/SparseUtil.h"
 #include "src/SparseCore/SparseMatrixBase.h"
+#include "src/SparseCore/SparseAssign.h"
 #include "src/SparseCore/CompressedStorage.h"
 #include "src/SparseCore/AmbiVector.h"
+#include "src/SparseCore/SparseCompressedBase.h"
 #include "src/SparseCore/SparseMatrix.h"
+#include "src/SparseCore/SparseMap.h"
 #include "src/SparseCore/MappedSparseMatrix.h"
 #include "src/SparseCore/SparseVector.h"
-#include "src/SparseCore/SparseBlock.h"
-#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseRef.h"
 #include "src/SparseCore/SparseCwiseUnaryOp.h"
 #include "src/SparseCore/SparseCwiseBinaryOp.h"
+#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseBlock.h"
 #include "src/SparseCore/SparseDot.h"
-#include "src/SparseCore/SparsePermutation.h"
 #include "src/SparseCore/SparseRedux.h"
-#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparseDiagonalProduct.h"
 #include "src/SparseCore/ConservativeSparseSparseProduct.h"
 #include "src/SparseCore/SparseSparseProductWithPruning.h"
 #include "src/SparseCore/SparseProduct.h"
 #include "src/SparseCore/SparseDenseProduct.h"
-#include "src/SparseCore/SparseDiagonalProduct.h"
-#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/SparseSelfAdjointView.h"
+#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/TriangularSolver.h"
-#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparsePermutation.h"
+#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseSolverBase.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SPARSECORE_MODULE_H
+

diff --git a/Eigen/SparseLU b/Eigen/SparseLU
new file mode 100644
index 0000000..37c4a5c
--- /dev/null
+++ b/Eigen/SparseLU

@@ -0,0 +1,50 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSELU_MODULE_H
+#define EIGEN_SPARSELU_MODULE_H
+
+#include "SparseCore"
+
+/** 
+  * \defgroup SparseLU_Module SparseLU module
+  * This module defines a supernodal factorization of general sparse matrices.
+  * The code is fully optimized for supernode-panel updates with specialized kernels.
+  * Please, see the documentation of the SparseLU class for more details.
+  */
+
+// Ordering interface
+#include "OrderingMethods"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include "src/SparseLU/SparseLU_gemm_kernel.h"
+
+#include "src/SparseLU/SparseLU_Structs.h"
+#include "src/SparseLU/SparseLU_SupernodalMatrix.h"
+#include "src/SparseLU/SparseLUImpl.h"
+#include "src/SparseCore/SparseColEtree.h"
+#include "src/SparseLU/SparseLU_Memory.h"
+#include "src/SparseLU/SparseLU_heap_relax_snode.h"
+#include "src/SparseLU/SparseLU_relax_snode.h"
+#include "src/SparseLU/SparseLU_pivotL.h"
+#include "src/SparseLU/SparseLU_panel_dfs.h"
+#include "src/SparseLU/SparseLU_kernel_bmod.h"
+#include "src/SparseLU/SparseLU_panel_bmod.h"
+#include "src/SparseLU/SparseLU_column_dfs.h"
+#include "src/SparseLU/SparseLU_column_bmod.h"
+#include "src/SparseLU/SparseLU_copy_to_ucol.h"
+#include "src/SparseLU/SparseLU_pruneL.h"
+#include "src/SparseLU/SparseLU_Utils.h"
+#include "src/SparseLU/SparseLU.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSELU_MODULE_H

diff --git a/Eigen/SparseQR b/Eigen/SparseQR
index 4ee4206..f5fc5fa 100644
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSEQR_MODULE_H
 #define EIGEN_SPARSEQR_MODULE_H
 
@@ -21,10 +28,6 @@
   * 
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
-#include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
 

diff --git a/Eigen/StdDeque b/Eigen/StdDeque
index c7fa11a..bc68397 100644
--- a/Eigen/StdDeque
+++ b/Eigen/StdDeque

@@ -13,9 +13,8 @@
 
 #include "Core"
 #include <deque>
-using ::std::deque;
 
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)
 

diff --git a/Eigen/StdList b/Eigen/StdList
index 20786c4..4c6262c 100644
--- a/Eigen/StdList
+++ b/Eigen/StdList

@@ -12,9 +12,8 @@
 
 #include "Core"
 #include <list>
-using ::std::list;
 
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */    
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)
 

diff --git a/Eigen/StdVector b/Eigen/StdVector
index 6dd1d68..0c4697a 100644
--- a/Eigen/StdVector
+++ b/Eigen/StdVector

@@ -13,9 +13,8 @@
 
 #include "Core"
 #include <vector>
-using ::std::vector;
 
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
 

diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport
index 575e14f..59312a8 100644
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
 #define EIGEN_SUPERLUSUPPORT_MODULE_H
 
@@ -36,6 +43,8 @@
   * - class SuperLU: a supernodal sequential LU factorization.
   * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
   *
+  * \warning This wrapper requires at least versions 4.0 of SuperLU. The 3.x versions are not supported.
+  *
   * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
   *
   * \code
@@ -48,12 +57,8 @@
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/SuperLUSupport/SuperLUSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SUPERLUSUPPORT_MODULE_H

diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport
index 984f64a..00eec80 100644
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport

@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
 #define EIGEN_UMFPACKSUPPORT_MODULE_H
 
@@ -12,7 +19,7 @@
 /** \ingroup Support_modules
   * \defgroup UmfPackSupport_Module UmfPackSupport module
   *
-  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
   * It provides the following factorization class:
   * - class UmfPackLU: a multifrontal sequential LU factorization.
   *
@@ -26,9 +33,6 @@
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/UmfPackSupport/UmfPackSupport.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"

diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 6c5632d..1013ca0 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h

@@ -16,6 +16,15 @@
 namespace Eigen {
 
 namespace internal {
+  template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
+   : traits<_MatrixType>
+  {
+    typedef MatrixXpr XprKind;
+    typedef SolverStorage StorageKind;
+    typedef int StorageIndex;
+    enum { Flags = 0 };
+  };
+
   template<typename MatrixType, int UpLo> struct LDLT_Traits;
 
   // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
@@ -28,42 +37,43 @@
   *
   * \brief Robust Cholesky decomposition of a matrix with pivoting
   *
-  * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
+  * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
   *             The other triangular part won't be read.
   *
   * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
   * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
   * is lower triangular with a unit diagonal and D is a diagonal matrix.
   *
-  * The decomposition uses pivoting to ensure stability, so that L will have
+  * The decomposition uses pivoting to ensure stability, so that D will have
   * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
   * on D also stabilizes the computation.
   *
   * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
   * decomposition to determine whether a system of equations has a solution.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
+        : public SolverBase<LDLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<LDLT> Base;
+    friend class SolverBase<LDLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
       UpLo = _UpLo
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
-    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime, Index> TranspositionType;
-    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime, Index> PermutationType;
+    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
+    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
 
     typedef internal::LDLT_Traits<MatrixType,UpLo> Traits;
 
@@ -85,7 +95,7 @@
       * according to the specified problem \a size.
       * \sa LDLT()
       */
-    LDLT(Index size)
+    explicit LDLT(Index size)
       : m_matrix(size, size),
         m_transpositions(size),
         m_temporary(size),
@@ -96,16 +106,35 @@
     /** \brief Constructor with decomposition
       *
       * This calculates the decomposition for the input \a matrix.
+      *
       * \sa LDLT(Index size)
       */
-    LDLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LDLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_transpositions(matrix.rows()),
         m_temporary(matrix.rows()),
         m_sign(internal::ZeroSign),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa LDLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LDLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
     }
 
     /** Clear any existing decomposition
@@ -152,13 +181,6 @@
       return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
     }
 
-    #ifdef EIGEN2_SUPPORT
-    inline bool isPositiveDefinite() const
-    {
-      return isPositive();
-    }
-    #endif
-
     /** \returns true if the matrix is negative (semidefinite) */
     inline bool isNegative(void) const
     {
@@ -166,6 +188,7 @@
       return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
@@ -177,33 +200,29 @@
       * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
       * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
       * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
-      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
+      * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular.
       *
       * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<LDLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
-    }
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
+    inline const Solve<LDLT, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
     #endif
 
     template<typename Derived>
     bool solveInPlace(MatrixBase<Derived> &bAndX) const;
 
-    LDLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LDLT& compute(const EigenBase<InputType>& matrix);
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of
+     *  which \c *this is the LDLT decomposition.
+     */
+    RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
+    }
 
     template <typename Derived>
     LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
@@ -220,22 +239,42 @@
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+      *
+      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+      * \code x = decomposition.adjoint().solve(b) \endcode
+      */
+    const LDLT& adjoint() const { return *this; };
+
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the factorization failed because of a zero pivot.
       */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Success;
+      return m_info;
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
 
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
     /** \internal
       * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
       * The strict upper part is used during the decomposition, the strict lower
@@ -243,10 +282,12 @@
       * is not stored), and the diagonal entries correspond to D.
       */
     MatrixType m_matrix;
+    RealScalar m_l1_norm;
     TranspositionType m_transpositions;
     TmpMatrixType m_temporary;
     internal::SignMatrix m_sign;
     bool m_isInitialized;
+    ComputationInfo m_info;
 };
 
 namespace internal {
@@ -261,37 +302,30 @@
     using std::abs;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename TranspositionType::StorageIndex IndexType;
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
+    bool found_zero_pivot = false;
+    bool ret = true;
 
     if (size <= 1)
     {
       transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
-      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+      if(size==0) sign = ZeroSign;
+      else if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
       else sign = ZeroSign;
       return true;
     }
 
-    RealScalar cutoff(0), biggest_in_corner;
-
     for (Index k = 0; k < size; ++k)
     {
       // Find largest diagonal element
       Index index_of_biggest_in_corner;
-      biggest_in_corner = mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
+      mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
       index_of_biggest_in_corner += k;
 
-      if(k == 0)
-      {
-        // The biggest overall is the point of reference to which further diagonals
-        // are compared; if any diagonal is negligible compared
-        // to the largest overall, the algorithm bails.
-        cutoff = abs(NumTraits<Scalar>::epsilon() * biggest_in_corner);
-      }
-
-      transpositions.coeffRef(k) = index_of_biggest_in_corner;
+      transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
       if(k != index_of_biggest_in_corner)
       {
         // apply the transposition while taking care to consider only
@@ -321,27 +355,51 @@
 
       if(k>0)
       {
-        temp.head(k) = mat.diagonal().head(k).asDiagonal() * A10.adjoint();
+        temp.head(k) = mat.diagonal().real().head(k).asDiagonal() * A10.adjoint();
         mat.coeffRef(k,k) -= (A10 * temp.head(k)).value();
         if(rs>0)
           A21.noalias() -= A20 * temp.head(k);
       }
 
-      if((rs>0) && (abs(mat.coeffRef(k,k)) > cutoff))
-        A21 /= mat.coeffRef(k,k);
-
+      // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
+      // was smaller than the cutoff value. However, since LDLT is not rank-revealing
+      // we should only make sure that we do not introduce INF or NaN values.
+      // Remark that LAPACK also uses 0 as the cutoff value.
       RealScalar realAkk = numext::real(mat.coeffRef(k,k));
+      bool pivot_is_valid = (abs(realAkk) > RealScalar(0));
+
+      if(k==0 && !pivot_is_valid)
+      {
+        // The entire diagonal is zero, there is nothing more to do
+        // except filling the transpositions, and checking whether the matrix is zero.
+        sign = ZeroSign;
+        for(Index j = 0; j<size; ++j)
+        {
+          transpositions.coeffRef(j) = IndexType(j);
+          ret = ret && (mat.col(j).tail(size-j-1).array()==Scalar(0)).all();
+        }
+        return ret;
+      }
+
+      if((rs>0) && pivot_is_valid)
+        A21 /= realAkk;
+      else if(rs>0)
+        ret = ret && (A21.array()==Scalar(0)).all();
+
+      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
+      else if(!pivot_is_valid) found_zero_pivot = true;
+
       if (sign == PositiveSemiDef) {
-        if (realAkk < 0) sign = Indefinite;
+        if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == NegativeSemiDef) {
-        if (realAkk > 0) sign = Indefinite;
+        if (realAkk > static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == ZeroSign) {
-        if (realAkk > 0) sign = PositiveSemiDef;
-        else if (realAkk < 0) sign = NegativeSemiDef;
+        if (realAkk > static_cast<RealScalar>(0)) sign = PositiveSemiDef;
+        else if (realAkk < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
       }
     }
 
-    return true;
+    return ret;
   }
 
   // Reference for the algorithm: Davis and Hager, "Multiple Rank
@@ -357,7 +415,6 @@
     using numext::isfinite;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
     const Index size = mat.rows();
     eigen_assert(mat.cols() == size && w.size()==size);
@@ -421,16 +478,16 @@
 {
   typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
   typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
 };
 
 } // end namespace internal
@@ -438,18 +495,35 @@
 /** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
   */
 template<typename MatrixType, int _UpLo>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
+  check_template_parameters();
+
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
 
-  m_matrix = a;
+  m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (_UpLo == Lower)
+      abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm)
+      m_l1_norm = abs_col_sum;
+  }
 
   m_transpositions.resize(size);
   m_isInitialized = false;
   m_temporary.resize(size);
+  m_sign = internal::ZeroSign;
 
-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+  m_info = internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign) ? Success : NumericalIssue;
 
   m_isInitialized = true;
   return *this;
@@ -462,8 +536,9 @@
   */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
 {
+  typedef typename TranspositionType::StorageIndex IndexType;
   const Index size = w.rows();
   if (m_isInitialized)
   {
@@ -475,7 +550,7 @@
     m_matrix.setZero();
     m_transpositions.resize(size);
     for (Index i = 0; i < size; i++)
-      m_transpositions.coeffRef(i) = i;
+      m_transpositions.coeffRef(i) = IndexType(i);
     m_temporary.resize(size);
     m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
     m_isInitialized = true;
@@ -486,48 +561,56 @@
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int _UpLo, typename Rhs>
-struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
-  : solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType, int _UpLo>
+template<typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  typedef LDLT<_MatrixType,_UpLo> LDLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
-    // dst = P b
-    dst = dec().transpositionsP() * rhs();
-
-    // dst = L^-1 (P b)
-    dec().matrixL().solveInPlace(dst);
-
-    // dst = D^-1 (L^-1 P b)
-    // more precisely, use pseudo-inverse of D (see bug 241)
-    using std::abs;
-    typedef typename LDLTType::MatrixType MatrixType;
-    typedef typename LDLTType::Scalar Scalar;
-    typedef typename LDLTType::RealScalar RealScalar;
-    const Diagonal<const MatrixType> vectorD = dec().vectorD();
-    RealScalar tolerance = numext::maxi(vectorD.array().abs().maxCoeff() * NumTraits<Scalar>::epsilon(),
-                                        RealScalar(1) / NumTraits<RealScalar>::highest()); // motivated by LAPACK's xGELSS
-    for (Index i = 0; i < vectorD.size(); ++i) {
-      if(abs(vectorD(i)) > tolerance)
-        dst.row(i) /= vectorD(i);
-      else
-        dst.row(i).setZero();
-    }
-
-    // dst = L^-T (D^-1 L^-1 P b)
-    dec().matrixU().solveInPlace(dst);
-
-    // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
-    dst = dec().transpositionsP().transpose() * dst;
-  }
-};
+  _solve_impl_transposed<true>(rhs, dst);
 }
 
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  // dst = P b
+  dst = m_transpositions * rhs;
+
+  // dst = L^-1 (P b)
+  // dst = L^-*T (P b)
+  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+
+  // dst = D^-* (L^-1 P b)
+  // dst = D^-1 (L^-*T P b)
+  // more precisely, use pseudo-inverse of D (see bug 241)
+  using std::abs;
+  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
+  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
+  // diagonal element is not well justified and leads to numerical issues in some cases.
+  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+  // Using numeric_limits::min() gives us more robustness to denormals.
+  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
+  for (Index i = 0; i < vecD.size(); ++i)
+  {
+    if(abs(vecD(i)) > tolerance)
+      dst.row(i) /= vecD(i);
+    else
+      dst.row(i).setZero();
+  }
+
+  // dst = L^-* (D^-* L^-1 P b)
+  // dst = L^-T (D^-1 L^-*T P b)
+  matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
+
+  // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
+  // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
+  dst = m_transpositions.transpose() * dst;
+}
+#endif
+
 /** \internal use x = ldlt_object.solve(x);
   *
   * This is the \em in-place version of solve().
@@ -569,7 +652,7 @@
   // L^* P
   res = matrixU() * res;
   // D(L^*P)
-  res = vectorD().asDiagonal() * res;
+  res = vectorD().real().asDiagonal() * res;
   // L(DL^*P)
   res = matrixL() * res;
   // P^T (LDL^*P)
@@ -578,7 +661,6 @@
   return res;
 }
 
-#ifndef __CUDACC__
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
   * \sa MatrixBase::ldlt()
@@ -600,7 +682,6 @@
 {
   return LDLT<PlainObject>(derived());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 45ed843..8c9b2b3 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h

@@ -10,9 +10,19 @@
 #ifndef EIGEN_LLT_H
 #define EIGEN_LLT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal{
+
+template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType, int UpLo> struct LLT_Traits;
 }
 
@@ -22,9 +32,9 @@
   *
   * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
   *
-  * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
-  *             The other triangular part won't be read.
+  * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
+  * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
+  *               The other triangular part won't be read.
   *
   * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
   * matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -40,26 +50,31 @@
   *
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
-  *    
+  *
+  * \b Performance: for best performance, it is recommended to use a column-major storage format
+  * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+  * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+  * step, and rank-updates can be up to 3 times slower.
+  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
+  * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+  * Therefore, the strict lower part does not have to store correct values.
+  *
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
- /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
-  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
-  * the strict lower part does not have to store correct values.
-  */
 template<typename _MatrixType, int _UpLo> class LLT
+        : public SolverBase<LLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<LLT> Base;
+    friend class SolverBase<LLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
 
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -83,14 +98,30 @@
       * according to the specified problem \a size.
       * \sa LLT()
       */
-    LLT(Index size) : m_matrix(size, size),
+    explicit LLT(Index size) : m_matrix(size, size),
                     m_isInitialized(false) {}
 
-    LLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
+    }
+
+    /** \brief Constructs a LLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is a Eigen::Ref.
+      *
+      * \sa LLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
     }
 
     /** \returns a view of the upper triangular matrix U */
@@ -107,6 +138,7 @@
       return Traits::getL(m_matrix);
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * Since this LLT class assumes anyway that the matrix A is invertible, the solution
@@ -118,30 +150,25 @@
       * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<LLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
-    }
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    
-    bool isPositiveDefinite() const { return true; }
+    inline const Solve<LLT, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
     #endif
 
     template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+    void solveInPlace(const MatrixBase<Derived> &bAndX) const;
 
-    LLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LLT& compute(const EigenBase<InputType>& matrix);
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of
+      *  which \c *this is the Cholesky decomposition.
+      */
+    RealScalar rcond() const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
+    }
 
     /** \returns the LLT decomposition matrix
       *
@@ -158,8 +185,8 @@
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix.appears not to be positive definite.
       */
     ComputationInfo info() const
     {
@@ -167,18 +194,40 @@
       return m_info;
     }
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+      *
+      * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+      * \code x = decomposition.adjoint().solve(b) \endcode
+      */
+    const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; };
+
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     template<typename VectorType>
-    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+    LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
     /** \internal
       * Used to compute and store L
       * The strict upper part is not used and even not initialized.
       */
     MatrixType m_matrix;
+    RealScalar m_l1_norm;
     bool m_isInitialized;
     ComputationInfo m_info;
 };
@@ -188,12 +237,11 @@
 template<typename Scalar, int UpLo> struct llt_inplace;
 
 template<typename MatrixType, typename VectorType>
-static typename MatrixType::Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
+static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
 {
   using std::sqrt;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::ColXpr ColXpr;
   typedef typename internal::remove_all<ColXpr>::type ColXprCleaned;
   typedef typename ColXprCleaned::SegmentReturnType ColXprSegment;
@@ -262,11 +310,10 @@
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   template<typename MatrixType>
-  static typename MatrixType::Index unblocked(MatrixType& mat)
+  static Index unblocked(MatrixType& mat)
   {
     using std::sqrt;
-    typedef typename MatrixType::Index Index;
-    
+
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
     for(Index k = 0; k < size; ++k)
@@ -283,15 +330,14 @@
         return k;
       mat.coeffRef(k,k) = x = sqrt(x);
       if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 *= RealScalar(1)/x;
+      if (rs>0) A21 /= x;
     }
     return -1;
   }
 
   template<typename MatrixType>
-  static typename MatrixType::Index blocked(MatrixType& m)
+  static Index blocked(MatrixType& m)
   {
-    typedef typename MatrixType::Index Index;
     eigen_assert(m.rows()==m.cols());
     Index size = m.rows();
     if(size<32)
@@ -316,36 +362,36 @@
       Index ret;
       if((ret=unblocked(A11))>=0) return k+ret;
       if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
-      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
+      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,typename NumTraits<RealScalar>::Literal(-1)); // bottleneck
     }
     return -1;
   }
 
   template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
   {
     return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
   }
 };
-  
+
 template<typename Scalar> struct llt_inplace<Scalar, Upper>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
   template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index unblocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE Index unblocked(MatrixType& mat)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::unblocked(matt);
   }
   template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index blocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE Index blocked(MatrixType& mat)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::blocked(matt);
   }
   template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::rankUpdate(matt, vec.conjugate(), sigma);
@@ -356,8 +402,8 @@
 {
   typedef const TriangularView<const MatrixType, Lower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
   static bool inplace_decomposition(MatrixType& m)
   { return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
 };
@@ -366,8 +412,8 @@
 {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
   typedef const TriangularView<const MatrixType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
   static bool inplace_decomposition(MatrixType& m)
   { return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
 };
@@ -382,12 +428,29 @@
   * Output: \verbinclude TutorialLinAlgComputeTwice.out
   */
 template<typename MatrixType, int _UpLo>
-LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
+  check_template_parameters();
+
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
-  m_matrix = a;
+  if (!internal::is_same_dense(m_matrix, a.derived()))
+    m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (_UpLo == Lower)
+      abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm)
+      m_l1_norm = abs_col_sum;
+  }
 
   m_isInitialized = true;
   bool ok = Traits::inplace_decomposition(m_matrix);
@@ -403,7 +466,7 @@
   */
 template<typename _MatrixType, int _UpLo>
 template<typename VectorType>
-LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
   eigen_assert(v.size()==m_matrix.cols());
@@ -415,39 +478,42 @@
 
   return *this;
 }
-    
-namespace internal {
-template<typename _MatrixType, int UpLo, typename Rhs>
-struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
-  : solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
-{
-  typedef LLT<_MatrixType,UpLo> LLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dst = rhs();
-    dec().solveInPlace(dst);
-  }
-};
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType,int _UpLo>
+template<typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  _solve_impl_transposed<true>(rhs, dst);
 }
 
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+    dst = rhs;
+
+    matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+    matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
+}
+#endif
+
 /** \internal use x = llt_object.solve(x);
-  * 
+  *
   * This is the \em in-place version of solve().
   *
   * \param bAndX represents both the right-hand side matrix b and result x.
   *
-  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+  * This version avoids a copy when the right hand side matrix b is not needed anymore.
   *
-  * This version avoids a copy when the right hand side matrix b is not
-  * needed anymore.
+  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+  * This function will const_cast it, so constness isn't honored here.
   *
   * \sa LLT::solve(), MatrixBase::llt()
   */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
 {
   eigen_assert(m_isInitialized && "LLT is not initialized.");
   eigen_assert(m_matrix.rows()==bAndX.rows());
@@ -465,7 +531,6 @@
   return matrixL() * matrixL().adjoint().toDenseMatrix();
 }
 
-#ifndef __CUDACC__
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
   * \sa SelfAdjointView::llt()
@@ -487,8 +552,7 @@
 {
   return LLT<PlainObject,UpLo>(m_matrix);
 }
-#endif // __CUDACC__
-  
+
 } // end namespace Eigen
 
 #endif // EIGEN_LLT_H

diff --git a/Eigen/src/Cholesky/LLT_LAPACKE.h b/Eigen/src/Cholesky/LLT_LAPACKE.h
new file mode 100644
index 0000000..bc6489e
--- /dev/null
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h

@@ -0,0 +1,99 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *     LLt decomposition based on LAPACKE_?potrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_LLT_LAPACKE_H
+#define EIGEN_LLT_LAPACKE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar> struct lapacke_llt;
+
+#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \
+template<> struct lapacke_llt<EIGTYPE> \
+{ \
+  template<typename MatrixType> \
+  static inline Index potrf(MatrixType& m, char uplo) \
+  { \
+    lapack_int matrix_order; \
+    lapack_int size, lda, info, StorageOrder; \
+    EIGTYPE* a; \
+    eigen_assert(m.rows()==m.cols()); \
+    /* Set up parameters for ?potrf */ \
+    size = convert_index<lapack_int>(m.rows()); \
+    StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
+    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+    a = &(m.coeffRef(0,0)); \
+    lda = convert_index<lapack_int>(m.outerStride()); \
+\
+    info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \
+    info = (info==0) ? -1 : info>0 ? info-1 : size; \
+    return info; \
+  } \
+}; \
+template<> struct llt_inplace<EIGTYPE, Lower> \
+{ \
+  template<typename MatrixType> \
+  static Index blocked(MatrixType& m) \
+  { \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \
+  } \
+  template<typename MatrixType, typename VectorType> \
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \
+}; \
+template<> struct llt_inplace<EIGTYPE, Upper> \
+{ \
+  template<typename MatrixType> \
+  static Index blocked(MatrixType& m) \
+  { \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \
+  } \
+  template<typename MatrixType, typename VectorType> \
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  { \
+    Transpose<MatrixType> matt(mat); \
+    return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \
+  } \
+};
+
+EIGEN_LAPACKE_LLT(double, double, d)
+EIGEN_LAPACKE_LLT(float, float, s)
+EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LLT_LAPACKE_H

diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index c449960..adaf528 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h

@@ -10,50 +10,56 @@
 #ifndef EIGEN_CHOLMODSUPPORT_H
 #define EIGEN_CHOLMODSUPPORT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
-template<typename Scalar, typename CholmodType>
-void cholmod_configure_matrix(CholmodType& mat)
-{
-  if (internal::is_same<Scalar,float>::value)
-  {
-    mat.xtype = CHOLMOD_REAL;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,double>::value)
-  {
+template<typename Scalar> struct cholmod_configure_matrix;
+
+template<> struct cholmod_configure_matrix<double> {
+  template<typename CholmodType>
+  static void run(CholmodType& mat) {
     mat.xtype = CHOLMOD_REAL;
     mat.dtype = CHOLMOD_DOUBLE;
   }
-  else if (internal::is_same<Scalar,std::complex<float> >::value)
-  {
-    mat.xtype = CHOLMOD_COMPLEX;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,std::complex<double> >::value)
-  {
+};
+
+template<> struct cholmod_configure_matrix<std::complex<double> > {
+  template<typename CholmodType>
+  static void run(CholmodType& mat) {
     mat.xtype = CHOLMOD_COMPLEX;
     mat.dtype = CHOLMOD_DOUBLE;
   }
-  else
-  {
-    eigen_assert(false && "Scalar type not supported by CHOLMOD");
-  }
-}
+};
+
+// Other scalar types are not yet supported by Cholmod
+// template<> struct cholmod_configure_matrix<float> {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_REAL;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
+//
+// template<> struct cholmod_configure_matrix<std::complex<float> > {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_COMPLEX;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
 
 } // namespace internal
 
 /** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object.
   * Note that the data are shared.
   */
-template<typename _Scalar, int _Options, typename _Index>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> > mat)
 {
   cholmod_sparse res;
   res.nzmax   = mat.nonZeros();
-  res.nrow    = mat.rows();;
+  res.nrow    = mat.rows();
   res.ncol    = mat.cols();
   res.p       = mat.outerIndexPtr();
   res.i       = mat.innerIndexPtr();
@@ -73,12 +79,12 @@
 
   res.dtype   = 0;
   res.stype   = -1;
-  
-  if (internal::is_same<_Index,int>::value)
+
+  if (internal::is_same<_StorageIndex,int>::value)
   {
     res.itype = CHOLMOD_INT;
   }
-  else if (internal::is_same<_Index,UF_long>::value)
+  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
   {
     res.itype = CHOLMOD_LONG;
   }
@@ -88,29 +94,39 @@
   }
 
   // setup res.xtype
-  internal::cholmod_configure_matrix<_Scalar>(res);
-  
+  internal::cholmod_configure_matrix<_Scalar>::run(res);
+
   res.stype = 0;
-  
+
   return res;
 }
 
 template<typename _Scalar, int _Options, typename _Index>
 const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
+  return res;
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+const cholmod_sparse viewAsCholmod(const SparseVector<_Scalar,_Options,_Index>& mat)
+{
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
   return res;
 }
 
 /** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix.
   * The data are not copied but shared. */
 template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
-cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
+cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
-  
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
+
   if(UpLo==Upper) res.stype =  1;
   if(UpLo==Lower) res.stype = -1;
+  // swap stype for rowmajor matrices (only works for real matrices)
+  EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  if(_Options & RowMajorBit) res.stype *=-1;
 
   return res;
 }
@@ -131,21 +147,59 @@
   res.x      = (void*)(mat.derived().data());
   res.z      = 0;
 
-  internal::cholmod_configure_matrix<Scalar>(res);
+  internal::cholmod_configure_matrix<Scalar>::run(res);
 
   return res;
 }
 
 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
   * The data are not copied but shared. */
-template<typename Scalar, int Flags, typename Index>
-MappedSparseMatrix<Scalar,Flags,Index> viewAsEigen(cholmod_sparse& cm)
+template<typename Scalar, int Flags, typename StorageIndex>
+MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
 {
-  return MappedSparseMatrix<Scalar,Flags,Index>
-         (cm.nrow, cm.ncol, static_cast<Index*>(cm.p)[cm.ncol],
-          static_cast<Index*>(cm.p), static_cast<Index*>(cm.i),static_cast<Scalar*>(cm.x) );
+  return MappedSparseMatrix<Scalar,Flags,StorageIndex>
+         (cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
+          static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }
 
+namespace internal {
+
+// template specializations for int and long that call the correct cholmod method
+
+#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
+    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
+
+#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
+    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
+
+EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
+
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense,  cholmod_dense*,  X)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
+
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+
+template<typename _StorageIndex> inline cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
+template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
+
+template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
+template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+
+template<typename _StorageIndex>
+inline int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
+template<>
+inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+
+#undef EIGEN_CHOLMOD_SPECIALIZE0
+#undef EIGEN_CHOLMOD_SPECIALIZE1
+
+}  // namespace internal
+
+
 enum CholmodMode {
   CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
 };
@@ -157,49 +211,56 @@
   * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
   */
 template<typename _MatrixType, int _UpLo, typename Derived>
-class CholmodBase : internal::noncopyable
+class CholmodBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
     typedef _MatrixType MatrixType;
     enum { UpLo = _UpLo };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef MatrixType CholMatrixType;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
     CholmodBase()
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
     {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
-      cholmod_start(&m_cholmod);
+      EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+      m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
+      internal::cm_start<StorageIndex>(m_cholmod);
     }
 
-    CholmodBase(const MatrixType& matrix)
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+    explicit CholmodBase(const MatrixType& matrix)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
     {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
-      cholmod_start(&m_cholmod);
+      EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+      m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
+      internal::cm_start<StorageIndex>(m_cholmod);
       compute(matrix);
     }
 
     ~CholmodBase()
     {
       if(m_cholmodFactor)
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
-      cholmod_finish(&m_cholmod);
+        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
+      internal::cm_finish<StorageIndex>(m_cholmod);
     }
-    
-    inline Index cols() const { return m_cholmodFactor->n; }
-    inline Index rows() const { return m_cholmodFactor->n; }
-    
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
+
+    inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+    inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -215,57 +276,29 @@
       factorize(matrix);
       return derived();
     }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<CholmodBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<CholmodBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
+
     /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
-      * 
+      *
       * \sa factorize()
       */
     void analyzePattern(const MatrixType& matrix)
     {
       if(m_cholmodFactor)
       {
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
+        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
         m_cholmodFactor = 0;
       }
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
-      
+      m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
+
       this->m_isInitialized = true;
       this->m_info = Success;
       m_analysisIsOk = true;
       m_factorizationIsOk = false;
     }
-    
+
     /** Performs a numeric decomposition of \a matrix
       *
       * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@@ -276,43 +309,46 @@
     {
       eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
-      
+      internal::cm_factorize_p<StorageIndex>(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod);
+
       // If the factorization failed, minor is the column at which it did. On success minor == n.
       this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
       m_factorizationIsOk = true;
     }
-    
+
     /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
      *  See the Cholmod user guide for details. */
     cholmod_common& cholmod() { return m_cholmod; }
-    
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       const Index size = m_cholmodFactor->n;
       EIGEN_UNUSED_VARIABLE(size);
       eigen_assert(size==b.rows());
 
-      // note: cd stands for Cholmod Dense
-      Rhs& b_ref(b.const_cast_derived());
+      // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
+      Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
+
       cholmod_dense b_cd = viewAsCholmod(b_ref);
-      cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
+      cholmod_dense* x_cd = internal::cm_solve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod);
       if(!x_cd)
       {
         this->m_info = NumericalIssue;
+        return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+      // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
       dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
-      cholmod_free_dense(&x_cd, &m_cholmod);
+      internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
     }
-    
+
     /** \internal */
-    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    template<typename RhsDerived, typename DestDerived>
+    void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       const Index size = m_cholmodFactor->n;
@@ -320,19 +356,22 @@
       eigen_assert(size==b.rows());
 
       // note: cs stands for Cholmod Sparse
-      cholmod_sparse b_cs = viewAsCholmod(b);
-      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
+      Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
+      cholmod_sparse b_cs = viewAsCholmod(b_ref);
+      cholmod_sparse* x_cs = internal::cm_spsolve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod);
       if(!x_cs)
       {
         this->m_info = NumericalIssue;
+        return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
-      cholmod_free_sparse(&x_cs, &m_cholmod);
+      // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver)
+      dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
+      internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
     }
     #endif // EIGEN_PARSED_BY_DOXYGEN
-    
-    
+
+
     /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
       *
       * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@@ -344,20 +383,70 @@
       */
     Derived& setShift(const RealScalar& offset)
     {
-      m_shiftOffset[0] = offset;
+      m_shiftOffset[0] = double(offset);
       return derived();
     }
-    
+
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      using std::exp;
+      return exp(logDeterminant());
+    }
+
+    /** \returns the log determinant of the underlying matrix from the current factorization */
+    Scalar logDeterminant() const
+    {
+      using std::log;
+      using numext::real;
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+
+      RealScalar logDet = 0;
+      Scalar *x = static_cast<Scalar*>(m_cholmodFactor->x);
+      if (m_cholmodFactor->is_super)
+      {
+        // Supernodal factorization stored as a packed list of dense column-major blocs,
+        // as described by the following structure:
+
+        // super[k] == index of the first column of the j-th super node
+        StorageIndex *super = static_cast<StorageIndex*>(m_cholmodFactor->super);
+        // pi[k] == offset to the description of row indices
+        StorageIndex *pi = static_cast<StorageIndex*>(m_cholmodFactor->pi);
+        // px[k] == offset to the respective dense block
+        StorageIndex *px = static_cast<StorageIndex*>(m_cholmodFactor->px);
+
+        Index nb_super_nodes = m_cholmodFactor->nsuper;
+        for (Index k=0; k < nb_super_nodes; ++k)
+        {
+          StorageIndex ncols = super[k + 1] - super[k];
+          StorageIndex nrows = pi[k + 1] - pi[k];
+
+          Map<const Array<Scalar,1,Dynamic>, 0, InnerStride<> > sk(x + px[k], ncols, InnerStride<>(nrows+1));
+          logDet += sk.real().log().sum();
+        }
+      }
+      else
+      {
+        // Simplicial factorization stored as standard CSC matrix.
+        StorageIndex *p = static_cast<StorageIndex*>(m_cholmodFactor->p);
+        Index size = m_cholmodFactor->n;
+        for (Index k=0; k<size; ++k)
+          logDet += log(real( x[p[k]] ));
+      }
+      if (m_cholmodFactor->is_ll)
+        logDet *= 2.0;
+      return logDet;
+    };
+
     template<typename Stream>
     void dumpMemory(Stream& /*s*/)
     {}
-    
+
   protected:
     mutable cholmod_common m_cholmod;
     cholmod_factor* m_cholmodFactor;
-    RealScalar m_shiftOffset[2];
+    double m_shiftOffset[2];
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
 };
@@ -376,26 +465,30 @@
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> >
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodSimplicialLLT() : Base() { init(); }
 
     CholmodSimplicialLLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSimplicialLLT() {}
@@ -423,26 +516,30 @@
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> >
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodSimplicialLDLT() : Base() { init(); }
 
     CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSimplicialLDLT() {}
@@ -468,26 +565,30 @@
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> >
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodSupernodalLLT() : Base() { init(); }
 
     CholmodSupernodalLLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSupernodalLLT() {}
@@ -515,30 +616,34 @@
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning Only double precision real and complex scalar types are supported by Cholmod.
+  *
+  * \sa \ref TutorialSparseSolverConcept
   */
 template<typename _MatrixType, int _UpLo = Lower>
 class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> >
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodDecomposition() : Base() { init(); }
 
     CholmodDecomposition(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodDecomposition() {}
-    
+
     void setMode(CholmodMode mode)
     {
       switch(mode)
@@ -572,36 +677,6 @@
     }
 };
 
-namespace internal {
-  
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_CHOLMODSUPPORT_H

diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h
new file mode 100644
index 0000000..b6200fa
--- /dev/null
+++ b/Eigen/src/Core/ArithmeticSequence.h

@@ -0,0 +1,413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARITHMETIC_SEQUENCE_H
+#define EIGEN_ARITHMETIC_SEQUENCE_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
+template<typename T> struct aseq_negate {};
+
+template<> struct aseq_negate<Index> {
+  typedef Index type;
+};
+
+template<int N> struct aseq_negate<FixedInt<N> > {
+  typedef FixedInt<-N> type;
+};
+
+// Compilation error in the following case:
+template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
+
+template<typename FirstType,typename SizeType,typename IncrType,
+         bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
+         bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
+struct aseq_reverse_first_type {
+  typedef Index type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
+  typedef symbolic::AddExpr<FirstType,
+                            symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+                                                  symbolic::ValueExpr<IncrType> >
+                           > type;
+};
+
+template<typename SizeType,typename IncrType,typename EnableIf = void>
+struct aseq_reverse_first_type_aux {
+  typedef Index type;
+};
+
+template<typename SizeType,typename IncrType>
+struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_if<bool((SizeType::value+IncrType::value)|0x1)>::type> {
+  typedef FixedInt<(SizeType::value-1)*IncrType::value> type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
+  typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
+  typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
+  typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+                                                  symbolic::ValueExpr<IncrType> >,
+                            symbolic::ValueExpr<> > type;
+};
+#endif
+
+// Helper to cleanup the type of the increment:
+template<typename T> struct cleanup_seq_incr {
+  typedef typename cleanup_index_type<T,DynamicIndex>::type type;
+};
+
+}
+
+//--------------------------------------------------------------------------------
+// seq(first,last,incr) and seqN(first,size,incr)
+//--------------------------------------------------------------------------------
+
+template<typename FirstType=Index,typename SizeType=Index,typename IncrType=internal::FixedInt<1> >
+class ArithmeticSequence;
+
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                   typename internal::cleanup_index_type<SizeType>::type,
+                   typename internal::cleanup_seq_incr<IncrType>::type >
+seqN(FirstType first, SizeType size, IncrType incr);
+
+/** \class ArithmeticSequence
+  * \ingroup Core_Module
+  *
+  * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by
+  * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride)
+  * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i.
+  *
+  * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments
+  * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the
+  * only way it is used.
+  *
+  * \tparam FirstType type of the first element, usually an Index,
+  *                   but internally it can be a symbolic expression
+  * \tparam SizeType type representing the size of the sequence, usually an Index
+  *                  or a compile time integral constant. Internally, it can also be a symbolic expression
+  * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1)
+  *
+  * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView
+  */
+template<typename FirstType,typename SizeType,typename IncrType>
+class ArithmeticSequence
+{
+public:
+  ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
+  ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {}
+
+  enum {
+    SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
+    IncrAtCompileTime = internal::get_fixed_value<IncrType,DynamicIndex>::value
+  };
+
+  /** \returns the size, i.e., number of elements, of the sequence */
+  Index size()  const { return m_size; }
+
+  /** \returns the first element \f$ a_0 \f$ in the sequence */
+  Index first()  const { return m_first; }
+
+  /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
+  Index operator[](Index i) const { return m_first + i * m_incr; }
+
+  const FirstType& firstObject() const { return m_first; }
+  const SizeType&  sizeObject()  const { return m_size; }
+  const IncrType&  incrObject()  const { return m_incr; }
+
+protected:
+  FirstType m_first;
+  SizeType  m_size;
+  IncrType  m_incr;
+
+public:
+
+#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
+  auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) {
+    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
+  }
+#else
+protected:
+  typedef typename internal::aseq_negate<IncrType>::type ReverseIncrType;
+  typedef typename internal::aseq_reverse_first_type<FirstType,SizeType,IncrType>::type ReverseFirstType;
+public:
+  ArithmeticSequence<ReverseFirstType,SizeType,ReverseIncrType>
+  reverse() const {
+    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
+  }
+#endif
+};
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr
+  *
+  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type >
+seqN(FirstType first, SizeType size, IncrType incr)  {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type>(first,size,incr);
+}
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment
+  *
+  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */
+template<typename FirstType,typename SizeType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type >
+seqN(FirstType first, SizeType size)  {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
+}
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
+  *
+  * It is essentially an alias to:
+  * \code
+  * seqN(f, (l-f+incr)/incr, incr);
+  * \endcode
+  *
+  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
+  */
+template<typename FirstType,typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr);
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
+  *
+  * It is essentially an alias to:
+  * \code
+  * seqN(f,l-f+1);
+  * \endcode
+  *
+  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
+  */
+template<typename FirstType,typename LastType>
+auto seq(FirstType f, LastType l);
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX11
+template<typename FirstType,typename LastType>
+auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                                                   (  typename internal::cleanup_index_type<LastType>::type(l)
+                                                    - typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())))
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (typename internal::cleanup_index_type<LastType>::type(l)
+               -typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
+}
+
+template<typename FirstType,typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr)
+  -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                   (   typename internal::cleanup_index_type<LastType>::type(l)
+                     - typename internal::cleanup_index_type<FirstType>::type(f)+typename internal::cleanup_seq_incr<IncrType>::type(incr)
+                   ) / typename internal::cleanup_seq_incr<IncrType>::type(incr),
+                   typename internal::cleanup_seq_incr<IncrType>::type(incr)))
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              ( typename internal::cleanup_index_type<LastType>::type(l)
+               -typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
+              CleanedIncrType(incr));
+}
+
+#else // EIGEN_HAS_CXX11
+
+template<typename FirstType,typename LastType>
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
+                             ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
+seq(FirstType f, LastType l)
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())));
+}
+
+template<typename FirstTypeDerived,typename LastType>
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+    ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
+                                                            symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
+{
+  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
+}
+
+template<typename FirstType,typename LastTypeDerived>
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                        symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+                                          symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
+}
+
+template<typename FirstTypeDerived,typename LastTypeDerived>
+ArithmeticSequence<FirstTypeDerived,
+                    symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
+{
+  return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
+}
+
+
+template<typename FirstType,typename LastType, typename IncrType>
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(FirstType f, LastType l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr);
+}
+
+template<typename FirstTypeDerived,typename LastType, typename IncrType>
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+    ArithmeticSequence<FirstTypeDerived,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
+                                                                                   symbolic::ValueExpr<> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                              symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+
+template<typename FirstType,typename LastTypeDerived, typename IncrType>
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                               symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+
+template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
+ArithmeticSequence<FirstTypeDerived,
+                    symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
+                                                                               symbolic::NegateExpr<FirstTypeDerived> >,
+                                                             symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                          symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                    typename internal::cleanup_seq_incr<IncrType>::type>
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+#endif // EIGEN_HAS_CXX11
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+
+#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
+/** \cpp11
+  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+  *
+  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+  * 
+  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename SizeType,typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
+{
+  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
+}
+
+/** \cpp11
+  * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+  *
+  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
+  * 
+  * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
+template<typename SizeType>
+auto lastN(SizeType size)
+-> decltype(seqN(Eigen::last+fix<1>()-size, size))
+{
+  return seqN(Eigen::last+fix<1>()-size, size);
+}
+#endif
+
+namespace internal {
+
+// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
+template<typename T>
+struct make_size_type {
+  typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
+struct IndexedViewCompatibleType<ArithmeticSequence<FirstType,SizeType,IncrType>, XprSize> {
+  typedef ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType> type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>
+makeIndexedViewCompatible(const ArithmeticSequence<FirstType,SizeType,IncrType>& ids, Index size,SpecializedType) {
+  return ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>(
+            eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject());
+}
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
+  enum { value = get_fixed_value<IncrType,DynamicIndex>::value };
+};
+
+} // end namespace internal
+
+/** \namespace Eigen::indexing
+  * \ingroup Core_Module
+  * 
+  * The sole purpose of this namespace is to be able to import all functions
+  * and symbols that are expected to be used within operator() for indexing
+  * and slicing. If you already imported the whole Eigen namespace:
+  * \code using namespace Eigen; \endcode
+  * then you are already all set. Otherwise, if you don't want/cannot import
+  * the whole Eigen namespace, the following line:
+  * \code using namespace Eigen::indexing; \endcode
+  * is equivalent to:
+  * \code
+  using Eigen::all;
+  using Eigen::seq;
+  using Eigen::seqN;
+  using Eigen::lastN; // c++11 only
+  using Eigen::last;
+  using Eigen::lastp1;
+  using Eigen::fix;
+  \endcode
+  */
+namespace indexing {
+  using Eigen::all;
+  using Eigen::seq;
+  using Eigen::seqN;
+  #if EIGEN_HAS_CXX11
+  using Eigen::lastN;
+  #endif
+  using Eigen::last;
+  using Eigen::lastp1;
+  using Eigen::fix;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARITHMETIC_SEQUENCE_H

diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 28d6f14..20c789b 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h

@@ -12,7 +12,16 @@
 
 namespace Eigen {
 
-/** \class Array 
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef ArrayXpr XprKind;
+  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
+};
+}
+
+/** \class Array
   * \ingroup Core_Module
   *
   * \brief General-purpose arrays with easy API for coefficient-wise operations
@@ -24,20 +33,14 @@
   * API for the %Matrix class provides easy access to linear-algebra
   * operations.
   *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  * See documentation of class Matrix for detailed information on the template parameters
+  * storage layout.
   *
-  * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  *
+  * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
   */
-namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef ArrayXpr XprKind;
-  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
-};
-}
-
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 class Array
   : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
@@ -75,6 +78,21 @@
       return Base::operator=(other);
     }
 
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill()
+      */
+    /* This overload is needed because the usage of
+      *   using Base::operator=;
+      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+      * the usage of 'using'. This should be done only for operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const Scalar &value)
+    {
+      Base::setConstant(value);
+      return *this;
+    }
+
     /** Copies the value of the expression \a other into \c *this with automatic resizing.
       *
       * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
@@ -86,7 +104,7 @@
       */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
+    EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other)
     {
       return Base::_set(other);
     }
@@ -129,21 +147,59 @@
     }
 #endif
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    Array(Array&& other)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
     }
-    Array& operator=(Array&& other)
+    EIGEN_DEVICE_FUNC
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
-      other.swap(*this);
+      Base::operator=(std::move(other));
       return *this;
     }
 #endif
 
+    #if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+     *
+     * Example: \include Array_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Array_variadic_ctor_cxx11.out
+     *
+     * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+     * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      *
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      *
+      * Example: \include Array_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_23_cxx11.out
+      *
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      *
+      * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+      * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+      *
+      * Example: \include Array_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+      *
+      * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+      * and implicit transposition is allowed for compile-time 1D arrays only.
+      *
+      * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    #endif // end EIGEN_HAS_CXX11
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename T>
@@ -161,6 +217,7 @@
       Base::_check_template_params();
       this->template _init2<T0,T1>(val0, val1);
     }
+
     #else
     /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -172,7 +229,8 @@
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE explicit Array(Index dim);
-    /** constructs an initialized 1x1 Array with the given coefficient */
+    /** constructs an initialized 1x1 Array with the given coefficient
+      * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
     Array(const Scalar& value);
     /** constructs an uninitialized array with \a rows rows and \a cols columns.
       *
@@ -180,11 +238,14 @@
       * it is redundant to pass these parameters, so one should use the default constructor
       * Array() instead. */
     Array(Index rows, Index cols);
-    /** constructs an initialized 2D vector with given coefficients */
+    /** constructs an initialized 2D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
     Array(const Scalar& val0, const Scalar& val1);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
 
-    /** constructs an initialized 3D vector with given coefficients */
+    /** constructs an initialized 3D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
     {
@@ -194,7 +255,9 @@
       m_storage.data()[1] = val1;
       m_storage.data()[2] = val2;
     }
-    /** constructs an initialized 4D vector with given coefficients */
+    /** constructs an initialized 4D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
     {
@@ -206,53 +269,29 @@
       m_storage.data()[3] = val3;
     }
 
-    /** Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
     /** Copy constructor */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Array& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+            : Base(other)
+    { }
+
+  private:
+    struct PrivateType {};
+  public:
 
     /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      *this = other;
-    }
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+                                                           PrivateType>::type = PrivateType())
+      : Base(other.derived())
+    { }
 
-    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(ArrayBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
-
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
 
     #ifdef EIGEN_ARRAY_PLUGIN
     #include EIGEN_ARRAY_PLUGIN
@@ -267,7 +306,7 @@
 /** \defgroup arraytypedefs Global array typedefs
   * \ingroup Core_Module
   *
-  * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+  * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
   *
   * The general patterns are the following:
   *
@@ -280,6 +319,12 @@
   * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
   * a fixed-size 1D array of 4 complex floats.
   *
+  * With \cpp11, template alias are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
+  *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
+  *
   * \sa class Array
   */
 
@@ -312,8 +357,42 @@
 
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
 
-#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)               \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>;    \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix = Array<Type, Size, 1>;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)                     \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##Size##X = Array<Type, Size, Dynamic>;                \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##X##Size = Array<Type, Dynamic, Size>;
+
+EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
+EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
+EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
+EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
 
 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
 using Eigen::Matrix##SizeSuffix##TypeSuffix; \

diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index e9efd31..ea3dd1c 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h

@@ -32,7 +32,7 @@
   * \tparam Derived is the derived type, e.g., an array or an expression type.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
   *
   * \sa class MatrixBase, \ref TopicClassHierarchy
   */
@@ -46,11 +46,7 @@
 
     typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
 
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
-
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -64,8 +60,7 @@
     using Base::MaxSizeAtCompileTime;
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
-    using Base::CoeffReadCost;
-
+    
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -74,6 +69,7 @@
     using Base::coeff;
     using Base::coeffRef;
     using Base::lazyAssign;
+    using Base::operator-;
     using Base::operator=;
     using Base::operator+=;
     using Base::operator-=;
@@ -85,26 +81,14 @@
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
-      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
-      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
-      * PlainObject or const PlainObject&.
-      */
-    typedef Array<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
-
+    typedef typename Base::PlainObject PlainObject;
 
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
@@ -114,34 +98,42 @@
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const ArrayBase& other)
     {
-      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+      internal::call_assignment(derived(), other.derived());
+      return derived();
     }
+    
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const Scalar &value)
+    { Base::setConstant(value); return derived(); }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator+=(const Scalar& scalar);
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator-=(const Scalar& scalar);
 
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator+=(const ArrayBase<OtherDerived>& other);
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator-=(const ArrayBase<OtherDerived>& other);
 
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator*=(const ArrayBase<OtherDerived>& other);
 
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator/=(const ArrayBase<OtherDerived>& other);
 
   public:
@@ -153,16 +145,16 @@
     /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
       * \sa MatrixBase::array() */
     EIGEN_DEVICE_FUNC
-    MatrixWrapper<Derived> matrix() { return derived(); }
+    MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
     EIGEN_DEVICE_FUNC
-    const MatrixWrapper<const Derived> matrix() const { return derived(); }
+    const MatrixWrapper<const Derived> matrix() const { return MatrixWrapper<const Derived>(derived()); }
 
 //     template<typename Dest>
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }
 
   protected:
-    EIGEN_DEVICE_FUNC
-    ArrayBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
 
   private:
     explicit ArrayBase(Index);
@@ -186,8 +178,7 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -200,8 +191,7 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -214,8 +204,7 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -228,8 +217,7 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 

diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 4bb6480..2e9555b 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_ARRAYWRAPPER_H
 #define EIGEN_ARRAYWRAPPER_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class ArrayWrapper
   * \ingroup Core_Module
@@ -29,6 +29,12 @@
   : public traits<typename remove_all<typename ExpressionType::Nested>::type >
 {
   typedef ArrayXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
+  };
 };
 }
 
@@ -39,6 +45,7 @@
     typedef ArrayBase<ArrayWrapper> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
 
     typedef typename internal::conditional<
                        internal::is_lvalue<ExpressionType>::value,
@@ -46,92 +53,46 @@
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+    using Base::coeffRef;
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
-
-    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
     EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return m_expression.data(); }
 
     EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
+      return m_expression.coeffRef(rowId, colId);
     }
 
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+      return m_expression.coeffRef(index);
     }
 
     template<typename Dest>
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
-    const typename internal::remove_all<NestedExpressionType>::type& 
     EIGEN_DEVICE_FUNC
-    nestedExpression() const 
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
     {
       return m_expression;
     }
@@ -139,11 +100,11 @@
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index)  */
     EIGEN_DEVICE_FUNC
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    void resize(Index newSize) { m_expression.resize(newSize); }
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index,Index)*/
     EIGEN_DEVICE_FUNC
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
 
   protected:
     NestedExpressionType m_expression;
@@ -166,6 +127,12 @@
  : public traits<typename remove_all<typename ExpressionType::Nested>::type >
 {
   typedef MatrixXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
+  };
 };
 }
 
@@ -176,6 +143,7 @@
     typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
 
     typedef typename internal::conditional<
                        internal::is_lvalue<ExpressionType>::value,
@@ -183,88 +151,42 @@
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+    using Base::coeffRef;
 
     EIGEN_DEVICE_FUNC
-    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
-
-    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
     EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return m_expression.data(); }
 
     EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return m_expression.derived().coeffRef(rowId, colId);
     }
 
     EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+      return m_expression.coeffRef(index);
     }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<NestedExpressionType>::type& 
-    nestedExpression() const 
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
     {
       return m_expression;
     }
@@ -272,11 +194,11 @@
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index)  */
     EIGEN_DEVICE_FUNC
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+    void resize(Index newSize) { m_expression.resize(newSize); }
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index,Index)*/
     EIGEN_DEVICE_FUNC
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
 
   protected:
     NestedExpressionType m_expression;

diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 0032aad..655412e 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h

@@ -14,489 +14,9 @@
 
 namespace Eigen {
 
-namespace internal {
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for traversal and unrolling       *
-***************************************************************************/
-
-template <typename Derived, typename OtherDerived>
-struct assign_traits
-{
-public:
-  enum {
-    DstIsAligned = Derived::Flags & AlignedBit,
-    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
-    SrcIsAligned = OtherDerived::Flags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
-  };
-
-private:
-  enum {
-    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
-              : int(Derived::RowsAtCompileTime),
-    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
-              : int(Derived::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Derived::Scalar>::size
-  };
-
-  enum {
-    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
-    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
-  };
-
-public:
-  enum {
-    Traversal = int(MayInnerVectorize)  ? int(InnerVectorizedTraversal)
-              : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
-              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
-              : int(MayLinearize)       ? int(LinearTraversal)
-                                        : int(DefaultTraversal),
-    Vectorized = int(Traversal) == InnerVectorizedTraversal
-              || int(Traversal) == LinearVectorizedTraversal
-              || int(Traversal) == SliceVectorizedTraversal
-  };
-
-private:
-  enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
-    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
-    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
-  };
-
-public:
-  enum {
-    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
-                ? (
-                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
-                  : int(MayUnrollInner)      ? int(InnerUnrolling)
-                                             : int(NoUnrolling)
-                  )
-              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(NoUnrolling)
-  };
-
-#ifdef EIGEN_DEBUG_ASSIGN
-  static void debug()
-  {
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
-    EIGEN_DEBUG_VAR(JointAlignment)
-    EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
-    EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
-    EIGEN_DEBUG_VAR(InnerSize)
-    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
-    EIGEN_DEBUG_VAR(StorageOrdersAgree)
-    EIGEN_DEBUG_VAR(MightVectorize)
-    EIGEN_DEBUG_VAR(MayLinearize)
-    EIGEN_DEBUG_VAR(MayInnerVectorize)
-    EIGEN_DEBUG_VAR(MayLinearVectorize)
-    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
-    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(MayUnrollCompletely)
-    EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
-  }
-#endif
-};
-
-/***************************************************************************
-* Part 2 : meta-unrollers
-***************************************************************************/
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime
-  };
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeffByOuterInner(outer, inner, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling
-{
-  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.copyCoeffByOuterInner(outer, Index, src);
-    assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling
-{
-  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeff(Index, src);
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime,
-    JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
-    assign_innervec_CompleteUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
-    assign_innervec_InnerUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived1, typename Derived2,
-         int Traversal = assign_traits<Derived1, Derived2>::Traversal,
-         int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
-         int Version = Specialized>
-struct assign_impl;
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Unrolling, int Version>
-struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
-{
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &, const Derived2 &) { }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC 
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
-{
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    for(Index i = 0; i < size; ++i)
-      dst.copyCoeff(i, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
-{
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***************************
-*** Linear vectorization ***
-***************************/
-
-template <bool IsAligned = false>
-struct unaligned_assign_impl
-{
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
-};
-
-template <>
-struct unaligned_assign_impl<false>
-{
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-#ifdef _MSC_VER
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#else
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#endif
-  {
-    for (typename Derived::Index index = start; index < end; ++index)
-      dst.copyCoeff(index, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                             : internal::first_aligned(&dst.coeffRef(0), size);
-    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
-
-    unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
-
-    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-    {
-      dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
-    }
-
-    unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    enum { size = Derived1::SizeAtCompileTime,
-           packetSize = packet_traits<typename Derived1::Scalar>::size,
-           alignedSize = (size/packetSize)*packetSize };
-
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
-  }
-};
-
-/**************************
-*** Slice vectorization ***
-***************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstAlignment = alignable ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Index packetAlignedMask = packetSize - 1;
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || assign_traits<Derived1,Derived2>::DstIsAligned) ? 0
-                       : internal::first_aligned(&dst.coeffRef(0,0), innerSize);
-
-    for(Index outer = 0; outer < outerSize; ++outer)
-    {
-      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
-      // do the non-vectorizable part of the assignment
-      for(Index inner = 0; inner<alignedStart ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      // do the vectorizable part of the assignment
-      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
-
-      // do the non-vectorizable part of the assignment
-      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
-    }
-  }
-};
-
-} // end namespace internal
-
-/***************************************************************************
-* Part 4 : implementation of DenseBase methods
-***************************************************************************/
-
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
   ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
   enum{
@@ -507,91 +27,35 @@
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
   EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-#ifdef EIGEN_TEST_EVALUATORS
-  
-#ifdef EIGEN_DEBUG_ASSIGN
-  internal::copy_using_evaluator_traits<Derived, OtherDerived>::debug();
-#endif
   eigen_assert(rows() == other.rows() && cols() == other.cols());
-  internal::call_dense_assignment_loop(derived(),other.derived());
+  internal::call_assignment_no_alias(derived(),other.derived());
   
-#else // EIGEN_TEST_EVALUATORS
-
-#ifdef EIGEN_DEBUG_ASSIGN
-  internal::assign_traits<Derived, OtherDerived>::debug();
-#endif
-  eigen_assert(rows() == other.rows() && cols() == other.cols());
-  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
-                                                             : int(InvalidTraversal)>::run(derived(),other.derived());
-  
-#endif // EIGEN_TEST_EVALUATORS
-  
-#ifndef EIGEN_NO_DEBUG
-  checkTransposeAliasing(other.derived());
-#endif
   return derived();
 }
 
-namespace internal {
-
-template<typename Derived, typename OtherDerived,
-         bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
-         bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
-                              |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                  // revert to || as soon as not needed anymore.
-                                  (int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
-                              && int(Derived::SizeAtCompileTime) != 1>
-struct assign_selector;
-
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,false> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,false> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,true> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,true> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
-};
-
-} // end namespace internal
-
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
 {
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
 {
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
@@ -599,7 +63,8 @@
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
@@ -607,7 +72,8 @@
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
@@ -615,7 +81,8 @@
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+  other.derived().evalTo(derived());
+  return derived();
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index b1e304e..7d76f0c 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2011-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -17,58 +17,90 @@
 // This implementation is based on Assign.h
 
 namespace internal {
-  
+
 /***************************************************************************
 * Part 1 : the logic deciding a strategy for traversal and unrolling       *
 ***************************************************************************/
 
 // copy_using_evaluator_traits is based on assign_traits
 
-template <typename Derived, typename OtherDerived>
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
 struct copy_using_evaluator_traits
 {
+  typedef typename DstEvaluator::XprType Dst;
+  typedef typename Dst::Scalar DstScalar;
+
+  enum {
+    DstFlags = DstEvaluator::Flags,
+    SrcFlags = SrcEvaluator::Flags
+  };
+
 public:
   enum {
-    DstIsAligned = Derived::Flags & AlignedBit,
-    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
-    SrcIsAligned = OtherDerived::Flags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned,
-    SrcEvalBeforeAssign = (evaluator_traits<OtherDerived>::HasEvalTo == 1)
+    DstAlignment = DstEvaluator::Alignment,
+    SrcAlignment = SrcEvaluator::Alignment,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
+    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
   };
 
 private:
   enum {
-    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
-              : int(Derived::RowsAtCompileTime),
-    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
-              : int(Derived::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Derived::Scalar>::size
+    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
+              : int(Dst::RowsAtCompileTime),
+    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+              : int(Dst::MaxRowsAtCompileTime),
+    RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize),
+    RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize),
+    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
+    MaxSizeAtCompileTime = Dst::SizeAtCompileTime
   };
 
+  // TODO distinguish between linear traversal and inner-traversals
+  typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
+  typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
+
   enum {
-    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
-    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
+    LinearPacketSize = unpacket_traits<LinearPacketType>::size,
+    InnerPacketSize = unpacket_traits<InnerPacketType>::size
   };
 
 public:
   enum {
-    Traversal = int(SrcEvalBeforeAssign) ? int(AllAtOnceTraversal) 
+    LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
+    InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
+  };
+
+private:
+  enum {
+    DstIsRowMajor = DstFlags&RowMajorBit,
+    SrcIsRowMajor = SrcFlags&RowMajorBit,
+    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
+    MightVectorize = bool(StorageOrdersAgree)
+                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
+                  && bool(functor_traits<AssignFunc>::PacketAccess),
+    MayInnerVectorize  = MightVectorize
+                       && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
+                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
+                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
+    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
+                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix
+         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
+  };
+
+public:
+  enum {
+    Traversal =  int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time.
+              : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
               : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
               : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
               : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
@@ -79,15 +111,18 @@
               || int(Traversal) == SliceVectorizedTraversal
   };
 
+  typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
+
 private:
   enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
-    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
+    ActualPacketSize    = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
+                        : Vectorized ? InnerPacketSize
+                        : 1,
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
+    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
+                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
     MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
+                       && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
   };
 
 public:
@@ -99,34 +134,54 @@
                                              : int(NoUnrolling)
                   )
               : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) 
-                                                                    : int(NoUnrolling) )
+                ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
+                          ? int(CompleteUnrolling)
+                          : int(NoUnrolling) )
               : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
                                               : int(NoUnrolling) )
+#if EIGEN_UNALIGNED_VECTORIZE
+              : int(Traversal) == int(SliceVectorizedTraversal)
+                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
+                                         : int(NoUnrolling) )
+#endif
               : int(NoUnrolling)
   };
 
 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
+    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
+    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(LinearRequiredAlignment)
+    EIGEN_DEBUG_VAR(InnerRequiredAlignment)
     EIGEN_DEBUG_VAR(JointAlignment)
     EIGEN_DEBUG_VAR(InnerSize)
     EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(LinearPacketSize)
+    EIGEN_DEBUG_VAR(InnerPacketSize)
+    EIGEN_DEBUG_VAR(ActualPacketSize)
     EIGEN_DEBUG_VAR(StorageOrdersAgree)
     EIGEN_DEBUG_VAR(MightVectorize)
     EIGEN_DEBUG_VAR(MayLinearize)
     EIGEN_DEBUG_VAR(MayInnerVectorize)
     EIGEN_DEBUG_VAR(MayLinearVectorize)
     EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
     EIGEN_DEBUG_VAR(UnrollingLimit)
     EIGEN_DEBUG_VAR(MayUnrollCompletely)
     EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
   }
 #endif
 };
@@ -142,15 +197,16 @@
 template<typename Kernel, int Index, int Stop>
 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
 {
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
   typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
   enum {
     outer = Index / DstXprType::InnerSizeAtCompileTime,
     inner = Index % DstXprType::InnerSizeAtCompileTime
   };
 
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     kernel.assignCoeffByOuterInner(outer, inner);
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
@@ -160,23 +216,23 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
-template<typename Kernel, int Index, int Stop>
+template<typename Kernel, int Index_, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
   {
-    kernel.assignCoeffByOuterInner(outer, Index);
-    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index+1, Stop>::run(kernel, outer);
+    kernel.assignCoeffByOuterInner(outer, Index_);
+    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
   }
 };
 
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&, int) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
 };
 
 /***********************
@@ -186,7 +242,7 @@
 template<typename Kernel, int Index, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
 {
-  static EIGEN_STRONG_INLINE void run(Kernel& kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
   {
     kernel.assignCoeff(Index);
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
@@ -196,7 +252,7 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
 /**************************
@@ -206,19 +262,22 @@
 template<typename Kernel, int Index, int Stop>
 struct copy_using_evaluator_innervec_CompleteUnrolling
 {
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
   typedef typename DstEvaluatorType::XprType DstXprType;
+  typedef typename Kernel::PacketType PacketType;
 
   enum {
     outer = Index / DstXprType::InnerSizeAtCompileTime,
     inner = Index % DstXprType::InnerSizeAtCompileTime,
-    JointAlignment = Kernel::AssignmentTraits::JointAlignment
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
   };
 
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
-    enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
   }
 };
@@ -226,25 +285,25 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
-template<typename Kernel, int Index, int Stop>
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
+  typedef typename Kernel::PacketType PacketType;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
   {
-    kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index);
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
-    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
+    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
   }
 };
 
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &, int) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
 };
 
 /***************************************************************************
@@ -259,16 +318,30 @@
 struct dense_assignment_loop;
 
 /************************
+***** Special Cases *****
+************************/
+
+// Zero-sized assignment is a no-op.
+template<typename Kernel, int Unrolling>
+struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling>
+{
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0,
+      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
+  }
+};
+
+/************************
 *** Default traversal ***
 ************************/
 
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
 {
-  static void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
   {
-    typedef typename Kernel::Index Index;
-    
     for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
       for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
         kernel.assignCoeffByOuterInner(outer, inner);
@@ -280,7 +353,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -290,8 +363,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
 {
-  typedef typename Kernel::Index Index;
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
 
@@ -314,7 +386,7 @@
 {
   // if IsAligned = true, then do nothing
   template <typename Kernel>
-  static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
 };
 
 template <>
@@ -326,16 +398,16 @@
 #if EIGEN_COMP_MSVC
   template <typename Kernel>
   static EIGEN_DONT_INLINE void run(Kernel &kernel,
-                                    typename Kernel::Index start,
-                                    typename Kernel::Index end)
+                                    Index start,
+                                    Index end)
 #else
   template <typename Kernel>
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel,
-                                      typename Kernel::Index start,
-                                      typename Kernel::Index end)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
+                                      Index start,
+                                      Index end)
 #endif
   {
-    for (typename Kernel::Index index = start; index < end; ++index)
+    for (Index index = start; index < end; ++index)
       kernel.assignCoeff(index);
   }
 };
@@ -343,25 +415,26 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    typedef typename Kernel::Index Index;
-
     const Index size = kernel.size();
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
     enum {
-      packetSize = PacketTraits::size,
-      dstIsAligned = int(Kernel::AssignmentTraits::DstIsAligned),
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : dstIsAligned,
+      requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
+      packetSize = unpacket_traits<PacketType>::size,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
+                                                            : int(Kernel::AssignmentTraits::DstAlignment),
       srcAlignment = Kernel::AssignmentTraits::JointAlignment
     };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
     const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
 
     unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
 
     for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-      kernel.template assignPacket<dstAlignment, srcAlignment>(index);
+      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
 
     unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
   }
@@ -370,14 +443,14 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Kernel::Index Index;
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    
+    typedef typename Kernel::PacketType PacketType;
+
     enum { size = DstXprType::SizeAtCompileTime,
-           packetSize = packet_traits<typename Kernel::Scalar>::size,
-           alignedSize = (size/packetSize)*packetSize };
+           packetSize =unpacket_traits<PacketType>::size,
+           alignedSize = (int(size)/packetSize)*packetSize };
 
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
@@ -391,23 +464,26 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
 {
-  static inline void run(Kernel &kernel)
+  typedef typename Kernel::PacketType PacketType;
+  enum {
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
+  };
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    typedef typename Kernel::Index Index;
-
     const Index innerSize = kernel.innerSize();
     const Index outerSize = kernel.outerSize();
-    const Index packetSize = packet_traits<typename Kernel::Scalar>::size;
+    const Index packetSize = unpacket_traits<PacketType>::size;
     for(Index outer = 0; outer < outerSize; ++outer)
       for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, inner);
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
   }
 };
 
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -417,13 +493,14 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
 {
-  typedef typename Kernel::Index Index;
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::AssignmentTraits Traits;
     const Index outerSize = kernel.outerSize();
     for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
+                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
   }
 };
 
@@ -434,9 +511,8 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
 {
-  static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    typedef typename Kernel::Index Index;
     const Index size = kernel.size();
     for(Index i = 0; i < size; ++i)
       kernel.assignCoeff(i);
@@ -446,7 +522,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -460,21 +536,29 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
-  static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    typedef typename Kernel::Index Index;
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
     enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstAlignment = alignable ? Aligned : int(Kernel::AssignmentTraits::DstIsAligned)
+      packetSize = unpacket_traits<PacketType>::size,
+      requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
+      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = alignable ? int(requestedAlignment)
+                               : int(Kernel::AssignmentTraits::DstAlignment)
     };
+    const Scalar *dst_ptr = kernel.dstDataPtr();
+    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
+    {
+      // the pointer is not aligned-on scalar, so alignment is not possible
+      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
+    }
     const Index packetAlignedMask = packetSize - 1;
     const Index innerSize = kernel.innerSize();
     const Index outerSize = kernel.outerSize();
     const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || Kernel::AssignmentTraits::DstIsAligned) ? 0
-                       : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0,0), innerSize);
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
 
     for(Index outer = 0; outer < outerSize; ++outer)
     {
@@ -485,36 +569,43 @@
 
       // do the vectorizable part of the assignment
       for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned>(outer, inner);
+        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
 
       // do the non-vectorizable part of the assignment
       for(Index inner = alignedEnd; inner<innerSize ; ++inner)
         kernel.assignCoeffByOuterInner(outer, inner);
 
-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
+      alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
     }
   }
 };
 
-/****************************
-*** All-at-once traversal ***
-****************************/
-
-// TODO: this 'AllAtOnceTraversal' should be dropped or caught earlier (Gael)
-// Indeed, what to do with the kernel's functor??
+#if EIGEN_UNALIGNED_VECTORIZE
 template<typename Kernel>
-struct dense_assignment_loop<Kernel, AllAtOnceTraversal, NoUnrolling>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
 {
-  static inline void run(Kernel & kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    // Evaluate rhs in temporary to prevent aliasing problems in a = a * a;
-    // TODO: Do not pass the xpr object to evalTo() (Jitse)
-    kernel.srcEvaluator().evalTo(kernel.dstEvaluator(), kernel.dstExpression());
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { innerSize = DstXprType::InnerSizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize),
+           size = DstXprType::SizeAtCompileTime };
+
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
+    {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer);
+    }
   }
 };
+#endif
+
 
 /***************************************************************************
-* Part 4 : Generic Assignment routine
+* Part 4 : Generic dense assignment kernel
 ***************************************************************************/
 
 // This class generalize the assignment of a coefficient (or packet) from one dense evaluator
@@ -523,92 +614,104 @@
 // This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
 // One can customize the assignment using this generic dense_assignment_kernel with different
 // functors, or by completely overloading it, by-passing a functor.
-template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
 class generic_dense_assignment_kernel
 {
 protected:
   typedef typename DstEvaluatorTypeT::XprType DstXprType;
   typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
 public:
-  
+
   typedef DstEvaluatorTypeT DstEvaluatorType;
   typedef SrcEvaluatorTypeT SrcEvaluatorType;
   typedef typename DstEvaluatorType::Scalar Scalar;
-  typedef typename DstEvaluatorType::Index Index;
-  typedef copy_using_evaluator_traits<DstXprType, SrcXprType> AssignmentTraits;
-  
-  
+  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
-  {}
-  
-  Index size() const        { return m_dstExpr.size(); }
-  Index innerSize() const   { return m_dstExpr.innerSize(); }
-  Index outerSize() const   { return m_dstExpr.outerSize(); }
-  Index outerStride() const { return m_dstExpr.outerStride(); }
-  
-  // TODO get rid of this one:
-  DstXprType& dstExpression() const { return m_dstExpr; }
-  
-  DstEvaluatorType& dstEvaluator() { return m_dst; }
-  const SrcEvaluatorType& srcEvaluator() const { return m_src; }
-  
-  void assignCoeff(Index row, Index col)
+  {
+    #ifdef EIGEN_DEBUG_ASSIGN
+    AssignmentTraits::debug();
+    #endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
+
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
+
+  /// Assign src(row,col) to dst(row,col) through the assignment functor.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
   {
     m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
   }
-  
-  void assignCoeff(Index index)
+
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
   {
     m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
   }
-  
-  void assignCoeffByOuterInner(Index outer, Index inner)
+
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
   {
-    Index row = rowIndexByOuterInner(outer, inner); 
-    Index col = colIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
     assignCoeff(row, col);
   }
-  
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacket(Index row, Index col)
+
+
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
   {
-    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
   }
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacket(Index index)
+
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
   {
-    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
   }
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacketByOuterInner(Index outer, Index inner)
+
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
   {
-    Index row = rowIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
     Index col = colIndexByOuterInner(outer, inner);
-    assignPacket<StoreMode,LoadMode>(row, col);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
   }
-  
-  static Index rowIndexByOuterInner(Index outer, Index inner)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
   {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::RowsAtCompileTime) == 1 ? 0
       : int(Traits::ColsAtCompileTime) == 1 ? inner
-      : int(Traits::Flags)&RowMajorBit ? outer
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
       : inner;
   }
 
-  static Index colIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
   {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::ColsAtCompileTime) == 1 ? 0
       : int(Traits::RowsAtCompileTime) == 1 ? inner
-      : int(Traits::Flags)&RowMajorBit ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
       : outer;
   }
-  
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
+  {
+    return m_dstExpr.data();
+  }
+
 protected:
   DstEvaluatorType& m_dst;
   const SrcEvaluatorType& m_src;
@@ -617,223 +720,288 @@
   DstXprType& m_dstExpr;
 };
 
-template<typename DstXprType, typename SrcXprType, typename Functor>
-void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
+// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
+// when computing the product.
+
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
 {
-#ifdef EIGEN_DEBUG_ASSIGN
-  // TODO these traits should be computed from information provided by the evaluators
-  internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
-#endif
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+ public:
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::DstXprType DstXprType;
+    typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
+    typedef typename AssignmentTraits::PacketType PacketType;
+
+    EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {
+  }
+ };
+
+/***************************************************************************
+* Part 5 : Entry point for dense rectangular assignment
+***************************************************************************/
+
+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-  
-  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
-  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
 
   DstEvaluatorType dstEvaluator(dst);
-  SrcEvaluatorType srcEvaluator(src);
-    
+
   typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
   dense_assignment_loop<Kernel>::run(kernel);
 }
 
-template<typename DstXprType, typename SrcXprType>
-void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+// Specialization for filling the destination with a constant value.
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template<typename DstXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp<Eigen::internal::scalar_constant_op<typename DstXprType::Scalar>, DstXprType>& src, const internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>& func)
 {
-  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+  resize_if_allowed(dst, src, func);
+  std::fill_n(dst.data(), dst.size(), src.functor()());
+}
+#endif
+
+template<typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
+{
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
 }
 
 /***************************************************************************
-* Part 5 : Entry points
+* Part 6 : Generic assignment
 ***************************************************************************/
 
-// Based on DenseBase::LazyAssign()
-// The following functions are just for testing and they are meant to be moved to operator= and the likes.
+// Based on the respective shapes of the destination and source,
+// the class AssignmentKind determine the kind of assignment mechanism.
+// AssignmentKind must define a Kind typedef.
+template<typename DstShape, typename SrcShape> struct AssignmentKind;
 
-template<typename DstXprType, template <typename> class StorageBase, typename SrcXprType>
-EIGEN_STRONG_INLINE
-const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst, 
-                                       const EigenBase<SrcXprType>& src)
+// Assignment kind defined in this file:
+struct Dense2Dense {};
+struct EigenBase2EigenBase {};
+
+template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
+template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
+
+// This is the main assignment class
+template< typename DstXprType, typename SrcXprType, typename Functor,
+          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
+          typename EnableIf = void>
+struct Assignment;
+
+
+// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
+// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated.
+// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
+// does not has to bother about these annoying details.
+
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src)
 {
-  return noalias_copy_using_evaluator(dst.expression(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(const Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 
-template<typename XprType, int AssumeAliasing = evaluator_traits<XprType>::AssumeAliasing>
-struct AddEvalIfAssumingAliasing;
-
-template<typename XprType>
-struct AddEvalIfAssumingAliasing<XprType, 0>
+// Deal with "assume-aliasing"
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
 {
-  static const XprType& run(const XprType& xpr) 
+  typename plain_matrix_type<Src>::type tmp(src);
+  call_assignment_no_alias(dst, tmp, func);
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+{
+  call_assignment_no_alias(dst, src, func);
+}
+
+// by-pass "assume-aliasing"
+// When there is no aliasing, we require that 'dst' has been properly resized
+template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+{
+  call_assignment_no_alias(dst.expression(), src, func);
+}
+
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+  enum {
+    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
+                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
+                      ) && int(Dst::SizeAtCompileTime) != 1
+  };
+
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
+  ActualDstType actualDst(dst);
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
+
+  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+    typedef evaluator<Dst> DstEvaluatorType;
+    typedef evaluator<Src> SrcEvaluatorType;
+    typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
+
+    EIGEN_STATIC_ASSERT_LVALUE(Dst)
+    EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+    SrcEvaluatorType srcEvaluator(src);
+    resize_if_allowed(dst, src, func);
+
+    DstEvaluatorType dstEvaluator(dst);
+    Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+    dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+{
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+  Assignment<Dst,Src,Func>::run(dst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+// forward declaration
+template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
+
+// Generic Dense to Dense assignment
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
-    return xpr;
-  }
-};
-
-template<typename XprType>
-struct AddEvalIfAssumingAliasing<XprType, 1>
-{
-  static const EvalToTemp<XprType> run(const XprType& xpr)
-  {
-    return EvalToTemp<XprType>(xpr);
-  }
-};
-
-template<typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_STRONG_INLINE
-const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
-{
-  return noalias_copy_using_evaluator(dst.const_cast_derived(), 
-                                      AddEvalIfAssumingAliasing<SrcXprType>::run(src.derived()),
-                                      func
-                                     );
-}
-
-// this mimics operator=
-template<typename DstXprType, typename SrcXprType>
-EIGEN_STRONG_INLINE
-const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src)
-{
-  return copy_using_evaluator(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
-}
-
-template<typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_STRONG_INLINE
-const DstXprType& noalias_copy_using_evaluator(const PlainObjectBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
-{
-#ifdef EIGEN_DEBUG_ASSIGN
-  internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
 #endif
-#ifdef EIGEN_NO_AUTOMATIC_RESIZING
-  eigen_assert((dst.size()==0 || (IsVectorAtCompileTime ? (dst.size() == src.size())
-                                                        : (dst.rows() == src.rows() && dst.cols() == src.cols())))
-              && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
-#else
-  dst.const_cast_derived().resizeLike(src.derived());
-#endif
-  call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
-  return dst.derived();
-}
 
-template<typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_STRONG_INLINE
-const DstXprType& noalias_copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
-{
-  call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
-  return dst.derived();
-}
-
-// Based on DenseBase::swap()
-// TODO: Check whether we need to do something special for swapping two
-//       Arrays or Matrices. (Jitse)
-
-// Overload default assignPacket behavior for swapping them
-template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
-class swap_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> >
-{
-  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> > Base;
-  typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
-  using Base::m_dst;
-  using Base::m_src;
-  using Base::m_functor;
-  
-public:
-  typedef typename Base::Scalar Scalar;
-  typedef typename Base::Index Index;
-  typedef typename Base::DstXprType DstXprType;
-  
-  swap_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, DstXprType& dstExpr)
-    : Base(dst, src, swap_assign_op<Scalar>(), dstExpr)
-  {}
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacket(Index row, Index col)
-  {
-    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
-  }
-  
-  template<int StoreMode, int LoadMode>
-  void assignPacket(Index index)
-  {
-    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
-  }
-  
-  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
-  template<int StoreMode, int LoadMode>
-  void assignPacketByOuterInner(Index outer, Index inner)
-  {
-    Index row = Base::rowIndexByOuterInner(outer, inner); 
-    Index col = Base::colIndexByOuterInner(outer, inner);
-    assignPacket<StoreMode,LoadMode>(row, col);
+    call_dense_assignment_loop(dst, src, func);
   }
 };
-  
-template<typename DstXprType, typename SrcXprType>
-void swap_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+
+// Generic assignment through evalTo.
+// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
-  // TODO there is too much redundancy with call_dense_assignment_loop
-  
-  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-  
-  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
-  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
 
-  DstEvaluatorType dstEvaluator(dst);
-  SrcEvaluatorType srcEvaluator(src);
-    
-  typedef swap_kernel<DstEvaluatorType,SrcEvaluatorType> Kernel;
-  Kernel kernel(dstEvaluator, srcEvaluator, dst.const_cast_derived());
-  
-  dense_assignment_loop<Kernel>::run(kernel);
-}
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.evalTo(dst);
+  }
 
-// Based on MatrixBase::operator+= (in CwiseBinaryOp.h)
-template<typename DstXprType, typename SrcXprType>
-void add_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
-{
-  typedef typename DstXprType::Scalar Scalar;
-  copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
-}
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
+  //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
 
-// Based on ArrayBase::operator+=
-template<typename DstXprType, typename SrcXprType>
-void add_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
-{
-  typedef typename DstXprType::Scalar Scalar;
-  copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
-}
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
 
-// TODO: Add add_assign_using_evaluator for EigenBase ? (Jitse)
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
 
-template<typename DstXprType, typename SrcXprType>
-void subtract_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
-{
-  typedef typename DstXprType::Scalar Scalar;
-  copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
-}
-
-template<typename DstXprType, typename SrcXprType>
-void subtract_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
-{
-  typedef typename DstXprType::Scalar Scalar;
-  copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
-}
-
-template<typename DstXprType, typename SrcXprType>
-void multiply_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
-{
-  typedef typename DstXprType::Scalar Scalar;
-  copy_using_evaluator(dst.derived(), src.derived(), mul_assign_op<Scalar>());
-}
-
-template<typename DstXprType, typename SrcXprType>
-void divide_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
-{
-  typedef typename DstXprType::Scalar Scalar;
-  copy_using_evaluator(dst.derived(), src.derived(), div_assign_op<Scalar>());
-}
-
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
+};
 
 } // namespace internal
 

diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
old mode 100644
new mode 100755
index 97134ff..c6140d1
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h

@@ -1,6 +1,7 @@
 /*
  Copyright (c) 2011, Intel Corporation. All rights reserved.
-
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+ 
  Redistribution and use in source and binary forms, with or without modification,
  are permitted provided that the following conditions are met:
 
@@ -37,17 +38,13 @@
 
 namespace internal {
 
-template<typename Op> struct vml_call
-{ enum { IsSupported = 0 }; };
-
-template<typename Dst, typename Src, typename UnaryOp>
+template<typename Dst, typename Src>
 class vml_assign_traits
 {
   private:
     enum {
       DstHasDirectAccess = Dst::Flags & DirectAccessBit,
       SrcHasDirectAccess = Src::Flags & DirectAccessBit,
-
       StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
       InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
                 : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
@@ -57,166 +54,122 @@
                     : int(Dst::MaxRowsAtCompileTime),
       MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
 
-      MightEnableVml =  vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
-                     && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
       MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
       VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
-      MayEnableVml = MightEnableVml && LargeEnough,
-      MayLinearize = MayEnableVml && MightLinearize
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD
     };
   public:
     enum {
-      Traversal = MayLinearize ? LinearVectorizedTraversal
-                : MayEnableVml ? InnerVectorizedTraversal
-                : DefaultTraversal
+      EnableVml = MightEnableVml && LargeEnough,
+      Traversal = MightLinearize ? LinearTraversal : DefaultTraversal
     };
 };
 
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
-         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
-struct vml_assign_impl
-  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
-{
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
-{
-  typedef typename Derived1::Scalar Scalar;
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer) {
-      const Scalar *src_ptr = src.IsRowMajor ?  &(src.nestedExpression().coeffRef(outer,0)) :
-                                                &(src.nestedExpression().coeffRef(0, outer));
-      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
-      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
-{
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
-  }
-};
-
-// Macroses
-
-#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
-  template<typename Derived1, typename Derived2, typename UnaryOp> \
-  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized>  {  \
-    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
-      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
-    } \
-  };
-
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)
-
-
+#define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define  EIGEN_MKL_VML_MODE VML_HA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
 #else
-#define  EIGEN_MKL_VML_MODE VML_LA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
 #endif
 
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)     \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst);                           \
-    }                                                                            \
+#define EIGEN_VMLMODE_EXPAND_x_
+
+#define EIGEN_VMLMODE_PREFIX_xLA vm
+#define EIGEN_VMLMODE_PREFIX_x_  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                         \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                       \
+      resize_if_allowed(dst, src, func);                                                                                        \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
+        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                           \
+      } else {                                                                                                                  \
+        const Index outerSize = dst.outerSize();                                                                                \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                             \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                             \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                             \
+        }                                                                                                                       \
+      }                                                                                                                         \
+    }                                                                                                                           \
+  };                                                                                                                            \
+
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE)           \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE)
+  
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE)                                                              \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                               \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+  
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin,   Sin,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin,  Asin,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh,  Sinh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos,   Cos,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos,  Acos,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh,  Cosh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan,   Tan,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan,  Atan,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh,  Tanh,  LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,   Abs,    _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp,   Exp,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log,   Ln,    LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt,  Sqrt,  _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg,      _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                     \
+      resize_if_allowed(dst, src, func);                                                                                      \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
+      {                                                                                                                       \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                         \
+      } else {                                                                                                                \
+        const Index outerSize = dst.outerSize();                                                                              \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                          \
+        }                                                                                                                     \
+      }                                                                                                                       \
+    }                                                                                                                         \
   };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)  \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode);                  \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)       \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func,        \
-                          int size, const EIGENTYPE* src, EIGENTYPE* dst) {      \
-      EIGENTYPE exponent = func.m_exponent;                                      \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent,               \
-                        (VMLTYPE*)dst, &vmlMode);                                \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8)   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP)                        \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
-
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float)         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8)  \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP)                     \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                      \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
-
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin,  Sin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan,  Atan)
-//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sqrt, Sqrt)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
-
-// The vm*powx functions are not avaibale in the windows version of MKL.
-#ifndef _WIN32
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
-#endif
+  
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float,    float,         LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double,   double,        LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8,  LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h
index ffd7fe8..878c024 100644
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_BANDMATRIX_H
 #define EIGEN_BANDMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -32,7 +32,7 @@
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
-    typedef typename DenseMatrixType::Index Index;
+    typedef typename DenseMatrixType::StorageIndex StorageIndex;
     typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
     typedef EigenBase<Derived> Base;
 
@@ -45,7 +45,7 @@
     };
 
   public:
-    
+
     using Base::derived;
     using Base::rows;
     using Base::cols;
@@ -55,10 +55,10 @@
 
     /** \returns the number of sub diagonals */
     inline Index subs() const { return derived().subs(); }
-    
+
     /** \returns an expression of the underlying coefficient matrix */
     inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
-    
+
     /** \returns an expression of the underlying coefficient matrix */
     inline CoefficientsType& coeffs() { return derived().coeffs(); }
 
@@ -67,7 +67,7 @@
       * \warning the internal storage must be column major. */
     inline Block<CoefficientsType,Dynamic,1> col(Index i)
     {
-      EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+      EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
       Index start = 0;
       Index len = coeffs().rows();
       if (i<=supers())
@@ -90,7 +90,7 @@
 
     template<int Index> struct DiagonalIntReturnType {
       enum {
-        ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)),
+        ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)),
         Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
         ActualIndex = ReturnOpposite ? -Index : Index,
         DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic)
@@ -130,7 +130,7 @@
       eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
       return Block<const CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
     }
-    
+
     template<typename Dest> inline void evalTo(Dest& dst) const
     {
       dst.resize(rows(),cols());
@@ -161,15 +161,15 @@
   *
   * \brief Represents a rectangular matrix with a banded storage
   *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Rows Number of rows, or \b Dynamic
-  * \param Cols Number of columns, or \b Dynamic
-  * \param Supers Number of super diagonal
-  * \param Subs Number of sub diagonal
-  * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
-  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to
-  *                 column-major. The latter controls whether the matrix represents a selfadjoint 
-  *                 matrix in which case either Supers of Subs have to be null.
+  * \tparam _Scalar Numeric type, i.e. float, double, int
+  * \tparam _Rows Number of rows, or \b Dynamic
+  * \tparam _Cols Number of columns, or \b Dynamic
+  * \tparam _Supers Number of super diagonal
+  * \tparam _Subs Number of sub diagonal
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
+  *                  The former controls \ref TopicStorageOrders "storage order", and defaults to
+  *                  column-major. The latter controls whether the matrix represents a selfadjoint
+  *                  matrix in which case either Supers of Subs have to be null.
   *
   * \sa class TridiagonalMatrix
   */
@@ -179,7 +179,7 @@
 {
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
   enum {
     CoeffReadCost = NumTraits<Scalar>::ReadCost,
     RowsAtCompileTime = _Rows,
@@ -192,7 +192,7 @@
     Options = _Options,
     DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
   };
-  typedef Matrix<Scalar,DataRowsAtCompileTime,ColsAtCompileTime,Options&RowMajor?RowMajor:ColMajor> CoefficientsType;
+  typedef Matrix<Scalar, DataRowsAtCompileTime, ColsAtCompileTime, int(Options) & int(RowMajor) ? RowMajor : ColMajor> CoefficientsType;
 };
 
 template<typename _Scalar, int Rows, int Cols, int Supers, int Subs, int Options>
@@ -201,26 +201,26 @@
   public:
 
     typedef typename internal::traits<BandMatrix>::Scalar Scalar;
-    typedef typename internal::traits<BandMatrix>::Index Index;
+    typedef typename internal::traits<BandMatrix>::StorageIndex StorageIndex;
     typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
 
-    inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
+    explicit inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
       : m_coeffs(1+supers+subs,cols),
         m_rows(rows), m_supers(supers), m_subs(subs)
     {
     }
 
     /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
 
     /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
 
     /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
 
     /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
 
     inline const CoefficientsType& coeffs() const { return m_coeffs; }
     inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -241,7 +241,7 @@
 {
   typedef typename _CoefficientsType::Scalar Scalar;
   typedef typename _CoefficientsType::StorageKind StorageKind;
-  typedef typename _CoefficientsType::Index Index;
+  typedef typename _CoefficientsType::StorageIndex StorageIndex;
   enum {
     CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
     RowsAtCompileTime = _Rows,
@@ -264,9 +264,9 @@
 
     typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
     typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
-    typedef typename internal::traits<BandMatrixWrapper>::Index Index;
+    typedef typename internal::traits<BandMatrixWrapper>::StorageIndex StorageIndex;
 
-    inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
+    explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
       : m_coeffs(coeffs),
         m_rows(rows), m_supers(supers), m_subs(subs)
     {
@@ -275,16 +275,16 @@
     }
 
     /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
 
     /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
 
     /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
 
     /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
 
     inline const CoefficientsType& coeffs() const { return m_coeffs; }
 
@@ -302,9 +302,9 @@
   *
   * \brief Represents a tridiagonal matrix with a compact banded storage
   *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Size Number of rows and cols, or \b Dynamic
-  * \param _Options Can be 0 or \b SelfAdjoint
+  * \tparam Scalar Numeric type, i.e. float, double, int
+  * \tparam Size Number of rows and cols, or \b Dynamic
+  * \tparam Options Can be 0 or \b SelfAdjoint
   *
   * \sa class BandMatrix
   */
@@ -312,9 +312,9 @@
 class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
 {
     typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
   public:
-    TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
+    explicit TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
 
     inline typename Base::template DiagonalIntReturnType<1>::Type super()
     { return Base::template diagonal<1>(); }
@@ -327,6 +327,25 @@
   protected:
 };
 
+
+struct BandShape {};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
+struct evaluator_traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+struct evaluator_traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape;
+};
+
+template<> struct AssignmentKind<DenseShape,BandShape> { typedef EigenBase2EigenBase Kind; };
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index da193d1..d0b95d5 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h

@@ -11,19 +11,72 @@
 #ifndef EIGEN_BLOCK_H
 #define EIGEN_BLOCK_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  typedef typename ref_selector<XprType>::type XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  enum{
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows,
+    ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols,
+    MaxRowsAtCompileTime = BlockRows==0 ? 0
+                         : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
+                         : int(traits<XprType>::MaxRowsAtCompileTime),
+    MaxColsAtCompileTime = BlockCols==0 ? 0
+                         : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
+                         : int(traits<XprType>::MaxColsAtCompileTime),
+
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : int(outer_stride_at_compile_time<XprType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(outer_stride_at_compile_time<XprType>::ret)
+                             : int(inner_stride_at_compile_time<XprType>::ret),
+
+    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
+    // FIXME DirectAccessBit should not be handled by expressions
+    //
+    // Alignment is needed by MapBase's assertions
+    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    Alignment = 0
+  };
+};
+
+template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
+         bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
+
+} // end namespace internal
+
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
 
 /** \class Block
   * \ingroup Core_Module
   *
   * \brief Expression of a fixed-size or dynamic-size block
   *
-  * \param XprType the type of the expression in which we are taking a block
-  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
-  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
-  * \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or
-  *        to set of columns of a column major matrix (optional). The parameter allows to determine
-  *        at compile time whether aligned access is possible on the block expression.
+  * \tparam XprType the type of the expression in which we are taking a block
+  * \tparam BlockRows the number of rows of the block we are taking at compile time (optional)
+  * \tparam BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+  *         to set of columns of a column major matrix (optional). The parameter allows to determine
+  *         at compile time whether aligned access is possible on the block expression.
   *
   * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
   * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
@@ -47,61 +100,6 @@
   *
   * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
   */
-
-namespace internal {
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
-struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
-{
-  typedef typename traits<XprType>::Scalar Scalar;
-  typedef typename traits<XprType>::StorageKind StorageKind;
-  typedef typename traits<XprType>::XprKind XprKind;
-  typedef typename nested<XprType>::type XprTypeNested;
-  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
-  enum{
-    MatrixRows = traits<XprType>::RowsAtCompileTime,
-    MatrixCols = traits<XprType>::ColsAtCompileTime,
-    RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows,
-    ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols,
-    MaxRowsAtCompileTime = BlockRows==0 ? 0
-                         : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
-                         : int(traits<XprType>::MaxRowsAtCompileTime),
-    MaxColsAtCompileTime = BlockCols==0 ? 0
-                         : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
-                         : int(traits<XprType>::MaxColsAtCompileTime),
-    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
-    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-               : XprTypeIsRowMajor,
-    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
-    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
-    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
-                             ? int(inner_stride_at_compile_time<XprType>::ret)
-                             : int(outer_stride_at_compile_time<XprType>::ret),
-    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
-                             ? int(outer_stride_at_compile_time<XprType>::ret)
-                             : int(inner_stride_at_compile_time<XprType>::ret),
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
-                       && (InnerStrideAtCompileTime == 1)
-                        ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
-    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
-    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
-                                        DirectAccessBit |
-                                        MaskPacketAccessBit |
-                                        MaskAlignedBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
-  };
-};
-
-template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
-         bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
-         
-} // end namespace internal
-
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
-
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
   : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
 {
@@ -111,11 +109,13 @@
     typedef Impl Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-  
+
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
+
     /** Column or Row constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index i) : Impl(xpr,i)
     {
       eigen_assert( (i>=0) && (
           ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
@@ -124,30 +124,30 @@
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
-      : Impl(xpr, a_startRow, a_startCol)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol)
     {
       EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
-             && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
+      eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows()
+             && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
     }
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr,
+          Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols)
     {
       eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
           && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
-      eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow  <= xpr.rows() - blockRows
-          && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
+      eigen_assert(startRow >= 0 && blockRows >= 0 && startRow  <= xpr.rows() - blockRows
+          && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
     }
 };
-         
+
 // The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense
 // that must be specialized for direct and non-direct access...
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -155,15 +155,15 @@
   : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
 {
     typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
-    typedef typename XprType::Index Index;
+    typedef typename XprType::StorageIndex StorageIndex;
   public:
     typedef Impl Base;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
     EIGEN_DEVICE_FUNC
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+    EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
 namespace internal {
@@ -173,13 +173,14 @@
   : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
   public:
 
     typedef typename internal::dense_xpr_base<BlockType>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
 
-    class InnerIterator;
+    // class InnerIterator; // FIXME apparently never used
 
     /** Column or Row constructor
       */
@@ -199,8 +200,8 @@
     /** Fixed-size constructor
       */
     EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                     m_blockRows(BlockRows), m_blockCols(BlockCols)
     {}
 
@@ -208,9 +209,9 @@
       */
     EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                     m_blockRows(blockRows), m_blockCols(blockCols)
     {}
 
@@ -221,15 +222,13 @@
     inline Scalar& coeffRef(Index rowId, Index colId)
     {
       EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
-      return m_xpr.derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
     EIGEN_DEVICE_FUNC
@@ -242,43 +241,38 @@
     inline Scalar& coeffRef(Index index)
     {
       EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
     EIGEN_DEVICE_FUNC
     inline const CoeffReturnType coeff(Index index) const
     {
-      return m_xpr
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                         m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
     template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
+    EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const
     {
-      return m_xpr.template packet<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value());
+      return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
     template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
     {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value(), val);
+      m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
     }
 
     template<int LoadMode>
-    inline PacketScalar packet(Index index) const
+    EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const
     {
       return m_xpr.template packet<Unaligned>
               (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -286,9 +280,9 @@
     }
 
     template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val)
     {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
+      m_xpr.template writePacket<Unaligned>
          (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
           m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
     }
@@ -300,31 +294,34 @@
     EIGEN_DEVICE_FUNC inline Index outerStride() const;
     #endif
 
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
-    { 
-      return m_xpr; 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
+    {
+      return m_xpr;
     }
-      
-    EIGEN_DEVICE_FUNC
-    Index startRow() const
-    { 
-      return m_startRow.value(); 
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    XprType& nestedExpression() { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT
+    {
+      return m_startRow.value();
     }
-      
-    EIGEN_DEVICE_FUNC
-    Index startCol() const 
-    { 
-      return m_startCol.value(); 
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT
+    {
+      return m_startCol.value();
     }
 
   protected:
 
-    const typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
 };
 
 /** \internal Internal implementation of dense Blocks in the direct access case.*/
@@ -333,6 +330,10 @@
   : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+    enum {
+      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
+    };
   public:
 
     typedef MapBase<BlockType> Base;
@@ -341,48 +342,53 @@
 
     /** Column or Row constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index i)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(
-              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
-              (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index i)
+      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
+                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
-        m_xpr(xpr)
+        m_xpr(xpr),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)
     {
       init();
     }
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
     }
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr,
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
-        m_xpr(xpr)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
       init();
     }
 
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
-    { 
-      return m_xpr; 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const EIGEN_NOEXCEPT
+    {
+      return m_xpr;
     }
-      
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    XprType& nestedExpression() { return m_xpr; }
+
     /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index innerStride() const EIGEN_NOEXCEPT
     {
       return internal::traits<BlockType>::HasSameStorageOrderAsXprType
              ? m_xpr.innerStride()
@@ -390,12 +396,20 @@
     }
 
     /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index outerStride() const EIGEN_NOEXCEPT
     {
-      return m_outerStride;
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
+                    ? m_xpr.outerStride()
+                    : m_xpr.innerStride();
     }
 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); }
+
   #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
   // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
@@ -404,8 +418,8 @@
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal used by allowAligned() */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
       : Base(data, blockRows, blockCols), m_xpr(xpr)
     {
       init();
@@ -413,7 +427,7 @@
     #endif
 
   protected:
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void init()
     {
       m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -421,7 +435,9 @@
                     : m_xpr.innerStride();
     }
 
-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
     Index m_outerStride;
 };
 

diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h
index be9f48a..852de8b 100644
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h

@@ -14,56 +14,56 @@
 
 namespace internal {
 
-template<typename Derived, int UnrollCount>
+template<typename Derived, int UnrollCount, int Rows>
 struct all_unroller
 {
   enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
   };
 
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
   {
-    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
+    return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
   }
 };
 
-template<typename Derived>
-struct all_unroller<Derived, 0>
+template<typename Derived, int Rows>
+struct all_unroller<Derived, 0, Rows>
 {
-  static inline bool run(const Derived &/*mat*/) { return true; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
 };
 
-template<typename Derived>
-struct all_unroller<Derived, Dynamic>
+template<typename Derived, int Rows>
+struct all_unroller<Derived, Dynamic, Rows>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
 
-template<typename Derived, int UnrollCount>
+template<typename Derived, int UnrollCount, int Rows>
 struct any_unroller
 {
   enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
   };
-
-  static inline bool run(const Derived &mat)
+  
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
   {
-    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
+    return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
   }
 };
 
-template<typename Derived>
-struct any_unroller<Derived, 0>
+template<typename Derived, int Rows>
+struct any_unroller<Derived, 0, Rows>
 {
-  static inline bool run(const Derived & /*mat*/) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
 };
 
-template<typename Derived>
-struct any_unroller<Derived, Dynamic>
+template<typename Derived, int Rows>
+struct any_unroller<Derived, Dynamic, Rows>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
 
 } // end namespace internal
@@ -76,21 +76,21 @@
   * \sa any(), Cwise::operator<()
   */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
+  typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
   };
+  Evaluator evaluator(derived());
   if(unroll)
-    return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
       for(Index i = 0; i < rows(); ++i)
-        if (!coeff(i, j)) return false;
+        if (!evaluator.coeff(i, j)) return false;
     return true;
   }
 }
@@ -100,21 +100,21 @@
   * \sa all()
   */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
+  typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
   };
+  Evaluator evaluator(derived());
   if(unroll)
-    return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
       for(Index i = 0; i < rows(); ++i)
-        if (coeff(i, j)) return true;
+        if (evaluator.coeff(i, j)) return true;
     return false;
   }
 }
@@ -124,7 +124,7 @@
   * \sa all(), any()
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
   return derived().template cast<bool>().template cast<Index>().sum();
 }
@@ -136,7 +136,11 @@
 template<typename Derived>
 inline bool DenseBase<Derived>::hasNaN() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isNaN().any();
+#else
   return !((derived().array()==derived().array()).all());
+#endif
 }
 
 /** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
@@ -146,7 +150,11 @@
 template<typename Derived>
 inline bool DenseBase<Derived>::allFinite() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isFinite().all();
+#else
   return !((derived()-derived()).hasNaN());
+#endif
 }
     
 } // end namespace Eigen

diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index 49b72af..c0e29c7 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h

@@ -22,18 +22,19 @@
   * the return type of MatrixBase::operator<<, and most of the time this is the only
   * way it is used.
   *
-  * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
+  * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
   */
 template<typename XprType>
 struct CommaInitializer
 {
   typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::Index Index;
 
   EIGEN_DEVICE_FUNC
   inline CommaInitializer(XprType& xpr, const Scalar& s)
     : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
   {
+    eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0
+      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.coeffRef(0,0) = s;
   }
 
@@ -42,6 +43,8 @@
   inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
     : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
   {
+    eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols()
+      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.block(0, 0, other.rows(), other.cols()) = other;
   }
 
@@ -81,9 +84,7 @@
   EIGEN_DEVICE_FUNC
   CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
   {
-    if(other.cols()==0 || other.rows()==0)
-      return *this;
-    if (m_col==m_xpr.cols())
+    if (m_col==m_xpr.cols() && (other.cols()!=0 || other.rows()!=m_currentBlockRows))
     {
       m_row+=m_currentBlockRows;
       m_col = 0;
@@ -91,25 +92,22 @@
       eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
         && "Too many rows passed to comma initializer (operator<<)");
     }
-    eigen_assert(m_col<m_xpr.cols()
+    eigen_assert((m_col + other.cols() <= m_xpr.cols())
       && "Too many coefficients passed to comma initializer (operator<<)");
     eigen_assert(m_currentBlockRows==other.rows());
-    if (OtherDerived::SizeAtCompileTime != Dynamic)
-      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
-                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
-                    (m_row, m_col) = other;
-    else
-      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>
+                    (m_row, m_col, other.rows(), other.cols()) = other;
     m_col += other.cols();
     return *this;
   }
 
   EIGEN_DEVICE_FUNC
   inline ~CommaInitializer()
+#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
+  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+#endif
   {
-    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
-         && m_col == m_xpr.cols()
-         && "Too few coefficients passed to comma initializer (operator<<)");
+    finished();
   }
 
   /** \returns the built matrix once all its coefficients have been set.
@@ -120,7 +118,12 @@
     * \endcode
     */
   EIGEN_DEVICE_FUNC
-  inline XprType& finished() { return m_xpr; }
+  inline XprType& finished() {
+      eigen_assert(((m_row+m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0)
+           && m_col == m_xpr.cols()
+           && "Too few coefficients passed to comma initializer (operator<<)");
+      return m_xpr;
+  }
 
   XprType& m_xpr;           // target expression
   Index m_row;              // current row id
@@ -142,8 +145,7 @@
   * \sa CommaInitializer::finished(), class CommaInitializer
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
   return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -151,8 +153,7 @@
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
   return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);

diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h
new file mode 100644
index 0000000..51a2e5f
--- /dev/null
+++ b/Eigen/src/Core/ConditionEstimator.h

@@ -0,0 +1,175 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONDITIONESTIMATOR_H
+#define EIGEN_CONDITIONESTIMATOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Vector, typename RealVector, bool IsComplex>
+struct rcond_compute_sign {
+  static inline Vector run(const Vector& v) {
+    const RealVector v_abs = v.cwiseAbs();
+    return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
+            .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
+  }
+};
+
+// Partial specialization to avoid elementwise division for real vectors.
+template <typename Vector>
+struct rcond_compute_sign<Vector, Vector, false> {
+  static inline Vector run(const Vector& v) {
+    return (v.array() < static_cast<typename Vector::RealScalar>(0))
+           .select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
+  }
+};
+
+/**
+  * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
+  * \a matrix that implements .solve() and .adjoint().solve() methods.
+  *
+  * This function implements Algorithms 4.1 and 5.1 from
+  *   http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
+  * which also forms the basis for the condition number estimators in
+  * LAPACK. Since at most 10 calls to the solve method of dec are
+  * performed, the total cost is O(dims^2), as opposed to O(dims^3)
+  * needed to compute the inverse matrix explicitly.
+  *
+  * The most common usage is in estimating the condition number
+  * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
+  * computed directly in O(n^2) operations.
+  *
+  * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
+  * LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec)
+{
+  typedef typename Decomposition::MatrixType MatrixType;
+  typedef typename Decomposition::Scalar Scalar;
+  typedef typename Decomposition::RealScalar RealScalar;
+  typedef typename internal::plain_col_type<MatrixType>::type Vector;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
+  const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);
+
+  eigen_assert(dec.rows() == dec.cols());
+  const Index n = dec.rows();
+  if (n == 0)
+    return 0;
+
+  // Disable Index to float conversion warning
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+
+  // lower_bound is a lower bound on
+  //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
+  // and is the objective maximized by the ("super-") gradient ascent
+  // algorithm below.
+  RealScalar lower_bound = v.template lpNorm<1>();
+  if (n == 1)
+    return lower_bound;
+
+  // Gradient ascent algorithm follows: We know that the optimum is achieved at
+  // one of the simplices v = e_i, so in each iteration we follow a
+  // super-gradient to move towards the optimal one.
+  RealScalar old_lower_bound = lower_bound;
+  Vector sign_vector(n);
+  Vector old_sign_vector;
+  Index v_max_abs_index = -1;
+  Index old_v_max_abs_index = v_max_abs_index;
+  for (int k = 0; k < 4; ++k)
+  {
+    sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
+    if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+    v = dec.adjoint().solve(sign_vector);
+    v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
+    if (v_max_abs_index == old_v_max_abs_index) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // Move to the new simplex e_j, where j = v_max_abs_index.
+    v = dec.solve(Vector::Unit(n, v_max_abs_index));  // v = inv(matrix) * e_j.
+    lower_bound = v.template lpNorm<1>();
+    if (lower_bound <= old_lower_bound) {
+      // Break if the gradient step did not increase the lower_bound.
+      break;
+    }
+    if (!is_complex) {
+      old_sign_vector = sign_vector;
+    }
+    old_v_max_abs_index = v_max_abs_index;
+    old_lower_bound = lower_bound;
+  }
+  // The following calculates an independent estimate of ||matrix||_1 by
+  // multiplying matrix by a vector with entries of slowly increasing
+  // magnitude and alternating sign:
+  //   v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
+  // This improvement to Hager's algorithm above is due to Higham. It was
+  // added to make the algorithm more robust in certain corner cases where
+  // large elements in the matrix might otherwise escape detection due to
+  // exact cancellation (especially when op and op_adjoint correspond to a
+  // sequence of backsubstitutions and permutations), which could cause
+  // Hager's algorithm to vastly underestimate ||matrix||_1.
+  Scalar alternating_sign(RealScalar(1));
+  for (Index i = 0; i < n; ++i) {
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    alternating_sign = -alternating_sign;
+  }
+  v = dec.solve(v);
+  const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
+  return numext::maxi(lower_bound, alternate_lower_bound);
+}
+
+/** \brief Reciprocal condition number estimator.
+  *
+  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+  * this method estimates the condition number quickly and reliably in O(n^2)
+  * operations.
+  *
+  * \returns an estimate of the reciprocal condition number
+  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+  * its decomposition. Supports the following decompositions: FullPivLU,
+  * PartialPivLU, LDLT, and LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar
+rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
+{
+  typedef typename Decomposition::RealScalar RealScalar;
+  eigen_assert(dec.rows() == dec.cols());
+  if (dec.rows() == 0)              return NumTraits<RealScalar>::infinity();
+  if (matrix_norm == RealScalar(0)) return RealScalar(0);
+  if (dec.rows() == 1)              return RealScalar(1);
+  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
+                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index f595113..0ff8c8d 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -17,196 +17,1661 @@
 
 namespace internal {
 
-// evaluator_traits<T> contains traits for evaluator_impl<T> 
-
-template<typename T>
-struct evaluator_traits
-{
-  // 1 if evaluator_impl<T>::evalTo() exists
-  // 0 if evaluator_impl<T> allows coefficient-based access
-  static const int HasEvalTo = 0;
-
-  // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a
-  // temporary; 0 if not.
-  static const int AssumeAliasing = 0;
+// This class returns the evaluator kind from the expression storage kind.
+// Default assumes index based accessors
+template<typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
 };
 
-// expression class for evaluating nested expression to a temporary
- 
-template<typename ArgType>
-class EvalToTemp;
+// This class returns the evaluator shape from the expression storage kind.
+// It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
+template<typename StorageKind> struct storage_kind_to_shape;
 
-// evaluator<T>::type is type of evaluator for T
-// evaluator<T>::nestedType is type of evaluator if T is nested inside another evaluator
- 
-template<typename T>
-struct evaluator_impl 
-{ };
- 
-template<typename T, int Nested = evaluator_traits<T>::HasEvalTo>
-struct evaluator_nested_type;
+template<> struct storage_kind_to_shape<Dense>                  { typedef DenseShape Shape;           };
+template<> struct storage_kind_to_shape<SolverStorage>          { typedef SolverShape Shape;           };
+template<> struct storage_kind_to_shape<PermutationStorage>     { typedef PermutationShape Shape;     };
+template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef TranspositionsShape Shape;  };
+
+// Evaluators have to be specialized with respect to various criteria such as:
+//  - storage/structure/shape
+//  - scalar type
+//  - etc.
+// Therefore, we need specialization of evaluator providing additional template arguments for each kind of evaluators.
+// We currently distinguish the following kind of evaluators:
+// - unary_evaluator    for expressions taking only one arguments (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
+// - binary_evaluator   for expression taking two arguments (CwiseBinaryOp)
+// - ternary_evaluator   for expression taking three arguments (CwiseTernaryOp)
+// - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
+// - mapbase_evaluator  for Map, Block, Ref
+// - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
+
+template< typename T,
+          typename Arg1Kind   = typename evaluator_traits<typename T::Arg1>::Kind,
+          typename Arg2Kind   = typename evaluator_traits<typename T::Arg2>::Kind,
+          typename Arg3Kind   = typename evaluator_traits<typename T::Arg3>::Kind,
+          typename Arg1Scalar = typename traits<typename T::Arg1>::Scalar,
+          typename Arg2Scalar = typename traits<typename T::Arg2>::Scalar,
+          typename Arg3Scalar = typename traits<typename T::Arg3>::Scalar> struct ternary_evaluator;
+
+template< typename T,
+          typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct binary_evaluator;
+
+template< typename T,
+          typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
+          typename Scalar = typename T::Scalar> struct unary_evaluator;
+
+// evaluator_traits<T> contains traits for evaluator<T>
 
 template<typename T>
-struct evaluator_nested_type<T, 0>
+struct evaluator_traits_base
 {
-  typedef evaluator_impl<T> type;
+  // by default, get evaluator kind and shape from storage
+  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
+  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
 };
 
+// Default evaluator traits
 template<typename T>
-struct evaluator_nested_type<T, 1>
+struct evaluator_traits : public evaluator_traits_base<T>
 {
-  typedef evaluator_impl<EvalToTemp<T> > type;
 };
 
-template<typename T>
-struct evaluator
-{
-  typedef evaluator_impl<T> type;
-  typedef typename evaluator_nested_type<T>::type nestedType;
+template<typename T, typename Shape = typename evaluator_traits<T>::Shape >
+struct evaluator_assume_aliasing {
+  static const bool value = false;
 };
 
+// By default, we assume a unary expression:
+template<typename T>
+struct evaluator : public unary_evaluator<T>
+{
+  typedef unary_evaluator<T> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const T& xpr) : Base(xpr) {}
+};
+
+
 // TODO: Think about const-correctness
-
 template<typename T>
 struct evaluator<const T>
   : evaluator<T>
-{ };
-
-// ---------- base class for all writable evaluators ----------
-
-// TODO this class does not seem to be necessary anymore
-template<typename ExpressionType>
-struct evaluator_impl_base
 {
-  typedef typename ExpressionType::Index Index;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+};
+
+// ---------- base class for all evaluators ----------
+
+template<typename ExpressionType>
+struct evaluator_base
+{
   // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
   typedef traits<ExpressionType> ExpressionTraits;
 
-  evaluator_impl<ExpressionType>& derived() 
-  {
-    return *static_cast<evaluator_impl<ExpressionType>*>(this); 
-  }
+  enum {
+    Alignment = 0
+  };
+  // noncopyable:
+  // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
+  // and make complex evaluator much larger than then should do.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
+private:
+  EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
+  EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
 };
 
 // -------------------- Matrix and Array --------------------
 //
-// evaluator_impl<PlainObjectBase> is a common base class for the
+// evaluator<PlainObjectBase> is a common base class for the
 // Matrix and Array evaluators.
+// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
+// so no need for more sophisticated dispatching.
+
+// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
+template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
+public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+  {
+#ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(outerStride);
+#endif
+    eigen_internal_assert(outerStride==OuterStride);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
+  const Scalar *data;
+};
+
+template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
+public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index outerStride() const { return m_outerStride; }
+  const Scalar *data;
+protected:
+  Index m_outerStride;
+};
 
 template<typename Derived>
-struct evaluator_impl<PlainObjectBase<Derived> >
-  : evaluator_impl_base<Derived>
+struct evaluator<PlainObjectBase<Derived> >
+  : evaluator_base<Derived>
 {
   typedef PlainObjectBase<Derived> PlainObjectType;
+  typedef typename PlainObjectType::Scalar Scalar;
+  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
 
   enum {
     IsRowMajor = PlainObjectType::IsRowMajor,
     IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
     RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
-    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime
+    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
+
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
+  };
+  enum {
+    // We do not need to know the outer stride for vectors
+    OuterStrideAtCompileTime = IsVectorAtCompileTime  ? 0
+                                                      : int(IsRowMajor) ? ColsAtCompileTime
+                                                                        : RowsAtCompileTime
   };
 
-  evaluator_impl(const PlainObjectType& m) 
-    : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
-  { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator()
+    : m_d(0,OuterStrideAtCompileTime)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
 
-  typedef typename PlainObjectType::Index Index;
-  typedef typename PlainObjectType::Scalar Scalar;
-  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
-  typedef typename PlainObjectType::PacketScalar PacketScalar;
-  typedef typename PlainObjectType::PacketReturnType PacketReturnType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const PlainObjectType& m)
+    : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
     if (IsRowMajor)
-      return m_data[row * m_outerStride.value() + col];
+      return m_d.data[row * m_d.outerStride() + col];
     else
-      return m_data[row + col * m_outerStride.value()];
+      return m_d.data[row + col * m_d.outerStride()];
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index];
+    return m_d.data[index];
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
     if (IsRowMajor)
-      return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
+      return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
     else
-      return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
+      return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return const_cast<Scalar*>(m_data)[index];
+    return const_cast<Scalar*>(m_d.data)[index];
   }
 
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
   {
     if (IsRowMajor)
-      return ploadt<PacketScalar, LoadMode>(m_data + row * m_outerStride.value() + col);
+      return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
     else
-      return ploadt<PacketScalar, LoadMode>(m_data + row + col * m_outerStride.value());
+      return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
   }
 
-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
   {
-    return ploadt<PacketScalar, LoadMode>(m_data + index);
+    return ploadt<PacketType, LoadMode>(m_d.data + index);
   }
 
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
+  template<int StoreMode,typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
   {
     if (IsRowMajor)
-      return pstoret<Scalar, PacketScalar, StoreMode>
-	            (const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
+      return pstoret<Scalar, PacketType, StoreMode>
+	            (const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
     else
-      return pstoret<Scalar, PacketScalar, StoreMode>
-                    (const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
+      return pstoret<Scalar, PacketType, StoreMode>
+                    (const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
   }
 
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
   {
-    return pstoret<Scalar, PacketScalar, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
   }
 
 protected:
-  const Scalar *m_data;
 
-  // We do not need to know the outer stride for vectors
-  variable_if_dynamic<Index, IsVectorAtCompileTime  ? 0 
-                                                    : int(IsRowMajor) ? ColsAtCompileTime 
-                                                    : RowsAtCompileTime> m_outerStride;
+  plainobjectbase_evaluator_data<Scalar,OuterStrideAtCompileTime> m_d;
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct evaluator_impl<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-  : evaluator_impl<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
 {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  evaluator_impl(const XprType& m) 
-    : evaluator_impl<PlainObjectBase<XprType> >(m) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m)
   { }
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct evaluator_impl<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-  : evaluator_impl<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
 {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  evaluator_impl(const XprType& m) 
-    : evaluator_impl<PlainObjectBase<XprType> >(m) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m)
   { }
 };
 
+// -------------------- Transpose --------------------
+
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IndexBased>
+  : evaluator_base<Transpose<ArgType> >
+{
+  typedef Transpose<ArgType> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename XprType::Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(col, row);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseNullaryOp --------------------
+// Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
+// Likewise, there is not need to more sophisticated dispatching here.
+
+template<typename Scalar,typename NullaryOp,
+         bool has_nullary = has_nullary_operator<NullaryOp>::value,
+         bool has_unary   = has_unary_operator<NullaryOp>::value,
+         bool has_binary  = has_binary_operator<NullaryOp>::value>
+struct nullary_wrapper
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { return op(i,j); }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { return op.template packetOp<T>(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,false,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType=0, IndexType=0) const { return op(); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType=0, IndexType=0) const { return op.template packetOp<T>(); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j=0) const { return op(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j=0) const { return op.template packetOp<T>(i,j); }
+};
+
+// We need the following specialization for vector-only functors assigned to a runtime vector,
+// for instance, using linspace and assigning a RowVectorXd to a MatrixXd or even a row of a MatrixXd.
+// In this case, i==0 and j is used for the actual iteration.
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,true,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op(i+j);
+  }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op.template packetOp<T>(i+j);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,false> {};
+
+#if 0 && EIGEN_COMP_MSVC>0
+// Disable this ugly workaround. This is now handled in traits<Ref>::match,
+// but this piece of code might still become handly if some other weird compilation
+// erros pop up again.
+
+// MSVC exhibits a weird compilation error when
+// compiling:
+//    Eigen::MatrixXf A = MatrixXf::Random(3,3);
+//    Ref<const MatrixXf> R = 2.f*A;
+// and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
+// The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
+// and at that time has_*ary_operator<T> returns true regardless of T.
+// Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
+// The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
+// and packet() are really instantiated as implemented below:
+
+// This is a simple wrapper around Index to enforce the re-instantiation of
+// has_*ary_operator when needed.
+template<typename T> struct nullary_wrapper_workaround_msvc {
+  nullary_wrapper_workaround_msvc(const T&);
+  operator T()const;
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
+  }
+
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
+  }
+};
+#endif // MSVC workaround
+
+template<typename NullaryOp, typename PlainObjectType>
+struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+  : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+{
+  typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
+  typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
+
+  enum {
+    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags
+          &  (  HereditaryBits
+              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
+              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    Alignment = AlignedMax
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
+    : m_functor(n.functor()), m_wrapper()
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(IndexType row, IndexType col) const
+  {
+    return m_wrapper(m_functor, row, col);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(IndexType index) const
+  {
+    return m_wrapper(m_functor,index);
+  }
+
+  template<int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(IndexType row, IndexType col) const
+  {
+    return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
+  }
+
+  template<int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(IndexType index) const
+  {
+    return m_wrapper.template packetOp<PacketType>(m_functor, index);
+  }
+
+protected:
+  const NullaryOp m_functor;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
+};
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
+  : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
+    Flags = evaluator<ArgType>::Flags
+          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& op) : m_d(op)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
+  }
+
+protected:
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+// this is a ternary expression
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+  : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
+  : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<Arg1>::CoeffReadCost) + int(evaluator<Arg2>::CoeffReadCost) + int(evaluator<Arg3>::CoeffReadCost) + int(functor_traits<TernaryOp>::Cost),
+
+    Arg1Flags = evaluator<Arg1>::Flags,
+    Arg2Flags = evaluator<Arg2>::Flags,
+    Arg3Flags = evaluator<Arg3>::Flags,
+    SameType = is_same<typename Arg1::Scalar,typename Arg2::Scalar>::value && is_same<typename Arg1::Scalar,typename Arg3::Scalar>::value,
+    StorageOrdersAgree = (int(Arg1Flags)&RowMajorBit)==(int(Arg2Flags)&RowMajorBit) && (int(Arg1Flags)&RowMajorBit)==(int(Arg3Flags)&RowMajorBit),
+    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) & (
+        HereditaryBits
+        | (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<TernaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(
+        EIGEN_PLAIN_ENUM_MIN(evaluator<Arg1>::Alignment, evaluator<Arg2>::Alignment),
+        evaluator<Arg3>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.arg2Impl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.arg3Impl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(index),
+                               m_d.arg2Impl.template packet<LoadMode,PacketType>(index),
+                               m_d.arg3Impl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TernaryOp& func() const { return op; }
+    TernaryOp op;
+    evaluator<Arg1> arg1Impl;
+    evaluator<Arg2> arg2Impl;
+    evaluator<Arg3> arg3Impl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseBinaryOp --------------------
+
+// this is a binary expression
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+  : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.rhsImpl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(index),
+                               m_d.rhsImpl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const BinaryOp& func() const { return op; }
+    BinaryOp op;
+    evaluator<Lhs> lhsImpl;
+    evaluator<Rhs> rhsImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseUnaryView --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
+  : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+
+    Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_d.func()(m_d.argImpl.coeffRef(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_d.func()(m_d.argImpl.coeffRef(index));
+  }
+
+protected:
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- Map --------------------
+
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator;
+
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived>
+{
+  typedef Derived  XprType;
+  typedef typename XprType::PointerType PointerType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::RowsAtCompileTime,
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit mapbase_evaluator(const XprType& map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
+  {
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return internal::ploadt<PacketType, LoadMode>(ptr);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  }
+protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowStride() const EIGEN_NOEXCEPT {
+    return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colStride() const EIGEN_NOEXCEPT {
+     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
+  }
+
+  PointerType m_data;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+};
+
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
+  : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
+{
+  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+
+    PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
+    LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
+    Flags = int( evaluator<PlainObjectType>::Flags) & (LinearAccessMask&PacketAccessMask),
+
+    Alignment = int(MapOptions)&int(AlignedMask)
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
+    : mapbase_evaluator<XprType, PlainObjectType>(map)
+  { }
+};
+
+// -------------------- Ref --------------------
+
+template<typename PlainObjectType, int RefOptions, typename StrideType>
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
+  : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
+{
+  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& ref)
+    : mapbase_evaluator<XprType, PlainObjectType>(ref)
+  { }
+};
+
+// -------------------- Block --------------------
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
+         bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+  : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+
+    ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+               : ArgTypeIsRowMajor,
+    HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(inner_stride_at_compile_time<ArgType>::ret)
+                             : int(outer_stride_at_compile_time<ArgType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(outer_stride_at_compile_time<ArgType>::ret)
+                             : int(inner_stride_at_compile_time<ArgType>::ret),
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
+
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
+    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                           DirectAccessBit |
+                                           MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+                             && (OuterStrideAtCompileTime!=0)
+                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
+  };
+  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+// no direct-access => dispatch to a unary evaluator
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
+  : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit block_evaluator(const XprType& block)
+    : unary_evaluator<XprType>(block)
+  {}
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
+  : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& block)
+    : m_argImpl(block.nestedExpression()),
+      m_startRow(block.startRow()),
+      m_startCol(block.startCol()),
+      m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
+  { }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
+    else
+      return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                         RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
+    else
+      return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                              RowsAtCompileTime == 1 ? index : 0,
+                                              x);
+  }
+
+protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const
+  {
+    return m_argImpl.coeff(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const
+  {
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */)
+  {
+    return m_argImpl.coeffRef(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */)
+  {
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  evaluator<ArgType> m_argImpl;
+  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
+};
+
+// TODO: This evaluator does not actually use the child evaluator;
+// all action is via the data() as returned by the Block expression.
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
+  : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit block_evaluator(const XprType& block)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block)
+  {
+    // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+  }
+};
+
+
+// -------------------- Select --------------------
+// NOTE shall we introduce a ternary_evaluator?
+
+// TODO enable vectorization for Select
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+  : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+{
+  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
+  enum {
+    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost
+                  + EIGEN_PLAIN_ENUM_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
+                                         evaluator<ElseMatrixType>::CoeffReadCost),
+
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
+
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& select)
+    : m_conditionImpl(select.conditionMatrix()),
+      m_thenImpl(select.thenMatrix()),
+      m_elseImpl(select.elseMatrix())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (m_conditionImpl.coeff(row, col))
+      return m_thenImpl.coeff(row, col);
+    else
+      return m_elseImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    if (m_conditionImpl.coeff(index))
+      return m_thenImpl.coeff(index);
+    else
+      return m_elseImpl.coeff(index);
+  }
+
+protected:
+  evaluator<ConditionMatrixType> m_conditionImpl;
+  evaluator<ThenMatrixType> m_thenImpl;
+  evaluator<ElseMatrixType> m_elseImpl;
+};
+
+
+// -------------------- Replicate --------------------
+
+template<typename ArgType, int RowFactor, int ColFactor>
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
+  : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
+{
+  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  enum {
+    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
+  };
+  typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
+
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& replicate)
+    : m_arg(replicate.nestedExpression()),
+      m_argImpl(m_arg),
+      m_rows(replicate.nestedExpression().rows()),
+      m_cols(replicate.nestedExpression().cols())
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+
+    return m_argImpl.coeff(actual_row, actual_col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.coeff(actual_index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
+  }
+
+protected:
+  const ArgTypeNested m_arg;
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
+};
+
+// -------------------- MatrixWrapper and ArrayWrapper --------------------
+//
+// evaluator_wrapper_base<T> is a common base class for the
+// MatrixWrapper and ArrayWrapper evaluators.
+
+template<typename XprType>
+struct evaluator_wrapper_base
+  : evaluator_base<XprType>
+{
+  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename ArgType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(row, col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(row, col, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+template<typename TArgType>
+struct unary_evaluator<MatrixWrapper<TArgType> >
+  : evaluator_wrapper_base<MatrixWrapper<TArgType> >
+{
+  typedef MatrixWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+template<typename TArgType>
+struct unary_evaluator<ArrayWrapper<TArgType> >
+  : evaluator_wrapper_base<ArrayWrapper<TArgType> >
+{
+  typedef ArrayWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+
+// -------------------- Reverse --------------------
+
+// defined in Reverse.h:
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond;
+
+template<typename ArgType, int Direction>
+struct unary_evaluator<Reverse<ArgType, Direction> >
+  : evaluator_base<Reverse<ArgType, Direction> >
+{
+  typedef Reverse<ArgType, Direction> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::IsRowMajor,
+    IsColMajor = !IsRowMajor,
+    ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+    ReversePacket = (Direction == BothDirections)
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor),
+
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                  || ((ReverseRow && XprType::ColsAtCompileTime==1) || (ReverseCol && XprType::RowsAtCompileTime==1))
+                 ? LinearAccessBit : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+
+    Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& reverse)
+    : m_argImpl(reverse.nestedExpression()),
+      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
+      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
+                           ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
+                              ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    return reverse_packet::run(m_argImpl.template packet<LoadMode,PacketType>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    return preverse(m_argImpl.template packet<LoadMode,PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    // FIXME we could factorize some code with packet(i,j)
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    m_argImpl.template writePacket<LoadMode>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col,
+                                  reverse_packet::run(x));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    m_argImpl.template writePacket<LoadMode>
+      (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+
+  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
+  // Nonetheless, in this case it is important to set to 1 such that the coeff(index) method works fine for vectors.
+  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 1> m_rows;
+  const variable_if_dynamic<Index, ReverseCol ? ArgType::ColsAtCompileTime : 1> m_cols;
+};
+
+
+// -------------------- Diagonal --------------------
+
+template<typename ArgType, int DiagIndex>
+struct evaluator<Diagonal<ArgType, DiagIndex> >
+  : evaluator_base<Diagonal<ArgType, DiagIndex> >
+{
+  typedef Diagonal<ArgType, DiagIndex> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& diagonal)
+    : m_argImpl(diagonal.nestedExpression()),
+      m_index(diagonal.index())
+  { }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index) const
+  {
+    return m_argImpl.coeff(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index + rowOffset(), index + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index)
+  {
+    return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+
+private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+};
+
+
+//----------------------------------------------------------------------
+// deprecated code
+//----------------------------------------------------------------------
+
 // -------------------- EvalToTemp --------------------
 
+// expression class for evaluating nested expression to a temporary
+
+template<typename ArgType> class EvalToTemp;
+
 template<typename ArgType>
 struct traits<EvalToTemp<ArgType> >
   : public traits<ArgType>
@@ -217,25 +1682,25 @@
   : public dense_xpr_base<EvalToTemp<ArgType> >::type
 {
  public:
- 
+
   typedef typename dense_xpr_base<EvalToTemp>::type Base;
   EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
- 
-  EvalToTemp(const ArgType& arg)
+
+  explicit EvalToTemp(const ArgType& arg)
     : m_arg(arg)
   { }
- 
+
   const ArgType& arg() const
   {
     return m_arg;
   }
 
-  Index rows() const 
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT
   {
     return m_arg.rows();
   }
 
-  Index cols() const 
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT
   {
     return m_arg.cols();
   }
@@ -243,931 +1708,30 @@
  private:
   const ArgType& m_arg;
 };
- 
-template<typename ArgType>
-struct evaluator_impl<EvalToTemp<ArgType> >
-{
-  typedef EvalToTemp<ArgType> XprType;
-  typedef typename ArgType::PlainObject PlainObject;
 
-  evaluator_impl(const XprType& xpr) 
-    : m_result(xpr.rows(), xpr.cols()), m_resultImpl(m_result)
+template<typename ArgType>
+struct evaluator<EvalToTemp<ArgType> >
+  : public evaluator<typename ArgType::PlainObject>
+{
+  typedef EvalToTemp<ArgType>                   XprType;
+  typedef typename ArgType::PlainObject         PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.arg())
   {
-    // TODO we should simply do m_result(xpr.arg());
-    call_dense_assignment_loop(m_result, xpr.arg());
+    ::new (static_cast<Base*>(this)) Base(m_result);
   }
 
   // This constructor is used when nesting an EvalTo evaluator in another evaluator
-  evaluator_impl(const ArgType& arg) 
-    : m_result(arg.rows(), arg.cols()), m_resultImpl(m_result)
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
+    : m_result(arg)
   {
-    // TODO we should simply do m_result(xpr.arg());
-    call_dense_assignment_loop(m_result, arg);
-  }
-
-  typedef typename PlainObject::Index Index;
-  typedef typename PlainObject::Scalar Scalar;
-  typedef typename PlainObject::CoeffReturnType CoeffReturnType;
-  typedef typename PlainObject::PacketScalar PacketScalar;
-  typedef typename PlainObject::PacketReturnType PacketReturnType;
-
-  // All other functions are forwarded to m_resultImpl
-
-  CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_resultImpl.coeff(row, col); 
-  }
-  
-  CoeffReturnType coeff(Index index) const 
-  { 
-    return m_resultImpl.coeff(index); 
-  }
-  
-  Scalar& coeffRef(Index row, Index col) 
-  { 
-    return m_resultImpl.coeffRef(row, col); 
-  }
-  
-  Scalar& coeffRef(Index index) 
-  { 
-    return m_resultImpl.coeffRef(index); 
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const
-  {
-    return m_resultImpl.template packet<LoadMode>(row, col);
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const
-  {
-    return m_resultImpl.packet<LoadMode>(index);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
-  {
-    m_resultImpl.template writePacket<StoreMode>(row, col, x);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
-  {
-    m_resultImpl.template writePacket<StoreMode>(index, x);
+    ::new (static_cast<Base*>(this)) Base(m_result);
   }
 
 protected:
   PlainObject m_result;
-  typename evaluator<PlainObject>::nestedType m_resultImpl;
-};
-
-// -------------------- Transpose --------------------
-
-template<typename ArgType>
-struct evaluator_impl<Transpose<ArgType> >
-  : evaluator_impl_base<Transpose<ArgType> >
-{
-  typedef Transpose<ArgType> XprType;
-
-  evaluator_impl(const XprType& t) : m_argImpl(t.nestedExpression()) {}
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_argImpl.coeff(col, row);
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_argImpl.coeff(index);
-  }
-
-  Scalar& coeffRef(Index row, Index col)
-  {
-    return m_argImpl.coeffRef(col, row);
-  }
-
-  typename XprType::Scalar& coeffRef(Index index)
-  {
-    return m_argImpl.coeffRef(index);
-  }
-
-  template<int LoadMode>
-  PacketReturnType packet(Index row, Index col) const
-  {
-    return m_argImpl.template packet<LoadMode>(col, row);
-  }
-
-  template<int LoadMode>
-  PacketReturnType packet(Index index) const
-  {
-    return m_argImpl.template packet<LoadMode>(index);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
-  {
-    m_argImpl.template writePacket<StoreMode>(col, row, x);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
-  {
-    m_argImpl.template writePacket<StoreMode>(index, x);
-  }
-
-protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-};
-
-// -------------------- CwiseNullaryOp --------------------
-
-template<typename NullaryOp, typename PlainObjectType>
-struct evaluator_impl<CwiseNullaryOp<NullaryOp,PlainObjectType> >
-{
-  typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
-
-  evaluator_impl(const XprType& n) 
-    : m_functor(n.functor()) 
-  { }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_functor(row, col);
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_functor(index);
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
-  {
-    return m_functor.packetOp(row, col);
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index index) const
-  {
-    return m_functor.packetOp(index);
-  }
-
-protected:
-  const NullaryOp m_functor;
-};
-
-// -------------------- CwiseUnaryOp --------------------
-
-template<typename UnaryOp, typename ArgType>
-struct evaluator_impl<CwiseUnaryOp<UnaryOp, ArgType> >
-{
-  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
-
-  evaluator_impl(const XprType& op) 
-    : m_functor(op.functor()), 
-      m_argImpl(op.nestedExpression()) 
-  { }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_functor(m_argImpl.coeff(row, col));
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_functor(m_argImpl.coeff(index));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
-  {
-    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(row, col));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index index) const
-  {
-    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
-  }
-
-protected:
-  const UnaryOp m_functor;
-  typename evaluator<ArgType>::nestedType m_argImpl;
-};
-
-// -------------------- CwiseBinaryOp --------------------
-
-template<typename BinaryOp, typename Lhs, typename Rhs>
-struct evaluator_impl<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
-{
-  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
-
-  evaluator_impl(const XprType& xpr) 
-    : m_functor(xpr.functor()),
-      m_lhsImpl(xpr.lhs()), 
-      m_rhsImpl(xpr.rhs())  
-  { }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
-  {
-    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode>(row, col),
-			      m_rhsImpl.template packet<LoadMode>(row, col));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index index) const
-  {
-    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode>(index),
-			      m_rhsImpl.template packet<LoadMode>(index));
-  }
-
-protected:
-  const BinaryOp m_functor;
-  typename evaluator<Lhs>::nestedType m_lhsImpl;
-  typename evaluator<Rhs>::nestedType m_rhsImpl;
-};
-
-// -------------------- CwiseTernaryOp --------------------
-
-template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
-struct evaluator_impl<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
-{
-  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
-
-  evaluator_impl(const XprType& xpr)
-    : m_functor(xpr.functor()),
-      m_arg1Impl(xpr.arg1()),
-      m_arg2Impl(xpr.arg2()),
-      m_arg3Impl(xpr.arg3())
-  { }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_functor(m_arg1Impl.coeff(row, col),
-                     m_arg2Impl.coeff(row, col),
-                     m_arg3Impl.coeff(row, col));
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_functor(m_arg1Impl.coeff(index),
-                     m_arg2Impl.coeff(index),
-                     m_arg3Impl.coeff(index));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
-  {
-    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(row, col),
-                              m_arg2Impl.template packet<LoadMode>(row, col),
-                              m_arg3Impl.template packet<LoadMode>(row, col));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index index) const
-  {
-    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
-                              m_arg2Impl.template packet<LoadMode>(index),
-                              m_arg3Impl.template packet<LoadMode>(index));
-  }
-
-protected:
-  const TernaryOp m_functor;
-  typename evaluator<Arg1>::nestedType m_arg1Impl;
-  typename evaluator<Arg2>::nestedType m_arg2Impl;
-  typename evaluator<Arg3>::nestedType m_arg3Impl;
-};
-
-
-// -------------------- CwiseUnaryView --------------------
-
-template<typename UnaryOp, typename ArgType>
-struct evaluator_impl<CwiseUnaryView<UnaryOp, ArgType> >
-  : evaluator_impl_base<CwiseUnaryView<UnaryOp, ArgType> >
-{
-  typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
-
-  evaluator_impl(const XprType& op) 
-    : m_unaryOp(op.functor()), 
-      m_argImpl(op.nestedExpression()) 
-  { }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_unaryOp(m_argImpl.coeff(row, col));
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_unaryOp(m_argImpl.coeff(index));
-  }
-
-  Scalar& coeffRef(Index row, Index col)
-  {
-    return m_unaryOp(m_argImpl.coeffRef(row, col));
-  }
-
-  Scalar& coeffRef(Index index)
-  {
-    return m_unaryOp(m_argImpl.coeffRef(index));
-  }
-
-protected:
-  const UnaryOp m_unaryOp;
-  typename evaluator<ArgType>::nestedType m_argImpl;
-};
-
-// -------------------- Map --------------------
-
-template<typename Derived, int AccessorsType>
-struct evaluator_impl<MapBase<Derived, AccessorsType> >
-  : evaluator_impl_base<Derived>
-{
-  typedef MapBase<Derived, AccessorsType> MapType;
-  typedef Derived XprType;
-
-  typedef typename XprType::PointerType PointerType;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  
-  evaluator_impl(const XprType& map) 
-    : m_data(const_cast<PointerType>(map.data())),  
-      m_rowStride(map.rowStride()),
-      m_colStride(map.colStride())
-  { }
- 
-  enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
-  };
- 
-  CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_data[col * m_colStride + row * m_rowStride];
-  }
-  
-  CoeffReturnType coeff(Index index) const 
-  { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index,
-		 RowsAtCompileTime == 1 ? index : 0);
-  }
-
-  Scalar& coeffRef(Index row, Index col) 
-  { 
-    return m_data[col * m_colStride + row * m_rowStride];
-  }
-  
-  Scalar& coeffRef(Index index) 
-  { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index,
-		    RowsAtCompileTime == 1 ? index : 0);
-  }
- 
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const 
-  { 
-    PointerType ptr = m_data + row * m_rowStride + col * m_colStride;
-    return internal::ploadt<PacketScalar, LoadMode>(ptr);
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const 
-  { 
-    return packet<LoadMode>(RowsAtCompileTime == 1 ? 0 : index,
-			    RowsAtCompileTime == 1 ? index : 0);
-  }
-  
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x) 
-  { 
-    PointerType ptr = m_data + row * m_rowStride + col * m_colStride;
-    return internal::pstoret<Scalar, PacketScalar, StoreMode>(ptr, x);
-  }
-  
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x) 
-  { 
-    return writePacket<StoreMode>(RowsAtCompileTime == 1 ? 0 : index,
-				  RowsAtCompileTime == 1 ? index : 0,
-				  x);
-  }
- 
-protected:
-  PointerType m_data;
-  int m_rowStride;
-  int m_colStride;
-};
-
-template<typename PlainObjectType, int MapOptions, typename StrideType> 
-struct evaluator_impl<Map<PlainObjectType, MapOptions, StrideType> >
-  : public evaluator_impl<MapBase<Map<PlainObjectType, MapOptions, StrideType> > >
-{
-  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
-
-  evaluator_impl(const XprType& map) 
-    : evaluator_impl<MapBase<XprType> >(map) 
-  { }
-};
-
-// -------------------- Block --------------------
-
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
-         bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
-         
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
-struct evaluator_impl<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
-  : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
-{
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
-  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
-  evaluator_impl(const XprType& block) : block_evaluator_type(block) {}
-};
-
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
-struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
-  : evaluator_impl_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
-{
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
-
-  block_evaluator(const XprType& block) 
-    : m_argImpl(block.nestedExpression()), 
-      m_startRow(block.startRow()), 
-      m_startCol(block.startCol()) 
-  { }
- 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
-  enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
-  };
- 
-  CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
-  }
-  
-  CoeffReturnType coeff(Index index) const 
-  { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index,
-		 RowsAtCompileTime == 1 ? index : 0);
-  }
-
-  Scalar& coeffRef(Index row, Index col) 
-  { 
-    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
-  }
-  
-  Scalar& coeffRef(Index index) 
-  { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index,
-		    RowsAtCompileTime == 1 ? index : 0);
-  }
- 
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const 
-  { 
-    return m_argImpl.template packet<LoadMode>(m_startRow.value() + row, m_startCol.value() + col); 
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const 
-  { 
-    return packet<LoadMode>(RowsAtCompileTime == 1 ? 0 : index,
-			    RowsAtCompileTime == 1 ? index : 0);
-  }
-  
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x) 
-  { 
-    return m_argImpl.template writePacket<StoreMode>(m_startRow.value() + row, m_startCol.value() + col, x); 
-  }
-  
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x) 
-  { 
-    return writePacket<StoreMode>(RowsAtCompileTime == 1 ? 0 : index,
-				  RowsAtCompileTime == 1 ? index : 0,
-				  x);
-  }
- 
-protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-};
-
-// TODO: This evaluator does not actually use the child evaluator; 
-// all action is via the data() as returned by the Block expression.
-
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
-struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
-  : evaluator_impl<MapBase<Block<ArgType, BlockRows, BlockCols, InnerPanel> > >
-{
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
-
-  block_evaluator(const XprType& block) 
-    : evaluator_impl<MapBase<XprType> >(block) 
-  { }
-};
-
-
-// -------------------- Select --------------------
-
-template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct evaluator_impl<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
-{
-  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
-
-  evaluator_impl(const XprType& select) 
-    : m_conditionImpl(select.conditionMatrix()),
-      m_thenImpl(select.thenMatrix()),
-      m_elseImpl(select.elseMatrix())
-  { }
- 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    if (m_conditionImpl.coeff(row, col))
-      return m_thenImpl.coeff(row, col);
-    else
-      return m_elseImpl.coeff(row, col);
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    if (m_conditionImpl.coeff(index))
-      return m_thenImpl.coeff(index);
-    else
-      return m_elseImpl.coeff(index);
-  }
- 
-protected:
-  typename evaluator<ConditionMatrixType>::nestedType m_conditionImpl;
-  typename evaluator<ThenMatrixType>::nestedType m_thenImpl;
-  typename evaluator<ElseMatrixType>::nestedType m_elseImpl;
-};
-
-
-// -------------------- Replicate --------------------
-
-template<typename ArgType, int RowFactor, int ColFactor> 
-struct evaluator_impl<Replicate<ArgType, RowFactor, ColFactor> >
-{
-  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
-
-  evaluator_impl(const XprType& replicate) 
-    : m_argImpl(replicate.nestedExpression()),
-      m_rows(replicate.nestedExpression().rows()),
-      m_cols(replicate.nestedExpression().cols())
-  { }
- 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    // try to avoid using modulo; this is a pure optimization strategy
-    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
-                           : RowFactor==1 ? row
-                           : row % m_rows.value();
-    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
-                           : ColFactor==1 ? col
-                           : col % m_cols.value();
-    
-    return m_argImpl.coeff(actual_row, actual_col);
-  }
-
-  template<int LoadMode>
-  PacketReturnType packet(Index row, Index col) const
-  {
-    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
-                           : RowFactor==1 ? row
-                           : row % m_rows.value();
-    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
-                           : ColFactor==1 ? col
-                           : col % m_cols.value();
-
-    return m_argImpl.template packet<LoadMode>(actual_row, actual_col);
-  }
- 
-protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-  const variable_if_dynamic<Index, XprType::RowsAtCompileTime> m_rows;
-  const variable_if_dynamic<Index, XprType::ColsAtCompileTime> m_cols;
-};
-
-
-// -------------------- PartialReduxExpr --------------------
-//
-// This is a wrapper around the expression object. 
-// TODO: Find out how to write a proper evaluator without duplicating
-//       the row() and col() member functions.
-
-template< typename ArgType, typename MemberOp, int Direction>
-struct evaluator_impl<PartialReduxExpr<ArgType, MemberOp, Direction> >
-{
-  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
-
-  evaluator_impl(const XprType expr)
-    : m_expr(expr)
-  { }
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
- 
-  CoeffReturnType coeff(Index row, Index col) const 
-  { 
-    return m_expr.coeff(row, col);
-  }
-  
-  CoeffReturnType coeff(Index index) const 
-  { 
-    return m_expr.coeff(index);
-  }
-
-protected:
-  const XprType m_expr;
-};
-
-
-// -------------------- MatrixWrapper and ArrayWrapper --------------------
-//
-// evaluator_impl_wrapper_base<T> is a common base class for the
-// MatrixWrapper and ArrayWrapper evaluators.
-
-template<typename XprType>
-struct evaluator_impl_wrapper_base
-  : evaluator_impl_base<XprType>
-{
-  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
-
-  evaluator_impl_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
-
-  typedef typename ArgType::Index Index;
-  typedef typename ArgType::Scalar Scalar;
-  typedef typename ArgType::CoeffReturnType CoeffReturnType;
-  typedef typename ArgType::PacketScalar PacketScalar;
-  typedef typename ArgType::PacketReturnType PacketReturnType;
-
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_argImpl.coeff(row, col);
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_argImpl.coeff(index);
-  }
-
-  Scalar& coeffRef(Index row, Index col)
-  {
-    return m_argImpl.coeffRef(row, col);
-  }
-
-  Scalar& coeffRef(Index index)
-  {
-    return m_argImpl.coeffRef(index);
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const
-  {
-    return m_argImpl.template packet<LoadMode>(row, col);
-  }
-
-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const
-  {
-    return m_argImpl.template packet<LoadMode>(index);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
-  {
-    m_argImpl.template writePacket<StoreMode>(row, col, x);
-  }
-
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
-  {
-    m_argImpl.template writePacket<StoreMode>(index, x);
-  }
-
-protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-};
-
-template<typename TArgType>
-struct evaluator_impl<MatrixWrapper<TArgType> >
-  : evaluator_impl_wrapper_base<MatrixWrapper<TArgType> >
-{
-  typedef MatrixWrapper<TArgType> XprType;
-
-  evaluator_impl(const XprType& wrapper) 
-    : evaluator_impl_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
-  { }
-};
-
-template<typename TArgType>
-struct evaluator_impl<ArrayWrapper<TArgType> >
-  : evaluator_impl_wrapper_base<ArrayWrapper<TArgType> >
-{
-  typedef ArrayWrapper<TArgType> XprType;
-
-  evaluator_impl(const XprType& wrapper) 
-    : evaluator_impl_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
-  { }
-};
-
-
-// -------------------- Reverse --------------------
-
-// defined in Reverse.h:
-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond;
-
-template<typename ArgType, int Direction>
-struct evaluator_impl<Reverse<ArgType, Direction> >
-  : evaluator_impl_base<Reverse<ArgType, Direction> >
-{
-  typedef Reverse<ArgType, Direction> XprType;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
-  enum {
-    PacketSize = internal::packet_traits<Scalar>::size,
-    IsRowMajor = XprType::IsRowMajor,
-    IsColMajor = !IsRowMajor,
-    ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
-    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
-    OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
-    OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
-    ReversePacket = (Direction == BothDirections)
-                    || ((Direction == Vertical)   && IsColMajor)
-                    || ((Direction == Horizontal) && IsRowMajor)
-  };
-  typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
-
-  evaluator_impl(const XprType& reverse) 
-    : m_argImpl(reverse.nestedExpression()),
-      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 0),
-      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 0)
-  { }
- 
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
-			   ReverseCol ? m_cols.value() - col - 1 : col);
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
-  }
-
-  Scalar& coeffRef(Index row, Index col)
-  {
-    return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
-			      ReverseCol ? m_cols.value() - col - 1 : col);
-  }
-
-  Scalar& coeffRef(Index index)
-  {
-    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
-  {
-    return reverse_packet::run(m_argImpl.template packet<LoadMode>(
-                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
-                                  ReverseCol ? m_cols.value() - col - OffsetCol : col));
-  }
-
-  template<int LoadMode>
-  PacketScalar packet(Index index) const
-  {
-    return preverse(m_argImpl.template packet<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize));
-  }
-
-  template<int LoadMode>
-  void writePacket(Index row, Index col, const PacketScalar& x)
-  {
-    m_argImpl.template writePacket<LoadMode>(
-                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
-                                  ReverseCol ? m_cols.value() - col - OffsetCol : col,
-                                  reverse_packet::run(x));
-  }
-
-  template<int LoadMode>
-  void writePacket(Index index, const PacketScalar& x)
-  {
-    m_argImpl.template writePacket<LoadMode>
-      (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
-  }
- 
-protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-
-  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
-  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 0> m_rows;
-  const variable_if_dynamic<Index, ReverseCol ? ArgType::ColsAtCompileTime : 0> m_cols;
-};
-
-
-// -------------------- Diagonal --------------------
-
-template<typename ArgType, int DiagIndex>
-struct evaluator_impl<Diagonal<ArgType, DiagIndex> >
-  : evaluator_impl_base<Diagonal<ArgType, DiagIndex> >
-{
-  typedef Diagonal<ArgType, DiagIndex> XprType;
-
-  evaluator_impl(const XprType& diagonal) 
-    : m_argImpl(diagonal.nestedExpression()),
-      m_index(diagonal.index())
-  { }
- 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  CoeffReturnType coeff(Index row, Index) const
-  {
-    return m_argImpl.coeff(row + rowOffset(), row + colOffset());
-  }
-
-  CoeffReturnType coeff(Index index) const
-  {
-    return m_argImpl.coeff(index + rowOffset(), index + colOffset());
-  }
-
-  Scalar& coeffRef(Index row, Index)
-  {
-    return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
-  }
-
-  Scalar& coeffRef(Index index)
-  {
-    return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
-  }
-
-protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
-  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
-
-private:
-  EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
-  EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
 };
 
 } // namespace internal

diff --git a/Eigen/src/Core/CoreIterators.h b/Eigen/src/Core/CoreIterators.h
index 6da4683..b967196 100644
--- a/Eigen/src/Core/CoreIterators.h
+++ b/Eigen/src/Core/CoreIterators.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,47 +15,118 @@
 /* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
  */
 
-/** \ingroup SparseCore_Module
-  * \class InnerIterator
-  * \brief An InnerIterator allows to loop over the element of a sparse (or dense) matrix or expression
-  *
-  * todo
+namespace internal {
+
+template<typename XprType, typename EvaluatorKind>
+class inner_iterator_selector;
+
+}
+
+/** \class InnerIterator
+  * \brief An InnerIterator allows to loop over the element of any matrix expression.
+  * 
+  * \warning To be used with care because an evaluator is constructed every time an InnerIterator iterator is constructed.
+  * 
+  * TODO: add a usage example
   */
-
-// generic version for dense matrix and expressions
-template<typename Derived> class DenseBase<Derived>::InnerIterator
+template<typename XprType>
+class InnerIterator
 {
-  protected:
-    typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
+protected:
+  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
+  typedef internal::evaluator<XprType> EvaluatorType;
+  typedef typename internal::traits<XprType>::Scalar Scalar;
+public:
+  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
+  InnerIterator(const XprType &xpr, const Index &outerId)
+    : m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize())
+  {}
+  
+  /// \returns the value of the current coefficient.
+  EIGEN_STRONG_INLINE Scalar value() const          { return m_iter.value(); }
+  /** Increment the iterator \c *this to the next non-zero coefficient.
+    * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
+    */
+  EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) 
+  { InnerIterator result(*this); result+=i; return result; }
+    
 
-    enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
-      : m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
-    {}
-
-    EIGEN_STRONG_INLINE Scalar value() const
-    {
-      return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
-                          : m_expression.coeff(m_inner, m_outer);
-    }
-
-    EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_inner; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
-
-  protected:
-    const Derived& m_expression;
-    Index m_inner;
-    const Index m_outer;
-    const Index m_end;
+  /// \returns the column or row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
+  /// \returns the row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index row() const             { return m_iter.row(); }
+  /// \returns the column index of the current coefficient.
+  EIGEN_STRONG_INLINE Index col() const             { return m_iter.col(); }
+  /// \returns \c true if the iterator \c *this still references a valid coefficient.
+  EIGEN_STRONG_INLINE operator bool() const         { return m_iter; }
+  
+protected:
+  EvaluatorType m_eval;
+  IteratorType m_iter;
+private:
+  // If you get here, then you're not using the right InnerIterator type, e.g.:
+  //   SparseMatrix<double,RowMajor> A;
+  //   SparseMatrix<double>::InnerIterator it(A,0);
+  template<typename T> InnerIterator(const EigenBase<T>&,Index outer);
 };
 
+namespace internal {
+
+// Generic inner iterator implementation for dense objects
+template<typename XprType>
+class inner_iterator_selector<XprType, IndexBased>
+{
+protected:
+  typedef evaluator<XprType> EvaluatorType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
+    : m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize)
+  {}
+
+  EIGEN_STRONG_INLINE Scalar value() const
+  {
+    return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner)
+                        : m_eval.coeff(m_inner, m_outer);
+  }
+
+  EIGEN_STRONG_INLINE inner_iterator_selector& operator++() { m_inner++; return *this; }
+
+  EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+  inline Index row() const { return IsRowMajor ? m_outer : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+  EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+protected:
+  const EvaluatorType& m_eval;
+  Index m_inner;
+  const Index m_outer;
+  const Index m_end;
+};
+
+// For iterator-based evaluator, inner-iterator is already implemented as
+// evaluator<>::InnerIterator
+template<typename XprType>
+class inner_iterator_selector<XprType, IteratorBased>
+ : public evaluator<XprType>::InnerIterator
+{
+protected:
+  typedef typename evaluator<XprType>::InnerIterator Base;
+  typedef evaluator<XprType> EvaluatorType;
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
+    : Base(eval, outerId)
+  {}  
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_COREITERATORS_H

diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index dd89d26..2202b1c 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -13,26 +13,6 @@
 
 namespace Eigen {
 
-/** \class CwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
-  *
-  * \param BinaryOp template functor implementing the operator
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  *
-  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
-  * It is the return type of binary operators, by which we mean only those binary operators where
-  * both the left-hand side and the right-hand side are Eigen expressions.
-  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseBinaryOp types explicitly.
-  *
-  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
-  */
-
 namespace internal {
 template<typename BinaryOp, typename Lhs, typename Rhs>
 struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
@@ -52,78 +32,82 @@
   // we still want to handle the case when the result type is different.
   typedef typename result_of<
                      BinaryOp(
-                       typename Lhs::Scalar,
-                       typename Rhs::Scalar
+                       const typename Lhs::Scalar&,
+                       const typename Rhs::Scalar&
                      )
                    >::type Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
+                                              typename traits<Rhs>::StorageKind,
+                                              BinaryOp>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
   typedef typename Lhs::Nested LhsNested;
   typedef typename Rhs::Nested RhsNested;
   typedef typename remove_reference<LhsNested>::type _LhsNested;
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-    SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-    StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
-    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
-        HereditaryBits
-      | (int(LhsFlags) & int(RhsFlags) &
-           ( AlignedBit
-           | (StorageOrdersAgree ? LinearAccessBit : 0)
-           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
-           )
-        )
-     ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
   };
 };
 } // end namespace internal
 
-// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
-// that would take two operands of different types. If there were such an example, then this check should be
-// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
-// currently they take only one typename Scalar template parameter.
-// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
-// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
-// add together a float matrix and a double matrix.
-#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
-  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
-                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
-                        : int(internal::is_same<LHS, RHS>::value)), \
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
 template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
 class CwiseBinaryOpImpl;
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOp : internal::no_assignment_operator,
+/** \class CwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
+  *
+  * \tparam BinaryOp template functor implementing the operator
+  * \tparam LhsType the type of the left-hand side
+  * \tparam RhsType the type of the right-hand side
+  *
+  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
+  * It is the return type of binary operators, by which we mean only those binary operators where
+  * both the left-hand side and the right-hand side are Eigen expressions.
+  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseBinaryOp types explicitly.
+  *
+  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template<typename BinaryOp, typename LhsType, typename RhsType>
+class CwiseBinaryOp :
   public CwiseBinaryOpImpl<
-          BinaryOp, Lhs, Rhs,
-          typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                           typename internal::traits<Rhs>::StorageKind>::ret>
+          BinaryOp, LhsType, RhsType,
+          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                        typename internal::traits<RhsType>::StorageKind,
+                                                        BinaryOp>::ret>,
+  internal::no_assignment_operator
 {
   public:
 
+    typedef typename internal::remove_all<BinaryOp>::type Functor;
+    typedef typename internal::remove_all<LhsType>::type Lhs;
+    typedef typename internal::remove_all<RhsType>::type Rhs;
+
     typedef typename CwiseBinaryOpImpl<
-        BinaryOp, Lhs, Rhs,
-        typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                         typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
+        BinaryOp, LhsType, RhsType,
+        typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                      typename internal::traits<Rhs>::StorageKind,
+                                                      BinaryOp>::ret>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
 
-    typedef typename internal::nested<Lhs>::type LhsNested;
-    typedef typename internal::nested<Rhs>::type RhsNested;
+    typedef typename internal::ref_selector<LhsType>::type LhsNested;
+    typedef typename internal::ref_selector<RhsType>::type RhsNested;
     typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
     typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
+    //Required for Visual Studio or the Copy constructor will probably not get inlined!
+    EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
     {
       EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
@@ -132,31 +116,25 @@
       eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT {
       // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
-        return m_rhs.rows();
-      else
-        return m_lhs.rows();
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows();
     }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT {
       // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
-        return m_rhs.cols();
-      else
-        return m_lhs.cols();
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols();
     }
 
     /** \returns the left hand side nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const _LhsNested& lhs() const { return m_lhs; }
     /** \returns the right hand side nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const _RhsNested& rhs() const { return m_rhs; }
     /** \returns the functor representing the binary operation */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const BinaryOp& functor() const { return m_functor; }
 
   protected:
@@ -165,43 +143,13 @@
     const BinaryOp m_functor;
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
-  : public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+// Generic API dispatcher
+template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl
+  : public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
 {
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
-  public:
-
-    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().lhs().coeff(rowId, colId),
-                                 derived().rhs().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
-                                          derived().rhs().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().lhs().coeff(index),
-                                 derived().rhs().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
-                                          derived().rhs().template packet<LoadMode>(index));
-    }
+public:
+  typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
 };
 
 /** replaces \c *this by \c *this - \a other.
@@ -213,8 +161,7 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -227,8 +174,7 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 

diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index f96ef66..289ec51 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h

@@ -12,13 +12,24 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename NullaryOp, typename PlainObjectType>
+struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
+{
+  enum {
+    Flags = traits<PlainObjectType>::Flags & RowMajorBit
+  };
+};
+
+} // namespace internal
+
 /** \class CwiseNullaryOp
   * \ingroup Core_Module
   *
   * \brief Generic expression of a matrix where all coefficients are defined by a functor
   *
-  * \param NullaryOp template functor implementing the operator
-  * \param PlainObjectType the underlying plain matrix/array type
+  * \tparam NullaryOp template functor implementing the operator
+  * \tparam PlainObjectType the underlying plain matrix/array type
   *
   * This class represents an expression of a generic nullary operator.
   * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
@@ -27,27 +38,26 @@
   * However, if you want to write a function returning such an expression, you
   * will need to use this class.
   *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
+  * The functor NullaryOp must expose one of the following method:
+    <table class="manual">
+    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries (e.g., random numbers)</td></tr>
+    <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes sense for vectors only and that it depends on the coefficient index \c i (e.g., linspace) </td></tr>
+    <tr            ><td>\c operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g., to generate a checkerboard with 0 and 1)</td></tr>
+    </table>
+  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized for vectors.
+  *
+  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
+  * C++11 random number generators.
+  *
+  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
+  * that cannot be covered by the existing set of natively supported matrix manipulations.
+  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
+  * on the behavior of CwiseNullaryOp.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
   */
-
-namespace internal {
 template<typename NullaryOp, typename PlainObjectType>
-struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
-{
-  enum {
-    Flags = (traits<PlainObjectType>::Flags
-      & (  HereditaryBits
-         | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
-         | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
-      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
-    CoeffReadCost = functor_traits<NullaryOp>::Cost
-  };
-};
-}
-
-template<typename NullaryOp, typename PlainObjectType>
-class CwiseNullaryOp : internal::no_assignment_operator,
-  public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
+class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
 {
   public:
 
@@ -55,43 +65,19 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
 
     EIGEN_DEVICE_FUNC
-    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
-      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
+    CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+      : m_rows(rows), m_cols(cols), m_functor(func)
     {
-      eigen_assert(nbRows >= 0
-            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-            &&  nbCols >= 0
-            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
+      eigen_assert(rows >= 0
+            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+            &&  cols >= 0
+            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return m_functor(rowId, colId);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_functor.packetOp(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return m_functor(index);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return m_functor.packetOp(index);
-    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const { return m_cols.value(); }
 
     /** \returns the functor representing the nullary operation */
     EIGEN_DEVICE_FUNC
@@ -119,11 +105,15 @@
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp,typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp,PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -141,18 +131,22 @@
   *
   * Here is an example with C++11 random generators: \include random_cpp11.cpp
   * Output: \verbinclude random_cpp11.out
-  * 
+  *
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
-  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -166,20 +160,24 @@
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
 }
 
 /** \returns an expression of a constant matrix of value \a value
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this DenseBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
   * instead.
   *
   * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -187,11 +185,10 @@
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
 }
 
 /** \returns an expression of a constant matrix of value \a value
@@ -210,8 +207,7 @@
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -227,56 +223,45 @@
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
   return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
 }
 
-/**
-  * \brief Sets a linearly space vector.
-  *
-  * The function generates 'size' equally spaced values in the closed interval [low,high].
-  * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
-  * and yields faster code than the random access version.
-  *
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
   *
   * \only_for_vectors
   *
-  * Example: \include DenseBase_LinSpaced_seq.cpp
-  * Output: \verbinclude DenseBase_LinSpaced_seq.out
+  * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
+  * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
   *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Index,Scalar,Scalar), CwiseNullaryOp
+  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,false>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
 }
 
-/**
-  * \copydoc DenseBase::LinSpaced(Sequential_t, Index, const Scalar&, const Scalar&)
-  * Special version for fixed size types which does not require the size parameter.
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
+  *
+  * \sa LinSpaced(const Scalar&, const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,false>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -286,15 +271,24 @@
   * Example: \include DenseBase_LinSpaced.cpp
   * Output: \verbinclude DenseBase_LinSpaced.out
   *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Sequential_t,Index,const Scalar&,const Scalar&,Index), CwiseNullaryOp
+  * For integer scalar types, an even spacing is possible if and only if the length of the range,
+  * i.e., \c high-low is a scalar multiple of \c size-1, or if \c size is a scalar multiple of the
+  * number of values \c high-low+1 (meaning each value can be repeated the same number of time).
+  * If one of these two considions is not satisfied, then \c high is lowered to the largest value
+  * satisfying one of this constraint.
+  * Here are some examples:
+  *
+  * Example: \include DenseBase_LinSpacedInt.cpp
+  * Output: \verbinclude DenseBase_LinSpacedInt.out
+  *
+  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,true>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
 }
 
 /**
@@ -302,24 +296,23 @@
   * Special version for fixed size types which does not require the size parameter.
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,true>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
     for(Index i = 0; i < rows(); ++i)
-      if(!internal::isApprox(this->coeff(i, j), val, prec))
+      if(!internal::isApprox(self.coeff(i, j), val, prec))
         return false;
   return true;
 }
@@ -328,8 +321,7 @@
   *
   * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   return isApproxToConstant(val, prec);
@@ -340,24 +332,22 @@
   * \sa setConstant(), Constant(), class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
   setConstant(val);
 }
 
-/** Sets all coefficients in this expression to \a value.
+/** Sets all coefficients in this expression to value \a val.
   *
   * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
   return derived() = Constant(rows(), cols(), val);
 }
 
-/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value.
+/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
   *
   * \only_for_vectors
   *
@@ -367,18 +357,17 @@
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
   resize(size);
   return setConstant(val);
 }
 
-/** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
+/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   * \param val the value to which all coefficients are set
   *
   * Example: \include Matrix_setConstant_int_int.cpp
@@ -387,16 +376,42 @@
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(val);
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(NoChange_t, Index cols, const Scalar& val)
+{
+  return setConstant(rows(), cols, val);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index rows, NoChange_t, const Scalar& val)
+{
+  return setConstant(rows, cols(), val);
+}
+
+
 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -406,29 +421,33 @@
   * Example: \include DenseBase_setLinSpaced.cpp
   * Output: \verbinclude DenseBase_setLinSpaced.out
   *
-  * \sa CwiseNullaryOp
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,false>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
 }
 
 /**
-  * \brief Sets a linearly space vector.
+  * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills \c *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
   *
-  * \sa setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return setLinSpaced(size(), low, high);
@@ -451,11 +470,10 @@
   * \sa Zero(), Zero(Index)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(0));
+  return Constant(rows, cols, Scalar(0));
 }
 
 /** \returns an expression of a zero vector.
@@ -475,8 +493,7 @@
   * \sa Zero(), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
   return Constant(size, Scalar(0));
@@ -493,8 +510,7 @@
   * \sa Zero(Index), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
   return Constant(Scalar(0));
@@ -509,12 +525,12 @@
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
     for(Index i = 0; i < rows(); ++i)
-      if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
         return false;
   return true;
 }
@@ -527,8 +543,7 @@
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
   return setConstant(Scalar(0));
 }
@@ -543,8 +558,7 @@
   * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
   resize(newSize);
@@ -553,8 +567,8 @@
 
 /** Resizes to the given size, and sets all coefficients in this expression to zero.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setZero_int_int.cpp
   * Output: \verbinclude Matrix_setZero_int_int.out
@@ -562,19 +576,44 @@
   * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(Scalar(0));
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(NoChange_t, Index cols)
+{
+  return setZero(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index rows, NoChange_t)
+{
+  return setZero(rows, cols());
+}
+
 // ones:
 
 /** \returns an expression of a matrix where all coefficients equal one.
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -587,11 +626,10 @@
   * \sa Ones(), Ones(Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(1));
+  return Constant(rows, cols, Scalar(1));
 }
 
 /** \returns an expression of a vector where all coefficients equal one.
@@ -611,8 +649,7 @@
   * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
   return Constant(newSize, Scalar(1));
@@ -629,8 +666,7 @@
   * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
   return Constant(Scalar(1));
@@ -645,8 +681,7 @@
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
   return isApproxToConstant(Scalar(1), prec);
@@ -660,8 +695,7 @@
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
   return setConstant(Scalar(1));
 }
@@ -676,8 +710,7 @@
   * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
   resize(newSize);
@@ -686,8 +719,8 @@
 
 /** Resizes to the given size, and sets all coefficients in this expression to one.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setOnes_int_int.cpp
   * Output: \verbinclude Matrix_setOnes_int_int.out
@@ -695,19 +728,44 @@
   * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(Scalar(1));
 }
 
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index rows, NoChange_t)
+{
+  return setOnes(rows, cols());
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(NoChange_t, Index cols)
+{
+  return setOnes(rows(), cols);
+}
+
 // Identity:
 
 /** \returns an expression of the identity matrix (not necessarily square).
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -720,11 +778,10 @@
   * \sa Identity(), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
 }
 
 /** \returns an expression of the identity matrix (not necessarily square).
@@ -738,8 +795,7 @@
   * \sa Identity(Index,Index), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity()
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -759,18 +815,19 @@
 bool MatrixBase<Derived>::isIdentity
 (const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
   {
     for(Index i = 0; i < rows(); ++i)
     {
       if(i == j)
       {
-        if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
+        if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
           return false;
       }
       else
       {
-        if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
+        if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
           return false;
       }
     }
@@ -793,12 +850,11 @@
 template<typename Derived>
 struct setIdentity_impl<Derived, true>
 {
-  typedef typename Derived::Index Index;
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Derived& run(Derived& m)
   {
     m.setZero();
-    const Index size = (std::min)(m.rows(), m.cols());
+    const Index size = numext::mini(m.rows(), m.cols());
     for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
     return m;
   }
@@ -814,16 +870,15 @@
   * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 {
   return internal::setIdentity_impl<Derived>::run(derived());
 }
 
 /** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setIdentity_int_int.cpp
   * Output: \verbinclude Matrix_setIdentity_int_int.out
@@ -831,10 +886,9 @@
   * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
-  derived().resize(nbRows, nbCols);
+  derived().resize(rows, cols);
   return setIdentity();
 }
 
@@ -845,8 +899,7 @@
   * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -861,8 +914,7 @@
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -875,8 +927,7 @@
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
 { return Derived::Unit(0); }
 
 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -886,8 +937,7 @@
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
 { return Derived::Unit(1); }
 
 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -897,8 +947,7 @@
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
 { return Derived::Unit(2); }
 
 /** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -908,10 +957,45 @@
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }
 
+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+  *
+  * \param i index of the unique coefficient to be set to 1
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i<size());
+  derived().setZero();
+  derived().coeffRef(i) = Scalar(1);
+  return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+  *
+  * \param newSize the new size of the vector
+  * \param i index of the unique coefficient to be set to 1
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i<newSize);
+  derived().resize(newSize);
+  return setUnit(i);
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_NULLARY_OP_H

diff --git a/Eigen/src/Core/CwiseTernaryOp.h b/Eigen/src/Core/CwiseTernaryOp.h
index 6ce80ce..9f3576f 100644
--- a/Eigen/src/Core/CwiseTernaryOp.h
+++ b/Eigen/src/Core/CwiseTernaryOp.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
 //
@@ -14,30 +14,6 @@
 
 namespace Eigen {
 
-/** \class CwiseTernaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise ternary operator is
-  * applied to two expressions
-  *
-  * \tparam TernaryOp template functor implementing the operator
-  * \tparam Arg1 the type of the first argument
-  * \tparam Arg2 the type of the second argument
-  * \tparam Arg3 the type of the third argument
-  *
-  * This class represents an expression where a coefficient-wise ternary
-  * operator is applied to three expressions.
-  * It is the return type of ternary operators, by which we mean only those
-  * ternary operators where
-  * all three arguments are Eigen expressions.
-  * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
-  * CwiseTernaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically
-  * don't have to name
-  * CwiseTernaryOp types explicitly.
-  */
-
 namespace internal {
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
@@ -52,48 +28,23 @@
     MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
   };
 
-  // even though we require Arg1 and Arg3 to have the same scalar type (see
-  // CwiseTernaryOp constructor),
+  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
+  // (see CwiseTernaryOp constructor),
   // we still want to handle the case when the result type is different.
-  typedef
-      typename result_of<TernaryOp(typename Arg1::Scalar, typename Arg2::Scalar,
-                                   typename Arg3::Scalar)>::type Scalar;
+  typedef typename result_of<TernaryOp(
+      const typename Arg1::Scalar&, const typename Arg2::Scalar&,
+      const typename Arg3::Scalar&)>::type Scalar;
 
   typedef typename internal::traits<Arg1>::StorageKind StorageKind;
-  typedef typename internal::traits<Arg1>::Index Index;
+  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
 
   typedef typename Arg1::Nested Arg1Nested;
   typedef typename Arg2::Nested Arg2Nested;
   typedef typename Arg3::Nested Arg3Nested;
   typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
-  typedef typename remove_reference<Arg1Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
   typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
-  enum {
-    Arg1CoeffReadCost = _Arg1Nested::CoeffReadCost,
-    Arg2CoeffReadCost = _Arg2Nested::CoeffReadCost,
-    Arg3CoeffReadCost = _Arg3Nested::CoeffReadCost,
-    Arg1Flags = _Arg1Nested::Flags,
-    Arg2Flags = _Arg2Nested::Flags,
-    Arg3Flags = _Arg3Nested::Flags,
-    SameType12 = is_same<typename _Arg1Nested::Scalar,
-                         typename _Arg2Nested::Scalar>::value,
-    SameType13 = is_same<typename _Arg1Nested::Scalar,
-                         typename _Arg3Nested::Scalar>::value,
-    StorageOrdersAgree =
-        ((int(Arg1::Flags) & RowMajorBit) == (int(Arg2::Flags) & RowMajorBit) &&
-         (int(Arg1::Flags) & RowMajorBit) == (int(Arg3::Flags) & RowMajorBit)),
-    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) &
-             (HereditaryBits |
-              (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
-               (AlignedBit | (StorageOrdersAgree ? LinearAccessBit : 0) |
-                (functor_traits<TernaryOp>::PacketAccess &&
-                         StorageOrdersAgree && SameType12 && SameType13
-                     ? PacketAccessBit
-                     : 0)))),
-    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
-    CoeffReadCost = Arg1CoeffReadCost + Arg2CoeffReadCost + Arg3CoeffReadCost +
-                    functor_traits<TernaryOp>::Cost
-  };
+  enum { Flags = _Arg1Nested::Flags & RowMajorBit };
 };
 }  // end namespace internal
 
@@ -101,35 +52,78 @@
           typename StorageKind>
 class CwiseTernaryOpImpl;
 
-template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
-class CwiseTernaryOp
-    : internal::no_assignment_operator,
-      public CwiseTernaryOpImpl<TernaryOp, Arg1, Arg2, Arg3,
-                                typename internal::traits<Arg1>::StorageKind> {
+/** \class CwiseTernaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise ternary operator is
+ * applied to two expressions
+  *
+  * \tparam TernaryOp template functor implementing the operator
+  * \tparam Arg1Type the type of the first argument
+  * \tparam Arg2Type the type of the second argument
+  * \tparam Arg3Type the type of the third argument
+  *
+  * This class represents an expression where a coefficient-wise ternary
+ * operator is applied to three expressions.
+  * It is the return type of ternary operators, by which we mean only those
+ * ternary operators where
+  * all three arguments are Eigen expressions.
+  * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
+ * CwiseTernaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically
+ * don't have to name
+  * CwiseTernaryOp types explicitly.
+  *
+  * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
+ * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
+ * class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type,
+          typename Arg3Type>
+class CwiseTernaryOp : public CwiseTernaryOpImpl<
+                           TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                           typename internal::traits<Arg1Type>::StorageKind>,
+                       internal::no_assignment_operator
+{
  public:
+  typedef typename internal::remove_all<Arg1Type>::type Arg1;
+  typedef typename internal::remove_all<Arg2Type>::type Arg2;
+  typedef typename internal::remove_all<Arg3Type>::type Arg3;
+
   typedef typename CwiseTernaryOpImpl<
-      TernaryOp, Arg1, Arg2, Arg3,
-      typename internal::traits<Arg1>::StorageKind>::Base Base;
+      TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
   EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
 
-  typedef typename internal::nested<Arg1>::type Arg1Nested;
-  typedef typename internal::nested<Arg2>::type Arg2Nested;
-  typedef typename internal::nested<Arg3>::type Arg3Nested;
+  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
+  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
+  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
   typedef typename internal::remove_reference<Arg1Nested>::type _Arg1Nested;
   typedef typename internal::remove_reference<Arg2Nested>::type _Arg2Nested;
   typedef typename internal::remove_reference<Arg3Nested>::type _Arg3Nested;
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& arg1,
-                                     const Arg2& arg2,
-                                     const Arg3& arg3,
+  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2,
+                                     const Arg3& a3,
                                      const TernaryOp& func = TernaryOp())
-      : m_arg1(arg1), m_arg2(arg2), m_arg3(arg3), m_functor(func) {
+      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
     // require the sizes to match
     EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
     EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
-    eigen_assert(arg1.rows() == arg2.rows() && arg1.cols() == arg2.cols());
-    eigen_assert(arg1.rows() == arg3.rows() && arg1.cols() == arg3.cols());
+
+    // The index types should match
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+
+    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() &&
+                 a1.rows() == a3.rows() && a1.cols() == a3.cols());
   }
 
   EIGEN_DEVICE_FUNC
@@ -187,46 +181,15 @@
   const TernaryOp m_functor;
 };
 
-template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
-class CwiseTernaryOpImpl<TernaryOp, Arg1, Arg2, Arg3, Dense>
-    : public internal::dense_xpr_base<
+// Generic API dispatcher
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl
+    : public internal::generic_xpr_base<
           CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type {
-  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> Derived;
-
  public:
-  typedef typename internal::dense_xpr_base<
+  typedef typename internal::generic_xpr_base<
       CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const {
-    return derived().functor()(derived().arg1().coeff(rowId, colId),
-                               derived().arg2().coeff(rowId, colId),
-                               derived().arg3().coeff(rowId, colId));
-  }
-
-  template <int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const {
-    return derived().functor().packetOp(
-        derived().arg1().template packet<LoadMode>(rowId, colId),
-        derived().arg2().template packet<LoadMode>(rowId, colId),
-        derived().arg3().template packet<LoadMode>(rowId, colId));
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const Scalar coeff(Index index) const {
-    return derived().functor()(derived().arg1().coeff(index),
-                               derived().arg2().coeff(index),
-                               derived().arg3().coeff(index));
-  }
-
-  template <int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index index) const {
-    return derived().functor().packetOp(
-        derived().arg1().template packet<LoadMode>(index),
-        derived().arg2().template packet<LoadMode>(index),
-        derived().arg3().template packet<LoadMode>(index));
-  }
 };
 
 }  // end namespace Eigen

diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index aa7df19..e68c4f7 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,15 +11,34 @@
 #ifndef EIGEN_CWISE_UNARY_OP_H
 #define EIGEN_CWISE_UNARY_OP_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename UnaryOp, typename XprType>
+struct traits<CwiseUnaryOp<UnaryOp, XprType> >
+ : traits<XprType>
+{
+  typedef typename result_of<
+                     UnaryOp(const typename XprType::Scalar&)
+                   >::type Scalar;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  enum {
+    Flags = _XprTypeNested::Flags & RowMajorBit
+  };
+};
+}
+
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl;
 
 /** \class CwiseUnaryOp
   * \ingroup Core_Module
   *
   * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
   *
-  * \param UnaryOp template functor implementing the operator
-  * \param XprType the type of the expression to which we are applying the unary operator
+  * \tparam UnaryOp template functor implementing the operator
+  * \tparam XprType the type of the expression to which we are applying the unary operator
   *
   * This class represents an expression where a unary operator is applied to an expression.
   * It is the return type of all operations taking exactly 1 input expression, regardless of the
@@ -32,102 +51,51 @@
   *
   * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
   */
-
-namespace internal {
 template<typename UnaryOp, typename XprType>
-struct traits<CwiseUnaryOp<UnaryOp, XprType> >
- : traits<XprType>
-{
-  typedef typename result_of<
-                     UnaryOp(typename XprType::Scalar)
-                   >::type Scalar;
-  typedef typename XprType::Nested XprTypeNested;
-  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
-  enum {
-    Flags = _XprTypeNested::Flags & (
-      HereditaryBits | LinearAccessBit | AlignedBit
-      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
-    CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
-  };
-};
-}
-
-template<typename UnaryOp, typename XprType, typename StorageKind>
-class CwiseUnaryOpImpl;
-
-template<typename UnaryOp, typename XprType>
-class CwiseUnaryOp : internal::no_assignment_operator,
-  public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
+class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
 {
   public:
 
     typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+    typedef typename internal::ref_selector<XprType>::type XprTypeNested;
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
 
-    EIGEN_DEVICE_FUNC
-    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
 
     /** \returns the functor representing the unary operation */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const UnaryOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename XprType::Nested>::type&
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type&
     nestedExpression() const { return m_xpr; }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    typename internal::remove_all<typename XprType::Nested>::type&
-    nestedExpression() { return m_xpr.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename internal::remove_all<XprTypeNested>::type&
+    nestedExpression() { return m_xpr; }
 
   protected:
-    typename XprType::Nested m_xpr;
+    XprTypeNested m_xpr;
     const UnaryOp m_functor;
 };
 
-// This is the generic implementation for dense storage.
-// It can be used for any expression types implementing the dense concept.
-template<typename UnaryOp, typename XprType>
-class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
-  : public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
+// Generic API dispatcher
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl
+  : public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
 {
-  public:
-
-    typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
-    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
-    }
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
 };
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index b2638d3..a06d762 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h

@@ -12,33 +12,19 @@
 
 namespace Eigen {
 
-/** \class CwiseUnaryView
-  * \ingroup Core_Module
-  *
-  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
-  *
-  * \param ViewOp template functor implementing the view
-  * \param MatrixType the type of the matrix we are applying the unary operator
-  *
-  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
-  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
-  */
-
 namespace internal {
 template<typename ViewOp, typename MatrixType>
 struct traits<CwiseUnaryView<ViewOp, MatrixType> >
  : traits<MatrixType>
 {
   typedef typename result_of<
-                     ViewOp(typename traits<MatrixType>::Scalar)
+                     ViewOp(const typename traits<MatrixType>::Scalar&)
                    >::type Scalar;
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
-    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
-    CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
     // need to cast the sizeof's from size_t to int explicitly, otherwise:
     // "error: no integral type can represent all of the enumerator values
@@ -55,6 +41,19 @@
 template<typename ViewOp, typename MatrixType, typename StorageKind>
 class CwiseUnaryViewImpl;
 
+/** \class CwiseUnaryView
+  * \ingroup Core_Module
+  *
+  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
+  *
+  * \tparam ViewOp template functor implementing the view
+  * \tparam MatrixType the type of the matrix we are applying the unary operator
+  *
+  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
+  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
+  */
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
 {
@@ -62,32 +61,44 @@
 
     typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
+    explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
       : m_matrix(mat), m_functor(func) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \returns the functor representing unary operation */
-    const ViewOp& functor() const { return m_functor; }
+    EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }
 
   protected:
-    // FIXME changed from MatrixType::Nested because of a weird compilation error with sun CC
-    typename internal::nested<MatrixType>::type m_matrix;
+    MatrixTypeNested m_matrix;
     ViewOp m_functor;
 };
 
+// Generic API dispatcher
+template<typename ViewOp, typename XprType, typename StorageKind>
+class CwiseUnaryViewImpl
+  : public internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type Base;
+};
+
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
   : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
@@ -99,39 +110,21 @@
 
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
-    
-    inline Scalar* data() { return &coeffRef(0); }
-    inline const Scalar* data() const { return &coeff(0); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
     {
       return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
 
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(row, col));
-    }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
-    }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
 };
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 55cec0b..9b16db6 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h

@@ -14,15 +14,15 @@
 namespace Eigen {
 
 namespace internal {
-  
+
 // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
 // This dummy function simply aims at checking that at compile time.
 static inline void check_DenseIndex_is_signed() {
-  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); 
+  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
 }
 
 } // end namespace internal
-  
+
 /** \class DenseBase
   * \ingroup Core_Module
   *
@@ -34,37 +34,45 @@
   * \tparam Derived is the derived type, e.g., a matrix type or an expression.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                     typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
+  : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
 #else
-  : public DenseCoeffsBase<Derived>
+  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
   public:
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
 
-    class InnerIterator;
+    /** Inner iterator type to iterate over the coefficients of a row or column.
+      * \sa class InnerIterator
+      */
+    typedef Eigen::InnerIterator<Derived> InnerIterator;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
 
-    /** \brief The type of indices 
-      * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-      * \sa \ref TopicPreprocessorDirectives.
-      */
-    typedef typename internal::traits<Derived>::Index Index; 
+    /**
+      * \brief The type used to store indices
+      * \details This typedef is relevant for types that store multiple indices such as
+      *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
+      * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
+     */
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc. */
     typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    typedef DenseCoeffsBase<Derived> Base;
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
+
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -74,16 +82,6 @@
     using Base::colIndexByOuterInner;
     using Base::coeff;
     using Base::coeffByOuterInner;
-    using Base::packet;
-    using Base::packetByOuterInner;
-    using Base::writePacket;
-    using Base::writePacketByOuterInner;
-    using Base::coeffRef;
-    using Base::coeffRefByOuterInner;
-    using Base::copyCoeff;
-    using Base::copyCoeffByOuterInner;
-    using Base::copyPacket;
-    using Base::copyPacketByOuterInner;
     using Base::operator();
     using Base::operator[];
     using Base::x;
@@ -152,13 +150,18 @@
           * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
           */
 
-      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+                           || internal::traits<Derived>::ColsAtCompileTime == 1,
         /**< This is set to true if either the number of rows or the number of
           * columns is known at compile-time to be equal to 1. Indeed, in that case,
           * we are dealing with a column-vector (if there is only one column) or with
           * a row-vector (if there is only one row). */
 
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+         * and 2 for matrices.
+         */
+
       Flags = internal::traits<Derived>::Flags,
         /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
           * constructed from this one. See the \ref flags "list of flags".
@@ -169,32 +172,54 @@
       InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
                              : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
       InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
       OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
     };
 
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+    typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;
+
+    enum { IsPlainObjectBase = 0 };
+
+    /** The plain matrix type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainMatrix;
+
+    /** The plain array type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainArray;
+
+    /** \brief The plain matrix or array type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef typename internal::conditional<internal::is_same<typename internal::traits<Derived>::XprKind,MatrixXpr >::value,
+                                 PlainMatrix, PlainArray>::type PlainObject;
 
     /** \returns the number of nonzero coefficients which is in practice the number
       * of stored coefficients. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index nonZeros() const { return size(); }
-    /** \returns true if either the number of rows or the number of columns is equal to 1.
-      * In other words, this function returns
-      * \code rows()==1 || cols()==1 \endcode
-      * \sa rows(), cols(), IsVectorAtCompileTime. */
 
     /** \returns the outer size.
       *
       * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
       * column-major matrix, and the number of rows for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     Index outerSize() const
     {
       return IsVectorAtCompileTime ? 1
@@ -204,9 +229,9 @@
     /** \returns the inner size.
       *
       * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
-      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
       * column-major matrix, and the number of columns for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     Index innerSize() const
     {
       return IsVectorAtCompileTime ? this->size()
@@ -229,22 +254,21 @@
       * nothing else.
       */
     EIGEN_DEVICE_FUNC
-    void resize(Index nbRows, Index nbCols)
+    void resize(Index rows, Index cols)
     {
-      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
-      EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
-      eigen_assert(nbRows == this->rows() && nbCols == this->cols()
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
                 && "DenseBase::resize() does not actually allow to resize.");
     }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
-    /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
+    /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
+    EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
@@ -252,13 +276,13 @@
 
     /** Copies \a other into *this. \returns a reference to *this. */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const DenseBase<OtherDerived>& other);
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const DenseBase& other);
 
     template<typename OtherDerived>
@@ -277,37 +301,34 @@
     EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Copies \a other into *this without evaluating other. \returns a reference to *this. */
+    /** \internal
+      * Copies \a other into *this without evaluating other. \returns a reference to *this. */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
     Derived& lazyAssign(const DenseBase<OtherDerived>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
 
     EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const Scalar& s);
 
     template<unsigned int Added,unsigned int Removed>
-    const Flagged<Derived, Added, Removed> flagged() const;
+    /** \deprecated it now returns \c *this */
+    EIGEN_DEPRECATED
+    const Derived& flagged() const
+    { return derived(); }
 
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
 
+    typedef Transpose<Derived> TransposeReturnType;
     EIGEN_DEVICE_FUNC
-    Eigen::Transpose<Derived> transpose();
+    TransposeReturnType transpose();
     typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
     EIGEN_DEVICE_FUNC
     ConstTransposeReturnType transpose() const;
     EIGEN_DEVICE_FUNC
     void transposeInPlace();
-#ifndef EIGEN_NO_DEBUG
-  protected:
-    template<typename OtherDerived>
-    void checkTransposeAliasing(const OtherDerived& other) const;
-  public:
-#endif
-
 
     EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(Index rows, Index cols, const Scalar& value);
@@ -316,23 +337,24 @@
     EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(const Scalar& value);
 
-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
+
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
-    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(const Scalar& low, const Scalar& high);
 
     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index size, const CustomNullaryOp& func);
     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(const CustomNullaryOp& func);
 
     EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
@@ -353,7 +375,7 @@
     template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isApprox(const DenseBase<OtherDerived>& other,
                   const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     bool isMuchSmallerThan(const RealScalar& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     template<typename OtherDerived> EIGEN_DEVICE_FUNC
@@ -364,20 +386,22 @@
     EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    
+
     inline bool hasNaN() const;
     inline bool allFinite() const;
 
-    EIGEN_DEVICE_FUNC
-    inline Derived& operator*=(const Scalar& other);
-    EIGEN_DEVICE_FUNC
-    inline Derived& operator/=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const Scalar& other);
 
     typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
     /** \returns the matrix or vector obtained by evaluating this expression.
       *
       * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
       * a const reference, in order to avoid a useless copy.
+      *
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE EvalReturnType eval() const
@@ -392,24 +416,25 @@
       *
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void swap(const DenseBase<OtherDerived>& other,
-              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(const DenseBase<OtherDerived>& other)
     {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
     /** swaps *this with the matrix or array \a other.
       *
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void swap(PlainObjectBase<OtherDerived>& other)
     {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
     }
 
-
     EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
     EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
     EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
@@ -424,28 +449,77 @@
 
     EIGEN_DEVICE_FUNC Scalar prod() const;
 
+    template<int NaNPropagation>
     EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+    template<int NaNPropagation>
     EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
 
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+
+    // By default, the fastest version with undefined NaN propagation semantics is
+    // used.
+    // TODO(rmlarsen): Replace with default template argument when we move to
+    // c++11 or beyond.
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
+      return minCoeff<PropagateFast>();
+    }
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
+      return maxCoeff<PropagateFast>();
+    }
+
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
 
+    // TODO(rmlarsen): Replace these methods with a default template argument.
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
+      return minCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
+      return maxCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+     EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
+      return minCoeff<PropagateFast>(index);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
+      return maxCoeff<PropagateFast>(index);
+    }
+  
     template<typename BinaryOp>
     EIGEN_DEVICE_FUNC
-    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
-    redux(const BinaryOp& func) const;
+    Scalar redux(const BinaryOp& func) const;
 
     template<typename Visitor>
     EIGEN_DEVICE_FUNC
     void visit(Visitor& func) const;
 
-    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+    /** \returns a WithFormat proxy object allowing to print a matrix the with given
+      * format \a fmt.
+      *
+      * See class IOFormat for some examples.
+      *
+      * \sa class IOFormat, class WithFormat
+      */
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const
+    {
+      return WithFormat<Derived>(derived(), fmt);
+    }
 
     /** \returns the unique coefficient of a 1x1 expression */
     EIGEN_DEVICE_FUNC
@@ -456,77 +530,142 @@
       return derived().coeff(0,0);
     }
 
-    bool all() const;
-    bool any() const;
-    Index count() const;
+    EIGEN_DEVICE_FUNC bool all() const;
+    EIGEN_DEVICE_FUNC bool any() const;
+    EIGEN_DEVICE_FUNC Index count() const;
 
     typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
     typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
     typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
     typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
 
-    ConstRowwiseReturnType rowwise() const;
-    RowwiseReturnType rowwise();
-    ConstColwiseReturnType colwise() const;
-    ColwiseReturnType colwise();
+    /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
+    *
+    * Example: \include MatrixBase_rowwise.cpp
+    * Output: \verbinclude MatrixBase_rowwise.out
+    *
+    * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const {
+      return ConstRowwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
 
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
+    /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions
+    *
+    * Example: \include MatrixBase_colwise.cpp
+    * Output: \verbinclude MatrixBase_colwise.out
+    *
+    * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const {
+      return ConstColwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC ColwiseReturnType colwise();
+
+    typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>,PlainObject> RandomReturnType;
+    static const RandomReturnType Random(Index rows, Index cols);
+    static const RandomReturnType Random(Index size);
+    static const RandomReturnType Random();
 
     template<typename ThenDerived,typename ElseDerived>
-    const Select<Derived,ThenDerived,ElseDerived>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
     select(const DenseBase<ThenDerived>& thenMatrix,
            const DenseBase<ElseDerived>& elseMatrix) const;
 
     template<typename ThenDerived>
-    inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
     select(const DenseBase<ThenDerived>& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const;
 
     template<typename ElseDerived>
-    inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+    inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
     select(const typename ElseDerived::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
 
     template<int p> RealScalar lpNorm() const;
 
     template<int RowFactor, int ColFactor>
+    EIGEN_DEVICE_FUNC
     const Replicate<Derived,RowFactor,ColFactor> replicate() const;
-    const Replicate<Derived,Dynamic,Dynamic> replicate(Index rowFacor,Index colFactor) const;
+    /**
+    * \return an expression of the replication of \c *this
+    *
+    * Example: \include MatrixBase_replicate_int_int.cpp
+    * Output: \verbinclude MatrixBase_replicate_int_int.out
+    *
+    * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived, Dynamic, Dynamic> replicate(Index rowFactor, Index colFactor) const
+    {
+      return Replicate<Derived, Dynamic, Dynamic>(derived(), rowFactor, colFactor);
+    }
 
     typedef Reverse<Derived, BothDirections> ReverseReturnType;
     typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
-    ReverseReturnType reverse();
-    ConstReverseReturnType reverse() const;
-    void reverseInPlace();
+    EIGEN_DEVICE_FUNC ReverseReturnType reverse();
+    /** This is the const version of reverse(). */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const
+    {
+      return ConstReverseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC void reverseInPlace();
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type as returned by the begin() and end() methods.
+      */
+    typedef random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    typedef random_access_iterator_type const_iterator;
+    #else
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<Derived>,
+                                            internal::generic_randaccess_stl_iterator<Derived>
+                                          >::type iterator_type;
+
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<const Derived>,
+                                            internal::generic_randaccess_stl_iterator<const Derived>
+                                          >::type const_iterator_type;
+
+    // Stl-style iterators are supported only for vectors.
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            iterator_type,
+                                            void
+                                          >::type iterator;
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            const_iterator_type,
+                                            void
+                                          >::type const_iterator;
+    #endif
+
+    inline iterator begin();
+    inline const_iterator begin() const;
+    inline const_iterator cbegin() const;
+    inline iterator end();
+    inline const_iterator end() const;
+    inline const_iterator cend() const;
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/BlockMethods.h"
+#   include "../plugins/IndexedViewMethods.h"
+#   include "../plugins/ReshapedMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
-// Because of an intra-Google include scanner limitation,
-// third_party/stan cannot define the EIGEN_DENSEBASE_PLUGIN
-// macro
-// as "stan/math/matrix/EigenDenseBaseAddons.hpp".  According to
-// ambrose@google.com, this is a known limitation: the include
-// scanner doesn't maintain any preprocessor state about macros,
-// previously visited files, etc.  See also //base/stacktrace.cc.
-#   ifdef STAN_MATH_MATRIX_EIGEN_DENSEBASE_PLUGIN
-#     include "stan/math/matrix/EigenDenseBaseAddons.hpp"
-#   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
-#ifdef EIGEN2_SUPPORT
-
-    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
-    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
-    template<int CRows, int CCols>
-    Block<Derived, CRows, CCols> corner(CornerType type);
-    template<int CRows, int CCols>
-    const Block<Derived, CRows, CCols> corner(CornerType type) const;
-
-#endif // EIGEN2_SUPPORT
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+#undef EIGEN_DOC_UNARY_ADDONS
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
@@ -537,11 +676,12 @@
     }
 
   protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
     /** Default constructor. Do nothing. */
     EIGEN_DEVICE_FUNC DenseBase()
     {
       /* Just checks for self-consistency of the flags.
-       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
+       * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
        */
 #ifdef EIGEN_INTERNAL_DEBUGGING
       EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))

diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index efabb5e..37fcdb5 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h

@@ -22,11 +22,12 @@
 /** \brief Base class providing read-only coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #ReadOnlyAccessors Constant indicating read-only access
+  *
+  * \note #ReadOnlyAccessors Constant indicating read-only access
   *
   * This class defines the \c operator() \c const function and friends, which can be used to read specific
   * entries of a matrix or array.
-  * 
+  *
   * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
   *     \ref TopicClassHierarchy
   */
@@ -35,7 +36,6 @@
 {
   public:
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
 
@@ -97,8 +97,8 @@
     EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
     {
       eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeff(row, col);
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
     }
 
     EIGEN_DEVICE_FUNC
@@ -117,7 +117,7 @@
     {
       eigen_assert(row >= 0 && row < rows()
           && col >= 0 && col < cols());
-      return derived().coeff(row, col);
+      return coeff(row, col);
     }
 
     /** Short version: don't use this function, use
@@ -139,8 +139,10 @@
     EIGEN_STRONG_INLINE CoeffReturnType
     coeff(Index index) const
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return internal::evaluator<Derived>(derived()).coeff(index);
     }
 
 
@@ -156,12 +158,10 @@
     EIGEN_STRONG_INLINE CoeffReturnType
     operator[](Index index) const
     {
-      #ifndef EIGEN2_SUPPORT
       EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                           THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
       eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return coeff(index);
     }
 
     /** \returns the coefficient at given index.
@@ -179,7 +179,7 @@
     operator()(Index index) const
     {
       eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return coeff(index);
     }
 
     /** equivalent to operator[](0).  */
@@ -192,19 +192,31 @@
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
-    y() const { return (*this)[1]; }
+    y() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }
 
     /** equivalent to operator[](2).  */
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
-    z() const { return (*this)[2]; }
+    z() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }
 
     /** equivalent to operator[](3).  */
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
-    w() const { return (*this)[3]; }
+    w() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
+    }
 
     /** \internal
       * \returns the packet of coefficients starting at the given row and column. It is your responsibility
@@ -219,9 +231,9 @@
     template<int LoadMode>
     EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
     {
-      eigen_internal_assert(row >= 0 && row < rows()
-                      && col >= 0 && col < cols());
-      return derived().template packet<LoadMode>(row,col);
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+      eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(row,col);
     }
 
 
@@ -246,8 +258,11 @@
     template<int LoadMode>
     EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().template packet<LoadMode>(index);
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(index);
     }
 
   protected:
@@ -274,12 +289,13 @@
 /** \brief Base class providing read/write coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #WriteAccessors Constant indicating read/write access
+  *
+  * \note #WriteAccessors Constant indicating read/write access
   *
   * This class defines the non-const \c operator() function and friends, which can be used to write specific
   * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
   * defines the const variant for reading specific entries.
-  * 
+  *
   * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
   */
 template<typename Derived>
@@ -290,7 +306,6 @@
     typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -327,8 +342,8 @@
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
     {
       eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeffRef(row,col);
     }
 
     EIGEN_DEVICE_FUNC
@@ -350,7 +365,7 @@
     {
       eigen_assert(row >= 0 && row < rows()
           && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
+      return coeffRef(row, col);
     }
 
 
@@ -373,8 +388,10 @@
     EIGEN_STRONG_INLINE Scalar&
     coeffRef(Index index)
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return internal::evaluator<Derived>(derived()).coeffRef(index);
     }
 
     /** \returns a reference to the coefficient at given index.
@@ -388,12 +405,10 @@
     EIGEN_STRONG_INLINE Scalar&
     operator[](Index index)
     {
-      #ifndef EIGEN2_SUPPORT
       EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                           THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
       eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return coeffRef(index);
     }
 
     /** \returns a reference to the coefficient at given index.
@@ -410,7 +425,7 @@
     operator()(Index index)
     {
       eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return coeffRef(index);
     }
 
     /** equivalent to operator[](0).  */
@@ -423,169 +438,44 @@
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
-    y() { return (*this)[1]; }
+    y()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }
 
     /** equivalent to operator[](2).  */
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
-    z() { return (*this)[2]; }
+    z()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }
 
     /** equivalent to operator[](3).  */
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
-    w() { return (*this)[3]; }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
+    w()
     {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row,col,val);
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
     }
-
-
-    /** \internal */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacketByOuterInner
-    (Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
-                            colIndexByOuterInner(outer, inner),
-                            val);
-    }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit and the LinearAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index index, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,val);
-    }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-    /** \internal Copies the coefficient at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().coeffRef(row, col) = other.derived().coeff(row, col);
-    }
-
-    /** \internal Copies the coefficient at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().coeffRef(index) = other.derived().coeff(index);
-    }
-
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().copyCoeff(row, col, other);
-    }
-
-    /** \internal Copies the packet at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row, col,
-        other.derived().template packet<LoadMode>(row, col));
-    }
-
-    /** \internal Copies the packet at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,
-        other.derived().template packet<LoadMode>(index));
-    }
-
-    /** \internal */
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
-    }
-#endif
-
 };
 
 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #DirectAccessors Constant indicating direct access
+  *
+  * \note #DirectAccessors Constant indicating direct access
   *
   * This class defines functions to work with strides which can be used to access entries directly. This class
   * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
   * \c operator() .
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived>
 class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
@@ -593,7 +483,6 @@
   public:
 
     typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -606,7 +495,7 @@
       *
       * \sa outerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -617,14 +506,14 @@
       *
       * \sa innerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
       return derived().outerStride();
     }
 
     // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const
     {
       return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
     }
@@ -633,7 +522,7 @@
       *
       * \sa innerStride(), outerStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -643,7 +532,7 @@
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -653,13 +542,14 @@
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #DirectWriteAccessors Constant indicating direct access
+  *
+  * \note #DirectWriteAccessors Constant indicating direct access
   *
   * This class defines functions to work with strides which can be used to access entries directly. This class
   * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
   * \c operator().
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived>
 class DenseCoeffsBase<Derived, DirectWriteAccessors>
@@ -668,7 +558,6 @@
   public:
 
     typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -681,8 +570,8 @@
       *
       * \sa outerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT
     {
       return derived().innerStride();
     }
@@ -692,14 +581,14 @@
       *
       * \sa innerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT
     {
       return derived().outerStride();
     }
 
     // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT
     {
       return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
     }
@@ -708,8 +597,8 @@
       *
       * \sa innerStride(), outerStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index rowStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rowStride() const EIGEN_NOEXCEPT
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
     }
@@ -718,8 +607,8 @@
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index colStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index colStride() const EIGEN_NOEXCEPT
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
     }
@@ -727,33 +616,42 @@
 
 namespace internal {
 
-template<typename Derived, bool JustReturnZero>
+template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline typename Derived::Index run(const Derived&)
+  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT
   { return 0; }
 };
 
-template<typename Derived>
-struct first_aligned_impl<Derived, false>
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
 {
-  static inline typename Derived::Index run(const Derived& m)
+  static inline Index run(const Derived& m)
   {
-    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+    return internal::first_aligned<Alignment>(m.data(), m.size());
   }
 };
 
-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+  *
+  * \tparam Alignment requested alignment in Bytes.
   *
   * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
   * documentation.
   */
-template<typename Derived>
-static inline typename Derived::Index first_aligned(const Derived& m)
+template<int Alignment, typename Derived>
+static inline Index first_aligned(const DenseBase<Derived>& m)
 {
-  return first_aligned_impl
-          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-          ::run(m);
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
+template<typename Derived>
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return internal::first_aligned<int(unpacket_traits<DefaultPacketType>::alignment),Derived>(m);
 }
 
 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>

diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 59f5154..08ef6c5 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h

@@ -13,9 +13,9 @@
 #define EIGEN_MATRIXSTORAGE_H
 
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
 #else
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
 #endif
 
 namespace Eigen {
@@ -40,61 +40,117 @@
   */
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
-                        : 0 >
+                        : compute_default_alignment<T,Size>::value >
 struct plain_array
 {
   T array[Size];
 
   EIGEN_DEVICE_FUNC
   plain_array()
-  { 
+  {
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
   plain_array(constructor_without_unaligned_array_assert)
-  { 
+  {
     check_static_allocation_size<T,Size>();
   }
 };
 
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
-#elif EIGEN_GNUC_AT_LEAST(4,7) 
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+#elif EIGEN_GNUC_AT_LEAST(4,7)
+  // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
   // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
   // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
   template<typename PtrType>
   EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #else
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #endif
 
 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[Size];
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
-  { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -102,11 +158,35 @@
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[1];
+  T array[1];
   EIGEN_DEVICE_FUNC plain_array() {}
   EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
 
+struct plain_array_helper {
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void copy(const plain_array<T, Size, MatrixOrArrayOptions, Alignment>& src, const Eigen::Index size,
+                         plain_array<T, Size, MatrixOrArrayOptions, Alignment>& dst) {
+    smart_copy(src.array, src.array + size, dst.array);
+  }
+  
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void swap(plain_array<T, Size, MatrixOrArrayOptions, Alignment>& a, const Eigen::Index a_size,
+                   plain_array<T, Size, MatrixOrArrayOptions, Alignment>& b, const Eigen::Index b_size) {
+    if (a_size < b_size) {
+      std::swap_ranges(b.array, b.array + a_size, a.array);
+      smart_move(b.array + a_size, b.array + b_size, a.array + a_size);
+    } else if (a_size > b_size) {
+      std::swap_ranges(a.array, a.array + b_size, b.array);
+      smart_move(a.array + b_size, a.array + a_size, b.array + b_size);
+    } else {
+      std::swap_ranges(a.array, a.array + a_size, b.array);
+    }
+  }
+};
+
 } // end namespace internal
 
 /** \internal
@@ -128,24 +208,61 @@
 {
     internal::plain_array<T,Size,_Options> m_data;
   public:
-    EIGEN_DEVICE_FUNC DenseStorage() {}
-    EIGEN_DEVICE_FUNC
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    EIGEN_DEVICE_FUNC 
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
-    EIGEN_DEVICE_FUNC 
-    DenseStorage& operator=(const DenseStorage& other)
-    { 
-      if (this != &other) m_data = other.m_data;
-      return *this; 
+    EIGEN_DEVICE_FUNC DenseStorage() {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
     }
-    EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    EIGEN_DEVICE_FUNC void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()) {}
+#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
+    EIGEN_DEVICE_FUNC
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default;
+#endif
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other) m_data = other.m_data;
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default;
+#endif
+#if EIGEN_HAS_RVALUE_REFERENCES
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+    {
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      if (this != &other)
+        m_data = std::move(other.m_data);
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default;
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default;
+#endif
+#endif
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
+      EIGEN_UNUSED_VARIABLE(size);
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data, other.m_data);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -155,15 +272,15 @@
 {
   public:
     EIGEN_DEVICE_FUNC DenseStorage() {}
-    EIGEN_DEVICE_FUNC DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
     EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
-    EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
-    EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    EIGEN_DEVICE_FUNC void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC const T *data() const { return 0; }
     EIGEN_DEVICE_FUNC T *data() { return 0; }
 };
@@ -182,30 +299,38 @@
 template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
+    Index m_rows;
+    Index m_cols;
   public:
     EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other) 
-    { 
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_rows = other.m_rows;
         m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
       }
-      return *this; 
+      return *this;
     }
-    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
-    void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC DenseIndex rows() const {return m_rows;}
-    EIGEN_DEVICE_FUNC DenseIndex cols() const {return m_cols;}
-    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -214,27 +339,36 @@
 template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
+    Index m_rows;
   public:
     EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
-    DenseStorage& operator=(const DenseStorage& other) 
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
+    }
+    
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_rows = other.m_rows;
+        internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
       }
-      return *this; 
+      return *this;
     }
-    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return _Cols;}
-    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    { 
+      internal::plain_array_helper::swap(m_data, m_rows * _Cols, other.m_data, other.m_rows * _Cols);
+      numext::swap(m_rows, other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -243,27 +377,34 @@
 template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_cols;
+    Index m_cols;
   public:
     EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) 
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
       }
       return *this;
     }
-    DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return _Rows;}
-    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
-    void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      internal::plain_array_helper::swap(m_data, _Rows * m_cols, other.m_data, _Rows * other.m_cols);
+      numext::swap(m_cols, other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -272,23 +413,27 @@
 template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
 {
     T *m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
+    Index m_rows;
+    Index m_cols;
   public:
     EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
        : m_data(0), m_rows(0), m_cols(0) {}
-    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
       : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
       , m_rows(other.m_rows)
       , m_cols(other.m_cols)
     {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
       internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
     }
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
@@ -297,47 +442,54 @@
       }
       return *this;
     }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    DenseStorage(DenseStorage&& other)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
       : m_data(std::move(other.m_data))
       , m_rows(std::move(other.m_rows))
       , m_cols(std::move(other.m_cols))
     {
       other.m_data = nullptr;
+      other.m_rows = 0;
+      other.m_cols = 0;
     }
-    DenseStorage& operator=(DenseStorage&& other)
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
-    void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
-    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    void conservativeResize(Index size, Index rows, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
     }
-    void resize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols)
     {
       if(size != m_rows*m_cols)
       {
         internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
           m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
         else
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
     }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
     EIGEN_DEVICE_FUNC T *data() { return m_data; }
@@ -347,19 +499,24 @@
 template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
 {
     T *m_data;
-    DenseIndex m_cols;
+    Index m_cols;
   public:
     EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
+      EIGEN_UNUSED_VARIABLE(rows);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
       : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
       , m_cols(other.m_cols)
     {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
       internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
     }
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
@@ -367,43 +524,48 @@
         this->swap(tmp);
       }
       return *this;
-    }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    DenseStorage(DenseStorage&& other)
+    }
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
       : m_data(std::move(other.m_data))
       , m_cols(std::move(other.m_cols))
     {
       other.m_data = nullptr;
+      other.m_cols = 0;
     }
-    DenseStorage& operator=(DenseStorage&& other)
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
-    void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
-      m_cols = nbCols;
+      m_cols = cols;
     }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols)
     {
       if(size != _Rows*m_cols)
       {
         internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
           m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
         else
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
-      m_cols = nbCols;
+      m_cols = cols;
     }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
     EIGEN_DEVICE_FUNC T *data() { return m_data; }
@@ -413,19 +575,24 @@
 template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
 {
     T *m_data;
-    DenseIndex m_rows;
+    Index m_rows;
   public:
     EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
       : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
       , m_rows(other.m_rows)
     {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
       internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
     }
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
@@ -433,43 +600,48 @@
         this->swap(tmp);
       }
       return *this;
-    }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    DenseStorage(DenseStorage&& other)
+    }
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
       : m_data(std::move(other.m_data))
       , m_rows(std::move(other.m_rows))
     {
       other.m_data = nullptr;
+      other.m_rows = 0;
     }
-    DenseStorage& operator=(DenseStorage&& other)
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
       return *this;
     }
 #endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
-    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;}
+    void conservativeResize(Index size, Index rows, Index)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
-      m_rows = nbRows;
+      m_rows = rows;
     }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index)
     {
       if(size != m_rows*_Cols)
       {
         internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
-        if (size)
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
           m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
         else
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
-      m_rows = nbRows;
+      m_rows = rows;
     }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
     EIGEN_DEVICE_FUNC T *data() { return m_data; }

diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 7a554e9..3112d2c 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h

@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONAL_H
 #define EIGEN_DIAGONAL_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Diagonal
   * \ingroup Core_Module
@@ -21,7 +21,7 @@
   * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
   * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
   *              A positive value means a superdiagonal, a negative value means a subdiagonal.
-  *              You can also use Dynamic so the index can be set at runtime.
+  *              You can also use DynamicIndex so the index can be set at runtime.
   *
   * The matrix is not required to be square.
   *
@@ -37,7 +37,7 @@
 struct traits<Diagonal<MatrixType,DiagIndex> >
  : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   typedef typename MatrixType::StorageKind StorageKind;
   enum {
@@ -52,8 +52,7 @@
                                                  MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
     MaxColsAtCompileTime = 1,
     MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
     InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
     OuterStrideAtCompileTime = 0
@@ -71,31 +70,30 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
 
     EIGEN_DEVICE_FUNC
-    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
+    {
+      eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
+    }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
 
     EIGEN_DEVICE_FUNC
     inline Index rows() const
     {
-      return m_index.value()<0 ? numext::mini(Index(m_matrix.cols()),Index(m_matrix.rows()+m_index.value()))
-                               : numext::mini(Index(m_matrix.rows()),Index(m_matrix.cols()-m_index.value()));
+      return m_index.value()<0 ? numext::mini<Index>(m_matrix.cols(),m_matrix.rows()+m_index.value())
+                               : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return 1; }
 
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
-    {
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT {
       return m_matrix.outerStride() + 1;
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
-    {
-      return 0;
-    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -104,21 +102,21 @@
                      >::type ScalarWithConstIfNotLvalue;
 
     EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
     EIGEN_DEVICE_FUNC
-    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
 
     EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index)
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
     }
 
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index row, Index) const
     {
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
     }
 
     EIGEN_DEVICE_FUNC
@@ -131,13 +129,13 @@
     inline Scalar& coeffRef(Index idx)
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
     }
 
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index idx) const
     {
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
     }
 
     EIGEN_DEVICE_FUNC
@@ -147,31 +145,31 @@
     }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
     {
       return m_matrix;
     }
 
     EIGEN_DEVICE_FUNC
-    int index() const
+    inline Index index() const
     {
       return m_index.value();
     }
 
   protected:
-    typename MatrixType::Nested m_matrix;
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
     const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
 
   private:
     // some compilers may fail to optimize std::max etc in case of compile-time constants...
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
-    // triger a compile time error is someone try to call packet
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; }
+    // trigger a compile-time error if someone try to call packet
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
 };
@@ -185,17 +183,15 @@
   *
   * \sa class Diagonal */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
-  return derived();
+  return DiagonalReturnType(derived());
 }
 
 /** This is the const version of diagonal(). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
   return ConstDiagonalReturnType(derived());
@@ -213,20 +209,18 @@
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<DynamicIndex>::Type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
-  return typename DiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
+  return DiagonalDynamicIndexReturnType(derived(), index);
 }
 
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<DynamicIndex>::Type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
-  return typename ConstDiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
+  return ConstDiagonalDynamicIndexReturnType(derived(), index);
 }
 
 /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
@@ -241,22 +235,22 @@
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-template<int Index>
+template<int Index_>
 EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
+inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
-  return derived();
+  return typename DiagonalIndexReturnType<Index_>::Type(derived());
 }
 
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
-template<int Index>
+template<int Index_>
 EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
+inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
-  return derived();
+  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 573c372..542685c 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h

@@ -22,7 +22,7 @@
     typedef typename DiagonalVectorType::Scalar Scalar;
     typedef typename DiagonalVectorType::RealScalar RealScalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
     enum {
       RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
@@ -30,7 +30,7 @@
       MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
       MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
       IsVectorAtCompileTime = 0,
-      Flags = 0
+      Flags = NoPreferredStorageOrderBit
     };
 
     typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
@@ -44,17 +44,6 @@
 
     EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
-    template<typename DenseDerived>
-    EIGEN_DEVICE_FUNC
-    void evalTo(MatrixBase<DenseDerived> &other) const;
-    template<typename DenseDerived>
-    EIGEN_DEVICE_FUNC
-    void addTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() += diagonal(); }
-    template<typename DenseDerived>
-    EIGEN_DEVICE_FUNC
-    void subTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() -= diagonal(); }
 
     EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
@@ -66,60 +55,60 @@
     EIGEN_DEVICE_FUNC
     inline Index cols() const { return diagonal().size(); }
 
-    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
-      */
     template<typename MatrixDerived>
     EIGEN_DEVICE_FUNC
-    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
+    const Product<Derived,MatrixDerived,LazyProduct>
     operator*(const MatrixBase<MatrixDerived> &matrix) const
     {
-      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
+      return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
     }
 
+    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
     EIGEN_DEVICE_FUNC
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
+    inline const InverseReturnType
     inverse() const
     {
-      return diagonal().cwiseInverse();
+      return InverseReturnType(diagonal().cwiseInverse());
     }
     
     EIGEN_DEVICE_FUNC
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
     operator*(const Scalar& scalar) const
     {
-      return diagonal() * scalar;
+      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
     }
     EIGEN_DEVICE_FUNC
-    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
     operator*(const Scalar& scalar, const DiagonalBase& other)
     {
-      return other.diagonal() * scalar;
+      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
     }
-    
-    #ifdef EIGEN2_SUPPORT
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return diagonal().isApprox(other.diagonal(), precision);
-    }
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return toDenseMatrix().isApprox(other, precision);
-    }
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
     #endif
+    operator+(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() + other.diagonal()).asDiagonal();
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
+    #endif
+    operator-(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() - other.diagonal()).asDiagonal();
+    }
 };
 
-template<typename Derived>
-template<typename DenseDerived>
-EIGEN_DEVICE_FUNC
-void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
-{
-  other.setZero();
-  other.diagonal() = diagonal();
-}
 #endif
 
 /** \class DiagonalMatrix
@@ -141,10 +130,9 @@
  : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
   typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
-  typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef DiagonalShape StorageKind;
   enum {
-    Flags = LvalueBit
+    Flags = LvalueBit | NoPreferredStorageOrderBit
   };
 };
 }
@@ -158,7 +146,7 @@
     typedef const DiagonalMatrix& Nested;
     typedef _Scalar Scalar;
     typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
-    typedef typename internal::traits<DiagonalMatrix>::Index Index;
+    typedef typename internal::traits<DiagonalMatrix>::StorageIndex StorageIndex;
     #endif
 
   protected:
@@ -180,7 +168,7 @@
 
     /** Constructs a diagonal matrix with given dimension  */
     EIGEN_DEVICE_FUNC
-    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+    explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
 
     /** 2D constructor. */
     EIGEN_DEVICE_FUNC
@@ -190,6 +178,30 @@
     EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
 
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
+      * 
+      * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
+      * 
+      * \warning To construct a diagonal matrix of fixed size, the number of values passed to this 
+      * constructor must match the fixed dimension of \c *this.
+      * 
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&)
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
+      : m_diagonal(a0, a1, a2, args...) {}
+
+    /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_diagonal(list) {}
+    #endif  // EIGEN_HAS_CXX11
+
     /** Copy constructor. */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
@@ -264,14 +276,15 @@
 {
   typedef _DiagonalVectorType DiagonalVectorType;
   typedef typename DiagonalVectorType::Scalar Scalar;
-  typedef typename DiagonalVectorType::Index Index;
-  typedef typename DiagonalVectorType::StorageKind StorageKind;
+  typedef typename DiagonalVectorType::StorageIndex StorageIndex;
+  typedef DiagonalShape StorageKind;
+  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
     ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    Flags =  traits<DiagonalVectorType>::Flags & LvalueBit
+    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    Flags =  (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
   };
 };
 }
@@ -288,7 +301,7 @@
 
     /** Constructor from expression of diagonal coefficients to wrap. */
     EIGEN_DEVICE_FUNC
-    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+    explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
 
     /** \returns a const reference to the wrapped expression of diagonal coefficients. */
     EIGEN_DEVICE_FUNC
@@ -308,11 +321,10 @@
   * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
   **/
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
-  return derived();
+  return DiagonalWrapper<const Derived>(derived());
 }
 
 /** \returns true if *this is approximately equal to a diagonal matrix,
@@ -326,12 +338,11 @@
 template<typename Derived>
 bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
 {
-  using std::abs;
   if(cols() != rows()) return false;
   RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
   for(Index j = 0; j < cols(); ++j)
   {
-    RealScalar absOnDiagonal = abs(coeff(j,j));
+    RealScalar absOnDiagonal = numext::abs(coeff(j,j));
     if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
   }
   for(Index j = 0; j < cols(); ++j)
@@ -343,6 +354,38 @@
   return true;
 }
 
+namespace internal {
+
+template<> struct storage_kind_to_shape<DiagonalShape> { typedef DiagonalShape Shape; };
+
+struct Diagonal2Dense {};
+
+template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
+
+// Diagonal matrix to Dense assignment
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    dst.setZero();
+    dst.diagonal() = src.diagonal();
+  }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
+};
+
+} // namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_DIAGONALMATRIX_H

diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index a0510a7..7911d1c 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h

@@ -13,117 +13,14 @@
 
 namespace Eigen { 
 
-namespace internal {
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
- : traits<MatrixType>
-{
-  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
-    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
-                          ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
-    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
-    // FIXME currently we need same types, but in the future the next rule should be the one
-    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
-
-    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit,//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
-    CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
-  };
-};
-}
-
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-class DiagonalProduct : internal::no_assignment_operator,
-                        public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
-{
-  public:
-
-    typedef MatrixBase<DiagonalProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
-
-    inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
-      : m_matrix(matrix), m_diagonal(diagonal)
-    {
-      eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
-    }
-    
-    EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
-    {
-      enum {
-        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
-      };
-      const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
-      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
-        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
-       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
-    }
-    
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-  protected:
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
-    {
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
-    {
-      enum {
-        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
-      };
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
-    }
-
-    typename MatrixType::Nested m_matrix;
-    typename DiagonalType::Nested m_diagonal;
-};
-
 /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
   */
 template<typename Derived>
 template<typename DiagonalDerived>
-EIGEN_DEVICE_FUNC
-inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
-  return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
+  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index bc1e902..5c3441b 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_DOT_H
 #define EIGEN_DOT_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
 
@@ -28,28 +28,33 @@
 >
 struct dot_nocheck
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
   EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.template binaryExpr<conj_prod>(b).sum();
   }
 };
 
 template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
   EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.transpose().template binaryExpr<conj_prod>(b).sum();
   }
 };
 
 } // end namespace internal
 
-/** \returns the dot product of *this with other.
+/** \fn MatrixBase::dot
+  * \returns the dot product of *this with other.
   *
   * \only_for_vectors
   *
@@ -62,59 +67,33 @@
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+EIGEN_STRONG_INLINE
+typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
   EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
   typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
   EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
-
+#endif
+  
   eigen_assert(size() == other.size());
 
   return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
 }
 
-#ifdef EIGEN2_SUPPORT
-/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
-  * (conjugating the second variable). Of course this only makes a difference in the complex case.
-  *
-  * This method is only available in EIGEN2_SUPPORT mode.
-  *
-  * \only_for_vectors
-  *
-  * \sa dot()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-  eigen_assert(size() == other.size());
-
-  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
-}
-#endif
-
-
 //---------- implementation of L2 norm and related functions ----------
 
-/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
   * In both cases, it consists in the sum of the square of all the matrix entries.
   * For vectors, this is also equals to the dot product of \c *this with itself.
   *
-  * \sa dot(), norm()
+  * \sa dot(), norm(), lpNorm()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
   return numext::real((*this).cwiseAbs2().sum());
 }
@@ -123,11 +102,10 @@
   * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
   * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
   *
-  * \sa dot(), squaredNorm()
+  * \sa lpNorm(), dot(), squaredNorm()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
   return numext::sqrt(squaredNorm());
 }
@@ -142,12 +120,10 @@
   * \sa norm(), normalize()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
-  typedef typename internal::nested<Derived>::type Nested;
-  typedef typename internal::remove_reference<Nested>::type _Nested;
+  typedef typename internal::nested_eval<Derived,2>::type _Nested;
   _Nested n(derived());
   RealScalar z = n.squaredNorm();
   // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -166,8 +142,7 @@
   * \sa norm(), normalized()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
   RealScalar z = squaredNorm();
   // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -175,6 +150,52 @@
     derived() /= numext::sqrt(z);
 }
 
+/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
+  *
+  * \only_for_vectors
+  *
+  * This method is analogue to the normalized() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
+  *
+  * \sa stableNorm(), stableNormalize(), normalized()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::stableNormalized() const
+{
+  typedef typename internal::nested_eval<Derived,3>::type _Nested;
+  _Nested n(derived());
+  RealScalar w = n.cwiseAbs().maxCoeff();
+  RealScalar z = (n/w).squaredNorm();
+  if(z>RealScalar(0))
+    return n / (numext::sqrt(z)*w);
+  else
+    return n;
+}
+
+/** Normalizes the vector while avoid underflow and overflow
+  *
+  * \only_for_vectors
+  *
+  * This method is analogue to the normalize() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
+  * \sa stableNorm(), stableNormalized(), normalize()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
+{
+  RealScalar w = cwiseAbs().maxCoeff();
+  RealScalar z = (derived()/w).squaredNorm();
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z)*w;
+}
+
 //---------- implementation of other norms ----------
 
 namespace internal {
@@ -186,7 +207,7 @@
   EIGEN_DEVICE_FUNC
   static inline RealScalar run(const MatrixBase<Derived>& m)
   {
-    using std::pow;
+    EIGEN_USING_STD(pow)
     return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
   }
 };
@@ -214,25 +235,35 @@
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  static inline RealScalar run(const MatrixBase<Derived>& m)
   {
+    if(Derived::SizeAtCompileTime==0 || (Derived::SizeAtCompileTime==Dynamic && m.size()==0))
+      return RealScalar(0);
     return m.cwiseAbs().maxCoeff();
   }
 };
 
 } // end namespace internal
 
-/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
-  *          of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
-  *          norm, that is the maximum of the absolute values of the coefficients of *this.
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
+  *
+  * In all cases, if \c *this is empty, then the value 0 is returned.
+  *
+  * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
   *
   * \sa norm()
   */
 template<typename Derived>
 template<int p>
-EIGEN_DEVICE_FUNC
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#else
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
+#endif
 MatrixBase<Derived>::lpNorm() const
 {
   return internal::lpNorm_selector<Derived, p>::run(*this);
@@ -251,8 +282,8 @@
 bool MatrixBase<Derived>::isOrthogonal
 (const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
 {
-  typename internal::nested<Derived,2>::type nested(derived());
-  typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
+  typename internal::nested_eval<Derived,2>::type nested(derived());
+  typename internal::nested_eval<OtherDerived,2>::type otherNested(other.derived());
   return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
 }
 
@@ -270,13 +301,13 @@
 template<typename Derived>
 bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
 {
-  typename Derived::Nested nested(derived());
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index i = 0; i < cols(); ++i)
   {
-    if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
+    if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
       return false;
     for(Index j = 0; j < i; ++j)
-      if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
         return false;
   }
   return true;

diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index f9190ad..6b3c7d3 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h

@@ -13,7 +13,10 @@
 
 namespace Eigen {
 
-/** Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+/** \class EigenBase
+  * \ingroup Core_Module
+  *
+  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
   *
   * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
   *
@@ -21,14 +24,22 @@
   *
   * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> struct EigenBase
 {
 //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
 
+  /** \brief The interface type of indices
+    * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+    * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+    * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+    * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute.
+    */
+  typedef Eigen::Index Index;
+
+  // FIXME is it needed?
   typedef typename internal::traits<Derived>::StorageKind StorageKind;
-  typedef typename internal::traits<Derived>::Index Index;
 
   /** \returns a reference to the derived object */
   EIGEN_DEVICE_FUNC
@@ -45,15 +56,15 @@
   { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC
-  inline Index rows() const { return derived().rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC
-  inline Index cols() const { return derived().cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
     * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC
-  inline Index size() const { return rows() * cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
   template<typename Dest>
@@ -122,7 +133,7 @@
 EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().evalTo(derived());
+  call_assignment(derived(), other.derived());
   return derived();
 }
 
@@ -131,7 +142,7 @@
 EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().addTo(derived());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -140,7 +151,7 @@
 EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().subTo(derived());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 

diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index 807c7a2..817a43a 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h

@@ -39,29 +39,33 @@
     typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
 
-    inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -90,7 +94,7 @@
       m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType& m_expression;
@@ -127,7 +131,7 @@
 inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
 MatrixBase<Derived>::forceAlignedAccessIf() const
 {
-  return derived();
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 /** \returns an expression of *this with forced aligned access if \a Enable is true.
@@ -138,7 +142,7 @@
 inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
 MatrixBase<Derived>::forceAlignedAccessIf()
 {
-  return derived();
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h
index a676a45..43aa49b 100644
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h

@@ -11,7 +11,7 @@
 #ifndef EIGEN_FUZZY_H
 #define EIGEN_FUZZY_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal
 {
@@ -22,8 +22,8 @@
   EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
   {
-    typename internal::nested<Derived,2>::type nested(x);
-    typename internal::nested<OtherDerived,2>::type otherNested(y);
+    typename internal::nested_eval<Derived,2>::type nested(x);
+    typename internal::nested_eval<OtherDerived,2>::type otherNested(y);
     return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
   }
 };
@@ -100,8 +100,7 @@
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
@@ -123,8 +122,7 @@
   * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   const typename NumTraits<Scalar>::Real& other,
   const RealScalar& prec
 ) const
@@ -144,8 +142,7 @@
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const

diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index bfd3edc..6906aa7 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h

@@ -13,44 +13,38 @@
 
 namespace Eigen {
 
-/** \class GeneralProduct
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the product of two general matrices or vectors
-  *
-  * \param LhsNested the type used to store the left-hand side
-  * \param RhsNested the type used to store the right-hand side
-  * \param ProductMode the type of the product
-  *
-  * This class represents an expression of the product of two general matrices.
-  * We call a general matrix, a dense matrix with full storage. For instance,
-  * This excludes triangular, selfadjoint, and sparse matrices.
-  * It is the return type of the operator* between general matrices. Its template
-  * arguments are determined automatically by ProductReturnType. Therefore,
-  * GeneralProduct should never be used direclty. To determine the result type of a
-  * function which involves a matrix product, use ProductReturnType::Type.
-  *
-  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
-class GeneralProduct;
-
 enum {
   Large = 2,
   Small = 3
 };
 
+// Define the threshold value to fallback from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
 namespace internal {
 
 template<int Rows, int Cols, int Depth> struct product_type_selector;
 
 template<int Size, int MaxSize> struct product_size_category
 {
-  enum { is_large = MaxSize == Dynamic ||
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
-         value = is_large  ? Large
-               : Size == 1 ? 1
-                           : Small
+  enum {
+    #ifndef EIGEN_GPU_COMPILE_PHASE
+    is_large = MaxSize == Dynamic ||
+               Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+               (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+    #else
+    is_large = 0,
+    #endif
+    value = is_large  ? Large
+          : Size == 1 ? 1
+                      : Small
   };
 };
 
@@ -59,14 +53,14 @@
   typedef typename remove_all<Lhs>::type _Lhs;
   typedef typename remove_all<Rhs>::type _Rhs;
   enum {
-    MaxRows  = _Lhs::MaxRowsAtCompileTime,
-    Rows  = _Lhs::RowsAtCompileTime,
-    MaxCols  = _Rhs::MaxColsAtCompileTime,
-    Cols  = _Rhs::ColsAtCompileTime,
-    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
-                                           _Rhs::MaxRowsAtCompileTime),
-    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                        _Rhs::RowsAtCompileTime)
+    MaxRows = traits<_Lhs>::MaxRowsAtCompileTime,
+    Rows    = traits<_Lhs>::RowsAtCompileTime,
+    MaxCols = traits<_Rhs>::MaxColsAtCompileTime,
+    Cols    = traits<_Rhs>::ColsAtCompileTime,
+    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime,
+                                           traits<_Rhs>::MaxRowsAtCompileTime),
+    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime,
+                                        traits<_Rhs>::RowsAtCompileTime)
   };
 
   // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -81,7 +75,8 @@
 
 public:
   enum {
-    value = selector::ret
+    value = selector::ret,
+    ret = selector::ret
   };
 #ifdef EIGEN_DEBUG_PRODUCT
   static void debug()
@@ -97,12 +92,13 @@
 #endif
 };
 
-
 /* The following allows to select the kind of product at compile time
  * based on the three dimensions of the product.
  * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
 // FIXME I'm not sure the current mapping is the ideal one.
 template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
+template<int M>         struct product_type_selector<M, 1, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N>         struct product_type_selector<1, N, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
 template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
 template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
@@ -121,60 +117,12 @@
 template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
 template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = CoeffBasedProductMode }; };
 template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };
 
 } // end namespace internal
 
-/** \class ProductReturnType
-  * \ingroup Core_Module
-  *
-  * \brief Helper class to get the correct and optimized returned type of operator*
-  *
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  * \param ProductMode the type of the product (determined automatically by internal::product_mode)
-  *
-  * This class defines the typename Type representing the optimized product expression
-  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
-  * is the recommended way to define the result type of a function returning an expression
-  * which involve a matrix product. The class Product should never be
-  * used directly.
-  *
-  * \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType>
-struct ProductReturnType
-{
-  // TODO use the nested type to reduce instanciations ????
-//   typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-//   typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-
-  typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
-};
-
-// this is a workaround for sun CC
-template<typename Lhs, typename Rhs>
-struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{};
-
 /***********************************************************************
 *  Implementation of Inner Vector Vector Product
 ***********************************************************************/
@@ -186,119 +134,10 @@
 // product ends up to a row-vector times col-vector product... To tackle this use
 // case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
 
-namespace internal {
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
- : traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, InnerProduct>
-  : internal::no_assignment_operator,
-    public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
-{
-    typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
-  public:
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
-    }
-
-    /** Convertion to scalar */
-    operator const typename Base::Scalar() const {
-      return Base::coeff(0,0);
-    }
-};
-
 /***********************************************************************
 *  Implementation of Outer Vector Vector Product
 ***********************************************************************/
 
-namespace internal {
-
-// Column major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
-{
-  typedef typename Dest::Index Index;
-  // FIXME make sure lhs is sequentially stored
-  // FIXME not very good if rhs is real and lhs complex while alpha is real too
-  const Index cols = dest.cols();
-  for (Index j=0; j<cols; ++j)
-    func(dest.col(j), prod.rhs().coeff(j) * prod.lhs());
-}
-
-// Row major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
-  typedef typename Dest::Index Index;
-  // FIXME make sure rhs is sequentially stored
-  // FIXME not very good if lhs is real and rhs complex while alpha is real too
-  const Index rows = dest.rows();
-  for (Index i=0; i<rows; ++i)
-    func(dest.row(i), prod.lhs().coeff(i) * prod.rhs());
-}
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, OuterProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
-{
-    template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
-
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-
-    struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-    struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-    struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
-    struct adds {
-      Scalar m_scale;
-      adds(const Scalar& s) : m_scale(s) {}
-      template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
-        dst.const_cast_derived() += m_scale * src;
-      }
-    };
-
-    template<typename Dest>
-    inline void evalTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest>
-    inline void addTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest>
-    inline void subTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor<Dest>());
-    }
-};
-
 /***********************************************************************
 *  Implementation of General Matrix Vector Product
 ***********************************************************************/
@@ -312,199 +151,183 @@
  */
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
-{};
-
 template<int Side, int StorageOrder, bool BlasCompatible>
-struct gemv_selector;
+struct gemv_dense_selector;
 
 } // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemvProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    typedef typename Lhs::Scalar LhsScalar;
-    typedef typename Rhs::Scalar RhsScalar;
-
-    GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
-    {
-//       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
-//         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-
-    enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-    typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
-
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
-      internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                       bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
-    }
-};
-
 namespace internal {
 
-// The vector is on the left => transposition
-template<int StorageOrder, bool BlasCompatible>
-struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    Transpose<Dest> destT(dest);
-    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
-      ::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
-        (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
-  }
-};
-
 template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
 
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };
 
 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };
 
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_ALIGN_STATICALLY
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize      = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,true>
+// The vector is on the left => transposition
+template<int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector<OnTheLeft,StorageOrder,BlasCompatible>
 {
-  template<typename ProductType, typename Dest>
-  static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    Transpose<Dest> destT(dest);
+    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
+    gemv_dense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
+  }
+};
 
-    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
-    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    typedef typename Dest::RealScalar  RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
+    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
+
+    // make sure Dest is a compile-time vector type (bug 1166)
+    typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
       ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
     };
 
-    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
-    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
-
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
-                                                  evalToDest ? dest.data() : static_dest.data());
-
-    if(!evalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
-      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
-      {
-        MappedDest(actualDestPtr, dest.size()).setZero();
-        compatibleAlpha = RhsScalar(1);
-      }
-      else
-        MappedDest(actualDestPtr, dest.size()) = dest;
-    }
-
     typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
     typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
-    general_matrix_vector_product
-        <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
-        RhsMapper(actualRhs.data(), actualRhs.innerStride()),
-        actualDestPtr, 1,
-        compatibleAlpha);
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
-    if (!evalToDest)
+    if(!MightCannotUseDest)
     {
-      if(!alphaIsCompatible)
-        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
-      else
-        dest = MappedDest(actualDestPtr, dest.size());
+      // shortcut if we are sure to be able to use dest directly,
+      // this ease the compiler to generate cleaner and more optimzized code for most common cases
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          dest.data(), 1,
+          compatibleAlpha);
+    }
+    else
+    {
+      gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if(!evalToDest)
+      {
+        #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        #endif
+        if(!alphaIsCompatible)
+        {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        }
+        else
+          MappedDest(actualDestPtr, dest.size()) = dest;
+      }
+
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          actualDestPtr, 1,
+          compatibleAlpha);
+
+      if (!evalToDest)
+      {
+        if(!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
     }
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,true>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
 
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -512,10 +335,10 @@
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
 
     typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
@@ -525,34 +348,35 @@
         actualLhs.rows(), actualLhs.cols(),
         LhsMapper(actualLhs.data(), actualLhs.outerStride()),
         RhsMapper(actualRhsPtr, 1),
-        dest.data(), dest.innerStride(),
+        dest.data(), dest.col(0).innerStride(), //NOTE  if dest is not a vector at compile-time, then dest.innerStride() might be wrong. (bug 1166)
         actualAlpha);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Index Index;
-    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
-    const Index size = prod.rhs().rows();
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
+    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
+    const Index size = rhs.rows();
     for(Index k=0; k<size; ++k)
-      dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
+      dest += (alpha*actual_rhs.coeff(k)) * lhs.col(k);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Index Index;
-    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
-    const Index rows = prod.rows();
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
+    const Index rows = dest.rows();
     for(Index i=0; i<rows; ++i)
-      dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
+      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(actual_rhs.transpose())).sum();
   }
 };
 
@@ -568,12 +392,10 @@
   *
   * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
   */
-#ifndef __CUDACC__
-
-#ifdef EIGEN_TEST_EVALUATORS
 template<typename Derived>
 template<typename OtherDerived>
-inline const Product<Derived, OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
   // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -601,39 +423,7 @@
 
   return Product<Derived, OtherDerived>(derived(), other.derived());
 }
-#else
-template<typename Derived>
-template<typename OtherDerived>
-inline const typename ProductReturnType<Derived, OtherDerived>::Type
-MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
-{
-  // A note regarding the function declaration: In MSVC, this function will sometimes
-  // not be inlined since DenseStorage is an unwindable object for dynamic
-  // matrices and product types are holding a member to store the result.
-  // Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
-  enum {
-    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
-                   || OtherDerived::RowsAtCompileTime==Dynamic
-                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
-    AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
-    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
-  };
-  // note to the lost user:
-  //    * for a dot product use: v1.dot(v2)
-  //    * for a coeff-wise product use: v1.cwiseProduct(v2)
-  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-  EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
-#ifdef EIGEN_DEBUG_PRODUCT
-  internal::product_type<Derived,OtherDerived>::debug();
-#endif
-  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
-}
-#endif
 
-#endif
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
   *
   * The returned product will behave like any other expressions: the coefficients of the product will be
@@ -647,7 +437,8 @@
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC const typename LazyProductReturnType<Derived,OtherDerived>::Type
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
   enum {
@@ -666,7 +457,7 @@
     INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
   EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
 
-  return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived,OtherDerived,LazyProduct>(derived(), other.derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 90cf4a1..cf677a1 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h

@@ -44,47 +44,62 @@
   enum {
     HasHalfPacket = 0,
 
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasNegate = 1,
-    HasAbs = 1,
-    HasAbs2 = 1,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 0,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
+    HasBlend     = 0,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    HasCmp       = 0,
 
-    HasDiv = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasExpm1 = 0,
-    HasLog = 0,
-    HasLog1p = 0,
-    HasPow = 0,
+    HasDiv    = 0,
+    HasSqrt   = 0,
+    HasRsqrt  = 0,
+    HasExp    = 0,
+    HasExpm1  = 0,
+    HasLog    = 0,
+    HasLog1p  = 0,
+    HasLog10  = 0,
+    HasPow    = 0,
 
-    HasSin = 0,
-    HasCos = 0,
-    HasTan = 0,
-    HasASin = 0,
-    HasACos = 0,
-    HasATan = 0,
-    HasTanH = 0,
+    HasSin    = 0,
+    HasCos    = 0,
+    HasTan    = 0,
+    HasASin   = 0,
+    HasACos   = 0,
+    HasATan   = 0,
+    HasSinh   = 0,
+    HasCosh   = 0,
+    HasTanh   = 0,
     HasLGamma = 0,
     HasDiGamma = 0,
     HasZeta = 0,
     HasPolygamma = 0,
     HasErf = 0,
     HasErfc = 0,
+    HasNdtri = 0,
+    HasBessel = 0,
     HasIGamma = 0,
+    HasIGammaDerA = 0,
+    HasGammaSampleDerAlpha = 0,
     HasIGammac = 0,
     HasBetaInc = 0,
 
-    HasRound = 0,
-    HasFloor = 0,
-    HasCeil = 0
+    HasRound  = 0,
+    HasRint   = 0,
+    HasFloor  = 0,
+    HasCeil   = 0,
+    HasSign   = 0
   };
 };
 
@@ -114,6 +129,21 @@
 
 template<typename T> struct packet_traits<const T> : packet_traits<T> { };
 
+template<typename T> struct unpacket_traits
+{
+  typedef T type;
+  typedef T half;
+  enum
+  {
+    size = 1,
+    alignment = 1,
+    vectorizable = false,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<typename T> struct unpacket_traits<const T> : unpacket_traits<T> { };
 
 template <typename Src, typename Tgt> struct type_casting_traits {
   enum {
@@ -123,15 +153,35 @@
   };
 };
 
-template <typename T> struct type_casting_traits<T, T> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
+/** \internal Wrapper to ensure that multiple packet types can map to the same
+    same underlying vector type. */
+template<typename T, int unique_id = 0>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
 };
 
 
+/** \internal A convenience utility for determining if the type is a scalar.
+ * This is used to enable some generic packet implementations.
+ */
+template<typename Packet>
+struct is_scalar {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  enum {
+    value = internal::is_same<Packet, Scalar>::value
+  };
+};
+
 /** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
 template <typename SrcPacket, typename TgtPacket>
 EIGEN_DEVICE_FUNC inline TgtPacket
@@ -143,91 +193,406 @@
 pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
   return static_cast<TgtPacket>(a);
 }
-
 template <typename SrcPacket, typename TgtPacket>
 EIGEN_DEVICE_FUNC inline TgtPacket
 pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
   return static_cast<TgtPacket>(a);
 }
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/,
+      const SrcPacket& /*e*/, const SrcPacket& /*f*/, const SrcPacket& /*g*/, const SrcPacket& /*h*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+/** \internal \returns reinterpret_cast<Target>(a) */
+template <typename Target, typename Packet>
+EIGEN_DEVICE_FUNC inline Target
+preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
 
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-padd(const Packet& a,
-        const Packet& b) { return a+b; }
+padd(const Packet& a, const Packet& b) { return a+b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+padd(const bool& a, const bool& b) { return a || b; }
 
 /** \internal \returns a - b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-psub(const Packet& a,
-        const Packet& b) { return a-b; }
-
-/** \internal \returns true for if a == b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-peq(const Packet& a, const Packet& b) { return a == b; }
-
-/** \internal \returns true for if a < b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-plt(const Packet& a, const Packet& b) { return a < b; }
-
-/** \internal \returns true for if a <= b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ple(const Packet& a, const Packet& b) { return a <= b; }
-
-/** \internal \returns b if false_mask is set, else a */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pselect(const Packet& a,
-        const Packet& b,
-        const Packet& false_mask) {
-  return false_mask ? b : a;
-}
+psub(const Packet& a, const Packet& b) { return a-b; }
 
 /** \internal \returns -a (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pnegate(const Packet& a) { return -a; }
 
-/** \internal \returns conj(a) (coeff-wise) */
+template<> EIGEN_DEVICE_FUNC inline bool
+pnegate(const bool& a) { return !a; }
 
+/** \internal \returns conj(a) (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pconj(const Packet& a) { return numext::conj(a); }
 
 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmul(const Packet& a,
-        const Packet& b) { return a*b; }
+pmul(const Packet& a, const Packet& b) { return a*b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+pmul(const bool& a, const bool& b) { return a && b; }
 
 /** \internal \returns a / b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pdiv(const Packet& a,
-        const Packet& b) { return a/b; }
+pdiv(const Packet& a, const Packet& b) { return a/b; }
 
-/** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmin(const Packet& a,
-        const Packet& b) { return numext::mini(a, b); }
+// In the generic case, memset to all one bits.
+template<typename Packet, typename EnableIf = void>
+struct ptrue_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){
+    Packet b;
+    memset(static_cast<void*>(&b), 0xff, sizeof(Packet));
+    return b;
+  }
+};
 
-/** \internal \returns the max of \a a and \a b  (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmax(const Packet& a,
-        const Packet& b) { return numext::maxi(a, b); }
+// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value).
+// Although this is technically not a valid bitmask, the scalar path for pselect
+// uses a comparison to zero, so this should still work in most cases. We don't
+// have another option, since the scalar type requires initialization.
+template<typename T>
+struct ptrue_impl<T, 
+    typename internal::enable_if<is_scalar<T>::value && NumTraits<T>::RequireInitialization>::type > {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){
+    return T(1);
+  }
+};
 
-/** \internal \returns the absolute value of \a a */
+/** \internal \returns one bits. */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pabs(const Packet& a) { using std::abs; return abs(a); }
+ptrue(const Packet& a) {
+  return ptrue_impl<Packet>::run(a);
+}
+
+// In the general case, memset to zero.
+template<typename Packet, typename EnableIf = void>
+struct pzero_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
+    Packet b;
+    memset(static_cast<void*>(&b), 0x00, sizeof(Packet));
+    return b;
+  }
+};
+
+// For scalars, explicitly set to Scalar(0), since the underlying representation
+// for zero may not consist of all-zero bits.
+template<typename T>
+struct pzero_impl<T,
+    typename internal::enable_if<is_scalar<T>::value>::type> {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) {
+    return T(0);
+  }
+};
+
+/** \internal \returns packet of zeros */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pzero(const Packet& a) {
+  return pzero_impl<Packet>::run(a);
+}
+
+/** \internal \returns a <= b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a == b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); }
+
+template<typename T>
+struct bit_and {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a & b;
+  }
+};
+
+template<typename T>
+struct bit_or {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a | b;
+  }
+};
+
+template<typename T>
+struct bit_xor {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a ^ b;
+  }
+};
+
+template<typename T>
+struct bit_not {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const {
+    return ~a;
+  }
+};
+
+// Use operators &, |, ^, ~.
+template<typename T>
+struct operator_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not<T>()(a); }
+};
+
+// Apply binary operations byte-by-byte
+template<typename T>
+struct bytewise_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) {
+    return binary(a, b, bit_and<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { 
+    return binary(a, b, bit_or<unsigned char>());
+   }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) {
+    return binary(a, b, bit_xor<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { 
+    return unary(a,bit_not<unsigned char>());
+   }
+  
+ private:
+  template<typename Op>
+  EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++);
+    }
+    return c;
+  }
+
+  template<typename Op>
+  EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    const unsigned char* b_ptr = reinterpret_cast<const unsigned char*>(&b);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++, *b_ptr++);
+    }
+    return c;
+  }
+};
+
+// In the general case, use byte-by-byte manipulation.
+template<typename T, typename EnableIf = void>
+struct bitwise_helper : public bytewise_bitwise_helper<T> {};
+
+// For integers or non-trivial scalars, use binary operators.
+template<typename T>
+struct bitwise_helper<T,
+  typename internal::enable_if<
+    is_scalar<T>::value && (NumTraits<T>::IsInteger || NumTraits<T>::RequireInitialization)>::type
+  > : public operator_bitwise_helper<T> {};
 
 /** \internal \returns the bitwise and of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pand(const Packet& a, const Packet& b) { return a & b; }
+pand(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_and(a, b);
+}
 
 /** \internal \returns the bitwise or of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-por(const Packet& a, const Packet& b) { return a | b; }
+por(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_or(a, b);
+}
 
 /** \internal \returns the bitwise xor of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pxor(const Packet& a, const Packet& b) { return a ^ b; }
+pxor(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_xor(a, b);
+}
 
-/** \internal \returns the bitwise andnot of \a a and \a b */
+/** \internal \returns the bitwise not of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+pnot(const Packet& a) {
+  return bitwise_helper<Packet>::bitwise_not(a);
+}
+
+/** \internal \returns the bitwise and of \a a and not \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); }
+
+// In the general case, use bitwise select.
+template<typename Packet, typename EnableIf = void>
+struct pselect_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return por(pand(a,mask),pandnot(b,mask));
+  }
+};
+
+// For scalars, use ternary select.
+template<typename Packet>
+struct pselect_impl<Packet, 
+    typename internal::enable_if<is_scalar<Packet>::value>::type > {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return numext::equal_strict(mask, Packet(0)) ? b : a;
+  }
+};
+
+/** \internal \returns \a or \b for each field in packet according to \mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pselect(const Packet& mask, const Packet& a, const Packet& b) {
+  return pselect_impl<Packet>::run(mask, a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC inline bool pselect<bool>(
+    const bool& cond, const bool& a, const bool& b) {
+  return cond ? a : b;
+}
+
+/** \internal \returns the min or of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, the result is implementation defined. */
+template<int NaNPropagation>
+struct pminmax_impl {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+    return op(a,b);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, NaN is returned. */
+template<>
+struct pminmax_impl<PropagateNaN> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet not_nan_mask_b = pcmp_eq(b, b);
+  return pselect(not_nan_mask_a,
+                 pselect(not_nan_mask_b, op(a, b), b),
+                 a);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If both \a a and \a b are NaN, NaN is returned.
+    Equivalent to std::fmin(a, b).  */
+template<>
+struct pminmax_impl<PropagateNumbers> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet not_nan_mask_b = pcmp_eq(b, b);
+  return pselect(not_nan_mask_a,
+                 pselect(not_nan_mask_b, op(a, b), a),
+                 b);
+  }
+};
+
+
+#ifndef SYCL_DEVICE_ONLY
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func
+#else
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) \
+[](const Type& a, const Type& b) { \
+        return Func(a, b);}
+#endif
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); }
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
+  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
+}
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise)
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
+  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet,(pmax<Packet>)));
+}
+
+/** \internal \returns the absolute value of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pabs(const Packet& a) { return numext::abs(a); }
+template<> EIGEN_DEVICE_FUNC inline unsigned int
+pabs(const unsigned int& a) { return a; }
+template<> EIGEN_DEVICE_FUNC inline unsigned long
+pabs(const unsigned long& a) { return a; }
+template<> EIGEN_DEVICE_FUNC inline unsigned long long
+pabs(const unsigned long long& a) { return a; }
+
+/** \internal \returns the addsub value of \a a,b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+paddsub(const Packet& a, const Packet& b) {
+  return pselect(peven_mask(a), padd(a, b), psub(a, b));
+ }
+
+/** \internal \returns the phase angle of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+
+
+/** \internal \returns \a a logically shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+parithmetic_shift_right(const int& a) { return a >> N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+parithmetic_shift_right(const long int& a) { return a >> N; }
+
+/** \internal \returns \a a arithmetically shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+plogical_shift_right(const int& a) { return static_cast<int>(static_cast<unsigned int>(a) >> N); }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+plogical_shift_right(const long int& a) { return static_cast<long>(static_cast<unsigned long>(a) >> N); }
+
+/** \internal \returns \a a shifted by N bits to the left */
+template<int N> EIGEN_DEVICE_FUNC inline int
+plogical_shift_left(const int& a) { return a << N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+plogical_shift_left(const long int& a) { return a << N; }
+
+/** \internal \returns the significant and exponent of the underlying floating point numbers
+  * See https://en.cppreference.com/w/cpp/numeric/math/frexp
+  */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
+  int exp;
+  EIGEN_USING_STD(frexp);
+  Packet result = static_cast<Packet>(frexp(a, &exp));
+  exponent = static_cast<Packet>(exp);
+  return result;
+}
+
+/** \internal \returns a * 2^((int)exponent)
+  * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pldexp(const Packet &a, const Packet &exponent) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<Packet>(ldexp(a, static_cast<int>(exponent)));
+}
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -237,10 +602,22 @@
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns a packet version of \a *from, (un-aligned masked load)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type
+ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
+
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
 
+/** \internal \returns a packet with constant coefficients set from bits */
+template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
+pset1frombits(BitsType a);
+
 /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
@@ -250,7 +627,7 @@
   * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
   * Currently, this function is only used for scalar * complex products.
   */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
 /** \internal \returns a packet with elements of \a *from quadrupled.
@@ -298,9 +675,22 @@
 }
 
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Scalar>
-EIGEN_DEVICE_FUNC inline typename packet_traits<Scalar>::type
-plset(const Scalar& a) { return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
+plset(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
+     where x is the value of all 1-bits. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+peven_mask(const Packet& /*a*/) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  for(size_t i = 0; i < n; ++i) {
+    memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
+  }
+  return ploadu<Packet>(elements);
+}
+
 
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
@@ -310,61 +700,39 @@
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 {  (*to) = from; }
 
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/)
+/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_store_available, void>::type
+pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
  { return ploadu<Packet>(from); }
 
- template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/)
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
  { pstore(to, from); }
 
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#ifdef __CUDA_ARCH__
-#if defined(__LP64__)
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
+#if defined(__LP64__) || EIGEN_OS_WIN64
   // 64-bit pointer operand constraint for inlined asm
   asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
 #else
   // 32-bit pointer operand constraint for inlined asm
   asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
 #endif
-#elif !defined(_MSC_VER)
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
   __builtin_prefetch(addr);
 #endif
 }
 
-/** \internal \returns the first element of a packet */
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
-{ return a; }
-
-/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-preduxp(const Packet* vecs) { return vecs[0]; }
-
-/** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
-{ return a; }
-
-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
-  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
-  * For packet-size smaller or equal to 4, this boils down to a noop.
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline
-typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux4(const Packet& a)
-{ return a; }
-
-/** \internal \returns the product of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
-{ return a; }
-
-/** \internal \returns the min of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
-{ return a; }
-
-/** \internal \returns the max of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
-{ return a; }
-
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
@@ -372,10 +740,7 @@
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
-  // FIXME: uncomment the following in case we drop the internal imag and real functions.
-//   using std::imag;
-//   using std::real;
-  return Packet(imag(a),real(a));
+  return Packet(numext::imag(a),numext::real(a));
 }
 
 /**************************
@@ -384,91 +749,76 @@
 
 /** \internal \returns the sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin(const Packet& a) { using std::sin; return sin(a); }
+Packet psin(const Packet& a) { EIGEN_USING_STD(sin); return sin(a); }
 
 /** \internal \returns the cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos(const Packet& a) { using std::cos; return cos(a); }
+Packet pcos(const Packet& a) { EIGEN_USING_STD(cos); return cos(a); }
 
 /** \internal \returns the tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptan(const Packet& a) { using std::tan; return tan(a); }
+Packet ptan(const Packet& a) { EIGEN_USING_STD(tan); return tan(a); }
 
 /** \internal \returns the arc sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin(const Packet& a) { using std::asin; return asin(a); }
+Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); }
 
 /** \internal \returns the arc cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos(const Packet& a) { using std::acos; return acos(a); }
+Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); }
 
-/** \internal \returns the atan of \a a (coeff-wise) */
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { using std::atan; return atan(a); }
+Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); }
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); }
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); }
 
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_DEVICE_FUNC
-Packet pexp(const Packet& a) { using std::exp; return exp(a); }
+Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); }
 
 /** \internal \returns the expm1 of \a a (coeff-wise) */
-template<typename Packet>
-EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC Packet
-pexpm1(const Packet& a) {
-  return numext::expm1(a);
-}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexpm1(const Packet& a) { return numext::expm1(a); }
 
 /** \internal \returns the log of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_DEVICE_FUNC
-Packet plog(const Packet& a) { using std::log; return log(a); }
+Packet plog(const Packet& a) { EIGEN_USING_STD(log); return log(a); }
 
 /** \internal \returns the log1p of \a a (coeff-wise) */
-template <typename Packet>
-EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC Packet
-plog1p(const Packet& a) {
-  return numext::log1p(a);
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog1p(const Packet& a) { return numext::log1p(a); }
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); }
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog2(const Packet& a) {
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+  return pmul(pset1<Packet>(Scalar(EIGEN_LOG2E)), plog(a)); 
 }
 
 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_DEVICE_FUNC
-Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+Packet psqrt(const Packet& a) { return numext::sqrt(a); }
 
 /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_DEVICE_FUNC
 Packet prsqrt(const Packet& a) {
-  using std::sqrt;
-  const Packet one(1);
-  return one/sqrt(a);
-}
-
-// Default ptanh approximation threshold, assumes single precision
-// floating point.
-template<typename Packet> EIGEN_DEVICE_FUNC Packet ptanh_approx_threshold() {
-  return pset1<Packet>(0.01);
-}
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_DEVICE_FUNC Packet ptanh(const Packet& x)
-{
-  const Packet one = pset1<Packet>(1);
-  const Packet two = pset1<Packet>(2);
-  const Packet three = pset1<Packet>(3);
-  const Packet thresh = ptanh_approx_threshold<Packet>();
-  const Packet x2 = pmul(x, x);
-  const Packet small_approx = pmul(x, psub(one, pdiv(x2, three)));
-  const Packet med_approx = psub(one, pdiv(two, padd(pexp(pmul(two, x)), one)));
-
-  // If |x| > thresh, tanh(x) = 1-2/(exp(2*x) + 1)
-  // tanh(x) can be written: x(1 - x^2/3 + ...) for |x| < pi/2
-  // Select a thresh s.t. |tanh(x) - x| = O(eps), where for floats,
-  // If |x| < thresh, tanh(x) = x*(1-x^2/3)
-  // Use theresh = 0.01 as this matches the float32 approximation
-  // threshold on my system!
-  return pselect(med_approx, small_approx, ple(pabs(x), thresh));
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+  return pdiv(pset1<Packet>(Scalar(1)), psqrt(a));
 }
 
 /** \internal \returns the rounded value of \a a (coeff-wise) */
@@ -479,15 +829,121 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
 
+/** \internal \returns the rounded value of \a a (coeff-wise) with current
+ * rounding mode */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet print(const Packet& a) { using numext::rint; return rint(a); }
+
 /** \internal \returns the ceil of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
 
+/** \internal \returns the first element of a packet */
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+pfirst(const Packet& a)
+{ return a; }
+
+/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux_half_dowto4(const Packet& a)
+{ return a; }
+
+// Slow generic implementation of Packet reduction.
+template <typename Packet, typename Op>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+predux_helper(const Packet& a, Op op) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  pstoreu<Scalar>(elements, a);
+  for(size_t k = n / 2; k > 0; k /= 2)  {
+    for(size_t i = 0; i < k; ++i) {
+      elements[i] = op(elements[i], elements[i + k]);
+    }
+  }
+  return elements[0];
+}
+
+/** \internal \returns the sum of the elements of \a a*/
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+predux(const Packet& a)
+{
+  return a;
+}
+
+/** \internal \returns the product of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul<Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(
+    const Packet &a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<PropagateFast, Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(
+    const Packet &a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<PropagateFast, Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
+}
+
+#undef EIGEN_BINARY_OP_NAN_PROPAGATION
+
+/** \internal \returns true if all coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return numext::not_equal_strict(predux(a), Scalar(0));
+}
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
 
-/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
+/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
 // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
 template<typename Packet>
 inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
@@ -503,22 +959,22 @@
 { return padd(pmul(a, b),c); }
 
 /** \internal \returns a packet version of \a *from.
-  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
-template<typename Packet, int LoadMode>
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
     return pload<Packet>(from);
   else
     return ploadu<Packet>(from);
 }
 
 /** \internal copy the packet \a from to \a *to.
-  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet, int LoadMode>
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
     pstore(to, from);
   else
     pstoreu(to, from);
@@ -530,53 +986,23 @@
   * by the current computation.
   */
 template<typename Packet, int LoadMode>
-EIGEN_DEVICE_FUNC
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
 {
   return ploadt<Packet, LoadMode>(from);
 }
 
-/** \internal default implementation of palign() allowing partial specialization */
-template<int Offset,typename PacketType>
-struct palign_impl
-{
-  // by default data are aligned, so there is nothing to be done :)
-  static inline void run(PacketType&, const PacketType&) {}
-};
-
-/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
-  * of \a first and \a Offset first elements of \a second.
-  *
-  * This function is currently only used to optimize matrix-vector products on unligned matrices.
-  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
-  * at the position \a Offset. For instance, for packets of 4 elements, we have:
-  *  Input:
-  *  - first = {f0,f1,f2,f3}
-  *  - second = {s0,s1,s2,s3}
-  * Output:
-  *   - if Offset==0 then {f0,f1,f2,f3}
-  *   - if Offset==1 then {f1,f2,f3,s0}
-  *   - if Offset==2 then {f2,f3,s0,s1}
-  *   - if Offset==3 then {f3,s0,s1,s3}
-  */
-template<int Offset,typename PacketType>
-inline void palign(PacketType& first, const PacketType& second)
-{
-  palign_impl<Offset,PacketType>::run(first,second);
-}
-
 /***************************************************************************
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/
 
 // Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
 
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
-{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
 
 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
-{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
 
 #endif
 
@@ -589,12 +1015,11 @@
   Packet packet[N];
 };
 
-template<typename SquarePacketBlock> EIGEN_DEVICE_FUNC inline void
-ptranspose(SquarePacketBlock& /*kernel*/) {
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
   // Nothing to do in the scalar case, i.e. a 1x1 matrix.
 }
 
-
 /***************************************************************************
  * Selector, i.e. vector of N boolean values used to select (i.e. blend)
  * words from 2 packets.

diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 8f91826..629af94 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h

@@ -11,13 +11,30 @@
 #ifndef EIGEN_GLOBAL_FUNCTIONS_H
 #define EIGEN_GLOBAL_FUNCTIONS_H
 
-#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  /** \returns an expression of the coefficient-wise DOC_OP of \a x
+
+    DOC_DETAILS
+
+    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp
+    */ \
   template<typename Derived> \
   inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
-  NAME(const Eigen::ArrayBase<Derived>& x) { \
-    return x.derived(); \
+  NAME(const Eigen::ArrayBase<Derived>& x);
+
+#else
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
+    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
   }
 
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
   \
   template<typename Derived> \
@@ -30,73 +47,139 @@
   { \
     static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
     { \
-      return x.derived(); \
+      return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived()); \
     } \
   };
 
-
 namespace Eigen
 {
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp, scalar_exp_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1, scalar_expm1_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p, scalar_log1p_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op,imaginary part,\sa ArrayBase::imag)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op,complex conjugate,\sa ArrayBase::conjugate)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op,inverse,\sa ArrayBase::inverse)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op,sine,\sa ArrayBase::sin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op,cosine,\sa ArrayBase::cos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op,tangent,\sa ArrayBase::tan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op,arc-tangent,\sa ArrayBase::atan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op,arc-sine,\sa ArrayBase::asin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op,arc-consine,\sa ArrayBase::acos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op,not-a-number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
 
-  template<typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
-    return x.derived().pow(exponent);
-  }
-
-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<Derived>& exponents)
+  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
+    *
+    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Derived,typename ScalarExponent>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
+#else
+  template <typename Derived,typename ScalarExponent>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA ScalarExponent EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
   {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>(
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
+           typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
+  }
+#endif
+
+  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power.
+    *
+    * Example: \include Cwise_array_power_array.cpp
+    * Output: \verbinclude Cwise_array_power_array.out
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
+  {
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
       x.derived(),
       exponents.derived()
     );
   }
 
-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>
-  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
-  {
-    typename Derived::ConstantReturnType constant_x(exponents.rows(), exponents.cols(), x);
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>(
-      constant_x,
-      exponents.derived()
-    );
+  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power between a scalar and an array of exponents.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
+    *
+    * Example: \include Cwise_scalar_power_array.cpp
+    * Output: \verbinclude Cwise_scalar_power_array.out
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Scalar,typename Derived>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
+  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
+#else
+  template <typename Scalar, typename Derived>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA Scalar EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
+           typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
   }
+#endif
 
-  /**
-  * \brief Component-wise division of a scalar by array elements.
-  **/
-  template <typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
-    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
-  {
-    return Eigen::CwiseUnaryOp<
-        Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>,
-        const Derived>(
-        a.derived(),
-        Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s));
-  }
 
   namespace internal
   {

diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index a1a90c1..e81c315 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h

@@ -41,6 +41,7 @@
   *  - \b rowSuffix string printed at the end of each row
   *  - \b matPrefix string printed at the beginning of the matrix
   *  - \b matSuffix string printed at the end of the matrix
+  *  - \b fill character printed to fill the empty space in aligned columns
   *
   * Example: \include IOFormat.cpp
   * Output: \verbinclude IOFormat.out
@@ -53,9 +54,9 @@
   IOFormat(int _precision = StreamPrecision, int _flags = 0,
     const std::string& _coeffSeparator = " ",
     const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
-    const std::string& _matPrefix="", const std::string& _matSuffix="")
+    const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
   : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
-    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+    rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
   {
     // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
     // don't add rowSpacer if columns are not to be aligned
@@ -71,6 +72,7 @@
   std::string matPrefix, matSuffix;
   std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
   std::string coeffSeparator;
+  char fill;
   int precision;
   int flags;
 };
@@ -80,7 +82,7 @@
   *
   * \brief Pseudo expression providing matrix output with given format
   *
-  * \param ExpressionType the type of the object on which IO stream operations are performed
+  * \tparam ExpressionType the type of the object on which IO stream operations are performed
   *
   * This class represents an expression with stream operators controlled by a given IOFormat.
   * It is the return type of DenseBase::format()
@@ -105,57 +107,32 @@
     }
 
   protected:
-    const typename ExpressionType::Nested m_matrix;
+    typename ExpressionType::Nested m_matrix;
     IOFormat m_format;
 };
 
-/** \returns a WithFormat proxy object allowing to print a matrix the with given
-  * format \a fmt.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa class IOFormat, class WithFormat
-  */
-template<typename Derived>
-inline const WithFormat<Derived>
-DenseBase<Derived>::format(const IOFormat& fmt) const
-{
-  return WithFormat<Derived>(derived(), fmt);
-}
-
 namespace internal {
 
-template<typename Scalar, bool IsInteger>
-struct significant_decimals_default_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline int run()
-  {
-    using std::ceil;
-    using std::log;
-    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
-  }
-};
-
-template<typename Scalar>
-struct significant_decimals_default_impl<Scalar, true>
-{
-  static inline int run()
-  {
-    return 0;
-  }
-};
-
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call digits10() which has been introduced in July 2016 in 3.3.
 template<typename Scalar>
 struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
-{};
+{
+  static inline int run()
+  {
+    return NumTraits<Scalar>::digits10();
+  }
+};
 
 /** \internal
   * print the matrix \a _m to the output stream \a s using the output format \a fmt */
 template<typename Derived>
 std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
 {
+  using internal::is_same;
+  using internal::conditional;
+
   if(_m.size() == 0)
   {
     s << fmt.matPrefix << fmt.matSuffix;
@@ -164,7 +141,22 @@
   
   typename Derived::Nested m = _m;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
+  typedef typename
+      conditional<
+          is_same<Scalar, char>::value ||
+            is_same<Scalar, unsigned char>::value ||
+            is_same<Scalar, numext::int8_t>::value ||
+            is_same<Scalar, numext::uint8_t>::value,
+          int,
+          typename conditional<
+              is_same<Scalar, std::complex<char> >::value ||
+                is_same<Scalar, std::complex<unsigned char> >::value ||
+                is_same<Scalar, std::complex<numext::int8_t> >::value ||
+                is_same<Scalar, std::complex<numext::uint8_t> >::value,
+              std::complex<int>,
+              const Scalar&
+            >::type
+        >::type PrintType;
 
   Index width = 0;
 
@@ -201,33 +193,42 @@
       {
         std::stringstream sstr;
         sstr.copyfmt(s);
-        sstr << m.coeff(i,j);
+        sstr << static_cast<PrintType>(m.coeff(i,j));
         width = std::max<Index>(width, Index(sstr.str().length()));
       }
   }
+  std::streamsize old_width = s.width();
+  char old_fill_character = s.fill();
   s << fmt.matPrefix;
-  const char old_fill = s.fill();
-  s.fill(' ');
   for(Index i = 0; i < m.rows(); ++i)
   {
     if (i)
       s << fmt.rowSpacer;
     s << fmt.rowPrefix;
-    if(width) s.width(width);
-    s << m.coeff(i, 0);
+    if(width) {
+      s.fill(fmt.fill);
+      s.width(width);
+    }
+    s << static_cast<PrintType>(m.coeff(i, 0));
     for(Index j = 1; j < m.cols(); ++j)
     {
       s << fmt.coeffSeparator;
-      if (width) s.width(width);
-      s << m.coeff(i, j);
+      if(width) {
+        s.fill(fmt.fill);
+        s.width(width);
+      }
+      s << static_cast<PrintType>(m.coeff(i, j));
     }
     s << fmt.rowSuffix;
     if( i < m.rows() - 1)
       s << fmt.rowSeparator;
   }
-  s.fill(old_fill);
   s << fmt.matSuffix;
   if(explicit_precision) s.precision(old_precision);
+  if(width) {
+    s.fill(old_fill_character);
+    s.width(old_width);
+  }
   return s;
 }
 

diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
new file mode 100644
index 0000000..0847625
--- /dev/null
+++ b/Eigen/src/Core/IndexedView.h

@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INDEXED_VIEW_H
+#define EIGEN_INDEXED_VIEW_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename XprType, typename RowIndices, typename ColIndices>
+struct traits<IndexedView<XprType, RowIndices, ColIndices> >
+ : traits<XprType>
+{
+  enum {
+    RowsAtCompileTime = int(array_size<RowIndices>::value),
+    ColsAtCompileTime = int(array_size<ColIndices>::value),
+    MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic,
+    MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic,
+
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+
+    RowIncr = int(get_compile_time_incr<RowIndices>::value),
+    ColIncr = int(get_compile_time_incr<ColIndices>::value),
+    InnerIncr = IsRowMajor ? ColIncr : RowIncr,
+    OuterIncr = IsRowMajor ? RowIncr : ColIncr,
+
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret) : int(outer_stride_at_compile_time<XprType>::ret),
+    XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType>::ret) : int(inner_stride_at_compile_time<XprType>::ret),
+
+    InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime,
+    IsBlockAlike = InnerIncr==1 && OuterIncr==1,
+    IsInnerPannel = HasSameStorageOrderAsXprType && is_same<AllRange<InnerSize>,typename conditional<XprTypeIsRowMajor,ColIndices,RowIndices>::type>::value,
+
+    InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr,
+    OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr,
+
+    ReturnAsScalar = is_same<RowIndices,SingleRange>::value && is_same<ColIndices,SingleRange>::value,
+    ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
+    ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
+
+    // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
+    // but this is too strict regarding negative strides...
+    DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask )) | FlagsLvalueBit | FlagsRowMajorBit | FlagsLinearAccessBit
+  };
+
+  typedef Block<XprType,RowsAtCompileTime,ColsAtCompileTime,IsInnerPannel> BlockType;
+};
+
+}
+
+template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl;
+
+
+/** \class IndexedView
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices
+  *
+  * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns
+  * \tparam RowIndices the type of the object defining the sequence of row indices
+  * \tparam ColIndices the type of the object defining the sequence of column indices
+  *
+  * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection
+  * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$
+  * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$  be the nested matrix, then the resulting matrix \f$ B \f$ has \c m
+  * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$.
+  *
+  * The \c RowIndices and \c ColIndices types must be compatible with the following API:
+  * \code
+  * <integral type> operator[](Index) const;
+  * Index size() const;
+  * \endcode
+  *
+  * Typical supported types thus include:
+  *  - std::vector<int>
+  *  - std::valarray<int>
+  *  - std::array<int>
+  *  - Plain C arrays: int[N]
+  *  - Eigen::ArrayXi
+  *  - decltype(ArrayXi::LinSpaced(...))
+  *  - Any view/expressions of the previous types
+  *  - Eigen::ArithmeticSequence
+  *  - Eigen::internal::AllRange      (helper for Eigen::all)
+  *  - Eigen::internal::SingleRange  (helper for single index)
+  *  - etc.
+  *
+  * In typical usages of %Eigen, this class should never be used directly. It is the return type of
+  * DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * \sa class Block
+  */
+template<typename XprType, typename RowIndices, typename ColIndices>
+class IndexedView : public IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView)
+
+  typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+  template<typename T0, typename T1>
+  IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices)
+    : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices)
+  {}
+
+  /** \returns number of rows */
+  Index rows() const { return internal::size(m_rowIndices); }
+
+  /** \returns number of columns */
+  Index cols() const { return internal::size(m_colIndices); }
+
+  /** \returns the nested expression */
+  const typename internal::remove_all<XprType>::type&
+  nestedExpression() const { return m_xpr; }
+
+  /** \returns the nested expression */
+  typename internal::remove_reference<XprType>::type&
+  nestedExpression() { return m_xpr; }
+
+  /** \returns a const reference to the object storing/generating the row indices */
+  const RowIndices& rowIndices() const { return m_rowIndices; }
+
+  /** \returns a const reference to the object storing/generating the column indices */
+  const ColIndices& colIndices() const { return m_colIndices; }
+
+protected:
+  MatrixTypeNested m_xpr;
+  RowIndices m_rowIndices;
+  ColIndices m_colIndices;
+};
+
+
+// Generic API dispatcher
+template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl
+  : public internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type Base;
+};
+
+namespace internal {
+
+
+template<typename ArgType, typename RowIndices, typename ColIndices>
+struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
+  : evaluator_base<IndexedView<ArgType, RowIndices, ColIndices> >
+{
+  typedef IndexedView<ArgType, RowIndices, ColIndices> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
+
+    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+
+    FlagsRowMajorBit = traits<XprType>::FlagsRowMajorBit, 
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) | FlagsLinearAccessBit | FlagsRowMajorBit,
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar& coeffRef(Index index) const
+  {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const CoeffReturnType coeff(Index index) const
+  {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+protected:
+
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INDEXED_VIEW_H

diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
new file mode 100644
index 0000000..c514438
--- /dev/null
+++ b/Eigen/src/Core/Inverse.h

@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+namespace Eigen {
+
+template<typename XprType,typename StorageKind> class InverseImpl;
+
+namespace internal {
+
+template<typename XprType>
+struct traits<Inverse<XprType> >
+  : traits<typename XprType::PlainObject>
+{
+  typedef typename XprType::PlainObject PlainObject;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit
+  };
+};
+
+} // end namespace internal
+
+/** \class Inverse
+  *
+  * \brief Expression of the inverse of another expression
+  *
+  * \tparam XprType the type of the expression we are taking the inverse
+  *
+  * This class represents an abstract expression of A.inverse()
+  * and most of the time this is the only way it is used.
+  *
+  */
+template<typename XprType>
+class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::Scalar                            Scalar;
+  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
+  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
+  typedef typename internal::ref_selector<Inverse>::type Nested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
+    : m_xpr(xpr)
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+
+  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
+
+protected:
+  XprTypeNested m_xpr;
+};
+
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class InverseImpl
+  : public internal::generic_xpr_base<Inverse<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Inverse<XprType> >::type Base;
+  typedef typename XprType::Scalar Scalar;
+private:
+
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+/** \internal
+  * \brief Default evaluator for Inverse expression.
+  *
+  * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
+  * by a call to internal::call_assignment_no_alias.
+  * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
+  * there own nested expression.
+  *
+  * \sa class Inverse
+  */
+template<typename ArgType>
+struct unary_evaluator<Inverse<ArgType> >
+  : public evaluator<typename Inverse<ArgType>::PlainObject>
+{
+  typedef Inverse<ArgType> InverseType;
+  typedef typename InverseType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  unary_evaluator(const InverseType& inv_xpr)
+    : m_result(inv_xpr.rows(), inv_xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    internal::call_assignment_no_alias(m_result, inv_xpr);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_H

diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 0838d69..218cc15 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h

@@ -11,7 +11,35 @@
 #ifndef EIGEN_MAP_H
 #define EIGEN_MAP_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct traits<Map<PlainObjectType, MapOptions, StrideType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
+                             ? PlainObjectType::ColsAtCompileTime
+                             : PlainObjectType::RowsAtCompileTime,
+
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
+                                ? Dynamic
+                                : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
+                             : int(StrideType::OuterStrideAtCompileTime),
+    Alignment = int(MapOptions)&int(AlignedMask),
+    Flags0 = TraitsBase::Flags & (~NestByRefBit),
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
+  };
+private:
+  enum { Options }; // Expressions don't have Options
+};
+}
 
 /** \class Map
   * \ingroup Core_Module
@@ -19,7 +47,7 @@
   * \brief A matrix or vector expression mapping an existing array of data.
   *
   * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
   *                The default is \c #Unaligned.
   * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
   *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@@ -63,44 +91,6 @@
   *
   * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
   */
-
-namespace internal {
-template<typename PlainObjectType, int MapOptions, typename StrideType>
-struct traits<Map<PlainObjectType, MapOptions, StrideType> >
-  : public traits<PlainObjectType>
-{
-  typedef traits<PlainObjectType> TraitsBase;
-  typedef typename PlainObjectType::Index Index;
-  typedef typename PlainObjectType::Scalar Scalar;
-  enum {
-    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
-                             ? int(PlainObjectType::InnerStrideAtCompileTime)
-                             : int(StrideType::InnerStrideAtCompileTime),
-    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
-                             ? int(PlainObjectType::OuterStrideAtCompileTime)
-                             : int(StrideType::OuterStrideAtCompileTime),
-    HasNoInnerStride = InnerStrideAtCompileTime == 1,
-    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
-    HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
-    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
-    KeepsPacketAccess = bool(HasNoInnerStride)
-                        && ( bool(IsDynamicSize)
-                           || HasNoOuterStride
-                           || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
-    Flags0 = TraitsBase::Flags & (~NestByRefBit),
-    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
-    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
-    Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
-    Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
-  };
-private:
-  enum { Options }; // Expressions don't have Options
-};
-}
-
 template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
   : public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
 {
@@ -110,38 +100,34 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(Map)
 
     typedef typename Base::PointerType PointerType;
-#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
-    typedef const Scalar* PointerArgType;
-    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
-#else
     typedef PointerType PointerArgType;
     EIGEN_DEVICE_FUNC
     inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
-#endif
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
       return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : IsVectorAtCompileTime ? this->size()
-           : int(Flags)&RowMajorBit ? this->cols()
-           : this->rows();
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+           : IsVectorAtCompileTime ? (this->size() * innerStride())
+           : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
+           : (this->rows() * innerStride());
     }
 
     /** Constructor in the fixed-size case.
       *
       * \param dataPtr pointer to the array to map
-      * \param a_stride optional Stride object, passing the strides.
+      * \param stride optional Stride object, passing the strides.
       */
     EIGEN_DEVICE_FUNC
-    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
+    explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -149,12 +135,12 @@
     /** Constructor in the dynamic-size vector case.
       *
       * \param dataPtr pointer to the array to map
-      * \param a_size the size of the vector expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param size the size of the vector expression
+      * \param stride optional Stride object, passing the strides.
       */
     EIGEN_DEVICE_FUNC
-    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
+    inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -162,13 +148,13 @@
     /** Constructor in the dynamic-size matrix case.
       *
       * \param dataPtr pointer to the array to map
-      * \param nbRows the number of rows of the matrix expression
-      * \param nbCols the number of columns of the matrix expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param rows the number of rows of the matrix expression
+      * \param cols the number of columns of the matrix expression
+      * \param stride optional Stride object, passing the strides.
       */
     EIGEN_DEVICE_FUNC
-    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
+    inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }

diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index e8ecb17..d856447 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h

@@ -12,15 +12,25 @@
 #define EIGEN_MAPBASE_H
 
 #define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
-      EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+      EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
                           YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
 
-namespace Eigen { 
+namespace Eigen {
 
-/** \class MapBase
-  * \ingroup Core_Module
+/** \ingroup Core_Module
   *
-  * \brief Base class for Map and Block expression with direct access
+  * \brief Base class for dense Map and Block expression with direct access
+  *
+  * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
+  * Map and Block objects with direct access.
+  * Typical users do not have to directly deal with this class.
+  *
+  * This class can be extended by through the macro plugin \c EIGEN_MAPBASE_PLUGIN.
+  * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
+  *
+  * The \c Derived class has to provide the following two methods describing the memory layout:
+  *  \code Index innerStride() const; \endcode
+  *  \code Index outerStride() const; \endcode
   *
   * \sa class Map, class Block
   */
@@ -33,11 +43,11 @@
     enum {
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
       SizeAtCompileTime = Base::SizeAtCompileTime
     };
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -76,8 +86,12 @@
 
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
+    /** \copydoc DenseBase::rows() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
+    /** \copydoc DenseBase::cols() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
 
     /** Returns a pointer to the first coefficient of the matrix or vector.
       *
@@ -85,14 +99,16 @@
       *
       * \sa innerStride(), outerStride()
       */
-    inline const Scalar* data() const { return m_data; }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
 
+    /** \copydoc PlainObjectBase::coeff(Index,Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index rowId, Index colId) const
     {
       return m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeff(Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index index) const
     {
@@ -100,12 +116,14 @@
       return m_data[index * innerStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return this->m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeffRef(Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
@@ -113,6 +131,7 @@
       return this->m_data[index * innerStride()];
     }
 
+    /** \internal */
     template<int LoadMode>
     inline PacketScalar packet(Index rowId, Index colId) const
     {
@@ -120,6 +139,7 @@
                (m_data + (colId * colStride() + rowId * rowStride()));
     }
 
+    /** \internal */
     template<int LoadMode>
     inline PacketScalar packet(Index index) const
     {
@@ -127,13 +147,15 @@
       return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
     }
 
+    /** \internal Constructor for fixed size matrices or vectors */
     EIGEN_DEVICE_FUNC
-    inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
+    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
     {
       EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-      checkSanity();
+      checkSanity<Derived>();
     }
 
+    /** \internal Constructor for dynamically sized vectors */
     EIGEN_DEVICE_FUNC
     inline MapBase(PointerType dataPtr, Index vecSize)
             : m_data(dataPtr),
@@ -143,46 +165,72 @@
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
       eigen_assert(vecSize >= 0);
       eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
-      checkSanity();
+      checkSanity<Derived>();
     }
 
+    /** \internal Constructor for dynamically sized matrices */
     EIGEN_DEVICE_FUNC
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
-            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
+    inline MapBase(PointerType dataPtr, Index rows, Index cols)
+            : m_data(dataPtr), m_rows(rows), m_cols(cols)
     {
       eigen_assert( (dataPtr == 0)
-              || (   nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-                  && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
-      checkSanity();
+              || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+                  && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
+      checkSanity<Derived>();
     }
 
+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
   protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 
+    template<typename T>
     EIGEN_DEVICE_FUNC
-    void checkSanity() const
+    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
-                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
-                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0)
-                   && "data is not aligned");
+#if EIGEN_MAX_ALIGN_BYTES>0
+      // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
+      const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
+      EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
+      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
+                    || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
+#endif
     }
 
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    void checkSanity(typename internal::enable_if<internal::traits<T>::Alignment==0,void*>::type = 0) const
+    {}
+
     PointerType m_data;
     const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
     const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
 };
 
+/** \ingroup Core_Module
+  *
+  * \brief Base class for non-const dense Map and Block expression with direct access
+  *
+  * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
+  * dense Map and Block objects with direct access.
+  * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
+  *
+  * \sa class Map, class Block
+  */
 template<typename Derived> class MapBase<Derived, WriteAccessors>
   : public MapBase<Derived, ReadOnlyAccessors>
 {
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
   public:
 
     typedef MapBase<Derived, ReadOnlyAccessors> Base;
 
     typedef typename Base::Scalar Scalar;
     typedef typename Base::PacketScalar PacketScalar;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
     typedef typename Base::PointerType PointerType;
 
     using Base::derived;
@@ -238,16 +286,21 @@
 
     EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
     EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
 
     EIGEN_DEVICE_FUNC
     Derived& operator=(const MapBase& other)
     {
-      Base::Base::operator=(other);
+      ReadOnlyMapBase::Base::operator=(other);
       return derived();
     }
 
-    using Base::Base::operator=;
+    // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
+    // see bugs 821 and 920.
+    using ReadOnlyMapBase::Base::operator=;
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 };
 
 #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 0731004..61b78f4 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,8 +11,11 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 
-// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
-#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406
+// TODO this should better be moved to NumTraits
+// Source: WolframAlpha
+#define EIGEN_PI    3.141592653589793238462643383279502884197169399375105820974944592307816406L
+#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
+#define EIGEN_LN2   0.693147180559945309417232121458176568075500134360255254120680009493393621L
 
 namespace Eigen {
 
@@ -26,7 +30,7 @@
 
 namespace internal {
 
-/** \internal \struct global_math_functions_filtering_base
+/** \internal \class global_math_functions_filtering_base
   *
   * What it does:
   * Defines a typedef 'type' as follows:
@@ -95,7 +99,7 @@
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct real_impl<std::complex<T> >
 {
@@ -143,7 +147,7 @@
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct imag_impl<std::complex<T> >
 {
@@ -211,12 +215,12 @@
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Scalar run(Scalar&)
   {
     return Scalar(0);
   }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline const Scalar run(const Scalar&)
   {
     return Scalar(0);
@@ -237,7 +241,7 @@
 ****************************************************************************/
 
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_impl
+struct conj_default_impl
 {
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
@@ -247,15 +251,19 @@
 };
 
 template<typename Scalar>
-struct conj_impl<std::complex<Scalar>,true>
+struct conj_default_impl<Scalar,true>
 {
   EIGEN_DEVICE_FUNC
-  static inline std::complex<Scalar> run(const std::complex<Scalar>& x)
+  static inline Scalar run(const Scalar& x)
   {
-    return std::complex<Scalar>(real(x), -imag(x));
+    using std::conj;
+    return conj(x);
   }
 };
 
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_impl : conj_default_impl<Scalar, IsComplex> {};
+
 template<typename Scalar>
 struct conj_retval
 {
@@ -266,8 +274,8 @@
 * Implementation of abs2                                                 *
 ****************************************************************************/
 
-template<typename Scalar>
-struct abs2_impl
+template<typename Scalar,bool IsComplex>
+struct abs2_impl_default
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC
@@ -277,13 +285,25 @@
   }
 };
 
-template<typename RealScalar>
-struct abs2_impl<std::complex<RealScalar> >
+template<typename Scalar>
+struct abs2_impl_default<Scalar, true> // IsComplex
 {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC
-  static inline RealScalar run(const std::complex<RealScalar>& x)
+  static inline RealScalar run(const Scalar& x)
   {
-    return real(x)*real(x) + imag(x)*imag(x);
+    return x.real()*x.real() + x.imag()*x.imag();
+  }
+};
+
+template<typename Scalar>
+struct abs2_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return abs2_impl_default<Scalar,NumTraits<Scalar>::IsComplex>::run(x);
   }
 };
 
@@ -294,18 +314,80 @@
 };
 
 /****************************************************************************
+* Implementation of sqrt/rsqrt                                             *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+// Complex sqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
+
+// Custom implementation is faster than `std::sqrt`, works on
+// GPU, and correctly handles special cases (unlike MSVC).
+template<typename T>
+struct sqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_sqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct sqrt_retval
+{
+  typedef Scalar type;
+};
+
+// Default implementation relies on numext::sqrt, at bottom of file.
+template<typename T>
+struct rsqrt_impl;
+
+// Complex rsqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& a_x);
+
+template<typename T>
+struct rsqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_rsqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct rsqrt_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
 
 template<typename Scalar, bool IsComplex>
-struct norm1_default_impl
+struct norm1_default_impl;
+
+template<typename Scalar>
+struct norm1_default_impl<Scalar,true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
-    using std::abs;
-    return abs(real(x)) + abs(imag(x));
+    EIGEN_USING_STD(abs);
+    return abs(x.real()) + abs(x.imag());
   }
 };
 
@@ -315,7 +397,7 @@
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
-    using std::abs;
+    EIGEN_USING_STD(abs);
     return abs(x);
   }
 };
@@ -333,31 +415,7 @@
 * Implementation of hypot                                                *
 ****************************************************************************/
 
-template<typename Scalar>
-struct hypot_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
-  {
-    using std::abs;
-    using std::sqrt;
-    RealScalar _x = abs(x);
-    RealScalar _y = abs(y);
-    Scalar p, qp;
-    if(_x>_y)
-    {
-      p = _x;
-      qp = _y / p;
-    }
-    else
-    {
-      p = _y;
-      qp = _x / p;
-    }
-    if(p==RealScalar(0)) return RealScalar(0);
-    return p * sqrt(RealScalar(1) + qp*qp);
-  }
-};
+template<typename Scalar> struct hypot_impl;
 
 template<typename Scalar>
 struct hypot_retval
@@ -369,94 +427,98 @@
 * Implementation of cast                                                 *
 ****************************************************************************/
 
-template<typename OldType, typename NewType>
+template<typename OldType, typename NewType, typename EnableIf = void>
 struct cast_impl
 {
-  EIGEN_DEVICE_FUNC static inline NewType run(const OldType& x)
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x)
   {
     return static_cast<NewType>(x);
   }
 };
 
+// Casting from S -> Complex<T> leads to an implicit conversion from S to T,
+// generating warnings on clang.  Here we explicitly cast the real component.
+template<typename OldType, typename NewType>
+struct cast_impl<OldType, NewType,
+  typename internal::enable_if<
+    !NumTraits<OldType>::IsComplex && NumTraits<NewType>::IsComplex
+  >::type>
+{
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x)
+  {
+    typedef typename NumTraits<NewType>::Real NewReal;
+    return static_cast<NewType>(static_cast<NewReal>(x));
+  }
+};
+
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 
 template<typename OldType, typename NewType>
-EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x)
+EIGEN_DEVICE_FUNC
+inline NewType cast(const OldType& x)
 {
   return cast_impl<OldType, NewType>::run(x);
 }
 
 /****************************************************************************
-* Implementation of atanh2                                                *
-****************************************************************************/
-
-template<typename Scalar>
-struct atanh2_impl
-{
-  static inline Scalar run(const Scalar& x, const Scalar& r)
-  {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    using std::abs;
-    using std::log;
-    using std::sqrt;
-    Scalar z = x / r;
-    if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
-      return log((r + x) / (r - x)) / 2;
-    else
-      return z + z*z*z / 3;
-  }
-};
-
-template<typename RealScalar>
-struct atanh2_impl<std::complex<RealScalar> >
-{
-  typedef std::complex<RealScalar> Scalar;
-  static inline Scalar run(const Scalar& x, const Scalar& r)
-  {
-    using std::log;
-    using std::norm;
-    using std::sqrt;
-    Scalar z = x / r;
-    if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
-      return RealScalar(0.5) * log((r + x) / (r - x));
-    else
-      return z + z*z*z / RealScalar(3);
-  }
-};
-
-template<typename Scalar>
-struct atanh2_retval
-{
-  typedef Scalar type;
-};
-
-/****************************************************************************
 * Implementation of round                                                   *
 ****************************************************************************/
 
-#if EIGEN_HAS_CXX11_MATH
-  template<typename Scalar>
-  struct round_impl {
-    static inline Scalar run(const Scalar& x)
-    {
-      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      using std::round;
-      return round(x);
-    }
-  };
-#else
-  template<typename Scalar>
-  struct round_impl
+template<typename Scalar>
+struct round_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
   {
-    static inline Scalar run(const Scalar& x)
-    {
-      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      using std::floor;
-      using std::ceil;
-      return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5);
-    }
-  };
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_USING_STD(round);
 #endif
+    return Scalar(round(x));
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+#if EIGEN_HAS_C99_MATH
+// Use ::roundf for float.
+template<>
+struct round_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::roundf(x);
+  }
+};
+#else
+template<typename Scalar>
+struct round_using_floor_ceil_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+    // Without C99 round/roundf, resort to floor/ceil.
+    EIGEN_USING_STD(floor);
+    EIGEN_USING_STD(ceil);
+    // If not enough precision to resolve a decimal at all, return the input.
+    // Otherwise, adding 0.5 can trigger an increment by 1.
+    const Scalar limit = Scalar(1ull << (NumTraits<Scalar>::digits() - 1));
+    if (x >= limit || x <= -limit) {
+      return x;
+    }
+    return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5)));
+  }
+};
+
+template<>
+struct round_impl<float> : round_using_floor_ceil_impl<float> {};
+
+template<>
+struct round_impl<double> : round_using_floor_ceil_impl<double> {};
+#endif // EIGEN_HAS_C99_MATH
+#endif // !EIGEN_HAS_CXX11_MATH
 
 template<typename Scalar>
 struct round_retval
@@ -465,43 +527,112 @@
 };
 
 /****************************************************************************
+* Implementation of rint                                                    *
+****************************************************************************/
+
+template<typename Scalar>
+struct rint_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+#if EIGEN_HAS_CXX11_MATH
+      EIGEN_USING_STD(rint);
+#endif
+    return rint(x);
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+template<>
+struct rint_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static inline double run(const double& x)
+  {
+    return ::rint(x);
+  }
+};
+template<>
+struct rint_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::rintf(x);
+  }
+};
+#endif
+
+template<typename Scalar>
+struct rint_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
 * Implementation of arg                                                     *
 ****************************************************************************/
 
-#if EIGEN_HAS_CXX11_MATH
-  template<typename Scalar>
-  struct arg_impl {
-    static inline Scalar run(const Scalar& x)
-    {
-      using std::arg;
-      return arg(x);
-    }
-  };
+// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs.
+// This seems to be fixed in VS 2019.
+#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920)
+// std::arg is only defined for types of std::complex, or integer types or float/double/long double
+template<typename Scalar,
+          bool HasStdImpl = NumTraits<Scalar>::IsComplex || is_integral<Scalar>::value
+                            || is_same<Scalar, float>::value || is_same<Scalar, double>::value
+                            || is_same<Scalar, long double>::value >
+struct arg_default_impl;
+
+template<typename Scalar>
+struct arg_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    // HIP does not seem to have a native device side implementation for the math routine "arg"
+    using std::arg;
+    #else
+    EIGEN_USING_STD(arg);
+    #endif
+    return static_cast<RealScalar>(arg(x));
+  }
+};
+
+// Must be non-complex floating-point type (e.g. half/bfloat16).
+template<typename Scalar>
+struct arg_default_impl<Scalar, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
 #else
-  template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-  struct arg_default_impl
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct arg_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_DEVICE_FUNC
-    static inline RealScalar run(const Scalar& x)
-    {
-      return (x < 0.0) ? EIGEN_PI : 0.0; }
-  };
+    return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
 
-  template<typename Scalar>
-  struct arg_default_impl<Scalar,true>
+template<typename Scalar>
+struct arg_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_DEVICE_FUNC
-    static inline RealScalar run(const Scalar& x)
-    {
-      using std::arg;
-      return arg(x);
-    }
-  };
-
-  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+    EIGEN_USING_STD(arg);
+    return arg(x);
+  }
+};
 #endif
+template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
 
 template<typename Scalar>
 struct arg_retval
@@ -509,86 +640,124 @@
   typedef typename NumTraits<Scalar>::Real type;
 };
 
- /****************************************************************************
- * Implementation of expm1
- *****************************************************************************/
+/****************************************************************************
+* Implementation of expm1                                                   *
+****************************************************************************/
 
 // This implementation is based on GSL Math's expm1.
 namespace std_fallback {
-// fallback expm1 implementation in case there is no expm1(Scalar) function in
-// namespace of Scalar,
-// or that there is no suitable std::expm1 function available.  Implementation
-// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
-  template <typename Scalar>
+  // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::expm1 function available. Implementation
+  // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+  template<typename Scalar>
   EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    EIGEN_USING_STD_MATH(exp);
+    EIGEN_USING_STD(exp);
     Scalar u = exp(x);
-    EIGEN_DISABLE_FLOAT_EQUALITY_WARNING
-    if (u == Scalar(1)) {
+    if (numext::equal_strict(u, Scalar(1))) {
       return x;
     }
     Scalar um1 = u - RealScalar(1);
-    if (um1 == Scalar(-1)) {
+    if (numext::equal_strict(um1, Scalar(-1))) {
       return RealScalar(-1);
     }
-    EIGEN_ENABLE_FLOAT_EQUALITY_WARNING
 
-    EIGEN_USING_STD_MATH(log);
-    return (u - RealScalar(1)) * x / log(u);
+    EIGEN_USING_STD(log);
+    Scalar logu = log(u);
+    return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
   }
 }
 
-template <typename Scalar>
+template<typename Scalar>
 struct expm1_impl {
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x) {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     #if EIGEN_HAS_CXX11_MATH
     using std::expm1;
-    #endif
+    #else
     using std_fallback::expm1;
+    #endif
     return expm1(x);
   }
 };
 
-template <typename Scalar>
-struct expm1_retval {
+template<typename Scalar>
+struct expm1_retval
+{
   typedef Scalar type;
 };
 
 /****************************************************************************
+* Implementation of log                                                     *
+****************************************************************************/
+
+// Complex log defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z);
+
+template<typename Scalar>
+struct log_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(log);
+    return static_cast<Scalar>(log(x));
+  }
+};
+
+template<typename Scalar>
+struct log_impl<std::complex<Scalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<Scalar> run(const std::complex<Scalar>& z)
+  {
+    return complex_log(z);
+  }
+};
+
+/****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
+
 namespace std_fallback {
-// fallback log1p implementation in case there is no log1p(Scalar) function in
-// namespace of Scalar,
-// or that there is no suitable std::log1p function available
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
-  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_USING_STD_MATH(log);
-  Scalar x1p = RealScalar(1) + x;
-  EIGEN_DISABLE_FLOAT_EQUALITY_WARNING
-  return (x1p == Scalar(1)) ? x : x * (log(x1p) / (x1p - RealScalar(1)));
-  EIGEN_ENABLE_FLOAT_EQUALITY_WARNING
-}
+  // fallback log1p implementation in case there is no log1p(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::log1p function available
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_USING_STD(log);
+    Scalar x1p = RealScalar(1) + x;
+    Scalar log_1p = log_impl<Scalar>::run(x1p);
+    const bool is_small = numext::equal_strict(x1p, Scalar(1));
+    const bool is_inf = numext::equal_strict(x1p, log_1p);
+    return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
+  }
 }
 
-template <typename Scalar>
+template<typename Scalar>
 struct log1p_impl {
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x) {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    #if EIGEN_HAS_CX11_MATH
+    #if EIGEN_HAS_CXX11_MATH
     using std::log1p;
-    #endif
+    #else
     using std_fallback::log1p;
+    #endif
     return log1p(x);
   }
 };
 
+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::log1p(x);
+  }
+};
+
 template<typename Scalar>
 struct log1p_retval
 {
@@ -599,27 +768,26 @@
 * Implementation of pow                                                  *
 ****************************************************************************/
 
-template<typename Scalar, bool IsInteger>
-struct pow_default_impl
+template<typename ScalarX,typename ScalarY, bool IsInteger = NumTraits<ScalarX>::IsInteger&&NumTraits<ScalarY>::IsInteger>
+struct pow_impl
 {
-  typedef Scalar retval;
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y)
+  //typedef Scalar retval;
+  typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
+  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
   {
-    using std::pow;
+    EIGEN_USING_STD(pow);
     return pow(x, y);
   }
 };
 
-template<typename Scalar>
-struct pow_default_impl<Scalar, true>
+template<typename ScalarX,typename ScalarY>
+struct pow_impl<ScalarX,ScalarY, true>
 {
-  static EIGEN_DEVICE_FUNC inline Scalar run(Scalar x, Scalar y)
+  typedef ScalarX result_type;
+  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y)
   {
-    if (NumTraits<Scalar>::IsSigned && y < 0) {
-      using std::pow;
-      return pow(x, y);
-    }
-    Scalar res(1);
+    ScalarX res(1);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
     if(y & 1) res *= x;
     y >>= 1;
     while(y)
@@ -632,15 +800,6 @@
   }
 };
 
-template<typename Scalar>
-struct pow_impl : pow_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
-
-template<typename Scalar>
-struct pow_retval
-{
-  typedef Scalar type;
-};
-
 /****************************************************************************
 * Implementation of random                                               *
 ****************************************************************************/
@@ -727,19 +886,28 @@
 {
   static inline Scalar run(const Scalar& x, const Scalar& y)
   {
-    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
-    if(y<x)
+    if (y <= x)
       return x;
-    std::size_t range = ScalarX(y)-ScalarX(x);
-    std::size_t offset = 0;
-    // rejection sampling
-    std::size_t divisor    = (range+RAND_MAX-1)/(range+1);
-    std::size_t multiplier = (range+RAND_MAX-1)/std::size_t(RAND_MAX);
-
+    // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
+    typedef typename make_unsigned<Scalar>::type ScalarU;
+    // ScalarX is the widest of ScalarU and unsigned int.
+    // We'll deal only with ScalarX and unsigned int below thus avoiding signed
+    // types and arithmetic and signed overflows (which are undefined behavior).
+    typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
+    // The following difference doesn't overflow, provided our integer types are two's
+    // complement and have the same number of padding bits in signed and unsigned variants.
+    // This is the case in most modern implementations of C++.
+    ScalarX range = ScalarX(y) - ScalarX(x);
+    ScalarX offset = 0;
+    ScalarX divisor = 1;
+    ScalarX multiplier = 1;
+    const unsigned rand_max = RAND_MAX;
+    if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
+    else                   multiplier = 1 + range / (rand_max + 1);
+    // Rejection sampling.
     do {
-      offset = ( (std::size_t(std::rand()) * multiplier) / divisor );
+      offset = (unsigned(std::rand()) * multiplier) / divisor;
     } while (offset > range);
-
     return Scalar(ScalarX(x) + offset);
   }
 
@@ -763,8 +931,8 @@
 {
   static inline Scalar run(const Scalar& x, const Scalar& y)
   {
-    return Scalar(random(real(x), real(y)),
-                  random(imag(x), imag(y)));
+    return Scalar(random(x.real(), y.real()),
+                  random(x.imag(), y.imag()));
   }
   static inline Scalar run()
   {
@@ -785,6 +953,122 @@
   return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
 }
 
+// Implementation of is* functions
+
+// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
+#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
+#define EIGEN_USE_STD_FPCLASSIFY 1
+#else
+#define EIGEN_USE_STD_FPCLASSIFY 0
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isnan_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isinf_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isfinite_impl(const T&) { return true; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isfinite_impl(const T& x)
+{
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+    return (::isfinite)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isfinite;
+    return isfinite EIGEN_NOT_A_MACRO (x);
+  #else
+    return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isinf_impl(const T& x)
+{
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+    return (::isinf)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isinf;
+    return isinf EIGEN_NOT_A_MACRO (x);
+  #else
+    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isnan_impl(const T& x)
+{
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+    return (::isnan)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isnan;
+    return isnan EIGEN_NOT_A_MACRO (x);
+  #else
+    return x != x;
+  #endif
+}
+
+#if (!EIGEN_USE_STD_FPCLASSIFY)
+
+#if EIGEN_COMP_MSVC
+
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
+{
+  return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF;
+}
+
+//MSVC defines a _isnan builtin function, but for double only
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
+
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
+
+#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
+
+#if EIGEN_GNUC_AT_LEAST(5,0)
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
+#else
+  // NOTE the inline qualifier and noinline attribute are both needed: the former is to avoid linking issue (duplicate symbol),
+  //      while the second prevent too aggressive optimizations in fast-math mode:
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
+#endif
+
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x)      { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x)       { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x)      { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x)       { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+
+#undef EIGEN_TMP_NOOPT_ATTRIB
+
+#endif
+
+#endif
+
+// The following overload are defined at the end of this file
+template<typename T> EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
+template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
+
+template<typename T> T generic_fast_tanh_float(const T& a_x);
 } // end namespace internal
 
 /****************************************************************************
@@ -793,12 +1077,12 @@
 
 namespace numext {
 
-#ifndef __CUDA_ARCH__
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-  EIGEN_USING_STD_MATH(min);
+  EIGEN_USING_STD(min)
   return min EIGEN_NOT_A_MACRO (x,y);
 }
 
@@ -806,7 +1090,7 @@
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-  EIGEN_USING_STD_MATH(max);
+  EIGEN_USING_STD(max)
   return max EIGEN_NOT_A_MACRO (x,y);
 }
 #else
@@ -820,8 +1104,26 @@
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
+  return fminf(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
   return fmin(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+  // no "fminl" on HIP yet
+  return (x < y) ? x : y;
+#else
+  return fminl(x, y);
+#endif
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -832,9 +1134,96 @@
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
+  return fmaxf(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
   return fmax(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+  // no "fmaxl" on HIP yet
+  return (x > y) ? x : y;
+#else
+  return fmaxl(x, y);
 #endif
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+
+
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
+
+#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+template<>                                               \
+  EIGEN_DEVICE_FUNC                                      \
+  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
+    return cl::sycl::FUNC(x);                            \
+  }
+
+#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
+  template<>                                                                  \
+  EIGEN_DEVICE_FUNC                                                           \
+  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
+    return cl::sycl::FUNC(x, y);                                              \
+  }
+
+#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
+
+#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
+
+#endif
+
 
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
@@ -899,6 +1288,37 @@
   return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }
 
+EIGEN_DEVICE_FUNC
+inline bool abs2(bool x) { return x; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y)
+{
+  return x > y ? x - y : y - x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y)
+{
+  return fabsf(x - y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
+{
+  return fabs(x - y);
+}
+
+#if !defined(EIGEN_GPUCC)
+// HIP and CUDA do not support long double.
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
+  return fabsl(x - y);
+}
+#endif
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -913,6 +1333,10 @@
   return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }
 
+#if defined(SYCL_DEVICE_ONLY)
+  SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
+#endif
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@@ -920,93 +1344,44 @@
   return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
 
-#ifdef __CUDACC__
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float& x) {
-  return ::log1pf(x);
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log1p(const float &x) { return ::log1pf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log1p(const double &x) { return ::log1p(x); }
+#endif
+
+template<typename ScalarX,typename ScalarY>
+EIGEN_DEVICE_FUNC
+inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const ScalarX& x, const ScalarY& y)
+{
+  return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
 }
 
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log1p(const double& x) {
-  return ::log1p(x);
-}
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
+#endif
+
+template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return internal::isinf_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
 #endif
 
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
+inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x)
 {
-  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
-}
-
-template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
-{
-  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isfinite)(const T& x)
-{
-  #ifdef __CUDA_ARCH__
-    return (::isfinite)(x);
-  #elif EIGEN_HAS_CXX11_MATH
-    using std::isfinite;
-    return isfinite EIGEN_NOT_A_MACRO(x);
-  #else
-    return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest();
-  #endif
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isfinite)(const std::complex<T>& x)
-{
-  return numext::isfinite(numext::real(x)) && numext::isfinite(numext::imag(x));
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isnan)(const T& x)
-{
-  #ifdef __CUDA_ARCH__
-    return (::isnan)(x);
-  #elif EIGEN_HAS_CXX11_MATH
-    using std::isnan;
-    return isnan EIGEN_NOT_A_MACRO(x);
-  #else
-    return x != x;
-  #endif
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isnan)(const std::complex<T>& x)
-{
-  return numext::isnan(numext::real(x)) || numext::isnan(numext::imag(x));
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isinf)(const T& x)
-{
-  #ifdef __CUDA_ARCH__
-    return (::isinf)(x);
-  #elif EIGEN_HAS_CXX11_MATH
-    using std::isinf;
-    return isinf EIGEN_NOT_A_MACRO(x);
-  #else
-    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
-  #endif
-}
-
-template<typename T>
-EIGEN_DEVICE_FUNC
-bool (isinf)(const std::complex<T>& x)
-{
-  return (numext::isinf(numext::real(x)) || numext::isinf(numext::imag(x))) && (!numext::isnan(x));
+  return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
 }
 
 template<typename Scalar>
@@ -1016,15 +1391,23 @@
   return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
 }
 
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
+#endif
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 T (floor)(const T& x)
 {
-  using std::floor;
+  EIGEN_USING_STD(floor)
   return floor(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }
 
@@ -1036,11 +1419,15 @@
 EIGEN_DEVICE_FUNC
 T (ceil)(const T& x)
 {
-  using std::ceil;
+  EIGEN_USING_STD(ceil);
   return ceil(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }
 
@@ -1048,6 +1435,7 @@
 double ceil(const double &x) { return ::ceil(x); }
 #endif
 
+
 /** Log base 2 for 32 bits positive integers.
   * Conveniently returns 0 for x==0. */
 inline int log2(int x)
@@ -1065,28 +1453,49 @@
 
 /** \returns the square root of \a x.
   *
-  * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode,
+  * It is essentially equivalent to
+  * \code using std::sqrt; return sqrt(x); \endcode
   * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
   * specializations when SSE is enabled.
   *
   * It's usage is justified in performance critical functions, like norm/normalize.
   */
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
+}
+
+// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
+template<>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+bool sqrt<bool>(const bool &x) { return x; }
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
+#endif
+
+/** \returns the reciprocal square root of \a x. **/
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T sqrt(const T &x)
+T rsqrt(const T& x)
 {
-  using std::sqrt;
-  return sqrt(x);
+  return internal::rsqrt_impl<T>::run(x);
 }
 
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T log(const T &x) {
-  EIGEN_USING_STD_MATH(log);
-  return log(x);
+  return internal::log_impl<T>::run(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
+#endif
+
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }
 
@@ -1096,26 +1505,37 @@
 
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-typename NumTraits<T>::Real abs(const T &x) {
-  EIGEN_USING_STD_MATH(abs);
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  EIGEN_USING_STD(abs);
   return abs(x);
 }
 
-#ifdef __CUDACC__
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  return x;
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const double &x) { return ::fabs(x); }
 
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
   return ::hypotf(x.real(), x.imag());
 }
 
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
   return ::hypot(x.real(), x.imag());
 }
@@ -1124,16 +1544,36 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T exp(const T &x) {
-  EIGEN_USING_STD_MATH(exp);
+  EIGEN_USING_STD(exp);
   return exp(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double exp(const double &x) { return ::exp(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<float> exp(const std::complex<float>& x) {
+  float com = ::expf(x.real());
+  float res_real = com * ::cosf(x.imag());
+  float res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<double> exp(const std::complex<double>& x) {
+  double com = ::exp(x.real());
+  double res_real = com * ::cos(x.imag());
+  double res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
 #endif
 
 template<typename Scalar>
@@ -1143,26 +1583,30 @@
   return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
 }
 
-#ifdef __CUDACC__
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float& x) {
-  return ::expm1f(x);
-}
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
+#endif
 
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double expm1(const double& x) {
-  return ::expm1(x);
-}
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float expm1(const float &x) { return ::expm1f(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double expm1(const double &x) { return ::expm1(x); }
 #endif
 
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cos(const T &x) {
-  EIGEN_USING_STD_MATH(cos);
+  EIGEN_USING_STD(cos);
   return cos(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }
 
@@ -1173,11 +1617,15 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sin(const T &x) {
-  EIGEN_USING_STD_MATH(sin);
+  EIGEN_USING_STD(sin);
   return sin(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }
 
@@ -1188,11 +1636,15 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T tan(const T &x) {
-  EIGEN_USING_STD_MATH(tan);
+  EIGEN_USING_STD(tan);
   return tan(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }
 
@@ -1203,11 +1655,25 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T acos(const T &x) {
-  EIGEN_USING_STD_MATH(acos);
+  EIGEN_USING_STD(acos);
   return acos(x);
 }
 
-#ifdef __CUDACC__
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acosh(const T &x) {
+  EIGEN_USING_STD(acosh);
+  return static_cast<T>(acosh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }
 
@@ -1218,11 +1684,25 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T asin(const T &x) {
-  EIGEN_USING_STD_MATH(asin);
+  EIGEN_USING_STD(asin);
   return asin(x);
 }
 
-#ifdef __CUDACC__
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asinh(const T &x) {
+  EIGEN_USING_STD(asinh);
+  return static_cast<T>(asinh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }
 
@@ -1233,11 +1713,25 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atan(const T &x) {
-  EIGEN_USING_STD_MATH(atan);
-  return atan(x);
+  EIGEN_USING_STD(atan);
+  return static_cast<T>(atan(x));
 }
 
-#ifdef __CUDACC__
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atanh(const T &x) {
+  EIGEN_USING_STD(atanh);
+  return static_cast<T>(atanh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }
 
@@ -1249,11 +1743,15 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cosh(const T &x) {
-  EIGEN_USING_STD_MATH(cosh);
-  return cosh(x);
+  EIGEN_USING_STD(cosh);
+  return static_cast<T>(cosh(x));
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }
 
@@ -1264,11 +1762,15 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sinh(const T &x) {
-  EIGEN_USING_STD_MATH(sinh);
-  return sinh(x);
+  EIGEN_USING_STD(sinh);
+  return static_cast<T>(sinh(x));
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }
 
@@ -1279,11 +1781,20 @@
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T tanh(const T &x) {
-  EIGEN_USING_STD_MATH(tanh);
+  EIGEN_USING_STD(tanh);
   return tanh(x);
 }
 
-#ifdef __CUDACC__
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(float x) { return internal::generic_fast_tanh_float(x); }
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
 
@@ -1294,11 +1805,15 @@
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T fmod(const T& a, const T& b) {
-  EIGEN_USING_STD_MATH(fmod);
+  EIGEN_USING_STD(fmod);
   return fmod(a, b);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
@@ -1312,10 +1827,45 @@
 }
 #endif
 
+#if defined(SYCL_DEVICE_ONLY)
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
+#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
+#undef SYCL_SPECIALIZE_UNARY_FUNC
+#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
+#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
+#undef SYCL_SPECIALIZE_BINARY_FUNC
+#endif
+
 } // end namespace numext
 
 namespace internal {
 
+template<typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x)
+{
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
+{
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x)
+{
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
+
 /****************************************************************************
 * Implementation of fuzzy comparisons                                       *
 ****************************************************************************/
@@ -1371,11 +1921,12 @@
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
@@ -1387,21 +1938,21 @@
 
 template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
 inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                              const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
 }
 
 template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                     const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
 }
 
 template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+                               const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
 }
@@ -1416,6 +1967,11 @@
   {
     return random<int>(0,1)==0 ? false : true;
   }
+
+  static inline bool run(const bool& a, const bool& b)
+  {
+    return random<int>(a, b)==0 ? false : true;
+  }
 };
 
 template<> struct scalar_fuzzy_impl<bool>
@@ -1442,6 +1998,57 @@
 
 };
 
+} // end namespace internal
+
+// Default implementations that rely on other numext implementations
+namespace internal {
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    RealScalar xr = x.real();
+    RealScalar xi = x.imag();
+    // expm1(z) = exp(z) - 1
+    //          = exp(x +  i * y) - 1
+    //          = exp(x) * (cos(y) + i * sin(y)) - 1
+    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
+    // Imag(expm1(z)) = exp(x) * sin(y)
+    // Real(expm1(z)) = exp(x) * cos(y) - 1
+    //          = exp(x) * cos(y) - 1.
+    //          = expm1(x) + exp(x) * (cos(y) - 1)
+    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
+    RealScalar erm1 = numext::expm1<RealScalar>(xr);
+    RealScalar er = erm1 + RealScalar(1.);
+    RealScalar sin2 = numext::sin(xi / RealScalar(2.));
+    sin2 = sin2 * sin2;
+    RealScalar s = numext::sin(xi);
+    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
+    return std::complex<RealScalar>(real_part, er * s);
+  }
+};
+
+template<typename T>
+struct rsqrt_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE T run(const T& x) {
+    return T(1)/numext::sqrt(x);
+  }
+};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T>, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x)
+  {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
new file mode 100644
index 0000000..4eaaaa7
--- /dev/null
+++ b/Eigen/src/Core/MathFunctionsImpl.h

@@ -0,0 +1,200 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONSIMPL_H
+#define EIGEN_MATHFUNCTIONSIMPL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
+    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
+
+    This implementation works on both scalars and packets.
+*/
+template<typename T>
+T generic_fast_tanh_float(const T& a_x)
+{
+  // Clamp the inputs to the range [-c, c]
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(7.99881172180175781f);
+  const T minus_clamp = pset1<T>(-7.99881172180175781f);
+#else
+  const T plus_clamp = pset1<T>(7.90531110763549805f);
+  const T minus_clamp = pset1<T>(-7.90531110763549805f);
+#endif
+  const T tiny = pset1<T>(0.0004f);
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
+  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
+  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
+  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
+  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
+  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
+  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(4.89352518554385e-03f);
+  const T beta_2 = pset1<T>(2.26843463243900e-03f);
+  const T beta_4 = pset1<T>(1.18534705686654e-04f);
+  const T beta_6 = pset1<T>(1.19825839466702e-06f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  T q = pmadd(x2, beta_6, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pselect(tiny_mask, x, pdiv(p, q));
+}
+
+template<typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+  // IEEE IEC 6059 special cases.
+  if ((numext::isinf)(x) || (numext::isinf)(y))
+    return NumTraits<RealScalar>::infinity();
+  if ((numext::isnan)(x) || (numext::isnan)(y))
+    return NumTraits<RealScalar>::quiet_NaN();
+    
+  EIGEN_USING_STD(sqrt);
+  RealScalar p, qp;
+  p = numext::maxi(x,y);
+  if(p==RealScalar(0)) return RealScalar(0);
+  qp = numext::mini(y,x) / p;
+  return p * sqrt(RealScalar(1) + qp*qp);
+}
+
+template<typename Scalar>
+struct hypot_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
+  {
+    EIGEN_USING_STD(abs);
+    return positive_real_hypot<RealScalar>(abs(x), abs(y));
+  }
+};
+
+// Generic complex sqrt implementation that correctly handles corner cases
+// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
+  // Computes the principal sqrt of the input.
+  //
+  // For a complex square root of the number x + i*y. We want to find real
+  // numbers u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = y / (2 * u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = y / (2 * v)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w, v = sign(y) * w
+  //   if x > 0:  u = w, v = y / (2 * w)
+  //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
+
+  return
+    (numext::isinf)(y) ? std::complex<T>(NumTraits<T>::infinity(), y)
+      : x == zero ? std::complex<T>(w, y < zero ? -w : w)
+      : x > zero ? std::complex<T>(w, y / (2 * w))
+      : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
+}
+
+// Generic complex rsqrt implementation.
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
+  // Computes the principal reciprocal sqrt of the input.
+  //
+  // For a complex reciprocal square root of the number z = x + i*y. We want to
+  // find real numbers u and v such that
+  //    (u + i*v)^2 = 1 / (x + i*y)  <=>
+  //    u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x/|z|^2
+  //    2*u*v = y/|z|^2.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + |z|)) / |z|
+  //    v = -y / (2 * u * |z|)
+  // and for x < 0,
+  //    v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z|
+  //    u = -y / (2 * v * |z|)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
+  //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
+  //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+
+  const T abs_z = numext::hypot(x, y);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
+  const T woz = w / abs_z;
+  // Corner cases consistent with 1/sqrt(z) on gcc/clang.
+  return
+    abs_z == zero ? std::complex<T>(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
+      : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex<T>(zero, zero)
+      : x == zero ? std::complex<T>(woz, y < zero ? woz : -woz)
+      : x > zero ? std::complex<T>(woz, -y / (2 * w * abs_z))
+      : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
+  // Computes complex log.
+  T a = numext::abs(z);
+  EIGEN_USING_STD(atan2);
+  T b = atan2(z.imag(), z.real());
+  return std::complex<T>(numext::log(a), b);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONSIMPL_H

diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 782d67f..f0e59a9 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h

@@ -13,6 +13,45 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+private:
+  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
+  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = unpacket_traits<PacketScalar>::alignment,
+      packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
+    };
+
+public:
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef Eigen::Index StorageIndex;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _MaxRows,
+    MaxColsAtCompileTime = _MaxCols,
+    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
+    Options = _Options,
+    InnerStrideAtCompileTime = 1,
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+
+    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
+  };
+};
+}
+
 /** \class Matrix
   * \ingroup Core_Module
   *
@@ -24,13 +63,13 @@
   * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
   *
   * The first three template parameters are required:
-  * \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
-  *                 User defined sclar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam _Scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
   * \tparam _Rows Number of rows, or \b Dynamic
   * \tparam _Cols Number of columns, or \b Dynamic
   *
   * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam _Options \anchor matrix_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of either
   *                 \b #AutoAlign or \b #DontAlign.
   *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
   *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
@@ -67,7 +106,7 @@
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
   *
   * <i><b>Some notes:</b></i>
   *
@@ -97,32 +136,44 @@
   * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
   * </dl>
   *
-  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
-  * \ref TopicStorageOrders 
+  * <i><b>ABI and storage layout</b></i>
+  *
+  * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3.
+  * <table  class="manual">
+  * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
+  * <tr><td>\code Matrix<T,Dynamic,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code
+  * Matrix<T,Dynamic,1>
+  * Matrix<T,1,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index size;
+  *  };
+  * \endcode</td></tr>
+  * <tr><td>\code Matrix<T,Rows,Cols> \endcode</td><td>\code
+  * struct {
+  *   T data[Rows*Cols];        // with (size_t(data)%A(Rows*Cols*sizeof(T)))==0
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code Matrix<T,Dynamic,Dynamic,0,MaxRows,MaxCols> \endcode</td><td>\code
+  * struct {
+  *   T data[MaxRows*MaxCols];  // with (size_t(data)%A(MaxRows*MaxCols*sizeof(T)))==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * </table>
+  * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two
+  * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES.
+  *
+  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
+  * \ref TopicStorageOrders
   */
 
-namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef _Scalar Scalar;
-  typedef Dense StorageKind;
-  typedef DenseIndex Index;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Rows,
-    ColsAtCompileTime = _Cols,
-    MaxRowsAtCompileTime = _MaxRows,
-    MaxColsAtCompileTime = _MaxCols,
-    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    Options = _Options,
-    InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
-  };
-};
-}
-
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 class Matrix
   : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
@@ -169,7 +220,7 @@
       */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
+    EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other)
     {
       return Base::_set(other);
     }
@@ -204,78 +255,136 @@
       *
       * \sa resize(Index,Index)
       */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix() : Base()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix() : Base()
     {
       Base::_check_template_params();
       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
     // FIXME is it still needed
-    EIGEN_DEVICE_FUNC
-    Matrix(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit Matrix(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    Matrix(Matrix&& other)
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
     }
-    Matrix& operator=(Matrix&& other)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
-      other.swap(*this);
+      Base::operator=(std::move(other));
       return *this;
     }
 #endif
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
+#if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
+     *
+     * Example: \include Matrix_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+     *
+     * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      *
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      *
+      * Example: \include Matrix_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+      *
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      *
+      * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+      * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+      *
+      * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+      *
+      * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+      * and implicit transposition is allowed for compile-time vectors only.
+      *
+      * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+#endif // end EIGEN_HAS_CXX11
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 
     // This constructor is for both 1x1 matrices and dynamic vectors
     template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit Matrix(const T& x)
     {
       Base::_check_template_params();
       Base::template _init1<T>(x);
     }
 
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const T0& x, const T1& y)
     {
       Base::_check_template_params();
       Base::template _init2<T0,T1>(x, y);
     }
-    #else
+
+
+#else
     /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC
     explicit Matrix(const Scalar *data);
 
     /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
       *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
+      * This is useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead.
+      *
+      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
+      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
+      * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
       */
     EIGEN_STRONG_INLINE explicit Matrix(Index dim);
-    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
     Matrix(const Scalar& x);
     /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
       *
       * This is useful for dynamic-size matrices. For fixed-size matrices,
       * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Matrix() instead.
+      *
+      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
+      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
+      * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
     EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
-    /** \brief Constructs an initialized 2D vector with given coefficients */
-    Matrix(const Scalar& x, const Scalar& y);
-    #endif
 
-    /** \brief Constructs an initialized 3D vector with given coefficients */
+    /** \brief Constructs an initialized 2D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+    Matrix(const Scalar& x, const Scalar& y);
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
+
+    /** \brief Constructs an initialized 3D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
     {
@@ -285,7 +394,9 @@
       m_storage.data()[1] = y;
       m_storage.data()[2] = z;
     }
-    /** \brief Constructs an initialized 4D vector with given coefficients */
+    /** \brief Constructs an initialized 4D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
     {
@@ -298,37 +409,10 @@
     }
 
 
-    /** \brief Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      // This test resides here, to bring the error messages closer to the user. Normally, these checks
-      // are performed deeply within the library, thus causing long and scary error traces.
-      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
     /** \brief Copy constructor */
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** \brief Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+    EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other)
+    { }
 
     /** \brief Copy constructor for generic expressions.
       * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
@@ -336,26 +420,13 @@
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
-      //              go for pure _set() implementations, right?
-      *this = other;
-    }
+      : Base(other.derived())
+    { }
 
-    /** \internal
-      * \brief Override MatrixBase::swap() since for dynamic-sized matrices
-      * of same type it is enough to swap the data pointers.
-      */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void swap(MatrixBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
-
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
 
     /////////// Geometry module ///////////
 
@@ -366,13 +437,6 @@
     EIGEN_DEVICE_FUNC
     Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    template<typename OtherDerived>
-    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    #endif
-
     // allow to extend Matrix outside Eigen
     #ifdef EIGEN_MATRIX_PLUGIN
     #include EIGEN_MATRIX_PLUGIN
@@ -389,7 +453,7 @@
   *
   * \ingroup Core_Module
   *
-  * Eigen defines several typedef shortcuts for most common matrix and vector types.
+  * %Eigen defines several typedef shortcuts for most common matrix and vector types.
   *
   * The general patterns are the following:
   *
@@ -402,6 +466,15 @@
   * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
   * a fixed-size vector of 4 complex floats.
   *
+  * With \cpp11, template alias are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
+  *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
+  *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
+  *
+  * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
+  *
   * \sa class Matrix
   */
 
@@ -438,6 +511,55 @@
 #undef EIGEN_MAKE_TYPEDEFS
 #undef EIGEN_MAKE_FIXED_TYPEDEFS
 
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                     \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##SizeSuffix = Matrix<Type, Size, Size>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Vector##SizeSuffix = Matrix<Type, Size, 1>;                 \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                           \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##Size##X = Matrix<Type, Size, Dynamic>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
+
+EIGEN_MAKE_TYPEDEFS(2, 2)
+EIGEN_MAKE_TYPEDEFS(3, 3)
+EIGEN_MAKE_TYPEDEFS(4, 4)
+EIGEN_MAKE_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_FIXED_TYPEDEFS(4)
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using Vector = Matrix<Type, Size, 1>;
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using RowVector = Matrix<Type, 1, Size>;
+
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
+
 } // end namespace Eigen
 
 #endif // EIGEN_MATRIX_H

diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index fa05b07..45c3a59 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h

@@ -41,9 +41,9 @@
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> class MatrixBase
   : public DenseBase<Derived>
@@ -52,7 +52,7 @@
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef MatrixBase StorageBaseType;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -66,7 +66,6 @@
     using Base::MaxSizeAtCompileTime;
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
-    using Base::CoeffReadCost;
 
     using Base::derived;
     using Base::const_cast_derived;
@@ -77,6 +76,7 @@
     using Base::coeffRef;
     using Base::lazyAssign;
     using Base::eval;
+    using Base::operator-;
     using Base::operator+=;
     using Base::operator-=;
     using Base::operator*=;
@@ -99,25 +99,13 @@
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
 
-    /** \brief The plain matrix type corresponding to this expression.
-      *
-      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
-      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
-      * that the return type of eval() is either PlainObject or const PlainObject&.
-      */
-    typedef Matrix<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
+    typedef typename Base::PlainObject PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal the return type of MatrixBase::adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                         CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
@@ -126,7 +114,7 @@
     /** \internal Return type of eigenvalues() */
     typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
     /** \internal the return type of identity */
-    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,PlainObject> IdentityReturnType;
     /** \internal the return type of unit vectors */
     typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
                   internal::traits<Derived>::RowsAtCompileTime,
@@ -134,7 +122,7 @@
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseBinaryOps.h"
@@ -142,18 +130,19 @@
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const MatrixBase& other);
 
     // We cannot inherit here via Base::operator= since it is causing
     // trouble with MSVC.
 
     template <typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator=(const DenseBase<OtherDerived>& other);
 
     template <typename OtherDerived>
@@ -164,42 +153,21 @@
     EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator+=(const MatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Derived& operator-=(const MatrixBase<OtherDerived>& other);
 
-#ifdef __CUDACC__
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    const typename LazyProductReturnType<Derived,OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const
-    { return this->lazyProduct(other); }
-#else
-
-#ifdef EIGEN_TEST_EVALUATORS
-    template<typename OtherDerived>
     const Product<Derived,OtherDerived>
     operator*(const MatrixBase<OtherDerived> &other) const;
-#else
-    template<typename OtherDerived>
-    const typename ProductReturnType<Derived,OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const;
-#endif
-
-#endif
 
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    const typename LazyProductReturnType<Derived,OtherDerived>::Type
+    const Product<Derived,OtherDerived,LazyProduct>
     lazyProduct(const MatrixBase<OtherDerived> &other) const;
 
     template<typename OtherDerived>
@@ -213,26 +181,23 @@
 
     template<typename DiagonalDerived>
     EIGEN_DEVICE_FUNC
-    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+    const Product<Derived, DiagonalDerived, LazyProduct>
     operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
 
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
     dot(const MatrixBase<OtherDerived>& other) const;
 
-    #ifdef EIGEN2_SUPPORT
-      template<typename OtherDerived>
-      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
-    #endif
-
     EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
     EIGEN_DEVICE_FUNC RealScalar norm() const;
     RealScalar stableNorm() const;
     RealScalar blueNorm() const;
     RealScalar hypotNorm() const;
     EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+    EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
     EIGEN_DEVICE_FUNC void normalize();
+    EIGEN_DEVICE_FUNC void stableNormalize();
 
     EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
     EIGEN_DEVICE_FUNC void adjointInPlace();
@@ -256,29 +221,13 @@
     EIGEN_DEVICE_FUNC
     typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
 
-    // Note: The "MatrixBase::" prefixes are added to help MSVC9 to match these declarations with the later implementations.
-    // On the other hand they confuse MSVC8...
-    #if EIGEN_COMP_MSVC >= 1500 // 2008 or later
-    typename MatrixBase::template DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
-    typename MatrixBase::template ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
-    #else
-    EIGEN_DEVICE_FUNC
-    typename DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
+    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
+    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
 
     EIGEN_DEVICE_FUNC
-    typename ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
-    #endif
-
-    #ifdef EIGEN2_SUPPORT
-    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
-    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
-
-    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
-    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
-    template<template<typename T, int N> class U>
-    const DiagonalWrapper<ConstDiagonalReturnType> part() const
-    { return diagonal().asDiagonal(); }
-    #endif // EIGEN2_SUPPORT
+    DiagonalDynamicIndexReturnType diagonal(Index index);
+    EIGEN_DEVICE_FUNC
+    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
 
     template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
     template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
@@ -319,6 +268,8 @@
     Derived& setIdentity();
     EIGEN_DEVICE_FUNC
     Derived& setIdentity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
 
     bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -336,7 +287,7 @@
       *          fuzzy comparison such as isApprox()
       * \sa isApprox(), operator!= */
     template<typename OtherDerived>
-    inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
     { return cwiseEqual(other).all(); }
 
     /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -344,17 +295,19 @@
       *          fuzzy comparison such as isApprox()
       * \sa isApprox(), operator== */
     template<typename OtherDerived>
-    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
     { return cwiseNotEqual(other).any(); }
 
-    NoAlias<Derived,Eigen::MatrixBase > noalias();
+    NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
 
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+    // TODO forceAlignedAccess is temporarily disabled
+    // Need to find a nicer workaround.
+    inline const Derived& forceAlignedAccess() const { return derived(); }
+    inline Derived& forceAlignedAccess() { return derived(); }
+    template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
+    template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }
 
-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;
 
     template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
 
@@ -363,109 +316,95 @@
 
     /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
       * \sa ArrayBase::matrix() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
     /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
       * \sa ArrayBase::matrix() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return ArrayWrapper<const Derived>(derived()); }
 
 /////////// LU module ///////////
 
-    EIGEN_DEVICE_FUNC const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC const PartialPivLU<PlainObject> partialPivLu() const;
+    inline const FullPivLU<PlainObject> fullPivLu() const;
+    inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-    const LU<PlainObject> lu() const;
-    #endif
-
-    #ifdef EIGEN2_SUPPORT
-    const LU<PlainObject> eigen2_lu() const;
-    #endif
-
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    const PartialPivLU<PlainObject> lu() const;
-    #endif
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename ResultType>
-    void computeInverse(MatrixBase<ResultType> *result) const {
-      *result = this->inverse();
-    }
-    #endif
+    inline const PartialPivLU<PlainObject> lu() const;
 
     EIGEN_DEVICE_FUNC
-    const internal::inverse_impl<Derived> inverse() const;
+    inline const Inverse<Derived> inverse() const;
+
     template<typename ResultType>
-    void computeInverseAndDetWithCheck(
+    inline void computeInverseAndDetWithCheck(
       ResultType& inverse,
       typename ResultType::Scalar& determinant,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
     template<typename ResultType>
-    void computeInverseWithCheck(
+    inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
+    EIGEN_DEVICE_FUNC
     Scalar determinant() const;
 
 /////////// Cholesky module ///////////
 
-    const LLT<PlainObject>  llt() const;
-    const LDLT<PlainObject> ldlt() const;
+    inline const LLT<PlainObject>  llt() const;
+    inline const LDLT<PlainObject> ldlt() const;
 
 /////////// QR module ///////////
 
-    const HouseholderQR<PlainObject> householderQr() const;
-    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
-    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
-    const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;
-    #ifdef EIGEN2_SUPPORT
-    const QR<PlainObject> qr() const;
-    #endif
+    inline const HouseholderQR<PlainObject> householderQr() const;
+    inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
+    inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;
 
-    EigenvaluesReturnType eigenvalues() const;
-    RealScalar operatorNorm() const;
+/////////// Eigenvalues module ///////////
+
+    inline EigenvaluesReturnType eigenvalues() const;
+    inline RealScalar operatorNorm() const;
 
 /////////// SVD module ///////////
 
     inline JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
     inline BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;
 
-    #ifdef EIGEN2_SUPPORT
-    SVD<PlainObject> svd() const;
-    #endif
-
 /////////// Geometry module ///////////
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /// \internal helper struct to form the return type of the cross product
     template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
       typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
     };
     #endif // EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    typename cross_product_return_type<OtherDerived>::type
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    inline typename cross_product_return_type<OtherDerived>::type
+#else
+    inline PlainObject
+#endif
     cross(const MatrixBase<OtherDerived>& other) const;
 
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+    inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
 
     EIGEN_DEVICE_FUNC
-    PlainObject unitOrthogonal(void) const;
+    inline PlainObject unitOrthogonal(void) const;
 
-    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+    EIGEN_DEVICE_FUNC
+    inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
 
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
     // put this as separate enum value to work around possible GCC 4.3 bug (?)
-    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
+    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
+                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
     typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
-    HomogeneousReturnType homogeneous() const;
-    #endif
+    EIGEN_DEVICE_FUNC
+    inline HomogeneousReturnType homogeneous() const;
 
     enum {
       SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
@@ -473,22 +412,25 @@
     typedef Block<const Derived,
                   internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
                   internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
-    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
-                const ConstStartMinusOne > HNormalizedReturnType;
-
-    const HNormalizedReturnType hnormalized() const;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const HNormalizedReturnType hnormalized() const;
 
 ////////// Householder module ///////////
 
+    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
@@ -496,61 +438,51 @@
 ///////// Jacobi module /////////
 
     template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
     void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
     template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
     void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
 
+///////// SparseCore module /////////
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
+    {
+      return other.cwiseProduct(derived());
+    }
+
 ///////// MatrixFunctions module /////////
 
     typedef typename internal::stem_function<Scalar>::type StemFunction;
-    const MatrixExponentialReturnValue<Derived> exp() const;
+#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name() const;
+#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name(Argument) const;
+
+    EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
+    /** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>.*/
     const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
-    const MatrixFunctionReturnValue<Derived> cosh() const;
-    const MatrixFunctionReturnValue<Derived> sinh() const;
-    const MatrixFunctionReturnValue<Derived> cos() const;
-    const MatrixFunctionReturnValue<Derived> sin() const;
-    const MatrixSquareRootReturnValue<Derived> sqrt() const;
-    const MatrixLogarithmReturnValue<Derived> log() const;
-    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
-
-#ifdef EIGEN2_SUPPORT
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    /** \deprecated because .lazy() is deprecated
-      * Overloaded for cache friendly product evaluation */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
-    { return lazyAssign(other._expression()); }
-
-    template<unsigned int Added>
-    const Flagged<Derived, Added, 0> marked() const;
-    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
-
-    inline const Cwise<Derived> cwise() const;
-    inline Cwise<Derived> cwise();
-
-    VectorBlock<Derived> start(Index size);
-    const VectorBlock<const Derived> start(Index size) const;
-    VectorBlock<Derived> end(Index size);
-    const VectorBlock<const Derived> end(Index size) const;
-    template<int Size> VectorBlock<Derived,Size> start();
-    template<int Size> const VectorBlock<const Derived,Size> start() const;
-    template<int Size> VectorBlock<Derived,Size> end();
-    template<int Size> const VectorBlock<const Derived,Size> end() const;
-
-    Minor<Derived> minor(Index row, Index col);
-    const Minor<Derived> minor(Index row, Index col) const;
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
 #endif
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
+    EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
+    EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
+    EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue,        pow, power to \c p, const RealScalar& p)
+    EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
 
   protected:
-    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
 
   private:
     EIGEN_DEVICE_FUNC explicit MatrixBase(int);

diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 5539af1..b427576 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h

@@ -13,26 +13,28 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename ExpressionType>
+struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
+{
+  enum {
+    Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+  };
+};
+}
+
 /** \class NestByValue
   * \ingroup Core_Module
   *
   * \brief Expression which must be nested by value
   *
-  * \param ExpressionType the type of the object of which we are requiring nesting-by-value
+  * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value
   *
   * This class is the return type of MatrixBase::nestByValue()
   * and most of the time this is the only way it is used.
   *
   * \sa MatrixBase::nestByValue()
   */
-
-namespace internal {
-template <typename ExpressionType>
-struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType> {
-  enum { Flags = traits<ExpressionType>::Flags & ~NestByRefBit };
-};
-}
-
 template<typename ExpressionType> class NestByValue
   : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
 {
@@ -41,58 +43,14 @@
     typedef typename internal::dense_xpr_base<NestByValue>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
 
-    inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_expression.coeff(row, col);
-    }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_expression.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_expression.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
-
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
 
   protected:
     const ExpressionType m_expression;
@@ -107,6 +65,21 @@
   return NestByValue<Derived>(derived());
 }
 
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+  : public evaluator<ArgType>
+{
+  typedef evaluator<ArgType> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+    : Base(xpr.nestedExpression())
+  {}
+};
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_NESTBYVALUE_H

diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index 0a1c327..570283d 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h

@@ -17,7 +17,7 @@
   *
   * \brief Pseudo expression providing an operator = assuming no aliasing
   *
-  * \param ExpressionType the type of the object on which to do the lazy assignment
+  * \tparam ExpressionType the type of the object on which to do the lazy assignment
   *
   * This class represents an expression with special assignment operators
   * assuming no aliasing between the target expression and the source expression.
@@ -30,68 +30,36 @@
 template<typename ExpressionType, template <typename> class StorageBase>
 class NoAlias
 {
-    typedef typename ExpressionType::Scalar Scalar;
   public:
-    NoAlias(ExpressionType& expression) : m_expression(expression) {}
-
-    /** Behaves like MatrixBase::lazyAssign(other)
-      * \sa MatrixBase::lazyAssign() */
+    typedef typename ExpressionType::Scalar Scalar;
+    
+    EIGEN_DEVICE_FUNC
+    explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
+    
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
-    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
-
-    /** \sa MatrixBase::operator+= */
+    {
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+      return m_expression;
+    }
+    
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
     {
-      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
-
-    /** \sa MatrixBase::operator-= */
+    
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
     {
-      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().addTo(m_expression); return m_expression; }
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().subTo(m_expression); return m_expression; }
-
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-    
-    template<typename OtherDerived>
-    ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
-    { return m_expression = func; }
-#endif
-
     EIGEN_DEVICE_FUNC
     ExpressionType& expression() const
     {
@@ -107,10 +75,10 @@
   *
   * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
   * Currently, even though several expressions may alias, only product
-  * expressions have this flag. Therefore, noalias() is only usefull when
+  * expressions have this flag. Therefore, noalias() is only useful when
   * the source expression contains a matrix product.
   *
-  * Here are some examples where noalias is usefull:
+  * Here are some examples where noalias is useful:
   * \code
   * D.noalias()  = A * B;
   * D.noalias() += A.transpose() * B;
@@ -131,9 +99,9 @@
   * \sa class NoAlias
   */
 template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
 {
-  return derived();
+  return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 3a10efd..72eac5a 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h

@@ -12,24 +12,112 @@
 
 namespace Eigen {
 
+namespace internal {
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and log10(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() {
+    using std::log10;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log10(NumTraits<Real>::epsilon())));
+  }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return 0; }
+};
+
+
+// default implementation of digits(), based on numeric_limits if specialized,
+// 0 for integer types, and log2(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return std::numeric_limits<T>::digits; }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() {
+    using std::log;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
+  }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return 0; }
+};
+
+} // end namespace internal
+
+namespace numext {
+/** \internal bit-wise cast without changing the underlying bit representation. */
+
+// TODO: Replace by std::bit_cast (available in C++20)
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
+#if EIGEN_HAS_TYPE_TRAITS
+  // The behaviour of memcpy is not specified for non-trivially copyable types
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
+                      THIS_TYPE_IS_NOT_SUPPORTED);
+#endif
+
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
+  Tgt tgt;
+  EIGEN_USING_STD(memcpy)
+  memcpy(&tgt, &src, sizeof(Tgt));
+  return tgt;
+}
+}  // namespace numext
+
 /** \class NumTraits
   * \ingroup Core_Module
   *
   * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
   *
-  * \param T the numeric type at hand
+  * \tparam T the numeric type at hand
   *
   * This class stores enums, typedefs and static methods giving information about a numeric type.
   *
   * The provided data consists of:
-  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
+  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
+  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
   *     is a typedef to \a U.
-  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
+  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
   *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
   *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
   *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
   *     only intended as a helper for code that needs to explicitly promote types.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
   * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
   *     this means, just use \a T here.
   * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
@@ -38,14 +126,27 @@
   *     and to \c 0 otherwise.
   * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
   *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
-  *     Stay vague here. No need to do architecture-specific stuff.
+  *     Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
   * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
   * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
   *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  *     it returns a \a Real instead of a \a T.
   * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
   *     value by the fuzzy comparison operators.
   * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a>
+  *     which is used as the default implementation if specialized.
+  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  *     which is used as the default implementation if specialized.
+  * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively,
+  *     such that the radix raised to the power exponent-1 is a normalized floating-point number.  These are equivalent to
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">std::numeric_limits<T>::min_exponent</a>/
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">std::numeric_limits<T>::max_exponent</a>.
+  * \li infinity() function returning a representation of positive infinity, if available.
+  * \li quiet_NaN function returning a non-signaling "not-a-number", if available.
   */
 
 template<typename T> struct GenericNumTraits
@@ -60,22 +161,6 @@
     MulCost = 1
   };
 
-  // Division is messy but important, because it is expensive and throughput
-  // varies significantly. The following numbers are based on min division
-  // throughput on Haswell.
-  template<bool PacketAccess>
-  struct Div {
-    enum {
-#ifdef EIGEN_VECTORIZE_AVX
-      AVX = true,
-#else
-      AVX = false,
-#endif
-      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
-          PacketAccess ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
-    };
-  };
-
   typedef T Real;
   typedef typename internal::conditional<
                      IsInteger,
@@ -83,65 +168,65 @@
                      T
                    >::type NonInteger;
   typedef T Nested;
+  typedef T Literal;
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real epsilon()
   {
-#if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::epsilon();
-#else
-    return std::numeric_limits<T>::epsilon();
-#endif
+    return numext::numeric_limits<T>::epsilon();
   }
-  EIGEN_DEVICE_FUNC
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits10()
+  {
+    return internal::default_digits10_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits()
+  {
+    return internal::default_digits_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int min_exponent()
+  {
+    return numext::numeric_limits<T>::min_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int max_exponent()
+  {
+    return numext::numeric_limits<T>::max_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real dummy_precision()
   {
     // make sure to override this for floating-point types
     return Real(0);
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T highest() {
-#if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::max();
-#else
-    return (std::numeric_limits<T>::max)();
-#endif
+    return (numext::numeric_limits<T>::max)();
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T lowest()  {
-#if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::lowest();
-#else
-    return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)());
-#endif
+    return IsInteger ? (numext::numeric_limits<T>::min)()
+                     : static_cast<T>(-(numext::numeric_limits<T>::max)());
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T infinity() {
-#if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::infinity();
-#else
-    return std::numeric_limits<T>::infinity();
-#endif
+    return numext::numeric_limits<T>::infinity();
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T quiet_NaN() {
-#if defined(__CUDA_ARCH__)
-    return internal::device::numeric_limits<T>::quiet_NaN();
-#else
-    return std::numeric_limits<T>::quiet_NaN();
-#endif
+    return numext::numeric_limits<T>::quiet_NaN();
   }
-
-#ifdef EIGEN2_SUPPORT
-  enum {
-    HasFloatingPoint = !IsInteger
-  };
-  typedef NonInteger FloatingPoint;
-#endif
 };
 
 template<typename T> struct NumTraits : GenericNumTraits<T>
@@ -150,19 +235,20 @@
 template<> struct NumTraits<float>
   : GenericNumTraits<float>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline float dummy_precision() { return 1e-5f; }
 };
 
 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline double dummy_precision() { return 1e-12; }
 };
 
 template<> struct NumTraits<long double>
   : GenericNumTraits<long double>
 {
+  EIGEN_CONSTEXPR
   static inline long double dummy_precision() { return 1e-15l; }
 };
 
@@ -170,6 +256,7 @@
   : GenericNumTraits<std::complex<_Real> >
 {
   typedef _Real Real;
+  typedef typename NumTraits<_Real>::Literal Literal;
   enum {
     IsComplex = 1,
     RequireInitialization = NumTraits<_Real>::RequireInitialization,
@@ -178,16 +265,12 @@
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
-  template<bool PacketAccess>
-  struct Div {
-    enum {
-      Cost = 6 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost +
-             2 * NumTraits<Real>::template Div<PacketAccess>::Cost
-    };
-  };
-
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -199,40 +282,53 @@
   typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
   typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
   typedef ArrayType & Nested;
-  
+  typedef typename NumTraits<Scalar>::Literal Literal;
+
   enum {
     IsComplex = NumTraits<Scalar>::IsComplex,
     IsInteger = NumTraits<Scalar>::IsInteger,
     IsSigned  = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
 
-  template<bool PacketAccess>
-  struct Div {
-    enum {
-      Cost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic :
-          ArrayType::SizeAtCompileTime * NumTraits<Scalar>::template Div<PacketAccess>::Cost
-    };
-  };
-
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+  EIGEN_CONSTEXPR
+  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
 };
 
+template<> struct NumTraits<std::string>
+  : GenericNumTraits<std::string>
+{
+  enum {
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost  = HugeCost,
+    MulCost  = HugeCost
+  };
 
-namespace internal {
+  EIGEN_CONSTEXPR
+  static inline int digits10() { return 0; }
 
-// Internal helper defining the cost of a scalar division for the type T.
-// The default heuristic can be specialized for each scalar type and architecture.
-template<typename T, bool Vectorized=false, typename EnableIf = void>
-struct scalar_div_cost {
-  enum { value = NumTraits<T>::template Div<Vectorized>::Cost };
+private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
 };
 
-} // end namespace internal
+// Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
+template<> struct NumTraits<void> {};
+
+template<> struct NumTraits<bool> : GenericNumTraits<bool> {};
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
new file mode 100644
index 0000000..17c06f0
--- /dev/null
+++ b/Eigen/src/Core/PartialReduxEvaluator.h

@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIALREDUX_H
+#define EIGEN_PARTIALREDUX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+/***************************************************************************
+*
+* This file provides evaluators for partial reductions.
+* There are two modes:
+*
+*  - scalar path: simply calls the respective function on the column or row.
+*    -> nothing special here, all the tricky part is handled by the return
+*       types of VectorwiseOp's members. They embed the functor calling the
+*       respective DenseBase's member function.
+*
+*  - vectorized path: implements a packet-wise reductions followed by
+*    some (optional) processing of the outcome, e.g., division by n for mean.
+*
+* For the vectorized path let's observe that the packet-size and outer-unrolling
+* are both decided by the assignement logic. So all we have to do is to decide
+* on the inner unrolling.
+*
+* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
+* but be need to be careful to specify correct increment.
+*
+***************************************************************************/
+
+
+/* logic deciding a strategy for unrolling of vectorized paths */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_traits
+{
+  enum {
+    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
+    Cost = OuterSize == Dynamic ? HugeCost
+         : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
+    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
+  };
+
+};
+
+/* Value to be returned when size==0 , by default let's return 0 */
+template<typename PacketType,typename Func>
+EIGEN_DEVICE_FUNC
+PacketType packetwise_redux_empty_value(const Func& ) {
+  const typename unpacket_traits<PacketType>::type zero(0);
+  return pset1<PacketType>(zero);
+}
+
+/* For products the default is 1 */
+template<typename PacketType,typename Scalar>
+EIGEN_DEVICE_FUNC
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) {
+  return pset1<PacketType>(Scalar(1));
+}
+
+/* Perform the actual reduction */
+template<typename Func, typename Evaluator,
+         int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
+>
+struct packetwise_redux_impl;
+
+/* Perform the actual reduction with unrolling */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
+  {
+    return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
+  }
+};
+
+/* Add a specialization of redux_vec_unroller for size==0 at compiletime.
+ * This specialization is not required for general reductions, which is
+ * why it is defined here.
+ */
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 0>
+{
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
+  {
+    return packetwise_redux_empty_value<PacketType>(f);
+  }
+};
+
+/* Perform the actual reduction for dynamic sizes */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static PacketType run(const Evaluator &eval, const Func& func, Index size)
+  {
+    if(size==0)
+      return packetwise_redux_empty_value<PacketType>(func);
+    
+    const Index size4 = (size-1)&(~3);
+    PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
+    Index i = 1;
+    // This loop is optimized for instruction pipelining:
+    // - each iteration generates two independent instructions
+    // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
+    for(; i<size4; i+=4)
+      p = func.packetOp(p,
+            func.packetOp(
+              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
+              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
+    for(; i<size; ++i)
+      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
+    return p;
+  }
+};
+
+template< typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
+{
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
+  typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum {
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
+                  : TraversalSize==0 ? 1
+                  : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
+    
+    _ArgFlags = evaluator<ArgType>::Flags,
+
+    _Vectorizable =  bool(int(_ArgFlags)&PacketAccessBit)
+                  && bool(MemberOp::Vectorizable)
+                  && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
+                  && (TraversalSize!=0),
+                  
+    Flags = (traits<XprType>::Flags&RowMajorBit)
+          | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
+          | (_Vectorizable ? PacketAccessBit : 0)
+          | LinearAccessBit,
+    
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index i, Index j) const
+  {
+    return coeff(Direction==Vertical ? j : i);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index index) const
+  {
+    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketType packet(Index i, Index j) const
+  {
+    return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
+  }
+  
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  PacketType packet(Index idx) const
+  {
+    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
+    typedef Block<const ArgTypeNestedCleaned,
+                  Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
+                  Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
+                  true /* InnerPanel */> PanelType;
+    
+    PanelType panel(m_arg,
+                    Direction==Vertical ? 0 : idx,
+                    Direction==Vertical ? idx : 0,
+                    Direction==Vertical ? m_arg.rows() : Index(PacketSize),
+                    Direction==Vertical ? Index(PacketSize) : m_arg.cols());
+
+    // FIXME
+    // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of panel get reversed
+    // and methods like packetByOuterInner do not make sense anymore in this context.
+    // So let's just by pass "vectorization" in this case:
+    if(PacketSize==1)
+      return internal::pset1<PacketType>(coeff(idx));
+    
+    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
+    PanelEvaluator panel_eval(panel);
+    typedef typename MemberOp::BinaryOp BinaryOp;
+    PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
+    return p;
+  }
+
+protected:
+  ConstArgTypeNested m_arg;
+  const MemberOp m_functor;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIALREDUX_H

diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index 1297b84..69401bf 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,14 +13,18 @@
 
 namespace Eigen { 
 
-template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
+namespace internal {
+
+enum PermPermProduct_t {PermPermProduct};
+
+} // end namespace internal
 
 /** \class PermutationBase
   * \ingroup Core_Module
   *
   * \brief Base class for permutations
   *
-  * \param Derived the derived class
+  * \tparam Derived the derived class
   *
   * This class is the base class for all expressions representing a permutation matrix,
   * internally stored as a vector of integers.
@@ -38,17 +42,6 @@
   *
   * \sa class PermutationMatrix, class PermutationWrapper
   */
-
-namespace internal {
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_matrix_product_retval;
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_sparsematrix_product_retval;
-enum PermPermProduct_t {PermPermProduct};
-
-} // end namespace internal
-
 template<typename Derived>
 class PermutationBase : public EigenBase<Derived>
 {
@@ -60,19 +53,20 @@
     typedef typename Traits::IndicesType IndicesType;
     enum {
       Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
       RowsAtCompileTime = Traits::RowsAtCompileTime,
       ColsAtCompileTime = Traits::ColsAtCompileTime,
       MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::Index Index;
-    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
             DenseMatrixType;
-    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndex>
             PlainPermutationType;
+    typedef PlainPermutationType PlainObject;
     using Base::derived;
+    typedef Inverse<Derived> InverseReturnType;
+    typedef void Scalar;
     #endif
 
     /** Copies the other permutation into *this */
@@ -93,32 +87,21 @@
       return derived();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
     /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
 
     /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
 
     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
     void evalTo(MatrixBase<DenseDerived>& other) const
     {
       other.setZero();
-      for (int i=0; i<rows();++i)
+      for (Index i=0; i<rows(); ++i)
         other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
     }
     #endif
@@ -147,7 +130,8 @@
     /** Sets *this to be the identity permutation matrix */
     void setIdentity()
     {
-      for(Index i = 0; i < size(); ++i)
+      StorageIndex n = StorageIndex(size());
+      for(StorageIndex i = 0; i < n; ++i)
         indices().coeffRef(i) = i;
     }
 
@@ -163,18 +147,18 @@
       *
       * \returns a reference to *this.
       *
-      * \warning This is much slower than applyTranspositionOnTheRight(int,int):
+      * \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
       * this has linear complexity and requires a lot of branching.
       *
-      * \sa applyTranspositionOnTheRight(int,int)
+      * \sa applyTranspositionOnTheRight(Index,Index)
       */
     Derived& applyTranspositionOnTheLeft(Index i, Index j)
     {
       eigen_assert(i>=0 && j>=0 && i<size() && j<size());
       for(Index k = 0; k < size(); ++k)
       {
-        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
-        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
+        if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndex(j);
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndex(i);
       }
       return derived();
     }
@@ -185,7 +169,7 @@
       *
       * This is a fast operation, it only consists in swapping two indices.
       *
-      * \sa applyTranspositionOnTheLeft(int,int)
+      * \sa applyTranspositionOnTheLeft(Index,Index)
       */
     Derived& applyTranspositionOnTheRight(Index i, Index j)
     {
@@ -196,16 +180,16 @@
 
     /** \returns the inverse permutation matrix.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
-    inline Transpose<PermutationBase> inverse() const
-    { return derived(); }
+    inline InverseReturnType inverse() const
+    { return InverseReturnType(derived()); }
     /** \returns the tranpose permutation matrix.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
-    inline Transpose<PermutationBase> transpose() const
-    { return derived(); }
+    inline InverseReturnType transpose() const
+    { return InverseReturnType(derived()); }
 
     /**** multiplication helpers to hopefully get RVO ****/
 
@@ -215,13 +199,13 @@
     template<typename OtherDerived>
     void assignTranspose(const PermutationBase<OtherDerived>& other)
     {
-      for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
+      for (Index i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
     }
     template<typename Lhs,typename Rhs>
     void assignProduct(const Lhs& lhs, const Rhs& rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows());
-      for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+      for (Index i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
     }
 #endif
 
@@ -229,7 +213,7 @@
 
     /** \returns the product permutation matrix.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
     template<typename Other>
     inline PlainPermutationType operator*(const PermutationBase<Other>& other) const
@@ -237,57 +221,90 @@
 
     /** \returns the product of a permutation with another inverse permutation.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
     template<typename Other>
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other) const
+    inline PlainPermutationType operator*(const InverseImpl<Other,PermutationStorage>& other) const
     { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }
 
     /** \returns the product of an inverse permutation with another permutation.
       *
-      * \note \note_try_to_help_rvo
+      * \note \blank \note_try_to_help_rvo
       */
     template<typename Other> friend
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
+    inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other, const PermutationBase& perm)
     { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
+    
+    /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation.
+      *
+      * This function is O(\c n) procedure allocating a buffer of \c n booleans.
+      */
+    Index determinant() const
+    {
+      Index res = 1;
+      Index n = size();
+      Matrix<bool,RowsAtCompileTime,1,0,MaxRowsAtCompileTime> mask(n);
+      mask.fill(false);
+      Index r = 0;
+      while(r < n)
+      {
+        // search for the next seed
+        while(r<n && mask[r]) r++;
+        if(r>=n)
+          break;
+        // we got one, let's follow it until we are back to the seed
+        Index k0 = r++;
+        mask.coeffRef(k0) = true;
+        for(Index k=indices().coeff(k0); k!=k0; k=indices().coeff(k))
+        {
+          mask.coeffRef(k) = true;
+          res = -res;
+        }
+      }
+      return res;
+    }
 
   protected:
 
 };
 
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef PermutationStorage StorageKind;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
+};
+}
+
 /** \class PermutationMatrix
   * \ingroup Core_Module
   *
   * \brief Permutation matrix
   *
-  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param IndexType the interger type of the indices
+  * \tparam SizeAtCompileTime the number of rows/cols, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  * \tparam _StorageIndex the integer type of the indices
   *
   * This class represents a permutation matrix, internally stored as a vector of integers.
   *
   * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
   */
-
-namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
-{
-  typedef IndexType Index;
-  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
-};
-}
-
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
 {
     typedef PermutationBase<PermutationMatrix> Base;
     typedef internal::traits<PermutationMatrix> Traits;
   public:
 
+    typedef const PermutationMatrix& Nested;
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef typename Traits::IndicesType IndicesType;
+    typedef typename Traits::StorageIndex StorageIndex;
     #endif
 
     inline PermutationMatrix()
@@ -295,20 +312,16 @@
 
     /** Constructs an uninitialized permutation matrix of given size.
       */
-    inline PermutationMatrix(int size) : m_indices(size)
-    {}
+    explicit inline PermutationMatrix(Index size) : m_indices(size)
+    {
+      eigen_internal_assert(size <= NumTraits<StorageIndex>::highest());
+    }
 
     /** Copy constructor. */
     template<typename OtherDerived>
     inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
     /** Generic constructor from expression of the indices. The indices
       * array has the meaning that the permutations sends each integer i to indices[i].
       *
@@ -317,7 +330,7 @@
       * array's size.
       */
     template<typename Other>
-    explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices)
     {}
 
     /** Convert the Transpositions \a tr to a permutation matrix */
@@ -343,17 +356,6 @@
       return Base::operator=(tr.derived());
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the permutation. */
@@ -364,10 +366,13 @@
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Other>
-    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
-      : m_indices(other.nestedPermutation().size())
+    PermutationMatrix(const InverseImpl<Other,PermutationStorage>& other)
+      : m_indices(other.derived().nestedExpression().size())
     {
-      for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
+      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
+      StorageIndex end = StorageIndex(m_indices.size());
+      for (StorageIndex i=0; i<end;++i)
+        m_indices.coeffRef(other.derived().nestedExpression().indices().coeff(i)) = i;
     }
     template<typename Lhs,typename Rhs>
     PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@@ -384,18 +389,20 @@
 
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef PermutationStorage StorageKind;
+  typedef Map<const Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
-  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
 {
     typedef PermutationBase<Map> Base;
     typedef internal::traits<Map> Traits;
@@ -403,14 +410,14 @@
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
     #endif
 
-    inline Map(const Index* indicesPtr)
+    inline Map(const StorageIndex* indicesPtr)
       : m_indices(indicesPtr)
     {}
 
-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndex* indicesPtr, Index size)
       : m_indices(indicesPtr,size)
     {}
 
@@ -445,40 +452,36 @@
     IndicesType m_indices;
 };
 
-/** \class PermutationWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Class to view a vector of integers as a permutation matrix
-  *
-  * \param _IndicesType the type of the vector of integer (can be any compatible expression)
-  *
-  * This class allows to view any vector expression of integers as a permutation matrix.
-  *
-  * \sa class PermutationBase, class PermutationMatrix
-  */
-
-struct PermutationStorage {};
-
 template<typename _IndicesType> class TranspositionsWrapper;
 namespace internal {
 template<typename _IndicesType>
 struct traits<PermutationWrapper<_IndicesType> >
 {
   typedef PermutationStorage StorageKind;
-  typedef typename _IndicesType::Scalar Scalar;
-  typedef typename _IndicesType::Scalar Index;
+  typedef void Scalar;
+  typedef typename _IndicesType::Scalar StorageIndex;
   typedef _IndicesType IndicesType;
   enum {
     RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
     ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
-    Flags = 0,
-    CoeffReadCost = _IndicesType::CoeffReadCost
+    MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    Flags = 0
   };
 };
 }
 
+/** \class PermutationWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Class to view a vector of integers as a permutation matrix
+  *
+  * \tparam _IndicesType the type of the vector of integer (can be any compatible expression)
+  *
+  * This class allows to view any vector expression of integers as a permutation matrix.
+  *
+  * \sa class PermutationBase, class PermutationMatrix
+  */
 template<typename _IndicesType>
 class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
 {
@@ -490,8 +493,8 @@
     typedef typename Traits::IndicesType IndicesType;
     #endif
 
-    inline PermutationWrapper(const IndicesType& a_indices)
-      : m_indices(a_indices)
+    inline PermutationWrapper(const IndicesType& indices)
+      : m_indices(indices)
     {}
 
     /** const version of indices(). */
@@ -503,179 +506,86 @@
     typename IndicesType::Nested m_indices;
 };
 
+
 /** \returns the matrix with the permutation applied to the columns.
   */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const PermutationBase<PermutationDerived> &permutation)
+template<typename MatrixDerived, typename PermutationDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const PermutationBase<PermutationDerived>& permutation)
 {
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheRight>
-           (permutation.derived(), matrix.derived());
+  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+            (matrix.derived(), permutation.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows.
   */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval
-               <PermutationDerived, Derived, OnTheLeft>
+template<typename PermutationDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
 operator*(const PermutationBase<PermutationDerived> &permutation,
-          const MatrixBase<Derived>& matrix)
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheLeft>
-           (permutation.derived(), matrix.derived());
+  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
+            (permutation.derived(), matrix.derived());
 }
 
-namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+template<typename PermutationType>
+class InverseImpl<PermutationType, PermutationStorage>
+  : public EigenBase<Inverse<PermutationType> >
 {
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_matrix_product_retval
- : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixType::Index Index;
-
-    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index n = Side==OnTheLeft ? rows() : cols();
-      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
-      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
-      if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
-      {
-        // apply the permutation inplace
-        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
-        mask.fill(false);
-        Index r = 0;
-        while(r < m_permutation.size())
-        {
-          // search for the next seed
-          while(r<m_permutation.size() && mask[r]) r++;
-          if(r>=m_permutation.size())
-            break;
-          // we got one, let's follow it until we are back to the seed
-          Index k0 = r++;
-          Index kPrev = k0;
-          mask.coeffRef(k0) = true;
-          for(Index k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
-          {
-                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
-            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
-
-            mask.coeffRef(k) = true;
-            kPrev = k;
-          }
-        }
-      }
-      else
-      {
-        for(int i = 0; i < n; ++i)
-        {
-          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
-
-          =
-
-          Block<const MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
-               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
-        }
-      }
-    }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
-};
-
-/* Template partial specialization for transposed/inverse permutations */
-
-template<typename Derived>
-struct traits<Transpose<PermutationBase<Derived> > >
- : traits<Derived>
-{};
-
-} // end namespace internal
-
-template<typename Derived>
-class Transpose<PermutationBase<Derived> >
-  : public EigenBase<Transpose<PermutationBase<Derived> > >
-{
-    typedef Derived PermutationType;
-    typedef typename PermutationType::IndicesType IndicesType;
     typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+    typedef internal::traits<PermutationType> PermTraits;
+  protected:
+    InverseImpl() {}
   public:
+    typedef Inverse<PermutationType> InverseType;
+    using EigenBase<Inverse<PermutationType> >::derived;
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef internal::traits<PermutationType> Traits;
-    typedef typename Derived::DenseMatrixType DenseMatrixType;
+    typedef typename PermutationType::DenseMatrixType DenseMatrixType;
     enum {
-      Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
-      RowsAtCompileTime = Traits::RowsAtCompileTime,
-      ColsAtCompileTime = Traits::ColsAtCompileTime,
-      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+      RowsAtCompileTime = PermTraits::RowsAtCompileTime,
+      ColsAtCompileTime = PermTraits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
     #endif
 
-    Transpose(const PermutationType& p) : m_permutation(p) {}
-
-    inline int rows() const { return m_permutation.rows(); }
-    inline int cols() const { return m_permutation.cols(); }
-
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
     void evalTo(MatrixBase<DenseDerived>& other) const
     {
       other.setZero();
-      for (int i=0; i<rows();++i)
-        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
+      for (Index i=0; i<derived().rows();++i)
+        other.coeffRef(i, derived().nestedExpression().indices().coeff(i)) = typename DenseDerived::Scalar(1);
     }
     #endif
 
     /** \return the equivalent permutation matrix */
-    PlainPermutationType eval() const { return *this; }
+    PlainPermutationType eval() const { return derived(); }
 
-    DenseMatrixType toDenseMatrix() const { return *this; }
+    DenseMatrixType toDenseMatrix() const { return derived(); }
 
     /** \returns the matrix with the inverse permutation applied to the columns.
       */
     template<typename OtherDerived> friend
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
-    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
+    const Product<OtherDerived, InverseType, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const InverseType& trPerm)
     {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
+      return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
     }
 
     /** \returns the matrix with the inverse permutation applied to the rows.
       */
     template<typename OtherDerived>
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
+    const Product<InverseType, OtherDerived, AliasFreeProduct>
     operator*(const MatrixBase<OtherDerived>& matrix) const
     {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
+      return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
     }
-
-    const PermutationType& nestedPermutation() const { return m_permutation; }
-
-  protected:
-    const PermutationType& m_permutation;
 };
 
 template<typename Derived>
@@ -684,6 +594,12 @@
   return derived();
 }
 
+namespace internal {
+
+template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_PERMUTATIONMATRIX_H

diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 50c3656..e2ddbd1 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h

@@ -13,10 +13,10 @@
 
 #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
 #elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
 #else
 # undef EIGEN_INITIALIZE_COEFFS
 # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -41,7 +41,7 @@
   {
     // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
     // we assume Index is signed
-    Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
+    Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
     bool error = (rows == 0 || cols == 0) ? false
                : (rows > max_index / cols);
     if (error)
@@ -58,33 +58,41 @@
 
 } // end namespace internal
 
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+namespace doxygen {
+
+// This is a workaround to doxygen not being able to understand the inheritance logic
+// when it is hidden by the dense_xpr_base helper struct.
+// Moreover, doxygen fails to include members that are not documented in the declaration body of
+// MatrixBase if we inherits MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
+// this is why we simply inherits MatrixBase, though this does not make sense.
+
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename Derived> struct dense_xpr_base_dispatcher;
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase {};
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase {};
+
+} // namespace doxygen
+
 /** \class PlainObjectBase
+  * \ingroup Core_Module
   * \brief %Dense storage base class for matrices and arrays.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  *
+  * \tparam Derived is the derived type, e.g., a Matrix or Array
   *
   * \sa \ref TopicClassHierarchy
   */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace internal {
-
-// this is a warkaround to doxygen not being able to understand the inheritence logic
-// when it is hidden by the dense_xpr_base helper struct.
-template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-
-} // namespace internal
-
 template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
 template<typename Derived>
 class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
@@ -95,8 +103,8 @@
     typedef typename internal::dense_xpr_base<Derived>::type Base;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
+
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Derived DenseType;
@@ -110,25 +118,20 @@
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
 
-    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
-    friend  class Eigen::Map<Derived, Unaligned>;
     typedef Eigen::Map<Derived, Unaligned>  MapType;
-    friend  class Eigen::Map<const Derived, Unaligned>;
     typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-    friend  class Eigen::Map<Derived, Aligned>;
-    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
-    friend  class Eigen::Map<const Derived, Aligned>;
-    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
+    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
     template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
     template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, AlignedMax, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, AlignedMax, StrideType> type; };
 
   protected:
     DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
 
   public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
+    enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment>0) };
     EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
 
     EIGEN_DEVICE_FUNC
@@ -136,11 +139,15 @@
     EIGEN_DEVICE_FUNC
     const Base& base() const { return *static_cast<const Base*>(this); }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
     {
@@ -150,12 +157,20 @@
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
     {
       return m_storage.data()[index];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
     {
@@ -165,12 +180,18 @@
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
     {
       return m_storage.data()[index];
     }
 
+    /** This is the const version of coeffRef(Index,Index) which is thus synonym of coeff(Index,Index).
+      * It is provided for convenience. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
     {
@@ -180,6 +201,8 @@
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
+      * It is provided for convenience. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
     {
@@ -221,11 +244,11 @@
     }
 
     /** \returns a const pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE const Scalar *data() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
     { return m_storage.data(); }
 
     /** \returns a pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE Scalar *data()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
     { return m_storage.data(); }
 
     /** Resizes \c *this to a \a rows x \a cols matrix.
@@ -245,21 +268,21 @@
       * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
+    EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
     {
-      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
-                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
-                   && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime)
+                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime)
+                   && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array.");
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
       #ifdef EIGEN_INITIALIZE_COEFFS
-        Index size = nbRows*nbCols;
+        Index size = rows*cols;
         bool size_changed = size != this->size();
-        m_storage.resize(size, nbRows, nbCols);
+        m_storage.resize(size, rows, cols);
         if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
       #else
-        m_storage.resize(nbRows*nbCols, nbRows, nbCols);
+        m_storage.resize(rows*cols, rows, cols);
       #endif
     }
 
@@ -300,9 +323,9 @@
       * \sa resize(Index,Index)
       */
     EIGEN_DEVICE_FUNC
-    inline void resize(NoChange_t, Index nbCols)
+    inline void resize(NoChange_t, Index cols)
     {
-      resize(rows(), nbCols);
+      resize(rows(), cols);
     }
 
     /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
@@ -314,9 +337,9 @@
       * \sa resize(Index,Index)
       */
     EIGEN_DEVICE_FUNC
-    inline void resize(Index nbRows, NoChange_t)
+    inline void resize(Index rows, NoChange_t)
     {
-      resize(nbRows, cols());
+      resize(rows, cols());
     }
 
     /** Resizes \c *this to have the same dimensions as \a other.
@@ -327,7 +350,7 @@
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
     {
       const OtherDerived& other = _other.derived();
@@ -352,13 +375,13 @@
       * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
       * conservativeResize(Index, NoChange_t).
       *
-      * Matrices are resized relative to the top-left element. In case values need to be 
+      * Matrices are resized relative to the top-left element. In case values need to be
       * appended to the matrix they will be uninitialized.
       */
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols)
     {
-      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
+      internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
     }
 
     /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -369,10 +392,10 @@
       * In case the matrix is growing, new rows will be uninitialized.
       */
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t)
     {
       // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(nbRows, cols());
+      conservativeResize(rows, cols());
     }
 
     /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -383,10 +406,10 @@
       * In case the matrix is growing, new columns will be uninitialized.
       */
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
+    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols)
     {
       // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(rows(), nbCols);
+      conservativeResize(rows(), cols);
     }
 
     /** Resizes the vector to \a size while retaining old values.
@@ -409,7 +432,7 @@
       * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
       * conservativeResize(Index, NoChange_t).
       *
-      * Matrices are resized relative to the top-left element. In case values need to be 
+      * Matrices are resized relative to the top-left element. In case values need to be
       * appended to the matrix they will copied from \c other.
       */
     template<typename OtherDerived>
@@ -445,6 +468,10 @@
       return Base::operator=(func);
     }
 
+    // Prevent user from trying to instantiate PlainObjectBase objects
+    // by making all its constructor protected. See bug 1074.
+  protected:
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
     {
@@ -456,41 +483,145 @@
     // FIXME is it still needed ?
     /** \internal */
     EIGEN_DEVICE_FUNC
-    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
+    explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
       : m_storage(internal::constructor_without_unaligned_array_assert())
     {
 //       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 #endif
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    PlainObjectBase(PlainObjectBase&& other)
+    PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
       : m_storage( std::move(other.m_storage) )
     {
     }
 
     EIGEN_DEVICE_FUNC
-    PlainObjectBase& operator=(PlainObjectBase&& other)
+    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_storage, other.m_storage);
+      _check_template_params();
+      m_storage = std::move(other.m_storage);
       return *this;
     }
 #endif
 
+    /** Copy constructor */
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
-      : m_storage(a_size, nbRows, nbCols)
+    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+      : Base(), m_storage(other.m_storage) { }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+      : m_storage(size, rows, cols)
     {
 //       _check_template_params();
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
-    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
+      *
+      * \only_for_vectors
+      *
+      * This constructor is for 1D array or vectors with more than 4 coefficients.
+      * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+      *
+      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+      * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : m_storage()
+    {
+      _check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+      m_storage.data()[0] = a0;
+      m_storage.data()[1] = a1;
+      m_storage.data()[2] = a2;
+      m_storage.data()[3] = a3;
+      Index i = 4;
+      auto x = {(m_storage.data()[i++] = args, 0)...};
+      static_cast<void>(x);
+    }
+
+    /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_storage()
+    {
+      _check_template_params();
+
+      size_t list_size = 0;
+      if (list.begin() != list.end()) {
+        list_size = list.begin()->size();
+      }
+
+      // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+      if (ColsAtCompileTime == 1 && list.size() == 1) {
+        eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        resize(list_size, ColsAtCompileTime);
+        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+      } else {
+        eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+        resize(list.size(), list_size);
+
+        Index row_index = 0;
+        for (const std::initializer_list<Scalar>& row : list) {
+          eigen_assert(list_size == row.size());
+          Index col_index = 0;
+          for (const Scalar& e : row) {
+            coeffRef(row_index, col_index) = e;
+            ++col_index;
+          }
+          ++row_index;
+        }
+      }
+    }
+    #endif  // end EIGEN_HAS_CXX11
+
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+      : m_storage()
+    {
+      _check_template_params();
+      resizeLike(other);
+      _set_noalias(other);
+    }
+
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
+      : m_storage()
+    {
+      _check_template_params();
+      resizeLike(other);
+      *this = other.derived();
+    }
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other)
+    {
+      _check_template_params();
+      // FIXME this does not automatically transpose vectors if necessary
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+  public:
+
+    /** \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
     {
       _resize_to_match(other);
@@ -498,22 +629,15 @@
       return this->derived();
     }
 
-    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
-      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      _check_template_params();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
-      Base::operator=(other.derived());
-    }
-
     /** \name Map
       * These are convenience functions returning Map objects. The Map() static functions return unaligned Map objects,
       * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
       * \a data pointers.
       *
+      * Here is an example using strides:
+      * \include Matrix_Map_stride.cpp
+      * Output: \verbinclude Matrix_Map_stride.out
+      *
       * \see class Map
       */
     //@{
@@ -583,20 +707,28 @@
     //@}
 
     using Base::setConstant;
-    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value);
-    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val);
 
     using Base::setZero;
     EIGEN_DEVICE_FUNC Derived& setZero(Index size);
     EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t);
 
     using Base::setOnes;
     EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
     EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t);
 
     using Base::setRandom;
     Derived& setRandom(Index size);
     Derived& setRandom(Index rows, Index cols);
+    Derived& setRandom(NoChange_t, Index cols);
+    Derived& setRandom(Index rows, NoChange_t);
 
     #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
     #include EIGEN_PLAINOBJECTBASE_PLUGIN
@@ -611,7 +743,7 @@
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
     {
       #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -638,29 +770,23 @@
       *
       * \internal
       */
+    // aliasing is dealt once in internal::call_assignment
+    // so at this stage we have to assume aliasing... and resising has to be done later.
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
     {
-      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
+      internal::call_assignment(this->derived(), other.derived());
       return this->derived();
     }
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
-
     /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
       * is the case when creating a new matrix) so one can enforce lazy evaluation.
       *
       * \sa operator=(const MatrixBase<OtherDerived>&), _set()
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
     {
       // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -668,55 +794,103 @@
       //_resize_to_match(other);
       // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
       // it wouldn't allow to copy a row-vector into a column-vector.
-      return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+      return this->derived();
     }
 
     template<typename T0, typename T1>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
     {
-      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
-                          bool(NumTraits<T1>::IsInteger),
+      const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
+      const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
+      EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
+                          t1_is_integer_alike,
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
-      resize(nbRows,nbCols);
+      resize(rows,cols);
     }
+
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
-      m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
     }
 
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<T0,Index>::value)
+                                                                  && (internal::is_same<T1,Index>::value)
+                                                                  && Base::SizeAtCompileTime==2,T1>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+
+    // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
+    // then the argument is meant to be the size of the object.
     template<typename T>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<Base::SizeAtCompileTime!=1,T>::type* = 0)
+    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<    (Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value)
+                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
     {
-      EIGEN_STATIC_ASSERT(bool(NumTraits<T>::IsInteger),
+      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
+      const bool is_integer_alike = internal::is_valid_index_type<T>::value;
+      EIGEN_UNUSED_VARIABLE(is_integer_alike);
+      EIGEN_STATIC_ASSERT(is_integer_alike,
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
       resize(size);
     }
+
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
     template<typename T>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1,T>::type* = 0)
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
       m_storage.data()[0] = val0;
     }
 
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime==1
+                                                                  && internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = Scalar(val0);
+    }
+
+    // Initialize a fixed size matrix from a pointer to raw data
     template<typename T>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init1(const Scalar* data){
       this->_set_noalias(ConstMapType(data));
     }
 
+    // Initialize an arbitrary matrix from a dense expression
     template<typename T, typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
       this->_set_noalias(other);
     }
 
+    // Initialize an arbitrary matrix from an object convertible to the Derived type.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Derived& other){
+      this->_set_noalias(other);
+    }
+
+    // Initialize an arbitrary matrix from a generic Eigen expression
     template<typename T, typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
@@ -738,27 +912,63 @@
       this->derived() = r;
     }
 
+    // For fixed-size Array<Scalar,...>
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
+                                    typename internal::enable_if<    Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+
+    // For fixed-size Array<Index,...>
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T*>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+
     template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
     friend struct internal::matrix_swap_impl;
 
-    /** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
+  public:
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal
+      * \brief Override DenseBase::swap() since for dynamic-sized matrices
+      * of same type it is enough to swap the data pointers.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void _swap(DenseBase<OtherDerived> const & other)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(DenseBase<OtherDerived> & other)
     {
       enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
-      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
+      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
     }
 
-  public:
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    EIGEN_DEVICE_FUNC 
+    /** \internal
+      * \brief const version forwarded to DenseBase::swap
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(DenseBase<OtherDerived> const & other)
+    { Base::swap(other.derived()); }
+
+    EIGEN_DEVICE_FUNC
     static EIGEN_STRONG_INLINE void _check_template_params()
     {
-      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
-                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0)
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor)
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0)
                         && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0))
                         && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0))
                         && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0))
@@ -768,10 +978,20 @@
                         && (Options & (DontAlign|RowMajor)) == Options),
         INVALID_MATRIX_TEMPLATE_PARAMETERS)
     }
-#endif
 
-private:
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+    enum { IsPlainObjectBase = 1 };
+#endif
+  public:
+    // These apparently need to be down here for nvcc+icc to prevent duplicate
+    // Map symbol.
+    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
+    friend class Eigen::Map<Derived, Unaligned>;
+    friend class Eigen::Map<const Derived, Unaligned>;
+#if EIGEN_MAX_ALIGN_BYTES>0
+    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
+    friend class Eigen::Map<Derived, AlignedMax>;
+    friend class Eigen::Map<const Derived, AlignedMax>;
+#endif
 };
 
 namespace internal {
@@ -779,14 +999,19 @@
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
-  typedef typename Derived::Index Index;
+  #if EIGEN_HAS_TYPE_TRAITS
+  static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+  #else
+  static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
+  #endif
   static void run(DenseBase<Derived>& _this, Index rows, Index cols)
   {
     if (_this.rows() == rows && _this.cols() == cols) return;
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
 
-    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
+    if ( IsRelocatable
+          && (( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
+              (!Derived::IsRowMajor && _this.rows() == rows) ))  // column-major and we change only the number of columns
     {
       internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
       _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@@ -794,9 +1019,9 @@
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(rows,cols);
-      const Index common_rows = (std::min)(rows, _this.rows());
-      const Index common_cols = (std::min)(cols, _this.cols());
+      Derived tmp(rows,cols);
+      const Index common_rows = numext::mini(rows, _this.rows());
+      const Index common_cols = numext::mini(cols, _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
       _this.derived().swap(tmp);
     }
@@ -814,8 +1039,9 @@
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
 
-    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
+    if ( IsRelocatable &&
+          (( Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
+           (!Derived::IsRowMajor && _this.rows() == other.rows()) ))  // column-major and we change only the number of columns
     {
       const Index new_rows = other.rows() - _this.rows();
       const Index new_cols = other.cols() - _this.cols();
@@ -828,9 +1054,9 @@
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(other);
-      const Index common_rows = (std::min)(tmp.rows(), _this.rows());
-      const Index common_cols = (std::min)(tmp.cols(), _this.cols());
+      Derived tmp(other);
+      const Index common_rows = numext::mini(tmp.rows(), _this.rows());
+      const Index common_cols = numext::mini(tmp.cols(), _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
       _this.derived().swap(tmp);
     }
@@ -843,14 +1069,18 @@
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
   : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
-  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
-  
-  typedef typename Derived::Index Index;
+  typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
+  using Base::run;
+  using Base::IsRelocatable;
+
   static void run(DenseBase<Derived>& _this, Index size)
   {
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
     const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
-    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
   }
 
   static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@@ -861,7 +1091,10 @@
 
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
     const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
-    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
 
     if (num_new_elements > 0)
       _this.tail(num_new_elements) = other.tail(num_new_elements);
@@ -872,7 +1105,7 @@
 struct matrix_swap_impl
 {
   EIGEN_DEVICE_FUNC
-  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+  static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     a.base().swap(b);
   }

diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 5d3789b..70a6c10 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h

@@ -12,53 +12,85 @@
 
 namespace Eigen {
 
-template<typename Lhs, typename Rhs> class Product;
-template<typename Lhs, typename Rhs, typename StorageKind> class ProductImpl;
+template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, int Option>
+struct traits<Product<Lhs, Rhs, Option> >
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename remove_all<Rhs>::type RhsCleaned;
+  typedef traits<LhsCleaned> LhsTraits;
+  typedef traits<RhsCleaned> RhsTraits;
+
+  typedef MatrixXpr XprKind;
+
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
+  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
+                                                typename RhsTraits::StorageKind,
+                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
+  typedef typename promote_index_type<typename LhsTraits::StorageIndex,
+                                      typename RhsTraits::StorageIndex>::type StorageIndex;
+
+  enum {
+    RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
+    ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
+
+    // FIXME: only needed by GeneralMatrixMatrixTriangular
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
+
+    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
+    Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
+          : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+          : (   ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
+             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) ) ? RowMajorBit
+          : NoPreferredStorageOrderBit
+  };
+};
+
+} // end namespace internal
 
 /** \class Product
   * \ingroup Core_Module
   *
   * \brief Expression of the product of two arbitrary matrices or vectors
   *
-  * \param Lhs the type of the left-hand side expression
-  * \param Rhs the type of the right-hand side expression
+  * \tparam _Lhs the type of the left-hand side expression
+  * \tparam _Rhs the type of the right-hand side expression
   *
   * This class represents an expression of the product of two arbitrary matrices.
   *
+  * The other template parameters are:
+  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
+  *
   */
-
-// Use ProductReturnType to get correct traits, in particular vectorization flags
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<Product<Lhs, Rhs> >
-  : traits<typename ProductReturnType<Lhs, Rhs>::Type>
-{ 
-  // We want A+B*C to be of type Product<Matrix, Sum> and not Product<Matrix, Matrix>
-  // TODO: This flag should eventually go in a separate evaluator traits class
-  enum {
-    Flags = traits<typename ProductReturnType<Lhs, Rhs>::Type>::Flags & ~(EvalBeforeNestingBit | DirectAccessBit)
-  };
-};
-} // end namespace internal
-
-
-template<typename Lhs, typename Rhs>
-class Product : public ProductImpl<Lhs,Rhs,typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                                                            typename internal::traits<Rhs>::StorageKind>::ret>
+template<typename _Lhs, typename _Rhs, int Option>
+class Product : public ProductImpl<_Lhs,_Rhs,Option,
+                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
+                                                                                   typename internal::traits<_Rhs>::StorageKind,
+                                                                                   internal::product_type<_Lhs,_Rhs>::ret>::ret>
 {
   public:
-    
+
+    typedef _Lhs Lhs;
+    typedef _Rhs Rhs;
+
     typedef typename ProductImpl<
-        Lhs, Rhs,
-        typename internal::promote_storage_type<typename Lhs::StorageKind,
-                                                typename Rhs::StorageKind>::ret>::Base Base;
+        Lhs, Rhs, Option,
+        typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                                        typename internal::traits<Rhs>::StorageKind,
+                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
 
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename Rhs::Nested RhsNested;
+    typedef typename internal::ref_selector<Lhs>::type LhsNested;
+    typedef typename internal::ref_selector<Rhs>::type RhsNested;
     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows()
@@ -66,10 +98,14 @@
         && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
     }
 
-    inline Index rows() const { return m_lhs.rows(); }
-    inline Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const RhsNestedCleaned& rhs() const { return m_rhs; }
 
   protected:
@@ -78,29 +114,77 @@
     RhsNested m_rhs;
 };
 
-template<typename Lhs, typename Rhs>
-class ProductImpl<Lhs,Rhs,Dense> : public internal::dense_xpr_base<Product<Lhs,Rhs> >::type
-{
-    typedef Product<Lhs, Rhs> Derived;
-  public:
+namespace internal {
 
-    typedef typename internal::dense_xpr_base<Product<Lhs, Rhs> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
+class dense_product_base
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{};
+
+/** Conversion to scalar for inner-products */
+template<typename Lhs, typename Rhs, int Option>
+class dense_product_base<Lhs, Rhs, Option, InnerProduct>
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{
+  typedef Product<Lhs,Rhs,Option> ProductXpr;
+  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
+public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
+  {
+    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
+  }
 };
 
-/***************************************************************************
-* Implementation of matrix base methods
-***************************************************************************/
+} // namespace internal
 
-
-/** \internal used to test the evaluator only
-  */
-template<typename Lhs,typename Rhs>
-const Product<Lhs,Rhs>
-prod(const Lhs& lhs, const Rhs& rhs)
+// Generic API dispatcher
+template<typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
 {
-  return Product<Lhs,Rhs>(lhs,rhs);
-}
+  public:
+    typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
+};
+
+template<typename Lhs, typename Rhs, int Option>
+class ProductImpl<Lhs,Rhs,Option,Dense>
+  : public internal::dense_product_base<Lhs,Rhs,Option>
+{
+    typedef Product<Lhs, Rhs, Option> Derived;
+
+  public:
+
+    typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  protected:
+    enum {
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
+                   (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
+      EnableCoeff = IsOneByOne || Option==LazyProduct
+    };
+
+  public:
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+
+      return internal::evaluator<Derived>(derived()).coeff(i);
+    }
+
+
+};
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 855914f..8cf294b 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h

@@ -14,97 +14,467 @@
 #define EIGEN_PRODUCTEVALUATORS_H
 
 namespace Eigen {
-  
+
 namespace internal {
-  
-// We can evaluate the product either all at once, like GeneralProduct and its evalTo() function, or
-// traverse the matrix coefficient by coefficient, like CoeffBasedProduct.  Use the existing logic
-// in ProductReturnType to decide.
 
-template<typename XprType, typename ProductType>
-struct product_evaluator_dispatcher;
-
-template<typename Lhs, typename Rhs>
-struct evaluator_impl<Product<Lhs, Rhs> >
-  : product_evaluator_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type> 
+/** \internal
+  * Evaluator of a product expression.
+  * Since products require special treatments to handle all possible cases,
+  * we simply defer the evaluation logic to a product_evaluator class
+  * which offers more partial specialization possibilities.
+  *
+  * \sa class product_evaluator
+  */
+template<typename Lhs, typename Rhs, int Options>
+struct evaluator<Product<Lhs, Rhs, Options> >
+ : public product_evaluator<Product<Lhs, Rhs, Options> >
 {
-  typedef Product<Lhs, Rhs> XprType;
-  typedef product_evaluator_dispatcher<XprType, typename ProductReturnType<Lhs, Rhs>::Type> Base;
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef product_evaluator<XprType> Base;
 
-  evaluator_impl(const XprType& xpr) : Base(xpr) 
-  { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
-template<typename XprType, typename ProductType>
-struct product_evaluator_traits_dispatcher;
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
+// TODO we should apply that rule only if that's really helpful
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                               const Product<Lhs, Rhs, DefaultProduct> > >
+{
+  static const bool value = true;
+};
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > >
+ : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
 
-template<typename Lhs, typename Rhs>
-struct evaluator_traits<Product<Lhs, Rhs> >
-  : product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type> 
-{ 
-  static const int AssumeAliasing = 1;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
+  {}
 };
 
-// Case 1: Evaluate all at once
-//
-// We can view the GeneralProduct class as a part of the product evaluator. 
-// Four sub-cases: InnerProduct, OuterProduct, GemmProduct and GemvProduct.
-// InnerProduct is special because GeneralProduct does not have an evalTo() method in this case.
 
-template<typename Lhs, typename Rhs>
-struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> > 
+template<typename Lhs, typename Rhs, int DiagIndex>
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
+ : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
 {
-  static const int HasEvalTo = 0;
+  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
+  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
+        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
+        xpr.index() ))
+  {}
 };
 
+
+// Helper class to perform a matrix product with the destination at hand.
+// Depending on the sizes of the factors, there are different evaluation strategies
+// as controlled by internal::product_type.
+template< typename Lhs, typename Rhs,
+          typename LhsShape = typename evaluator_traits<Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<Rhs>::Shape,
+          int ProductType = internal::product_type<Lhs,Rhs>::value>
+struct generic_product_impl;
+
 template<typename Lhs, typename Rhs>
-struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> > 
-  : public evaluator<typename Product<Lhs, Rhs>::PlainObject>::type
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > {
+  static const bool value = true;
+};
+
+// This is the default evaluator implementation for products:
+// It creates a temporary and call generic_product_impl
+template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
+  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
 {
-  typedef Product<Lhs, Rhs> XprType;
+  typedef Product<Lhs, Rhs, Options> XprType;
   typedef typename XprType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type evaluator_base;
+  typedef evaluator<PlainObject> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
 
-  // TODO: Computation is too early (?)
-  product_evaluator_dispatcher(const XprType& xpr) : evaluator_base(m_result)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
   {
-    m_result.coeffRef(0,0) = (xpr.lhs().transpose().cwiseProduct(xpr.rhs())).sum();
+    ::new (static_cast<Base*>(this)) Base(m_result);
+
+// FIXME shall we handle nested_eval here?,
+// if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
+//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+//
+//     const LhsNested lhs(xpr.lhs());
+//     const RhsNested rhs(xpr.rhs());
+//
+//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+
+    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
-// For the other three subcases, simply call the evalTo() method of GeneralProduct
-// TODO: GeneralProduct should take evaluators, not expression objects.
+// The following three shortcuts are enabled only if the scalar types match exactly.
+// TODO: we could enable them for different scalar types when the product is not vectorized.
 
-template<typename Lhs, typename Rhs, int ProductType>
-struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> > 
+// Dense = Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
-  static const int HasEvalTo = 1;
-};
-
-template<typename Lhs, typename Rhs, int ProductType>
-struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> > 
-{
-  typedef Product<Lhs, Rhs> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type evaluator_base;
-  
-  product_evaluator_dispatcher(const XprType& xpr) : m_xpr(xpr)
-  { }
-  
-  template<typename DstEvaluatorType, typename DstXprType>
-  void evalTo(DstEvaluatorType /* not used */, DstXprType& dst) const
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
-    dst.resize(m_xpr.rows(), m_xpr.cols());
-    GeneralProduct<Lhs, Rhs, ProductType>(m_xpr.lhs(), m_xpr.rhs()).evalTo(dst);
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
   }
-  
-protected: 
-  const XprType& m_xpr;
 };
 
+// Dense += Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+
+// Dense ?= scalar * Product
+// TODO we should apply that rule if that's really helpful
+// for instance, this is not good for inner products
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
+  }
+};
+
+//----------------------------------------
+// Catch "Dense ?= xpr + Product<>" expression to save one temporary
+// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product
+{
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
+  {
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
+                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
+    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
+
+//----------------------------------------
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
+{
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
+};
+
+
+/***********************************************************************
+*  Implementation of outer dense * dense vector product
+***********************************************************************/
+
+// Column major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+{
+  evaluator<Rhs> rhsEval(rhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
+  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dst.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
+}
+
+// Row major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+{
+  evaluator<Lhs> lhsEval(lhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
+  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dst.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
+}
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
+{
+  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
+  struct set  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct adds {
+    Scalar m_scale;
+    explicit adds(const Scalar& s) : m_scale(s) {}
+    template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += m_scale * src;
+    }
+  };
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+  }
+
+};
+
+
+// This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
+template<typename Lhs, typename Rhs, typename Derived>
+struct generic_product_impl_base
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
+
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
+{
+  typedef typename nested_eval<Lhs,1>::type LhsNested;
+  typedef typename nested_eval<Rhs,1>::type RhsNested;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
+
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    // Fallback to inner product if both the lhs and rhs is a runtime vector.
+    if (lhs.rows() == 1 && rhs.cols() == 1) {
+      dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
+      return;
+    }
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
+    internal::gemv_dense_selector<Side,
+                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
+                           >::run(actual_lhs, actual_rhs, dst, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
+    // but easier on the compiler side
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() += lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() -= lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template<typename Dst, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
+  {
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+      ConjRhs = blas_traits<Rhs>::NeedToConjugate
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
+
+    eval_dynamic_impl(dst,
+                      blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
+                      func,
+                      actualAlpha,
+                      typename conditional<HasScalarFactor,true_type,false_type>::type());
+  }
+
+protected:
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar&  s /* == 1 */, false_type)
+  {
+    EIGEN_UNUSED_VARIABLE(s);
+    eigen_internal_assert(s==Scalar(1));
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
+  {
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+  }
+};
+
+// This specialization enforces the use of a coefficient-based evaluation strategy
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductMode>
+  : generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> {};
+
 // Case 2: Evaluate coeff by coeff
 //
 // This is mostly taken from CoeffBasedProduct.h
@@ -117,290 +487,688 @@
 template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl;
 
-template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
-struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
+    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
 {
-  static const int HasEvalTo = 0;
-};
-
-template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
-struct product_evaluator_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
-  : evaluator_impl_base<Product<Lhs, Rhs> >
-{
-  typedef Product<Lhs, Rhs> XprType;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, Flags> CoeffBasedProductType;
-
-  product_evaluator_dispatcher(const XprType& xpr) 
-    : m_lhsImpl(xpr.lhs()), 
-      m_rhsImpl(xpr.rhs()),  
-      m_innerDim(xpr.lhs().cols())
-  { }
-
-  typedef typename XprType::Index Index;
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()),
+      m_rhs(xpr.rhs()),
+      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
+      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
+                            //       or perhaps declare them on the fly on the packet method... We have experiment to check what's best.
+      m_innerDim(xpr.lhs().cols())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
+  }
 
   // Everything below here is taken from CoeffBasedProduct.h
 
+  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+
+  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+  typedef evaluator<LhsNestedCleaned> LhsEtorType;
+  typedef evaluator<RhsNestedCleaned> RhsEtorType;
+
   enum {
-    RowsAtCompileTime = traits<CoeffBasedProductType>::RowsAtCompileTime,
-    PacketSize = packet_traits<Scalar>::size,
-    InnerSize  = traits<CoeffBasedProductType>::InnerSize,
-    CoeffReadCost = traits<CoeffBasedProductType>::CoeffReadCost,
-    Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-    CanVectorizeInner = traits<CoeffBasedProductType>::CanVectorizeInner
+    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
+    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
   };
 
-  typedef typename evaluator<Lhs>::type LhsEtorType;
-  typedef typename evaluator<Rhs>::type RhsEtorType;
-  typedef etor_product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
-                                  Unroll ? InnerSize-1 : Dynamic,
-                                  LhsEtorType, RhsEtorType, Scalar> CoeffImpl;
+  typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
+  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
 
-  const CoeffReturnType coeff(Index row, Index col) const
+  enum {
+
+    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
+    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
+    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
+                  : InnerSize == Dynamic ? HugeCost
+                    : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost))
+                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+
+    LhsFlags = LhsEtorType::Flags,
+    RhsFlags = RhsEtorType::Flags,
+
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
+    RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
+
+    // Here, we don't care about alignment larger than the usable packet size.
+    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
+    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
+
+    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
+
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),
+
+    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
+
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit)
+          | (EvalToRowMajor ? RowMajorBit : 0)
+          // TODO enable vectorization for mixed types
+          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
+          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
+
+    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+              : 0,
+
+    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+     */
+    CanVectorizeInner =    SameType
+                        && LhsRowMajor
+                        && (!RhsRowMajor)
+                        && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit)
+                        && (int(InnerSize) % packet_traits<Scalar>::size == 0)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
   {
-    Scalar res;
-    CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
-    return res;
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
   }
 
   /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
    * which is why we don't set the LinearAccessBit.
+   * TODO: this seems possible when the result is a vector
    */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const CoeffReturnType coeff(Index index) const
   {
-    Scalar res;
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
-    CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
-    return res;
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
   }
 
-  template<int LoadMode>
-  const PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const PacketType packet(Index row, Index col) const
   {
-    PacketScalar res;
-    typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-				     Unroll ? InnerSize-1 : Dynamic,
-				     LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic,
+                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;
     PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
     return res;
   }
 
+  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const PacketType packet(Index index) const
+  {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
+    return packet<LoadMode,PacketType>(row,col);
+  }
+
 protected:
-  typename evaluator<Lhs>::type m_lhsImpl;
-  typename evaluator<Rhs>::type m_rhsImpl;
+  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+
+  LhsEtorType m_lhsImpl;
+  RhsEtorType m_rhsImpl;
 
   // TODO: Get rid of m_innerDim if known at compile time
   Index m_innerDim;
 };
 
-/***************************************************************************
-* Normal product .coeff() implementation (with meta-unrolling)
-***************************************************************************/
-
-/**************************************
-*** Scalar path  - no vectorization ***
-**************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct etor_product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
-  {
-    etor_product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, innerDim, res);
-    res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct etor_product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, RetScalar &res)
-  {
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct etor_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar& res)
-  {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-    for(Index i = 1; i < innerDim; ++i)
-      res += lhs.coeff(row, i) * rhs.coeff(i, col);
-  }
-};
-
-/*******************************************
-*** Scalar path with inner vectorization ***
-*******************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
-struct etor_product_coeff_vectorized_unroller
-{
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::PacketScalar &pres)
-  {
-    etor_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
-    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet>
-struct etor_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::PacketScalar &pres)
-  {
-    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct etor_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::PacketScalar Packet;
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
-  {
-    Packet pres;
-    etor_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
-    etor_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, innerDim, res);
-    res = predux(pres);
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
-struct etor_product_coeff_vectorized_dyn_selector
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
-// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
-template<typename Lhs, typename Rhs, int RhsCols>
-struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows>
-struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
-  }
-};
-
 template<typename Lhs, typename Rhs>
-struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
+  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape>
 {
-  typedef typename Lhs::Index Index;
-  EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs).sum();
-  }
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
+  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
+  {}
 };
 
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct etor_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::Scalar &res)
-  {
-    etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, innerDim, res);
-  }
-};
-
-/*******************
-*** Packet path  ***
-*******************/
+/****************************************
+*** Coeff based product, Packet path  ***
+****************************************/
 
 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
 
 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
 
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-    for(Index i = 1; i < innerDim; ++i)
-      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
 };
 
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-    for(Index i = 1; i < innerDim; ++i)
-      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+
+/***************************************************************************
+* Triangular products
+***************************************************************************/
+template<int Mode, bool LhsIsTriangular,
+         typename Lhs, bool LhsIsVector,
+         typename Rhs, bool RhsIsVector>
+struct triangular_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Lhs::Mode,true,typename Lhs::MatrixType,false,Rhs, Rhs::ColsAtCompileTime==1>
+        ::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Rhs::Mode,false,Lhs,Lhs::RowsAtCompileTime==1, typename Rhs::MatrixType, false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* SelfAdjoint products
+***************************************************************************/
+template <typename Lhs, int LhsMode, bool LhsIsVector,
+          typename Rhs, int RhsMode, bool RhsIsVector>
+struct selfadjoint_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<Lhs,0,Lhs::IsVectorAtCompileTime,typename Rhs::MatrixType,Rhs::Mode,false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* Diagonal products
+***************************************************************************/
+
+template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
+struct diagonal_product_evaluator_base
+  : evaluator_base<Derived>
+{
+   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+public:
+  enum {
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) + int(evaluator<DiagonalType>::CoeffReadCost),
+
+    MatrixFlags = evaluator<MatrixType>::Flags,
+    DiagFlags = evaluator<DiagonalType>::Flags,
+
+    _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
+                  : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
+                  : MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
+
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable =   bool(int(MatrixFlags)&PacketAccessBit)
+                  &&  _SameTypes
+                  && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
+                  && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
+  };
+
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+    : m_diagImpl(diag), m_matImpl(mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+  {
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+  }
+
+protected:
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
+  {
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
+  {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
+  }
+
+  evaluator<DiagonalType> m_diagImpl;
+  evaluator<MatrixType>   m_matImpl;
+};
+
+// diagonal * dense
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
+  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
+{
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
+
+
+  enum { StorageOrder = Base::_StorageOrder };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.rhs(), xpr.lhs().diagonal())
+  {
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
+  }
+
+#ifndef EIGEN_GPUCC
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
+                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+// dense * diagonal
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
+  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
+{
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  enum { StorageOrder = Base::_StorageOrder };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs().diagonal())
+  {
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
+  }
+
+#ifndef EIGEN_GPUCC
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
+                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+/***************************************************************************
+* Products with permutation matrices
+***************************************************************************/
+
+/** \internal
+  * \class permutation_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct permutation_matrix_product;
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+
+    template<typename Dest, typename PermutationType>
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    {
+      MatrixType mat(xpr);
+      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+      //if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+      if(is_same_dense(dst, mat))
+      {
+        // apply the permutation inplace
+        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(perm.size());
+        mask.fill(false);
+        Index r = 0;
+        while(r < perm.size())
+        {
+          // search for the next seed
+          while(r<perm.size() && mask[r]) r++;
+          if(r>=perm.size())
+            break;
+          // we got one, let's follow it until we are back to the seed
+          Index k0 = r++;
+          Index kPrev = k0;
+          mask.coeffRef(k0) = true;
+          for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k))
+          {
+                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+            mask.coeffRef(k) = true;
+            kPrev = k;
+          }
+        }
+      }
+      else
+      {
+        for(Index i = 0; i < n; ++i)
+        {
+          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+               (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+
+          =
+
+          Block<const MatrixTypeCleaned,Side==OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixTypeCleaned::ColsAtCompileTime>
+               (mat, ((Side==OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+        }
+      }
+    }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  {
+    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+
+/***************************************************************************
+* Products with transpositions matrices
+***************************************************************************/
+
+// FIXME could we unify Transpositions and Permutation into a single "shape"??
+
+/** \internal
+  * \class transposition_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product
+{
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+
+  template<typename Dest, typename TranspositionType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  {
+    MatrixType mat(xpr);
+    typedef typename TranspositionType::StorageIndex StorageIndex;
+    const Index size = tr.size();
+    StorageIndex j = 0;
+
+    if(!is_same_dense(dst,mat))
+      dst = mat;
+
+    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+      if(Index(j=tr.coeff(k))!=k)
+      {
+        if(Side==OnTheLeft)        dst.row(k).swap(dst.row(j));
+        else if(Side==OnTheRight)  dst.col(k).swap(dst.col(j));
+      }
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
   }
 };
 

diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index a5aa257..dab2ac8 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h

@@ -10,24 +10,13 @@
 #ifndef EIGEN_RANDOM_H
 #define EIGEN_RANDOM_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
 
 template<typename Scalar> struct scalar_random_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
-
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const {
-#ifndef __CUDA_ARCH__
-    // We're not compiling a cuda kernel
-    return random<Scalar>();
-#else
-    // We're trying to generate a random number from a cuda kernel.
-    assert(false && "Generating random numbers on gpu isn't supported yet");
-    return Scalar(0);
-#endif
-  }
+  inline const Scalar operator() () const { return random<Scalar>(); }
 };
 
 template<typename Scalar>
@@ -40,16 +29,16 @@
   *
   * Numbers are uniformly spread through their whole definition range for integer types,
   * and in the [-1:1] range for floating point scalar types.
-  *
+  * 
   * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * \not_reentrant
-  *
+  * 
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
   * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
   * instead.
-  *
+  * 
   *
   * Example: \include MatrixBase_random_int_int.cpp
   * Output: \verbinclude MatrixBase_random_int_int.out
@@ -57,13 +46,13 @@
   * This expression has the "evaluate before nesting" flag so that it will be evaluated into
   * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
-  *
+  * 
   * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
   *
   * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index rows, Index cols)
 {
   return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
@@ -94,7 +83,7 @@
   * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index size)
 {
   return NullaryExpr(size, internal::scalar_random_op<Scalar>());
@@ -104,7 +93,7 @@
   *
   * Numbers are uniformly spread through their whole definition range for integer types,
   * and in the [-1:1] range for floating point scalar types.
-  *
+  * 
   * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
   * need to use the variants taking size arguments.
   *
@@ -114,13 +103,13 @@
   * This expression has the "evaluate before nesting" flag so that it will be evaluated into
   * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
-  *
+  * 
   * \not_reentrant
   *
   * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random()
 {
   return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
@@ -130,17 +119,16 @@
   *
   * Numbers are uniformly spread through their whole definition range for integer types,
   * and in the [-1:1] range for floating point scalar types.
-  *
+  * 
   * \not_reentrant
-  *
+  * 
   * Example: \include MatrixBase_setRandom.cpp
   * Output: \verbinclude MatrixBase_setRandom.out
   *
   * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
 {
   return *this = Random(rows(), cols());
 }
@@ -149,7 +137,7 @@
   *
   * Numbers are uniformly spread through their whole definition range for integer types,
   * and in the [-1:1] range for floating point scalar types.
-  *
+  * 
   * \only_for_vectors
   * \not_reentrant
   *
@@ -172,9 +160,9 @@
   * and in the [-1:1] range for floating point scalar types.
   *
   * \not_reentrant
-  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * 
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setRandom_int_int.cpp
   * Output: \verbinclude Matrix_setRandom_int_int.out
@@ -183,12 +171,48 @@
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setRandom();
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(NoChange_t, Index cols)
+{
+  return setRandom(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index rows, NoChange_t)
+{
+  return setRandom(rows, cols());
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_RANDOM_H

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 98c182c..b6790d1 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h

@@ -23,22 +23,29 @@
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/
 
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    PacketSize = unpacket_traits<PacketType>::size,
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime,
+    OuterMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxRowsAtCompileTime
+                 : Evaluator::MaxColsAtCompileTime,
+    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
+                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
+                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
   };
 
   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
   };
 
 public:
@@ -50,21 +57,36 @@
 
 public:
   enum {
-    Cost = (  Derived::SizeAtCompileTime == Dynamic
-           || Derived::CoeffReadCost == Dynamic
-           || (Derived::SizeAtCompileTime!=1 && functor_traits<Func>::Cost == Dynamic)
-           ) ? Dynamic
-           : Derived::SizeAtCompileTime * Derived::CoeffReadCost
-               + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
 public:
   enum {
-    Unrolling = Cost != Dynamic && Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
+    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
   };
+  
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(OuterMaxSize)
+    EIGEN_DEBUG_VAR(SliceVectorizedWork)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
 };
 
 /***************************************************************************
@@ -73,88 +95,86 @@
 
 /*** no vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
   {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
   }
 };
 
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
   EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
 /*** vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
-  enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func)
   {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      HalfLength = Length/2
+    };
+
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) );
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
-  enum {
-    index = Start * packet_traits<typename Derived::Scalar>::size,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&)
   {
-    return mat.template packetByOuterInner<alignment>(outer, inner);
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      index = Start * PacketSize,
+      outer = index / int(Evaluator::InnerSizeAtCompileTime),
+      inner = index % int(Evaluator::InnerSizeAtCompileTime),
+      alignment = Evaluator::Alignment
+    };
+    return eval.template packetByOuterInner<alignment,PacketType>(outer, inner);
   }
 };
 
@@ -162,54 +182,65 @@
 * Part 3 : implementation of all cases
 ***************************************************************************/
 
-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < xpr.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
-
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
-
-  static Scalar run(const Derived& mat, const Func& func)
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
   {
-    const Index size = mat.size();
-    eigen_assert(size && "you are using an empty matrix");
-    const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat);
+    return Base::run(eval,func);
+  }
+};
+
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  {
+    const Index size = xpr.size();
+    
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
+    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
+    const Index alignedStart = internal::first_default_aligned(xpr);
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -217,104 +248,145 @@
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
 
       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
       for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
 
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
+// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
 
-  static Scalar run(const Derived& mat, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
     enum {
-      packetSize = packet_traits<Scalar>::size
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
     }
 
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
   enum {
-    PacketSize = packet_traits<Scalar>::size,
-    Size = Derived::SizeAtCompileTime,
-    VectorizedSize = (Size / PacketSize) * PacketSize
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
+    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
   };
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
       if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
     }
   }
 };
 
+// evaluator adaptor
+template<typename _XprType>
+class redux_evaluator : public internal::evaluator<_XprType>
+{
+  typedef internal::evaluator<_XprType> Base;
+public:
+  typedef _XprType XprType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  
+  enum {
+    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
+    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
+    Flags = Base::Flags & ~DirectAccessBit,
+    IsRowMajor = XprType::IsRowMajor,
+    SizeAtCompileTime = XprType::SizeAtCompileTime,
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
+  };
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketType packetByOuterInner(Index outer, Index inner) const
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+};
+
 } // end namespace internal
 
 /***************************************************************************
@@ -325,55 +397,70 @@
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
   *
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
-  * an associative operator. Both current STL and TR1 functor styles are handled.
+  * an associative operator. Both current C++98 and C++11 functor styles are handled.
+  *
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
 template<typename Func>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
-  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
-  return internal::redux_impl<Func, ThisNested>
-            ::run(derived(), func);
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }
 
 /** \returns the minimum of all coefficients of \c *this.
-  * \warning the result is undefined if \c *this contains NaN.
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+template<int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return this->redux(Eigen::internal::scalar_min_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
 }
 
-/** \returns the maximum of all coefficients of \c *this.
-  * \warning the result is undefined if \c *this contains NaN.
+/** \returns the maximum of all coefficients of \c *this. 
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+template<int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return this->redux(Eigen::internal::scalar_max_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
 }
 
-/** \returns the sum of all coefficients of *this
+/** \returns the sum of all coefficients of \c *this
+  *
+  * If \c *this is empty, then the value 0 is returned.
   *
   * \sa trace(), prod(), mean()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(0);
-  return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
 }
 
 /** \returns the mean of all coefficients of *this
@@ -381,11 +468,17 @@
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
 }
 
 /** \returns the product of all coefficients of *this
@@ -396,13 +489,12 @@
   * \sa sum(), mean(), trace()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(1);
-  return this->redux(Eigen::internal::scalar_product_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
 }
 
 /** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
@@ -412,7 +504,7 @@
   * \sa diagonal(), sum()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
   return derived().diagonal().sum();

diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index cd6d949..c2a37ea 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h

@@ -10,80 +10,7 @@
 #ifndef EIGEN_REF_H
 #define EIGEN_REF_H
 
-namespace Eigen { 
-
-template<typename Derived> class RefBase;
-template<typename PlainObjectType, int Options = 0,
-         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
-
-/** \class Ref
-  * \ingroup Core_Module
-  *
-  * \brief A matrix or vector expression mapping an existing expressions
-  *
-  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
-  *                The default is \c #Unaligned.
-  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
-  *                   but accept a variable outer stride (leading dimension).
-  *                   This can be overridden by specifying strides.
-  *                   The type passed here must be a specialization of the Stride template, see examples below.
-  *
-  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
-  * A Ref<> object can represent either a const expression or a l-value:
-  * \code
-  * // in-out argument:
-  * void foo1(Ref<VectorXf> x);
-  *
-  * // read-only const argument:
-  * void foo2(const Ref<const VectorXf>& x);
-  * \endcode
-  *
-  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
-  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
-  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
-  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
-  * can be greater than the number of rows.
-  *
-  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
-  * Here are some examples:
-  * \code
-  * MatrixXf A;
-  * VectorXf a;
-  * foo1(a.head());             // OK
-  * foo1(A.col());              // OK
-  * foo1(A.row());              // compilation error because here innerstride!=1
-  * foo2(A.row());              // The row is copied into a contiguous temporary
-  * foo2(2*a);                  // The expression is evaluated into a temporary
-  * foo2(A.col().segment(2,4)); // No temporary
-  * \endcode
-  *
-  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
-  * Here is an example accepting an innerstride!=1:
-  * \code
-  * // in-out argument:
-  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
-  * foo3(A.row());              // OK
-  * \endcode
-  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
-  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
-  * template function, e.g.:
-  * \code
-  * // in the .h:
-  * void foo(const Ref<MatrixXf>& A);
-  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
-  *
-  * // in the .cpp:
-  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
-  *     ... // crazy code goes here
-  * }
-  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
-  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
-  * \endcode
-  *
-  *
-  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
-  */
+namespace Eigen {
 
 namespace internal {
 
@@ -95,24 +22,33 @@
   typedef _StrideType StrideType;
   enum {
     Options = _Options,
-    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
   };
 
   template<typename Derived> struct match {
     enum {
+      IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
       HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
       InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                       || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                       || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
-      OuterStrideMatch = Derived::IsVectorAtCompileTime
+      OuterStrideMatch = IsVectorAtCompileTime
                       || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
+      // to workaround a very strange bug in MSVC related to the instantiation
+      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
+      // This line is surprisingly very sensitive. For instance, simply adding parenthesis
+      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
+      DerivedAlignment = int(evaluator<Derived>::Alignment),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (DerivedAlignment >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
     };
     typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
   };
-  
+
 };
 
 template<typename Derived>
@@ -131,12 +67,12 @@
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  inline Index innerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
   {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  inline Index outerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
   {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
          : IsVectorAtCompileTime ? this->size()
@@ -144,50 +80,212 @@
          : this->rows();
   }
 
-  RefBase()
+  EIGEN_DEVICE_FUNC RefBase()
     : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
       // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
       m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
                StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
   {}
-  
+
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
 
 protected:
 
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
+  // Resolves inner stride if default 0.
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) {
+    return inner == 0 ? 1 : inner;
+  }
+
+  // Resolves outer stride if default 0.
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
+    return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
+  }
+
+  // Returns true if construction is valid, false if there is a stride mismatch,
+  // and fails if there is a size mismatch.
   template<typename Expression>
-  void construct(Expression& expr)
+  EIGEN_DEVICE_FUNC bool construct(Expression& expr)
   {
+    // Check matrix sizes.  If this is a compile-time vector, we do allow
+    // implicitly transposing.
+    EIGEN_STATIC_ASSERT(
+      EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression)
+      // If it is a vector, the transpose sizes might match.
+      || ( PlainObjectType::IsVectorAtCompileTime
+            && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::ColsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime))
+            &&  (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::RowsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))),
+      YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES
+    )
+
+    // Determine runtime rows and columns.
+    Index rows = expr.rows();
+    Index cols = expr.cols();
     if(PlainObjectType::RowsAtCompileTime==1)
     {
       eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+      rows = 1;
+      cols = expr.size();
     }
     else if(PlainObjectType::ColsAtCompileTime==1)
     {
       eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+      rows = expr.size();
+      cols = 1;
     }
-    else
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    
-    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
-      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
-    else
-      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+    // Verify that the sizes are valid.
+    eigen_assert(
+      (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
+    eigen_assert(
+      (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
+
+
+    // If this is a vector, we might be transposing, which means that stride should swap.
+    const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
+    // If the storage format differs, we also need to swap the stride.
+    const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0;
+    const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0;
+    const bool storage_differs =  (row_major != expr_row_major);
+
+    const bool swap_stride = (transpose != storage_differs);
+
+    // Determine expr's actual strides, resolving any defaults if zero.
+    const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
+    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual,
+                                                       expr.outerStride(),
+                                                       expr.rows(),
+                                                       expr.cols(),
+                                                       Expression::IsVectorAtCompileTime != 0,
+                                                       expr_row_major);
+
+    // If this is a column-major row vector or row-major column vector, the inner-stride
+    // is arbitrary, so set it to either the compile-time inner stride or 1.
+    const bool row_vector = (rows == 1);
+    const bool col_vector = (cols == 1);
+    const Index inner_stride =
+        ( (!row_major && row_vector) || (row_major && col_vector) ) ?
+            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1)
+            : swap_stride ? expr_outer_actual : expr_inner_actual;
+
+    // If this is a column-major column vector or row-major row vector, the outer-stride
+    // is arbitrary, so set it to either the compile-time outer stride or vector size.
+    const Index outer_stride =
+      ( (!row_major && col_vector) || (row_major && row_vector) ) ?
+          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride)
+          : swap_stride ? expr_inner_actual : expr_outer_actual;
+
+    // Check if given inner/outer strides are compatible with compile-time strides.
+    const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic)
+        || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
+    if (!inner_valid) {
+      return false;
+    }
+
+    const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic)
+        || (resolveOuterStride(
+              inner_stride,
+              Index(StrideType::OuterStrideAtCompileTime),
+              rows, cols, PlainObjectType::IsVectorAtCompileTime != 0,
+              row_major)
+            == outer_stride);
+    if (!outer_valid) {
+      return false;
+    }
+
+    ::new (static_cast<Base*>(this)) Base(expr.data(), rows, cols);
+    ::new (&m_stride) StrideBase(
+      (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
+      (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride );
+    return true;
   }
 
   StrideBase m_stride;
 };
 
-
+/** \class Ref
+  * \ingroup Core_Module
+  *
+  * \brief A matrix or vector expression mapping an existing expression
+  *
+  * \tparam PlainObjectType the equivalent matrix type of the mapped data
+  * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  *                 The default is \c #Unaligned.
+  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
+  *                   but accepts a variable outer stride (leading dimension).
+  *                   This can be overridden by specifying strides.
+  *                   The type passed here must be a specialization of the Stride template, see examples below.
+  *
+  * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
+  * A Ref<> object can represent either a const expression or a l-value:
+  * \code
+  * // in-out argument:
+  * void foo1(Ref<VectorXf> x);
+  *
+  * // read-only const argument:
+  * void foo2(const Ref<const VectorXf>& x);
+  * \endcode
+  *
+  * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
+  * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
+  * can be greater than the number of rows.
+  *
+  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
+  * Here are some examples:
+  * \code
+  * MatrixXf A;
+  * VectorXf a;
+  * foo1(a.head());             // OK
+  * foo1(A.col());              // OK
+  * foo1(A.row());              // Compilation error because here innerstride!=1
+  * foo2(A.row());              // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
+  * foo2(A.row().transpose());  // The row is copied into a contiguous temporary
+  * foo2(2*a);                  // The expression is evaluated into a temporary
+  * foo2(A.col().segment(2,4)); // No temporary
+  * \endcode
+  *
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
+  * Here is an example accepting an innerstride!=1:
+  * \code
+  * // in-out argument:
+  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
+  * foo3(A.row());              // OK
+  * \endcode
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
+  * template function, e.g.:
+  * \code
+  * // in the .h:
+  * void foo(const Ref<MatrixXf>& A);
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
+  *
+  * // in the .cpp:
+  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
+  *     ... // crazy code goes here
+  * }
+  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
+  * \endcode
+  *
+  * See also the following stackoverflow questions for further references:
+  *  - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the Eigen::Ref<> class</a>
+  *
+  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+  */
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
   : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
+  private:
     typedef internal::traits<Ref> Traits;
+    template<typename Derived>
+    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
   public:
 
     typedef RefBase<Ref> Base;
@@ -196,21 +294,31 @@
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Derived>
-    inline Ref(PlainObjectBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
-      Base::construct(expr);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      // Construction must pass since we will not create temprary storage in the non-const case.
+      const bool success = Base::construct(expr.derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
     }
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
-               typename internal::enable_if<bool(internal::is_lvalue<Derived>::value&&bool(Traits::template match<Derived>::MatchAtCompileTime)),Derived>::type* = 0,
-               int = Derived::ThisConstantIsPrivateInPlainObjectBase)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     #else
+    /** Implicit constructor from any dense expression */
     template<typename Derived>
     inline Ref(DenseBase<Derived>& expr)
     #endif
     {
-      Base::construct(expr.const_cast_derived());
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      // Construction must pass since we will not create temporary storage in the non-const case.
+      const bool success = Base::construct(expr.const_cast_derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
     }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
@@ -228,7 +336,8 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
 
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
     {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
@@ -236,18 +345,30 @@
       construct(expr.derived(), typename Traits::template match<Derived>::type());
     }
 
+    EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
   protected:
 
     template<typename Expression>
-    void construct(const Expression& expr,internal::true_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
     {
-      Base::construct(expr);
+      // Check if we can use the underlying expr's storage directly, otherwise call the copy version.
+      if (!Base::construct(expr)) {
+        construct(expr, internal::false_type());
+      }
     }
 
     template<typename Expression>
-    void construct(const Expression& expr, internal::false_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
     {
-      m_object.lazyAssign(expr);
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar,Scalar>());
       Base::construct(m_object);
     }
 

diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index dde86a8..ab5be7e 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h

@@ -10,22 +10,7 @@
 #ifndef EIGEN_REPLICATE_H
 #define EIGEN_REPLICATE_H
 
-namespace Eigen { 
-
-/**
-  * \class Replicate
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the multiple replication of a matrix or vector
-  *
-  * \param MatrixType the type of the object we are replicating
-  *
-  * This class represents an expression of the multiple replication of a matrix or vector.
-  * It is the return type of DenseBase::replicate() and most of the time
-  * this is the only way it is used.
-  *
-  * \sa DenseBase::replicate()
-  */
+namespace Eigen {
 
 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
@@ -35,10 +20,7 @@
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  enum {
-    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
-  };
-  typedef typename nested<MatrixType,Factor>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
@@ -53,12 +35,29 @@
     IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
                : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
                : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+
+    // FIXME enable DirectAccess with negative strides?
+    Flags = IsRowMajor ? RowMajorBit : 0
   };
 };
 }
 
+/**
+  * \class Replicate
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the multiple replication of a matrix or vector
+  *
+  * \tparam MatrixType the type of the object we are replicating
+  * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic.
+  * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic.
+  *
+  * This class represents an expression of the multiple replication of a matrix or vector.
+  * It is the return type of DenseBase::replicate() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa DenseBase::replicate()
+  */
 template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
   : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
 {
@@ -68,10 +67,12 @@
 
     typedef typename internal::dense_xpr_base<Replicate>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
     template<typename OriginalMatrixType>
-    inline explicit Replicate(const OriginalMatrixType& a_matrix)
-      : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    EIGEN_DEVICE_FUNC
+    inline explicit Replicate(const OriginalMatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
     {
       EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -79,44 +80,23 @@
     }
 
     template<typename OriginalMatrixType>
-    inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
-      : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    EIGEN_DEVICE_FUNC
+    inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
     {
       EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
     }
 
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
-    inline Scalar coeff(Index rowId, Index colId) const
-    {
-      // try to avoid using modulo; this is a pure optimization strategy
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.coeff(actual_row, actual_col);
-    }
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.template packet<LoadMode>(actual_row, actual_col);
-    }
-
+    EIGEN_DEVICE_FUNC
     const _MatrixTypeNested& nestedExpression() const
-    { 
-      return m_matrix; 
+    {
+      return m_matrix;
     }
 
   protected:
@@ -135,28 +115,13 @@
   */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-inline const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
   return Replicate<Derived,RowFactor,ColFactor>(derived());
 }
 
 /**
-  * \return an expression of the replication of \c *this
-  *
-  * Example: \include MatrixBase_replicate_int_int.cpp
-  * Output: \verbinclude MatrixBase_replicate_int_int.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
-  */
-template<typename Derived>
-inline const Replicate<Derived,Dynamic,Dynamic>
-DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
-{
-  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
-}
-
-/**
   * \return an expression of the replication of each column (or row) of \c *this
   *
   * Example: \include DirectionWise_replicate_int.cpp
@@ -165,7 +130,7 @@
   * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
   */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
   return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType

diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
new file mode 100644
index 0000000..52de73b
--- /dev/null
+++ b/Eigen/src/Core/Reshaped.h

@@ -0,0 +1,454 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RESHAPED_H
+#define EIGEN_RESHAPED_H
+
+namespace Eigen {
+
+/** \class Reshaped
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size reshape
+  *
+  * \tparam XprType the type of the expression in which we are taking a reshape
+  * \tparam Rows the number of rows of the reshape we are taking at compile time (optional)
+  * \tparam Cols the number of columns of the reshape we are taking at compile time (optional)
+  * \tparam Order can be ColMajor or RowMajor, default is ColMajor.
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size reshape.
+  * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and
+  * most of the time this is the only way it is used.
+  *
+  * However, in C++98, if you want to directly maniputate reshaped expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class. In C++11, it is advised to use the \em auto
+  * keyword for such use cases.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Reshaped.cpp
+  * Output: \verbinclude class_Reshaped.out
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedReshaped.cpp
+  * Output: \verbinclude class_FixedReshaped.out
+  *
+  * \sa DenseBase::reshaped(NRowsType,NColsType)
+  */
+
+namespace internal {
+
+template<typename XprType, int Rows, int Cols, int Order>
+struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  enum{
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = Rows,
+    ColsAtCompileTime = Cols,
+    MaxRowsAtCompileTime = Rows,
+    MaxColsAtCompileTime = Cols,
+    XpxStorageOrder = ((int(traits<XprType>::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
+    ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor
+                         : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor
+                         : XpxStorageOrder,
+    HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder),
+    InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : Dynamic,
+    OuterStrideAtCompileTime = Dynamic,
+
+    HasDirectAccess = internal::has_direct_access<XprType>::ret
+                    && (Order==int(XpxStorageOrder))
+                    && ((evaluator<XprType>::Flags&LinearAccessBit)==LinearAccessBit),
+
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit),
+
+    Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit)
+  };
+};
+
+template<typename XprType, int Rows, int Cols, int Order, bool HasDirectAccess> class ReshapedImpl_dense;
+
+} // end namespace internal
+
+template<typename XprType, int Rows, int Cols, int Order, typename StorageKind> class ReshapedImpl;
+
+template<typename XprType, int Rows, int Cols, int Order> class Reshaped
+  : public ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind>
+{
+    typedef ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> Impl;
+  public:
+    //typedef typename Impl::Base Base;
+    typedef Impl Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped)
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline Reshaped(XprType& xpr)
+      : Impl(xpr)
+    {
+      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+      eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
+    }
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline Reshaped(XprType& xpr,
+          Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols)
+    {
+      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows)
+          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols));
+      eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols());
+    }
+};
+
+// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense
+// that must be specialized for direct and non-direct access...
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
+  : public internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess>
+{
+    typedef internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess> Impl;
+  public:
+    typedef Impl Base;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
+    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
+    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols) {}
+};
+
+namespace internal {
+
+/** \internal Internal implementation of dense Reshaped in the general case. */
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
+  : public internal::dense_xpr_base<Reshaped<XprType, Rows, Cols, Order> >::type
+{
+    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+  public:
+
+    typedef typename internal::dense_xpr_base<ReshapedType>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+    typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+    class InnerIterator;
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr)
+      : m_xpr(xpr), m_rows(Rows), m_cols(Cols)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : m_xpr(xpr), m_rows(nRows), m_cols(nCols)
+    {}
+
+    EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
+    EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** \sa MapBase::data() */
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
+    #endif
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprType>::type&
+    nestedExpression() const { return m_xpr; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    typename internal::remove_reference<XprType>::type&
+    nestedExpression() { return m_xpr; }
+
+  protected:
+
+    MatrixTypeNested m_xpr;
+    const internal::variable_if_dynamic<Index, Rows> m_rows;
+    const internal::variable_if_dynamic<Index, Cols> m_cols;
+};
+
+
+/** \internal Internal implementation of dense Reshaped in the direct access case. */
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
+  : public MapBase<Reshaped<XprType, Rows, Cols, Order> >
+{
+    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+  public:
+
+    typedef MapBase<ReshapedType> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr)
+      : Base(xpr.data()), m_xpr(xpr)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : Base(xpr.data(), nRows, nCols),
+        m_xpr(xpr)
+    {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
+    {
+      return m_xpr;
+    }
+
+    EIGEN_DEVICE_FUNC
+    XprType& nestedExpression() { return m_xpr; }
+
+    /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const
+    {
+      return m_xpr.innerStride();
+    }
+
+    /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const
+    {
+      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+    }
+
+  protected:
+
+    XprTypeNested m_xpr;
+};
+
+// Evaluators
+template<typename ArgType, int Rows, int Cols, int Order, bool HasDirectAccess> struct reshaped_evaluator;
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
+  : reshaped_evaluator<ArgType, Rows, Cols, Order, traits<Reshaped<ArgType,Rows,Cols,Order> >::HasDirectAccess>
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    HasDirectAccess = traits<XprType>::HasDirectAccess,
+
+//     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+//     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+//     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+//     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+//
+//     InnerStrideAtCompileTime = traits<XprType>::HasSameStorageOrderAsXprType
+//                              ? int(inner_stride_at_compile_time<ArgType>::ret)
+//                              : Dynamic,
+//     OuterStrideAtCompileTime = Dynamic,
+
+    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = (traits<XprType>::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit =  HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+  typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ false>
+  : evaluator_base<Reshaped<ArgType, Rows, Cols, Order> >
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of index computations */,
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef std::pair<Index, Index> RowCol;
+
+  inline RowCol index_remap(Index rowId, Index colId) const
+  {
+    if(Order==ColMajor)
+    {
+      const Index nth_elem_idx = colId * m_xpr.rows() + rowId;
+      return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(),
+                    nth_elem_idx / m_xpr.nestedExpression().rows());
+    }
+    else
+    {
+      const Index nth_elem_idx = colId + rowId * m_xpr.cols();
+      return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(),
+                    nth_elem_idx % m_xpr.nestedExpression().cols());
+    }
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline Scalar& coeffRef(Index rowId, Index colId)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const Scalar& coeffRef(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline Scalar& coeffRef(Index index)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const Scalar& coeffRef(Index index) const
+  {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const CoeffReturnType coeff(Index index) const
+  {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+#if 0
+  EIGEN_DEVICE_FUNC
+  template<int LoadMode>
+  inline PacketScalar packet(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    m_argImpl.const_cast_derived().template writePacket<Unaligned>
+            (row_col.first, row_col.second, val);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline PacketScalar packet(Index index) const
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index index, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
+  }
+#endif
+protected:
+
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+
+};
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true>
+: mapbase_evaluator<Reshaped<ArgType, Rows, Cols, Order>,
+                      typename Reshaped<ArgType, Rows, Cols, Order>::PlainObject>
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr)
+  {
+    // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_RESHAPED_H

diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 3625028..4dad13e 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h

@@ -13,11 +13,6 @@
 
 namespace Eigen {
 
-/** \class ReturnByValue
-  * \ingroup Core_Module
-  *
-  */
-
 namespace internal {
 
 template<typename Derived>
@@ -38,17 +33,22 @@
  * So internal::nested always gives the plain return matrix type.
  *
  * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
  */
 template<typename Derived,int n,typename PlainObject>
-struct nested<ReturnByValue<Derived>, n, PlainObject>
+struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
 {
   typedef typename traits<Derived>::ReturnType type;
 };
 
 } // end namespace internal
 
+/** \class ReturnByValue
+  * \ingroup Core_Module
+  *
+  */
 template<typename Derived> class ReturnByValue
-  : internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
+  : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
 {
   public:
     typedef typename internal::traits<Derived>::ReturnType ReturnType;
@@ -60,8 +60,10 @@
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const
     { static_cast<const Derived*>(this)->evalTo(dst); }
-    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -73,18 +75,45 @@
     const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
     Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
     Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
+#undef Unusable
 #endif
 };
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
   other.evalTo(derived());
   return derived();
 }
 
+namespace internal {
+
+// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
+// when a ReturnByValue expression is assigned, the evaluator is not constructed.
+// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
+
+template<typename Derived>
+struct evaluator<ReturnByValue<Derived> >
+  : public evaluator<typename internal::traits<Derived>::ReturnType>
+{
+  typedef ReturnByValue<Derived> XprType;
+  typedef typename internal::traits<Derived>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    xpr.evalTo(m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_RETURNBYVALUE_H

diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index d37c48f..28cdd76 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h

@@ -14,20 +14,6 @@
 
 namespace Eigen {
 
-/** \class Reverse
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the reverse of a vector or matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the reverse
-  *
-  * This class represents an expression of the reverse of a vector.
-  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
-  */
-
 namespace internal {
 
 template<typename MatrixType, int Direction>
@@ -37,36 +23,43 @@
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    // let's enable LinearAccess only with vectorization because of the product overhead
-    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
-                 ? LinearAccessBit : 0,
-
-    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
-
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
   };
 };
 
-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond
 {
-  static inline PacketScalar run(const PacketScalar& x) { return preverse(x); }
+  static inline PacketType run(const PacketType& x) { return preverse(x); }
 };
 
-template<typename PacketScalar> struct reverse_packet_cond<PacketScalar,false>
+template<typename PacketType> struct reverse_packet_cond<PacketType,false>
 {
-  static inline PacketScalar run(const PacketScalar& x) { return x; }
+  static inline PacketType run(const PacketType& x) { return x; }
 };
 
 } // end namespace internal
 
+/** \class Reverse
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the reverse of a vector or matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the reverse
+  * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections
+  *
+  * This class represents an expression of the reverse of a vector.
+  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
+  */
 template<typename MatrixType, int Direction> class Reverse
   : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
 {
@@ -74,12 +67,9 @@
 
     typedef typename internal::dense_xpr_base<Reverse>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
     using Base::IsRowMajor;
 
-    // next line is necessary because otherwise const version of operator()
-    // is hidden by non-const version defined in this file
-    using Base::operator();
-
   protected:
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -95,82 +85,21 @@
     typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
   public:
 
-    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+    EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return -m_matrix.innerStride();
     }
 
-    inline Scalar& operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return coeffRef(row, col);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                            ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(m_matrix.size() - index - 1);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
-    }
-
-    inline Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < m_matrix.size());
-      return coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return reverse_packet::run(m_matrix.template packet<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(
-                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
-                                      reverse_packet::run(x));
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
-    }
-
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() const
     {
       return m_matrix;
@@ -187,34 +116,29 @@
   *
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
-  return derived();
+  return ReverseReturnType(derived());
 }
 
-/** This is the const version of reverse(). */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstReverseReturnType
-DenseBase<Derived>::reverse() const
-{
-  return derived();
-}
+
+//reverse const overload moved DenseBase.h due to a CUDA compiler bug
 
 /** This is the "in place" version of reverse: it reverses \c *this.
   *
   * In most cases it is probably better to simply use the reversed expression
   * of a matrix. However, when reversing the matrix data itself is really needed,
   * then this "in-place" version is probably the right choice because it provides
-  * the following additional features:
+  * the following additional benefits:
   *  - less error prone: doing the same operation with .reverse() requires special care:
   *    \code m = m.reverse().eval(); \endcode
-  *  - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
+  *  - this API enables reverse operations without the need for a temporary
   *  - it allows future optimizations (cache friendliness, etc.)
   *
-  * \sa reverse() */
+  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
   if(cols()>rows())
   {
@@ -249,8 +173,10 @@
   template<typename ExpressionType>
   static void run(ExpressionType &xpr)
   {
+    const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
     Index half = xpr.rows()/2;
-    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+    xpr.topRows(fix<HalfAtCompileTime>(half))
+       .swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
   }
 };
 
@@ -260,8 +186,10 @@
   template<typename ExpressionType>
   static void run(ExpressionType &xpr)
   {
+    const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
     Index half = xpr.cols()/2;
-    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+    xpr.leftCols(fix<HalfAtCompileTime>(half))
+       .swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
   }
 };
 
@@ -279,9 +207,9 @@
   *
   * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
-  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 87993bb..7c86bf8 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELECT_H
 #define EIGEN_SELECT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Select
   * \ingroup Core_Module
@@ -43,23 +43,21 @@
     ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
-    CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
-                  + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
-                                   traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
   };
 };
 }
 
 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : internal::no_assignment_operator,
-  public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
+class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
+               internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<Select>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Select)
 
+    inline EIGEN_DEVICE_FUNC
     Select(const ConditionMatrixType& a_conditionMatrix,
            const ThenMatrixType& a_thenMatrix,
            const ElseMatrixType& a_elseMatrix)
@@ -69,9 +67,12 @@
       eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
     }
 
-    Index rows() const { return m_condition.rows(); }
-    Index cols() const { return m_condition.cols(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
 
+    inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i, Index j) const
     {
       if (m_condition.coeff(i,j))
@@ -80,6 +81,7 @@
         return m_else.coeff(i,j);
     }
 
+    inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i) const
     {
       if (m_condition.coeff(i))
@@ -88,17 +90,17 @@
         return m_else.coeff(i);
     }
 
-    const ConditionMatrixType& conditionMatrix() const
+    inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
     {
       return m_condition;
     }
 
-    const ThenMatrixType& thenMatrix() const
+    inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
     {
       return m_then;
     }
 
-    const ElseMatrixType& elseMatrix() const
+    inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
     {
       return m_else;
     }
@@ -120,7 +122,7 @@
   */
 template<typename Derived>
 template<typename ThenDerived,typename ElseDerived>
-inline const Select<Derived,ThenDerived,ElseDerived>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                             const DenseBase<ElseDerived>& elseMatrix) const
 {
@@ -134,7 +136,7 @@
   */
 template<typename Derived>
 template<typename ThenDerived>
-inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                            const typename ThenDerived::Scalar& elseScalar) const
 {
@@ -149,7 +151,7 @@
   */
 template<typename Derived>
 template<typename ElseDerived>
-inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
 DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
                            const DenseBase<ElseDerived>& elseMatrix) const
 {

diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 0e67bbc..8ce3b37 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINTMATRIX_H
 #define EIGEN_SELFADJOINTMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class SelfAdjointView
   * \ingroup Core_Module
@@ -32,55 +32,58 @@
 template<typename MatrixType, unsigned int UpLo>
 struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   enum {
     Mode = UpLo | SelfAdjoint,
-    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits)
-           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
+           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
   };
 };
 }
 
-template <typename Lhs, int LhsMode, bool LhsIsVector,
-          typename Rhs, int RhsMode, bool RhsIsVector>
-struct SelfadjointProductMatrix;
 
-// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
-template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
-  : public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
+template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
+  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
 {
   public:
 
+    typedef _MatrixType MatrixType;
     typedef TriangularBase<SelfAdjointView> Base;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+    typedef MatrixTypeNestedCleaned NestedExpression;
 
     /** \brief The type of coefficients in this matrix */
-    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
-
-    typedef typename MatrixType::Index Index;
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
 
     enum {
-      Mode = internal::traits<SelfAdjointView>::Mode
+      Mode = internal::traits<SelfAdjointView>::Mode,
+      Flags = internal::traits<SelfAdjointView>::Flags,
+      TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
     };
     typedef typename MatrixType::PlainObject PlainObject;
 
     EIGEN_DEVICE_FUNC
-    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
-    {}
+    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
+    {
+      EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
+    }
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
@@ -98,8 +101,9 @@
     EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col)
     {
+      EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
       Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
+      return m_matrix.coeffRef(row, col);
     }
 
     /** \internal */
@@ -109,28 +113,31 @@
     EIGEN_DEVICE_FUNC
     const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
     EIGEN_DEVICE_FUNC
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+    MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
 
-    /** Efficient self-adjoint matrix times vector/matrix product */
+    /** Efficient triangular matrix times vector/matrix product */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
+    const Product<SelfAdjointView,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return SelfadjointProductMatrix
-              <MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
-              (m_matrix, rhs.derived());
+      return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
     }
 
-    /** Efficient vector/matrix times self-adjoint matrix product */
+    /** Efficient vector/matrix times triangular matrix product */
     template<typename OtherDerived> friend
     EIGEN_DEVICE_FUNC
-    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
+    const Product<OtherDerived,SelfAdjointView>
     operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
     {
-      return SelfadjointProductMatrix
-              <OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
-              (lhs.derived(),rhs.m_matrix);
+      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
+    }
+
+    friend EIGEN_DEVICE_FUNC
+    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
+    operator*(const Scalar& s, const SelfAdjointView& mat)
+    {
+      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
     }
 
     /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
@@ -161,6 +168,83 @@
     EIGEN_DEVICE_FUNC
     SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
 
+    /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
+      *
+      * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+      * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+      *
+      * If \c TriMode references the same triangular part than \c *this, then this method simply return a \c TriangularView of the nested expression,
+      * otherwise, the nested expression is first transposed, thus returning a \c TriangularView<Transpose<MatrixType>> object.
+      *
+      * \sa MatrixBase::triangularView(), class TriangularView
+      */
+    template<unsigned int TriMode>
+    EIGEN_DEVICE_FUNC
+    typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
+    triangularView() const
+    {
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
+      return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
+    }
+
+    typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    /** \returns a const expression of the main diagonal of the matrix \c *this
+      *
+      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
+      *
+      * \sa MatrixBase::diagonal(), class Diagonal */
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::ConstDiagonalReturnType diagonal() const
+    {
+      return typename MatrixType::ConstDiagonalReturnType(m_matrix);
+    }
+
 /////////// Cholesky module ///////////
 
     const LLT<PlainObject, UpLo> llt() const;
@@ -177,31 +261,6 @@
     EigenvaluesReturnType eigenvalues() const;
     EIGEN_DEVICE_FUNC
     RealScalar operatorNorm() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
-      return *this;
-    }
-    template<typename OtherMatrixType, unsigned int OtherMode>
-    EIGEN_DEVICE_FUNC
-    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
-      return *this;
-    }
-    #endif
 
   protected:
     MatrixTypeNested m_matrix;
@@ -219,96 +278,54 @@
 
 namespace internal {
 
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
 {
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SelfAdjointShape Shape;
+};
 
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
+template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
+class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
+  : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+
+
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row < col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+    eigen_internal_assert(row!=col);
+    Scalar tmp = m_src.coeff(row,col);
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
+    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
   }
-};
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
-{
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row > col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+    Base::assignCoeff(id,id);
   }
-};
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
-{
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = 0; i < j; ++i)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(j, j, src);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
-{
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-  typedef typename Derived1::Index Index;
-    for(Index i = 0; i < dst.rows(); ++i)
-    {
-      for(Index j = 0; j < i; ++j)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(i, i, src);
-    }
-  }
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
+  { eigen_internal_assert(false && "should never be called"); }
 };
 
 } // end namespace internal
@@ -317,22 +334,30 @@
 * Implementation of MatrixBase methods
 ***************************************************************************/
 
+/** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-EIGEN_DEVICE_FUNC
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
-  return derived();
+  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
+  *
+  * The parameter \a UpLo can be either \c #Upper or \c #Lower
+  *
+  * Example: \include MatrixBase_selfadjointView.cpp
+  * Output: \verbinclude MatrixBase_selfadjointView.out
+  *
+  * \sa class SelfAdjointView
+  */
 template<typename Derived>
 template<unsigned int UpLo>
-EIGEN_DEVICE_FUNC
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
-  return derived();
+  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index ebb7e12..7c89c2e 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h

@@ -12,216 +12,33 @@
 
 namespace Eigen { 
 
-/** \class SelfCwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for optimizing operators like +=, -=
-  *
-  * This is a pseudo expression class re-implementing the copyCoeff/copyPacket
-  * method to directly performs a +=/-= operations in an optimal way. In particular,
-  * this allows to make sure that the input/output data are loaded only once using
-  * aligned packet loads.
-  *
-  * \sa class SwapWrapper for a similar trick.
-  */
-
-namespace internal {
-template<typename BinaryOp, typename Lhs, typename Rhs>
-struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
-  : traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
-{
-  enum {
-    // Note that it is still a good idea to preserve the DirectAccessBit
-    // so that assign can correctly align the data.
-    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&AlignedBit) | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
-    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
-    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
-  };
-};
-}
-
-template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
-  : public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
-
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    EIGEN_DEVICE_FUNC
-    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
-
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_matrix.data(); }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-    EIGEN_DEVICE_FUNC
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.coeffRef(row, col);
-    }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-    EIGEN_DEVICE_FUNC
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                         && col >= 0 && col < cols());
-      Scalar& tmp = m_matrix.coeffRef(row,col);
-      tmp = m_functor(tmp, _other.coeff(row,col));
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      Scalar& tmp = m_matrix.coeffRef(index);
-      tmp = m_functor(tmp, _other.coeff(index));
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      m_matrix.template writePacket<StoreMode>(row, col,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      m_matrix.template writePacket<StoreMode>(index,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
-    }
-
-    // reimplement lazyAssign to handle complex *= real
-    // see CwiseBinaryOp ctor for details
-    template<typename RhsDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
-    {
-      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
-      
-    #ifdef EIGEN_DEBUG_ASSIGN
-      internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
-    #endif
-      eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
-      internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
-    #ifndef EIGEN_NO_DEBUG
-      this->checkTransposeAliasing(rhs.derived());
-    #endif
-      return *this;
-    }
-    
-    // overloaded to honor evaluation of special matrices
-    // maybe another solution would be to not use SelfCwiseBinaryOp
-    // at first...
-    EIGEN_DEVICE_FUNC
-    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
-    {
-      typename internal::nested<Rhs>::type rhs(_rhs);
-      return Base::operator=(rhs);
-    }
-
-    EIGEN_DEVICE_FUNC
-    Lhs& expression() const 
-    { 
-      return m_matrix;
-    }
-
-    EIGEN_DEVICE_FUNC
-    const BinaryOp& functor() const 
-    { 
-      return m_functor;
-    }
-
-  protected:
-    Lhs& m_matrix;
-    const BinaryOp& m_functor;
-
-  private:
-    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
-};
+// TODO generalize the scalar type of 'other'
 
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(),other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
   return derived();
 }
 
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(),other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
   return derived();
 }
 
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(),other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
   return derived();
 }
 
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename internal::conditional<NumTraits<Scalar>::IsInteger,
-                                        internal::scalar_quotient_op<Scalar>,
-                                        internal::scalar_product_op<Scalar> >::type BinOp;
-  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  Scalar actual_other;
-  if(NumTraits<Scalar>::IsInteger)  actual_other = other;
-  else                              actual_other = Scalar(1)/other;
-  tmp = PlainObject::Constant(rows(),cols(), actual_other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
   return derived();
 }
 

diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
new file mode 100644
index 0000000..23d5cb7
--- /dev/null
+++ b/Eigen/src/Core/Solve.h

@@ -0,0 +1,188 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVE_H
+#define EIGEN_SOLVE_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
+
+/** \class Solve
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposition object
+  * \tparam Rhstype the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+// this solve_traits class permits to determine the evaluation type with respect to storage kind (Dense vs Sparse)
+template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Dense>
+{
+  typedef typename make_proper_matrix_type<typename RhsType::Scalar,
+                 Decomposition::ColsAtCompileTime,
+                 RhsType::ColsAtCompileTime,
+                 RhsType::PlainObject::Options,
+                 Decomposition::MaxColsAtCompileTime,
+                 RhsType::MaxColsAtCompileTime>::type PlainObject;
+};
+
+template<typename Decomposition, typename RhsType>
+struct traits<Solve<Decomposition, RhsType> >
+  : traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
+{
+  typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
+  typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type StorageIndex;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = HugeCost
+  };
+};
+
+}
+
+
+template<typename Decomposition, typename RhsType>
+class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
+{
+public:
+  typedef typename internal::traits<Solve>::PlainObject PlainObject;
+  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
+
+  Solve(const Decomposition &dec, const RhsType &rhs)
+    : m_dec(dec), m_rhs(rhs)
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
+
+protected:
+  const Decomposition &m_dec;
+  const RhsType       &m_rhs;
+};
+
+
+// Specialization of the Solve expression for dense results
+template<typename Decomposition, typename RhsType>
+class SolveImpl<Decomposition,RhsType,Dense>
+  : public MatrixBase<Solve<Decomposition,RhsType> >
+{
+  typedef Solve<Decomposition,RhsType> Derived;
+
+public:
+
+  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+private:
+
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+// Generic API dispatcher
+template<typename Decomposition, typename RhsType, typename StorageKind>
+class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
+};
+
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename Decomposition, typename RhsType>
+struct evaluator<Solve<Decomposition,RhsType> >
+  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>
+{
+  typedef Solve<Decomposition,RhsType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    solve.dec()._solve_impl(solve.rhs(), m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.transpose().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
+                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVE_H

diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index f90c55f..dfbf995 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -19,7 +19,7 @@
 template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
 struct triangular_solve_vector;
 
-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
 struct triangular_solve_matrix;
 
 // small helper struct extracting some traits on the underlying solver operation
@@ -54,7 +54,7 @@
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::ExtractType ActualLhsType;
   typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
 
@@ -64,11 +64,11 @@
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
                                                   (useRhsDirectly ? rhs.data() : 0));
-                                                  
+
     if(!useRhsDirectly)
       MappedRhs(actualRhs,rhs.size()) = rhs;
 
-    triangular_solve_vector<LhsScalar, RhsScalar, typename Lhs::Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+    triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
                             (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
       ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
 
@@ -82,11 +82,10 @@
 struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename Rhs::Index Index;
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
 
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
 
@@ -99,8 +98,8 @@
     BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
 
     triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
-                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
-      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
+                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
+      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
   }
 };
 
@@ -108,48 +107,48 @@
 * meta-unrolling implementation
 ***************************************************************************/
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size,
-         bool Stop = Index==Size>
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size,
+         bool Stop = LoopIndex==Size>
 struct triangular_solver_unroller;
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
   enum {
     IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0     : I+1
+    DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0         : DiagIndex+1
   };
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
-    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
-                         .cwiseProduct(rhs.template segment<Index>(S)).sum();
+    if (LoopIndex>0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
+                                .cwiseProduct(rhs.template segment<LoopIndex>(StartIndex)).sum();
 
     if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+      rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);
 
-    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
+    triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex+1,Size>::run(lhs,rhs);
   }
 };
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,true> {
-  static void run(const Lhs&, Rhs&) {}
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {}
 };
 
 template<typename Lhs, typename Rhs, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   { triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
 };
 
 template<typename Lhs, typename Rhs, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     Transpose<const Lhs> trLhs(lhs);
     Transpose<Rhs> trRhs(rhs);
-    
+
     triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
                               ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
                               0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
@@ -162,63 +161,38 @@
 * TriangularView methods
 ***************************************************************************/
 
-/** "in-place" version of TriangularView::solve() where the result is written in \a other
-  *
-  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
-  * This function will const_cast it, so constness isn't honored here.
-  *
-  * See TriangularView:solve() for the details.
-  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-EIGEN_DEVICE_FUNC
-void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
   OtherDerived& other = _other.const_cast_derived();
-  eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
+  // If solving for a 0x0 matrix, nothing to do, simply return.
+  if (derived().cols() == 0)
+    return;
 
-  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
+  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
   typedef typename internal::conditional<copy,
     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
   OtherCopy otherCopy(other);
 
   internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
-    Side, Mode>::run(nestedExpression(), otherCopy);
+    Side, Mode>::run(derived().nestedExpression(), otherCopy);
 
   if (copy)
     other = otherCopy;
 }
 
-/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
-  *
-  * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
-  * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
-  * \a Side==OnTheRight.
-  *
-  * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
-  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
-  * is an upper (resp. lower) triangular matrix.
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
-  * to the same matrix or vector \a other.
-  *
-  * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
-  * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
-  *
-  * \sa TriangularView::solveInPlace()
-  */
 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
-EIGEN_DEVICE_FUNC
 const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
-TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
+TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
 {
-  return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
+  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
 }
+#endif
 
 namespace internal {
 
@@ -234,18 +208,17 @@
 {
   typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
   typedef ReturnByValue<triangular_solve_retval> Base;
-  typedef typename Base::Index Index;
 
   triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
     : m_triangularMatrix(tri), m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_rhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
+  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> inline void evalTo(Dest& dst) const
   {
-    if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+    if(!is_same_dense(dst,m_rhs))
       dst = m_rhs;
     m_triangularMatrix.template solveInPlace<Side>(dst);
   }

diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
new file mode 100644
index 0000000..5014610
--- /dev/null
+++ b/Eigen/src/Core/SolverBase.h

@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVERBASE_H
+#define EIGEN_SOLVERBASE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Derived>
+struct solve_assertion {
+    template<bool Transpose_, typename Rhs>
+    static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
+};
+
+template<typename Derived>
+struct solve_assertion<Transpose<Derived> >
+{
+    typedef Transpose<Derived> type;
+
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& transpose, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
+    }
+};
+
+template<typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
+{
+    typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
+
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& adjoint, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
+    }
+};
+} // end namespace internal
+
+/** \class SolverBase
+  * \brief A base class for matrix decomposition and solvers
+  *
+  * \tparam Derived the actual type of the decomposition/solver.
+  *
+  * Any matrix decomposition inheriting this base class provide the following API:
+  *
+  * \code
+  * MatrixType A, b, x;
+  * DecompositionType dec(A);
+  * x = dec.solve(b);             // solve A   * x = b
+  * x = dec.transpose().solve(b); // solve A^T * x = b
+  * x = dec.adjoint().solve(b);   // solve A'  * x = b
+  * \endcode
+  *
+  * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
+  *
+  * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
+  */
+template<typename Derived>
+class SolverBase : public EigenBase<Derived>
+{
+  public:
+
+    typedef EigenBase<Derived> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Scalar CoeffReturnType;
+
+    template<typename Derived_>
+    friend struct internal::solve_assertion;
+
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                          internal::traits<Derived>::ColsAtCompileTime>::ret),
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                             internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
+    };
+
+    /** Default constructor */
+    SolverBase()
+    {}
+
+    ~SolverBase()
+    {}
+
+    using Base::derived;
+
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+
+    /** \internal the return type of transpose() */
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    /** \returns an expression of the transposed of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code x = dec.transpose().solve(b); \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    inline ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(derived());
+    }
+
+    /** \internal the return type of adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code x = dec.adjoint().solve(b); \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    inline AdjointReturnType adjoint() const
+    {
+      return AdjointReturnType(derived().transpose());
+    }
+
+  protected:
+
+    template<bool Transpose_, typename Rhs>
+    void _check_solve_assertion(const Rhs& b) const {
+        EIGEN_ONLY_USED_FOR_DEBUG(b);
+        eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+        eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+    }
+};
+
+namespace internal {
+
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, SolverStorage>
+{
+  typedef SolverBase<Derived> type;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVERBASE_H

diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index c862c0b..4a3f0cc 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h

@@ -17,10 +17,9 @@
 template<typename ExpressionType, typename Scalar>
 inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
 {
-  using std::max;
   Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
   
-  if (maxCoeff>scale)
+  if(maxCoeff>scale)
   {
     ssq = ssq * numext::abs2(scale/maxCoeff);
     Scalar tmp = Scalar(1)/maxCoeff;
@@ -29,12 +28,21 @@
       invScale = NumTraits<Scalar>::highest();
       scale = Scalar(1)/invScale;
     }
+    else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
+    {
+      invScale = Scalar(1);
+      scale = maxCoeff;
+    }
     else
     {
       scale = maxCoeff;
       invScale = tmp;
     }
   }
+  else if(maxCoeff!=maxCoeff) // we got a NaN
+  {
+    scale = maxCoeff;
+  }
   
   // TODO if the maxCoeff is much much smaller than the current scale,
   // then we can neglect this sub vector
@@ -42,70 +50,124 @@
     ssq += (bl*invScale).squaredNorm();
 }
 
+template<typename VectorType, typename RealScalar>
+void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
+{
+  typedef typename VectorType::Scalar Scalar;
+  const Index blockSize = 4096;
+  
+  typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
+  typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
+  const VectorTypeCopy copy(vec);
+  
+  enum {
+    CanAlign = (   (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
+                || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
+  };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
+                                                   typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
+  Index n = vec.size();
+  
+  Index bi = internal::first_default_aligned(copy);
+  if (bi>0)
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
+  for (; bi<n; bi+=blockSize)
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+}
+
+template<typename VectorType>
+typename VectorType::RealScalar
+stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
+{
+  using std::sqrt;
+  using std::abs;
+
+  Index n = vec.size();
+
+  if(n==1)
+    return abs(vec.coeff(0));
+
+  typedef typename VectorType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of squares
+
+  stable_norm_impl_inner_step(vec, ssq, scale, invScale);
+  
+  return scale * sqrt(ssq);
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar
+stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
+{
+  using std::sqrt;
+
+  typedef typename MatrixType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of squares
+
+  for(Index j=0; j<mat.outerSize(); ++j)
+    stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
+  return scale * sqrt(ssq);
+}
+
 template<typename Derived>
 inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
 {
   typedef typename Derived::RealScalar RealScalar;  
-  typedef typename Derived::Index Index;
   using std::pow;
   using std::sqrt;
   using std::abs;
+
+  // This program calculates the machine-dependent constants
+  // bl, b2, slm, s2m, relerr overfl
+  // from the "basic" machine-dependent numbers
+  // nbig, ibeta, it, iemin, iemax, rbig.
+  // The following define the basic machine-dependent constants.
+  // For portability, the PORT subprograms "ilmaeh" and "rlmach"
+  // are used. For any specific computer, each of the assignment
+  // statements can be replaced
+  static const int ibeta = std::numeric_limits<RealScalar>::radix;  // base for floating-point numbers
+  static const int it    = NumTraits<RealScalar>::digits();  // number of base-beta digits in mantissa
+  static const int iemin = NumTraits<RealScalar>::min_exponent();  // minimum exponent
+  static const int iemax = NumTraits<RealScalar>::max_exponent();  // maximum exponent
+  static const RealScalar rbig   = NumTraits<RealScalar>::highest();  // largest floating-point number
+  static const RealScalar b1     = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2))));  // lower boundary of midrange
+  static const RealScalar b2     = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2)));  // upper boundary of midrange
+  static const RealScalar s1m    = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2)));  // scaling factor for lower range
+  static const RealScalar s2m    = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2))));  // scaling factor for upper range
+  static const RealScalar eps    = RealScalar(pow(double(ibeta), 1-it));
+  static const RealScalar relerr = sqrt(eps);  // tolerance for neglecting asml
+
   const Derived& vec(_vec.derived());
-  static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
-  if(!initialized)
-  {
-    int ibeta, it, iemin, iemax, iexp;
-    RealScalar eps;
-    // This program calculates the machine-dependent constants
-    // bl, b2, slm, s2m, relerr overfl
-    // from the "basic" machine-dependent numbers
-    // nbig, ibeta, it, iemin, iemax, rbig.
-    // The following define the basic machine-dependent constants.
-    // For portability, the PORT subprograms "ilmaeh" and "rlmach"
-    // are used. For any specific computer, each of the assignment
-    // statements can be replaced
-    ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
-    it    = std::numeric_limits<RealScalar>::digits;                // number of base-beta digits in mantissa
-    iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
-    iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
-    rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
-
-    iexp  = -((1-iemin)/2);
-    b1    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // lower boundary of midrange
-    iexp  = (iemax + 1 - it)/2;
-    b2    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // upper boundary of midrange
-
-    iexp  = (2-iemin)/2;
-    s1m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for lower range
-    iexp  = - ((iemax+it)/2);
-    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
-
-    overfl  = rbig*s2m;                                             // overflow boundary for abig
-    eps     = RealScalar(pow(double(ibeta), 1-it));
-    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
-    initialized = true;
-  }
   Index n = vec.size();
   RealScalar ab2 = b2 / RealScalar(n);
   RealScalar asml = RealScalar(0);
   RealScalar amed = RealScalar(0);
   RealScalar abig = RealScalar(0);
-  for(typename Derived::InnerIterator it(vec, 0); it; ++it)
+
+  for(Index j=0; j<vec.outerSize(); ++j)
   {
-    RealScalar ax = abs(it.value());
-    if(ax > ab2)     abig += numext::abs2(ax*s2m);
-    else if(ax < b1) asml += numext::abs2(ax*s1m);
-    else             amed += numext::abs2(ax);
+    for(typename Derived::InnerIterator iter(vec, j); iter; ++iter)
+    {
+      RealScalar ax = abs(iter.value());
+      if(ax > ab2)     abig += numext::abs2(ax*s2m);
+      else if(ax < b1) asml += numext::abs2(ax*s1m);
+      else             amed += numext::abs2(ax);
+    }
   }
+  if(amed!=amed)
+    return amed;  // we got a NaN
   if(abig > RealScalar(0))
   {
     abig = sqrt(abig);
-    if(abig > overfl)
-    {
-      return rbig;
-    }
+    if(abig > rbig) // overflow, or *this contains INF values
+      return abig;  // return INF
     if(amed > RealScalar(0))
     {
       abig = abig/s2m;
@@ -150,21 +212,7 @@
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  using std::sqrt;
-  const Index blockSize = 4096;
-  RealScalar scale(0);
-  RealScalar invScale(1);
-  RealScalar ssq(0); // sum of square
-  enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
-  };
-  Index n = size();
-  Index bi = internal::first_aligned(derived());
-  if (bi>0)
-    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
-  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,numext::mini(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
-  return scale * sqrt(ssq);
+  return internal::stable_norm_impl(derived());
 }
 
 /** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@@ -192,7 +240,10 @@
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::hypotNorm() const
 {
-  return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+  if(size()==1)
+    return numext::abs(coeff(0,0));
+  else
+    return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
new file mode 100644
index 0000000..09041db
--- /dev/null
+++ b/Eigen/src/Core/StlIterators.h

@@ -0,0 +1,463 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STLITERATORS_H
+#define EIGEN_STLITERATORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename IteratorType>
+struct indexed_based_stl_iterator_traits;
+
+template<typename  Derived>
+class indexed_based_stl_iterator_base
+{
+protected:
+  typedef indexed_based_stl_iterator_traits<Derived> traits;
+  typedef typename traits::XprType XprType;
+  typedef indexed_based_stl_iterator_base<typename traits::non_const_iterator> non_const_iterator;
+  typedef indexed_based_stl_iterator_base<typename traits::const_iterator> const_iterator;
+  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class indexed_based_stl_iterator_base<typename traits::const_iterator>;
+  friend class indexed_based_stl_iterator_base<typename traits::non_const_iterator>;
+public:
+  typedef Index difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
+
+  indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW
+    : mp_xpr(other.mp_xpr), m_index(other.m_index)
+  {}
+
+  indexed_based_stl_iterator_base& operator=(const non_const_iterator& other)
+  {
+    mp_xpr = other.mp_xpr;
+    m_index = other.m_index;
+    return *this;
+  }
+
+  Derived& operator++() { ++m_index; return derived(); }
+  Derived& operator--() { --m_index; return derived(); }
+
+  Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
+  Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
+
+  friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
+  friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
+  friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
+  friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
+  
+  Derived& operator+=(Index b) { m_index += b; return derived(); }
+  Derived& operator-=(Index b) { m_index -= b; return derived(); }
+
+  difference_type operator-(const indexed_based_stl_iterator_base& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index - other.m_index;
+  }
+
+  difference_type operator-(const other_iterator& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index - other.m_index;
+  }
+
+  bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+  bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+
+  bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+  bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+
+protected:
+
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  XprType *mp_xpr;
+  Index m_index;
+};
+
+template<typename  Derived>
+class indexed_based_stl_reverse_iterator_base
+{
+protected:
+  typedef indexed_based_stl_iterator_traits<Derived> traits;
+  typedef typename traits::XprType XprType;
+  typedef indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator> non_const_iterator;
+  typedef indexed_based_stl_reverse_iterator_base<typename traits::const_iterator> const_iterator;
+  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class indexed_based_stl_reverse_iterator_base<typename traits::const_iterator>;
+  friend class indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator>;
+public:
+  typedef Index difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
+
+  indexed_based_stl_reverse_iterator_base(const non_const_iterator& other)
+    : mp_xpr(other.mp_xpr), m_index(other.m_index)
+  {}
+
+  indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other)
+  {
+    mp_xpr = other.mp_xpr;
+    m_index = other.m_index;
+    return *this;
+  }
+
+  Derived& operator++() { --m_index; return derived(); }
+  Derived& operator--() { ++m_index; return derived(); }
+
+  Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
+  Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
+
+  friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
+  friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
+  friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
+  friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
+  
+  Derived& operator+=(Index b) { m_index -= b; return derived(); }
+  Derived& operator-=(Index b) { m_index += b; return derived(); }
+
+  difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return other.m_index - m_index;
+  }
+
+  difference_type operator-(const other_iterator& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return other.m_index - m_index;
+  }
+
+  bool operator==(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+  bool operator> (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+
+  bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+  bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+
+protected:
+
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  XprType *mp_xpr;
+  Index m_index;
+};
+
+template<typename XprType>
+class pointer_based_stl_iterator
+{
+  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
+  typedef pointer_based_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
+  typedef pointer_based_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
+  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class pointer_based_stl_iterator<typename internal::add_const<XprType>::type>;
+  friend class pointer_based_stl_iterator<typename internal::remove_const<XprType>::type>;
+public:
+  typedef Index difference_type;
+  typedef typename XprType::Scalar value_type;
+  typedef std::random_access_iterator_tag iterator_category;
+  typedef typename internal::conditional<bool(is_lvalue), value_type*, const value_type*>::type pointer;
+  typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
+
+
+  pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride())
+  {
+    m_ptr = xpr.data() + index * m_incr.value();
+  }
+
+  pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW
+    : m_ptr(other.m_ptr), m_incr(other.m_incr)
+  {}
+
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW
+  {
+    m_ptr = other.m_ptr;
+    m_incr.setValue(other.m_incr);
+    return *this;
+  }
+
+  reference operator*()         const { return *m_ptr;   }
+  reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); }
+  pointer   operator->()        const { return m_ptr;    }
+
+  pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; }
+  pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; }
+
+  pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;}
+  pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;}
+
+  friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; }
+  friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; }
+  friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; }
+  friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; }
+  
+  pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; }
+  pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; }
+
+  difference_type operator-(const pointer_based_stl_iterator& other) const {
+    return (m_ptr - other.m_ptr)/m_incr.value();
+  }
+
+  difference_type operator-(const other_iterator& other) const {
+    return (m_ptr - other.m_ptr)/m_incr.value();
+  }
+
+  bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; }
+  bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; }
+  bool operator< (const pointer_based_stl_iterator& other) const { return m_ptr <  other.m_ptr; }
+  bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; }
+  bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr >  other.m_ptr; }
+  bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; }
+
+  bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; }
+  bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; }
+  bool operator< (const other_iterator& other) const { return m_ptr <  other.m_ptr; }
+  bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; }
+  bool operator> (const other_iterator& other) const { return m_ptr >  other.m_ptr; }
+  bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; }
+
+protected:
+
+  pointer m_ptr;
+  internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_incr;
+};
+
+template<typename _XprType>
+struct indexed_based_stl_iterator_traits<generic_randaccess_stl_iterator<_XprType> >
+{
+  typedef _XprType XprType;
+  typedef generic_randaccess_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
+  typedef generic_randaccess_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
+};
+
+template<typename XprType>
+class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base<generic_randaccess_stl_iterator<XprType> >
+{
+public:
+  typedef typename XprType::Scalar value_type;
+
+protected:
+
+  enum {
+    has_direct_access = (internal::traits<XprType>::Flags & DirectAccessBit) ? 1 : 0,
+    is_lvalue  = internal::is_lvalue<XprType>::value
+  };
+
+  typedef indexed_based_stl_iterator_base<generic_randaccess_stl_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  // TODO currently const Transpose/Reshape expressions never returns const references,
+  // so lets return by value too.
+  //typedef typename internal::conditional<bool(has_direct_access), const value_type&, const value_type>::type read_only_ref_t;
+  typedef const value_type read_only_ref_t;
+
+public:
+  
+  typedef typename internal::conditional<bool(is_lvalue), value_type *, const value_type *>::type pointer;
+  typedef typename internal::conditional<bool(is_lvalue), value_type&, read_only_ref_t>::type reference;
+  
+  generic_randaccess_stl_iterator() : Base() {}
+  generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
+  generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {}
+  using Base::operator=;
+
+  reference operator*()         const { return   (*mp_xpr)(m_index);   }
+  reference operator[](Index i) const { return   (*mp_xpr)(m_index+i); }
+  pointer   operator->()        const { return &((*mp_xpr)(m_index)); }
+};
+
+template<typename _XprType, DirectionType Direction>
+struct indexed_based_stl_iterator_traits<subvector_stl_iterator<_XprType,Direction> >
+{
+  typedef _XprType XprType;
+  typedef subvector_stl_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
+  typedef subvector_stl_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
+};
+
+template<typename XprType, DirectionType Direction>
+class subvector_stl_iterator : public indexed_based_stl_iterator_base<subvector_stl_iterator<XprType,Direction> >
+{
+protected:
+
+  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
+
+  typedef indexed_based_stl_iterator_base<subvector_stl_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
+
+
+public:
+  typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
+  typedef typename reference::PlainObject value_type;
+
+private:
+  class subvector_stl_iterator_ptr
+  {
+  public:
+      subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
+      reference* operator->() { return &m_subvector; }
+  private:
+      reference m_subvector;
+  };
+public:
+
+  typedef subvector_stl_iterator_ptr pointer;
+  
+  subvector_stl_iterator() : Base() {}
+  subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
+
+  reference operator*()         const { return (*mp_xpr).template subVector<Direction>(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index+i); }
+  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
+};
+
+template<typename _XprType, DirectionType Direction>
+struct indexed_based_stl_iterator_traits<subvector_stl_reverse_iterator<_XprType,Direction> >
+{
+  typedef _XprType XprType;
+  typedef subvector_stl_reverse_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
+  typedef subvector_stl_reverse_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
+};
+
+template<typename XprType, DirectionType Direction>
+class subvector_stl_reverse_iterator : public indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator<XprType,Direction> >
+{
+protected:
+
+  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
+
+  typedef indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
+
+
+public:
+  typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
+  typedef typename reference::PlainObject value_type;
+
+private:
+  class subvector_stl_reverse_iterator_ptr
+  {
+  public:
+      subvector_stl_reverse_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
+      reference* operator->() { return &m_subvector; }
+  private:
+      reference m_subvector;
+  };
+public:
+
+  typedef subvector_stl_reverse_iterator_ptr pointer;
+  
+  subvector_stl_reverse_iterator() : Base() {}
+  subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
+
+  reference operator*()         const { return (*mp_xpr).template subVector<Direction>(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index+i); }
+  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
+};
+
+} // namespace internal
+
+
+/** returns an iterator to the first element of the 1D vector or array
+  * \only_for_vectors
+  * \sa end(), cbegin()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::iterator DenseBase<Derived>::begin()
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return iterator(derived(), 0);
+}
+
+/** const version of begin() */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::begin() const
+{
+  return cbegin();
+}
+
+/** returns a read-only const_iterator to the first element of the 1D vector or array
+  * \only_for_vectors
+  * \sa cend(), begin()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cbegin() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return const_iterator(derived(), 0);
+}
+
+/** returns an iterator to the element following the last element of the 1D vector or array
+  * \only_for_vectors
+  * \sa begin(), cend()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::iterator DenseBase<Derived>::end()
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return iterator(derived(), size());
+}
+
+/** const version of end() */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::end() const
+{
+  return cend();
+}
+
+/** returns a read-only const_iterator to the element following the last element of the 1D vector or array
+  * \only_for_vectors
+  * \sa begin(), cend()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return const_iterator(derived(), size());
+}
+
+} // namespace Eigen
+
+#endif // EIGEN_STLITERATORS_H

diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index d3d454e..d164e53 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_STRIDE_H
 #define EIGEN_STRIDE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Stride
   * \ingroup Core_Module
@@ -31,20 +31,28 @@
   * arguments to the constructor.
   *
   * Indeed, this class takes two template parameters:
-  *  \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
-  *  \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
+  *  \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
+  *  \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
   *
   * Here is an example:
   * \include Map_general_stride.cpp
   * Output: \verbinclude Map_general_stride.out
   *
+  * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
+  * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
+  * not allowed).
+  *
+  * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompile==1),
+  * the inner stride is the pointer increment between two consecutive elements,
+  * regardless of storage layout.
+  *
   * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
   */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
 class Stride
 {
   public:
-    typedef DenseIndex Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
     enum {
       InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
       OuterStrideAtCompileTime = _OuterStrideAtCompileTime
@@ -55,6 +63,8 @@
     Stride()
       : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
     {
+      // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
+      // FIXME: for Eigen 4 we should also unify this API with fix<>
       eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
     }
 
@@ -63,7 +73,6 @@
     Stride(Index outerStride, Index innerStride)
       : m_outer(outerStride), m_inner(innerStride)
     {
-      eigen_assert(innerStride>=0 && outerStride>=0);
     }
 
     /** Copy constructor */
@@ -73,10 +82,10 @@
     {}
 
     /** \returns the outer stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outer() const { return m_outer.value(); }
     /** \returns the inner stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index inner() const { return m_inner.value(); }
 
   protected:
@@ -86,26 +95,24 @@
 
 /** \brief Convenience specialization of Stride to specify only an inner stride
   * See class Map for some examples */
-template<int Value = Dynamic>
+template<int Value>
 class InnerStride : public Stride<0, Value>
 {
     typedef Stride<0, Value> Base;
   public:
-    typedef DenseIndex Index;
     EIGEN_DEVICE_FUNC InnerStride() : Base() {}
-    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {}
+    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
 };
 
 /** \brief Convenience specialization of Stride to specify only an outer stride
   * See class Map for some examples */
-template<int Value = Dynamic>
+template<int Value>
 class OuterStride : public Stride<Value, 0>
 {
     typedef Stride<Value, 0> Base;
   public:
-    typedef DenseIndex Index;
     EIGEN_DEVICE_FUNC OuterStride() : Base() {}
-    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {}
+    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
 };
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index d602fba..180a4e5 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h

@@ -12,129 +12,57 @@
 
 namespace Eigen { 
 
-/** \class SwapWrapper
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for swapping two expressions
-  */
 namespace internal {
-template<typename ExpressionType>
-struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
-}
 
-template<typename ExpressionType> class SwapWrapper
-  : public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
+// Overload default assignPacket behavior for swapping them
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
+class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
+ : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
 {
-  public:
-
-    typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    EIGEN_DEVICE_FUNC
-    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
-
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
-    
-    typedef typename internal::conditional<
-                       internal::is_lvalue<ExpressionType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-                     
-    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
-    EIGEN_DEVICE_FUNC
-    inline const Scalar* data() const { return m_expression.data(); }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_expression.coeffRef(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index index) const
-    {
-      return m_expression.coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                         && colId >= 0 && colId < cols());
-      Scalar tmp = m_expression.coeff(rowId, colId);
-      m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
-      _other.coeffRef(rowId, colId) = tmp;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Scalar tmp = m_expression.coeff(index);
-      m_expression.coeffRef(index) = _other.coeff(index);
-      _other.coeffRef(index) = tmp;
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                        && colId >= 0 && colId < cols());
-      Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
-      m_expression.template writePacket<StoreMode>(rowId, colId,
-        _other.template packet<LoadMode>(rowId, colId)
-      );
-      _other.template writePacket<LoadMode>(rowId, colId, tmp);
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Packet tmp = m_expression.template packet<StoreMode>(index);
-      m_expression.template writePacket<StoreMode>(index,
-        _other.template packet<LoadMode>(index)
-      );
-      _other.template writePacket<LoadMode>(index, tmp);
-    }
-
-    EIGEN_DEVICE_FUNC
-    ExpressionType& expression() const { return m_expression; }
-
-  protected:
-    ExpressionType& m_expression;
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+  
+public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::DstXprType DstXprType;
+  typedef swap_assign_op<Scalar> Functor;
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
+    m_dst.template writePacket<StoreMode>(row,col,tmp);
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacket(Index index)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
+    m_dst.template writePacket<StoreMode>(index,tmp);
+  }
+  
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = Base::rowIndexByOuterInner(outer, inner); 
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
+  }
 };
 
+} // namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SWAP_H

diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 2791a18..2bc658f 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,41 +11,23 @@
 #ifndef EIGEN_TRANSPOSE_H
 #define EIGEN_TRANSPOSE_H
 
-namespace Eigen { 
-
-/** \class Transpose
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the transpose of a matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the transpose
-  *
-  * This class represents an expression of the transpose of a matrix.
-  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
-  */
+namespace Eigen {
 
 namespace internal {
 template<typename MatrixType>
-struct traits<Transpose<MatrixType> > : traits<MatrixType>
+struct traits<Transpose<MatrixType> > : public traits<MatrixType>
 {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
-  typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename traits<MatrixType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = MatrixType::ColsAtCompileTime,
     ColsAtCompileTime = MatrixType::RowsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
+    Flags0 = traits<MatrixTypeNestedPlain>::Flags & ~(LvalueBit | NestByRefBit),
     Flags1 = Flags0 | FlagsLvalueBit,
     Flags = Flags1 ^ RowMajorBit,
-    CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
     InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
     OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
   };
@@ -54,34 +36,58 @@
 
 template<typename MatrixType, typename StorageKind> class TransposeImpl;
 
+/** \class Transpose
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the transpose of a matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the transpose
+  *
+  * This class represents an expression of the transpose of a matrix.
+  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
+  */
 template<typename MatrixType> class Transpose
   : public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
 {
   public:
 
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+
     typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
     EIGEN_DEVICE_FUNC
-    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+    explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }
+
+    /** \internal */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void resize(Index nrows, Index ncols) {
+      m_matrix.resize(ncols,nrows);
+    }
 
   protected:
-    typename MatrixType::Nested m_matrix;
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
 };
 
 namespace internal {
@@ -100,17 +106,29 @@
 
 } // end namespace internal
 
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class TransposeImpl
+  : public internal::generic_xpr_base<Transpose<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
+};
+
 template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   : public internal::TransposeImpl_base<MatrixType>::type
 {
   public:
 
     typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
+    using Base::coeffRef;
     EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index outerStride() const { return derived().nestedExpression().outerStride(); }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -118,70 +136,25 @@
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    inline const Scalar* data() const { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar* data() const { return derived().nestedExpression().data(); }
 
-    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(index);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    // FIXME: shall we keep the const version of coeffRef?
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return derived().nestedExpression().coeffRef(colId, rowId);
     }
 
-    EIGEN_DEVICE_FUNC
-    inline const Scalar& coeffRef(Index index) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar& coeffRef(Index index) const
     {
       return derived().nestedExpression().coeffRef(index);
     }
-
-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().coeff(colId, rowId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return derived().nestedExpression().coeff(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
 };
 
 /** \returns an expression of the transpose of *this.
@@ -204,11 +177,11 @@
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
-  return derived();
+  return TransposeReturnType(derived());
 }
 
 /** This is the const version of transpose().
@@ -217,8 +190,8 @@
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
   return ConstTransposeReturnType(derived());
@@ -244,12 +217,10 @@
   *
   * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
-  return this->transpose(); // in the complex case, the .conjugate() is be implicit here
-                            // due to implicit conversion to return type
+  return AdjointReturnType(this->transpose());
 }
 
 /***************************************************************************
@@ -259,26 +230,95 @@
 namespace internal {
 
 template<typename MatrixType,
-  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
+  bool MatchPacketSize =
+        (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
+    &&  (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
 struct inplace_transpose_selector;
 
 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true> { // square matrix
+struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
   static void run(MatrixType& m) {
-    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
   }
 };
 
 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,false> { // non square matrix
+struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
   static void run(MatrixType& m) {
-    if (m.rows()==m.cols())
-      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
-    else
-      m = m.transpose().eval();
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+    const Index PacketSize = internal::packet_traits<Scalar>::size;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
+    PacketBlock<Packet> A;
+    for (Index i=0; i<PacketSize; ++i)
+      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
+    internal::ptranspose(A);
+    for (Index i=0; i<PacketSize; ++i)
+      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
   }
 };
 
+
+template <typename MatrixType, Index Alignment>
+void BlockedInPlaceTranspose(MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  eigen_assert(m.rows() == m.cols());
+  int row_start = 0;
+  for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
+    for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
+      PacketBlock<Packet> A;
+      if (row_start == col_start) {
+        for (Index i=0; i<PacketSize; ++i)
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
+        internal::ptranspose(A);
+        for (Index i=0; i<PacketSize; ++i)
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
+      } else {
+        PacketBlock<Packet> B;
+        for (Index i=0; i<PacketSize; ++i) {
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
+          B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
+        }
+        internal::ptranspose(A);
+        internal::ptranspose(B);
+        for (Index i=0; i<PacketSize; ++i) {
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]);
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]);
+        }
+      }
+    }
+  }
+  for (Index row = row_start; row < m.rows(); ++row) {
+    m.matrix().row(row).head(row).swap(
+        m.matrix().col(row).head(row).transpose());
+  }
+}
+
+template<typename MatrixType,bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square or dynamic matrix
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    if (m.rows() == m.cols()) {
+      const Index PacketSize = internal::packet_traits<Scalar>::size;
+      if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
+        if ((m.rows() % PacketSize) == 0)
+          BlockedInPlaceTranspose<MatrixType,internal::evaluator<MatrixType>::Alignment>(m);
+        else
+          BlockedInPlaceTranspose<MatrixType,Unaligned>(m);
+      }
+      else {
+        m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
+      }
+    } else {
+      m = m.transpose().eval();
+    }
+  }
+};
+
+
 } // end namespace internal
 
 /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
@@ -296,13 +336,12 @@
   * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
   * If you just need the transpose of a matrix, use transpose().
   *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
   * This excludes (non-square) fixed-size matrices, block-expressions and maps.
   *
   * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
 {
   eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
                && "transposeInPlace() called on a non-square non-resizable matrix");
@@ -333,8 +372,7 @@
   *
   * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
 {
   derived() = adjoint().eval();
 }
@@ -345,14 +383,6 @@
 
 namespace internal {
 
-template<typename BinOp,typename NestedXpr,typename Rhs>
-struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
- : blas_traits<NestedXpr>
-{
-  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
-  static inline const XprType extract(const XprType& x) { return x; }
-};
-
 template<bool DestIsTransposed, typename OtherDerived>
 struct check_transpose_aliasing_compile_time_selector
 {
@@ -418,15 +448,16 @@
     }
 };
 
+template<typename Dst, typename Src>
+void check_for_aliasing(const Dst &dst, const Src &src)
+{
+  if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
+    internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+}
+
 } // end namespace internal
 
-template<typename Derived>
-template<typename OtherDerived>
-void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
-{
-    internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
-}
-#endif
+#endif // EIGEN_NO_DEBUG
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index 21cc9b1..38a7b01 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h

@@ -12,39 +12,6 @@
 
 namespace Eigen {
 
-/** \class Transpositions
-  * \ingroup Core_Module
-  *
-  * \brief Represents a sequence of transpositions (row/column interchange)
-  *
-  * \param SizeAtCompileTime the number of transpositions, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  *
-  * This class represents a permutation transformation as a sequence of \em n transpositions
-  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
-  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
-  * the rows \c i and \c indices[i] of the matrix \c M.
-  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
-  *
-  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
-  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
-  *
-  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
-  * \code
-  * Transpositions tr;
-  * MatrixXf mat;
-  * mat = tr * mat;
-  * \endcode
-  * In this example, we detect that the matrix appears on both side, and so the transpositions
-  * are applied in-place without any temporary or extra copy.
-  *
-  * \sa class PermutationMatrix
-  */
-
-namespace internal {
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed=false> struct transposition_matrix_product_retval;
-}
-
 template<typename Derived>
 class TranspositionsBase
 {
@@ -53,9 +20,12 @@
   public:
 
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
+    EIGEN_DEVICE_FUNC
     Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
     /** Copies the \a other transpositions into \c *this */
@@ -66,36 +36,35 @@
       return derived();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
     /** \returns the number of transpositions */
-    inline Index size() const { return static_cast<Index>(indices().size()); }
+    EIGEN_DEVICE_FUNC
+    Index size() const { return indices().size(); }
+    /** \returns the number of rows of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
+    Index rows() const { return indices().size(); }
+    /** \returns the number of columns of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
+    Index cols() const { return indices().size(); }
 
     /** Direct access to the underlying index vector */
-    inline const Index& coeff(Index i) const { return indices().coeff(i); }
+    EIGEN_DEVICE_FUNC
+    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
     /** Direct access to the underlying index vector */
-    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
+    inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
     /** Direct access to the underlying index vector */
-    inline const Index& operator()(Index i) const { return indices()(i); }
+    inline const StorageIndex& operator()(Index i) const { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline Index& operator()(Index i) { return indices()(i); }
+    inline StorageIndex& operator()(Index i) { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline const Index& operator[](Index i) const { return indices()(i); }
+    inline const StorageIndex& operator[](Index i) const { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline Index& operator[](Index i) { return indices()(i); }
+    inline StorageIndex& operator[](Index i) { return indices()(i); }
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return derived().indices(); }
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return derived().indices(); }
 
     /** Resizes to given size. */
@@ -107,12 +76,12 @@
     /** Sets \c *this to represents an identity transformation */
     void setIdentity()
     {
-      for(int i = 0; i < indices().size(); ++i)
+      for(StorageIndex i = 0; i < indices().size(); ++i)
         coeffRef(i) = i;
     }
 
     // FIXME: do we want such methods ?
-    // might be usefull when the target matrix expression is complex, e.g.:
+    // might be useful when the target matrix expression is complex, e.g.:
     // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
     /*
     template<typename MatrixType>
@@ -144,23 +113,53 @@
 };
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
-  typedef IndexType Index;
-  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+/** \class Transpositions
+  * \ingroup Core_Module
+  *
+  * \brief Represents a sequence of transpositions (row/column interchange)
+  *
+  * \tparam SizeAtCompileTime the number of transpositions, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  *
+  * This class represents a permutation transformation as a sequence of \em n transpositions
+  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
+  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
+  * the rows \c i and \c indices[i] of the matrix \c M.
+  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
+  *
+  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
+  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
+  *
+  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
+  * \code
+  * Transpositions tr;
+  * MatrixXf mat;
+  * mat = tr * mat;
+  * \endcode
+  * In this example, we detect that the matrix appears on both side, and so the transpositions
+  * are applied in-place without any temporary or extra copy.
+  *
+  * \sa class PermutationMatrix
+  */
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
     typedef internal::traits<Transpositions> Traits;
   public:
 
     typedef TranspositionsBase<Transpositions> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
     inline Transpositions() {}
 
@@ -169,15 +168,9 @@
     inline Transpositions(const TranspositionsBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-    #endif
-
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
-    explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
     {}
 
     /** Copies the \a other transpositions into \c *this */
@@ -187,25 +180,16 @@
       return Base::operator=(other);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** Constructs an uninitialized permutation matrix of given size.
       */
     inline Transpositions(Index size) : m_indices(size)
     {}
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -215,30 +199,32 @@
 
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
-class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
- : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess> >
 {
     typedef internal::traits<Map> Traits;
   public:
 
     typedef TranspositionsBase<Map> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
-    inline Map(const Index* indicesPtr)
+    explicit inline Map(const StorageIndex* indicesPtr)
       : m_indices(indicesPtr)
     {}
 
-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndex* indicesPtr, Index size)
       : m_indices(indicesPtr,size)
     {}
 
@@ -261,9 +247,11 @@
     #endif
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
 
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -274,9 +262,9 @@
 namespace internal {
 template<typename _IndicesType>
 struct traits<TranspositionsWrapper<_IndicesType> >
+ : traits<PermutationWrapper<_IndicesType> >
 {
-  typedef typename _IndicesType::Scalar Index;
-  typedef _IndicesType IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
@@ -289,10 +277,10 @@
 
     typedef TranspositionsBase<TranspositionsWrapper> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
-    inline TranspositionsWrapper(IndicesType& a_indices)
-      : m_indices(a_indices)
+    explicit inline TranspositionsWrapper(IndicesType& indices)
+      : m_indices(indices)
     {}
 
     /** Copies the \a other transpositions into \c *this */
@@ -302,102 +290,56 @@
       return Base::operator=(other);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
 
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
 
-    const typename IndicesType::Nested m_indices;
+    typename IndicesType::Nested m_indices;
 };
 
+
+
 /** \returns the \a matrix with the \a transpositions applied to the columns.
   */
-template<typename Derived, typename TranspositionsDerived>
-inline const internal::transposition_matrix_product_retval<TranspositionsDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const TranspositionsBase<TranspositionsDerived> &transpositions)
+template<typename MatrixDerived, typename TranspositionsDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const TranspositionsBase<TranspositionsDerived>& transpositions)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionsDerived, Derived, OnTheRight>
-           (transpositions.derived(), matrix.derived());
+  return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+            (matrix.derived(), transpositions.derived());
 }
 
 /** \returns the \a matrix with the \a transpositions applied to the rows.
   */
-template<typename Derived, typename TranspositionDerived>
-inline const internal::transposition_matrix_product_retval
-               <TranspositionDerived, Derived, OnTheLeft>
-operator*(const TranspositionsBase<TranspositionDerived> &transpositions,
-          const MatrixBase<Derived>& matrix)
+template<typename TranspositionsDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+operator*(const TranspositionsBase<TranspositionsDerived> &transpositions,
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionDerived, Derived, OnTheLeft>
-           (transpositions.derived(), matrix.derived());
+  return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+            (transpositions.derived(), matrix.derived());
 }
 
+// Template partial specialization for transposed/inverse transpositions
+
 namespace internal {
 
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct traits<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct transposition_matrix_product_retval
- : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename TranspositionType::Index Index;
-
-    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
-      : m_transpositions(tr), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index size = m_transpositions.size();
-      Index j = 0;
-
-      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
-        dst = m_matrix;
-
-      for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if((j=m_transpositions.coeff(k))!=k)
-        {
-          if(Side==OnTheLeft)
-            dst.row(k).swap(dst.row(j));
-          else if(Side==OnTheRight)
-            dst.col(k).swap(dst.col(j));
-        }
-    }
-
-  protected:
-    const TranspositionType& m_transpositions;
-    typename MatrixType::Nested m_matrix;
-};
+template<typename Derived>
+struct traits<Transpose<TranspositionsBase<Derived> > >
+ : traits<Derived>
+{};
 
 } // end namespace internal
 
-/* Template partial specialization for transposed/inverse transpositions */
-
 template<typename TranspositionsDerived>
 class Transpose<TranspositionsBase<TranspositionsDerived> >
 {
@@ -405,28 +347,36 @@
     typedef typename TranspositionType::IndicesType IndicesType;
   public:
 
-    Transpose(const TranspositionType& t) : m_transpositions(t) {}
+    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-    inline int size() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
 
     /** \returns the \a matrix with the inverse transpositions applied to the columns.
       */
-    template<typename Derived> friend
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>
-    operator*(const MatrixBase<Derived>& matrix, const Transpose& trt)
+    template<typename OtherDerived> friend
+    const Product<OtherDerived, Transpose, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>(trt.m_transpositions, matrix.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
     }
 
     /** \returns the \a matrix with the inverse transpositions applied to the rows.
       */
-    template<typename Derived>
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>
-    operator*(const MatrixBase<Derived>& matrix) const
+    template<typename OtherDerived>
+    const Product<Transpose, OtherDerived, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>(m_transpositions, matrix.derived());
+      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
     }
 
+    EIGEN_DEVICE_FUNC
+    const TranspositionType& nestedExpression() const { return m_transpositions; }
+
   protected:
     const TranspositionType& m_transpositions;
 };

diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 65c8f3b..fdb8bc1 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h

@@ -11,17 +11,15 @@
 #ifndef EIGEN_TRIANGULARMATRIX_H
 #define EIGEN_TRIANGULARMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-  
+
 template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
-  
+
 }
 
-/** \internal
-  *
-  * \class TriangularBase
+/** \class TriangularBase
   * \ingroup Core_Module
   *
   * \brief Base class for triangular part in a matrix
@@ -32,29 +30,48 @@
 
     enum {
       Mode = internal::traits<Derived>::Mode,
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
       MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+      /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
+
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::FullMatrixType DenseMatrixType;
     typedef DenseMatrixType DenseType;
+    typedef Derived const& Nested;
 
     EIGEN_DEVICE_FUNC
-    inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
+    inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); }
 
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+
+    // dummy resize function
     EIGEN_DEVICE_FUNC
-    inline Index rows() const { return derived().rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return derived().cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return derived().outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return derived().innerStride(); }
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+      eigen_assert(rows==this->rows() && cols==this->cols());
+    }
 
     EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
@@ -134,17 +151,17 @@
 /** \class TriangularView
   * \ingroup Core_Module
   *
-  * \brief Base class for triangular part in a matrix
+  * \brief Expression of a triangular part in a matrix
   *
   * \param MatrixType the type of the object in which we are taking the triangular part
   * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
   *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
-  *             This is in fact a bit field; it must have either #Upper or #Lower, 
-  *             and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
+  *             This is in fact a bit field; it must have either #Upper or #Lower,
+  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
   *
   * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
   * matrices one should speak of "trapezoid" parts. This class is the return type
-  * of MatrixBase::triangularView() and most of the time this is the only way it is used.
+  * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
   *
   * \sa MatrixBase::triangularView()
   */
@@ -152,273 +169,154 @@
 template<typename MatrixType, unsigned int _Mode>
 struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
   typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
   enum {
     Mode = _Mode,
-    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode,
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)))
   };
 };
 }
 
-template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-struct TriangularProduct;
+template<typename _MatrixType, unsigned int _Mode, typename StorageKind> class TriangularViewImpl;
 
 template<typename _MatrixType, unsigned int _Mode> class TriangularView
-  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+  : public TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind >
 {
   public:
 
-    typedef TriangularBase<TriangularView> Base;
+    typedef TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > Base;
     typedef typename internal::traits<TriangularView>::Scalar Scalar;
-
     typedef _MatrixType MatrixType;
-    typedef typename internal::traits<TriangularView>::DenseMatrixType DenseMatrixType;
-    typedef DenseMatrixType PlainObject;
 
   protected:
     typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
-    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    
+    typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
+
   public:
-    using Base::evalToLazy;
-  
 
     typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
-    typedef typename internal::traits<TriangularView>::Index Index;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned NestedExpression;
 
     enum {
       Mode = _Mode,
+      Flags = internal::traits<TriangularView>::Flags,
       TransposeMode = (Mode & Upper ? Lower : 0)
                     | (Mode & Lower ? Upper : 0)
                     | (Mode & (UnitDiag))
-                    | (Mode & (ZeroDiag))
+                    | (Mode & (ZeroDiag)),
+      IsVectorAtCompileTime = false
     };
 
     EIGEN_DEVICE_FUNC
-    inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
+    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
-    /** \sa MatrixBase::operator+=() */    
-    template<typename Other>
-    EIGEN_DEVICE_FUNC
-    TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
-    /** \sa MatrixBase::operator-=() */
-    template<typename Other>
-    EIGEN_DEVICE_FUNC
-    TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
-    /** \sa MatrixBase::operator*=() */
-    EIGEN_DEVICE_FUNC
-    TriangularView&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
-    /** \sa MatrixBase::operator/=() */
-    EIGEN_DEVICE_FUNC
-    TriangularView&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }
+    /** \copydoc EigenBase::rows() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    /** \copydoc EigenBase::cols() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
-    /** \sa MatrixBase::fill() */
+    /** \returns a const reference to the nested expression */
     EIGEN_DEVICE_FUNC
-    void fill(const Scalar& value) { setConstant(value); }
-    /** \sa MatrixBase::setConstant() */
-    EIGEN_DEVICE_FUNC
-    TriangularView& setConstant(const Scalar& value)
-    { return *this = MatrixType::Constant(rows(), cols(), value); }
-    /** \sa MatrixBase::setZero() */
-    EIGEN_DEVICE_FUNC
-    TriangularView& setZero() { return setConstant(Scalar(0)); }
-    /** \sa MatrixBase::setOnes() */
-    EIGEN_DEVICE_FUNC
-    TriangularView& setOnes() { return setConstant(Scalar(1)); }
+    const NestedExpression& nestedExpression() const { return m_matrix; }
 
-    /** \sa MatrixBase::coeff()
-      * \warning the coordinates must fit into the referenced triangular part
-      */
+    /** \returns a reference to the nested expression */
     EIGEN_DEVICE_FUNC
-    inline Scalar coeff(Index row, Index col) const
-    {
-      Base::check_coordinates_internal(row, col);
-      return m_matrix.coeff(row, col);
-    }
+    NestedExpression& nestedExpression() { return m_matrix; }
 
-    /** \sa MatrixBase::coeffRef()
-      * \warning the coordinates must fit into the referenced triangular part
-      */
-    EIGEN_DEVICE_FUNC
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC
-    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    EIGEN_DEVICE_FUNC
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
-
-    /** Assigns a triangular matrix to a triangular part of a dense matrix */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    TriangularView& operator=(const TriangularBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    TriangularView& operator=(const MatrixBase<OtherDerived>& other);
-
-    EIGEN_DEVICE_FUNC
-    TriangularView& operator=(const TriangularView& other)
-    { return *this = other.nestedExpression(); }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void lazyAssign(const TriangularBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void lazyAssign(const MatrixBase<OtherDerived>& other);
-
-    /** \sa MatrixBase::conjugate() */
-    EIGEN_DEVICE_FUNC
-    inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
-    { return m_matrix.conjugate(); }
+    typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
     /** \sa MatrixBase::conjugate() const */
     EIGEN_DEVICE_FUNC
-    inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
-    { return m_matrix.conjugate(); }
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
+    typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
-    inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
-    { return m_matrix.adjoint(); }
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
 
-    /** \sa MatrixBase::transpose() */
+    typedef TriangularView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
     EIGEN_DEVICE_FUNC
-    inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
+    inline TransposeReturnType transpose()
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().transpose();
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
     }
+
+    typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
     /** \sa MatrixBase::transpose() const */
     EIGEN_DEVICE_FUNC
-    inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
+    inline const ConstTransposeReturnType transpose() const
     {
-      return m_matrix.transpose();
+      return ConstTransposeReturnType(m_matrix.transpose());
     }
 
-    /** Efficient triangular matrix times vector/matrix product */
-    template<typename OtherDerived>
+    template<typename Other>
     EIGEN_DEVICE_FUNC
-    TriangularProduct<Mode,true,MatrixType,false,OtherDerived, OtherDerived::IsVectorAtCompileTime>
-    operator*(const MatrixBase<OtherDerived>& rhs) const
-    {
-      return TriangularProduct
-              <Mode,true,MatrixType,false,OtherDerived,OtherDerived::IsVectorAtCompileTime>
-              (m_matrix, rhs.derived());
-    }
+    inline const Solve<TriangularView, Other>
+    solve(const MatrixBase<Other>& other) const
+    { return Solve<TriangularView, Other>(*this, other.derived()); }
 
-    /** Efficient vector/matrix times triangular matrix product */
-    template<typename OtherDerived> friend
-    EIGEN_DEVICE_FUNC
-    TriangularProduct<Mode,false,OtherDerived,OtherDerived::IsVectorAtCompileTime,MatrixType,false>
-    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
-    {
-      return TriangularProduct
-              <Mode,false,OtherDerived,OtherDerived::IsVectorAtCompileTime,MatrixType,false>
-              (lhs.derived(),rhs.m_matrix);
-    }
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    struct eigen2_product_return_type
-    {
-      typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
-      typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
-      typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
-      typedef typename ProdRetType::PlainObject type;
-    };
-    template<typename OtherDerived>
-    const typename eigen2_product_return_type<OtherDerived>::type
-    operator*(const EigenBase<OtherDerived>& rhs) const
-    {
-      typename OtherDerived::PlainObject::DenseType rhsPlainObject;
-      rhs.evalTo(rhsPlainObject);
-      return this->toDenseMatrix() * rhsPlainObject;
-    }
-    template<typename OtherMatrixType>
-    bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other, precision);
-    }
-    #endif // EIGEN2_SUPPORT
-
+  // workaround MSVC ICE
+  #if EIGEN_COMP_MSVC
     template<int Side, typename Other>
     EIGEN_DEVICE_FUNC
     inline const internal::triangular_solve_retval<Side,TriangularView, Other>
-    solve(const MatrixBase<Other>& other) const;
-
-    template<int Side, typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void solveInPlace(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename Other>
-    EIGEN_DEVICE_FUNC
-    inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other> 
     solve(const MatrixBase<Other>& other) const
-    { return solve<OnTheLeft>(other); }
+    { return Base::template solve<Side>(other); }
+  #else
+    using Base::solve;
+  #endif
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void solveInPlace(const MatrixBase<OtherDerived>& other) const
-    { return solveInPlace<OnTheLeft>(other); }
-
-    EIGEN_DEVICE_FUNC
-    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
+    /** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower.
+      *
+      * This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
+      * \sa MatrixBase::selfadjointView() */
     EIGEN_DEVICE_FUNC
     SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
     {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
       return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
     }
 
-    template<typename OtherDerived>
+    /** This is the const version of selfadjointView() */
     EIGEN_DEVICE_FUNC
-    void swap(TriangularBase<OtherDerived> const & other)
+    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
     {
-      TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
     }
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    void swap(MatrixBase<OtherDerived> const & other)
-    {
-      SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
-      TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
-    }
 
+    /** \returns the determinant of the triangular matrix
+      * \sa MatrixBase::determinant() */
     EIGEN_DEVICE_FUNC
     Scalar determinant() const
     {
@@ -429,311 +327,283 @@
       else
         return m_matrix.diagonal().prod();
     }
-    
-    // TODO simplify the following:
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      setZero();
-      return assignProduct(other,1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other,1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other,-1);
-    }
-    
-    
-    template<typename ProductDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
-    {
-      setZero();
-      return assignProduct(other,other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other,other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other,-other.alpha());
-    }
-    
+
   protected:
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);
 
     MatrixTypeNested m_matrix;
 };
 
+/** \ingroup Core_Module
+  *
+  * \brief Base class for a triangular part in a \b dense matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which available for dense expressions only.
+  *
+  * \sa class TriangularView, MatrixBase::triangularView()
+  */
+template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
+  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+{
+  public:
+
+    typedef TriangularView<_MatrixType, _Mode> TriangularViewType;
+    typedef TriangularBase<TriangularViewType> Base;
+    typedef typename internal::traits<TriangularViewType>::Scalar Scalar;
+
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::PlainObject DenseMatrixType;
+    typedef DenseMatrixType PlainObject;
+
+  public:
+    using Base::evalToLazy;
+    using Base::derived;
+
+    typedef typename internal::traits<TriangularViewType>::StorageKind StorageKind;
+
+    enum {
+      Mode = _Mode,
+      Flags = internal::traits<TriangularViewType>::Flags
+    };
+
+    /** \returns the outer-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::outerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    /** \returns the inner-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::innerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+
+    /** \sa MatrixBase::operator+=() */
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator+=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename Other::Scalar>());
+      return derived();
+    }
+    /** \sa MatrixBase::operator-=() */
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator-=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
+      return derived();
+    }
+
+    /** \sa MatrixBase::operator*=() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
+    /** \sa DenseBase::operator/=() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() / other; }
+
+    /** \sa MatrixBase::fill() */
+    EIGEN_DEVICE_FUNC
+    void fill(const Scalar& value) { setConstant(value); }
+    /** \sa MatrixBase::setConstant() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setConstant(const Scalar& value)
+    { return *this = MatrixType::Constant(derived().rows(), derived().cols(), value); }
+    /** \sa MatrixBase::setZero() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setZero() { return setConstant(Scalar(0)); }
+    /** \sa MatrixBase::setOnes() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setOnes() { return setConstant(Scalar(1)); }
+
+    /** \sa MatrixBase::coeff()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    EIGEN_DEVICE_FUNC
+    inline Scalar coeff(Index row, Index col) const
+    {
+      Base::check_coordinates_internal(row, col);
+      return derived().nestedExpression().coeff(row, col);
+    }
+
+    /** \sa MatrixBase::coeffRef()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
+      Base::check_coordinates_internal(row, col);
+      return derived().nestedExpression().coeffRef(row, col);
+    }
+
+    /** Assigns a triangular matrix to a triangular part of a dense matrix */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularBase<OtherDerived>& other);
+
+    /** Shortcut for\code *this = other.other.triangularView<(*this)::Mode>() \endcode */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularViewImpl& other)
+    { return *this = other.derived().nestedExpression(); }
+
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    void lazyAssign(const TriangularBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    void lazyAssign(const MatrixBase<OtherDerived>& other);
+#endif
+
+    /** Efficient triangular matrix times vector/matrix product */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<TriangularViewType,OtherDerived>
+    operator*(const MatrixBase<OtherDerived>& rhs) const
+    {
+      return Product<TriangularViewType,OtherDerived>(derived(), rhs.derived());
+    }
+
+    /** Efficient vector/matrix times triangular matrix product */
+    template<typename OtherDerived> friend
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,TriangularViewType>
+    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularViewImpl& rhs)
+    {
+      return Product<OtherDerived,TriangularViewType>(lhs.derived(),rhs.derived());
+    }
+
+    /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+      *
+      * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
+      * \a Side==OnTheRight.
+      *
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+      *
+      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
+      * is an upper (resp. lower) triangular matrix.
+      *
+      * Example: \include Triangular_solve.cpp
+      * Output: \verbinclude Triangular_solve.out
+      *
+      * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
+      * to the same matrix or vector \a other.
+      *
+      * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
+      * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+      *
+      * \sa TriangularView::solveInPlace()
+      */
+    template<int Side, typename Other>
+    inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
+    solve(const MatrixBase<Other>& other) const;
+
+    /** "in-place" version of TriangularView::solve() where the result is written in \a other
+      *
+      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+      * This function will const_cast it, so constness isn't honored here.
+      *
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+      *
+      * See TriangularView:solve() for the details.
+      */
+    template<int Side, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    void solveInPlace(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    void solveInPlace(const MatrixBase<OtherDerived>& other) const
+    { return solveInPlace<OnTheLeft>(other); }
+
+    /** Swaps the coefficients of the common triangular parts of two matrices */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    void swap(TriangularBase<OtherDerived> &other)
+#else
+    void swap(TriangularBase<OtherDerived> const & other)
+#endif
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+    }
+
+    /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    void swap(MatrixBase<OtherDerived> const & other)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+    }
+
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!internal::is_same_dense(dst,rhs))
+        dst = rhs;
+      this->solveInPlace(dst);
+    }
+
+    template<typename ProductType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
+
+};
+
 /***************************************************************************
 * Implementation of triangular evaluation/assignment
 ***************************************************************************/
 
-namespace internal {
-
-template<typename Derived1, typename Derived2, unsigned int Mode, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-  
-  typedef typename Derived1::Scalar Scalar;
-
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    eigen_assert( Mode == Upper || Mode == Lower
-            || Mode == StrictlyUpper || Mode == StrictlyLower
-            || Mode == UnitUpper || Mode == UnitLower);
-    if((Mode == Upper && row <= col)
-    || (Mode == Lower && row >= col)
-    || (Mode == StrictlyUpper && row < col)
-    || (Mode == StrictlyLower && row > col)
-    || (Mode == UnitUpper && row < col)
-    || (Mode == UnitLower && row > col))
-      dst.copyCoeff(row, col, src);
-    else if(ClearOpposite)
-    {
-      if (Mode&UnitDiag && row==col)
-        dst.coeffRef(row, col) = Scalar(1);
-      else
-        dst.coeffRef(row, col) = Scalar(0);
-    }
-  }
-};
-
-// prevent buggy user code from causing an infinite recursion
-template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
-{
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows()-1);
-      for(Index i = 0; i <= maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows());
-      if (ClearOpposite)
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows()-1);
-      if (ClearOpposite)
-        for(Index i = 0; i <= maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
-  }
-};
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = maxi+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
-  }
-};
-
-} // end namespace internal
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
-  if(OtherDerived::Flags & EvalBeforeAssigningBit)
-  {
-    typename internal::plain_matrix_type<OtherDerived>::type other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived());
-  return *this;
+  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
 }
 
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-          && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-          && MatrixType::SizeAtCompileTime*internal::traits<OtherDerived>::CoeffReadCost/2 <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
-
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // do not change the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived());
+  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
 
 
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
-  if(internal::traits<OtherDerived>::Flags & EvalBeforeAssigningBit)
-  {
-    typename OtherDerived::DenseMatrixType other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived().nestedExpression());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived().nestedExpression());
-  return *this;
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-void TriangularView<MatrixType, Mode>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-                   && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-                   && MatrixType::SizeAtCompileTime * internal::traits<OtherDerived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
-
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // preserve the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived().nestedExpression());
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment_no_alias(derived(), other.derived());
 }
+#endif
 
 /***************************************************************************
 * Implementation of TriangularBase methods
@@ -743,39 +613,9 @@
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-EIGEN_DEVICE_FUNC
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
-  if(internal::traits<Derived>::Flags & EvalBeforeAssigningBit)
-  {
-    typename internal::plain_matrix_type<Derived>::type other_evaluated(rows(), cols());
-    evalToLazy(other_evaluated);
-    other.derived().swap(other_evaluated);
-  }
-  else
-    evalToLazy(other.derived());
-}
-
-/** Assigns a triangular or selfadjoint matrix to a dense matrix.
-  * If the matrix is triangular, the opposite part is set to zero. */
-template<typename Derived>
-template<typename DenseDerived>
-EIGEN_DEVICE_FUNC
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
-{
-  enum {
-    unroll = DenseDerived::SizeAtCompileTime != Dynamic
-                   && internal::traits<Derived>::CoeffReadCost != Dynamic
-                   && DenseDerived::SizeAtCompileTime * internal::traits<Derived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  other.derived().resize(this->rows(), this->cols());
-
-  internal::triangular_assignment_selector
-    <DenseDerived, typename internal::traits<Derived>::MatrixTypeNestedCleaned, Derived::Mode,
-    unroll ? int(DenseDerived::SizeAtCompileTime) : Dynamic,
-    true // clear the opposite triangular part
-    >::run(other.derived(), derived().nestedExpression());
+  evalToLazy(other.derived());
 }
 
 /***************************************************************************
@@ -786,49 +626,14 @@
 * Implementation of MatrixBase methods
 ***************************************************************************/
 
-#ifdef EIGEN2_SUPPORT
-
-// implementation of part<>(), including the SelfAdjoint case.
-
-namespace internal {
-template<typename MatrixType, unsigned int Mode>
-struct eigen2_part_return_type
-{
-  typedef TriangularView<MatrixType, Mode> type;
-};
-
-template<typename MatrixType>
-struct eigen2_part_return_type<MatrixType, SelfAdjoint>
-{
-  typedef SelfAdjointView<MatrixType, Upper> type;
-};
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
-{
-  return derived();
-}
-#endif
-
 /**
   * \returns an expression of a triangular view extracted from the current matrix
   *
   * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
   * \c #Lower, \c #StrictlyLower, \c #UnitLower.
   *
-  * Example: \include MatrixBase_extract.cpp
-  * Output: \verbinclude MatrixBase_extract.out
+  * Example: \include MatrixBase_triangularView.cpp
+  * Output: \verbinclude MatrixBase_triangularView.out
   *
   * \sa class TriangularView
   */
@@ -838,7 +643,7 @@
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
-  return derived();
+  return typename TriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** This is the const version of MatrixBase::triangularView() */
@@ -848,7 +653,7 @@
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
-  return derived();
+  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** \returns true if *this is approximately equal to an upper triangular matrix,
@@ -859,21 +664,20 @@
 template<typename Derived>
 bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
 {
-  using std::abs;
   RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
   for(Index j = 0; j < cols(); ++j)
   {
-    Index maxi = (std::min)(j, rows()-1);
+    Index maxi = numext::mini(j, rows()-1);
     for(Index i = 0; i <= maxi; ++i)
     {
-      RealScalar absValue = abs(coeff(i,j));
+      RealScalar absValue = numext::abs(coeff(i,j));
       if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
     }
   }
   RealScalar threshold = maxAbsOnUpperPart * prec;
   for(Index j = 0; j < cols(); ++j)
     for(Index i = j+1; i < rows(); ++i)
-      if(abs(coeff(i, j)) > threshold) return false;
+      if(numext::abs(coeff(i, j)) > threshold) return false;
   return true;
 }
 
@@ -885,24 +689,313 @@
 template<typename Derived>
 bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
 {
-  using std::abs;
   RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
   for(Index j = 0; j < cols(); ++j)
     for(Index i = j; i < rows(); ++i)
     {
-      RealScalar absValue = abs(coeff(i,j));
+      RealScalar absValue = numext::abs(coeff(i,j));
       if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
     }
   RealScalar threshold = maxAbsOnLowerPart * prec;
   for(Index j = 1; j < cols(); ++j)
   {
-    Index maxi = (std::min)(j, rows()-1);
+    Index maxi = numext::mini(j, rows()-1);
     for(Index i = 0; i < maxi; ++i)
-      if(abs(coeff(i, j)) > threshold) return false;
+      if(numext::abs(coeff(i, j)) > threshold) return false;
   }
   return true;
 }
 
+
+/***************************************************************************
+****************************************************************************
+* Evaluators and Assignment of triangular expressions
+***************************************************************************
+***************************************************************************/
+
+namespace internal {
+
+
+// TODO currently a triangular expression has the form TriangularView<.,.>
+//      in the future triangular-ness should be defined by the expression traits
+//      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<TriangularView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
+};
+
+template<typename MatrixType, unsigned int Mode>
+struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
+ : evaluator<typename internal::remove_all<MatrixType>::type>
+{
+  typedef TriangularView<MatrixType,Mode> XprType;
+  typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
+  unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
+};
+
+// Additional assignment kinds:
+struct Triangular2Triangular    {};
+struct Triangular2Dense         {};
+struct Dense2Triangular         {};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
+
+
+/** \internal Specialization of the dense assignment kernel for triangular matrices.
+  * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
+  * \tparam UpLo must be either Lower or Upper
+  * \tparam Mode must be either 0, UnitDiag, ZeroDiag, or SelfAdjoint
+  */
+template<int UpLo, int Mode, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class triangular_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+
+
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    Base::assignCoeff(row,col);
+  }
+#else
+  using Base::assignCoeff;
+#endif
+
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
+  {
+         if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
+    else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
+    else if(Mode==0)                       Base::assignCoeff(id,id);
+  }
+
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    if(SetOpposite)
+      m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
+  }
+};
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+    dst.resize(dstRows, dstCols);
+  DstEvaluatorType dstEvaluator(dst);
+
+  typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
+                                              DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  enum {
+      unroll = DstXprType::SizeAtCompileTime != Dynamic
+            && SrcEvaluatorType::CoeffReadCost < HugeCost
+            && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT
+    };
+
+  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
+}
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src)
+{
+  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+}
+
+template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
+template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Triangular2Dense      Kind; };
+template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };
+
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<SrcXprType::Mode, (int(SrcXprType::Mode) & int(SelfAdjoint)) == 0>(dst, src, func);
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
+  }
+};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool SetOpposite>
+struct triangular_assignment_loop
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+
+  enum {
+    col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
+    row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
+  };
+
+  typedef typename Kernel::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
+
+    if(row==col)
+      kernel.assignDiagonalCoeff(row);
+    else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
+      kernel.assignCoeff(row,col);
+    else if(SetOpposite)
+      kernel.assignOppositeCoeff(row,col);
+  }
+};
+
+// prevent buggy user code from causing an infinite recursion
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &) {}
+};
+
+
+
+// TODO: experiment with a recursive assignment procedure splitting the current
+//       triangular part into one rectangular and two triangular parts.
+
+
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
+{
+  typedef typename Kernel::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    for(Index j = 0; j < kernel.cols(); ++j)
+    {
+      Index maxi = numext::mini(j, kernel.rows());
+      Index i = 0;
+      if (((Mode&Lower) && SetOpposite) || (Mode&Upper))
+      {
+        for(; i < maxi; ++i)
+          if(Mode&Upper) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+      else
+        i = maxi;
+
+      if(i<kernel.rows()) // then i==j
+        kernel.assignDiagonalCoeff(i++);
+
+      if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
+      {
+        for(; i < kernel.rows(); ++i)
+          if(Mode&Lower) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+{
+  other.derived().resize(this->rows(), this->cols());
+  internal::call_triangular_assignment_loop<Derived::Mode, (int(Derived::Mode) & int(SelfAdjoint)) == 0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
+}
+
+namespace internal {
+
+// Triangular = Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst._assignProduct(src, Scalar(1), false);
+  }
+};
+
+// Triangular += Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    dst._assignProduct(src, Scalar(1), true);
+  }
+};
+
+// Triangular -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    dst._assignProduct(src, Scalar(-1), true);
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_TRIANGULARMATRIX_H

diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h
index 216c568..71c5b95 100644
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h

@@ -13,19 +13,29 @@
 
 namespace Eigen { 
 
+namespace internal {
+template<typename VectorType, int Size>
+struct traits<VectorBlock<VectorType, Size> >
+  : public traits<Block<VectorType,
+                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
+{
+};
+}
+
 /** \class VectorBlock
   * \ingroup Core_Module
   *
   * \brief Expression of a fixed-size or dynamic-size sub-vector
   *
-  * \param VectorType the type of the object in which we are taking a sub-vector
-  * \param Size size of the sub-vector we are taking at compile time (optional)
+  * \tparam VectorType the type of the object in which we are taking a sub-vector
+  * \tparam Size size of the sub-vector we are taking at compile time (optional)
   *
   * This class represents an expression of either a fixed-size or dynamic-size sub-vector.
   * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
   * most of the time this is the only way it is used.
   *
-  * However, if you want to directly maniputate sub-vector expressions,
+  * However, if you want to directly manipulate sub-vector expressions,
   * for instance if you want to write a function returning such an expression, you
   * will need to use this class.
   *
@@ -43,17 +53,6 @@
   *
   * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
   */
-
-namespace internal {
-template<typename VectorType, int Size>
-struct traits<VectorBlock<VectorType, Size> >
-  : public traits<Block<VectorType,
-                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
-                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
-{
-};
-}
-
 template<typename VectorType, int Size> class VectorBlock
   : public Block<VectorType,
                      internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
@@ -72,8 +71,8 @@
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline VectorBlock(VectorType& vector, Index start, Index size)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    VectorBlock(VectorType& vector, Index start, Index size)
       : Base(vector,
              IsColVector ? start : 0, IsColVector ? 0 : start,
              IsColVector ? size  : 1, IsColVector ? 1 : size)
@@ -83,8 +82,8 @@
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline VectorBlock(VectorType& vector, Index start)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    VectorBlock(VectorType& vector, Index start)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);

diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 0d76c62..870f4f1 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -41,107 +41,107 @@
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
   typedef typename MatrixType::Scalar InputScalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
-    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
-    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
+    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
     TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
   };
-  #if EIGEN_GNUC_AT_LEAST(3,4)
-  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
-  #else
-  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
-  #endif
-  enum {
-    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
-                  : TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
-  };
 };
 }
 
 template< typename MatrixType, typename MemberOp, int Direction>
-class PartialReduxExpr : internal::no_assignment_operator,
-  public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
+class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
+                         internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
-    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;
 
-    PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+    EIGEN_DEVICE_FUNC
+    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
-    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
-    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
 
-    EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(j));
-      else
-        return m_functor(m_matrix.row(i));
-    }
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::Nested nestedExpression() const { return m_matrix; }
 
-    const Scalar coeff(Index index) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(index));
-      else
-        return m_functor(m_matrix.row(index));
-    }
+    EIGEN_DEVICE_FUNC
+    const MemberOp& functor() const { return m_functor; }
 
   protected:
-    MatrixTypeNested m_matrix;
+    typename MatrixType::Nested m_matrix;
     const MemberOp m_functor;
 };
 
-#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST)                               \
-  template <typename ResultType>                                        \
-  struct member_##MEMBER {                                              \
-    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                            \
-    typedef ResultType result_type;                                     \
-    template<typename Scalar, int Size> struct Cost                     \
-    { enum { value = COST }; };                                         \
-    template<typename XprType>                                          \
-    EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const \
-    { return mat.MEMBER(); } \
+template<typename A,typename B> struct partial_redux_dummy_func;
+
+#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP)                \
+  template <typename ResultType,typename Scalar>                                                            \
+  struct member_##MEMBER {                                                                  \
+    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                                                \
+    typedef ResultType result_type;                                                         \
+    typedef BINARYOP<Scalar,Scalar> BinaryOp;   \
+    template<int Size> struct Cost { enum { value = COST }; };             \
+    enum { Vectorizable = VECTORIZABLE };                                                   \
+    template<typename XprType>                                                              \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                   \
+    ResultType operator()(const XprType& mat) const                                         \
+    { return mat.MEMBER(); }                                                                \
+    BinaryOp binaryFunc() const { return BinaryOp(); }                                      \
   }
 
+#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
+  EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func)
+
 namespace internal {
 
-EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
-EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
-EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
 
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
 
-template <typename BinaryOp, typename Scalar>
+template <int p, typename ResultType,typename Scalar>
+struct member_lpnorm {
+  typedef ResultType result_type;
+  enum { Vectorizable = 0 };
+  template<int Size> struct Cost
+  { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
+  EIGEN_DEVICE_FUNC member_lpnorm() {}
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const
+  { return mat.template lpNorm<p>(); }
+};
+
+template <typename BinaryOpT, typename Scalar>
 struct member_redux {
+  typedef BinaryOpT BinaryOp;
   typedef typename result_of<
-                     BinaryOp(Scalar)
+                     BinaryOp(const Scalar&,const Scalar&)
                    >::type  result_type;
-  template<typename _Scalar, int Size> struct Cost
-  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
-  member_redux(const BinaryOp func) : m_functor(func) {}
+
+  enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
+  template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
+  EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
   template<typename Derived>
-  inline result_type operator()(const DenseBase<Derived>& mat) const
+  EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
   { return mat.redux(m_functor); }
+  const BinaryOp& binaryFunc() const { return m_functor; }
   const BinaryOp m_functor;
 };
 }
@@ -149,18 +149,38 @@
 /** \class VectorwiseOp
   * \ingroup Core_Module
   *
-  * \brief Pseudo expression providing partial reduction operations
+  * \brief Pseudo expression providing broadcasting and partial reduction operations
   *
-  * \param ExpressionType the type of the object on which to do partial reductions
-  * \param Direction indicates the direction of the redux (#Vertical or #Horizontal)
+  * \tparam ExpressionType the type of the object on which to do partial reductions
+  * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal)
   *
-  * This class represents a pseudo expression with partial reduction features.
+  * This class represents a pseudo expression with broadcasting and partial reduction features.
   * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
-  * and most of the time this is the only way it is used.
+  * and most of the time this is the only way it is explicitly used.
+  *
+  * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()`
+  * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
+  * column of `A` and then re-assemble the outputs in a matrix expression:
+  * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
   *
   * Example: \include MatrixBase_colwise.cpp
   * Output: \verbinclude MatrixBase_colwise.out
   *
+  * The begin() and end() methods are obviously exceptions to the previous rule as they
+  * return STL-compatible begin/end iterators to the rows or columns of the nested expression.
+  * Typical use cases include for-range-loop and calls to STL algorithms:
+  *
+  * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
+  * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
+  *
+  * For a partial reduction on an empty input, some rules apply.
+  * For the sake of clarity, let's consider a vertical reduction:
+  *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+  *   - Otherwise, if the number of rows is zero, then
+  *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+  *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
+  *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
+  *
   * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
   */
 template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -169,16 +189,15 @@
 
     typedef typename ExpressionType::Scalar Scalar;
     typedef typename ExpressionType::RealScalar RealScalar;
-    typedef typename ExpressionType::Index Index;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
     typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
 
-    template<template<typename _Scalar> class Functor,
-                      typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
+    template<template<typename OutScalar,typename InputScalar> class Functor,
+                      typename ReturnScalar=Scalar> struct ReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               Functor<Scalar>,
+                               Functor<ReturnScalar,Scalar>,
                                Direction
                               > Type;
     };
@@ -186,119 +205,198 @@
     template<typename BinaryOp> struct ReduxReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
+                               internal::member_redux<BinaryOp,Scalar>,
                                Direction
                               > Type;
     };
 
     enum {
-      IsVertical   = (Direction==Vertical) ? 1 : 0,
-      IsHorizontal = (Direction==Horizontal) ? 1 : 0
+      isVertical   = (Direction==Vertical) ? 1 : 0,
+      isHorizontal = (Direction==Horizontal) ? 1 : 0
     };
 
   protected:
 
-    /** \internal
-      * \returns the i-th subvector according to the \c Direction */
-    typedef typename internal::conditional<Direction==Vertical,
-                               typename ExpressionType::ColXpr,
-                               typename ExpressionType::RowXpr>::type SubVector;
-    SubVector subVector(Index i)
-    {
-      return SubVector(m_matrix.derived(),i);
-    }
-
-    /** \internal
-      * \returns the number of subvectors in the direction \c Direction */
-    Index subVectors() const
-    { return Direction==Vertical?m_matrix.cols():m_matrix.rows(); }
-
     template<typename OtherDerived> struct ExtendedType {
       typedef Replicate<OtherDerived,
-                        Direction==Vertical   ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Horizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
+                        isHorizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
     };
 
     /** \internal
       * Replicates a vector to match the size of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename ExtendedType<OtherDerived>::Type
     extendedTo(const DenseBase<OtherDerived>& other) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1),
                           YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1),
                           YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
       return typename ExtendedType<OtherDerived>::Type
                       (other.derived(),
-                       Direction==Vertical   ? 1 : m_matrix.rows(),
-                       Direction==Horizontal ? 1 : m_matrix.cols());
+                       isVertical   ? 1 : m_matrix.rows(),
+                       isHorizontal ? 1 : m_matrix.cols());
     }
 
     template<typename OtherDerived> struct OppositeExtendedType {
       typedef Replicate<OtherDerived,
-                        Direction==Horizontal ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Vertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                        isVertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
     };
 
     /** \internal
       * Replicates a vector in the opposite direction to match the size of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename OppositeExtendedType<OtherDerived>::Type
     extendedToOpposite(const DenseBase<OtherDerived>& other) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1),
                           YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1),
                           YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
       return typename OppositeExtendedType<OtherDerived>::Type
                       (other.derived(),
-                       Direction==Horizontal  ? 1 : m_matrix.rows(),
-                       Direction==Vertical    ? 1 : m_matrix.cols());
+                       isHorizontal  ? 1 : m_matrix.rows(),
+                       isVertical    ? 1 : m_matrix.cols());
     }
 
   public:
-
-    inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     inline const ExpressionType& _expression() const { return m_matrix; }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type over the columns or rows as returned by the begin() and end() methods.
+      */
+    random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    random_access_iterator_type const_iterator;
+    #else
+    typedef internal::subvector_stl_iterator<ExpressionType,               DirectionType(Direction)> iterator;
+    typedef internal::subvector_stl_iterator<const ExpressionType,         DirectionType(Direction)> const_iterator;
+    typedef internal::subvector_stl_reverse_iterator<ExpressionType,       DirectionType(Direction)> reverse_iterator;
+    typedef internal::subvector_stl_reverse_iterator<const ExpressionType, DirectionType(Direction)> const_reverse_iterator;
+    #endif
+
+    /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
+      * \sa end(), cbegin()
+      */
+    iterator                 begin()       { return iterator      (m_matrix, 0); }
+    /** const version of begin() */
+    const_iterator           begin() const { return const_iterator(m_matrix, 0); }
+    /** const version of begin() */
+    const_iterator          cbegin() const { return const_iterator(m_matrix, 0); }
+
+    /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression.
+      * \sa rend(), crbegin()
+      */
+    reverse_iterator        rbegin()       { return reverse_iterator       (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+	/** const version of rbegin() */
+    const_reverse_iterator  rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+	/** const version of rbegin() */
+	const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+
+    /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
+      * \sa begin(), cend()
+      */
+    iterator                 end()         { return iterator      (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+    /** const version of end() */
+    const_iterator           end()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+    /** const version of end() */
+    const_iterator          cend()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+
+    /** returns a reverse iterator to the row (resp. column) before the first row (resp. column) of the nested expression
+      * \sa begin(), cend()
+      */
+    reverse_iterator        rend()         { return reverse_iterator       (m_matrix, -1); }
+    /** const version of rend() */
+    const_reverse_iterator  rend()  const  { return const_reverse_iterator (m_matrix, -1); }
+    /** const version of rend() */
+    const_reverse_iterator crend()  const  { return const_reverse_iterator (m_matrix, -1); }
+
     /** \returns a row or column vector expression of \c *this reduxed by \a func
       *
       * The template parameter \a BinaryOp is the type of the functor
       * of the custom redux operator. Note that func must be an associative operator.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
     const typename ReduxReturnType<BinaryOp>::Type
     redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
+    }
+
+    typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
+    typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
+    typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,internal::member_sum<RealScalar,RealScalar>,Direction> SquaredNormReturnType;
+    typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
+    typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
+    typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
+    typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
+    typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
+    typedef typename ReturnType<internal::member_all>::Type AllReturnType;
+    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
+    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
+    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+    typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
+    typedef Reverse<ExpressionType, Direction> ReverseReturnType;
+
+    template<int p> struct LpNormReturnType {
+      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar,Scalar>,Direction> Type;
+    };
 
     /** \returns a row (or column) vector expression of the smallest coefficient
       * of each column (or row) of the referenced expression.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
       * Output: \verbinclude PartialRedux_minCoeff.out
       *
       * \sa DenseBase::minCoeff() */
-    const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MinCoeffReturnType minCoeff() const
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MinCoeffReturnType(_expression());
+    }
 
     /** \returns a row (or column) vector expression of the largest coefficient
       * of each column (or row) of the referenced expression.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
       * Output: \verbinclude PartialRedux_maxCoeff.out
       *
       * \sa DenseBase::maxCoeff() */
-    const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MaxCoeffReturnType maxCoeff() const
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MaxCoeffReturnType(_expression());
+    }
 
     /** \returns a row (or column) vector expression of the squared norm
       * of each column (or row) of the referenced expression.
@@ -308,8 +406,9 @@
       * Output: \verbinclude PartialRedux_squaredNorm.out
       *
       * \sa DenseBase::squaredNorm() */
-    const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const SquaredNormReturnType squaredNorm() const
+    { return SquaredNormReturnType(m_matrix.cwiseAbs2()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
@@ -319,8 +418,22 @@
       * Output: \verbinclude PartialRedux_norm.out
       *
       * \sa DenseBase::norm() */
-    const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const NormReturnType norm() const
+    { return NormReturnType(squaredNorm()); }
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * Example: \include PartialRedux_norm.cpp
+      * Output: \verbinclude PartialRedux_norm.out
+      *
+      * \sa DenseBase::norm() */
+    template<int p>
+    EIGEN_DEVICE_FUNC
+    const typename LpNormReturnType<p>::Type lpNorm() const
+    { return typename LpNormReturnType<p>::Type(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
@@ -329,8 +442,9 @@
       * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::blueNorm() */
-    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const BlueNormReturnType blueNorm() const
+    { return BlueNormReturnType(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
@@ -339,8 +453,9 @@
       * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::stableNorm() */
-    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const StableNormReturnType stableNorm() const
+    { return StableNormReturnType(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
@@ -349,8 +464,9 @@
       * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::hypotNorm() */
-    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const HypotNormReturnType hypotNorm() const
+    { return HypotNormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the sum
       * of each column (or row) of the referenced expression.
@@ -359,31 +475,35 @@
       * Output: \verbinclude PartialRedux_sum.out
       *
       * \sa DenseBase::sum() */
-    const typename ReturnType<internal::member_sum>::Type sum() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const SumReturnType sum() const
+    { return SumReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the mean
     * of each column (or row) of the referenced expression.
     *
     * \sa DenseBase::mean() */
-    const typename ReturnType<internal::member_mean>::Type mean() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MeanReturnType mean() const
+    { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b all coefficients of each respective column (or row) are \c true.
       * This expression can be assigned to a vector with entries of type \c bool.
       *
       * \sa DenseBase::all() */
-    const typename ReturnType<internal::member_all>::Type all() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const AllReturnType all() const
+    { return AllReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b at \b least one coefficient of each respective column (or row) is \c true.
       * This expression can be assigned to a vector with entries of type \c bool.
       *
       * \sa DenseBase::any() */
-    const typename ReturnType<internal::member_any>::Type any() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const AnyReturnType any() const
+    { return AnyReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * the number of \c true coefficients of each respective column (or row).
@@ -394,8 +514,9 @@
       * Output: \verbinclude PartialRedux_count.out
       *
       * \sa DenseBase::count() */
-    const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const CountReturnType count() const
+    { return CountReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the product
       * of each column (or row) of the referenced expression.
@@ -404,8 +525,9 @@
       * Output: \verbinclude PartialRedux_prod.out
       *
       * \sa DenseBase::prod() */
-    const typename ReturnType<internal::member_prod>::Type prod() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const ProdReturnType prod() const
+    { return ProdReturnType(_expression()); }
 
 
     /** \returns a matrix expression
@@ -415,10 +537,20 @@
       * Output: \verbinclude Vectorwise_reverse.out
       *
       * \sa DenseBase::reverse() */
-    const Reverse<ExpressionType, Direction> reverse() const
-    { return Reverse<ExpressionType, Direction>( _expression() ); }
+    EIGEN_DEVICE_FUNC
+    const ConstReverseReturnType reverse() const
+    { return ConstReverseReturnType( _expression() ); }
 
-    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
+    /** \returns a writable matrix expression
+      * where each column (or row) are reversed.
+      *
+      * \sa reverse() const */
+    EIGEN_DEVICE_FUNC
+    ReverseReturnType reverse()
+    { return ReverseReturnType( _expression() ); }
+
+    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
+    EIGEN_DEVICE_FUNC
     const ReplicateReturnType replicate(Index factor) const;
 
     /**
@@ -430,68 +562,75 @@
       * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
       */
     // NOTE implemented here because of sunstudio's compilation errors
-    template<int Factor> const Replicate<ExpressionType,(IsVertical?Factor:1),(IsHorizontal?Factor:1)>
+    // isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to handle CUDA bug with ternary operator
+    template<int Factor> const Replicate<ExpressionType,isVertical*Factor+isHorizontal,isHorizontal*Factor+isVertical>
+    EIGEN_DEVICE_FUNC
     replicate(Index factor = Factor) const
     {
-      return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
-          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+      return Replicate<ExpressionType,(isVertical?Factor:1),(isHorizontal?Factor:1)>
+          (_expression(),isVertical?factor:1,isHorizontal?factor:1);
     }
 
 /////////// Artithmetic operators ///////////
 
     /** Copies the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
-      return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
+      return m_matrix = extendedTo(other.derived());
     }
 
     /** Adds the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
+      return m_matrix += extendedTo(other.derived());
     }
 
     /** Substracts the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
+      return m_matrix -= extendedTo(other.derived());
     }
 
     /** Multiples each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix *= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
+      return m_matrix;
     }
 
     /** Divides each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix /= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
+      return m_matrix;
     }
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
-    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_sum_op<Scalar,typename OtherDerived::Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
     operator+(const DenseBase<OtherDerived>& other) const
@@ -503,7 +642,8 @@
 
     /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
     template<typename OtherDerived>
-    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+    EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_difference_op<Scalar,typename OtherDerived::Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
     operator-(const DenseBase<OtherDerived>& other) const
@@ -515,10 +655,11 @@
 
     /** Returns the expression where each subvector is the product of the vector \a other
       * by the corresponding subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_product_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
+    EIGEN_DEVICE_FUNC
     operator*(const DenseBase<OtherDerived>& other) const
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -530,6 +671,7 @@
     /** Returns the expression where each subvector is the quotient of the corresponding
       * subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
@@ -541,33 +683,35 @@
       return m_matrix / extendedTo(other.derived());
     }
 
-    /** \returns an expression where each column of row of the referenced matrix are normalized.
+    /** \returns an expression where each column (or row) of the referenced matrix are normalized.
       * The referenced matrix is \b not modified.
       * \sa MatrixBase::normalized(), normalize()
       */
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
-                  const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
+                  const typename OppositeExtendedType<NormReturnType>::Type>
     normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
 
 
     /** Normalize in-place each row or columns of the referenced matrix.
       * \sa MatrixBase::normalize(), normalized()
       */
-    void normalize() {
+    EIGEN_DEVICE_FUNC void normalize() {
       m_matrix = this->normalized();
     }
 
-    inline void reverseInPlace();
+    EIGEN_DEVICE_FUNC inline void reverseInPlace();
 
 /////////// Geometry module ///////////
 
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    Homogeneous<ExpressionType,Direction> homogeneous() const;
-    #endif
+    typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
+    EIGEN_DEVICE_FUNC
+    HomogeneousReturnType homogeneous() const;
 
     typedef typename ExpressionType::PlainObject CrossReturnType;
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
 
     enum {
@@ -592,60 +736,47 @@
                   Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
             HNormalizedReturnType;
 
+    EIGEN_DEVICE_FUNC
     const HNormalizedReturnType hnormalized() const;
 
+#   ifdef EIGEN_VECTORWISEOP_PLUGIN
+#     include EIGEN_VECTORWISEOP_PLUGIN
+#   endif
+
   protected:
+    Index redux_length() const
+    {
+      return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
+    }
     ExpressionTypeNested m_matrix;
 };
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_colwise.cpp
-  * Output: \verbinclude MatrixBase_colwise.out
-  *
-  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstColwiseReturnType
-DenseBase<Derived>::colwise() const
-{
-  return derived();
-}
+//const colwise moved to DenseBase.h due to CUDA compiler bug
+
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
   *
   * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
-  return derived();
+  return ColwiseReturnType(derived());
 }
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_rowwise.cpp
-  * Output: \verbinclude MatrixBase_rowwise.out
-  *
-  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstRowwiseReturnType
-DenseBase<Derived>::rowwise() const
-{
-  return derived();
-}
+//const rowwise moved to DenseBase.h due to CUDA compiler bug
+
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
   *
   * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
-  return derived();
+  return RowwiseReturnType(derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 1c4c937..00bcca8 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_VISITOR_H
 #define EIGEN_VISITOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -22,6 +22,7 @@
     row = (UnrollCount-1) % Derived::RowsAtCompileTime
   };
 
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived &mat, Visitor& visitor)
   {
     visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
@@ -32,16 +33,25 @@
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived &mat, Visitor& visitor)
   {
     return visitor.init(mat.coeff(0, 0), 0, 0);
   }
 };
 
+// This specialization enables visitors on empty matrices at compile-time
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 0> {
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
+  {}
+};
+
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
-  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived& mat, Visitor& visitor)
   {
     visitor.init(mat.coeff(0,0), 0, 0);
@@ -53,6 +63,33 @@
   }
 };
 
+// evaluator adaptor
+template<typename XprType>
+class visitor_evaluator
+{
+public:
+  EIGEN_DEVICE_FUNC
+  explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
+};
 } // end namespace internal
 
 /** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
@@ -70,6 +107,8 @@
   * \note compared to one or two \em for \em loops, visitors offer automatic
   * unrolling for small fixed size matrix.
   *
+  * \note if the matrix is empty, then the visitor is left unchanged.
+  *
   * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
   */
 template<typename Derived>
@@ -77,14 +116,17 @@
 EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
-  enum { unroll = SizeAtCompileTime != Dynamic
-                   && CoeffReadCost != Dynamic
-                   && (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
-                   && SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
-                      <= EIGEN_UNROLLING_LIMIT };
-  return internal::visitor_impl<Visitor, Derived,
-      unroll ? int(SizeAtCompileTime) : Dynamic
-    >::run(derived(), visitor);
+  if(size()==0)
+    return;
+
+  typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+
+  enum {
+    unroll =  SizeAtCompileTime != Dynamic
+           && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits<Visitor>::Cost) <= EIGEN_UNROLLING_LIMIT
+  };
+  return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor);
 }
 
 namespace internal {
@@ -95,10 +137,13 @@
 template <typename Derived>
 struct coeff_visitor
 {
-  typedef typename Derived::Index Index;
+  // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+  EIGEN_DEVICE_FUNC
+  coeff_visitor() : row(-1), col(-1), res(0) {}
   typedef typename Derived::Scalar Scalar;
   Index row, col;
   Scalar res;
+  EIGEN_DEVICE_FUNC
   inline void init(const Scalar& value, Index i, Index j)
   {
     res = value;
@@ -112,11 +157,11 @@
   *
   * \sa DenseBase::minCoeff(Index*, Index*)
   */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
     if(value < this->res)
@@ -128,8 +173,40 @@
   }
 };
 
-template<typename Scalar>
-struct functor_traits<min_coeff_visitor<Scalar> > {
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value < this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+    struct functor_traits<min_coeff_visitor<Scalar, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost
   };
@@ -140,11 +217,11 @@
   *
   * \sa DenseBase::maxCoeff(Index*, Index*)
   */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
     if(value > this->res)
@@ -156,8 +233,40 @@
   }
 };
 
-template<typename Scalar>
-struct functor_traits<max_coeff_visitor<Scalar> > {
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value > this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+struct functor_traits<max_coeff_visitor<Scalar, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost
   };
@@ -165,18 +274,26 @@
 
 } // end namespace internal
 
-/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
-  * \warning the result is undefined if \c *this contains NaN.
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
   *
-  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
-  internal::min_coeff_visitor<Derived> minVisitor;
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
   this->visit(minVisitor);
   *rowId = minVisitor.row;
   if (colId) *colId = minVisitor.col;
@@ -184,35 +301,50 @@
 }
 
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
-  * \warning the result is undefined if \c *this contains NaN. 
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::min_coeff_visitor<Derived> minVisitor;
+      internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
   this->visit(minVisitor);
-  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
+  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
   return minVisitor.res;
 }
 
-/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
-  * \warning the result is undefined if \c *this contains NaN. 
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
-  internal::max_coeff_visitor<Derived> maxVisitor;
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
   this->visit(maxVisitor);
   *rowPtr = maxVisitor.row;
   if (colPtr) *colPtr = maxVisitor.col;
@@ -220,18 +352,25 @@
 }
 
 /** \returns the maximum of all coefficients of *this and puts in *index its location.
-  * \warning the result is undefined if \c *this contains NaN.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::max_coeff_visitor<Derived> maxVisitor;
+      internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
   this->visit(maxVisitor);
   *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
   return maxVisitor.res;

diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index e98c40e..e9096c0 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h

@@ -22,6 +22,7 @@
   __m256  v;
 };
 
+#ifndef EIGEN_VECTORIZE_AVX512
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet4cf type;
@@ -37,6 +38,7 @@
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -44,8 +46,20 @@
     HasSetLinear = 0
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet8f as_real;
+  enum {
+    size=4,
+    alignment=Aligned32,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -67,10 +81,17 @@
   return Packet4cf(result);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
@@ -78,7 +99,9 @@
 
 template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
 {
-  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
@@ -92,7 +115,7 @@
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
 {
   return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
                                  std::imag(from[2*stride]), std::real(from[2*stride]),
@@ -100,7 +123,7 @@
                                  std::imag(from[0*stride]), std::real(from[0*stride])));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
 {
   __m128 low = _mm256_extractf128_ps(from.v, 0);
   to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
@@ -140,87 +163,13 @@
                      Packet2cf(_mm256_extractf128_ps(a.v,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
-{
-  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  t0 = _mm256_hadd_ps(t0,t1);
-  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  t2 = _mm256_hadd_ps(t2,t3);
-  
-  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
-  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
-
-  return Packet4cf(_mm256_add_ps(t1,t3));
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
 {
   return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
                          Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet4cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
-  {
-    if (Offset==0) return;
-    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
-  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
-  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
 {
@@ -244,6 +193,7 @@
   __m256d  v;
 };
 
+#ifndef EIGEN_VECTORIZE_AVX512
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet2cd type;
@@ -259,6 +209,7 @@
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -266,8 +217,20 @@
     HasSetLinear = 0
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet4d as_real;
+  enum {
+    size=2,
+    alignment=Aligned32,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -288,10 +251,17 @@
   return Packet2cd(_mm256_addsub_pd(even, odd));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
 { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
@@ -310,13 +280,13 @@
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride)
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
 {
   return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
 				 std::imag(from[0*stride]), std::real(from[0*stride])));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
 {
   __m128d low = _mm256_extractf128_pd(from.v, 0);
   to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
@@ -343,80 +313,13 @@
                      Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
-{
-  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
-  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
-
-  return Packet2cd(_mm256_add_pd(t0,t1));
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
 {
   return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                      Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
-  {
-    if (Offset==0) return;
-    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
-  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
-  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
 {
@@ -431,7 +334,7 @@
   return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4cf,4>& kernel) {
   __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
   __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
@@ -449,13 +352,21 @@
   kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2cd,2>& kernel) {
   __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
   kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
  kernel.packet[0].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+  return psqrt_complex<Packet2cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+  return psqrt_complex<Packet4cf>(a);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 62ea574..67041c8 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h

@@ -10,12 +10,7 @@
 #ifndef EIGEN_MATH_FUNCTIONS_AVX_H
 #define EIGEN_MATH_FUNCTIONS_AVX_H
 
-// For some reason, this function didn't make it into the avxintirn.h
-// used by the compiler, so we'll just wrap it.
-#define _mm256_setr_m128(lo, hi) \
-  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
-
-/* The sin, cos, exp, and log functions of this file are loosely derived from
+/* The sin and cos functions of this file are loosely derived from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
@@ -23,183 +18,50 @@
 
 namespace internal {
 
-// Sine function
-// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
-// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
-// are (anti-)symmetric and thus have only odd/even coefficients
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 psin<Packet8f>(const Packet8f& _x) {
-  Packet8f x = _x;
-
-  // Some useful values.
-  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
-  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
-  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
-  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00);
-  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04);
-  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07);
-  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00);
-
-  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
-  Packet8f z = pmul(x, p8f_one_over_pi);
-  Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
-  x = pmadd(shift, p8f_neg_pi_first, x);
-  x = pmadd(shift, p8f_neg_pi_second, x);
-  x = pmadd(shift, p8f_neg_pi_third, x);
-  z = pmul(x, p8f_four_over_pi);
-
-  // Make a mask for the entries that need flipping, i.e. wherever the shift
-  // is odd.
-  Packet8i shift_ints = _mm256_cvtps_epi32(shift);
-  Packet8i shift_isodd =
-      (__m256i)_mm256_and_ps((__m256)shift_ints, (__m256)p8i_one);
-#ifdef EIGEN_VECTORIZE_AVX2
-  Packet8i sign_flip_mask = _mm256_slli_epi32(shift_isodd, 31);
-#else
-  __m128i lo =
-      _mm_slli_epi32(_mm256_extractf128_si256((__m256i)shift_isodd, 0), 31);
-  __m128i hi =
-      _mm_slli_epi32(_mm256_extractf128_si256((__m256i)shift_isodd, 1), 31);
-  Packet8i sign_flip_mask = _mm256_setr_m128(lo, hi);
-#endif
-
-  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
-  // is set to ones for that entry.
-  Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
-
-  // Evaluate the polynomial for the interval [1,3] in z.
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04);
-  Packet8f z_minus_two = psub(z, p8f_two);
-  Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
-  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
-  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
-  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
-
-  // Evaluate the polynomial for the interval [-1,1] in z.
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05);
-  Packet8f z2 = pmul(z, z);
-  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
-  left = pmadd(left, z2, p8f_coeff_left_3);
-  left = pmadd(left, z2, p8f_coeff_left_1);
-  left = pmul(left, z);
-
-  // Assemble the results, i.e. select the left and right polynomials.
-  left = _mm256_andnot_ps(ival_mask, left);
-  right = _mm256_and_ps(ival_mask, right);
-  Packet8f res = _mm256_or_ps(left, right);
-
-  // Flip the sign on the odd intervals and return the result.
-  res = _mm256_xor_ps(res, (__m256)sign_flip_mask);
-  return res;
+  return psin_float(_x);
 }
 
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-//               polynomial interpolants -> ... -> profit!
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pcos<Packet8f>(const Packet8f& _x) {
+  return pcos_float(_x);
+}
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 plog<Packet8f>(const Packet8f& _x) {
-  Packet8f x = _x;
-  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
+  return plog_float(_x);
+}
 
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog<Packet4d>(const Packet4d& _x) {
+  return plog_double(_x);
+}
 
-  // The smallest non denormalized float number.
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+plog2<Packet8f>(const Packet8f& _x) {
+  return plog2_float(_x);
+}
 
-  // Polynomial coefficients.
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog2<Packet4d>(const Packet4d& _x) {
+  return plog2_double(_x);
+}
 
-  // invalid_mask is set to true when x is NaN
-  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ);
-  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f plog1p<Packet8f>(const Packet8f& _x) {
+  return generic_plog1p(_x);
+}
 
-  // Truncate input values to the minimum positive normal.
-  x = pmax(x, p8f_min_norm_pos);
-
-// Extract the shifted exponents (No bitwise shifting in regular AVX, so
-// convert to SSE and do it there).
-#ifdef EIGEN_VECTORIZE_AVX2
-  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32((__m256i)x, 23));
-#else
-  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 0), 23);
-  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 1), 23);
-  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi));
-#endif
-  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
-
-  // Set the exponents to -1, i.e. x are in the range [0.5,1).
-  x = _mm256_and_ps(x, p8f_inv_mant_mask);
-  x = _mm256_or_ps(x, p8f_half);
-
-  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet8f tmp = _mm256_and_ps(x, mask);
-  x = psub(x, p8f_1);
-  e = psub(e, _mm256_and_ps(p8f_1, mask));
-  x = padd(x, tmp);
-
-  Packet8f x2 = pmul(x, x);
-  Packet8f x3 = pmul(x2, x);
-
-  // Evaluate the polynomial approximant of degree 8 in three parts, probably
-  // to improve instruction-level parallelism.
-  Packet8f y, y1, y2;
-  y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
-  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
-  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
-  y = pmadd(y, x, p8f_cephes_log_p2);
-  y1 = pmadd(y1, x, p8f_cephes_log_p5);
-  y2 = pmadd(y2, x, p8f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
-  y1 = pmul(e, p8f_cephes_log_q1);
-  tmp = pmul(x2, p8f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p8f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm256_or_ps(
-      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
-      _mm256_and_ps(iszero_mask, p8f_minus_inf));
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
+  return generic_expm1(_x);
 }
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -208,331 +70,155 @@
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 pexp<Packet8f>(const Packet8f& _x) {
-  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
-
-  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
-
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
-
-  // Clamp x.
-  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
-
-  // Express exp(x) as exp(m*ln(2) + r), start by extracting
-  // m = floor(x/ln(2) + 0.5).
-  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
-
-// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-// truncation errors. Note that we don't use the "pmadd" function here to
-// ensure that a precision-preserving FMA instruction is used.
-#ifdef EIGEN_VECTORIZE_FMA
-  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
-  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
-#else
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
-  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
-  r = psub(r, pmul(m, p8f_cephes_exp_C2));
-#endif
-
-  Packet8f r2 = pmul(r, r);
-
-  // TODO(gonnet): Split into odd/even polynomials and try to exploit
-  //               instruction-level parallelism.
-  Packet8f y = p8f_cephes_exp_p0;
-  y = pmadd(y, r, p8f_cephes_exp_p1);
-  y = pmadd(y, r, p8f_cephes_exp_p2);
-  y = pmadd(y, r, p8f_cephes_exp_p3);
-  y = pmadd(y, r, p8f_cephes_exp_p4);
-  y = pmadd(y, r, p8f_cephes_exp_p5);
-  y = pmadd(y, r2, r);
-  y = padd(y, p8f_1);
-
-  // Build emm0 = 2^m.
-  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
-#ifdef EIGEN_VECTORIZE_AVX2
-  emm0 = _mm256_slli_epi32(emm0, 23);
-#else
-  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23);
-  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23);
-  emm0 = _mm256_setr_m128(lo, hi);
-#endif
-
-  // Return 2^m * exp(r).
-  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
+  return pexp_float(_x);
 }
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+ptanh<Packet8f>(const Packet8f& _x) {
+  return internal::generic_fast_tanh_float(_x);
+}
+
+// Exponential function for doubles.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
 pexp<Packet4d>(const Packet4d& _x) {
-  Packet4d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
-  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
-  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
-  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
-  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-
-  Packet4d tmp, fx;
-
-  // clamp x
-  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
-  // Express exp(x) as exp(g + n*log(2)).
-  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
-
-  // Get the integer modulus of log(2), i.e. the "n" described above.
-  fx = _mm256_floor_pd(fx);
-
-  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
-  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
-  // digits right.
-  tmp = pmul(fx, p4d_cephes_exp_C1);
-  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet4d x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet4d px = p4d_cephes_exp_p0;
-  px = pmadd(px, x2, p4d_cephes_exp_p1);
-  px = pmadd(px, x2, p4d_cephes_exp_p2);
-  px = pmul(px, x);
-
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet4d qx = p4d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
-
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = _mm256_div_pd(px, psub(qx, px));
-  x = pmadd(p4d_2, x, p4d_1);
-
-  // Build e=2^n by constructing the exponents in a 128-bit vector and
-  // shifting them to where they belong in double-precision values.
-  __m128i emm0 = _mm256_cvtpd_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_1023);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
-  __m128i lo = _mm_slli_epi64(emm0, 52);
-  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
-  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
-  e = _mm256_insertf128_si256(e, hi, 1);
-
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pmul(x, Packet4d(e)), _x);
+  return pexp_double(_x);
 }
 
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
-// exact solution. The main advantage of this approach is not just speed, but
-// also the fact that it can be inlined and pipelined with other computations,
-// further reducing its effective latency.
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psqrt<Packet8f>(const Packet8f& _x) {
-  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
-
-  Packet8f neg_half = pmul(_x, p8f_minus_half);
-  Packet8f denormal_mask =
-      _mm256_and_ps(_mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ),
-                    _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
+  Packet8f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet8f x = _mm256_rsqrt_ps(_x);
-
   // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
-
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  x = pmul(_x, x);
-
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm256_andnot_ps(denormal_mask, x);
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
-  return _mm256_sqrt_ps(x);
-}
-#endif
-template <>
-EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
-  return _mm256_sqrt_pd(x);
+  return pandnot(pmul(_x,x), denormal_mask);
 }
 
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version since there is no instruction for diretly computing the
-// reciprocal square root in AVX/AVX2 (there will be one in AVX-512).
-#ifdef EIGEN_FAST_MATH
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-prsqrt<Packet8f>(const Packet8f& _x) {
+#else
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  return _mm256_sqrt_ps(_x);
+}
+
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& _x) {
+  return _mm256_sqrt_pd(_x);
+}
+
+#if EIGEN_FAST_MATH
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
 
-  // Remeber which entries were zero (or almost).
-  Packet8f is_zero =
-      _mm256_and_ps(_mm256_cmp_ps(_x, p8f_flt_min, _CMP_NGE_UQ),
-                    _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
 
   // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs).
-  Packet8f x = _mm256_rsqrt_ps(_x);
+  // flushed to zero and cause infs as well).
+  Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f inf_mask =  _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
+  Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
 
-  // Do a single step of Newton's iteration.
-  Packet8f neg_half = pmul(_x, p8f_minus_half);
-  return _mm256_blendv_ps(
-      pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)), p8f_inf,
-      is_zero);
+  // Compute an approximate result using the rsqrt intrinsic.
+  Packet8f y_approx = _mm256_rsqrt_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive normal arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
+  // x is zero or a positive denormalized float (equivalent to flushing positive
+  // denormalized inputs to zero).
+  return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
 }
+
 #else
-template <>
-EIGEN_STRONG_INLINE Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
   _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x));
 }
 #endif
-template <>
-EIGEN_STRONG_INLINE Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& _x) {
   _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
-  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
 }
 
-// Functions for division.
-// The EIGEN_FAST_MATH version uses the _mm_rcp_ps approximation and one step of
-// Newton's method, at a cost of 1-2 bits of precision as opposed to the exact
-// solution. The main advantage of this approach is not just speed, but also the
-// fact that it can be inlined and pipelined with other computations, further
-// reducing its effective latency.
-#if EIGEN_FAST_DIV
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
+
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
-  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
-
-  /* Start with an estimate of the reciprocal of b. */
-  Packet8f x = _mm256_rcp_ps(b);
-
-  /* One step of Newton's method on b - x^-1 == 0. */
-#ifdef EIGEN_VECTORIZE_FMA
-  x = pmul(x, _mm256_fnmadd_ps(b, x, p8f_two));
-#else
-  x = pmul(x, pmadd(-b, x, p8f_two));
-#endif
-
-  // Multiply the inverse of b with a.
-  return pmul(a, x);
+EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
+  Packet8f fexponent;
+  const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
 }
-#else
+
 template <>
-EIGEN_STRONG_INLINE Packet8f
-pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
-  return _mm256_div_ps(a, b);
+EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
+  return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
 }
-#endif
+
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
+
 template <>
-EIGEN_STRONG_INLINE Packet4d
-pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) {
-  return _mm256_div_pd(a, b);
+EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
+  Packet8f fexponent;
+  const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet8i
-pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) {
-  eigen_assert(false && "packet integer division are not supported by AVX");
-  return pset1<Packet8i>(0);
-}
-
-// Hyperbolic Tangent function.
-// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
-// fl(tanh(x)) = +/-1.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-ptanh<Packet8f>(const Packet8f& _x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is +/-1.0f in single-precision.
-  _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
-  const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
-
-  // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
-
-  // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
-
-  // Since the polynomials are odd/even, we need x^2.
-  const Packet8f x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial p.
-  Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
-  p = pmadd(x2, p, p8f_alpha_9);
-  p = pmadd(x2, p, p8f_alpha_7);
-  p = pmadd(x2, p, p8f_alpha_5);
-  p = pmadd(x2, p, p8f_alpha_3);
-  p = pmadd(x2, p, p8f_alpha_1);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial p.
-  Packet8f q = pmadd(x2, p8f_beta_6, p8f_beta_4);
-  q = pmadd(x2, q, p8f_beta_2);
-  q = pmadd(x2, q, p8f_beta_0);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
-}
-
-// Identical to the ptanh in GenericPacketMath.h, but for doubles use
-// a small/medium approximation threshold of 0.001.
-template<> EIGEN_STRONG_INLINE Packet4d ptanh_approx_threshold() {
-  return pset1<Packet4d>(0.001);
+EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
+  return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 
 }  // end namespace internal

diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 43d4eed..7fc32fd 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h

@@ -10,12 +10,6 @@
 #ifndef EIGEN_PACKET_MATH_AVX_H
 #define EIGEN_PACKET_MATH_AVX_H
 
-#include <stdio.h>
-
-#if defined(HALF_PRECISION_BF16) || defined(HALF_PRECISION_FP16) || defined(CUSTOM_NUMERICS)
-#include "PacketMathGoogle.h"
-#endif  // HALF_PRECISION_BF16 || HALF_PRECISION_FP16 || defined(CUSTOM_NUMERICS)
-
 namespace Eigen {
 
 namespace internal {
@@ -24,11 +18,11 @@
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
@@ -37,23 +31,27 @@
 typedef __m256  Packet8f;
 typedef __m256i Packet8i;
 typedef __m256d Packet4d;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;
 
 template<> struct is_arithmetic<__m256>  { enum { value = true }; };
 template<> struct is_arithmetic<__m256i> { enum { value = true }; };
 template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+template<> struct is_arithmetic<Packet8bf> { enum { value = true }; };
 
 #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
   const Packet8f p8f_##NAME = pset1<Packet8f>(X)
 
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+
 #define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
-  const Packet8f p8f_##NAME = (__m256)pset1<Packet8i>(X)
+  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
   const Packet8i p8i_##NAME = pset1<Packet8i>(X)
 
-#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
-  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
-
 // Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
 // to leverage AVX512 instructions.
 #ifndef EIGEN_VECTORIZE_AVX512
@@ -64,22 +62,30 @@
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=8,
+    size = 8,
     HasHalfPacket = 1,
 
-    HasDiv  = 1,
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = 0,
-    HasTanH = 1,
-    HasBlend = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasSelect = 1,
-    HasEq = 1
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
- };
+};
 template<> struct packet_traits<double> : default_packet_traits
 {
   typedef Packet4d type;
@@ -87,19 +93,114 @@
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
+    size=4,
     HasHalfPacket = 1,
 
-    HasDiv = 1,
+    HasCmp  = 1,
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
     HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasCmp    = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSin    = EIGEN_FAST_MATH,
+    HasCos    = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog    = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasTanh   = EIGEN_FAST_MATH,
+    HasErf    = EIGEN_FAST_MATH,
+    HasBlend  = 0,
+    HasRound  = 1,
+    HasFloor  = 1,
+    HasCeil   = 1,
+    HasRint   = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  // There is no half-size packet for current Packet8bf.
+  // TODO: support as SSE path.
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasSelect = 1,
-    HasEq = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
   };
 };
 #endif
+
+template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+
 /* Proper support for integers is only provided by AVX2. In the meantime, we'll
    use SSE instructions and packets to deal with integers.
 template<> struct packet_traits<int>    : default_packet_traits
@@ -113,38 +214,74 @@
 };
 */
 
-template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
-template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8}; };
+template<> struct unpacket_traits<Packet8f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet8i  integer_packet;
+  typedef uint8_t   mask_t;
+  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true};
+};
+template<> struct unpacket_traits<Packet4d> {
+  typedef double type;
+  typedef Packet2d half;
+  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; };
+template<> struct unpacket_traits<Packet8bf> { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; };
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 16x16 to 8x16.
+EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
+  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
+                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
+}
+
 
 template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
 
+template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }
+
 template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
 
-#ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
-#endif
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+
 template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_add_epi32(a,b);
+#else
+  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f ple<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_NGT_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d ple<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_NGT_UQ); }
-
-template<> EIGEN_STRONG_INLINE Packet8f plt<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d plt<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_NGE_UQ); }
-
-template<> EIGEN_STRONG_INLINE Packet8f peq<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d peq<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_UQ); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& a, const Packet8f& b, const Packet8f& false_mask) { return _mm256_blendv_ps(a,b,false_mask); }
-template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& a, const Packet4d& b, const Packet4d& false_mask) { return _mm256_blendv_pd(a,b,false_mask); }
+template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_sub_epi32(a,b);
+#else
+  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
 {
@@ -159,38 +296,45 @@
 template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if defined(HALF_PRECISION_BF16) || defined(HALF_PRECISION_FP16) || defined(CUSTOM_NUMERICS)
-  return pmul_custom(a,b);
-#else
-  return _mm256_mul_ps(a,b);
-#endif  // HALF_PRECISION_BF16 || HALF_PRECISION_FP16 || defined(CUSTOM_NUMERICS)
-}
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_mullo_epi32(a,b);
+#else
+  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
-#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if EIGEN_GNUC_AT_MOST(4, 8) || EIGEN_COMP_CLANG
-  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
-  // and gcc stupidly generates a vfmadd132ps instruction,
-  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
-  // the result of the product. the issue has been fixed in gcc 4.9
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+  // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  //  and even register spilling with clang>=6.0 (bug 1637).
+  // Gcc stupidly generates a vfmadd132ps instruction.
+  // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+  //  case is to accumulate the result of the product.
   Packet8f res = c;
-  asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
   return res;
 #else
-#if defined(HALF_PRECISION_BF16) || defined(HALF_PRECISION_FP16) || defined(CUSTOM_NUMERICS)
-  return pmadd_custom(a, b, c);
-#else
   return _mm256_fmadd_ps(a,b,c);
-#endif  // HALF_PRECISION_BF16 || HALF_PRECISION_FP16 || defined(CUSTOM_NUMERICS)
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if EIGEN_GNUC_AT_MOST(4, 8) || EIGEN_COMP_CLANG
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
   // see above
   Packet4d res = c;
-  asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
   return res;
 #else
   return _mm256_fmadd_pd(a,b,c);
@@ -198,23 +342,237 @@
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
 
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cmpeq_epi32(a,b);
+#else
+  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may flip
+  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet8f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_ps(b,a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet4d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_pd(b,a);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet8f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_ps(b,a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet4d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_pd(b,a);
+#endif
+}
+
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4d>);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  return _mm256_cmpeq_epi32(a,a);
+#else
+  const __m256 b = _mm256_castsi256_ps(a);
+  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  const __m256i b = _mm256_castps_si256(a);
+  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqq has lower latency than the more general vcmppd
+  const __m256i b = _mm256_castpd_si256(a);
+  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_and_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_or_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_xor_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_andnot_si256(b,a);
+#else
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
+{
+  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
+{
+  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
+{ return _mm256_blendv_ps(b,a,mask); }
+template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
+{ return _mm256_blendv_pd(b,a,mask); }
+
+template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srai_epi32(a, N);
+#else
+  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srli_epi32(a, N);
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(a, N);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
@@ -224,17 +582,23 @@
 template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
 
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
+}
+
 // Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
 template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 {
   // TODO try to find a way to avoid the need of a temporary register
-  //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
-  //   tmp = _mm256_insertf128_ps(tmp,
-  //   _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)),
-  //   1);
-  //   return _mm256_unpacklo_ps(tmp,tmp);
+//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
+//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+//   return _mm256_unpacklo_ps(tmp,tmp);
 
-  //  _mm256_insertf128_ps is very slow on Haswell, thus:
+  // _mm256_insertf128_ps is very slow on Haswell, thus:
   Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
   tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
@@ -263,34 +627,27 @@
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
 
-// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
-template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
-  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
+}
 
-  return _mm256_i32gather_ps(from, indices, 4);
-#else
+// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
+{
   return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                        from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-#endif
 }
-template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
 {
-#ifdef EIGEN_VECTORIZE_AVX2
-  Packet4i stride_vector = _mm_set1_epi32(stride);
-  Packet4i stride_multiplier = _mm_set_epi32(3, 2, 1, 0);
-  Packet4i indices = _mm_mullo_epi32(stride_vector, stride_multiplier);
-
-  return _mm256_i32gather_pd(from, indices, 8);
-#else
   return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-#endif
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
 {
   __m128 low = _mm256_extractf128_ps(from, 0);
   to[stride*0] = _mm_cvtss_f32(low);
@@ -304,7 +661,7 @@
   to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
   to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
 {
   __m128d low = _mm256_extractf128_pd(from, 0);
   to[stride*0] = _mm_cvtsd_f64(low);
@@ -331,9 +688,9 @@
 }
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif
 
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
@@ -356,9 +713,12 @@
 {
    __m256d tmp = _mm256_shuffle_pd(a,a,5);
   return _mm256_permute2f128_pd(tmp, tmp, 1);
-
+  #if 0
+  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
+  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
   __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
     return _mm256_permute_pd(swap_halves,5);
+  #endif
 }
 
 // pabs should be ok
@@ -373,62 +733,78 @@
   return _mm256_and_pd(a,mask);
 }
 
-// preduxp should be ok
-// FIXME: why is this ok? why isn't the simply implementation working as expected?
-template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
-{
-    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
-    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
-    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
-    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
-
-    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
-    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
-    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
-    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
-
-    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
-    return final;
+template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+  return pfrexp_generic(a,exponent);
 }
-template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
-{
- Packet4d tmp0, tmp1;
 
-  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+// Extract exponent without existence of Packet4l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
+  const Packet4d cst_exp_mask  = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
+#ifdef EIGEN_VECTORIZE_AVX2
+  a_expo = _mm256_srli_epi64(a_expo, 52);
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+#else
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+  lo = _mm_srli_epi64(lo, 52);
+  hi = _mm_srli_epi64(hi, 52);
+#endif
+  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
+  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
+  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
+  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
+  return exponent;
+}
 
-  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
 
-  return _mm256_blend_pd(tmp0, tmp1, 0xC);
+template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet4i bias = pset1<Packet4i>(1023);
+  Packet4i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  Packet4d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  out = pmul(out, c); // a * 2^e
+  return out;
 }
 
 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
 {
-  Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
-  tmp0 = _mm256_hadd_ps(tmp0,tmp0);
-  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
 }
 template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
 {
-  Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
-  return pfirst(_mm256_hadd_pd(tmp0,tmp0));
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
+template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
 {
   return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
 }
@@ -472,91 +848,18 @@
   return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+//   return _mm256_movemask_ps(x)==0xFF;
+// }
 
-template<int Offset>
-struct palign_impl<Offset,Packet8f>
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
 {
-  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm256_blend_ps(first, second, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
-      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
-    }
-    else if (Offset==2)
-    {
-      first = _mm256_blend_ps(first, second, 3);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
-      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
-    }
-    else if (Offset==3)
-    {
-      first = _mm256_blend_ps(first, second, 7);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
-      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
-    }
-    else if (Offset==4)
-    {
-      first = _mm256_blend_ps(first, second, 15);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
-      first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
-    }
-    else if (Offset==5)
-    {
-      first = _mm256_blend_ps(first, second, 31);
-      first = _mm256_permute2f128_ps(first, first, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
-      first = _mm256_permute2f128_ps(tmp, tmp, 1);
-      first = _mm256_blend_ps(tmp, first, 0x88);
-    }
-    else if (Offset==6)
-    {
-      first = _mm256_blend_ps(first, second, 63);
-      first = _mm256_permute2f128_ps(first, first, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
-      first = _mm256_permute2f128_ps(tmp, tmp, 1);
-      first = _mm256_blend_ps(tmp, first, 0xcc);
-    }
-    else if (Offset==7)
-    {
-      first = _mm256_blend_ps(first, second, 127);
-      first = _mm256_permute2f128_ps(first, first, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
-      first = _mm256_permute2f128_ps(tmp, tmp, 1);
-      first = _mm256_blend_ps(tmp, first, 0xee);
-    }
-  }
-};
+  return _mm256_movemask_ps(x)!=0;
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet4d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm256_blend_pd(first, second, 1);
-      __m256d tmp = _mm256_permute_pd(first, 5);
-      first = _mm256_permute2f128_pd(tmp, tmp, 1);
-      first = _mm256_blend_pd(tmp, first, 0xA);
-    }
-    else if (Offset==2)
-    {
-      first = _mm256_blend_pd(first, second, 3);
-      first = _mm256_permute2f128_pd(first, first, 1);
-    }
-    else if (Offset==3)
-    {
-      first = _mm256_blend_pd(first, second, 7);
-      __m256d tmp = _mm256_permute_pd(first, 5);
-      first = _mm256_permute2f128_pd(tmp, tmp, 1);
-      first = _mm256_blend_pd(tmp, first, 5);
-    }
-  }
-};
-
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8f,8>& kernel) {
   __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
@@ -584,7 +887,7 @@
   kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8f,4>& kernel) {
   __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
@@ -602,7 +905,7 @@
   kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4d,4>& kernel) {
   __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
   __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
@@ -628,51 +931,641 @@
   return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
 }
 
-// Functions to print vectors of different types, makes debugging much easier.
-namespace{
-void print4f(char* name, __m128 val) {
-  float temp[4] __attribute__((aligned(32)));
-  _mm_store_ps(temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 4; k++) printf("%.8e ", temp[k]);
-  printf("\n");
+// Packet math for Eigen::half
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
+
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
 }
-void print8f(char* name, __m256 val) {
-  float temp[8] __attribute__((aligned(32)));
-  _mm256_store_ps(temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 8; k++) printf("%.8e ", temp[k]);
-  printf("\n");
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
 }
-void print4i(char* name, __m128i val) {
-  int temp[4] __attribute__((aligned(32)));
-  _mm_store_si128((__m128i*)temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 4; k++) printf("%i ", temp[k]);
-  printf("\n");
+
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
 }
-void print8i(char* name, __m256i val) {
-  int temp[8] __attribute__((aligned(32)));
-  _mm256_store_si256((__m256i*)temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 8; k++) printf("%i ", temp[k]);
-  printf("\n");
+
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
-void print8b(char* name, __m256i val) {
-  int temp[8] __attribute__((aligned(32)));
-  _mm256_store_si256((__m256i*)temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 8; k++) printf("0x%08x ", temp[k]);
-  printf("\n");
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
 }
-void print4d(char* name, __m256d val) {
-  double temp[4] __attribute__((aligned(32)));
-  _mm256_store_pd(temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 4; k++) printf("%.16e ", temp[k]);
-  printf("\n");
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
 }
-};
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half*  from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploadquad<Packet8h>(const Eigen::half* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a);
+#else
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+
+  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+#else
+  EIGEN_ALIGN32 float aux[8];
+  pstore(aux, a);
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[0]));
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[1]));
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[2]));
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[3]));
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[4]));
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[5]));
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[6]));
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[7]));
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
+  return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
+  return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return float2half(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so we either need to
+  // cast to Packet4i to directly call the intrinsics as below:
+  return _mm_or_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+  return _mm_xor_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+  return _mm_and_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+  return _mm_andnot_si128(b,a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  return float2half(pround<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return float2half(print<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return float2half(pceil<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return float2half(pfloor<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+{
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
+{
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_max<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_min<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_mul<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm_shuffle_epi8(a,m);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = a0b0c0d0e0f0g0h0;
+  kernel.packet[1] = a1b1c1d1e1f1g1h1;
+  kernel.packet[2] = a2b2c2d2e2f2g2h2;
+  kernel.packet[3] = a3b3c3d3e3f3g3h3;
+  kernel.packet[4] = a4b4c4d4e4f4g4h4;
+  kernel.packet[5] = a5b5c5d5e5f5g5h5;
+  kernel.packet[6] = a6b6c6d6e6f6g6h6;
+  kernel.packet[7] = a7b7c7d7e7f7g7h7;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+// BFloat16 implementation.
+
+EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i extend = _mm256_cvtepu16_epi32(a);
+  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
+#else
+  __m128i lo = _mm_cvtepu16_epi32(a);
+  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
+  __m128i lo_shift = _mm_slli_epi32(lo, 16);
+  __m128i hi_shift = _mm_slli_epi32(hi, 16);
+  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
+#endif
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
+  Packet8bf r;
+
+  __m256i input = _mm256_castps_si256(a);
+
+#ifdef EIGEN_VECTORIZE_AVX2
+  // uint32_t lsb = (input >> 16);
+  __m256i t = _mm256_srli_epi32(input, 16);
+  // uint32_t lsb = lsb & 1;
+  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm256_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm256_srli_epi32(t, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m256i nan = _mm256_set1_epi32(0x7fc0);
+  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
+                         _mm256_extractf128_si256(t, 1));
+#else
+  // uint32_t lsb = (input >> 16);
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
+  // uint32_t lsb = lsb & 1;
+  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
+  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
+  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
+  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
+  // input = input >> 16;
+  lo = _mm_srli_epi32(lo, 16);
+  hi = _mm_srli_epi32(hi, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m128i nan = _mm_set1_epi32(0x7fc0);
+  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
+  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(lo, hi);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
+  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf
+ploaddup<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf
+ploadquad<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a,
+                                                const Packet8bf& b) {
+  return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a,
+                                                const Packet8bf& b) {
+  return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_or_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_xor_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_and_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_andnot_si128(b,a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a)
+{
+  return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
+  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+{
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+{
+  EIGEN_ALIGN32 bfloat16 aux[8];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm_shuffle_epi8(a,m);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+
+  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
+  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
+  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
+  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
+
+  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
+  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
+  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
+  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
+}
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 83bfdc6..d507fb6 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h

@@ -35,15 +35,79 @@
 };
 
 
+#ifndef EIGEN_VECTORIZE_AVX512
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<bfloat16, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<float, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+#endif  // EIGEN_VECTORIZE_AVX512
 
 template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
-  return _mm256_cvtps_epi32(a);
+  return _mm256_cvttps_epi32(a);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
   return _mm256_cvtepi32_ps(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+  return _mm256_castps_si256(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+  return _mm256_castsi256_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+  return Bf16ToF32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
+  return F32ToBf16(a);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
new file mode 100644
index 0000000..0742538
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/Complex.h

@@ -0,0 +1,424 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX512_H
+#define EIGEN_COMPLEX_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet8cf
+{
+  EIGEN_STRONG_INLINE Packet8cf() {}
+  EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
+  __m512  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet8cf type;
+  typedef Packet4cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet8cf> {
+  typedef std::complex<float> type;
+  typedef Packet4cf half;
+  typedef Packet16f as_real;
+  enum {
+    size = 8,
+    alignment=unpacket_traits<Packet16f>::alignment,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
+{
+  return Packet8cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
+{
+  const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
+    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
+    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet8cf(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+  __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pand   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf por    <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pxor   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+  __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+  return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
+{
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
+{
+  return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
+}
+template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
+{
+  return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
+{
+  pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet8cf>(const Packet8cf& a)
+{
+  return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+  return Packet8cf(_mm512_castsi512_ps(
+            _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
+                                      _mm512_castps_si512(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
+{
+  return predux(padd(Packet4cf(extract256<0>(a.v)),
+                     Packet4cf(extract256<1>(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
+{
+  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
+                         Packet4cf(extract256<1>(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
+  __m256 lane0 = extract256<0>(a.v);
+  __m256 lane1 = extract256<1>(a.v);
+  __m256 res = _mm256_add_ps(lane0, lane1);
+  return Packet4cf(res);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
+
+template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+  Packet8cf num = pmul(a, pconj(b));
+  __m512 tmp = _mm512_mul_ps(b.v, b.v);
+  __m512 tmp2    = _mm512_shuffle_ps(tmp,tmp,0xB1);
+  __m512 denom = _mm512_add_ps(tmp, tmp2);
+  return Packet8cf(_mm512_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
+{
+  return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet4cd
+{
+  EIGEN_STRONG_INLINE Packet4cd() {}
+  EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
+  __m512d  v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet4cd type;
+  typedef Packet2cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cd> {
+  typedef std::complex<double> type;
+  typedef Packet2cd half;
+  typedef Packet8d as_real;
+  enum {
+    size = 4,
+    alignment = unpacket_traits<Packet8d>::alignment,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
+{
+  const __m512d mask = _mm512_castsi512_pd(
+          _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
+                           0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+  return Packet4cd(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+  __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
+  __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
+  __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
+  __m512d odd  = _mm512_mul_pd(tmp2, tmp3);
+  return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+  __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+  return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
+{
+  #ifdef EIGEN_VECTORIZE_AVX512DQ
+  return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
+  #else
+  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+  return Packet4cd(_mm512_insertf64x4(
+          _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
+{
+  return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
+            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
+            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
+{
+  __m512i fromi = _mm512_castpd_si512(from.v);
+  double* tod = (double*)(void*)to;
+  _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
+  _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
+  _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
+  _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
+{
+  __m128d low = extract128<0>(a.v);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
+{
+  return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+                     Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
+{
+  return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+                         Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
+
+template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+  Packet4cd num = pmul(a, pconj(b));
+  __m512d tmp = _mm512_mul_pd(b.v, b.v);
+  __m512d denom =  padd(_mm512_permute_pd(tmp,0x55), tmp);
+  return Packet4cd(_mm512_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
+{
+  return Packet4cd(_mm512_permute_pd(x.v,0x55));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,4>& kernel) {
+  PacketBlock<Packet8d,4> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,8>& kernel) {
+  PacketBlock<Packet8d,8> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+  pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+  pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+  pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+  kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+  kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+  kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+  kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cd,4>& kernel) {
+  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1]
+  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3]
+  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1]
+  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3]
+
+  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3]
+  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2]
+  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1]
+  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+  return psqrt_complex<Packet4cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+  return psqrt_complex<Packet8cf>(a);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX512_H

diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index d06a204..6fd726d 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,162 +10,61 @@
 #ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
 #define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
 
-// This seems to be missing in some headers, adding it here if this is the case.
-#ifndef _mm512_castsi512_ps
-#define _mm512_castsi512_ps(x) ((__m512)x)
-#endif
-#ifndef _mm512_castsi512_pd
-#define _mm512_castsi512_pd(x) ((__m512d)x)
-#endif
-
 namespace Eigen {
 
 namespace internal {
 
-// Hyperbolic Tangent function.
-// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
-// fl(tanh(x)) = +/-1.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
-ptanh<Packet16f>(const Packet16f& _x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is +/-1.0f in single-precision.
-  _EIGEN_DECLARE_CONST_Packet16f(plus_9, 9.0f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_9, -9.0f);
-  const Packet16f x = pmax(p16f_minus_9, pmin(p16f_plus_9, _x));
+// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
+#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG  || EIGEN_COMP_MSVC >= 1923
 
-  // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_1, 4.89352455891786e-03f);
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_3, 6.37261928875436e-04f);
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_5, 1.48572235717979e-05f);
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_7, 5.12229709037114e-08f);
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_9, -8.60467152213735e-11f);
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_11, 2.00018790482477e-13f);
-  _EIGEN_DECLARE_CONST_Packet16f(alpha_13, -2.76076847742355e-16f);
+#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
+  const Packet16f p16f_##NAME = pset1<Packet16f>(X)
 
-  // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet16f(beta_0, 4.89352518554385e-03f);
-  _EIGEN_DECLARE_CONST_Packet16f(beta_2, 2.26843463243900e-03f);
-  _EIGEN_DECLARE_CONST_Packet16f(beta_4, 1.18534705686654e-04f);
-  _EIGEN_DECLARE_CONST_Packet16f(beta_6, 1.19825839466702e-06f);
+#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
+  const Packet16f p16f_##NAME =  preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
 
-  // Since the polynomials are odd/even, we need x^2.
-  const Packet16f x2 = pmul(x, x);
+#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
+  const Packet8d p8d_##NAME = pset1<Packet8d>(X)
 
-  // Evaluate the numerator polynomial p.
-  Packet16f p = pmadd(x2, p16f_alpha_13, p16f_alpha_11);
-  p = pmadd(x2, p, p16f_alpha_9);
-  p = pmadd(x2, p, p16f_alpha_7);
-  p = pmadd(x2, p, p16f_alpha_5);
-  p = pmadd(x2, p, p16f_alpha_3);
-  p = pmadd(x2, p, p16f_alpha_1);
-  p = pmul(x, p);
+#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
+  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
 
-  // Evaluate the denominator polynomial p.
-  Packet16f q = pmadd(x2, p16f_beta_6, p16f_beta_4);
-  q = pmadd(x2, q, p16f_beta_2);
-  q = pmadd(x2, q, p16f_beta_0);
+#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \
+  const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X)
 
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
-}
+#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
+  const Packet16bf p16bf_##NAME =  preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
 
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 plog<Packet16f>(const Packet16f& _x) {
-  Packet16f x = _x;
-  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
-
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  // The smallest non denormalized float number.
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
-
-  // Polynomial coefficients.
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
-
-  // invalid_mask is set to true when x is NaN
-  __mmask16 invalid_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
-  __mmask16 iszero_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
-
-  // Truncate input values to the minimum positive normal.
-  x = pmax(x, p16f_min_norm_pos);
-
-  // Extract the shifted exponents.
-  Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
-  Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
-
-  // Set the exponents to -1, i.e. x are in the range [0.5,1). The casting
-  // back and forth is because _mm512_and/or_ps is not available on avx512f.
-  x = (__m512)_mm512_and_si512((__m512i)x, (__m512i)p16f_inv_mant_mask);
-  x = (__m512)_mm512_or_si512((__m512i)x, (__m512i)p16f_half);
-
-  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
-  x = psub(x, p16f_1);
-  e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
-  x = padd(x, tmp);
-
-  Packet16f x2 = pmul(x, x);
-  Packet16f x3 = pmul(x2, x);
-
-  // Evaluate the polynomial approximant of degree 8 in three parts, probably
-  // to improve instruction-level parallelism.
-  Packet16f y, y1, y2;
-  y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
-  y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
-  y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
-  y = pmadd(y, x, p16f_cephes_log_p2);
-  y1 = pmadd(y1, x, p16f_cephes_log_p5);
-  y2 = pmadd(y2, x, p16f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
-  y1 = pmul(e, p16f_cephes_log_q1);
-  tmp = pmul(x2, p16f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p16f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm512_mask_blend_ps(iszero_mask,
-                              _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
-                              p16f_minus_inf);
+  return plog_float(_x);
 }
 
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog<Packet8d>(const Packet8d& _x) {
+  return plog_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+plog2<Packet16f>(const Packet16f& _x) {
+  return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog2<Packet8d>(const Packet8d& _x) {
+  return plog2_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
+
 // Exponential function. Works by writing "x = m*log(2) + r" where
 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
@@ -200,17 +99,17 @@
   _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
   Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
   Packet16f r2 = pmul(r, r);
+  Packet16f r3 = pmul(r2, r);
 
-  // TODO(gonnet): Split into odd/even polynomials and try to exploit
-  //               instruction-level parallelism.
-  Packet16f y = p16f_cephes_exp_p0;
-  y = pmadd(y, r, p16f_cephes_exp_p1);
-  y = pmadd(y, r, p16f_cephes_exp_p2);
-  y = pmadd(y, r, p16f_cephes_exp_p3);
-  y = pmadd(y, r, p16f_cephes_exp_p4);
-  y = pmadd(y, r, p16f_cephes_exp_p5);
-  y = pmadd(y, r2, r);
-  y = padd(y, p16f_1);
+  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
+  Packet16f y, y1, y2;
+  y  = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
+  y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
+  y2 = padd(r, p16f_1);
+  y  = pmadd(y, r, p16f_cephes_exp_p2);
+  y1 = pmadd(y1, r, p16f_cephes_exp_p5);
+  y  = pmadd(y, r3, y1);
+  y  = pmadd(y, r2, y2);
 
   // Build emm0 = 2^m.
   Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
@@ -223,75 +122,36 @@
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 pexp<Packet8d>(const Packet8d& _x) {
-  Packet8d x = _x;
+  return pexp_double(_x);
+}
 
-  _EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
-  _EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
 
-  _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
-  _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
+  Packet16f fexponent;
+  const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
 
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
+  return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
+}
 
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
+  Packet16f fexponent;
+  const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
 
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-  // clamp x
-  x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
-
-  // Express exp(x) as exp(g + n*log(2)).
-  // TODO(gonnet): Still totaly *not* convinced that roundscale is the best
-  //               option here.
-  const Packet8d n = _mm512_roundscale_round_pd(
-      _mm512_mul_pd(p8d_cephes_LOG2EF, x), 0,
-      (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
-
-  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
-  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
-  // digits right.
-  const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
-  const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
-  x = psub(x, nC1);
-  x = psub(x, nC2);
-
-  const Packet8d x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet8d px = p8d_cephes_exp_p0;
-  px = pmadd(px, x2, p8d_cephes_exp_p1);
-  px = pmadd(px, x2, p8d_cephes_exp_p2);
-  px = pmul(px, x);
-
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet8d qx = p8d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p8d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q3);
-
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = _mm512_div_pd(px, psub(qx, px));
-  x = pmadd(p8d_2, x, p8d_1);
-
-  // Build e=2^n.
-  const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
-      _mm512_add_epi64(_mm512_cvtepi32_epi64(_mm512_cvtpd_epi32(n)),
-                       _mm512_set1_epi64(1023)),
-      52));
-
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pmul(x, e), _x);
+template <>
+EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
+  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 
 // Functions for sqrt.
@@ -300,135 +160,201 @@
 // exact solution. The main advantage of this approach is not just speed, but
 // also the fact that it can be inlined and pipelined with other computations,
 // further reducing its effective latency.
-// TODO(gonnet): Is this really faster than just _mm512_rsqrt28_ps?
 #if EIGEN_FAST_MATH
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
 
-  // Remeber which entries were zero (or almost).
-  __mmask16 is_zero = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_NGE_UQ) &
-      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ);
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  Packet16f x =  _mm512_rsqrt14_ps(_x);
+  Packet16f x = _mm512_rsqrt14_ps(_x);
 
   // Do a single step of Newton's iteration.
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return _mm512_mask_blend_ps(is_zero, pmul(_x, x), _mm512_setzero_ps());
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
 }
 
-// TODO(gonnet): What's faster? Two steps starting from the 14-bit
-//               approximation, or a single step starting from 28 bits?
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
 
-  // Remeber which entries were zero (or almost).
-  __mmask8 is_zero = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_NGE_UQ) &
-      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ);
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
   Packet8d x = _mm512_rsqrt14_pd(_x);
 
-  // Do a first step of Newton's iteration.
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
 
   // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return _mm512_mask_blend_pd(is_zero, pmul(_x, x), _mm512_setzero_pd());
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
 }
 #else
 template <>
 EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
   return _mm512_sqrt_ps(x);
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
   return _mm512_sqrt_pd(x);
 }
 #endif
 
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version for doubles since there is no instruction for diretly
-// computing the reciprocal square root in AVX-512.
-#ifdef EIGEN_FAST_MATH
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
+
+// prsqrt for float.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_rsqrt28_ps(x);
+}
+#elif EIGEN_FAST_MATH
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 prsqrt<Packet16f>(const Packet16f& _x) {
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
 
-  // Remeber which entries were zero (or almost).
-  __mmask16 is_zero = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_NGE_UQ) &
-      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ);
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs).
-  Packet16f x = _mm512_rsqrt14_ps(_x);
-
-  // Do a single step of Newton's iteration.
   Packet16f neg_half = pmul(_x, p16f_minus_half);
-  return _mm512_mask_blend_ps(
-      is_zero, pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)),
-      p16f_inf);
-}
 
-// TODO(gonnet): As for psqrt, is it perhaps better to start with the 28-bit
-//               approximation and do a single step?
+  // Identity infinite, negative and denormal arguments.
+  __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
+  __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
+  __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+  // for denormals for consistency with AVX and SSE implementations.
+  Packet16f y_approx = _mm512_rsqrt14_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive finite arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
+}
+#else
+
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
+  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
+}
+#endif
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
+
+// prsqrt for double.
+#if EIGEN_FAST_MATH
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 prsqrt<Packet8d>(const Packet8d& _x) {
   _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
   _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
   _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
 
-  // Remeber which entries were zero (or almost).
-  __mmask8 is_zero = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_NGE_UQ) &
-      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ);
-
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  Packet8d x = _mm512_rsqrt14_pd(_x);
-
-  // Do a first step of Newton's iteration.
   Packet8d neg_half = pmul(_x, p8d_minus_half);
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
 
-  // Do a second step of Newton's iteration.
-  return _mm512_mask_blend_pd(
-      is_zero, pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)),
-      p8d_inf);
+  // Identity infinite, negative and denormal arguments.
+  __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
+  __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
+  __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+  // for denormals for consistency with AVX and SSE implementations.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+  Packet8d y_approx = _mm512_rsqrt28_pd(_x);
+#else
+  Packet8d y_approx = _mm512_rsqrt14_pd(_x);
+#endif
+  // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
+  // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
+  // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
+  // of correct digits for each step.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
+#if !defined(EIGEN_VECTORIZE_AVX512ER)
+  y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
+#endif
+  // Select the result of the Newton-Raphson step for positive finite arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+  return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
 }
 #else
 template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
-  return _mm512_rsqrt28_ps(x);
+EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
+  _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
+  return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
 }
 #endif
 
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16f plog1p<Packet16f>(const Packet16f& _x) {
+  return generic_plog1p(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
+  return generic_expm1(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
+
+#endif
+
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psin<Packet16f>(const Packet16f& _x) {
+  return psin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pcos<Packet16f>(const Packet16f& _x) {
+  return pcos_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+ptanh<Packet16f>(const Packet16f& _x) {
+  return internal::generic_fast_tanh_float(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+
 }  // end namespace internal
 
 }  // end namespace Eigen

diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index a468d9b..34d49ab 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -19,10 +19,10 @@
 #endif
 
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
@@ -31,6 +31,8 @@
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
 
 template <>
 struct is_arithmetic<__m512> {
@@ -45,23 +47,52 @@
   enum { value = true };
 };
 
-#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
-  const Packet16f p16f_##NAME = pset1<Packet16f>(X)
+template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
 
-#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
-  const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet16h type;
+  // There is no half-size packet for Packet16h.
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
 
-#define _EIGEN_DECLARE_CONST_Packet16i(NAME, X) \
-  const Packet16i p16i_##NAME = pset1<Packet16i>(X)
+    HasCmp    = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog    = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasSin    = EIGEN_FAST_MATH,
+    HasCos    = EIGEN_FAST_MATH,
+    HasTanh   = EIGEN_FAST_MATH,
+    HasErf    = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound  = 1,
+    HasFloor  = 1,
+    HasCeil   = 1,
+    HasRint   = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
 
-#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
-  const Packet8d p8d_##NAME = pset1<Packet8d>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
-  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
-
-template<>
-struct packet_traits<float>  : default_packet_traits
+template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet16f type;
   typedef Packet8f half;
@@ -70,19 +101,36 @@
     AlignedOnScalar = 1,
     size = 16,
     HasHalfPacket = 1,
+
+    HasAbs = 1,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasBlend = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
     HasLog = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasNdtri = 1,
+    HasBessel  = 1,
     HasExp = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasCmp  = 1,
     HasDiv = 1,
-    HasBlend = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasSelect = 1,
-    HasEq = 1,
-    HasTanH = 1
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
  };
-template<>
-struct packet_traits<double> : default_packet_traits {
+template<> struct packet_traits<double> : default_packet_traits
+{
   typedef Packet8d type;
   typedef Packet4d half;
   enum {
@@ -90,44 +138,59 @@
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+    HasLog  = 1,
     HasExp = 1,
-    HasDiv = 1,
-    HasBlend = 1,
-    HasSqrt = 1,
+    HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
-    HasSelect = 1,
-    HasEq = 1,
+#endif
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
 };
 
-template<>
-struct packet_traits<int> : default_packet_traits {
+/* TODO Implement AVX512 for integers
+template<> struct packet_traits<int>    : default_packet_traits
+{
   typedef Packet16i type;
-  typedef Packet8i half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 16,
+    size=8
   };
 };
+*/
 
 template <>
 struct unpacket_traits<Packet16f> {
   typedef float type;
   typedef Packet8f half;
-  enum { size = 16 };
+  typedef Packet16i integer_packet;
+  typedef uint16_t mask_t;
+  enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true };
 };
 template <>
 struct unpacket_traits<Packet8d> {
   typedef double type;
   typedef Packet4d half;
-  enum { size = 8 };
+  enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
 };
 template <>
 struct unpacket_traits<Packet16i> {
   typedef int type;
   typedef Packet8i half;
-  enum { size = 16 };
+  enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false };
+};
+
+template<>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
 };
 
 template <>
@@ -144,41 +207,54 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
+  return _mm512_castsi512_ps(_mm512_set1_epi32(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from) {
+  return _mm512_castsi512_pd(_mm512_set1_epi64(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
+
+template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+  return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                                              0, -1, 0, -1, 0, -1, 0, -1));
+}
+template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+  return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                          0, -1, 0, -1, 0, -1, 0, -1);
+}
+template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+  return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
+                                              0, 0, -1, -1, 0, 0, -1, -1));
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
   return _mm512_broadcastss_ps(_mm_load_ps1(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
-  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
+  return _mm512_set1_pd(*from);
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet16f plset<float>(const float& a) {
+EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
   return _mm512_add_ps(
       _mm512_set1_ps(a),
       _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
                     4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d plset<double>(const double& a) {
+EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
   return _mm512_add_pd(_mm512_set1_pd(a),
                        _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
 }
-template <>
-EIGEN_STRONG_INLINE Packet16i plset<int>(const int& a) {
-  return _mm512_add_epi32(
-      _mm512_set1_epi32(a),
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
-                       4, 3, 2, 1, 0));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
-    return _mm512_cvtps_epi32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
-  return _mm512_add_epi32(a, b);
-}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
@@ -190,10 +266,11 @@
   return _mm512_add_pd(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
                                               const Packet16i& b) {
-  return _mm512_sub_epi32(a, b);
+  return _mm512_add_epi32(a, b);
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
@@ -204,63 +281,21 @@
                                             const Packet8d& b) {
   return _mm512_sub_pd(a, b);
 }
-
-// TODO (nishantpatil) Convert between _mmask16 and Packet16f. Templates in
-// GenericPacketMath need to be modified?
-
-template<>
-EIGEN_STRONG_INLINE Packet16f
-ple<Packet16f>(const Packet16f& a, const Packet16f& b) {
-  return _mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ));
-}
-template<>
-EIGEN_STRONG_INLINE Packet8d
-ple<Packet8d>(const Packet8d& a, const Packet8d& b) {
-  return _mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ));
-}
-
-template<>
-EIGEN_STRONG_INLINE Packet16f
-plt<Packet16f>(const Packet16f& a, const Packet16f& b) {
-  return _mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ));
-}
-template<>
-EIGEN_STRONG_INLINE Packet8d plt<Packet8d>(const Packet8d& a,
-const Packet8d& b) { return _mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ));
-}
-
-template<>
-EIGEN_STRONG_INLINE Packet16f peq<Packet16f>(const Packet16f& a,
-const Packet16f& b) { return _mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ));
-}
-template<>
-EIGEN_STRONG_INLINE Packet8d peq<Packet8d>(const Packet8d& a,
-const Packet8d& b) { return _mm512_movm_epi64(_mm512_cmp_pd_mask(a, b, _CMP_EQ_UQ));
-}
-
-template<>
-EIGEN_STRONG_INLINE Packet16f pselect<Packet16f>(const Packet16f&
-a, const Packet16f& b, const Packet16f& false_mask) { return
-_mm512_mask_blend_ps(_mm512_movepi32_mask(false_mask), a, b);
-}
-template<>
-EIGEN_STRONG_INLINE Packet8d pselect<Packet8d>(const Packet8d& a,
-const Packet8d& b, const Packet8d& false_mask) { return
-_mm512_mask_blend_pd(_mm512_movepi32_mask(false_mask), a, b);
-}
-
 template <>
-EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
-  return _mm512_sub_epi32(_mm512_setzero_si512(), a);
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_sub_epi32(a, b);
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
-  return _mm512_sub_ps(_mm512_setzero_ps(), a);
+  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
-  return _mm512_sub_pd(_mm512_setzero_pd(), a);
+  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
   return a;
@@ -273,11 +308,7 @@
 EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
   return a;
 }
-template <>
-EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
-  return _mm512_mullo_epi32(a, b);
-}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
@@ -288,8 +319,24 @@
                                             const Packet8d& b) {
   return _mm512_mul_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_mullo_epi32(a, b);
+}
 
-#ifdef __FMA__
+template <>
+EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_div_pd(a, b);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
 template <>
 EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
                                     const Packet16f& c) {
@@ -303,59 +350,216 @@
 #endif
 
 template <>
-EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
-  return _mm512_min_epi32(a, b);
+EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
+                                           const Packet16f& a,
+                                           const Packet16f& b) {
+  __mmask16 mask16 = _mm512_cmp_epi32_mask(
+      _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_ps(mask16, a, b);
 }
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
+                                          const Packet8d& a,
+                                          const Packet8d& b) {
+  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
+                                         _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_pd(mask8, a, b);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
-  return _mm512_min_ps(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_ps(b, a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
-  return _mm512_min_pd(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_pd(b, a);
 }
-template <>
-EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
-  return _mm512_max_epi32(a, b);
-}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
-  return _mm512_max_ps(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_ps(b, a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
-  return _mm512_max_pd(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_pd(b, a);
 }
+
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8d>);
+}
+
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return  _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+                                                _mm256_castps_si256(b),1));
+}
+#endif
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 32x16 to 16x16.
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
+  // Split data into small pieces and handle with AVX instructions
+  // to guarantee internal order of vector.
+  // Operation:
+  //   dst[15:0]    := Saturate16(rf[31:0])
+  //   dst[31:16]   := Saturate16(rf[63:32])
+  //   ...
+  //   dst[255:240] := Saturate16(rf[255:224])
+  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
+  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
+                                      _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
+                                      _mm256_extractf128_si256(hi, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
+}
+
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+  return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+  return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+  return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_and_si512(a,b);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_and_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 template <>
@@ -371,143 +575,106 @@
 
   Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
   Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
 #endif
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
-                                             const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_or_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_or_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_or_pd(a, b);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_xor_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
+
 template <>
-EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_xor_pd(a, b);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
+  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+#endif
+}
 
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
+template <>
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_andnot_si512(b, a);
+}
 
-  return res;
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_ps(b, a);
+#else
+  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
-                                                 const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  return _mm512_andnot_ps(a, b);
+  return _mm512_andnot_pd(b, a);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }
-template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
-                                               const Packet8d& b) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  return _mm512_andnot_pd(a, b);
-#else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
 
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
-
-  return res;
-#endif
+template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
+{
+  // Work-around for default std::round rounding mode.
+  const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
+template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
+{
+  // Work-around for default std::round rounding mode.
+  const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+  return _mm512_srai_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+  return _mm512_srli_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+  return _mm512_slli_epi32(a, N);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -521,6 +688,7 @@
   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
       reinterpret_cast<const __m512i*>(from));
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
@@ -534,71 +702,70 @@
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
       reinterpret_cast<const __m512i*>(from));
 }
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
+}
+
 // Loads 8 floats from memory a returns the packet
 // {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
 template <>
 EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
-  __m256 low_half = _mm256_load_ps(from);
-  __m512 even_elements = _mm512_cvtepu32_epi64(low_half);
+  // an unaligned load is required here as there is no requirement
+  // on the alignment of input pointer 'from'
+  __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+  __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
   __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
   return pairs;
 }
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+// FIXME: this does not look optimal, better load a Packet4d and shuffle...
 // Loads 4 doubles from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3,
 // a3}
 template <>
 EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
-  __m512d x = _mm512_setzero_pd();
-  x = _mm512_insertf32x4(x, _mm_loaddup_pd(&from[0]), 0);
-  x = _mm512_insertf32x4(x, _mm_loaddup_pd(&from[1]), 1);
-  x = _mm512_insertf32x4(x, _mm_loaddup_pd(&from[2]), 2);
-  x = _mm512_insertf32x4(x, _mm_loaddup_pd(&from[3]), 3);
+ __m512d x = _mm512_setzero_pd();
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
   return x;
 }
-// Loads 8 ints from memory a returns the packet
-// {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
+#else
 template <>
-EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int* from) {
-  __m256i low_half = _mm256_load_si256((const __m256i*)from);
-  __m512 even_elements = _mm512_cvtepu32_epi64(low_half);
-  __m512i pairs = _mm512_shuffle_epi32(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
-  return pairs;
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+  __m512d x = _mm512_setzero_pd();
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+  return x;
 }
-// Loads 4 ints from memory a returns the packet
-// {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
-template <>
-EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
-  Packet16i tmp = _mm512_undefined_ps();
-  //= _mm512_castps128_ps512(_mm_load_ps1(from));
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1((const float*)(from)), 0);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1((const float*)(from + 1)), 1);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1((const float*)(from + 2)), 2);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1((const float*)(from + 3)), 3);
-  return tmp;
-}
+#endif
+
 // Loads 4 floats from memory a returns the packet
 // {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
 template <>
 EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
-  Packet16f tmp = _mm512_undefined_ps();
-  //= _mm512_castps128_ps512(_mm_load_ps1(from));
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
-  return tmp;
+  Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
+  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+  return _mm512_permutexvar_ps(scatter_mask, tmp);
 }
-// Loads 4 doubles from memory a returns the packet
+
+// Loads 2 doubles from memory a returns the packet
 // {a0, a0  a0, a0, a1, a1, a1, a1}
 template <>
 EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
-  Packet8d tmp = _mm512_undefined_pd();
-  Packet2d tmp0 = _mm_load_pd1(from);
-  Packet2d tmp1 = _mm_load_pd1(from + 1);
-  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
-  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
+  __m256d lane0 = _mm256_set1_pd(*from);
+  __m256d lane1 = _mm256_set1_pd(*(from+1));
+  __m512d tmp = _mm512_undefined_pd();
   tmp = _mm512_insertf64x4(tmp, lane0, 0);
   return _mm512_insertf64x4(tmp, lane1, 1);
 }
+
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
@@ -612,6 +779,7 @@
   EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
                                                 from);
 }
+
 template <>
 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
@@ -626,19 +794,15 @@
       reinterpret_cast<__m512i*>(to), from);
 }
 template <>
-EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from,
-                                                           int stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
-
-  return _mm512_i32gather_epi32(indices, from, 4);
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
 }
+
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
-                                                             int stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+                                                             Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier =
       _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -647,28 +811,19 @@
 }
 template <>
 EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
-                                                            int stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+                                                            Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
 
   return _mm512_i32gather_pd(indices, from, 8);
 }
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to,
-                                                       const Packet16i& from,
-                                                       int stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
-  _mm512_i32scatter_epi32(to, indices, from, 4);
-}
+
 template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                          const Packet16f& from,
-                                                         int stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+                                                         Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier =
       _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -677,12 +832,13 @@
 template <>
 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                          const Packet8d& from,
-                                                         int stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+                                                         Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
   _mm512_i32scatter_pd(to, indices, from, 8);
 }
+
 template <>
 EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
   Packet16f pa = pset1<Packet16f>(a);
@@ -698,18 +854,11 @@
   Packet16i pa = pset1<Packet16i>(a);
   pstore(to, pa);
 }
-template<>
-EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) {
-  _mm_prefetch((const char*)(addr), _MM_HINT_T0);
-}
-template<>
-EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
-  _mm_prefetch((const char*)(addr), _MM_HINT_T0);
-}
-template<>
-EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) {
-  _mm_prefetch((const char*)(addr), _MM_HINT_T0);
-}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
   return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
@@ -723,48 +872,85 @@
   return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
 }
 
-template<>
-EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a)
+template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
 {
-  return _mm512_permutexvar_epi32(
-      _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-      a);
+  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
 }
 
-template<>
-EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
+template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
 {
-  return _mm512_permutexvar_ps(
-      _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-      a);
+  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
 }
 
-template<>
-EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
+template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
 {
-  return _mm512_permutexvar_pd(
-      _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
-}
-
-template<>
-EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a)
-{
-  return _mm512_abs_epi32(a);
-}
-template<>
-EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
-{
-  return _mm512_abs_ps(a);
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
-  return _mm512_abs_pd(a);
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
+                                   _mm512_set1_epi64(0x7fffffffffffffff)));
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract exponent without existence of Packet8l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+  const Packet8d cst_exp_mask  = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  #ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
+  #else
+  return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
+  #endif
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
+  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet8i bias = pset1<Packet8i>(1023);
+  Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  Packet8i lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  Packet8d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
 }
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                           \
-  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);                    \
   __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
 #else
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
@@ -778,471 +964,147 @@
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
 #else
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
+  OUTPUT = _mm512_undefined_ps();                                           \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
 #endif
 
-template<>
-EIGEN_STRONG_INLINE Packet16i preduxp<Packet16i>(const Packet16i* vecs)
-{
-  __m512i quarters[4];
-  for (int i = 0; i < 4; ++i) {
-    auto v0 = vecs[i + 0];
-    auto v8 = vecs[i + 8];
-    auto t0 = _mm512_add_epi32(
-                _mm512_shuffle_i32x4(v0, v8, _MM_SHUFFLE(1, 0, 1, 0)),
-                _mm512_shuffle_i32x4(v0, v8, _MM_SHUFFLE(3, 2, 3, 2)));
-    auto v4 = vecs[i + 4];
-    auto v12 = vecs[i + 12];
-    auto t4 = _mm512_add_epi32(
-                _mm512_shuffle_i32x4(v4, v12, _MM_SHUFFLE(1, 0, 1, 0)),
-                _mm512_shuffle_i32x4(v4, v12, _MM_SHUFFLE(3, 2, 3, 2)));
-    // 128-bit lanes for add:
-    // top = t0[0] t4[1] t0[2] t4[3]
-    // bot = t0[1] t4[0] t0[3] t4[2]
-    auto top = _mm512_mask_blend_epi64(0xcc, t0, t4);
-    auto bot = _mm512_permutex_epi64(
-                 _mm512_mask_blend_epi64(0x33, t0, t4),
-                 _MM_SHUFFLE(1, 0, 3, 2));
-    auto t = _mm512_add_epi32(top, bot);
-    quarters[i] = t;
-  }
-  // Now, sum(j=0..3, quarters[i & 3][j + (i & 12)]) = sum(j=0..15, vecs[i][j])
-  // In order to use horizontal add operations, we need to work on 256-bit data
-  // only; remember that the adds are only within 128-bit lanes and interleave
-  // the sums from the two input vectors
-  __m256i result0 =
-      _mm256_hadd_epi32(
-          _mm256_hadd_epi32(
-              _mm512_castsi512_si256(quarters[0]),
-              _mm512_castsi512_si256(quarters[1])),
-          _mm256_hadd_epi32(
-              _mm512_castsi512_si256(quarters[2]),
-              _mm512_castsi512_si256(quarters[3])));
-  __m256i result1 =
-      _mm256_hadd_epi32(
-          _mm256_hadd_epi32(
-              _mm512_extracti64x4_epi64(quarters[0], 1),
-              _mm512_extracti64x4_epi64(quarters[1], 1)),
-          _mm256_hadd_epi32(
-              _mm512_extracti64x4_epi64(quarters[2], 1),
-              _mm512_extracti64x4_epi64(quarters[3], 1)));
-  return _mm512_inserti64x4(_mm512_castsi256_si512(result0), result1, 1);
-}
-
-template<>
-EIGEN_STRONG_INLINE Packet16f
-preduxp<Packet16f>(const Packet16f* vecs) {
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);
-
-  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
-  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
-  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
-  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);
-
-  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
-  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
-  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
-  __m256 sum4 = _mm256_add_ps(perm4, hsum8);
-
-  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
-
-  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
-  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
-  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
-  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);
-
-  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  sum1 = _mm256_add_ps(perm1, hsum5);
-  sum2 = _mm256_add_ps(perm2, hsum6);
-  sum3 = _mm256_add_ps(perm3, hsum7);
-  sum4 = _mm256_add_ps(perm4, hsum8);
-
-  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));
-
-  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
-  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
-  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
-  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
-
-  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  sum1 = _mm256_add_ps(perm1, hsum5);
-  sum2 = _mm256_add_ps(perm2, hsum6);
-  sum3 = _mm256_add_ps(perm3, hsum7);
-  sum4 = _mm256_add_ps(perm4, hsum8);
-
-  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
-
-  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
-  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
-  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
-  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
-
-  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  sum1 = _mm256_add_ps(perm1, hsum5);
-  sum2 = _mm256_add_ps(perm2, hsum6);
-  sum3 = _mm256_add_ps(perm3, hsum7);
-  sum4 = _mm256_add_ps(perm4, hsum8);
-
-  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
-
-  __m512 final_output = _mm512_undefined_ps();
-
-  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
-  return final_output;
-}
-
-template<>
-EIGEN_STRONG_INLINE Packet8d
-preduxp<Packet8d>(const Packet8d* vecs) {
-  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
-  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
-
-  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
-  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
-
-  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
-  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
-
-  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
-  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
-
-  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
-  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
-
-  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
-  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
-
-  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
-  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
-
-  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
-  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
-
-  Packet4d tmp0, tmp1;
-
-  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
-
-  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
-
-  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
-
-  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
-
-  __m512d final_output = _mm512_undefined_pd();
-  final_output = _mm512_insertf64x4(final_output, final_0, 0);
-
-  return _mm512_insertf64x4(final_output, final_1, 1);
-}
-
 template <>
 EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
-  Packet8f x1 =
-      _mm256_add_ps(_mm512_castps512_ps256(a), _mm512_extractf64x4_pd(a, 1));
-  return predux<Packet8f>(x1);
-}
-template <>
-EIGEN_STRONG_INLINE int predux<Packet16i>(const Packet16i& a) {
-  Packet8i x1 = _mm256_add_epi32(
-                  _mm512_castsi512_si256(a),
-                  _mm512_extracti64x4_epi64(a, 1));
-  Packet4i x2 = _mm_add_epi32(
-                  _mm256_castsi256_si128(x1),
-                  _mm256_extracti32x4_epi32(x1, 1));
-  return predux<Packet4i>(x2);
-}
-template <>
-EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
-  Packet4d x1 = _mm256_add_pd(
-                  _mm512_castpd512_pd256(a),
-                  _mm512_extractf64x4_pd(a, 1));
-  return predux<Packet4d>(x1);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8i predux4<Packet16i>(const Packet16i& a) {
-// Cast not found?
-// Packet4f lane0 = _mm512_castps512_ps128(a);
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  Packet8i lane0 = _mm512_extracti32x8_epi32(a, 0);
-  Packet8i lane1 = _mm512_extracti32x8_epi32(a, 1);
-  return padd(lane0, lane1);
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f x = _mm256_add_ps(lane0, lane1);
+  return predux<Packet8f>(x);
 #else
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a, 3);
-  Packet4i sum0 = padd(lane0, lane2);
-  Packet4i sum1 = padd(lane1, lane3);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
+  sum = _mm_hadd_ps(sum, sum);
+  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
+  return _mm_cvtss_f32(sum);
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet8f predux4<Packet16f>(const Packet16f& a) {
-// Cast not found?
-// Packet4f lane0 = _mm512_castps512_ps128(a);
+EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d sum = _mm256_add_pd(lane0, lane1);
+  __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
-  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
-  return padd(lane0, lane1);
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  return _mm256_add_ps(lane0, lane1);
 #else
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f sum0 = padd(lane0, lane2);
-  Packet4f sum1 = padd(lane1, lane3);
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum0 = _mm_add_ps(lane0, lane2);
+  __m128 sum1 = _mm_add_ps(lane1, lane3);
   return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet4d predux4<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = padd(lane0, lane1);
-  return res;
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return _mm256_add_pd(lane0, lane1);
 }
-template <>
-EIGEN_STRONG_INLINE int predux_mul<Packet16i>(const Packet16i& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  Packet8i lane0 = _mm512_castsi512_si256(a);
-  Packet8i lane1 = _mm512_extracti32x8_epi32(a, 1);
-  Packet8i res = _mm256_mullo_epi32(lane0, lane1);
-  Packet4i res128 = _mm_mullo_epi32(
-                      _mm256_castsi256_si128(res),
-                      _mm256_extracti128_si256(res, 1));
-  res128 = _mm_mullo_epi32(
-             res128,
-             _mm_shuffle_epi32(res128, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(
-           _mm_mullo_epi32(
-             res128,
-             _mm_shuffle_epi32(res128, _MM_SHUFFLE(0, 0, 0, 1))));
-#else
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a, 3);
-  Packet4i res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
-  res = pmul(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(pmul(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-#endif
-}
+
 template <>
 EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
+//#ifdef EIGEN_VECTORIZE_AVX512DQ
+#if 0
   Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
   Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
   Packet8f res = pmul(lane0, lane1);
-  Packet4f res128 =
-      _mm256_castps256_ps128(pmul(res, _mm256_permute2f128_ps(res, res, 1)));
-  res128 = pmul(res128, _mm_permute_ps(res128, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(pmul(res128, _mm_permute_ps(res128, _MM_SHUFFLE(0, 0, 0, 1))));
+  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
 #else
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
   res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
 #endif
 }
 template <>
 EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = pmul(lane0, lane1);
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = pmul(lane0, lane1);
   res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
   return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
 }
-template <>
-EIGEN_STRONG_INLINE int predux_min<Packet16i>(const Packet16i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a, 3);
-  Packet4i res =
-      _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
-  res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(
-           _mm_min_epi32(
-             res,
-             _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-}
+
 template <>
 EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
   res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
 }
 template <>
 EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = _mm256_min_pd(lane0, lane1);
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_min_pd(lane0, lane1);
   res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
   return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
-template <>
-EIGEN_STRONG_INLINE int predux_max<Packet16i>(const Packet16i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a, 3);
-  Packet4i res =
-      _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
-  res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(
-           _mm_max_epi32(
-             res,
-             _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-}
+
 template <>
 EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
   res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
 }
+
 template <>
 EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = _mm256_max_pd(lane0, lane1);
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_max_pd(lane0, lane1);
   res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
   return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
 
-template <int Offset>
-struct palign_impl<Offset, Packet16i> {
-  static EIGEN_STRONG_INLINE
-  void run(Packet16i& first, const Packet16i& second) {
-    first = _mm512_alignr_epi32(second, first, Offset);
-  }
-};
-template <int Offset>
-struct palign_impl<Offset, Packet16f> {
-  static EIGEN_STRONG_INLINE
-  void run(Packet16f& first, const Packet16f& second) {
-    first = _mm512_alignr_epi32(second, first, Offset);
-  }
-};
-template <int Offset>
-struct palign_impl<Offset, Packet8d> {
-  static EIGEN_STRONG_INLINE
-  void run(Packet8d& first, const Packet8d& second) {
-    first = _mm512_alignr_epi64(second, first, Offset);
-  }
-};
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+  Packet16i xi = _mm512_castps_si512(x);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+  return !_mm512_kortestz(tmp,tmp);
+}
+
+
 
 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
   EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
 
-// For types implicitly convertible to and from __m512
-template <typename PacketType>
-EIGEN_DEVICE_FUNC inline void
-ptranspose_16_base(PacketBlock<PacketType>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
   __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
   __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
@@ -1353,22 +1215,11 @@
   PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
   PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
 }
-
-template <>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
-  ptranspose_16_base(kernel);
-}
-template <>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
-  ptranspose_16_base(kernel);
-}
 #define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)         \
   EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
                            INPUT[2 * INDEX + STRIDE]);
 
-template <typename PacketType>
-EIGEN_DEVICE_FUNC inline void
-ptranspose_4_base(PacketBlock<PacketType, 4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
   __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
   __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
@@ -1401,14 +1252,6 @@
   PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
   PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
 }
-template <>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
-  ptranspose_4_base(kernel);
-}
-template <>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
-  ptranspose_4_base(kernel);
-}
 
 #define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
   OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
@@ -1419,7 +1262,6 @@
   OUTPUT[INDEX] =                                                           \
       _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
 
-template <>
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
   __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
   __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
@@ -1451,7 +1293,7 @@
   PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
   PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
 }
-template <>
+
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
   __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
   __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
@@ -1511,54 +1353,951 @@
   PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket,
-                                     const Packet16f& thenPacket,
-                                     const Packet16f& elsePacket) {
-  const __m512 zero = _mm512_setzero_ps();
-  const __m512 select = _mm512_set_ps(
-      ifPacket.select[15], ifPacket.select[14], ifPacket.select[13],
-      ifPacket.select[12], ifPacket.select[11], ifPacket.select[10],
-      ifPacket.select[9], ifPacket.select[8], ifPacket.select[7],
-      ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
-      ifPacket.select[3], ifPacket.select[2], ifPacket.select[1],
-      ifPacket.select[0]);
-  __mmask16 false_mask = _mm512_cmp_ps_mask(select, zero, _CMP_EQ_UQ);
-  // TODO(nishantpatil) generate false_mask from Selector
-  return _mm512_mask_blend_ps(false_mask, thenPacket, elsePacket);
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
+                                     const Packet16f& /*thenPacket*/,
+                                     const Packet16f& /*elsePacket*/) {
+  assert(false && "To be implemented");
+  return Packet16f();
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
                                     const Packet8d& thenPacket,
                                     const Packet8d& elsePacket) {
-  const __m512d zero = _mm512_setzero_pd();
-  const __m512d select =
-      _mm512_set_pd(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5],
-                    ifPacket.select[4], ifPacket.select[3], ifPacket.select[2],
-                    ifPacket.select[1], ifPacket.select[0]);
-  __mmask16 false_mask = _mm512_cmp_pd_mask(select, zero, _CMP_EQ_UQ);
-  // TODO(nishantpatil) generate false_mask from Selector
-  return _mm512_mask_blend_pd(false_mask, thenPacket, elsePacket);
+  __mmask8 m = (ifPacket.select[0]   )
+             | (ifPacket.select[1]<<1)
+             | (ifPacket.select[2]<<2)
+             | (ifPacket.select[3]<<3)
+             | (ifPacket.select[4]<<4)
+             | (ifPacket.select[5]<<5)
+             | (ifPacket.select[6]<<6)
+             | (ifPacket.select[7]<<7);
+  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
 }
-// Functions to print vectors of different types, makes debugging much easier.
-namespace{
-void print16f(char* name, __m512 val) {
-  float temp[16] __attribute__((aligned(32)));
-  _mm512_store_ps(temp, val);
-  printf("%s: ", name);
-  for (int k = 0; k < 16; k++) printf("%.8e ", temp[k]);
-  printf("\n");
+
+// Packet math for Eigen::half
+template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_epi16(from.x);
 }
-// void print16i(char* name, __m512i val) {
-//  int temp[16] __attribute__((aligned(32)));
-//  _mm512_store_si512((__m512i*)temp, val);
-//  printf("%s: ", name);
-//  for (int k = 0; k < 16; k++) printf("%i ", temp[k]);
-//  printf("\n");
-//}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_store_si256((__m256i*)(void*)to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_storeu_si256((__m256i*)(void*)to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half*  from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  unsigned short e = from[4].x;
+  unsigned short f = from[5].x;
+  unsigned short g = from[6].x;
+  unsigned short h = from[7].x;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploadquad(const Eigen::half* from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtph_ps(a);
+#else
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+  float f8(aux[8]);
+  float f9(aux[9]);
+  float fa(aux[10]);
+  float fb(aux[11]);
+  float fc(aux[12]);
+  float fd(aux[13]);
+  float fe(aux[14]);
+  float ff(aux[15]);
+
+  return _mm512_set_ps(
+      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+#else
+  EIGEN_ALIGN64 float aux[16];
+  pstore(aux, a);
+  half h0(aux[0]);
+  half h1(aux[1]);
+  half h2(aux[2]);
+  half h3(aux[3]);
+  half h4(aux[4]);
+  half h5(aux[5]);
+  half h6(aux[6]);
+  half h7(aux[7]);
+  half h8(aux[8]);
+  half h9(aux[9]);
+  half ha(aux[10]);
+  half hb(aux[11]);
+  half hc(aux[12]);
+  half hd(aux[13]);
+  half he(aux[14]);
+  half hf(aux[15]);
+
+  return _mm256_set_epi16(
+      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
+      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+  return ptrue(Packet8i(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return float2half(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+  // in some cases Packet8i is a wrapper around __m256i, so we need to
+  // cast to Packet8i to call the correct overload.
+  return por(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+  return pxor(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+  return pand(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+  return pandnot(Packet8i(a),Packet8i(b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  return float2half(pround<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return float2half(print<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return float2half(pceil<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return float2half(pfloor<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  return Pack32To16(pcmp_eq(af, bf));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux(from_float));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8h lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8h>(lane0, lane1);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
+  Packet16f af = half2float(a);
+  float reduced = predux_max<Packet16f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
+  Packet16f af = half2float(a);
+  float reduced = predux_min<Packet16f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
+                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
+{
+  return _mm256_set_epi16(
+      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
+      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
+      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
+      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
+{
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+  to[stride*8] = aux[8];
+  to[stride*9] = aux[9];
+  to[stride*10] = aux[10];
+  to[stride*11] = aux[11];
+  to[stride*12] = aux[12];
+  to[stride*13] = aux[13];
+  to[stride*14] = aux[14];
+  to[stride*15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = a_p_0;
+  kernel.packet[1] = a_p_1;
+  kernel.packet[2] = a_p_2;
+  kernel.packet[3] = a_p_3;
+  kernel.packet[4] = a_p_4;
+  kernel.packet[5] = a_p_5;
+  kernel.packet[6] = a_p_6;
+  kernel.packet[7] = a_p_7;
+  kernel.packet[8] = a_p_8;
+  kernel.packet[9] = a_p_9;
+  kernel.packet[10] = a_p_a;
+  kernel.packet[11] = a_p_b;
+  kernel.packet[12] = a_p_c;
+  kernel.packet[13] = a_p_d;
+  kernel.packet[14] = a_p_e;
+  kernel.packet[15] = a_p_f;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j+8] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][4*i+1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+8] = in[j][4*i+2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+12] = in[j][4*i+3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet16bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+    HasLog = 1,  // Currently fails test with bad accuracy.
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasNdtri = 1,
+    HasBessel  = 1,
+#endif
+    HasExp = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasCmp  = 1,
+    HasDiv = 1
+  };
 };
 
+template <>
+struct unpacket_traits<Packet16bf>
+{
+  typedef bfloat16 type;
+  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet8bf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pset1<Packet16bf>(const bfloat16& from) {
+  return _mm256_set1_epi16(from.value);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet16bf>(const Packet16bf& from) {
+  bfloat16 t;
+  t.value = static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
+  return t;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pload<Packet16bf>(const bfloat16* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
+                                          const Packet16bf& from) {
+  _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
+                                           const Packet16bf& from) {
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf
+ploaddup<Packet16bf>(const bfloat16* from) {
+  Packet16bf r;
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  unsigned short e = from[4].value;
+  unsigned short f = from[5].value;
+  unsigned short g = from[6].value;
+  unsigned short h = from[7].value;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf
+ploadquad(const bfloat16* from) {
+  Packet16bf r;
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) {
+  return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
+  Packet16bf r;
+
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
+  // Since GCC 10.1 supports avx512bf16 and C style explicit cast
+  // (C++ static_cast is not supported yet), do converion via intrinsic
+  // and register path for performance.
+  r = (__m256i)(_mm512_cvtneps_pbh(a));
+
+#else
+  __m512i t;
+  __m512i input = _mm512_castps_si512(a);
+  __m512i nan = _mm512_set1_epi32(0x7fc0);
+
+  // uint32_t lsb = (input >> 16) & 1;
+  t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm512_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm512_srli_epi32(t, 16);
+
+  // Check NaN before converting back to bf16
+  __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+
+  t = _mm512_mask_blend_epi32(mask, nan, t);
+  // output.value = static_cast<uint16_t>(input);
+  r = _mm512_cvtepi32_epi16(t);
+#endif // EIGEN_VECTORIZE_AVX512BF16
+
+  return r;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) {
+  return ptrue<Packet8i>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) {
+  return por<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) {
+  return pxor<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) {
+  return pand<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return pandnot<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
+                                       const Packet16bf& a,
+                                       const Packet16bf& b) {
+  // Input mask is expected to be all 0/1, handle it with 8-bit
+  // intrinsic for performance.
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
+{
+  return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
+                                              const Packet16bf& b) {
+  return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) {
+  Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
+  Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8bf>(lane0, lane1);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
+  return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
+  __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
+                               14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+
+  Packet16bf res;
+  // Swap hi and lo first because shuffle is in 128-bit lanes.
+  res = _mm256_permute2x128_si256(a, a, 1);
+  // Shuffle 8-bit values in src within 2*128-bit lanes.
+  return _mm256_shuffle_epi8(res, m);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
+                                                             Index stride) {
+  return _mm256_set_epi16(
+      from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
+      from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
+      from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
+      from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
+                                                        const Packet16bf& from,
+                                                        Index stride) {
+  EIGEN_ALIGN64 bfloat16 aux[16];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+  to[stride*8] = aux[8];
+  to[stride*9] = aux[9];
+  to[stride*10] = aux[10];
+  to[stride*11] = aux[11];
+  to[stride*12] = aux[12];
+  to[stride*13] = aux[13];
+  to[stride*14] = aux[14];
+  to[stride*15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_PACKET_MATH_AVX_H
+#endif // EIGEN_PACKET_MATH_AVX512_H

diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
new file mode 100644
index 0000000..3304127
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h

@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX512_H
+#define EIGEN_TYPE_CASTING_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+  return _mm512_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+  return _mm512_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
+  return _mm512_castps_si512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
+  return _mm512_castsi512_ps(a);
+}
+
+template <>
+struct type_casting_traits<half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+
+template <>
+struct type_casting_traits<float, half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+
+template <>
+struct type_casting_traits<bfloat16, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
+  return Bf16ToF32(a);
+}
+
+template <>
+struct type_casting_traits<float, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
+  return F32ToBf16(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX512_H

diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 57df950..b393299 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,27 +11,72 @@
 #ifndef EIGEN_COMPLEX32_ALTIVEC_H
 #define EIGEN_COMPLEX32_ALTIVEC_H
 
-
 namespace Eigen {
 
 namespace internal {
 
-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef EIGEN_VECTORIZE_VSX
-#ifdef _BIG_ENDIAN
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+#ifdef __VSX__
+#if defined(_BIG_ENDIAN)
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #else
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #endif
-#endif  // EIGEN_VECTORIZE_VSX
+#endif
 
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
+  {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(-v);
+  }
+
   Packet4f  v;
 };
 
@@ -38,10 +84,12 @@
 {
   typedef Packet2cf type;
   typedef Packet2cf half;
+  typedef Packet4f as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -52,17 +100,19 @@
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
+#ifdef __VSX__
+    HasBlend  = 1,
+#endif
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
-  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
-  if((ptrdiff_t(&from) % 16) == 0)
+  if((std::ptrdiff_t(&from) % 16) == 0)
     res.v = pload<Packet4f>((const float *)&from);
   else
     res.v = ploadu<Packet4f>((const float *)&from);
@@ -70,69 +120,62 @@
   return res;
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  Packet4f res0, res1;
+#ifdef __VSX__
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
+#ifdef _BIG_ENDIAN
+  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#else
+  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#endif
+#else
+  *reinterpret_cast<std::complex<float> *>(&res0) = from0;
+  *reinterpret_cast<std::complex<float> *>(&res1) = from1;
+  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
+#endif
+  return Packet2cf(res0);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  EIGEN_ALIGN16 std::complex<float> af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
-  return Packet2cf(vec_ld(0, (const float*)af));
+  return pload<Packet2cf>(af);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
-  vec_st(from.v, 0, (float*)af);
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f v1, v2;
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
 
-  // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
-  // multiply a_re * b 
-  v1 = vec_madd(v1, b.v, p4f_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
-  // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-  
-  return Packet2cf(vec_add(v1, v2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
-{
-  return pset1<Packet2cf>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<float> res[2];
   pstore((float *)&res, a.v);
 
   return res[0];
@@ -148,92 +191,29 @@
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  b = padd(a.v, b);
-  return pfirst(Packet2cf(b));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  Packet4f b1, b2;
-#ifdef _BIG_ENDIAN  
-  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-#else
-  b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-#endif
-  b2 = (Packet4f) vec_sld(b2, b2, 8);
-  b2 = padd(b1, b2);
-
-  return Packet2cf(b2);
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
 }
 
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
   Packet2cf prod;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  prod = pmul(a, Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
 
-  return pfirst(prod);
+  return pfirst<Packet2cf>(prod);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-#ifdef _BIG_ENDIAN
-      first.v = vec_sld(first.v, second.v, 8);
-#else
-      first.v = vec_sld(second.v, first.v, 8);
-#endif
-    }
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
+  Packet2cf res = pmul(a, pconj(b));
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
@@ -241,19 +221,82 @@
   return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
 }
 
-template<> EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
 {
   Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
   kernel.packet[0].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
+  return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
+}
+
+#ifdef __VSX__
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
+{
+  return psqrt_complex<Packet2cf>(a);
+}
+
 //---------- double ----------
-#if defined(EIGEN_VECTORIZE_VSX)
+#ifdef __VSX__
 struct Packet1cd
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
+  {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(-v);
+  }
+
   Packet2d v;
 };
 
@@ -261,6 +304,7 @@
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
+  typedef Packet2d as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
@@ -280,73 +324,42 @@
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
-// Google-local: Change type from DenseIndex to int in patch.
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, int/*DenseIndex*/ stride)
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet1cd>(af);
+  return pload<Packet1cd>(from);
 }
-// Google-local: Change type from DenseIndex to int in patch.
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, int/*DenseIndex*/ stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<double> >(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  pstore<std::complex<double> >(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  Packet2d a_re, a_im, v1, v2;
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
 
-  // Permute and multiply the real parts of a and b
-  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
-  // Get the imaginary parts of a
-  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
-  // multiply a_re * b
-  v1 = vec_madd(a_re, b.v, p2d_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
-  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }
 
-  return Packet1cd(vec_add(v1, v2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
-{
-  return pset1<Packet1cd>(*from);
-}
-
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<double> res[2];
   pstore<std::complex<double> >(res, a);
 
   return res[0];
@@ -354,70 +367,18 @@
 
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
-}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
-  return vecs[0];
-}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for AltiVec
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
-  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
+  Packet1cd res = pmul(a,pconj(b));
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
@@ -431,7 +392,24 @@
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
   kernel.packet[0].v = tmp;
 }
-#endif // EIGEN_VECTORIZE_VSX
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(vec_and(eq, eq_swapped));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
+{
+  return psqrt_complex<Packet1cd>(a);
+}
+
+#endif // __VSX__
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index afef4e9..3a7a329 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h

@@ -3,31 +3,15 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
 #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
 #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
 
-#include <iostream>
-using ::std::cout;
-using ::std::cin;
-using ::std::cerr;
-using ::std::ios;
-using ::std::endl;
-using ::std::iostream;
-using ::std::ios_base;
-using ::std::ostream;
-using ::std::istream;
-
-#define DUMP(v) do { std::cout << #v " = " << (v) << std::endl; } while(0)
-
 namespace Eigen {
 
 namespace internal {
@@ -35,272 +19,70 @@
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
-
-  /* natural logarithm computed for 4 simultaneous float
-    return NaN for x <= 0
-  */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
-  Packet4i emm0;
-
-  /* isvalid_mask is 0 if x < 0 or x is NaN. */
-  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
-  Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
-
-  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
-  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
-                reinterpret_cast<Packet4ui>(p4i_23));
-
-  /* keep only the fractional part */
-  x = pand(x, p4f_inv_mant_mask);
-  x = por(x, p4f_half);
-
-  emm0 = psub(emm0, p4i_0x7f);
-  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
-  Packet4f tmp = pand(x, mask);
-  x = psub(x, p4f_1);
-  e = psub(e, pand(p4f_1, mask));
-  x = padd(x, tmp);
-
-  Packet4f x2 = pmul(x,x);
-  Packet4f x3 = pmul(x2,x);
-
-  Packet4f y, y1, y2;
-  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
-  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
-  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
-  y  = pmadd(y , x, p4f_cephes_log_p2);
-  y1 = pmadd(y1, x, p4f_cephes_log_p5);
-  y2 = pmadd(y2, x, p4f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  y1 = pmul(e, p4f_cephes_log_q1);
-  tmp = pmul(x2, p4f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p4f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-  // negative arg will be NAN, 0 will be -INF
-  x = vec_sel(x, p4f_minus_inf, iszero_mask);
-  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
-  return x;
+  return plog_float(_x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-  fx = vec_floor(fx);
-
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // build 2^n
-  emm0 = vec_cts(fx, 0);
-  emm0 = vec_add(emm0, p4i_0x7f);
-  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
-
-  // Altivec's max & min operators just drop silent NaNs. Check NaNs in
-  // inputs and return them unmodified.
-  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
-  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
-                 isnumber_mask);
+  return pexp_float(_x);
 }
 
-#ifdef __VSX__
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+  return psin_float(_x);
+}
 
-#undef GCC_VERSION
-#define GCC_VERSION (__GNUC__ * 10000 \
-                     + __GNUC_MINOR__ * 100 \
-                     + __GNUC_PATCHLEVEL__)
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+  return pcos_float(_x);
+}
 
-// VSX support varies between different compilers and even different
-// versions of the same compiler.  For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
-// a slow version that works with older compilers.
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if GCC_VERSION >= 40903 || defined(__clang__)
-  return vec_cts(x, 0);
-#else
-  double tmp[2];
-  memcpy(tmp, &x, sizeof(tmp));
-  Packet2l l = { static_cast<long long>(tmp[0]),
-                 static_cast<long long>(tmp[1]) };
-  return l;
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_rsqrt(x);
+}
 #endif
+
+#ifdef __VSX__
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_sqrt(x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
-  Packet2d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-  Packet2d tmp, fx;
-  Packet2l emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-  fx = vec_floor(fx);
-
-  tmp = pmul(fx, p2d_cephes_exp_C1);
-  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet2d x2 = pmul(x,x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
-
-  // build 2^n
-  emm0 = ConvertToPacket2l(fx);
-
-#ifdef __POWER8_VECTOR__
-  static const Packet2l p2l_1023 = { 1023, 1023 };
-  static const Packet2ul p2ul_52 = { 52, 52 };
-
-  emm0 = vec_add(emm0, p2l_1023);
-  emm0 = vec_sl(emm0, p2ul_52);
-#else
-  // Code is a bit complex for POWER7.  There is actually a
-  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
-  // So we shift (52-32) bits and do a word swap with zeros.
-  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
-
-  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
-  emm04i = vec_add(emm04i, p4i_1023);
-  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
-  static const Packet16uc perm = {
-    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
-    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef  _BIG_ENDIAN
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
-#endif
-
-#endif
-
-  // Altivec's max & min operators just drop silent NaNs. Check NaNs in
-  // inputs and return them unmodified.
-  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
-  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
-                 isnumber_mask);
+  return pexp_double(_x);
 }
 #endif
 
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
new file mode 100644
index 0000000..8feb88e
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h

@@ -0,0 +1,2765 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+
+#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
+#define EIGEN_ALTIVEC_USE_CUSTOM_PACK    1
+#endif
+
+#include "MatrixProductCommon.h"
+
+// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX
+#if EIGEN_COMP_LLVM
+#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
+#ifdef __MMA__
+#define EIGEN_ALTIVEC_MMA_ONLY
+#else
+#define EIGEN_ALTIVEC_DISABLE_MMA
+#endif
+#endif
+#endif
+
+#ifdef __has_builtin
+#if __has_builtin(__builtin_mma_assemble_acc)
+  #define ALTIVEC_MMA_SUPPORT
+#endif
+#endif
+
+#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+  #include "MatrixProductMMA.h"
+#endif
+
+/**************************************************************************************************
+ * TODO                                                                                           *
+ * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). *
+ * - Check the possibility of transposing as GETREAL and GETIMAG when needed.                     *
+ **************************************************************************************************/
+namespace Eigen {
+
+namespace internal {
+
+/**************************
+ * Constants and typedefs *
+ **************************/
+template<typename Scalar>
+struct quad_traits
+{
+  typedef typename packet_traits<Scalar>::type    vectortype;
+  typedef PacketBlock<vectortype,4>                     type;
+  typedef vectortype                                 rhstype;
+  enum
+  {
+    vectorsize = packet_traits<Scalar>::size,
+    size = 4,
+    rows = 4
+  };
+};
+
+template<>
+struct quad_traits<double>
+{
+  typedef Packet2d                        vectortype;
+  typedef PacketBlock<vectortype,4>             type;
+  typedef PacketBlock<Packet2d,2>            rhstype;
+  enum
+  {
+    vectorsize = packet_traits<double>::size,
+    size = 2,
+    rows = 4
+  };
+};
+
+// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector, this turned out
+// to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. This constants then
+// are responsible to extract from convert between Eigen's and MatrixProduct approach.
+
+const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,
+                                             8,  9, 10, 11,
+                                            16, 17, 18, 19,
+                                            24, 25, 26, 27};
+
+const static Packet16uc p16uc_GETIMAG32 = {  4,  5,  6,  7,
+                                            12, 13, 14, 15,
+                                            20, 21, 22, 23,
+                                            28, 29, 30, 31};
+const static Packet16uc p16uc_GETREAL64 = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                            16, 17, 18, 19, 20, 21, 22, 23};
+
+//[a,ai],[b,bi] = [ai,bi]
+const static Packet16uc p16uc_GETIMAG64 = {  8,  9, 10, 11, 12, 13, 14, 15,
+                                            24, 25, 26, 27, 28, 29, 30, 31};
+
+/*********************************************
+ * Single precision real and complex packing *
+ * *******************************************/
+
+/**
+ * Symm packing is related to packing of symmetric adjoint blocks, as expected the packing leaves
+ * the diagonal real, whatever is below it is copied from the respective upper diagonal element and 
+ * conjugated. There's no PanelMode available for symm packing.
+ *
+ * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using 
+ * its respective rank-update instructions. The float32/64 versions are different because at this moment
+ * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements.
+ * 
+ * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has
+ * to take that into account, at the moment, we run pack the real part and then the imaginary part, this is the main
+ * reason why packing for complex is broken down into several different parts, also the reason why we endup having a
+ * float32/64 and complex float32/64 version.
+ **/
+template<typename Scalar, typename Index, int StorageOrder>
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
+{
+  std::complex<Scalar> v;
+  if(i < j)
+  {
+    v.real( dt(j,i).real());
+    v.imag(-dt(j,i).imag());
+  } else if(i > j)
+  {
+    v.real( dt(i,j).real());
+    v.imag( dt(i,j).imag());
+  } else {
+    v.real( dt(i,j).real());
+    v.imag((Scalar)0.0);
+  }
+  return v;
+}
+
+template<typename Scalar, typename Index, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+{
+  const Index depth = k2 + rows;
+  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
+  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * rows;
+  Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
+
+  Index rir = 0, rii, j = 0;
+  for(; j + vectorSize <= cols; j+=vectorSize)
+  {
+    rii = rir + vectorDelta;
+
+    for(Index i = k2; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
+
+        blockBf[rir + k] = v.real();
+        blockBf[rii + k] = v.imag();
+      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
+  }
+
+  for(; j < cols; j++)
+  {
+    rii = rir + rows;
+
+    for(Index i = k2; i < depth; i++)
+    {
+      std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j, rhs);
+
+      blockBf[rir] = v.real();
+      blockBf[rii] = v.imag();
+
+      rir += 1;
+      rii += 1;
+    }
+
+    rir += rows;
+  }
+}
+
+template<typename Scalar, typename Index, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
+{
+  const Index depth = cols;
+  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * depth;
+  Scalar* blockAf = (Scalar *)(blockA);
+
+  Index rir = 0, rii, j = 0;
+  for(; j + vectorSize <= rows; j+=vectorSize)
+  {
+    rii = rir + vectorDelta;
+
+    for(Index i = 0; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
+
+        blockAf[rir + k] = v.real();
+        blockAf[rii + k] = v.imag();
+      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
+  }
+
+  if (j < rows)
+  {
+    rii = rir + ((rows - j) * depth);
+
+    for(Index i = 0; i < depth; i++)
+    {
+      Index k = j;
+      for(; k < rows; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
+
+        blockAf[rir] = v.real();
+        blockAf[rii] = v.imag();
+
+        rir += 1;
+        rii += 1;
+      }
+    }
+  }
+}
+
+template<typename Scalar, typename Index, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+{
+  const Index depth = k2 + rows;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
+  Index ri = 0, j = 0;
+  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
+  {
+    Index i = k2;
+    for(; i < depth; i++)
+    {
+      for(Index k = 0; k < N*vectorSize; k++)
+      {
+        if(i <= j+k)
+          blockB[ri + k] = rhs(j+k, i);
+        else
+          blockB[ri + k] = rhs(i, j+k);
+      }
+      ri += N*vectorSize;
+    }
+  }
+
+  for(; j < cols; j++)
+  {
+    for(Index i = k2; i < depth; i++)
+    {
+      if(j <= i)
+        blockB[ri] = rhs(i, j);
+      else
+        blockB[ri] = rhs(j, i);
+      ri += 1;
+    }
+  }
+}
+
+template<typename Scalar, typename Index, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+{
+  const Index depth = cols;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
+  Index ri = 0, j = 0;
+  for(; j + vectorSize <= rows; j+=vectorSize)
+  {
+    Index i = 0;
+
+    for(; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
+      {
+        if(i <= j+k)
+          blockA[ri + k] = lhs(j+k, i);
+        else
+          blockA[ri + k] = lhs(i, j+k);
+      }
+      ri += vectorSize;
+    }
+  }
+
+  if (j < rows)
+  {
+    for(Index i = 0; i < depth; i++)
+    {
+      Index k = j;
+      for(; k < rows; k++)
+      {
+        if(i <= k)
+          blockA[ri] = lhs(k, i);
+        else
+          blockA[ri] = lhs(i, k);
+        ri += 1;
+      }
+    }
+  }
+}
+
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
+{
+  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack std::complex<float64> ***********
+
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
+{
+  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack float32 ***********
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<float, Index, nr, StorageOrder>
+{
+  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack float64 ***********
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<double, Index, nr, StorageOrder>
+{
+  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+/**
+ * PanelMode
+ * Packing might be called several times before being multiplied by gebp_kernel, this happens because 
+ * on special occasions it fills part of block with other parts of the matrix. Two variables control
+ * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever
+ * is going to be the real offset and stride in the future and this is what you should obey. The process
+ * is to behave as you would with normal packing but leave the start of each part with the correct offset
+ * and the end as well respecting the real stride the block will have. Gebp is aware of both blocks stride
+ * and offset and behaves accordingly.
+ **/
+
+template<typename Scalar, typename Packet, typename Index, int N>
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,N>& block)
+{
+  const Index size = 16 / sizeof(Scalar);
+  pstore<Scalar>(to + (0 * size), block.packet[0]);
+  pstore<Scalar>(to + (1 * size), block.packet[1]);
+  if (N > 2) {
+    pstore<Scalar>(to + (2 * size), block.packet[2]);
+  }
+  if (N > 3) {
+    pstore<Scalar>(to + (3 * size), block.packet[3]);
+  }
+}
+
+// General template for lhs & rhs complex packing.
+template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
+struct dhs_cpack {
+  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
+    Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
+    Index j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet,4> blockr, blocki;
+        PacketBlock<PacketC,8> cblock;
+
+        if (UseLhs) {
+          bload<DataMapper, PacketC, Index, 2, StorageOrder, true, 4>(cblock, lhs, j, i);
+        } else {
+          bload<DataMapper, PacketC, Index, 2, StorageOrder, true, 4>(cblock, lhs, i, j);
+        }
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
+        blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
+        blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
+        blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
+
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
+        blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
+        blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
+        blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+          blocki.packet[2] = -blocki.packet[2];
+          blocki.packet[3] = -blocki.packet[3];
+        }
+
+        if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
+        {
+          ptranspose(blockr);
+          ptranspose(blocki);
+        }
+
+        storeBlock<Scalar, Packet, Index, 4>(blockAt + rir, blockr);
+        storeBlock<Scalar, Packet, Index, 4>(blockAt + rii, blocki);
+
+        rir += 4*vectorSize;
+        rii += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        PacketBlock<Packet,1> blockr, blocki;
+        PacketBlock<PacketC,2> cblock;
+
+        if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
+        {
+          if (UseLhs) {
+            cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
+            cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
+          } else {
+            cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
+            cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
+          }
+        } else {
+          if (UseLhs) {
+            cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i));
+            cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i));
+          } else {
+            cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1));
+            cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3));
+          }
+        }
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+        }
+
+        pstore<Scalar>(blockAt + rir, blockr.packet[0]);
+        pstore<Scalar>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
+      }
+
+      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+    }
+
+    if (!UseLhs)
+    {
+      if(PanelMode) rir -= (offset*(vectorSize - 1));
+
+      for(; j < rows; j++)
+      {
+        rii = rir + ((PanelMode) ? stride : depth);
+
+        for(Index i = 0; i < depth; i++)
+        {
+          blockAt[rir] = lhs(i, j).real();
+
+          if(Conjugate)
+            blockAt[rii] = -lhs(i, j).imag();
+          else
+            blockAt[rii] =  lhs(i, j).imag();
+
+          rir += 1;
+          rii += 1;
+        }
+
+        rir += ((PanelMode) ? (2*stride - depth) : depth);
+      }
+    } else {
+      if (j < rows)
+      {
+        if(PanelMode) rir += (offset*(rows - j - vectorSize));
+        rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+        for(Index i = 0; i < depth; i++)
+        {
+          Index k = j;
+          for(; k < rows; k++)
+          {
+            blockAt[rir] = lhs(k, i).real();
+
+            if(Conjugate)
+              blockAt[rii] = -lhs(k, i).imag();
+            else
+              blockAt[rii] =  lhs(k, i).imag();
+
+            rir += 1;
+            rii += 1;
+          }
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs & rhs packing.
+template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+struct dhs_pack{
+  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += vectorSize*offset;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet,4> block;
+
+        if (UseLhs) {
+          bload<DataMapper, Packet, Index, 4, StorageOrder, false, 4>(block, lhs, j, i);
+        } else {
+          bload<DataMapper, Packet, Index, 4, StorageOrder, false, 4>(block, lhs, i, j);
+        }
+        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+        {
+          ptranspose(block);
+        }
+
+        storeBlock<Scalar, Packet, Index, 4>(blockA + ri, block);
+
+        ri += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+        {
+          if (UseLhs) {
+            blockA[ri+0] = lhs(j+0, i);
+            blockA[ri+1] = lhs(j+1, i);
+            blockA[ri+2] = lhs(j+2, i);
+            blockA[ri+3] = lhs(j+3, i);
+          } else {
+            blockA[ri+0] = lhs(i, j+0);
+            blockA[ri+1] = lhs(i, j+1);
+            blockA[ri+2] = lhs(i, j+2);
+            blockA[ri+3] = lhs(i, j+3);
+          }
+        } else {
+          Packet lhsV;
+          if (UseLhs) {
+            lhsV = lhs.template loadPacket<Packet>(j, i);
+          } else {
+            lhsV = lhs.template loadPacket<Packet>(i, j);
+          }
+          pstore<Scalar>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+    }
+
+    if (!UseLhs)
+    {
+      if(PanelMode) ri += offset;
+
+      for(; j < rows; j++)
+      {
+        for(Index i = 0; i < depth; i++)
+        {
+          blockA[ri] = lhs(i, j);
+          ri += 1;
+        }
+
+        if(PanelMode) ri += stride - depth;
+      }
+    } else {
+      if (j < rows)
+      {
+        if(PanelMode) ri += offset*(rows - j);
+
+        for(Index i = 0; i < depth; i++)
+        {
+          Index k = j;
+          for(; k < rows; k++)
+          {
+            blockA[ri] = lhs(k, i);
+            ri += 1;
+          }
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs packing, float64 specialization.
+template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
+{
+  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += vectorSize*offset;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet2d,2> block;
+        if(StorageOrder == RowMajor)
+        {
+          block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
+          block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);
+
+          ptranspose(block);
+        } else {
+          block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
+          block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
+        }
+
+        storeBlock<double, Packet2d, Index, 2>(blockA + ri, block);
+
+        ri += 2*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(StorageOrder == RowMajor)
+        {
+          blockA[ri+0] = lhs(j+0, i);
+          blockA[ri+1] = lhs(j+1, i);
+        } else {
+          Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
+          pstore<double>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) ri += offset*(rows - j);
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockA[ri] = lhs(k, i);
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs packing, float64 specialization.
+template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
+{
+  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += offset*(2*vectorSize);
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet2d,4> block;
+        if(StorageOrder == ColMajor)
+        {
+          PacketBlock<Packet2d,2> block1, block2;
+          block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
+          block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
+          block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
+          block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);
+
+          ptranspose(block1);
+          ptranspose(block2);
+
+          pstore<double>(blockB + ri    , block1.packet[0]);
+          pstore<double>(blockB + ri + 2, block2.packet[0]);
+          pstore<double>(blockB + ri + 4, block1.packet[1]);
+          pstore<double>(blockB + ri + 6, block2.packet[1]);
+        } else {
+          block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0); //[a1 a2]
+          block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2); //[a3 a4]
+          block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0); //[b1 b2]
+          block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2); //[b3 b4]
+
+          storeBlock<double, Packet2d, Index, 4>(blockB + ri, block);
+        }
+
+        ri += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(StorageOrder == ColMajor)
+        {
+          blockB[ri+0] = rhs(i, j+0);
+          blockB[ri+1] = rhs(i, j+1);
+
+          ri += vectorSize;
+
+          blockB[ri+0] = rhs(i, j+2);
+          blockB[ri+1] = rhs(i, j+3);
+        } else {
+          Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
+          pstore<double>(blockB + ri, rhsV);
+
+          ri += vectorSize;
+
+          rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
+          pstore<double>(blockB + ri, rhsV);
+        }
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
+    }
+
+    if(PanelMode) ri += offset;
+
+    for(; j < cols; j++)
+    {
+      for(Index i = 0; i < depth; i++)
+      {
+        blockB[ri] = rhs(i, j);
+        ri += 1;
+      }
+
+      if(PanelMode) ri += stride - depth;
+    }
+  }
+};
+
+// General template for lhs complex packing, float64 specialization.
+template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
+{
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
+    double* blockAt = reinterpret_cast<double *>(blockA);
+    Index j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet,2> blockr, blocki;
+        PacketBlock<PacketC,4> cblock;
+
+        if(StorageOrder == ColMajor)
+        {
+          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0); //[a1 a1i]
+          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1); //[b1 b1i]
+
+          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
+          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
+
+          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
+          blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+
+          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
+          blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
+        } else {
+          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
+          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]
+
+          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
+          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i
+
+          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
+          blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+
+          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+          blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+        }
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+        }
+
+        storeBlock<double, Packet, Index, 2>(blockAt + rir, blockr);
+        storeBlock<double, Packet, Index, 2>(blockAt + rii, blocki);
+
+        rir += 2*vectorSize;
+        rii += 2*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        PacketBlock<Packet,1> blockr, blocki;
+        PacketBlock<PacketC,2> cblock;
+
+        cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
+        cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+        }
+
+        pstore<double>(blockAt + rir, blockr.packet[0]);
+        pstore<double>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
+      }
+
+      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) rir += (offset*(rows - j - vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockAt[rir] = lhs(k, i).real();
+
+          if(Conjugate)
+            blockAt[rii] = -lhs(k, i).imag();
+          else
+            blockAt[rii] =  lhs(k, i).imag();
+
+          rir += 1;
+          rii += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs complex packing, float64 specialization.
+template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
+{
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
+    double* blockBt = reinterpret_cast<double *>(blockB);
+    Index j = 0;
+
+    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
+    {
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      for(; i < depth; i++)
+      {
+        PacketBlock<PacketC,4> cblock;
+        PacketBlock<Packet,2> blockr, blocki;
+
+        bload<DataMapper, PacketC, Index, 2, ColMajor, false, 4>(cblock, rhs, i, j);
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
+
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+        blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+        }
+
+        storeBlock<double, Packet, Index, 2>(blockBt + rir, blockr);
+        storeBlock<double, Packet, Index, 2>(blockBt + rii, blocki);
+
+        rir += 2*vectorSize;
+        rii += 2*vectorSize;
+      }
+
+      rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
+    }
+
+    if(PanelMode) rir -= (offset*(2*vectorSize - 1));
+
+    for(; j < cols; j++)
+    {
+      rii = rir + ((PanelMode) ? stride : depth);
+
+      for(Index i = 0; i < depth; i++)
+      {
+        blockBt[rir] = rhs(i, j).real();
+
+        if(Conjugate)
+          blockBt[rii] = -rhs(i, j).imag();
+        else
+          blockBt[rii] =  rhs(i, j).imag();
+
+        rir += 1;
+        rii += 1;
+      }
+
+      rir += ((PanelMode) ? (2*stride - depth) : depth);
+    }
+  }
+};
+
+/**************
+ * GEMM utils *
+ **************/
+
+// 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm).
+template<typename Packet, bool NegativeAccumulate, int N>
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,N>* acc, const Packet& lhsV, const Packet* rhsV)
+{
+  if(NegativeAccumulate)
+  {
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+    if (N > 1) {
+      acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
+    }
+    if (N > 2) {
+      acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
+    }
+    if (N > 3) {
+      acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
+    }
+  } else {
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+    if (N > 1) {
+      acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
+    }
+    if (N > 2) {
+      acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
+    }
+    if (N > 3) {
+      acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
+    }
+  }
+}
+
+template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
+{
+  Packet lhsV = pload<Packet>(lhs);
+
+  pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
+}
+
+template<typename Scalar, typename Packet, typename Index, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV)
+{
+#ifdef _ARCH_PWR9
+  lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
+#else
+  Index i = 0;
+  do {
+    lhsV[i] = lhs[i];
+  } while (++i < remaining_rows);
+#endif
+}
+
+template<int N, typename Scalar, typename Packet, typename Index, bool NegativeAccumulate, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
+{
+  Packet lhsV;
+  loadPacketRemaining<Scalar, Packet, Index, remaining_rows>(lhs, lhsV);
+
+  pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
+}
+
+// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real.
+template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
+{
+  pger_common<Packet, false, N>(accReal, lhsV, rhsV);
+  if(LhsIsReal)
+  {
+    pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
+    EIGEN_UNUSED_VARIABLE(lhsVi);
+  } else {
+    if (!RhsIsReal) {
+      pger_common<Packet, ConjugateLhs == ConjugateRhs, N>(accReal, lhsVi, rhsVi);
+      pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pger_common<Packet, ConjugateLhs, N>(accImag, lhsVi, rhsV);
+  }
+}
+
+template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
+{
+  Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
+  Packet lhsVi;
+  if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
+
+template<typename Scalar, typename Packet, typename Index, bool LhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi)
+{
+#ifdef _ARCH_PWR9
+  lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
+  if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar));
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#else
+  Index i = 0;
+  do {
+    lhsV[i] = lhs_ptr[i];
+    if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
+  } while (++i < remaining_rows);
+  if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#endif
+}
+
+template<int N, typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
+{
+  Packet lhsV, lhsVi;
+  loadPacketRemaining<Scalar, Packet, Index, LhsIsReal, remaining_rows>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs)
+{
+  return ploadu<Packet>(lhs);
+}
+
+// Zero the accumulator on PacketBlock.
+template<typename Scalar, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,N>& acc)
+{
+  acc.packet[0] = pset1<Packet>((Scalar)0);
+  if (N > 1) {
+    acc.packet[1] = pset1<Packet>((Scalar)0);
+  }
+  if (N > 2) {
+    acc.packet[2] = pset1<Packet>((Scalar)0);
+  }
+  if (N > 3) {
+    acc.packet[3] = pset1<Packet>((Scalar)0);
+  }
+}
+
+// Scale the PacketBlock vectors by alpha.
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
+  if (N > 1) {
+    acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
+  }
+  if (N > 2) {
+    acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
+  }
+  if (N > 3) {
+    acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
+  }
+}
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
+  if (N > 1) {
+    acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
+  }
+  if (N > 2) {
+    acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
+  }
+  if (N > 3) {
+    acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
+  }
+}
+
+// Complex version of PacketBlock scaling.
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
+{
+  bscalec_common<Packet, N>(cReal, aReal, bReal);
+
+  bscalec_common<Packet, N>(cImag, aImag, bReal);
+
+  pger_common<Packet, true, N>(&cReal, bImag, aImag.packet);
+
+  pger_common<Packet, false, N>(&cImag, bImag, aReal.packet);
+}
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,N>& acc, const Packet& pMask)
+{
+  acc.packet[0] = pand(acc.packet[0], pMask);
+  if (N > 1) {
+    acc.packet[1] = pand(acc.packet[1], pMask);
+  }
+  if (N > 2) {
+    acc.packet[2] = pand(acc.packet[2], pMask);
+  }
+  if (N > 3) {
+    acc.packet[3] = pand(acc.packet[3], pMask);
+  }
+}
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask)
+{
+  band<Packet, N>(aReal, pMask);
+  band<Packet, N>(aImag, pMask);
+
+  bscalec<Packet,N>(aReal, aImag, bReal, bImag, cReal, cImag);
+}
+
+// Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as needed.
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col)
+{
+  if (StorageOrder == RowMajor) {
+    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col);
+    if (N > 1) {
+      acc.packet[1] = res.template loadPacket<Packet>(row + 1, col);
+    }
+    if (N > 2) {
+      acc.packet[2] = res.template loadPacket<Packet>(row + 2, col);
+    }
+    if (N > 3) {
+      acc.packet[3] = res.template loadPacket<Packet>(row + 3, col);
+    }
+    if (Complex) {
+      acc.packet[0+N] = res.template loadPacket<Packet>(row + 0, col + accCols);
+      if (N > 1) {
+        acc.packet[1+N] = res.template loadPacket<Packet>(row + 1, col + accCols);
+      }
+      if (N > 2) {
+        acc.packet[2+N] = res.template loadPacket<Packet>(row + 2, col + accCols);
+      }
+      if (N > 3) {
+        acc.packet[3+N] = res.template loadPacket<Packet>(row + 3, col + accCols);
+      }
+    }
+  } else {
+    acc.packet[0] = res.template loadPacket<Packet>(row, col + 0);
+    if (N > 1) {
+      acc.packet[1] = res.template loadPacket<Packet>(row, col + 1);
+    }
+    if (N > 2) {
+      acc.packet[2] = res.template loadPacket<Packet>(row, col + 2);
+    }
+    if (N > 3) {
+      acc.packet[3] = res.template loadPacket<Packet>(row, col + 3);
+    }
+    if (Complex) {
+      acc.packet[0+N] = res.template loadPacket<Packet>(row + accCols, col + 0);
+      if (N > 1) {
+        acc.packet[1+N] = res.template loadPacket<Packet>(row + accCols, col + 1);
+      }
+      if (N > 2) {
+        acc.packet[2+N] = res.template loadPacket<Packet>(row + accCols, col + 2);
+      }
+      if (N > 3) {
+        acc.packet[3+N] = res.template loadPacket<Packet>(row + accCols, col + 3);
+      }
+    }
+  }
+}
+
+const static Packet4i mask41 = { -1,  0,  0,  0 };
+const static Packet4i mask42 = { -1, -1,  0,  0 };
+const static Packet4i mask43 = { -1, -1, -1,  0 };
+
+const static Packet2l mask21 = { -1, 0 };
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
+{
+  if (remaining_rows == 0) {
+    return pset1<Packet>(float(0.0));  // Not used
+  } else {
+    switch (remaining_rows) {
+      case 1:  return Packet(mask41);
+      case 2:  return Packet(mask42);
+      default: return Packet(mask43);
+    }
+  }
+}
+
+template<>
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
+{
+  if (remaining_rows == 0) {
+    return pset1<Packet2d>(double(0.0));  // Not used
+  } else {
+    return Packet2d(mask21);
+  }
+}
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask)
+{
+  band<Packet, N>(accZ, pMask);
+
+  bscale<Packet, N>(acc, accZ, pAlpha);
+}
+
+template<typename Packet, int N> EIGEN_ALWAYS_INLINE void
+pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pset1<Packet>(a[0]);
+  if (N > 1) {
+    a1 = pset1<Packet>(a[1]);
+  } else {
+    EIGEN_UNUSED_VARIABLE(a1);
+  }
+  if (N > 2) {
+    a2 = pset1<Packet>(a[2]);
+  } else {
+    EIGEN_UNUSED_VARIABLE(a2);
+  }
+  if (N > 3) {
+    a3 = pset1<Packet>(a[3]);
+  } else {
+    EIGEN_UNUSED_VARIABLE(a3);
+  }
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void pbroadcastN_old<Packet4f,4>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  pbroadcast4<Packet4f>(a, a0, a1, a2, a3);
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void pbroadcastN_old<Packet2d,4>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a3 = pload<Packet2d>(a + 2);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template<typename Packet, int N> EIGEN_ALWAYS_INLINE void
+pbroadcastN(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pset1<Packet>(a[0]);
+  if (N > 1) {
+    a1 = pset1<Packet>(a[1]);
+  } else {
+    EIGEN_UNUSED_VARIABLE(a1);
+  }
+  if (N > 2) {
+    a2 = pset1<Packet>(a[2]);
+  } else {
+    EIGEN_UNUSED_VARIABLE(a2);
+  }
+  if (N > 3) {
+    a3 = pset1<Packet>(a[3]);
+  } else {
+    EIGEN_UNUSED_VARIABLE(a3);
+  }
+}
+
+template<> EIGEN_ALWAYS_INLINE void
+pbroadcastN<Packet4f,4>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+// PEEL loop factor.
+#define PEEL 7
+#define PEEL_ROW 7
+
+#define MICRO_UNROLL_PEEL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_ZERO_PEEL(peel) \
+  if ((PEEL_ROW > peel) && (peel != 0)) { \
+    bsetzero<Scalar, Packet, accRows>(accZero##peel); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##peel); \
+  }
+
+#define MICRO_ZERO_PEEL_ROW \
+  MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL);
+
+#define MICRO_WORK_PEEL(peel) \
+  if (PEEL_ROW > peel) { \
+    pbroadcastN<Packet,accRows>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    pger<accRows, Scalar, Packet, false>(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_WORK_PEEL_ROW \
+  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
+  MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \
+  lhs_ptr += (remaining_rows * PEEL_ROW); \
+  rhs_ptr += (accRows * PEEL_ROW);
+
+#define MICRO_ADD_PEEL(peel, sum) \
+  if (PEEL_ROW > peel) { \
+    for (Index i = 0; i < accRows; i++) { \
+      accZero##sum.packet[i] += accZero##peel.packet[i]; \
+    } \
+  }
+
+#define MICRO_ADD_PEEL_ROW \
+  MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \
+  MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
+  const Scalar* &lhs_ptr,
+  const Scalar* &rhs_ptr,
+  PacketBlock<Packet,accRows> &accZero)
+{
+  Packet rhsV[4];
+  pbroadcastN<Packet,accRows>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+  pger<accRows, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+  rhs_ptr += accRows;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
+
+  bsetzero<Scalar, Packet, accRows>(accZero0);
+
+  Index remaining_depth = (col + quad_traits<Scalar>::rows < cols) ? depth : (depth & -quad_traits<Scalar>::rows);
+  Index k = 0;
+  if (remaining_depth >= PEEL_ROW) {
+    MICRO_ZERO_PEEL_ROW
+    do
+    {
+      EIGEN_POWER_PREFETCH(rhs_ptr);
+      EIGEN_POWER_PREFETCH(lhs_ptr);
+      MICRO_WORK_PEEL_ROW
+    } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
+    MICRO_ADD_PEEL_ROW
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows, remaining_rows>(lhs_ptr, rhs_ptr, accZero0);
+  }
+
+  if ((remaining_depth == depth) && (rows >= accCols))
+  {
+    bload<DataMapper, Packet, Index, 0, ColMajor, false, accRows>(acc, res, row, 0);
+    bscale<Packet,accRows>(acc, accZero0, pAlpha, pMask);
+    res.template storePacketBlock<Packet,accRows>(row, 0, acc);
+  } else {
+    for(; k < depth; k++)
+    {
+      Packet rhsV[4];
+      pbroadcastN<Packet,accRows>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+      pger<accRows, Scalar, Packet, Index, false, remaining_rows>(&accZero0, lhs_ptr, rhsV);
+      lhs_ptr += remaining_rows;
+      rhs_ptr += accRows;
+    }
+
+    for(Index j = 0; j < accRows; j++) {
+      accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]);
+      for(Index i = 0; i < remaining_rows; i++) {
+        res(row + i, j) += accZero0.packet[j][i];
+      }
+    }
+  }
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  switch(remaining_rows) {
+    case 1:
+      gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, 1>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask);
+      break;
+    case 2:
+      if (sizeof(Scalar) == sizeof(float)) {
+        gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, 2>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask);
+      }
+      break;
+    default:
+      if (sizeof(Scalar) == sizeof(float)) {
+        gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, 3>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask);
+      }
+      break;
+  }
+}
+
+#define MICRO_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_UNROLL_WORK(func, func2, peel) \
+    MICRO_UNROLL(func2); \
+    func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
+    func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+
+#define MICRO_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
+    lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  }
+
+#define MICRO_WORK_ONE(iter, peel) \
+  if (unroll_factor > iter) { \
+    pger_common<Packet, false, accRows>(&accZero##iter, lhsV##iter, rhsV##peel); \
+  }
+
+#define MICRO_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    pbroadcastN<Packet,accRows>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    MICRO_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
+  func(func1,func2,0); func(func1,func2,1); \
+  func(func1,func2,2); func(func1,func2,3); \
+  func(func1,func2,4); func(func1,func2,5); \
+  func(func1,func2,6); func(func1,func2,7);
+
+#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M]; \
+  func(func1,func2,0);
+
+#define MICRO_ONE_PEEL4 \
+  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += (accRows * PEEL);
+
+#define MICRO_ONE4 \
+  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += accRows;
+
+#define MICRO_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzero<Scalar, Packet, accRows>(accZero##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
+
+#define MICRO_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
+
+#define MICRO_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
+  }
+
+#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
+
+#define MICRO_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload<DataMapper, Packet, Index, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
+    bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet,accRows>(row + iter*accCols, 0, acc); \
+  }
+
+#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index& row,
+  const Packet& pAlpha)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0 = NULL, *  lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet,accRows> acc;
+
+  MICRO_SRC_PTR
+  MICRO_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL <= depth; k+= PEEL)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_PREFETCH
+    MICRO_ONE_PEEL4
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_ONE4
+  }
+  MICRO_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_UNROLL 6
+  while(row + MAX_UNROLL*accCols <= rows) {
+    gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_UNROLL > 7
+    case 7:
+      gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 6
+    case 6:
+      gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 5
+    case 5:
+      gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 4
+    case 4:
+      gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 3
+    case 3:
+      gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 2
+    case 2:
+      gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 1
+    case 1:
+      gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_UNROLL
+
+  if(remaining_rows > 0)
+  {
+    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_extra_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  for (; col < cols; col++) {
+    gemm_cols<Scalar, Packet, DataMapper, Index, 1, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
+}
+
+/****************
+ * GEMM kernels *
+ * **************/
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlpha = pset1<Packet>(alpha);
+      const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        gemm_cols<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+      }
+
+      gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+}
+
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
+
+// PEEL_COMPLEX loop factor.
+#define PEEL_COMPLEX 3
+#define PEEL_COMPLEX_ROW 3
+
+#define MICRO_COMPLEX_UNROLL_PEEL(func) \
+  func(0) func(1) func(2) func(3)
+
+#define MICRO_COMPLEX_ZERO_PEEL(peel) \
+  if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \
+    bsetzero<Scalar, Packet, accRows>(accReal##peel); \
+    bsetzero<Scalar, Packet, accRows>(accImag##peel); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##peel); \
+    EIGEN_UNUSED_VARIABLE(accImag##peel); \
+  }
+
+#define MICRO_COMPLEX_ZERO_PEEL_ROW \
+  MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL);
+
+#define MICRO_COMPLEX_WORK_PEEL(peel) \
+  if (PEEL_COMPLEX_ROW > peel) { \
+    pbroadcastN_old<Packet,accRows>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    if(!RhsIsReal) pbroadcastN_old<Packet,accRows>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
+    pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_WORK_PEEL_ROW \
+  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \
+  Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \
+  MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \
+  lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \
+  if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+
+#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \
+  if (PEEL_COMPLEX_ROW > peel) { \
+    for (Index i = 0; i < accRows; i++) { \
+      accReal##sum.packet[i] += accReal##peel.packet[i]; \
+      accImag##sum.packet[i] += accImag##peel.packet[i]; \
+    } \
+  }
+
+#define MICRO_COMPLEX_ADD_PEEL_ROW \
+  MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \
+  MICRO_COMPLEX_ADD_PEEL(1, 0)
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
+  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
+  const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
+  PacketBlock<Packet,accRows> &accReal, PacketBlock<Packet,accRows> &accImag)
+{
+  Packet rhsV[4], rhsVi[4];
+  pbroadcastN_old<Packet,accRows>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+  if(!RhsIsReal) pbroadcastN_old<Packet,accRows>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
+  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+  lhs_ptr_real += remaining_rows;
+  if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  rhs_ptr_real += accRows;
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag = NULL;
+  if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+  const Scalar* lhs_ptr_imag = NULL;
+  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet,accRows> taccReal, taccImag;
+  PacketBlock<Packetc,accRows> acc0, acc1;
+  PacketBlock<Packetc,accRows*2> tRes;
+
+  bsetzero<Scalar, Packet, accRows>(accReal0);
+  bsetzero<Scalar, Packet, accRows>(accImag0);
+
+  Index remaining_depth = (col + quad_traits<Scalar>::rows < cols) ? depth : (depth & -quad_traits<Scalar>::rows);
+  Index k = 0;
+  if (remaining_depth >= PEEL_COMPLEX_ROW) {
+    MICRO_COMPLEX_ZERO_PEEL_ROW
+    do
+    {
+      EIGEN_POWER_PREFETCH(rhs_ptr_real);
+      if(!RhsIsReal) {
+        EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+      }
+      EIGEN_POWER_PREFETCH(lhs_ptr_real);
+      if(!LhsIsReal) {
+        EIGEN_POWER_PREFETCH(lhs_ptr_imag);
+      }
+      MICRO_COMPLEX_WORK_PEEL_ROW
+    } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth);
+    MICRO_COMPLEX_ADD_PEEL_ROW
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0);
+  }
+
+  if ((remaining_depth == depth) && (rows >= accCols))
+  {
+    bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, accRows>(tRes, res, row, 0);
+    bscalec<Packet,accRows>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+    bcouple<Packet, Packetc, accRows>(taccReal, taccImag, tRes, acc0, acc1);
+    res.template storePacketBlock<Packetc,accRows>(row + 0, 0, acc0);
+    res.template storePacketBlock<Packetc,accRows>(row + accColsC, 0, acc1);
+  } else {
+    for(; k < depth; k++)
+    {
+      Packet rhsV[4], rhsVi[4];
+      pbroadcastN_old<Packet,accRows>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+      if(!RhsIsReal) pbroadcastN_old<Packet,accRows>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
+      pgerc<accRows, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+      lhs_ptr_real += remaining_rows;
+      if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+      rhs_ptr_real += accRows;
+      if(!RhsIsReal) rhs_ptr_imag += accRows;
+    }
+
+    bscalec<Packet,accRows>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag);
+    bcouple_common<Packet, Packetc, accRows>(taccReal, taccImag, acc0, acc1);
+
+    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
+    {
+      for(Index j = 0; j < accRows; j++) {
+        res(row + 0, j) += pfirst<Packetc>(acc0.packet[j]);
+      }
+    } else {
+      for(Index j = 0; j < accRows; j++) {
+        PacketBlock<Packetc,1> acc2;
+        acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, j) + acc0.packet[j];
+        res.template storePacketBlock<Packetc,1>(row + 0, j, acc2);
+        if(remaining_rows > accColsC) {
+          res(row + accColsC, j) += pfirst<Packetc>(acc1.packet[j]);
+        }
+      }
+    }
+  }
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  switch(remaining_rows) {
+    case 1:
+      gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, 1>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask);
+      break;
+    case 2:
+      if (sizeof(Scalar) == sizeof(float)) {
+        gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, 2>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask);
+      }
+      break;
+    default:
+      if (sizeof(Scalar) == sizeof(float)) {
+        gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, 3>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask);
+      }
+      break;
+  }
+}
+
+#define MICRO_COMPLEX_UNROLL(func) \
+  func(0) func(1) func(2) func(3)
+
+#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+    MICRO_COMPLEX_UNROLL(func2); \
+    func(0,peel) func(1,peel) func(2,peel) func(3,peel)
+
+#define MICRO_COMPLEX_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
+    if(!LhsIsReal) { \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+    } \
+    lhs_ptr_real##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+  }
+
+#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
+  if (unroll_factor > iter) { \
+    pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL_COMPLEX > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
+    pbroadcastN_old<Packet,accRows>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    if(!RhsIsReal) { \
+      pbroadcastN_old<Packet,accRows>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \
+  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \
+  func(func1,func2,0); func(func1,func2,1); \
+  func(func1,func2,2); func(func1,func2,3);
+
+#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M], rhsVi0[M];\
+  func(func1,func2,0);
+
+#define MICRO_COMPLEX_ONE_PEEL4 \
+  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);
+
+#define MICRO_COMPLEX_ONE4 \
+  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += accRows; \
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+
+#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzero<Scalar, Packet, accRows>(accReal##iter); \
+    bsetzero<Scalar, Packet, accRows>(accImag##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##iter); \
+    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
+  }
+
+#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
+  }
+
+#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, accRows>(tRes, res, row + iter*accCols, 0); \
+    bscalec<Packet,accRows>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
+    bcouple<Packet, Packetc, accRows>(taccReal, taccImag, tRes, acc0, acc1); \
+    res.template storePacketBlock<Packetc,accRows>(row + iter*accCols + 0, 0, acc0); \
+    res.template storePacketBlock<Packetc,accRows>(row + iter*accCols + accColsC, 0, acc1); \
+  }
+
+#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index strideB,
+  Index& row,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag = NULL;
+  const Index imag_delta = accCols*strideA;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + accRows*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
+  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
+  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet,accRows> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet,accRows> taccReal, taccImag;
+  PacketBlock<Packetc,accRows> acc0, acc1;
+  PacketBlock<Packetc,accRows*2> tRes;
+
+  MICRO_COMPLEX_SRC_PTR
+  MICRO_COMPLEX_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_PREFETCH
+    MICRO_COMPLEX_ONE_PEEL4
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_ONE4
+  }
+  MICRO_COMPLEX_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_COMPLEX_UNROLL 3
+  while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+    gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_UNROLL > 4
+    case 4:
+      gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_UNROLL > 3
+    case 3:
+      gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_UNROLL > 2
+    case 2:
+      gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_UNROLL > 1
+    case 1:
+      gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_COMPLEX_UNROLL
+
+  if(remaining_rows > 0)
+  {
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  for (; col < cols; col++) {
+    gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, Index, 1, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlphaReal = pset1<Packet>(alpha.real());
+      const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+      const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+
+      const Scalar* blockA = (Scalar *) blockAc;
+      const Scalar* blockB = (Scalar *) blockBc;
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+      }
+
+      gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+}
+
+#undef accColsC
+#undef advanceCols
+#undef advanceRows
+
+/************************************
+ * ppc64le template specializations *
+ * **********************************/
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+    dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
+    pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+    dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
+    pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+#endif
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+#endif
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+// ********* gebp specializations *********
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef typename quad_traits<float>::vectortype   Packet;
+  typedef typename quad_traits<float>::rhstype      RhsPacket;
+
+  void operator()(const DataMapper& res, const float* blockA, const float* blockB,
+                  Index rows, Index depth, Index cols, float alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
+               Index rows, Index depth, Index cols, float alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);
+
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+      //generate with MMA only
+      gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+        gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+      else{
+        gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+    #else
+      gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #endif
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef Packet4f   Packet;
+  typedef Packet2cf  Packetc;
+  typedef Packet4f   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
+               Index rows, Index depth, Index cols, std::complex<float> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
+
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #endif
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef Packet4f   Packet;
+  typedef Packet2cf  Packetc;
+  typedef Packet4f   RhsPacket;
+
+  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
+               Index rows, Index depth, Index cols, std::complex<float> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef Packet4f   Packet;
+  typedef Packet2cf  Packetc;
+  typedef Packet4f   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
+               Index rows, Index depth, Index cols, std::complex<float> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef typename quad_traits<double>::vectortype  Packet;
+  typedef typename quad_traits<double>::rhstype     RhsPacket;
+
+  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
+                  Index rows, Index depth, Index cols, double alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
+               Index rows, Index depth, Index cols, double alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index);
+
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+      //generate with MMA only
+      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+      else{
+        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+    #else
+      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #endif
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef quad_traits<double>::vectortype   Packet;
+  typedef Packet1cd  Packetc;
+  typedef quad_traits<double>::rhstype   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
+               Index rows, Index depth, Index cols, std::complex<double> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef quad_traits<double>::vectortype   Packet;
+  typedef Packet1cd  Packetc;
+  typedef quad_traits<double>::rhstype   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
+               Index rows, Index depth, Index cols, std::complex<double> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef quad_traits<double>::vectortype   Packet;
+  typedef Packet1cd  Packetc;
+  typedef quad_traits<double>::rhstype   RhsPacket;
+
+  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
+               Index rows, Index depth, Index cols, std::complex<double> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
new file mode 100644
index 0000000..bf01dba
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h

@@ -0,0 +1,159 @@
+//#define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
+#ifdef EIGEN_POWER_USE_PREFETCH
+#define EIGEN_POWER_PREFETCH(p)  prefetch(p)
+#else
+#define EIGEN_POWER_PREFETCH(p)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_extra_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask);
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
+
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index col);
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
+
+// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
+template<typename Packet, typename Packetc, int N>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
+{
+  acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]);
+  if (N > 1) {
+    acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]);
+  }
+  if (N > 2) {
+    acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]);
+  }
+  if (N > 3) {
+    acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]);
+  }
+
+  acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]);
+  if (N > 1) {
+    acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]);
+  }
+  if (N > 2) {
+    acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]);
+  }
+  if (N > 3) {
+    acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]);
+  }
+}
+
+template<typename Packet, typename Packetc, int N>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
+{
+  bcouple_common<Packet, Packetc, N>(taccReal, taccImag, acc1, acc2);
+
+  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
+  if (N > 1) {
+    acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
+  }
+  if (N > 2) {
+    acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
+  }
+  if (N > 3) {
+    acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
+  }
+
+  acc2.packet[0] = padd<Packetc>(tRes.packet[0+N], acc2.packet[0]);
+  if (N > 1) {
+    acc2.packet[1] = padd<Packetc>(tRes.packet[1+N], acc2.packet[1]);
+  }
+  if (N > 2) {
+    acc2.packet[2] = padd<Packetc>(tRes.packet[2+N], acc2.packet[2]);
+  }
+  if (N > 3) {
+    acc2.packet[3] = padd<Packetc>(tRes.packet[3+N], acc2.packet[3]);
+  }
+}
+
+// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs)
+{
+  return ploadu<Packet>(rhs);
+}
+
+} // end namespace internal
+} // end namespace Eigen

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
new file mode 100644
index 0000000..5b44495
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h

@@ -0,0 +1,620 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+
+#pragma GCC target("cpu=power10,htm")
+
+#ifdef __has_builtin
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
+{
+  __builtin_mma_xxsetaccz(acc);
+}
+
+template<typename DataMapper, typename Index, typename Packet, const Index accCols>
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
+{
+  PacketBlock<Packet, 4> result;
+  __builtin_mma_disassemble_acc(&result.packet, acc);
+
+  PacketBlock<Packet, 4> tRes;
+  bload<DataMapper, Packet, Index, accCols, ColMajor, false, 4>(tRes, data, i, 0);
+
+  bscale<Packet, 4>(tRes, result, alpha);
+
+  data.template storePacketBlock<Packet, 4>(i, 0, tRes);
+}
+
+template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC>
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
+{
+  PacketBlock<Packet, 4> resultReal, resultImag;
+  __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
+  __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
+
+  PacketBlock<Packetc, 8> tRes;
+  bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4>(tRes, data, i, 0);
+
+  PacketBlock<Packet,4> taccReal, taccImag;
+  bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);
+
+  PacketBlock<Packetc, 4> acc1, acc2;
+  bcouple<Packet, Packetc, 4>(taccReal, taccImag, tRes, acc1, acc2);
+
+  data.template storePacketBlock<Packetc, 4>(i, 0, acc1);
+  data.template storePacketBlock<Packetc, 4>(i + accColsC, 0, acc2);
+}
+
+// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
+{
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
+}
+
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
+{
+  __vector_pair* a0 = (__vector_pair *)(&a.packet[0]);
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b);
+  }
+}
+
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
+{
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  }
+}
+
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&)
+{
+  // Just for compilation
+}
+
+template<typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi)
+{
+  pgerMMA<Packet, RhsPacket, false>(accReal,  rhsV,  lhsV);
+  if(LhsIsReal) {
+    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+  } else {
+    if(!RhsIsReal) {
+      pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
+      pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag,  rhsV, lhsVi);
+  }
+}
+
+// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
+{
+  rhsV = ploadRhs<Scalar, Packet>(rhs);
+} 
+
+template<>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
+{
+  rhsV.packet[0] = ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ));
+  rhsV.packet[1] = ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1));
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
+{
+#if EIGEN_COMP_LLVM
+  __builtin_vsx_assemble_pair(&rhsV,
+    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1))),
+    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ))));
+#else
+  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
+#endif
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
+{
+  // Just for compilation
+}
+
+// PEEL_MMA loop factor.
+#define PEEL_MMA 7
+
+#define MICRO_MMA_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_MMA_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
+    lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  }
+
+#define MICRO_MMA_WORK_ONE(iter, type, peel) \
+  if (unroll_factor > iter) { \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
+  }
+
+#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \
+  if (PEEL_MMA > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    ploadRhsMMA<Scalar, type>(rhs_ptr + (accRows * peel), rhsV##peel); \
+    MICRO_MMA_UNROLL(func2); \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
+    func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7);
+
+#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
+  type rhsV0; \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,0);
+
+#define MICRO_MMA_ONE_PEEL \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr += (accRows * PEEL_MMA);
+
+#define MICRO_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr += accRows;
+
+#define MICRO_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzeroMMA<Scalar, Packet>(&accZero##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
+
+#define MICRO_MMA_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE)
+
+#define MICRO_MMA_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE)
+
+#define MICRO_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, res, pAlpha, &accZero##iter); \
+  }
+
+#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index& row,
+  const Packet& pAlpha)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+  __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+
+  MICRO_MMA_SRC_PTR
+  MICRO_MMA_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_MMA <= depth; k+= PEEL_MMA)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_MMA_PREFETCH
+    MICRO_MMA_ONE_PEEL
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_MMA_ONE
+  }
+  MICRO_MMA_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemmMMA_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_MMA_UNROLL 7
+  while(row + MAX_MMA_UNROLL*accCols <= rows) {
+    gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_MMA_UNROLL > 7
+    case 7:
+      gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 6
+    case 6:
+      gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 5
+    case 5:
+      gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 4
+    case 4:
+      gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 3
+    case 3:
+      gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 2
+    case 2:
+      gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 1
+    case 1:
+      gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_MMA_UNROLL
+
+  if(remaining_rows > 0)
+  {
+    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
+}
+
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlpha = pset1<Packet>(alpha);
+      const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        gemmMMA_cols<Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+      }
+
+      gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+}
+
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
+
+// PEEL_COMPLEX_MMA loop factor.
+#define PEEL_COMPLEX_MMA 3
+
+#define MICRO_COMPLEX_MMA_UNROLL(func) \
+  func(0) func(1) func(2) func(3)
+
+#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
+    if(!LhsIsReal) { \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+    } \
+    lhs_ptr_real##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
+  if (unroll_factor > iter) { \
+    pgercMMA<Scalar, Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
+  if (PEEL_COMPLEX_MMA > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
+    ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
+    if(!RhsIsReal) { \
+      ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_MMA_UNROLL(func2); \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3; \
+  type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3);
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
+  type rhsV0, rhsVi0; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0);
+
+#define MICRO_COMPLEX_MMA_ONE_PEEL \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA);
+
+#define MICRO_COMPLEX_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr_real += accRows; \
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+
+#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzeroMMA<Scalar, Packet>(&accReal##iter); \
+    bsetzeroMMA<Scalar, Packet>(&accImag##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##iter); \
+    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC>(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index strideB,
+  Index& row,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag = NULL;
+  const Index imag_delta = accCols*strideA;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + accRows*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
+  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
+  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
+
+  MICRO_COMPLEX_MMA_SRC_PTR
+  MICRO_COMPLEX_MMA_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_MMA_PREFETCH
+    MICRO_COMPLEX_MMA_ONE_PEEL
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_MMA_ONE
+  }
+  MICRO_COMPLEX_MMA_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_COMPLEX_MMA_UNROLL 4
+  while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
+    gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_MMA_UNROLL > 4
+    case 4:
+      gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 3
+    case 3:
+      gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 2
+    case 2:
+      gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 1
+    case 1:
+      gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_COMPLEX_MMA_UNROLL
+
+  if(remaining_rows > 0)
+  {
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlphaReal = pset1<Packet>(alpha.real());
+      const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+      const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+
+      const Scalar* blockA = (Scalar *) blockAc;
+      const Scalar* blockB = (Scalar *) blockBc;
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+      }
+
+      gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+}
+
+#undef accColsC
+#undef advanceRows
+#undef advanceCols
+
+#pragma GCC reset_options
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
old mode 100644
new mode 100755
index cdd0246..2a44054
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,31 +22,38 @@
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
 #endif
 
-typedef __vector float          Packet4f;
-typedef __vector int            Packet4i;
-typedef __vector unsigned int   Packet4ui;
-typedef __vector __bool int     Packet4bi;
-typedef __vector short int      Packet8i;
-typedef __vector unsigned char  Packet16uc;
+typedef __vector float                   Packet4f;
+typedef __vector int                     Packet4i;
+typedef __vector unsigned int            Packet4ui;
+typedef __vector __bool int              Packet4bi;
+typedef __vector short int               Packet8s;
+typedef __vector unsigned short int      Packet8us;
+typedef __vector signed char             Packet16c;
+typedef __vector unsigned char           Packet16uc;
+typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
-
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+  Packet4f p4f_##NAME = {X, X, X, X}
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = vec_splat_s32(X)
 
+#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
+  Packet4ui p4ui_##NAME = {X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
+  Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
+  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
@@ -64,43 +71,57 @@
 
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type 
 
 // These constants are endian-agnostic
-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
+static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
+static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
 
-static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
-static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f  p4f_COUNTDOWN  = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i  p4i_COUNTDOWN  = { 0, 1, 2, 3 };
+static Packet8s  p8s_COUNTDOWN  = { 0, 1, 2, 3, 4, 5, 6, 7 };
+static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+static Packet16c  p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 
+                                    8, 9, 10, 11, 12, 13, 14, 15};
 
 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
+static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
+
 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
+static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
+static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
+static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
 
-// Mask alignment
-#ifdef __PPC64__
-#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
-#else
-#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
-#endif
-
-#define _EIGEN_ALIGNED_PTR(x)	((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
 
 // Handle endianness properly while loading constants
 // Define global static constants:
 #ifdef _BIG_ENDIAN
-static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
+#ifdef __VSX__
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif
 static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -109,8 +130,8 @@
 
 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
 
 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 
@@ -120,41 +141,244 @@
 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 #endif // _BIG_ENDIAN
 
-template<> struct packet_traits<float>  : default_packet_traits
-{
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
+    HasHalfPacket = 1,
 
-    // FIXME check the Has*
-#if defined(__VSX__)
-    HasDiv  = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
 #endif
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 0
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
-{
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd   = 1,
+    HasSub   = 1,
+    HasShift = 1,
+    HasMul   = 1,
+    HasDiv   = 0,
+    HasBlend = 1
   };
 };
 
+template <>
+struct packet_traits<short int> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet8s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned short int> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet8us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<signed char> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet16c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned char> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet16uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4f>
+{
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8s>
+{
+  typedef short int type;
+  typedef Packet8s  half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8us>
+{
+  typedef unsigned short int type;
+  typedef Packet8us          half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+template<> struct unpacket_traits<Packet16c>
+{
+  typedef signed char type;
+  typedef Packet16c  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet16uc>
+{
+  typedef unsigned char type;
+  typedef Packet16uc  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+template<> struct unpacket_traits<Packet8bf>
+{
+  typedef bfloat16 type;
+  typedef Packet8bf          half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
+{
+  union {
+    Packet16c   v;
+    signed char n[16];
+  } vt;
+  vt.v = v;
+  for (int i=0; i< 16; i++)
+    s << vt.n[i] << ", ";
+  return s;
+}
 
 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
 {
@@ -164,7 +388,7 @@
   } vt;
   vt.v = v;
   for (int i=0; i< 16; i++)
-    s << (int)vt.n[i] << ", ";
+    s << vt.n[i] << ", ";
   return s;
 }
 
@@ -200,155 +424,385 @@
   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
   return s;
 }
-/*
-inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
-{
-  union {
-    Packet4bi v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}*/
 
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
+{
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+  return vec_ld(0, from);
+#endif
+}
 
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  return pload_common<Packet4f>(from);
+}
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  return pload_common<Packet4i>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
+{
+  return pload_common<Packet8s>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
+{
+  return pload_common<Packet8us>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char*     from)
+{
+  return pload_common<Packet16c>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char*     from)
+{
+  return pload_common<Packet16uc>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16*     from)
+{
+  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_xst(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  pstore_common<Packet4f>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  pstore_common<Packet4i>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<short int>(short int*       to, const Packet8s& from)
+{
+  pstore_common<Packet8s>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int*       to, const Packet8us& from)
+{
+  pstore_common<Packet8us>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16*       to, const Packet8bf& from)
+{
+  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char*       to, const Packet16c& from)
+{
+  pstore_common<Packet16c>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char*       to, const Packet16uc& from)
+{
+  pstore_common<Packet16uc>(to, from);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from};
+  return v;
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from, from, from, from, from};
+  return v;
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
+  return v;
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from;
-  Packet4f vc = pload<Packet4f>(af);
-  vc = vec_splat(vc, 0);
-  return vc;
+  return pset1_size4<Packet4f>(from);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from;
-  Packet4i vc = pload<Packet4i>(ai);
-  vc = vec_splat(vc, 0);
-  return vc;
+  return pset1_size4<Packet4i>(from);
 }
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+
+template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int&    from)   {
+  return pset1_size8<Packet8s>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int&    from)   {
+  return pset1_size8<Packet8us>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char&    from)   {
+  return pset1_size16<Packet16c>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char&    from)   {
+  return pset1_size16<Packet16uc>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16&    from)   {
+  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE void
+pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
 {
-  a3 = pload<Packet4f>(a);
+  a3 = pload<Packet>(a);
   a0 = vec_splat(a3, 0);
   a1 = vec_splat(a3, 1);
   a2 = vec_splat(a3, 2);
   a3 = vec_splat(a3, 3);
 }
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
+}
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
                       Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
 {
-  a3 = pload<Packet4i>(a);
-  a0 = vec_splat(a3, 0);
-  a1 = vec_splat(a3, 1);
-  a2 = vec_splat(a3, 2);
-  a3 = vec_splat(a3, 3);
+  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  af[2] = from[2*stride];
-  af[3] = from[3*stride];
- return pload<Packet4f>(af);
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  return pload<Packet>(a);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
+  return pgather_common<Packet4f>(from, stride);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  pstore<float>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-  to[2*stride] = af[2];
-  to[3*stride] = af[3];
+  return pgather_common<Packet4i>(from, stride);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  pstore<int>((int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  a[4] = from[4*stride];
+  a[5] = from[5*stride];
+  a[6] = from[6*stride];
+  a[7] = from[7*stride];
+  return pload<Packet>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
+{
+  return pgather_size8<Packet8s>(from, stride);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
+{
+  return pgather_size8<Packet8us>(from, stride);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+{
+  return pgather_size8<Packet8bf>(from, stride);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  a[4] = from[4*stride];
+  a[5] = from[5*stride];
+  a[6] = from[6*stride];
+  a[7] = from[7*stride];
+  a[8] = from[8*stride];
+  a[9] = from[9*stride];
+  a[10] = from[10*stride];
+  a[11] = from[11*stride];
+  a[12] = from[12*stride];
+  a[13] = from[13*stride];
+  a[14] = from[14*stride];
+  a[15] = from[15*stride];
+  return pload<Packet>(a);
+}
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
+{
+  return pgather_size16<Packet16c>(from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
+{
+  return pgather_size16<Packet16uc>(from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  pscatter_size4<Packet4f>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  pscatter_size4<Packet4i>(to, from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+  to[4*stride] = a[4];
+  to[5*stride] = a[5];
+  to[6*stride] = a[6];
+  to[7*stride] = a[7];
+}
+
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
+{
+  pscatter_size8<Packet8s>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
+{
+  pscatter_size8<Packet8us>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+{
+  pscatter_size8<Packet8bf>(to, from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+  to[4*stride] = a[4];
+  to[5*stride] = a[5];
+  to[6*stride] = a[6];
+  to[7*stride] = a[7];
+  to[8*stride] = a[8];
+  to[9*stride] = a[9];
+  to[10*stride] = a[10];
+  to[11*stride] = a[11];
+  to[12*stride] = a[12];
+  to[13*stride] = a[13];
+  to[14*stride] = a[14];
+  to[15*stride] = a[15];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
+{
+  pscatter_size16<Packet16c>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
+{
+  pscatter_size16<Packet16uc>(to, from, stride);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f   plset<Packet4f>(const float&     a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN;  }
+template<> EIGEN_STRONG_INLINE Packet4i   plset<Packet4i>(const int&       a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN;  }
+template<> EIGEN_STRONG_INLINE Packet8s   plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet8us  plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet16c  plset<Packet16c>(const signed char& a)   { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a)   { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   padd<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i   padd<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4ui   padd<Packet4ui>  (const Packet4ui&   a, const Packet4ui&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet8s   padd<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet8us  padd<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet16c  padd<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   psub<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i   psub<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet8s   psub<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet8us  psub<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet16c  psub<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
 
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
-/* Commented out: it's actually slower than processing it scalar
- *
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
-  //Set up constants, variables
-  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+template<> EIGEN_STRONG_INLINE Packet4f   pmul<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return vec_madd(a,b, p4f_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i   pmul<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a * b; }
+template<> EIGEN_STRONG_INLINE Packet8s   pmul<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us  pmul<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c  pmul<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
 
-  // Get the absolute values
-  a1  = vec_abs(a);
-  b1  = vec_abs(b);
 
-  // Get the signs using xor
-  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
-
-  // Do the multiplication for the asbolute values.
-  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
-  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
-  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
-  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
-  prod = vec_add( low_prod, high_prod );
-
-  // NOR the product and select only the negative elements according to the sign mask
-  prod_ = vec_nor(prod, prod);
-  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
-
-  // Add 1 to the result to get the negative numbers
-  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
-  prod_ = vec_add(prod_, v1sel);
-
-  // Merge the results back to the final vector.
-  prod = vec_sel(prod, prod_, sgn);
-
-  return prod;
-}
-*/
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-#if !defined(__VSX__) // VSX actually provides a div instruction
+#ifndef __VSX__  // VSX actually provides a div instruction
   Packet4f t, y_0, y_1;
 
   // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
@@ -358,7 +812,7 @@
   t   = vec_nmsub(y_0, b, p4f_ONE);
   y_1 = vec_madd(y_0, t, y_0);
 
-  return vec_madd(a, y_1, p4f_ZERO);
+  return vec_madd(a, y_1, p4f_MZERO);
 #else
   return vec_div(a, b);
 #endif
@@ -370,83 +824,249 @@
 }
 
 // for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  Packet4f ret;
+  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_min(a, b);
+  #endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  Packet4f ret;
+  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_max(a, b);
+  #endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
+  return vec_nor(c,c);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return pand<Packet8us>(a, b);
+}
+
 
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return por<Packet8us>(a, b);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
-
-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
-
+template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 
+  return pxor<Packet8us>(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
 }
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+    Packet4f res;
+
+#ifdef __VSX__
+    __asm__("xvrspiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
 #else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet4i) vec_vsx_ld((long)from & 15, (const Packet4i*) _EIGEN_ALIGNED_PTR(from));
-}
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet4f) vec_vsx_ld((long)from & 15, (const Packet4f*) _EIGEN_ALIGNED_PTR(from));
-}
+    __asm__("vrfiz %0, %1\n\t"
+        : "=v" (res)
+        : "v" (t));
 #endif
 
+    return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+{
+    Packet4f res;
+
+    __asm__("xvrspic %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (a));
+
+    return res;
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef _BIG_ENDIAN
+  Packet16uc MSQ, LSQ;
+  Packet16uc mask;
+  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
+  mask = vec_lvsl(0, from);                        // create the permute mask
+  //TODO: Add static_cast here
+  return (Packet) vec_perm(MSQ, LSQ, mask);           // align the data
+#else
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  return ploadu_common<Packet4f>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  return ploadu_common<Packet4i>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
+{
+  return ploadu_common<Packet8s>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
+{
+  return ploadu_common<Packet8us>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
+{
+  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
+{
+  return ploadu_common<Packet16c>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
+{
+  return ploadu_common<Packet16uc>(from);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
+{
+  Packet p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet>(from);
+  else                                  p = ploadu<Packet>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
-  Packet4f p;
-  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                             p = ploadu<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4f>(from);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
-  Packet4i p;
-  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                             p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4i>(from);
 }
 
-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int*     from)
+{
+  Packet8s p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
+  else                                  p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int*     from)
+{
+  Packet8us p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
+  else                                  p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int*     from)
+{
+  Packet8s p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
+  else                                  p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int*     from)
+{
+  Packet8us p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
+  else                                  p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16*     from)
+{
+  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char*     from)
+{
+  Packet16c p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16c>(from);
+  else                                  p = ploadu<Packet16c>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char*     from)
+{
+  Packet16uc p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16uc>(from);
+  else                                  p = ploadu<Packet16uc>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
 {
   EIGEN_DEBUG_UNALIGNED_STORE
+#ifdef _BIG_ENDIAN
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   // Warning: not thread safe!
   Packet16uc MSQ, LSQ, edges;
@@ -460,92 +1080,377 @@
   MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
   LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
+#else
+  vec_xst(from, 0, to);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+{
+  pstoreu_common<Packet4f>(to, from);
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  pstoreu_common<Packet4i>(to, from);
 }
-#else
-// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
+template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int*      to, const Packet8s& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (Packet4i*) _EIGEN_ALIGNED_PTR(to));
+  pstoreu_common<Packet8s>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
+template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int*      to, const Packet8us& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (Packet4f*) _EIGEN_ALIGNED_PTR(to));
+  pstoreu_common<Packet8us>(to, from);
 }
-#endif
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16*      to, const Packet8bf& from)
+{
+  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char*      to, const Packet16c& from)
+{
+  pstoreu_common<Packet16c>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char*      to, const Packet16uc& from)
+{
+  pstoreu_common<Packet16uc>(to, from);
+}
 
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+
+template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
+  return pfirst_common<Packet8s>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
+  return pfirst_common<Packet8us>(a);
+}
+
+template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
+{
+  return pfirst_common<Packet16c>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
+{
+  return pfirst_common<Packet16uc>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
+{
+  return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
+{
+  return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
+{
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
+{
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
+{
+  return preverse<Packet8us>(a);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8bf  pabs(const Packet8bf& a) {
+  _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
+  return pand<Packet8us>(p8us_abs_mask, a);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
+{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
+{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
+{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sr(a, p4ui_mask);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sl(a, p4ui_mask);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sl(a, p8us_mask);
+}
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sr(a, p8us_mask);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
+  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  return pand<Packet4f>(
+    reinterpret_cast<Packet4f>(bf.m_val),
+    reinterpret_cast<Packet4f>(p4ui_high_mask)
+  );
+}
+
+// Simple interleaving of bool masks, prevents true values from being
+// converted to NaNs.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  Packet4f bf_odd, bf_even;
+  bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
+  bf_even = plogical_shift_right<16>(even);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
+  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
+  Packet4ui lsb = plogical_shift_right<16>(input);
+  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
+
+  _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
+  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
+  input = padd<Packet4ui>(input, rounding_bias);
+
+  //Test NaN and Subnormal - Begin
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
+  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
+  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
+  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
+  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
+
+  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
+  Packet4ui nan_selector = pandnot<Packet4ui>(
+      reinterpret_cast<Packet4ui>(is_max_exp),
+      reinterpret_cast<Packet4ui>(is_mant_zero)
+  );
+
+  Packet4ui subnormal_selector = pandnot<Packet4ui>(
+      reinterpret_cast<Packet4ui>(is_zero_exp),
+      reinterpret_cast<Packet4ui>(is_mant_zero)
+  );
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
+  input = vec_sel(input, p4ui_nan, nan_selector);
+  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
+  //Test NaN and Subnormal - End
+
+  input = plogical_shift_right<16>(input);
+  return reinterpret_cast<Packet8us>(input);
+}
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
+  Packet4f bf_odd, bf_even;
+  bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
+  bf_odd = plogical_shift_left<16>(bf_odd);
+  bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f op_even = OP(a_even);\
+  Packet4f op_odd = OP(a_odd);\
+  return F32ToBf16(op_even, op_odd);\
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16(op_even, op_odd);\
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16Bool(op_even, op_odd);\
+
+template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
+  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f e_even;
+  Packet4f e_odd;
+  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
+  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
+  e = F32ToBf16(e_even, e_odd);
+  return F32ToBf16(op_even, op_odd);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const  bfloat16*     from)
+{
+  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
+                            bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
+  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
+}
 
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f b, sum;
-  b   = (Packet4f) vec_sld(a, a, 8);
-  sum = vec_add(a, b);
-  b   = (Packet4f) vec_sld(sum, sum, 4);
-  sum = vec_add(sum, b);
+  b   = vec_sld(a, a, 8);
+  sum = a + b;
+  b   = vec_sld(sum, sum, 4);
+  sum += b;
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  Packet4f v[4], sum[4];
-
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
-
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
-  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
-  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
-
-  return sum[0];
-}
-
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i sum;
@@ -558,32 +1463,69 @@
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
 {
-  Packet4i v[4], sum[4];
+  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even + redux_odd;
+  return bfloat16(f32_result);
+}
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
+{
+  union{
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[8];
+  } vt;
+  vt.v = a;
 
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
+  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
+  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
+  Packet4i first_half  = pload<Packet4i>(first_loader);
+  Packet4i second_half = pload<Packet4i>(second_loader);
 
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
-  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
-  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
+}
 
-  return sum[0];
+template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
+{
+  return predux_size8<Packet8s>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
+{
+  return predux_size8<Packet8us>(a);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
+{
+  union{
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[16];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
+  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
+  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
+  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
+
+  Packet4i first_quarter = pload<Packet4i>(first_loader);
+  Packet4i second_quarter = pload<Packet4i>(second_loader);
+  Packet4i third_quarter = pload<Packet4i>(third_loader);
+  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
+		                  + predux(third_quarter) + predux(fourth_quarter));
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
+{
+  return predux_size16<Packet16c>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
+{
+  return predux_size16<Packet16uc>(a);
 }
 
 // Other reduction functions:
@@ -591,8 +1533,8 @@
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }
 
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -602,97 +1544,255 @@
   return aux[0] * aux[1] * aux[2] * aux[3];
 }
 
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
 {
-  Packet4f b, res;
+  Packet8s pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_mul<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even * redux_odd;
+  return bfloat16(f32_result);
+}
+
+
+template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+// min
+template<typename Packet> EIGEN_STRONG_INLINE
+__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
+{
+  Packet b, res;
   b = vec_min(a, vec_sld(a, a, 8));
   res = vec_min(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  return predux_min4<Packet4f>(a);
+}
+
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
-  Packet4i b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return predux_min4<Packet4i>(a);
 }
 
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
 {
-  Packet4f b, res;
+  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_min<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::min)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+  
+  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8)); 
+
+  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+  
+  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8)); 
+
+  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+// max
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
+{
+  Packet b, res;
   b = vec_max(a, vec_sld(a, a, 8));
   res = vec_max(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  return predux_max4<Packet4f>(a);
+}
+
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
-  Packet4i b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return predux_max4<Packet4i>(a);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
 {
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-#ifdef _BIG_ENDIAN
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(first, second, 4); break;
-    case 2:
-      first = vec_sld(first, second, 8); break;
-    case 3:
-      first = vec_sld(first, second, 12); break;
-    }
-#else
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(second, first, 12); break;
-    case 2:
-      first = vec_sld(second, first, 8); break;
-    case 3:
-      first = vec_sld(second, first, 4); break;
-    }
-#endif
-  }
-};
+  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_max<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::max)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
+template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
 {
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-#ifdef _BIG_ENDIAN
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(first, second, 4); break;
-    case 2:
-      first = vec_sld(first, second, 8); break;
-    case 3:
-      first = vec_sld(first, second, 12); break;
-    }
-#else
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(second, first, 12); break;
-    case 2:
-      first = vec_sld(second, first, 8); break;
-    case 3:
-      first = vec_sld(second, first, 4); break;
-    }
-#endif
-  }
-};
+  Packet8s pair, quad, octo;
+  
+  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8)); 
 
-template<> EIGEN_DEVICE_FUNC inline void
+  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+  
+  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8)); 
+
+  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return vec_any_ne(x, pzero(x));
+}
+
+template <typename T> EIGEN_DEVICE_FUNC inline void
+ptranpose_common(PacketBlock<T,4>& kernel){
+  T t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  Packet4f t0, t1, t2, t3;
+  ptranpose_common<Packet4f>(kernel);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  ptranpose_common<Packet4i>(kernel);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8s,4>& kernel) {
+  Packet8s t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
   t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -703,9 +1803,9 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  Packet4i t0, t1, t2, t3;
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8us,4>& kernel) {
+  Packet8us t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
   t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -717,32 +1817,469 @@
 }
 
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+
+  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16c,4>& kernel) {
+  Packet16c t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16uc,4>& kernel) {
+  Packet16uc t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8s,8>& kernel) {
+  Packet8s v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8us,8>& kernel) {
+  Packet8us v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+  Packet8bf v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
+  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
+  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
+  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
+  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
+  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
+  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
+  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
+
+  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
+  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
+  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
+  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
+  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
+  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
+  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
+  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16c,16>& kernel) {
+  Packet16c step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0]  = vec_mergeh(step1[0], step1[8]);
+  step2[1]  = vec_mergel(step1[0], step1[8]);
+  step2[2]  = vec_mergeh(step1[1], step1[9]);
+  step2[3]  = vec_mergel(step1[1], step1[9]);
+  step2[4]  = vec_mergeh(step1[2], step1[10]);
+  step2[5]  = vec_mergel(step1[2], step1[10]);
+  step2[6]  = vec_mergeh(step1[3], step1[11]);
+  step2[7]  = vec_mergel(step1[3], step1[11]);
+  step2[8]  = vec_mergeh(step1[4], step1[12]);
+  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0]  = vec_mergeh(step2[0], step2[8]);
+  step3[1]  = vec_mergel(step2[0], step2[8]);
+  step3[2]  = vec_mergeh(step2[1], step2[9]);
+  step3[3]  = vec_mergel(step2[1], step2[9]);
+  step3[4]  = vec_mergeh(step2[2], step2[10]);
+  step3[5]  = vec_mergel(step2[2], step2[10]);
+  step3[6]  = vec_mergeh(step2[3], step2[11]);
+  step3[7]  = vec_mergel(step2[3], step2[11]);
+  step3[8]  = vec_mergeh(step2[4], step2[12]);
+  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16uc,16>& kernel) {
+  Packet16uc step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0]  = vec_mergeh(step1[0], step1[8]);
+  step2[1]  = vec_mergel(step1[0], step1[8]);
+  step2[2]  = vec_mergeh(step1[1], step1[9]);
+  step2[3]  = vec_mergel(step1[1], step1[9]);
+  step2[4]  = vec_mergeh(step1[2], step1[10]);
+  step2[5]  = vec_mergel(step1[2], step1[10]);
+  step2[6]  = vec_mergeh(step1[3], step1[11]);
+  step2[7]  = vec_mergel(step1[3], step1[11]);
+  step2[8]  = vec_mergeh(step1[4], step1[12]);
+  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0]  = vec_mergeh(step2[0], step2[8]);
+  step3[1]  = vec_mergel(step2[0], step2[8]);
+  step3[2]  = vec_mergeh(step2[1], step2[9]);
+  step3[3]  = vec_mergel(step2[1], step2[9]);
+  step3[4]  = vec_mergeh(step2[2], step2[10]);
+  step3[5]  = vec_mergel(step2[2], step2[10]);
+  step3[6]  = vec_mergeh(step2[3], step2[11]);
+  step3[7]  = vec_mergel(step2[3], step2[11]);
+  step3[8]  = vec_mergeh(step2[4], step2[12]);
+  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE
+Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
+  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
+  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
+  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
+  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
+  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
+  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<bfloat16, unsigned short int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<unsigned short int, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vec_cts(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vec_ctu(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+  Packet4f float_even = Bf16ToF32Even(a);
+  Packet4f float_odd = Bf16ToF32Odd(a);
+  Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
+  Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
+  Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
+
+  //Check values that are bigger than USHRT_MAX (0xFFFF)
+  Packet4bi overflow_selector;
+  if(vec_any_gt(int_even, p4ui_low_mask)){
+    overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
+    low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+  if(vec_any_gt(int_odd, p4ui_low_mask)){
+    overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
+    low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+
+  low_odd = plogical_shift_left<16>(low_odd);
+
+  Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
+  return reinterpret_cast<Packet8us>(int_final);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+  //short -> int -> float -> bfloat16
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
+  Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
+  Packet4ui int_odd = plogical_shift_right<16>(int_cast);
+  Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
+  Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
+  return F32ToBf16(float_even, float_odd);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return reinterpret_cast<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return reinterpret_cast<Packet4f>(a);
+}
+
+
+
 //---------- double ----------
-#if defined(__VSX__)
+#ifdef __VSX__
 typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
-
-static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
-static Packet2d p2d_ONE = { 1.0, 1.0 }; 
-static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
-static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
-
-#ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
+#if EIGEN_COMP_CLANG
+typedef Packet2ul                    Packet2bl;
 #else
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
+typedef __vector __bool long         Packet2bl;
 #endif
 
-static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
+static Packet2l  p2l_ONE  = { 1, 1 };
+static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
+static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
+static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d  p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
+                               numext::bit_cast<double>(0x8000000000000000ull) };
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+#else
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+#endif
+
+template<int index> Packet2d vec_splat_dbl(Packet2d& a)
 {
-  switch (index) {
-  case 0:
-    return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
-  case 1:
-    return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
-  }
-  return a;
+  return vec_splat(a, index);
 }
 
 template<> struct packet_traits<double> : default_packet_traits
@@ -753,16 +2290,42 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
-    HasHalfPacket = 0,
+    HasHalfPacket = 1,
 
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
     HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
     HasExp  = 1,
-    HasSqrt = 0
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
   };
 };
 
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
 
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  union {
+    Packet2l   v;
+    int64_t n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
 
 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 {
@@ -776,62 +2339,93 @@
 }
 
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
+}
 
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_xst(from, 0, to);
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
-  double EIGEN_ALIGN16 af[2];
-  af[0] = from;
-  Packet2d vc = pload<Packet2d>(af);
-  vc = vec_splat_dbl(vc, 0);
-  return vc;
+  Packet2d v = {from, from};
+  return v;
 }
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
+  return reinterpret_cast<Packet2d>(v);
+}
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl(a1, 0);
-  a1 = vec_splat_dbl(a1, 1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl(a3, 0);
-  a3 = vec_splat_dbl(a3, 1);
+  //This way is faster than vec_splat (at least for doubles in Power 9)
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
 }
-// Google-local: Change type from DenseIndex to int in patch.
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int/*DenseIndex*/ stride)
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
  return pload<Packet2d>(af);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, /*DenseIndex*/int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
   pstore<double>(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
 }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
 
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
 
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
 
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
 
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
 
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  Packet2d ret;
+  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+ }
 
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  Packet2d ret;
+  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
+  return vec_nor(c,c);
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
 
@@ -841,90 +2435,258 @@
 
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+    Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+    Packet2d res;
+
+    __asm__("xvrdpiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+
+    return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+{
+    Packet2d res;
+
+    __asm__("xvrdpic %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (a));
+
+    return res;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const Packet2d*) _EIGEN_ALIGNED_PTR(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
   Packet2d p;
-  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
-  else                             p = ploadu<Packet2d>(from);
-  return vec_perm(p, p, p16uc_PSET64_HI);
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
+  else                                  p = ploadu<Packet2d>(from);
+  return vec_splat_dbl<0>(p);
 }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st((Packet4f)from, (long)to & 15, (Packet4f*) _EIGEN_ALIGNED_PTR(to));
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst(from, 0, to);
 }
 
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
+// a slow version that works with older compilers. 
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+template<>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2l l = { static_cast<long long>(tmp[0]),
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
+template<>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
+  unsigned long long tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2d d = { static_cast<double>(tmp[0]),
+                 static_cast<double>(tmp[1]) };
+  return d;
+}
+
+
+// Packet2l shifts.
+// For POWER8 we simply use vec_sr/l. 
+//
+// Things are more complicated for POWER7. There is actually a
+// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
+// So we need to shift by N % 32 and rearrage bytes.
+#ifdef __POWER8_VECTOR__
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sl(a, shift); 
+}
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sr(a, shift); 
+}
+
+#else
+
+// Shifts [A, B, C, D] to [B, 0, D, 0].
+// Used to implement left shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+      0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+// Shifts [A, B, C, D] to [0, A, 0, C].
+// Used to implement right shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 
+      0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_left_impl;
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_right = {m, m, m, m};
+    const Packet4i out_hi = vec_sl(ai, shift);
+    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return plogical_shift_left_impl<N>::run(a); 
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_right_impl;
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_left = {m, m, m, m};
+    const Packet4i out_lo = vec_sr(ai, shift);
+    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return plogical_shift_right_impl<N>::run(a); 
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
+  // Split 2^e into four factors and multiply:
+  const Packet2l  bias = { 1023, 1023 };
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
+  out = pmul(out, c); // a * 2^e
+  return out;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
 
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
   Packet2d b, sum;
-  b   = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
-  sum = vec_add(a, b);
-  return pfirst(sum);
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  sum = a + b;
+  return pfirst<Packet2d>(sum);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  Packet2d v[2], sum;
-  v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
-  v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
- 
-#ifdef _BIG_ENDIAN
- sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
-#else
-  sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
-#endif
-
-  return sum;
-}
 // Other reduction functions:
 // mul
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 // min
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 // max
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset == 1)
-#ifdef _BIG_ENDIAN
-      first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
-#else
-      first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
-#endif
-  }
-};
-
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2d,2>& kernel) {
   Packet2d t0, t1;
@@ -934,7 +2696,14 @@
   kernel.packet[1] = t1;
 }
 
-#endif  // defined(__VSX__)
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+#endif // __VSX__
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index 850a03a..45f6ddb 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,76 +11,259 @@
 #ifndef EIGEN_COMPLEX_CUDA_H
 #define EIGEN_COMPLEX_CUDA_H
 
-// clang-format off
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, GCC and older versions of clang do
+// not treat them as device functions and thus Eigen functors making use of
+// these operators fail to compile. Here, we manually specialize these
+// operators and functors for complex types when building for CUDA to enable
+// their use on-device.
+//
+// NOTES:
+//  - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device,
+//    since they are already specialized in the standard. Using them will result
+//    in silent kernel failures.
+//  - Compiling with MSVC and using +=,-=,*=,/=(std::complex<Scalar>) will lead
+//    to duplicate definition errors, since these are already specialized in
+//    Visual Studio's <complex> header (contrary to the standard).  This is
+//    preferable to removing such definitions, which will lead to silent kernel
+//    failures.
+//  - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior
+//    to the first inclusion of <complex>.
+
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+    
+// ICC already specializes std::complex<float> and std::complex<double>
+// operators, preventing us from making them device functions here.
+// This will lead to silent runtime errors if the operators are used on device.
+//
+// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
+// prior to first inclusion of <complex>.  This prevents ICC from adding
+// its own specializations, so our custom ones below can be used instead.
+#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_))
+
+// Import Eigen's internal operator specializations.
+#define EIGEN_USING_STD_COMPLEX_OPERATORS           \
+  using Eigen::complex_operator_detail::operator+;  \
+  using Eigen::complex_operator_detail::operator-;  \
+  using Eigen::complex_operator_detail::operator*;  \
+  using Eigen::complex_operator_detail::operator/;  \
+  using Eigen::complex_operator_detail::operator+=; \
+  using Eigen::complex_operator_detail::operator-=; \
+  using Eigen::complex_operator_detail::operator*=; \
+  using Eigen::complex_operator_detail::operator/=; \
+  using Eigen::complex_operator_detail::operator==; \
+  using Eigen::complex_operator_detail::operator!=;
 
 namespace Eigen {
 
-namespace internal {
+// Specialized std::complex overloads.
+namespace complex_operator_detail {
 
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  return std::complex<T>(
+      a_real * b_real - a_imag * b_imag,
+      a_imag * b_real + a_real * b_imag);
+}
 
-// Many std::complex methods such as operator+, operator-, operator* and
-// operator/ are not constexpr. Due to this, clang does not treat them as device
-// functions and thus Eigen functors making use of these operators fail to
-// compile. Here, we manually specialize these functors for complex types when
-// building for CUDA to avoid non-constexpr methods.
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  const T norm = (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
+                          (a_imag * b_real - a_real * b_imag) / norm);
+}
 
-template<typename T> struct scalar_sum_op<const std::complex<T> > {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    return std::complex<T>(numext::real(a) + numext::real(b),
-                           numext::imag(a) + numext::imag(b));
-  }
-};
-template<typename T> struct scalar_sum_op<std::complex<T> > : scalar_sum_op<const std::complex<T> > {};
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
+  // guards against over/under-flow.
+  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
+  const T rscale = scale_imag ? T(1) : b_real / b_imag;
+  const T iscale = scale_imag ? b_imag / b_real : T(1);
+  const T denominator = b_real * rscale + b_imag * iscale;
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator, 
+                         (a_imag * rscale - a_real * iscale) / denominator);
+}
 
-template<typename T> struct scalar_difference_op<const std::complex<T>> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    return std::complex<T>(numext::real(a) - numext::real(b),
-                           numext::imag(a) - numext::imag(b));
-  }
-};
-template<typename T> struct scalar_difference_op<std::complex<T> > : scalar_difference_op<const std::complex<T> > {};
-
-template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T>> {
-  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasMul
-  };
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    const T a_real = numext::real(a);
-    const T a_imag = numext::imag(a);
-    const T b_real = numext::real(b);
-    const T b_imag = numext::imag(b);
-    return std::complex<T>(a_real * b_real - a_imag * b_imag,
-                           a_real * b_imag + a_imag * b_real);
-  }
-};
-template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
-
-
-template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > {
-  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasDiv
-  };
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    const T a_real = numext::real(a);
-    const T a_imag = numext::imag(a);
-    const T b_real = numext::real(b);
-    const T b_imag = numext::imag(b);
-    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
-    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
-                           (a_imag * b_real - a_real * b_imag) * norm);
-  }
-};
-template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
-
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
+#if EIGEN_FAST_MATH
+  return complex_divide_fast(a, b);
+#else
+  return complex_divide_stable(a, b);
 #endif
+}
 
-} // end namespace internal
+// NOTE: We cannot specialize compound assignment operators with Scalar T,
+//         (i.e.  operator@=(const T&), for @=+,-,*,/)
+//       since they are already specialized for float/double/long double within
+//       the standard <complex> header. We also do not specialize the stream
+//       operators.
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                    \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a) { return a; }                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a) {                                           \
+  return std::complex<T>(-numext::real(a), -numext::imag(a));                                   \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) + b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a + numext::real(b), numext::imag(b));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) - b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a - numext::real(b), -numext::imag(b));                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_multiply(a, b);                                                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a * numext::real(b), a * numext::imag(b));                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_divide(a, b);                                                                  \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const T& a, const std::complex<T>& b) {                               \
+  return complex_divide(std::complex<T>(a, 0), b);                                              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) += numext::real(b);                                                       \
+  numext::imag_ref(a) += numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) -= numext::real(b);                                                       \
+  numext::imag_ref(a) -= numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_multiply(a, b);                                                                   \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_divide(a, b);                                                                     \
+  return  a;                                                                                    \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const T& b) {                                         \
+  return numext::real(a) == b && numext::imag(a) == 0;                                          \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const T& a, const std::complex<T>& b) {                                         \
+  return a  == numext::real(b) && 0 == numext::imag(b);                                         \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const T& b) {                                         \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const T& a, const std::complex<T>& b) {                                         \
+  return !(a == b);                                                                             \
+}
 
-} // end namespace Eigen
+// Do not specialize for long double, since that reduces to double on device.
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
 
-#endif // EIGEN_COMPLEX_CUDA_H
+#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
+
+  
+}  // namespace complex_operator_detail
+
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+namespace numext {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+}  // namespace numext
+
+namespace internal {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_)
+
+#endif  // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE
+
+#endif  // EIGEN_COMPLEX_CUDA_H

diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
new file mode 100644
index 0000000..f21d1a0
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/BFloat16.h

@@ -0,0 +1,688 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef EIGEN_BFLOAT16_H
+#define EIGEN_BFLOAT16_H
+
+#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)         \
+  template <>                                                       \
+  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED  \
+  PACKET_BF16 METHOD<PACKET_BF16>(const PACKET_BF16& _x) {          \
+    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));              \
+  }
+
+namespace Eigen {
+
+struct bfloat16;
+
+namespace bfloat16_impl {
+
+// Make our own __bfloat16_raw definition.
+struct __bfloat16_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {}
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {}
+  unsigned short value;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
+template <bool AssumeArgumentIsNormalOrInfinityOrZero>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff);
+// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying:
+// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff);
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h);
+
+struct bfloat16_base : public __bfloat16_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
+};
+
+} // namespace bfloat16_impl
+
+// Class definition.
+struct bfloat16 : public bfloat16_impl::bfloat16_base {
+
+  typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {}
+
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
+
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
+
+  explicit EIGEN_DEVICE_FUNC bfloat16(float f)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
+
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  template<typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
+
+  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return bfloat16_impl::bfloat16_to_float(*this);
+  }
+};
+} // namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::bfloat16> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = std::denorm_absent;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = numeric_limits<float>::round_style;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = true;
+  static const bool is_modulo = false;
+  static const int digits = 8;
+  static const int digits10 = 2;
+  static const int max_digits10 = 4;
+  static const int radix = 2;
+  static const int min_exponent = numeric_limits<float>::min_exponent;
+  static const int min_exponent10 = numeric_limits<float>::min_exponent10;
+  static const int max_exponent = numeric_limits<float>::max_exponent;
+  static const int max_exponent10 = numeric_limits<float>::max_exponent10;
+  static const bool traps = numeric_limits<float>::traps;
+  static const bool tinyness_before = numeric_limits<float>::tinyness_before;
+
+  static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+  static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
+  static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+  static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
+  static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); }
+  static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
+  static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
+  static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); }
+  static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+template<>
+struct numeric_limits<volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+template<>
+struct numeric_limits<const volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+} // namespace std
+
+namespace Eigen {
+
+namespace bfloat16_impl {
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+// We need to provide emulated *host-side* BF16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+
+// Definitions for CPUs, mostly working through conversion
+// to/from fp32.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) {
+  return bfloat16(float(a) + static_cast<float>(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) {
+  bfloat16 result;
+  result.value = a.value ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) {
+  a += bfloat16(1);
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) {
+  a -= bfloat16(1);
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  ++a;
+  return original_value;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  --a;
+  return original_value;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) {
+  return numext::equal_strict(float(a),float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) {
+  return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) {
+  return float(a) >= float(b);
+}
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+#endif  // Emulate support for bfloat16 floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) {
+  __bfloat16_raw output;
+  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) {
+    output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
+    return output;
+  }
+  output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
+  return output;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
+  return __bfloat16_raw(value);
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
+  return bf.value;
+}
+
+// float_to_bfloat16_rtne template specialization that does not make any
+// assumption about the value of its function argument (ff).
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
+#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
+  // Nothing to do here
+#else
+  __bfloat16_raw output;
+
+  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) {
+    // If the value is a NaN, squash it to a qNaN with msb of fraction set,
+    // this makes sure after truncation we don't end up with an inf.
+    //
+    // qNaN magic: All exponent bits set + most significant bit of fraction
+    // set.
+    output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
+  } else {
+    // Fast rounding algorithm that rounds a half value to nearest even. This
+    // reduces expected error when we convert a large number of floats. Here
+    // is how it works:
+    //
+    // Definitions:
+    // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
+    // with the following tags:
+    //
+    // Sign |  Exp (8 bits) | Frac (23 bits)
+    //  S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
+    //
+    //  S: Sign bit.
+    //  E: Exponent bits.
+    //  F: First 6 bits of fraction.
+    //  L: Least significant bit of resulting bfloat16 if we truncate away the
+    //  rest of the float32. This is also the 7th bit of fraction
+    //  R: Rounding bit, 8th bit of fraction.
+    //  T: Sticky bits, rest of fraction, 15 bits.
+    //
+    // To round half to nearest even, there are 3 cases where we want to round
+    // down (simply truncate the result of the bits away, which consists of
+    // rounding bit and sticky bits) and two cases where we want to round up
+    // (truncate then add one to the result).
+    //
+    // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
+    // 1s) as the rounding bias, adds the rounding bias to the input, then
+    // truncates the last 16 bits away.
+    //
+    // To understand how it works, we can analyze this algorithm case by case:
+    //
+    // 1. L = 0, R = 0:
+    //   Expect: round down, this is less than half value.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input may create any carry, depending on
+    //   whether there is any value set to 1 in T bits.
+    //   - R may be set to 1 if there is a carry.
+    //   - L remains 0.
+    //   - Note that this case also handles Inf and -Inf, where all fraction
+    //   bits, including L, R and Ts are all 0. The output remains Inf after
+    //   this algorithm.
+    //
+    // 2. L = 1, R = 0:
+    //   Expect: round down, this is less than half value.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 1 = 0x8000
+    //   - Adding rounding bias to input doesn't change sticky bits but
+    //   adds 1 to rounding bit.
+    //   - L remains 1.
+    //
+    // 3. L = 0, R = 1, all of T are 0:
+    //   Expect: round down, this is exactly at half, the result is already
+    //   even (L=0).
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input sets all sticky bits to 1, but
+    //   doesn't create a carry.
+    //   - R remains 1.
+    //   - L remains 0.
+    //
+    // 4. L = 1, R = 1:
+    //   Expect: round up, this is exactly at half, the result needs to be
+    //   round to the next even number.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 1 = 0x8000
+    //   - Adding rounding bias to input doesn't change sticky bits, but
+    //   creates a carry from rounding bit.
+    //   - The carry sets L to 0, creates another carry bit and propagate
+    //   forward to F bits.
+    //   - If all the F bits are 1, a carry then propagates to the exponent
+    //   bits, which then creates the minimum value with the next exponent
+    //   value. Note that we won't have the case where exponents are all 1,
+    //   since that's either a NaN (handled in the other if condition) or inf
+    //   (handled in case 1).
+    //
+    // 5. L = 0, R = 1, any of T is 1:
+    //   Expect: round up, this is greater than half.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input creates a carry from sticky bits,
+    //   sets rounding bit to 0, then create another carry.
+    //   - The second carry sets L to 1.
+    //
+    // Examples:
+    //
+    //  Exact half value that is already even:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0     1000000000000000
+    //
+    //     This falls into case 3. We truncate the rest of 16 bits and no
+    //     carry is created into F and L:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+    //
+    //  Exact half value, round to next even number:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 0 1     1000000000000000
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     which then propagates into L and F:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+    //
+    //
+    //  Max denormal value round to min normal value:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      1 1 1 1 1 1 1     1111111111111111
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     propagate into L and F, which then propagates into exponent
+    //     bits:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 1      0 0 0 0 0 0 0
+    //
+    //  Max normal value round to Inf:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     1 1 1 1 1 1 1 0      1 1 1 1 1 1 1     1111111111111111
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     propagate into L and F, which then propagates into exponent
+    //     bits:
+    //
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     1 1 1 1 1 1 1 1      0 0 0 0 0 0 0
+
+    // At this point, ff must be either a normal float, or +/-infinity.
+    output = float_to_bfloat16_rtne<true>(ff);
+  }
+  return output;
+#endif
+}
+
+// float_to_bfloat16_rtne template specialization that assumes that its function
+// argument (ff) is either a normal floating point number, or +/-infinity, or
+// zero. Used to improve the runtime performance of conversion from an integer
+// type to bfloat16.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
+#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
+    // Nothing to do here
+#else
+    numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
+    __bfloat16_raw output;
+
+    // Least significant bit of resulting bfloat.
+    numext::uint32_t lsb = (input >> 16) & 1;
+    numext::uint32_t rounding_bias = 0x7fff + lsb;
+    input += rounding_bias;
+    output.value = static_cast<numext::uint16_t>(input >> 16);
+    return output;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
+    return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
+}
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) {
+  EIGEN_USING_STD(isinf);
+  return (isinf)(float(a));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) {
+  EIGEN_USING_STD(isnan);
+  return (isnan)(float(a));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
+  bfloat16 result;
+  result.value = a.value & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
+   return bfloat16(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
+  return bfloat16(numext::expm1(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
+  return bfloat16(::logf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
+  return bfloat16(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
+  return bfloat16(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
+  return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
+    return bfloat16(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
+  return bfloat16(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
+  return bfloat16(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
+  return bfloat16(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
+  return bfloat16(::asinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
+  return bfloat16(::acosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
+  return bfloat16(::atanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
+  return bfloat16(::sinhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
+  return bfloat16(::coshf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
+  return bfloat16(::tanhf(float(a)));
+}
+#if EIGEN_HAS_CXX11_MATH
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
+  return bfloat16(::asinhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
+  return bfloat16(::acoshf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
+  return bfloat16(::atanhf(float(a)));
+}
+#endif
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
+  return bfloat16(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
+  return bfloat16(::ceilf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
+  return bfloat16(::rintf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
+  return bfloat16(::roundf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::fmodf(float(a), float(b)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return bfloat16(::fminf(f1, f2));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return bfloat16(::fmaxf(f1, f2));
+}
+
+#ifndef EIGEN_NO_IO
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+#endif
+
+} // namespace bfloat16_impl
+
+namespace internal {
+
+template<>
+struct random_default_impl<bfloat16, false, false>
+{
+  static inline bfloat16 run(const bfloat16& x, const bfloat16& y)
+  {
+    return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline bfloat16 run()
+  {
+    return run(bfloat16(-1.f), bfloat16(1.f));
+  }
+};
+
+template<> struct is_arithmetic<bfloat16> { enum { value = true }; };
+
+} // namespace internal
+
+template<> struct NumTraits<Eigen::bfloat16>
+    : GenericNumTraits<Eigen::bfloat16>
+{
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D);  // bfloat16(5e-2f);
+
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7f80);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0);
+  }
+};
+
+} // namespace Eigen
+
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isfinite)(h);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
+  return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src) {
+  return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
+}
+
+}  // namespace numext
+}  // namespace Eigen
+
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::bfloat16> {
+  EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+} // namespace std
+#endif
+
+
+#endif // EIGEN_BFLOAT16_H

diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h
new file mode 100644
index 0000000..53830b5
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h

@@ -0,0 +1,117 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)      \
+  template <>                                                           \
+  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x,         \
+                                          const PACKET_CPLX& y,         \
+                                          const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                 \
+    }                                                                   \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x,          \
+                                         const PACKET_CPLX& y) const {  \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));   \
+    }                                                                   \
+  };                                                                    \
+                                                                        \
+  template <>                                                           \
+  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x,         \
+                                          const PACKET_REAL& y,         \
+                                          const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                 \
+    }                                                                   \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x,          \
+                                         const PACKET_REAL& y) const {  \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));   \
+    }                                                                   \
+  };
+
+namespace Eigen {
+namespace internal {
+
+template<bool Conjugate> struct conj_if;
+
+template<> struct conj_if<true> {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); }
+};
+
+template<> struct conj_if<false> {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; }
+};
+
+// Generic Implementation, assume scalars since the packet-version is
+// specialized below.
+template<typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
+struct conj_helper {
+  typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const
+  { return this->pmul(x, y) + c; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmul(const LhsType& x, const RhsType& y) const
+  { return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y); }
+};
+
+template<typename LhsScalar, typename RhsScalar>
+struct conj_helper<LhsScalar, RhsScalar, true, true> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const
+  { return this->pmul(x, y) + c; }
+
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmul(const LhsScalar& x, const RhsScalar& y) const
+  { return numext::conj(x * y); }
+};
+
+// Implementation with equal type, use packet operations.
+template<typename Packet, bool ConjLhs, bool ConjRhs>
+struct conj_helper<Packet, Packet, ConjLhs, ConjRhs>
+{
+  typedef Packet ResultType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
+  { return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c); }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
+  { return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y)); }
+};
+
+template<typename Packet>
+struct conj_helper<Packet, Packet, true, true>
+{
+  typedef Packet ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
+  { return Eigen::internal::pmadd(pconj(x), pconj(y), c); }
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
+  { return pconj(Eigen::internal::pmul(x, y)); }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_ARCH_CONJ_HELPER_H

diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
new file mode 100644
index 0000000..c9fbaf6
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h

@@ -0,0 +1,1649 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The exp and log functions of this file initially come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+// Creates a Scalar integer type with same bit-width.
+template<typename T> struct make_integer;
+template<> struct make_integer<float>    { typedef numext::int32_t type; };
+template<> struct make_integer<double>   { typedef numext::int64_t type; };
+template<> struct make_integer<half>     { typedef numext::int16_t type; };
+template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
+
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC  
+Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  enum { mantissa_bits = numext::numeric_limits<Scalar>::digits - 1};
+  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
+}
+
+// Safely applies frexp, correctly handles denormals.
+// Assumes IEEE floating point format.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+
+  EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = 
+      ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000
+  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask)); 
+  const Packet half = pset1<Packet>(Scalar(0.5));
+  const Packet zero = pzero(a);
+  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
+  
+  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
+  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
+  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24
+  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
+  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
+  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);  
+  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
+  
+  // Determine exponent offset: -126 if normal, -126-24 if denormal
+  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126
+  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
+  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
+  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
+  
+  // Determine exponent and mantissa from normalized_a.
+  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
+  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
+  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
+  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1));  // 255
+  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
+  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
+  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
+  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));  
+  return m;
+}
+
+// Safely applies ldexp, correctly handles overflows, underflows and denormals.
+// Assumes IEEE floating point format.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+  // We want to return a * 2^exponent, allowing for all possible integer
+  // exponents without overflowing or underflowing in intermediate
+  // computations.
+  //
+  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
+  // to consider for a float is:
+  //   -255-23 -> 255+23
+  // Below -278 any finite float 'a' will become zero, and above +278 any
+  // finite float will become inf, including when 'a' is the smallest possible 
+  // denormal.
+  //
+  // Unfortunately, 2^(278) cannot be represented using either one or two
+  // finite normal floats, so we must split the scale factor into at least
+  // three parts. It turns out to be faster to split 'exponent' into four
+  // factors, since [exponent>>2] is much faster to compute that [exponent/3].
+  //
+  // Set e = min(max(exponent, -278), 278);
+  //     b = floor(e/4);
+  //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
+  //
+  // This will avoid any intermediate overflows and correctly handle 0, inf,
+  // NaN cases.
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+
+  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) + ScalarI(int(MantissaBits) - 1)));  // 278
+  const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1));  // 127
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
+  Packet c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^b
+  Packet out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b); // e - 3b
+  c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^(e-3*b)
+  out = pmul(out, c);
+  return out;
+}
+
+// Explicitly multiplies 
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template<typename Packet>
+struct pldexp_fast_impl {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+  
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  Packet run(const Packet& a, const Packet& exponent) {
+    const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)));  // 127
+    const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) - ScalarI(1)));     // 255
+    // restrict biased exponent between 0 and 255 for float.
+    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
+    // return a * (2^e)
+    return pmul(a, preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(e)));
+  }
+};
+
+// Natural or base 2 logarithm.
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+//               polynomial interpolants -> ... -> profit!
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_impl_float(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1              = pset1<Packet>(1.0f);
+  const Packet cst_neg_half       = pset1<Packet>(-0.5f);
+  // The smallest non denormalized float number.
+  const Packet cst_min_norm_pos   = pset1frombits<Packet>( 0x00800000u);
+  const Packet cst_minus_inf      = pset1frombits<Packet>( 0xff800000u);
+  const Packet cst_pos_inf        = pset1frombits<Packet>( 0x7f800000u);
+
+  // Polynomial coefficients.
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
+  const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
+  const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
+  const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
+  const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, cst_min_norm_pos);
+
+  Packet e;
+  // extract significant in the range [0.5,1) and exponent
+  x = pfrexp(x,e);
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));
+  x = padd(x, tmp);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant of degree 8 in three parts, probably
+  // to improve instruction-level parallelism.
+  Packet y, y1, y2;
+  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
+  y  = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y2 = pmadd(y2, x, cst_cephes_log_p8);
+  y  = pmadd(y, x3, y1);
+  y  = pmadd(y, x3, y2);
+  y  = pmul(y, x3);
+
+  y = pmadd(cst_neg_half, x2, y);
+  x = padd(x, y);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  if (base2) {
+    const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
+    x = pmadd(x, cst_log2e, e);
+  } else {
+    const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
+    x = pmadd(e, cst_ln2, x);
+  }
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf,
+                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_float(const Packet _x)
+{
+  return plog_impl_float<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_float(const Packet _x)
+{
+  return plog_impl_float<Packet, /* base2 */ true>(_x);
+}
+
+/* Returns the base e (2.718...) or base 2 logarithm of x.
+ * The argument is separated into its exponent and fractional parts.
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
+ * is approximated by
+ *
+ *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
+ *
+ * for more detail see: http://www.netlib.org/cephes/
+ */
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_impl_double(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1              = pset1<Packet>(1.0);
+  const Packet cst_neg_half       = pset1<Packet>(-0.5);
+  // The smallest non denormalized double.
+  const Packet cst_min_norm_pos   = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
+  const Packet cst_minus_inf      = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
+  const Packet cst_pos_inf        = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
+
+
+ // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
+ //                             1/sqrt(2) <= x < sqrt(2)
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+
+  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
+  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
+  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
+  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
+  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
+  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, cst_min_norm_pos);
+
+  Packet e;
+  // extract significant in the range [0.5,1) and exponent
+  x = pfrexp(x,e);
+  
+  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));
+  x = padd(x, tmp);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
+  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
+  Packet y, y1, y_;
+  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y  = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y_ = pmadd(y, x3, y1);
+
+  y  = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
+  y  = pmadd(y, x, cst_cephes_log_q2);
+  y1 = pmadd(y1, x, cst_cephes_log_q5);
+  y  = pmadd(y, x3, y1);
+
+  y_ = pmul(y_, x3);
+  y  = pdiv(y_, y);
+
+  y = pmadd(cst_neg_half, x2, y);
+  x = padd(x, y);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  if (base2) {
+    const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
+    x = pmadd(x, cst_log2e, e);
+  } else {
+    const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
+    x = pmadd(e, cst_ln2, x);
+  }
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf,
+                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_double(const Packet _x)
+{
+  return plog_impl_double<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_double(const Packet _x)
+{
+  return plog_impl_double<Packet, /* base2 */ true>(_x);
+}
+
+/** \internal \returns log(1 + x) computed using W. Kahan's formula.
+    See: http://www.plunk.org/~hatch/rightway.php
+ */
+template<typename Packet>
+Packet generic_plog1p(const Packet& x)
+{
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  const Packet one = pset1<Packet>(ScalarType(1));
+  Packet xp1 = padd(x, one);
+  Packet small_mask = pcmp_eq(xp1, one);
+  Packet log1 = plog(xp1);
+  Packet inf_mask = pcmp_eq(xp1, log1);
+  Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
+  return pselect(por(small_mask, inf_mask), x, log_large);
+}
+
+/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
+    See: http://www.plunk.org/~hatch/rightway.php
+ */
+template<typename Packet>
+Packet generic_expm1(const Packet& x)
+{
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  const Packet one = pset1<Packet>(ScalarType(1));
+  const Packet neg_one = pset1<Packet>(ScalarType(-1));
+  Packet u = pexp(x);
+  Packet one_mask = pcmp_eq(u, one);
+  Packet u_minus_one = psub(u, one);
+  Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
+  Packet logu = plog(u);
+  // The following comparison is to catch the case where
+  // exp(x) = +inf. It is written in this way to avoid having
+  // to form the constant +inf, which depends on the packet
+  // type.
+  Packet pos_inf_mask = pcmp_eq(logu, u);
+  Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
+  expm1 = pselect(pos_inf_mask, u, expm1);
+  return pselect(one_mask,
+                 x,
+                 pselect(neg_one_mask,
+                         neg_one,
+                         expm1));
+}
+
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_float(const Packet _x)
+{
+  const Packet cst_1      = pset1<Packet>(1.0f);
+  const Packet cst_half   = pset1<Packet>(0.5f);
+  const Packet cst_exp_hi = pset1<Packet>( 88.723f);
+  const Packet cst_exp_lo = pset1<Packet>(-88.723f);
+
+  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
+  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
+  const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
+  const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
+  const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
+  const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
+  const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
+
+  // Clamp x.
+  Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
+
+  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+  // truncation errors.
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
+  Packet r = pmadd(m, cst_cephes_exp_C1, x);
+  r = pmadd(m, cst_cephes_exp_C2, r);
+
+  Packet r2 = pmul(r, r);
+  Packet r3 = pmul(r2, r);
+
+  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
+  Packet y, y1, y2;
+  y  = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
+  y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
+  y2 = padd(r, cst_1);
+  y  = pmadd(y, r, cst_cephes_exp_p2);
+  y1 = pmadd(y1, r, cst_cephes_exp_p5);
+  y  = pmadd(y, r3, y1);
+  y  = pmadd(y, r2, y2);
+
+  // Return 2^m * exp(r).
+  // TODO: replace pldexp with faster implementation since y in [-1, 1).
+  return pmax(pldexp(y,m), _x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1 = pset1<Packet>(1.0);
+  const Packet cst_2 = pset1<Packet>(2.0);
+  const Packet cst_half = pset1<Packet>(0.5);
+
+  const Packet cst_exp_hi = pset1<Packet>(709.784);
+  const Packet cst_exp_lo = pset1<Packet>(-709.784);
+
+  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
+  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
+  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
+  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
+  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
+  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
+  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
+  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
+
+  Packet tmp, fx;
+
+  // clamp x
+  x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+
+  // Get the integer modulus of log(2), i.e. the "n" described above.
+  fx = pfloor(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, cst_cephes_exp_C1);
+  Packet z = pmul(fx, cst_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet px = cst_cephes_exp_p0;
+  px = pmadd(px, x2, cst_cephes_exp_p1);
+  px = pmadd(px, x2, cst_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet qx = cst_cephes_exp_q0;
+  qx = pmadd(qx, x2, cst_cephes_exp_q1);
+  qx = pmadd(qx, x2, cst_cephes_exp_q2);
+  qx = pmadd(qx, x2, cst_cephes_exp_q3);
+
+  // I don't really get this bit, copied from the SSE2 routines, so...
+  // TODO(gonnet): Figure out what is going on here, perhaps find a better
+  // rational interpolant?
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(cst_2, x, cst_1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  // TODO: replace pldexp with faster implementation since x in [-1, 1).
+  return pmax(pldexp(x,fx), _x);
+}
+
+// The following code is inspired by the following stack-overflow answer:
+//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+//  - By-pass calls to frexp.
+//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
+//    (1) balancing the mantissa and exponent to the required bits of 2/pi are
+//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
+//  - Avoid a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speed up higher than x2 on x86-64.
+inline float trig_reduce_huge (float xf, int *quadrant)
+{
+  using Eigen::numext::int32_t;
+  using Eigen::numext::uint32_t;
+  using Eigen::numext::int64_t;
+  using Eigen::numext::uint64_t;
+
+  const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
+
+  // 192 bits of 2/pi for Payne-Hanek reduction
+  // Bits are introduced by packet of 8 to enable aligned reads.
+  static const uint32_t two_over_pi [] = 
+  {
+    0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
+    0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
+    0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
+    0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
+    0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
+    0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
+    0x10e41000, 0xe4100000
+  };
+  
+  uint32_t xi = numext::bit_cast<uint32_t>(xf);
+  // Below, -118 = -126 + 8.
+  //   -126 is to get the exponent,
+  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x as only 24 meaningful bits.
+  uint32_t e = (xi >> 23) - 118;
+  // Extract the mantissa and shift it to align it wrt the exponent
+  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+
+  uint32_t i = e >> 3;
+  uint32_t twoopi_1  = two_over_pi[i-1];
+  uint32_t twoopi_2  = two_over_pi[i+3];
+  uint32_t twoopi_3  = two_over_pi[i+7];
+
+  // Compute x * 2/pi in 2.62-bit fixed-point format.
+  uint64_t p;
+  p = uint64_t(xi) * twoopi_3;
+  p = uint64_t(xi) * twoopi_2 + (p >> 32);
+  p = (uint64_t(xi * twoopi_1) << 32) + p;
+
+  // Round to nearest: add 0.5 and extract integral part.
+  uint64_t q = (p + zero_dot_five) >> 62;
+  *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
+  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //   r = (p-q)*pi/2,
+  // where the product can be be carried out with sufficient accuracy using double precision.
+  p -= q<<62;
+  return float(double(int64_t(p)) * pio2_62);
+}
+
+template<bool ComputeSine,typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
+__attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+Packet psincos_float(const Packet& _x)
+{
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
+  const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+  const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
+  const PacketI csti_1              = pset1<PacketI>(1);
+  const Packet  cst_sign_mask       = pset1frombits<Packet>(0x80000000u);
+
+  Packet x = pabs(_x);
+
+  // Scale x by 2/Pi to find x's octant.
+  Packet y = pmul(x, cst_2oPI);
+
+  // Rounding trick:
+  Packet y_round = padd(y, cst_rounding_magic);
+  EIGEN_OPTIMIZATION_BARRIER(y_round)
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
+
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
+  // using "Extended precision modular arithmetic"
+  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+  // This version requires true FMA for high accuracy
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+  #else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
+  // and 2 ULP up to:
+  const float huge_th = ComputeSine ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+  // For the record, the following set of coefficients maintain 2ULP up
+  // to a slightly larger range:
+  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+  // but it slightly fails to maintain 1ULP for two values of sin below pi.
+  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+  // For the record, with only 3 iterations it is possible to maintain
+  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+  #endif
+
+  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
+  {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
+    pstoreu(vals, pabs(_x));
+    pstoreu(x_cpy, x);
+    pstoreu(y_int2, y_int);
+    for(int k=0; k<PacketSize;++k)
+    {
+      float val = vals[k];
+      if(val>=huge_th && (numext::isfinite)(val))
+        x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+    }
+    x = ploadu<Packet>(x_cpy);
+    y_int = ploadu<PacketI>(y_int2);
+  }
+
+  // Compute the sign to apply to the polynomial.
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
+                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
+
+  // Get the polynomial selection mask from the second bit of y_int
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
+
+  Packet x2 = pmul(x,x);
+
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
+  y2 = pmul(y2, x2);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  y = ComputeSine ? pselect(poly_mask,y2,y1)
+                  : pselect(poly_mask,y1,y2);
+
+  // Update the sign and filter huge inputs
+  return pxor(y, sign_bit);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psin_float(const Packet& x)
+{
+  return psincos_float<true>(x);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pcos_float(const Packet& x)
+{
+  return psincos_float<false>(x);
+}
+
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psqrt_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  // Computes the principal sqrt of the complex numbers in the input.
+  //
+  // For example, for packets containing 2 complex numbers stored in interleaved format
+  //    a = [a0, a1] = [x0, y0, x1, y1],
+  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
+  //    b = [b0, b1] = [u0, v0, u1, v1],
+  // such that b0^2 = a0, b1^2 = a1.
+  //
+  // To derive the formula for the complex square roots, let's consider the equation for
+  // a single complex square root of the number x + i*y. We want to find real numbers
+  // u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = 0.5 * (y / u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
+  //
+  //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
+  //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
+
+  // In the following, without lack of generality, we have annotated the code, assuming
+  // that the input is a packet of 2 complex numbers.
+  //
+  // Step 1. Compute l = [l0, l0, l1, l1], where
+  //    l0 = sqrt(x0^2 + y0^2),  l1 = sqrt(x1^2 + y1^2)
+  // To avoid over- and underflow, we use the stable formula for each hypotenuse
+  //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
+  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
+
+  RealPacket a_abs = pabs(a.v);           // [|x0|, |y0|, |x1|, |y1|]
+  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
+  RealPacket a_max = pmax(a_abs, a_abs_flip);
+  RealPacket a_min = pmin(a_abs, a_abs_flip);
+  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
+  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
+  RealPacket r = pdiv(a_min, a_max);
+  const RealPacket cst_one  = pset1<RealPacket>(RealScalar(1));
+  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
+  // Set l to a_max if a_min is zero.
+  l = pselect(a_min_zero_mask, a_max, l);
+
+  // Step 2. Compute [rho0, *, rho1, *], where
+  // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 =  sqrt(0.5 * (l1 + |x1|))
+  // We don't care about the imaginary parts computed here. They will be overwritten later.
+  const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
+  Packet rho;
+  rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
+
+  // Step 3. Compute [rho0, eta0, rho1, eta1], where
+  // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
+  // set eta = 0 of input is 0 + i0.
+  RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
+  RealPacket real_mask = peven_mask(a.v);
+  Packet positive_real_result;
+  // Compute result for inputs with positive real part.
+  positive_real_result.v = pselect(real_mask, rho.v, eta);
+
+  // Step 4. Compute solution for inputs with negative real part:
+  //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
+  const RealScalar neg_zero = RealScalar(numext::bit_cast<float>(0x80000000u));
+  const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), neg_zero)).v;
+  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
+  Packet negative_real_result;
+  // Notice that rho is positive, so taking it's absolute value is a noop.
+  negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
+
+  // Step 5. Select solution branch based on the sign of the real parts.
+  Packet negative_real_mask;
+  negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
+  negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
+  Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
+
+  // Step 6. Handle special cases for infinities:
+  // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
+  // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
+  // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
+  // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  Packet is_inf;
+  is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
+  Packet is_real_inf;
+  is_real_inf.v = pand(is_inf.v, real_mask);
+  is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
+  // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
+  Packet real_inf_result;
+  real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
+  real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
+  // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
+  Packet is_imag_inf;
+  is_imag_inf.v = pandnot(is_inf.v, real_mask);
+  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
+  Packet imag_inf_result;
+  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
+
+  return  pselect(is_imag_inf, imag_inf_result,
+                  pselect(is_real_inf, real_inf_result,result));
+}
+
+// TODO(rmlarsen): The following set of utilities for double word arithmetic
+// should perhaps be refactored as a separate file, since it would be generally
+// useful for special function implementation etc. Writing the algorithms in
+// terms if a double word type would also make the code more readable.
+
+// This function splits x into the nearest integer n and fractional part r,
+// such that x = n + r holds exactly.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void absolute_split(const Packet& x, Packet& n, Packet& r) {
+  n = pround(x);
+  r = psub(x, n);
+}
+
+// This function computes the sum {s, r}, such that x + y = s_hi + s_lo
+// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+  s_hi = padd(x, y);
+  const Packet t = psub(s_hi, x);
+  s_lo = psub(y, t);
+}
+
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+// This function implements the extended precision product of
+// a pair of floating point numbers. Given {x, y}, it computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x, y, pnegate(p_hi));
+}
+
+#else
+
+// This function implements the Veltkamp splitting. Given a floating point
+// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
+// exactly and that half of the significant of x fits in x_hi.
+// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
+  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
+  Packet rho = psub(x, gamma);
+  x_hi = padd(rho, gamma);
+  x_lo = psub(x, x_hi);
+}
+
+// This function implements Dekker's algorithm for products x * y.
+// Given floating point numbers {x, y} computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+}
+
+#endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+
+
+// This function implements Dekker's algorithm for the addition
+// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {s_hi, s_lo} such that
+// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
+// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+  void twosum(const Packet& x_hi, const Packet& x_lo,
+              const Packet& y_hi, const Packet& y_lo,
+              Packet& s_hi, Packet& s_lo) {
+  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
+  Packet r_hi_1, r_lo_1;
+  fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
+  Packet r_hi_2, r_lo_2;
+  fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
+  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
+
+  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
+  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
+  const Packet s = pselect(x_greater_mask, s1, s2);
+
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for double word numbers,
+// which assumes that |x_hi| >= |y_hi|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+  void fast_twosum(const Packet& x_hi, const Packet& x_lo,
+              const Packet& y_hi, const Packet& y_lo,
+              Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x_hi, y_hi, r_hi, r_lo);
+  const Packet s = padd(padd(y_lo, r_lo), x_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for adding a floating point number x to
+// double word number {y_hi, y_lo} number, with the assumption
+// that |x| >= |y_hi|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void fast_twosum(const Packet& x,
+                 const Packet& y_hi, const Packet& y_lo,
+                 Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x, y_hi, r_hi, r_lo);
+  const Packet s = padd(y_lo, r_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This function implements the multiplication of a double word
+// number represented by {x_hi, x_lo} by a floating point number y.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bit
+// in the floating point type.
+// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  Packet c_hi, c_lo1;
+  twoprod(x_hi, y, c_hi, c_lo1);
+  const Packet c_lo2 = pmul(x_lo, y);
+  Packet t_hi, t_lo1;
+  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
+  const Packet t_lo2 = padd(t_lo1, c_lo1);
+  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
+}
+
+// This function implements the multiplication of two double word
+// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bit
+// in the floating point type.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x_hi, const Packet& x_lo,
+             const Packet& y_hi, const Packet& y_lo,
+             Packet& p_hi, Packet& p_lo) {
+  Packet p_hi_hi, p_hi_lo;
+  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
+  Packet p_lo_hi, p_lo_lo;
+  twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
+  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
+}
+
+// This function computes the reciprocal of a floating point number
+// with extra precision and returns the result as a double word.
+template <typename Packet>
+void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // 1. Approximate the reciprocal as the reciprocal of the high order element.
+  Packet approx_recip = prsqrt(x);
+  approx_recip = pmul(approx_recip, approx_recip);
+
+  // 2. Run one step of Newton-Raphson iteration in double word arithmetic
+  // to get the bottom half. The NR iteration for reciprocal of 'a' is
+  //    x_{i+1} = x_i * (2 - a * x_i)
+
+  // -a*x_i
+  Packet t1_hi, t1_lo;
+  twoprod(pnegate(x), approx_recip, t1_hi, t1_lo);
+  // 2 - a*x_i
+  Packet t2_hi, t2_lo;
+  fast_twosum(pset1<Packet>(Scalar(2)), t1_hi, t2_hi, t2_lo);
+  Packet t3_hi, t3_lo;
+  fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo);
+  // x_i * (2 - a * x_i)
+  twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo);
+}
+
+
+// This function computes log2(x) and returns the result as a double word.
+template <typename Scalar>
+struct accurate_log2 {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    log2_x_hi = plog2(x);
+    log2_x_lo = pzero(x);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct accurate_log2<float> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // The function log(1+x)/x is approximated in the interval
+    // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
+    //  Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
+    // where the degree 6 polynomial P(x) is evaluated in single precision,
+    // while the remaining 4 terms of Q(x), as well as the final multiplication by x
+    // to reconstruct log(1+x) are evaluated in extra precision using
+    // double word arithmetic. C0 through C3 are extra precise constants
+    // stored as double words.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 10;
+    // > f = log2(1+x)/x;
+    // > interval = [sqrt(0.5)-1;sqrt(2)-1];
+    // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
+    
+    const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
+    const Packet p5 = pset1<Packet>(-0.1690667718648f);
+    const Packet p4 = pset1<Packet>( 0.1720575392246f);
+    const Packet p3 = pset1<Packet>(-0.1789081543684f);
+    const Packet p2 = pset1<Packet>( 0.2050433009862f);
+    const Packet p1 = pset1<Packet>(-0.2404672354459f);
+    const Packet p0 = pset1<Packet>( 0.2885761857032f);
+
+    const Packet C3_hi = pset1<Packet>(-0.360674142838f);
+    const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
+    const Packet C2_hi = pset1<Packet>(0.480897903442f);
+    const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
+    const Packet C1_hi = pset1<Packet>(-0.721347510815f);
+    const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
+    const Packet C0_hi = pset1<Packet>(1.44269502163f);
+    const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
+    const Packet one = pset1<Packet>(1.0f);
+
+    const Packet x = psub(z, one);
+    // Evaluate P(x) in working precision.
+    // We evaluate it in multiple parts to improve instruction level
+    // parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p6, x2, p4);
+    p_even = pmadd(p_even, x2, p2);
+    p_even = pmadd(p_even, x2, p0);
+    Packet p_odd = pmadd(p5, x2, p3);
+    p_odd = pmadd(p_odd, x2, p1);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Now evaluate the low-order tems of Q(x) in double word precision.
+    // In the following, due to the alternating signs and the fact that
+    // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
+    // fast_twosum instead of the slower twosum.
+    Packet q_hi, q_lo;
+    Packet t_hi, t_lo;
+    // C3 + x * p(x)
+    twoprod(p, x, t_hi, t_lo);
+    fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
+    // C2 + x * p(x)
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
+    // C1 + x * p(x)
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
+    // C0 + x * p(x)
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
+
+    // log(z) ~= x * Q(x)
+    twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+
+template <>
+struct accurate_log2<double> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // We use a transformation of variables:
+    //    r = c * (x-1) / (x+1),
+    // such that
+    //    log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
+    // The function f(r) can be approximated well using an odd polynomial
+    // of the form
+    //   P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
+    // For the implementation of log2<double> here, Q is of degree 6 with
+    // coefficient represented in working precision (double), while C is a
+    // constant represented in extra precision as a double word to achieve
+    // full accuracy.
+    //
+    // The polynomial coefficients were computed by the Sollya script:
+    //
+    // c = 2 / log(2);
+    // trans = c * (x-1)/(x+1);
+    // itrans = (1+x/c)/(1-x/c);
+    // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
+    // print(interval);
+    // f = log2(itrans(x));
+    // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
+    const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
+    const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
+    const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
+    const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
+    const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
+    const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
+    const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
+    const Packet C_hi = pset1<Packet>(0.0400377511598501157);
+    const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
+    const Packet one = pset1<Packet>(1.0);
+
+    const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
+    const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
+    // c * (x - 1)
+    Packet num_hi, num_lo;
+    twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo);
+    // TODO(rmlarsen): Investigate if using the division algorithm by
+    // Muller et al. is faster/more accurate.
+    // 1 / (x + 1)
+    Packet denom_hi, denom_lo;
+    doubleword_reciprocal(padd(x, one), denom_hi, denom_lo);
+    // r =  c * (x-1) / (x+1),
+    Packet r_hi, r_lo;
+    twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo);
+    // r2 = r * r
+    Packet r2_hi, r2_lo;
+    twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
+    // r4 = r2 * r2
+    Packet r4_hi, r4_lo;
+    twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
+
+    // Evaluate Q(r^2) in working precision. We evaluate it in two parts
+    // (even and odd in r^2) to improve instruction level parallelism.
+    Packet q_even = pmadd(q12, r4_hi, q8);
+    Packet q_odd = pmadd(q10, r4_hi, q6);
+    q_even = pmadd(q_even, r4_hi, q4);
+    q_odd = pmadd(q_odd, r4_hi, q2);
+    q_even = pmadd(q_even, r4_hi, q0);
+    Packet q = pmadd(q_odd, r2_hi, q_even);
+
+    // Now evaluate the low order terms of P(x) in double word precision.
+    // In the following, due to the increasing magnitude of the coefficients
+    // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
+    // of the slower twosum.
+    // Q(r^2) * r^2
+    Packet p_hi, p_lo;
+    twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
+    // Q(r^2) * r^2 + C
+    Packet p1_hi, p1_lo;
+    fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
+    // (Q(r^2) * r^2 + C) * r^2
+    Packet p2_hi, p2_lo;
+    twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
+    // ((Q(r^2) * r^2 + C) * r^2 + 1)
+    Packet p3_hi, p3_lo;
+    fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
+
+    // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
+    twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This function computes exp2(x) (i.e. 2**x).
+template <typename Scalar>
+struct fast_accurate_exp2 {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // TODO(rmlarsen): Add a pexp2 packetop.
+    return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
+  }
+};
+
+// This specialization uses a faster algorithm to compute exp2(x) for floats
+// in [-0.5;0.5] with a relative accuracy of 1 ulp.
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct fast_accurate_exp2<float> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // This function approximates exp2(x) by a degree 6 polynomial of the form
+    // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
+    // single precision, and the remaining steps are evaluated with extra precision using
+    // double word arithmetic. C is an extra precise constant stored as a double word.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 6;
+    // > f = 2^x;
+    // > interval = [-0.5;0.5];
+    // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
+
+    const Packet p4 = pset1<Packet>(1.539513905e-4f);
+    const Packet p3 = pset1<Packet>(1.340007293e-3f);
+    const Packet p2 = pset1<Packet>(9.618283249e-3f);
+    const Packet p1 = pset1<Packet>(5.550328270e-2f);
+    const Packet p0 = pset1<Packet>(0.2402264923f);
+
+    const Packet C_hi = pset1<Packet>(0.6931471825f);
+    const Packet C_lo = pset1<Packet>(2.36836577e-08f);
+    const Packet one = pset1<Packet>(1.0f);
+
+    // Evaluate P(x) in working precision.
+    // We evaluate even and odd parts of the polynomial separately
+    // to gain some instruction level parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p4, x2, p2);
+    Packet p_odd = pmadd(p3, x2, p1);
+    p_even = pmadd(p_even, x2, p0);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Evaluate the remaining terms of Q(x) with extra precision using
+    // double word arithmetic.
+    Packet p_hi, p_lo;
+    // x * p(x)
+    twoprod(p, x, p_hi, p_lo);
+    // C + x * p(x)
+    Packet q1_hi, q1_lo;
+    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
+    // x * (C + x * p(x))
+    Packet q2_hi, q2_lo;
+    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
+    // 1 + x * (C + x * p(x))
+    Packet q3_hi, q3_lo;
+    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
+    // for adding it to unity here.
+    fast_twosum(one, q2_hi, q3_hi, q3_lo);
+    return padd(q3_hi, padd(q2_lo, q3_lo));
+  }
+};
+
+// in [-0.5;0.5] with a relative accuracy of 1 ulp.
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct fast_accurate_exp2<double> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // This function approximates exp2(x) by a degree 10 polynomial of the form
+    // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
+    // single precision, and the remaining steps are evaluated with extra precision using
+    // double word arithmetic. C is an extra precise constant stored as a double word.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 11;
+    // > f = 2^x;
+    // > interval = [-0.5;0.5];
+    // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating);
+
+    const Packet p9 = pset1<Packet>(4.431642109085495276e-10);
+    const Packet p8 = pset1<Packet>(7.073829923303358410e-9);
+    const Packet p7 = pset1<Packet>(1.017822306737031311e-7);
+    const Packet p6 = pset1<Packet>(1.321543498017646657e-6);
+    const Packet p5 = pset1<Packet>(1.525273342728892877e-5);
+    const Packet p4 = pset1<Packet>(1.540353045780084423e-4);
+    const Packet p3 = pset1<Packet>(1.333355814685869807e-3);
+    const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
+    const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
+    const Packet p0 = pset1<Packet>(0.240226506959101332);
+    const Packet C_hi = pset1<Packet>(0.693147180559945286); 
+    const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
+    const Packet one = pset1<Packet>(1.0);
+
+    // Evaluate P(x) in working precision.
+    // We evaluate even and odd parts of the polynomial separately
+    // to gain some instruction level parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p8, x2, p6);
+    Packet p_odd = pmadd(p9, x2, p7);
+    p_even = pmadd(p_even, x2, p4);
+    p_odd = pmadd(p_odd, x2, p5);
+    p_even = pmadd(p_even, x2, p2);
+    p_odd = pmadd(p_odd, x2, p3);
+    p_even = pmadd(p_even, x2, p0);
+    p_odd = pmadd(p_odd, x2, p1);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Evaluate the remaining terms of Q(x) with extra precision using
+    // double word arithmetic.
+    Packet p_hi, p_lo;
+    // x * p(x)
+    twoprod(p, x, p_hi, p_lo);
+    // C + x * p(x)
+    Packet q1_hi, q1_lo;
+    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
+    // x * (C + x * p(x))
+    Packet q2_hi, q2_lo;
+    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
+    // 1 + x * (C + x * p(x))
+    Packet q3_hi, q3_lo;
+    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
+    // for adding it to unity here.
+    fast_twosum(one, q2_hi, q3_hi, q3_lo);
+    return padd(q3_hi, padd(q2_lo, q3_lo));
+  }
+};
+
+// This function implements the non-trivial case of pow(x,y) where x is
+// positive and y is (possibly) non-integer.
+// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
+// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
+// easier to specialize or turn off for specific types and/or backends.x
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Split x into exponent e_x and mantissa m_x.
+  Packet e_x;
+  Packet m_x = pfrexp(x, e_x);
+
+  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
+  EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
+  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
+  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
+  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
+
+  // Compute log2(m_x) with 6 extra bits of accuracy.
+  Packet rx_hi, rx_lo;
+  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
+
+  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
+  // precision using double word arithmetic.
+  Packet f1_hi, f1_lo, f2_hi, f2_lo;
+  twoprod(e_x, y, f1_hi, f1_lo);
+  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
+  // Sum the two terms in f using double word arithmetic. We know
+  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
+  // This means that we can use fast_twosum(f1,f2).
+  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
+  // accuracy by violating the assumption of fast_twosum, because
+  // it's a no-op.
+  Packet f_hi, f_lo;
+  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
+
+  // Split f into integer and fractional parts.
+  Packet n_z, r_z;
+  absolute_split(f_hi, n_z, r_z);
+  r_z = padd(r_z, f_lo);
+  Packet n_r;
+  absolute_split(r_z, n_r, r_z);
+  n_z = padd(n_z, n_r);
+
+  // We now have an accurate split of f = n_z + r_z and can compute
+  //   x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
+  // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy
+  // using a specialized algorithm. Multiplication by the second factor can
+  // be done exactly using pldexp(), since it is an integer power of 2.
+  const Packet e_r = fast_accurate_exp2<Scalar>()(r_z);
+  return pldexp(e_r, n_z);
+}
+
+// Generic implementation of pow(x,y).
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet generic_pow(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_zero = pset1<Packet>(Scalar(0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
+
+  const Packet abs_x = pabs(x);
+  // Predicates for sign and magnitude of x.
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_neg = pcmp_lt(x, cst_zero);
+  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
+  const Packet abs_x_is_one =  pcmp_eq(abs_x, cst_one);
+  const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
+  const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
+  const Packet x_is_one =  pandnot(abs_x_is_one, x_is_neg);
+  const Packet x_is_neg_one =  pand(abs_x_is_one, x_is_neg);
+  const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
+
+  // Predicates for sign and magnitude of y.
+  const Packet y_is_one = pcmp_eq(y, cst_one);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
+  const Packet y_is_neg = pcmp_lt(y, cst_zero);
+  const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
+  const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
+  const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
+  EIGEN_CONSTEXPR Scalar huge_exponent =
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) /
+       NumTraits<Scalar>::epsilon();
+  const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
+
+  // Predicates for whether y is integer and/or even.
+  const Packet y_is_int = pcmp_eq(pfloor(y), y);
+  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
+  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+
+  // Predicates encoding special cases for the value of pow(x,y)
+  const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf),
+                                                    y_is_int),
+                                            abs_y_is_inf);
+  const Packet pow_is_one = por(por(x_is_one, y_is_zero),
+                                pand(x_is_neg_one,
+                                     por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
+  const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
+  const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos),
+                                         pand(abs_x_is_inf, y_is_neg)),
+                                     pand(pand(abs_x_is_lt_one, abs_y_is_huge),
+                                          y_is_pos)),
+                                 pand(pand(abs_x_is_gt_one, abs_y_is_huge),
+                                      y_is_neg));
+  const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg),
+                                        pand(abs_x_is_inf, y_is_pos)),
+                                    pand(pand(abs_x_is_lt_one, abs_y_is_huge),
+                                         y_is_neg)),
+                                pand(pand(abs_x_is_gt_one, abs_y_is_huge),
+                                     y_is_pos));
+
+  // General computation of pow(x,y) for positive x or negative x and integer y.
+  const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
+  const Packet pow_abs = generic_pow_impl(abs_x, y);
+  return pselect(y_is_one, x,
+                 pselect(pow_is_one, cst_one,
+                         pselect(pow_is_nan, cst_nan,
+                                 pselect(pow_is_inf, cst_pos_inf,
+                                         pselect(pow_is_zero, cst_zero,
+                                                 pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
+}
+
+
+
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
+/* chbevl (modified for Eigen)
+ *
+ *     Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chebevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ *        N-1
+ *         - '
+ *  y  =   >   coef[i] T (x/2)
+ *         -            i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array.  Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+
+template <typename Packet, int N>
+struct pchebevl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    Packet b0 = pset1<Packet>(coef[0]);
+    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
+    Packet b2;
+
+    for (int i = 1; i < N; i++) {
+      b2 = b1;
+      b1 = b0;
+      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
+    }
+
+    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
+  }
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H

diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
new file mode 100644
index 0000000..177a04e
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h

@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+
+namespace Eigen {
+namespace internal {
+
+// Forward declarations of the generic math functions
+// implemented in GenericPacketMathFunctions.h
+// This is needed to workaround a circular dependency.
+
+/***************************************************************************
+ * Some generic implementations to be used by implementors
+***************************************************************************/
+
+/** Default implementation of pfrexp.
+  * It is expected to be called by implementers of template<> pfrexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic(const Packet& a, Packet& exponent);
+
+// Extracts the biased exponent value from Packet p, and casts the results to
+// a floating-point Packet type. Used by pfrexp_generic. Override this if
+// there is no unpacket_traits<Packet>::integer_packet.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic_get_biased_exponent(const Packet& p);
+
+/** Default implementation of pldexp.
+  * It is expected to be called by implementers of template<> pldexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pldexp_generic(const Packet& a, const Packet& exponent);
+
+/** \internal \returns log(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_float(const Packet _x);
+
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_float(const Packet _x);
+
+/** \internal \returns log(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_double(const Packet _x);
+
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_double(const Packet _x);
+
+/** \internal \returns log(1 + x) */
+template<typename Packet>
+Packet generic_plog1p(const Packet& x);
+
+/** \internal \returns exp(x)-1 */
+template<typename Packet>
+Packet generic_expm1(const Packet& x);
+
+/** \internal \returns exp(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_float(const Packet _x);
+
+/** \internal \returns exp(x) for double precision real numbers */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x);
+
+/** \internal \returns sin(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psin_float(const Packet& x);
+
+/** \internal \returns cos(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pcos_float(const Packet& x);
+
+/** \internal \returns sqrt(x) for complex types */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psqrt_complex(const Packet& a);
+
+template <typename Packet, int N> struct ppolevl;
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H

diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
new file mode 100644
index 0000000..9f8e8cc
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/Half.h

@@ -0,0 +1,942 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_H
+#define EIGEN_HALF_H
+
+#include <sstream>
+
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+// When compiling with GPU support, the "__half_raw" base class as well as
+// some other routines are defined in the GPU compiler header files
+// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
+// As a consequence, we get compile failures when compiling Eigen with
+// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
+// Eigen with GPU support
+  #pragma push_macro("EIGEN_CONSTEXPR")
+  #undef EIGEN_CONSTEXPR
+  #define EIGEN_CONSTEXPR
+#endif
+
+#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)           \
+  template <>                                                       \
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED                \
+  PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) {             \
+    return float2half(METHOD<PACKET_F>(half2float(_x)));            \
+  }
+
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
+// We want to use the __half_raw struct from the HIP header file only during the device compile phase.
+// This is required because of a quirk in the way TensorFlow GPU builds are done.
+// When compiling TensorFlow source code with GPU support, files that
+//  * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc
+//  * do not contain GPU kernels ( i.e. *.cc files) are compiled via gcc (typically)
+//
+// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that
+//  * are defined in a file that gets compiled via hipcc AND
+//  * have Eigen::half as a pass-by-value argument AND
+//  * are called in a file that gets compiled via gcc
+//
+// In the scenario described above the caller and callee will see different versions
+// of the Eigen::half base class __half_raw, and they will be compiled by different compilers
+//
+// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in
+// the callee getting corrupted values for the Eigen::half argument.
+//
+// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
+// this error, and hence the following convoluted #if condition
+#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
+  // The element type for shared memory cannot have non-trivial constructors
+  // and hence the following special casing (which skips the zero-initilization).
+  // Note that this check gets done even in the host compilation phase, and
+  // hence the need for this
+  EIGEN_DEVICE_FUNC __half_raw() {}
+#else
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
+#endif
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {
+  }
+  __fp16 x;
+#else
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
+  numext::uint16_t x;
+#endif
+};
+
+#elif defined(EIGEN_HAS_HIP_FP16)
+  // Nothing to do here
+  // HIP fp16 header file has a definition for __half_raw
+#elif defined(EIGEN_HAS_CUDA_FP16)
+  #if EIGEN_CUDA_SDK_VER < 90000
+    // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+    typedef __half __half_raw;
+  #endif // defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(SYCL_DEVICE_ONLY)
+  typedef cl::sycl::half __half_raw;
+#endif
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
+
+struct half_base : public __half_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+  #if EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+  #endif
+ #endif
+#endif
+};
+
+} // namespace half_impl
+
+// Class definition.
+struct half : public half_impl::half_base {
+
+  // Writing this out as separate #if-else blocks to make the code easier to follow
+  // The same applies to most #if-else blocks in this file
+#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+  // Use the same base class for the following two scenarios
+  // * when compiling without GPU support enabled
+  // * during host compile phase when compiling with GPU support enabled
+  typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+  // Nothing to do here
+  // HIP fp16 header file has a definition for __half_raw
+#elif defined(EIGEN_HAS_CUDA_FP16)
+  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
+  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
+  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    typedef half_impl::__half_raw __half_raw;
+  #endif
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+  #endif
+ #endif
+#endif
+
+
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC half(T val)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  template<typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
+
+   EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return half_impl::half_to_float(*this);
+  }
+
+#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+  EIGEN_DEVICE_FUNC operator __half() const {
+    ::__half_raw hr;
+    hr.x = x;
+    return __half(hr);
+  }
+#endif
+};
+
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+  static Eigen::half round_error() { return Eigen::half(0.5); }
+  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
+namespace half_impl {
+
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+     EIGEN_CUDA_ARCH >= 530) ||                                  \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+// Note: We deliberatly do *not* define this to 1 even if we have Arm's native
+// fp16 type since GPU halfs are rather different from native CPU halfs.
+// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
+#define EIGEN_HAS_NATIVE_FP16
+#endif
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
+
+#if defined(EIGEN_HAS_NATIVE_FP16)
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  return __hadd(::__half(a), ::__half(b));
+#else
+  return __hadd(a, b);
+#endif
+}
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  return __hdiv(a, b);
+#else
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+#endif
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
+  return __heq(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
+  return __hne(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
+  return __hlt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
+  return __hle(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
+  return __hgt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
+  return __hge(a, b);
+}
+#endif
+
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(vaddh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(vmulh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(vsubh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(vdivh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  return half(vnegh_f16(a.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(vaddh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(vmulh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(vsubh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(vdivh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return vceqh_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return !vceqh_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return vclth_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return vcleh_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return vcgth_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return vcgeh_f16(a.x, b.x);
+}
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+// We need to provide emulated *host-side* FP16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
+// to/from fp32.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  half result;
+  result.x = a.x ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return numext::equal_strict(float(a),float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return float(a) >= float(b);
+}
+
+#if defined(__clang__) && defined(__CUDA__)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+#endif  // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) {
+  a += half(1);
+  return a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) {
+  a -= half(1);
+  return a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) {
+  half original_value = a;
+  ++a;
+  return original_value;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) {
+  half original_value = a;
+  --a;
+  return original_value;
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
+  // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
+  // in the hip_fp16 header file, and that will trigger a compile error
+  // On the other hand, having anything but a return statement also triggers a compile error
+  // because this is constexpr function.
+  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
+  // of this catch22 by having separate bodies for GPU / non GPU
+#if defined(EIGEN_HAS_GPU_FP16)
+   __half_raw h;
+   h.x = x;
+  return h;
+#else
+  return __half_raw(x);
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
+  // HIP/CUDA/Default have a member 'x' of type uint16_t.
+  // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
+  // For SYCL, cl::sycl::half is _Float16, so cast directly.
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(SYCL_DEVICE_ONLY)
+  return numext::bit_cast<numext::uint16_t>(h);
+#else
+  return h.x;
+#endif
+}
+
+union float32_bits {
+  unsigned int u;
+  float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  __half tmp_ff = __float2half(ff);
+  return *(__half_raw*)&tmp_ff;
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half_raw h;
+  h.x = _cvtss_sh(ff, 0);
+  return h;
+
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  __half_raw h;
+  h.x = static_cast<__fp16>(ff);
+  return h;
+
+#else
+  float32_bits f; f.f = ff;
+
+  const float32_bits f32infty = { 255 << 23 };
+  const float32_bits f16max = { (127 + 16) << 23 };
+  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  unsigned int sign_mask = 0x80000000u;
+  __half_raw o;
+  o.x = static_cast<numext::uint16_t>(0x0u);
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  } else {  // (De)normalized number or zero
+    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
+      // without arithmetic overflow.
+      f.u += 0xc8000fffU;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      o.x = static_cast<numext::uint16_t>(f.u >> 13);
+    }
+  }
+
+  o.x |= static_cast<numext::uint16_t>(sign >> 16);
+  return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __half2float(h);
+#elif defined(EIGEN_HAS_FP16_C)
+  return _cvtsh_ss(h.x);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return static_cast<float>(h.x);
+#else
+  const float32_bits magic = { 113 << 23 };
+  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  float32_bits o;
+
+  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;   // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {     // Inf/NaN?
+    o.u += (128 - 16) << 23;    // extra exp adjust
+  } else if (exp == 0) {        // Zero/Denormal?
+    o.u += 1 << 23;             // extra exp adjust
+    o.f -= magic.f;             // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;    // sign bit
+  return o.f;
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
+#else
+  return (a.x & 0x7fff) == 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hisnan(a);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return half(vabsh_f16(a.x));
+#else
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hexp(a));
+#else
+   return half(::expf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
+  return half(numext::expm1(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return half(::hlog(a));
+#else
+  return half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
+  return half(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+  return half(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
+  return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hsqrt(a));
+#else
+    return half(::sqrtf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
+  return half(::asinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
+  return half(::acosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hfloor(a));
+#else
+  return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hceil(a));
+#else
+  return half(::ceilf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
+  return half(::rintf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) {
+  return half(::roundf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
+  return half(::fmodf(float(a), float(b)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hlt(b, a) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hlt(a, b) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+#endif
+}
+
+#ifndef EIGEN_NO_IO
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+#endif
+
+} // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+  static inline half run(const half& x, const half& y)
+  {
+    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run()
+  {
+    return run(half(-1.f), half(1.f));
+  }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+    return half_impl::raw_uint16_to_half(0x211f); //  Eigen::half(1e-2f);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7e00);
+  }
+};
+
+} // end namespace Eigen
+
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  #pragma pop_macro("EIGEN_CONSTEXPR")
+#endif
+
+namespace Eigen {
+namespace numext {
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast<Eigen::half, uint16_t>(const uint16_t& src) {
+  return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(const Eigen::half& src) {
+  return Eigen::half_impl::raw_half_as_uint16(src);
+}
+
+}  // namespace numext
+}  // namespace Eigen
+
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
+//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+//    with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__ - only functions.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
+    || defined(EIGEN_HIPCC)
+
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
+}
+
+#else // HIP or CUDA SDK < 9.0
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+}
+
+#endif // HIP vs CUDA
+#endif // __shfl*
+
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
+    || defined(EIGEN_HIPCC)
+EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
+}
+#endif // __ldg
+
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+} // end namespace std
+#endif
+
+#endif // EIGEN_HALF_H

diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
index 097373c..a5c3ada 100644
--- a/Eigen/src/Core/arch/Default/Settings.h
+++ b/Eigen/src/Core/arch/Default/Settings.h

@@ -21,7 +21,7 @@
   * it does not correspond to the number of iterations or the number of instructions
   */
 #ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
+#define EIGEN_UNROLLING_LIMIT 110
 #endif
 
 /** Defines the threshold between a "small" and a "large" matrix.

diff --git a/Eigen/src/Core/arch/Default/TypeCasting.h b/Eigen/src/Core/arch/Default/TypeCasting.h
new file mode 100644
index 0000000..fb8183b
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/TypeCasting.h

@@ -0,0 +1,120 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERIC_TYPE_CASTING_H
+#define EIGEN_GENERIC_TYPE_CASTING_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<>
+struct scalar_cast_op<float, Eigen::half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+      return __float2half(a);
+    #else
+      return Eigen::half(a);
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<int, Eigen::half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+      return __float2half(static_cast<float>(a));
+    #else
+      return Eigen::half(static_cast<float>(a));
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<Eigen::half, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+      return __half2float(a);
+    #else
+      return static_cast<float>(a);
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<float, Eigen::bfloat16> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::bfloat16 result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const float& a) const {
+    return Eigen::bfloat16(a);
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::bfloat16> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<int, Eigen::bfloat16> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::bfloat16 result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const int& a) const {
+    return Eigen::bfloat16(static_cast<float>(a));
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::bfloat16> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<Eigen::bfloat16, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::bfloat16& a) const {
+    return static_cast<float>(a);
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::bfloat16, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+}
+}
+
+#endif  // EIGEN_GENERIC_TYPE_CASTING_H

diff --git a/Eigen/src/Core/arch/GPU/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
new file mode 100644
index 0000000..d2b3a25
--- /dev/null
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h

@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog<float4>(const float4& a)
+{
+  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog<double2>(const double2& a)
+{
+  using ::log;
+  return make_double2(log(a.x), log(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog1p<float4>(const float4& a)
+{
+  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog1p<double2>(const double2& a)
+{
+  return make_double2(log1p(a.x), log1p(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexp<float4>(const float4& a)
+{
+  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexp<double2>(const double2& a)
+{
+  using ::exp;
+  return make_double2(exp(a.x), exp(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexpm1<float4>(const float4& a)
+{
+  return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexpm1<double2>(const double2& a)
+{
+  return make_double2(expm1(a.x), expm1(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 psqrt<float4>(const float4& a)
+{
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 psqrt<double2>(const double2& a)
+{
+  using ::sqrt;
+  return make_double2(sqrt(a.x), sqrt(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 prsqrt<float4>(const float4& a)
+{
+  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 prsqrt<double2>(const double2& a)
+{
+  return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H

diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
new file mode 100644
index 0000000..25c45fd
--- /dev/null
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h

@@ -0,0 +1,1649 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Read-only data cached load available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
+#define EIGEN_GPU_HAS_LDG 1
+#endif
+
+// FP16 math available.
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
+#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
+#endif
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template<> struct is_arithmetic<float4>  { enum { value = true }; };
+template<> struct is_arithmetic<double2> { enum { value = true }; };
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+    HasFloor = 1,
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+    HasFloor = 1,
+  };
+};
+
+
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
+  return make_float4(from, from, from, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+                                                       const float& b) {
+  return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+                                                        const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) |
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+                                                           const float& b) {
+  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+                                                            const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+                                                    const float& b) {
+  return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+                                                     const double& b) {
+  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
+                                                    const float& b) {
+  return __int_as_float(a < b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
+                                                     const double& b) {
+  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
+}
+
+}  // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+                     bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+                                                         const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+                     bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+                                                           const double2& b) {
+  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+                     bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+                     bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+                     eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
+                     lt_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_lt<double2>(const double2& a, const double2& b) {
+  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
+}
+#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a+1, a+2, a+3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a+1);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x+b.x, a.y+b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x-b.x, a.y-b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x*b.x, a.y*b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x/b.x, a.y/b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+  return make_float4(from[0], from[0], from[1], from[1]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+  return make_double2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg((const float4*)from);
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg((const double2*)from);
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_double2(__ldg(from+0), __ldg(from+1));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+  to[stride*2] = from.z;
+  to[stride*3] = from.w;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pfloor<float4>(const float4& a) {
+  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+  return make_double2(floor(a.x), floor(a.y));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<float4,4>& kernel) {
+  float tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+
+#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
+// on device. There is no benefit to using them on the host anyways, since they are
+// emulated.
+#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+typedef ulonglong2 Packet4h2;
+template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
+template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
+
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<Eigen::half> : default_packet_traits
+{
+  typedef Packet4h2 type;
+  typedef Packet4h2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasExpm1  = 1,
+    HasLog    = 1,
+    HasLog1p  = 1
+  };
+};
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+  return __half2half2(from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pset1<Packet4h2>(const Eigen::half& from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = pset1<half2>(from);
+  p_alias[1] = pset1<half2>(from);
+  p_alias[2] = pset1<half2>(from);
+  p_alias[3] = pset1<half2>(from);
+  return r;
+}
+
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*  from) {
+  return __halves2half2(from[0], from[0]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
+                                                  const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
+                                                   const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
+    const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  // Input is guaranteed to be properly aligned.
+  return __ldg(reinterpret_cast<const half2*>(from));
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
+    const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
+                                                    Index stride) {
+  return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
+    Eigen::half* to, const half2& from, Index stride) {
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
+  return __low2half(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<half2>(true_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<half2>(false_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
+                                                    const half2& a,
+                                                    const half2& b) {
+  half mask_low = __low2half(mask);
+  half mask_high = __high2half(mask);
+  half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
+  half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
+  return __halves2half2(result_low, result_high);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
+                                                    const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
+                                                    const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
+                                                 const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
+                                                const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
+                                                 const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
+                                                    const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
+                                                  const half2& b,
+                                                  const half2& c) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+   return __hfma2(a, b, c);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
+                                                 const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
+                                                 const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 + a2));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 * a2));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expm1f(a1);
+  float r2 = expm1f(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 plog(const half2& a) {
+  return h2log(a);
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pexp(const half2& a) {
+  return h2exp(a);
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 psqrt(const half2& a) {
+  return h2sqrt(a);
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 prsqrt(const half2& a) {
+  return h2rsqrt(a);
+}
+
+#else
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = logf(a1);
+  float r2 = logf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expf(a1);
+  float r2 = expf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = sqrtf(a1);
+  float r2 = sqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = rsqrtf(a1);
+  float r2 = rsqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+#endif
+} // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pload<Packet4h2>(const Eigen::half* from) {
+  return *reinterpret_cast<const Packet4h2*>(from);
+}
+
+// unaligned load;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+ploadu<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploadu(from + 0);
+  p_alias[1] = ploadu(from + 2);
+  p_alias[2] = ploadu(from + 4);
+  p_alias[3] = ploadu(from + 6);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+ploaddup<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploaddup(from + 0);
+  p_alias[1] = ploaddup(from + 1);
+  p_alias[2] = ploaddup(from + 2);
+  p_alias[3] = ploaddup(from + 3);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
+    Eigen::half* to, const Packet4h2& from) {
+  *reinterpret_cast<Packet4h2*>(to) = from;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
+    Eigen::half* to, const Packet4h2& from) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pstoreu(to + 0,from_alias[0]);
+  pstoreu(to + 2,from_alias[1]);
+  pstoreu(to + 4,from_alias[2]);
+  pstoreu(to + 6,from_alias[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
+ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  Packet4h2 r;
+  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
+  return r;
+#else
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_aligned(from + 0);
+  r_alias[1] = ploadt_ro_aligned(from + 2);
+  r_alias[2] = ploadt_ro_aligned(from + 4);
+  r_alias[3] = ploadt_ro_aligned(from + 6);
+  return r;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
+ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_unaligned(from + 0);
+  r_alias[1] = ploadt_ro_unaligned(from + 2);
+  r_alias[2] = ploadt_ro_unaligned(from + 4);
+  r_alias[3] = ploadt_ro_unaligned(from + 6);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
+  p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
+  p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
+  p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
+    Eigen::half* to, const Packet4h2& from, Index stride) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pscatter(to + stride * 0, from_alias[0], stride);
+  pscatter(to + stride * 2, from_alias[1], stride);
+  pscatter(to + stride * 4, from_alias[2], stride);
+  pscatter(to + stride * 6, from_alias[3], stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
+    const Packet4h2& a) {
+  return pfirst(*(reinterpret_cast<const half2*>(&a)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
+    const Packet4h2& a) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  p_alias[0] = pabs(a_alias[0]);
+  p_alias[1] = pabs(a_alias[1]);
+  p_alias[2] = pabs(a_alias[2]);
+  p_alias[3] = pabs(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
+    const Packet4h2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<Packet4h2>(true_half);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<Packet4h2>(false_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
+    double* d_row0, double* d_row1, double* d_row2, double* d_row3,
+    double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
+  double d_tmp;
+  d_tmp = d_row0[1];
+  d_row0[1] = d_row4[0];
+  d_row4[0] = d_tmp;
+
+  d_tmp = d_row1[1];
+  d_row1[1] = d_row5[0];
+  d_row5[0] = d_tmp;
+
+  d_tmp = d_row2[1];
+  d_row2[1] = d_row6[0];
+  d_row6[0] = d_tmp;
+
+  d_tmp = d_row3[1];
+  d_row3[1] = d_row7[0];
+  d_row7[0] = d_tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
+    half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
+  half2 f_tmp;
+  f_tmp = f_row0[1];
+  f_row0[1] = f_row2[0];
+  f_row2[0] = f_tmp;
+
+  f_tmp = f_row1[1];
+  f_row1[1] = f_row3[0];
+  f_row3[0] = f_tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose_half(half2& f0, half2& f1) {
+  __half a1 = __low2half(f0);
+  __half a2 = __high2half(f0);
+  __half b1 = __low2half(f1);
+  __half b2 = __high2half(f1);
+  f0 = __halves2half2(a1, b1);
+  f1 = __halves2half2(a2, b2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h2,8>& kernel) {
+  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
+  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
+  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
+  double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
+  double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
+  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
+  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
+  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
+  ptranspose_double(d_row0, d_row1, d_row2, d_row3,
+                    d_row4, d_row5, d_row6, d_row7);
+
+
+  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
+  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
+  half2* f_row2 = reinterpret_cast<half2*>(d_row2);
+  half2* f_row3 = reinterpret_cast<half2*>(d_row3);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row4);
+  f_row1 = reinterpret_cast<half2*>(d_row5);
+  f_row2 = reinterpret_cast<half2*>(d_row6);
+  f_row3 = reinterpret_cast<half2*>(d_row7);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+plset<Packet4h2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
+  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
+                              __hadd(a, __float2half(3.0f)));
+  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
+                              __hadd(a, __float2half(5.0f)));
+  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
+                              __hadd(a, __float2half(7.0f)));
+  return r;
+#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+
+  half2 b = pset1<half2>(a);
+  half2 c;
+  half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
+  half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
+
+  c = __hadd2(b, half_offset0);
+  r_alias[0] = plset(__low2half(c));
+  r_alias[1] = plset(__high2half(c));
+
+  c = __hadd2(b, half_offset1);
+  r_alias[2] = plset(__low2half(c));
+  r_alias[3] = plset(__high2half(c));
+
+  return r;
+
+#else
+  float f = __half2float(a);
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
+  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
+  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
+  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
+  return r;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
+                   const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
+  r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
+  r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
+  r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pand(a_alias[0], b_alias[0]);
+  r_alias[1] = pand(a_alias[1], b_alias[1]);
+  r_alias[2] = pand(a_alias[2], b_alias[2]);
+  r_alias[3] = pand(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = por(a_alias[0], b_alias[0]);
+  r_alias[1] = por(a_alias[1], b_alias[1]);
+  r_alias[2] = por(a_alias[2], b_alias[2]);
+  r_alias[3] = por(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pxor(a_alias[0], b_alias[0]);
+  r_alias[1] = pxor(a_alias[1], b_alias[1]);
+  r_alias[2] = pxor(a_alias[2], b_alias[2]);
+  r_alias[3] = pxor(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pandnot(a_alias[0], b_alias[0]);
+  r_alias[1] = pandnot(a_alias[1], b_alias[1]);
+  r_alias[2] = pandnot(a_alias[2], b_alias[2]);
+  r_alias[3] = pandnot(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = padd(a_alias[0], b_alias[0]);
+  r_alias[1] = padd(a_alias[1], b_alias[1]);
+  r_alias[2] = padd(a_alias[2], b_alias[2]);
+  r_alias[3] = padd(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = psub(a_alias[0], b_alias[0]);
+  r_alias[1] = psub(a_alias[1], b_alias[1]);
+  r_alias[2] = psub(a_alias[2], b_alias[2]);
+  r_alias[3] = psub(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pnegate(a_alias[0]);
+  r_alias[1] = pnegate(a_alias[1]);
+  r_alias[2] = pnegate(a_alias[2]);
+  r_alias[3] = pnegate(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
+  return a;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmul(a_alias[0], b_alias[0]);
+  r_alias[1] = pmul(a_alias[1], b_alias[1]);
+  r_alias[2] = pmul(a_alias[2], b_alias[2]);
+  r_alias[3] = pmul(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  const half2* c_alias = reinterpret_cast<const half2*>(&c);
+  r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
+  r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
+  r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
+  r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pdiv(a_alias[0], b_alias[0]);
+  r_alias[1] = pdiv(a_alias[1], b_alias[1]);
+  r_alias[2] = pdiv(a_alias[2], b_alias[2]);
+  r_alias[3] = pdiv(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmin(a_alias[0], b_alias[0]);
+  r_alias[1] = pmin(a_alias[1], b_alias[1]);
+  r_alias[2] = pmin(a_alias[2], b_alias[2]);
+  r_alias[3] = pmin(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmax(a_alias[0], b_alias[0]);
+  r_alias[1] = pmax(a_alias[1], b_alias[1]);
+  r_alias[2] = pmax(a_alias[2], b_alias[2]);
+  r_alias[3] = pmax(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+
+  return predux(a_alias[0]) + predux(a_alias[1]) +
+         predux(a_alias[2]) + predux(a_alias[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = __halves2half2(predux_max(a_alias[0]),
+                            predux_max(a_alias[1]));
+  half2 m1 = __halves2half2(predux_max(a_alias[2]),
+                            predux_max(a_alias[3]));
+  __half first  = predux_max(m0);
+  __half second = predux_max(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hgt(first, second) ? first : second);
+#else
+  float ffirst  = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst > fsecond)? first: second;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = __halves2half2(predux_min(a_alias[0]),
+                            predux_min(a_alias[1]));
+  half2 m1 = __halves2half2(predux_min(a_alias[2]),
+                            predux_min(a_alias[3]));
+  __half first  = predux_min(m0);
+  __half second = predux_min(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hlt(first, second) ? first : second);
+#else
+  float ffirst  = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst < fsecond)? first: second;
+#endif
+}
+
+// likely overflow/underflow
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
+                                       pmul(a_alias[2], a_alias[3])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+plog1p<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog1p(a_alias[0]);
+  r_alias[1] = plog1p(a_alias[1]);
+  r_alias[2] = plog1p(a_alias[2]);
+  r_alias[3] = plog1p(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pexpm1<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexpm1(a_alias[0]);
+  r_alias[1] = pexpm1(a_alias[1]);
+  r_alias[2] = pexpm1(a_alias[2]);
+  r_alias[3] = pexpm1(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog(a_alias[0]);
+  r_alias[1] = plog(a_alias[1]);
+  r_alias[2] = plog(a_alias[2]);
+  r_alias[3] = plog(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexp(a_alias[0]);
+  r_alias[1] = pexp(a_alias[1]);
+  r_alias[2] = pexp(a_alias[2]);
+  r_alias[3] = pexp(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = psqrt(a_alias[0]);
+  r_alias[1] = psqrt(a_alias[1]);
+  r_alias[2] = psqrt(a_alias[2]);
+  r_alias[3] = psqrt(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+prsqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = prsqrt(a_alias[0]);
+  r_alias[1] = prsqrt(a_alias[1]);
+  r_alias[2] = prsqrt(a_alias[2]);
+  r_alias[3] = prsqrt(a_alias[3]);
+  return r;
+}
+
+// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
+// the implementation of GPU half reduction.
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
+                                                        const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
+                                                        const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
+                                                        const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
+                                                        const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
+                                                        const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+#undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
+#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_GPU_H

diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
new file mode 100644
index 0000000..c8195bb
--- /dev/null
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h

@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_GPU_H
+#define EIGEN_TYPE_CASTING_GPU_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+  float2 r1 = __half22float2(a);
+  float2 r2 = __half22float2(b);
+  return make_float4(r1.x, r1.y, r2.x, r2.y);
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+  Packet4h2 r;
+  half2* r_alias=reinterpret_cast<half2*>(&r);
+  r_alias[0]=__floats2half2_rn(a.x,a.y);
+  r_alias[1]=__floats2half2_rn(a.z,a.w);
+  r_alias[2]=__floats2half2_rn(b.x,b.y);
+  r_alias[3]=__floats2half2_rn(b.z,b.w);
+  return r;
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+  // Simply discard the second half of the input
+  float4 r;
+  const half2* a_alias=reinterpret_cast<const half2*>(&a);
+  float2 r1 = __half22float2(a_alias[0]);
+  float2 r2 = __half22float2(a_alias[1]);
+  r.x=static_cast<float>(r1.x);
+  r.y=static_cast<float>(r1.y);
+  r.z=static_cast<float>(r2.x);
+  r.w=static_cast<float>(r2.y);
+  return r;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Simply discard the second half of the input
+  return __floats2half2_rn(a.x, a.y);
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_GPU_H

diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
new file mode 100644
index 0000000..25375a0
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h

@@ -0,0 +1,23 @@
+/*
+ * math_constants.h - 
+ *  HIP equivalent of the CUDA header of the same name
+ */
+
+#ifndef __MATH_CONSTANTS_H__
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+
+#define HIPRT_INF_F        __int_as_float(0x7f800000)
+#define HIPRT_NAN_F        __int_as_float(0x7fffffff)
+#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
+#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
+#define HIPRT_NEG_ZERO_F   __int_as_float(0x80000000)
+#define HIPRT_ZERO_F       0.0f
+#define HIPRT_ONE_F        1.0f
+
+/* double precision constants */
+#define HIPRT_INF          __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN          __hiloint2double(0xfff80000, 0x00000000)
+
+#endif

diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
new file mode 100644
index 0000000..53dacfa
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/Complex.h

@@ -0,0 +1,648 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_MSA_H
+#define EIGEN_COMPLEX_MSA_H
+
+#include <iostream>
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+  EIGEN_STRONG_INLINE Packet2cf() {
+  }
+  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
+                                         const std::complex<float>& b) {
+    Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+    v = t;
+  }
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
+  }
+  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
+    v = b.v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
+    return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    Packet4f v1, v2;
+
+    // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+    v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
+    // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+    v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet2cf(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
+    *this *= b.conjugate();
+    Packet4f s = pmul<Packet4f>(b.v, b.v);
+    s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
+    return Packet2cf(*this) /= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(pnegate(v));
+  }
+
+  Packet4f v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1]
+     << "i),"
+        "  ("
+     << value.v[2] << ", " << value.v[3] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet2cf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  EIGEN_MSA_DEBUG;
+
+  float f0 = from.real(), f1 = from.imag();
+  Packet4f v0 = { f0, f0, f0, f0 };
+  Packet4f v1 = { f1, f1, f1, f1 };
+  return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
+                                                      const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
+                                                       const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+    const std::complex<float>* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(from[0 * stride], from[1 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                       const Packet2cf& from,
+                                                                       Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = std::complex<float>(from.v[0], from.v[1]);
+  to += stride;
+  *to = std::complex<float>(from.v[2], from.v[3]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  prefetch(reinterpret_cast<const float*>(addr));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<float>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f value = (Packet4f)preverse((Packet2d)a.v);
+  value += a.v;
+  return std::complex<float>(value[0], value[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
+                             (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f tmp =
+      (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[0].v =
+      (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
+                                               (Packet2d)elsePacket.v);
+}
+
+//---------- double ----------
+
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {
+  }
+  EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
+    v[0] = std::real(a);
+    v[1] = std::imag(a);
+  }
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
+  }
+  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
+    v = b.v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
+    static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+    return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    Packet2d v1, v2;
+
+    // Get the real values of a | a1_re | a1_re
+    v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
+    // Get the imag values of a | a1_im | a1_im
+    v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet1cd(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
+    *this *= b.conjugate();
+    Packet2d s = pmul<Packet2d>(b.v, b.v);
+    s = padd(s, preverse<Packet2d>(s));
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
+    return Packet1cd(*this) /= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(pnegate(v));
+  }
+
+  Packet2d v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet1cd half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
+                                                       const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
+                                                        const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  prefetch(reinterpret_cast<const double*>(addr));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+    const std::complex<double>* from, Index stride __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd res;
+  res.v[0] = std::real(from[0]);
+  res.v[1] = std::imag(from[0]);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                        const Packet1cd& from,
+                                                                        Index stride
+                                                                        __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  pstore(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<double>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d v1, v2;
+
+  v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+  // Get the imag values of a
+  v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+
+  kernel.packet[0].v = v1;
+  kernel.packet[1].v = v2;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_MSA_H

diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 0000000..f5181b9
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h

@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template<typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+plog<Packet4f>(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  // Convert negative argument into NAN (quiet negative, to be specific).
+  Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+  Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+  Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+  Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask);  // Add 0.0 or NAN.
+  Packet4f x = non_neg_x_or_nan;
+
+  // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+  // N.B. the exponent is one less of what frexpf() would return.
+  Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+  // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+  x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+  /*
+     if (x < SQRTHF) {
+       x = x + x - 1.0;
+     } else {
+       e += 1;
+       x = x - 1.0;
+     }
+  */
+  Packet4f xx = padd(x, x);
+  Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+  e_int = psub(e_int, ge_mask);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+  x = psub(x, p4f_1);
+  Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+  Packet4f x2 = pmul(x, x);
+  Packet4f x3 = pmul(x2, x);
+
+  Packet4f y, y1, y2;
+  y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y = pmadd(y, x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y = pmadd(e, p4f_cephes_log_q1, y);
+  x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+  x = padd(x, y);
+  x = pmadd(e, p4f_cephes_log_q2, x);
+
+  // x is now the logarithm result candidate. We still need to handle the
+  // extreme arguments of zero and positive infinity, though.
+  // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+  // contain infinities of both signs (see the coefficients and code above).
+  // INFINITY - INFINITY is NAN.
+
+  // If the argument is +INFINITY, make it the new result candidate.
+  // To achieve that we choose the smaller of the result candidate and the
+  // argument.
+  // This is correct for all finite pairs of values (the logarithm is smaller
+  // than the argument).
+  // This is also correct in the special case when the argument is +INFINITY
+  // and the result candidate is NAN. This is because the fmin.df instruction
+  // prefers non-NANs to NANs.
+  x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
+
+  // If the argument is zero (including -0.0), the result becomes -INFINITY.
+  Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
+
+  return x;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pexp<Packet4f>(const Packet4f& _x) {
+  // Limiting single-precision pexp's argument to [-128, +128] lets pexp
+  // reach 0 and INFINITY naturally.
+  static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
+  static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  Packet4f x = _x;
+
+  // Clamp x.
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
+                                     (v16u8)p4f_exp_lo);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
+                                     (v16u8)p4f_exp_hi);
+
+  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+  Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
+  Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
+  Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
+  Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
+
+  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
+  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
+
+  Packet4f z = pmul(x, x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // y *= 2**exponent.
+  y = __builtin_msa_fexp2_w(y, x2_int);
+
+  return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
+  // The monomial coefficients of the numerator polynomial (odd).
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
+  // The monomial coefficients of the denominator polynomial (even).
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
+
+  Packet4f x = pabs(_x);
+  Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
+
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is -/+1.0f in single-precision.
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
+                                     (v16u8)p4f_tanh_hi);
+
+  // Since the polynomials are odd/even, we need x**2.
+  Packet4f x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+  p = pmadd(x2, p, p4f_alpha_9);
+  p = pmadd(x2, p, p4f_alpha_7);
+  p = pmadd(x2, p, p4f_alpha_5);
+  p = pmadd(x2, p, p4f_alpha_3);
+  p = pmadd(x2, p, p4f_alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+  q = pmadd(x2, q, p4f_beta_2);
+  q = pmadd(x2, q, p4f_beta_0);
+
+  // Divide the numerator by the denominator.
+  p = pdiv(p, q);
+
+  // Reinstate the sign.
+  p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
+
+  // When the argument is very small in magnitude it's more accurate to just return it.
+  p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
+
+  return p;
+}
+
+template <bool sine>
+Packet4f psincos_inner_msa_float(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f);  // Approx. (2**24) / (4/Pi).
+  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
+  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);  // 4/Pi.
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  Packet4f x = pabs(_x);
+
+  // Translate infinite arguments into NANs.
+  Packet4f zero_or_nan_if_inf = psub(_x, _x);
+  x = padd(x, zero_or_nan_if_inf);
+  // Prevent sin/cos from generating values larger than 1.0 in magnitude
+  // for very large arguments by setting x to 0.0.
+  Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
+  x = pand(x, (Packet4f)small_or_nan_mask);
+
+  // Scale x by 4/Pi to find x's octant.
+  Packet4f y = pmul(x, p4f_cephes_FOPI);
+  // Get the octant. We'll reduce x by this number of octants or by one more than it.
+  Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
+  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
+  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
+  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
+  Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
+  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
+  y = __builtin_msa_ffint_s_w(y_int2);
+
+  // Compute the sign to apply to the polynomial.
+  Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
+                            : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
+
+  // Get the polynomial selection mask.
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
+
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
+  // The magic pass: "Extended precision modular arithmetic"
+  // x = ((x - y * DP1) - y * DP2) - y * DP3
+  Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
+  Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
+  Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, tmp1);
+  x = padd(x, tmp2);
+  x = padd(x, tmp3);
+
+  // Evaluate the cos(x) polynomial.
+  y = p4f_coscof_p0;
+  Packet4f z = pmul(x, x);
+  y = pmadd(y, z, p4f_coscof_p1);
+  y = pmadd(y, z, p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  y = __builtin_msa_fmsub_w(y, z, p4f_half);
+  y = padd(y, p4f_1);
+
+  // Evaluate the sin(x) polynomial.
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
+           : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
+
+  // Update the sign.
+  sign_mask = pxor(sign_mask, (Packet4i)y);
+  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
+  return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+psin<Packet4f>(const Packet4f& x) {
+  return psincos_inner_msa_float</* sine */ true>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pcos<Packet4f>(const Packet4f& x) {
+  return psincos_inner_msa_float</* sine */ false>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
+pexp<Packet2d>(const Packet2d& _x) {
+  // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
+  // reach 0 and INFINITY naturally.
+  static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+  static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+  static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+
+  Packet2d x = _x;
+
+  // Clamp x.
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
+                                     (v16u8)p2d_exp_lo);
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
+                                     (v16u8)p2d_exp_hi);
+
+  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+  Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
+  Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
+  Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
+  Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
+
+  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
+  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
+
+  x2 = pmul(x, x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(p2d_2, x, p2d_1);
+
+  // x *= 2**exponent.
+  x = __builtin_msa_fexp2_d(x, x2_long);
+
+  return x;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_MSA_H

diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
new file mode 100644
index 0000000..afe8f33
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h

@@ -0,0 +1,1233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_MSA_H
+#define EIGEN_PACKET_MATH_MSA_H
+
+#include <iostream>
+#include <string>
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#if 0
+#define EIGEN_MSA_DEBUG                                                             \
+  static bool firstTime = true;                                                     \
+  do {                                                                              \
+    if (firstTime) {                                                                \
+      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
+      firstTime = false;                                                            \
+    }                                                                               \
+  } while (0)
+#else
+#define EIGEN_MSA_DEBUG
+#endif
+
+#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
+
+typedef v4f32 Packet4f;
+typedef v4i32 Packet4i;
+typedef v4u32 Packet4ui;
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;  // Packet2f intrinsics not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;  // Packet2i intrinsics not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet4f half;
+};
+
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int32_t type;
+  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet4i half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f v = { from, from, from, from };
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fill_w(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  float f = *from;
+  Packet4f v = { f, f, f, f };
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fill_w(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fadd_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_addv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
+  return padd(pset1<Packet4f>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4i countdown = { 0, 1, 2, 3 };
+  return padd(pset1<Packet4i>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsub_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_subv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmul_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_mulv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fdiv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_div_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmadd_w(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  EIGEN_MSA_DEBUG;
+
+  // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
+  Packet4i value = c;
+  __asm__("maddv.w %w[value], %w[a], %w[b]\n"
+          // Outputs
+          : [value] "+f"(value)
+          // Inputs
+          : [a] "f"(a), [b] "f"(b));
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_min_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_max_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  float f0 = from[0], f1 = from[1];
+  Packet4f v0 = { f0, f0, f0, f0 };
+  Packet4f v1 = { f1, f1, f1, f1 };
+  return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  int32_t i0 = from[0], i1 = from[1];
+  Packet4i v0 = { i0, i0, i0, i0 };
+  Packet4i v1 = { i1, i1, i1, i1 };
+  return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  float f = *from;
+  Packet4f v = { f, f, f, f };
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  int32_t i = *from;
+  Packet4i v = { i, i, i, i };
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
+                                                        Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+  to += stride;
+  *to = from[2];
+  to += stride;
+  *to = from[3];
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                          Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+  to += stride;
+  *to = from[2];
+  to += stride;
+  *to = from[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i zero = __builtin_msa_ldi_w(0);
+  return __builtin_msa_add_a_w(zero, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return s[0];
+}
+
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return p[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with min computation.
+  Packet4f v = __builtin_msa_fmin_w(a, swapped);
+  v = __builtin_msa_fmin_w(
+      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with max computation.
+  Packet4f v = __builtin_msa_fmax_w(a, swapped);
+  v = __builtin_msa_fmax_w(
+      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl
+     << "  " << value.packet[1] << "," << std::endl
+     << "  " << value.packet[2] << "," << std::endl
+     << "  " << value.packet[3] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  v4i32 tmp1, tmp2, tmp3, tmp4;
+
+  tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+  tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+  kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl
+     << "  " << value.packet[1] << "," << std::endl
+     << "  " << value.packet[2] << "," << std::endl
+     << "  " << value.packet[3] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  v4i32 tmp1, tmp2, tmp3, tmp4;
+
+  tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
+  tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
+  tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
+  tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+  kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsqrt_w(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_w(a);
+#else
+  Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
+  return pdiv(ones, psqrt(a));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+                       ifPacket.select[3] };
+  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+                       ifPacket.select[3] };
+  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+  return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+//---------- double ----------
+
+typedef v2f64 Packet2d;
+typedef v2i64 Packet2l;
+typedef v2u64 Packet2ul;
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet2d half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value = { from, from };
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fadd_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet2d countdown = { 0.0, 1.0 };
+  return padd(pset1<Packet2d>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsub_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmul_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fdiv_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmadd_d(c, a, b);
+}
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using MSA
+// intrinsics
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+  v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+  v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value = { *from, *from };
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value;
+  value[0] = *from;
+  from += stride;
+  value[1] = *from;
+  return value;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
+                                                         Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d s = padd(a, preverse(a));
+  return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d p = pmul(a, preverse(a));
+  return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d v = __builtin_msa_fmin_d(a, swapped);
+  return v[0];
+#else
+  double a0 = a[0], a1 = a[1];
+  return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
+#endif
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d v = __builtin_msa_fmax_d(a, swapped);
+  return v[0];
+#else
+  double a0 = a[0], a1 = a[1];
+  return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_d(a);
+#else
+  Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
+  return pdiv(ones, psqrt(a));
+#endif
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+  Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_MSA_H

diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 4844137..f40af7f 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,79 +15,161 @@
 
 namespace internal {
 
-inline uint32x4_t p4ui_CONJ_XOR() {
+inline uint32x4_t p4ui_CONJ_XOR()
+{
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
+  uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return ret;
+#else
   static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
   return vld1q_u32( conj_XOR_DATA );
+#endif
 }
 
-inline uint32x2_t p2ui_CONJ_XOR() {
+inline uint32x2_t p2ui_CONJ_XOR()
+{
   static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
   return vld1_u32( conj_XOR_DATA );
 }
 
 //---------- float ----------
+
+struct Packet1cf
+{
+  EIGEN_STRONG_INLINE Packet1cf() {}
+  EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
+  Packet2f v;
+};
 struct Packet2cf
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
-  Packet4f  v;
+  Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
 {
   typedef Packet2cf type;
-  typedef Packet2cf half;
-  enum {
+  typedef Packet1cf half;
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
+    HasHalfPacket = 1,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasMul       = 1,
+    HasDiv       = 1,
+    HasNegate    = 1,
+    HasAbs       = 0,
+    HasAbs2      = 0,
+    HasMin       = 0,
+    HasMax       = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+template<> struct unpacket_traits<Packet1cf>
 {
-  float32x2_t r64;
-  r64 = vld1_f32((float *)&from);
+  typedef std::complex<float> type;
+  typedef Packet1cf half;
+  typedef Packet2f as_real;
+  enum
+  {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2cf>
+{
+  typedef std::complex<float> type;
+  typedef Packet1cf half;
+  typedef Packet4f as_real;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
+template<> EIGEN_STRONG_INLINE Packet1cf pcast<float,Packet1cf>(const float& a)
+{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f,Packet2cf>(const Packet2f& a)
+{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from)
+{ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
   return Packet2cf(vcombine_f32(r64, r64));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(padd<Packet2f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(psub<Packet2f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
+{
+  const Packet2ui b = vreinterpret_u32_f32(a.v);
+  return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
-  Packet4ui b = vreinterpretq_u32_f32(a.v);
+  const Packet4ui b = vreinterpretq_u32_f32(a.v);
   return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{
+  Packet2f v1, v2;
+
+  // Get the real values of a | a1_re | a1_re |
+  v1 = vdup_lane_f32(a.v, 0);
+  // Get the imag values of a | a1_im | a1_im |
+  v2 = vdup_lane_f32(a.v, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add and return the result
+  return Packet1cf(vadd_f32(v1, v2));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   Packet4f v1, v2;
 
   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
   v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
-  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
   v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
   // Multiply the real a with b
   v1 = vmulq_f32(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f32(v2, b.v);
-  // Conjugate v2 
+  // Conjugate v2
   v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64q_f32(v2);
@@ -94,98 +177,144 @@
   return Packet2cf(vaddq_f32(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
+  Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0])]
+  Packet2f eq_swapped = vrev64_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cf(pand<Packet2f>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
+  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
+  Packet4f eq_swapped = vrev64q_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet2cf(pand<Packet4f>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
+
+template<> EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
 template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from)
+{ return pset1<Packet1cf>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
+{ return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(
+    const std::complex<float>* from, Index stride)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  const Packet2f tmp = vdup_n_f32(std::real(from[0*stride]));
+  return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1));
 }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+    const std::complex<float>* from, Index stride)
 {
-  Packet4f res = pset1<Packet4f>(0.f);
-  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
+  Packet4f res = vdupq_n_f32(std::real(from[0*stride]));
   res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
   res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
   res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
   return Packet2cf(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(
+    std::complex<float>* to, const Packet1cf& from, Index stride)
+{ to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
+    std::complex<float>* to, const Packet2cf& from, Index stride)
 {
   to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
   to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr)
+{ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr)); }
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 x[2];
-  vst1q_f32((float *)x, a.v);
+  EIGEN_ALIGN16 std::complex<float> x;
+  vst1_f32(reinterpret_cast<float*>(&x), a.v);
+  return x;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+  EIGEN_ALIGN16 std::complex<float> x[2];
+  vst1q_f32(reinterpret_cast<float*>(x), a.v);
   return x[0];
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
-  float32x2_t a_lo, a_hi;
-  Packet4f a_r128;
+{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); }
 
-  a_lo = vget_low_f32(a.v);
-  a_hi = vget_high_f32(a.v);
-  a_r128 = vcombine_f32(a_hi, a_lo);
-
-  return Packet2cf(a_r128);
-}
-
+template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a)
+{ return Packet1cf(vrev64_f32(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
-{
-  return Packet2cf(vrev64q_f32(a.v));
-}
+{ return Packet2cf(vrev64q_f32(a.v)); }
 
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a)
+{
+  std::complex<float> s;
+  vst1_f32((float *)&s, a.v);
+  return s;
+}
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
-  float32x2_t a1, a2;
   std::complex<float> s;
-
-  a1 = vget_low_f32(a.v);
-  a2 = vget_high_f32(a.v);
-  a2 = vadd_f32(a1, a2);
-  vst1_f32((float *)&s, a2);
-
+  vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
   return s;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a)
 {
-  Packet4f sum1, sum2, sum;
-
-  // Add the first two 64-bit float32x2_t of vecs[0]
-  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
-  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
-  sum = vaddq_f32(sum1, sum2);
-
-  return Packet2cf(sum);
+  std::complex<float> s;
+  vst1_f32((float *)&s, a.v);
+  return s;
 }
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   float32x2_t a1, a2, v1, v2, prod;
@@ -201,87 +330,72 @@
   v1 = vmul_f32(v1, a2);
   // Multiply the imag a with b
   v2 = vmul_f32(v2, a2);
-  // Conjugate v2 
+  // Conjugate v2
   v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64_f32(v2);
   // Add v1, v2
   prod = vadd_f32(v1, v2);
 
-  vst1_f32((float *)&s, prod);
+  vst1_f32(reinterpret_cast<float*>(&s), prod);
 
   return s;
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
 {
-  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = vextq_f32(first.v, second.v, 2);
-    }
-  }
-};
+  // TODO optimize it for NEON
+  Packet1cf res = pmul(a, pconj(b));
+  Packet2f s, rev_s;
 
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+  // this computes the norm
+  s = vmul_f32(b.v, b.v);
+  rev_s = vrev64_f32(s);
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
+  return Packet1cf(pdiv<Packet2f>(res.v, vadd_f32(s, rev_s)));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for NEON
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet2cf res = pmul(a,pconj(b));
   Packet4f s, rev_s;
 
   // this computes the norm
   s = vmulq_f32(b.v, b.v);
   rev_s = vrev64q_f32(s);
 
-  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s, rev_s)));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel)
+{
   Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
   kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+  return psqrt_complex<Packet1cf>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
-static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
+  static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
+#else
+  const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
+  static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
+#endif
 
 struct Packet1cd
 {
@@ -294,7 +408,8 @@
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
-  enum {
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
@@ -313,32 +428,58 @@
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd>
+{
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum
+  {
+    size=1,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from))); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
+{
+  /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(padd<Packet2d>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(psub<Packet2d>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
+{ return Packet1cd(pnegate<Packet2d>(a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
+{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   Packet2d v1, v2;
 
-  // Get the real values of a 
+  // Get the real values of a
   v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
-  // Get the real values of a 
+  // Get the imag values of a
   v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
   // Multiply the real a with b
   v1 = vmulq_f64(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f64(v2, b.v);
-  // Conjugate v2 
+  // Conjugate v2
   v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
   // Swap real/imag elements in v2.
   v2 = preverse<Packet2d>(v2);
@@ -346,31 +487,44 @@
   return Packet1cd(vaddq_f64(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
 {
-  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(pand<Packet2d>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
 template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
+{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
+{ return pset1<Packet1cd>(*from); }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, int stride)
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr)
+{ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr)); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+    const std::complex<double>* from, Index stride)
 {
   Packet2d res = pset1<Packet2d>(0.0);
   res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
@@ -378,17 +532,14 @@
   return Packet1cd(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, int stride)
-{
-  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
-}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
+    std::complex<double>* to, const Packet1cd& from, Index stride)
+{ to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
 
-
-template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res;
+  EIGEN_ALIGN16 std::complex<double> res;
   pstore<std::complex<double> >(&res, a);
-
   return res;
 }
 
@@ -396,57 +547,14 @@
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
-
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for NEON
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet1cd res = pmul(a,pconj(b));
   Packet2d s = pmul<Packet2d>(b.v, b.v);
   Packet2d rev_s = preverse<Packet2d>(s);
 
@@ -454,9 +562,7 @@
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
-  return Packet1cd(preverse(Packet2d(x.v)));
-}
+{ return Packet1cd(preverse(Packet2d(x.v))); }
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
 {
@@ -464,8 +570,12 @@
   kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
-#endif // EIGEN_ARCH_ARM64
 
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+#endif // EIGEN_ARCH_ARM64
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
new file mode 100644
index 0000000..3481f33
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h

@@ -0,0 +1,183 @@
+namespace Eigen {
+namespace internal {
+  
+#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
+// Here we specialize gebp_traits to eliminate these register spills.
+// See #2138.
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  { 
+    // This volatile inline ASM both acts as a barrier to prevent reordering,
+    // as well as enforces strict register use.
+    asm volatile(
+      "vmla.f32 %q[r], %q[c], %q[alpha]"
+      : [r] "+w" (r)
+      : [c] "w" (c),
+        [alpha] "w" (alpha)
+      : );
+  }
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
+                                Packet4f& c, Packet4f& tmp,
+                                const LaneIdType&) const {
+    acc(a, b, c);
+  }
+  
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
+                                Packet4f& c, Packet4f& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+};
+
+#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+#if EIGEN_ARCH_ARM64
+
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+  typedef float RhsPacket;
+  typedef float32x4_t RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest = vld1q_f32(b);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f32(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  { madd_helper<0>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+  { madd_helper<1>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+  { madd_helper<2>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+  { madd_helper<3>(a, b, c); }
+
+ private:
+  template<int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f32 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    #else
+    c = vfmaq_laneq_f32(c, a, b, LaneID);
+    #endif
+  }
+};
+
+
+template<>
+struct gebp_traits <double,double,false,false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+  typedef double RhsPacket;
+
+  struct RhsPacketx4 {
+    float64x2_t B_0, B_1;
+  };
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest.B_0 = vld1q_f64(b);
+    dest.B_1 = vld1q_f64(b+2);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f64(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  { madd_helper<0>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+  { madd_helper<1>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+  { madd_helper<2>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+  { madd_helper<3>(a, b, c); }
+
+ private:
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f64 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    #else
+         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+    #endif
+  }
+};
+
+#endif // EIGEN_ARCH_ARM64
+
+}  // namespace internal
+}  // namespace Eigen

diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 6bb05bb..fa6615a 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h

@@ -5,10 +5,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
 #ifndef EIGEN_MATH_FUNCTIONS_NEON_H
 #define EIGEN_MATH_FUNCTIONS_NEON_H
 
@@ -16,74 +12,62 @@
 
 namespace internal {
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  Packet4f tmp, fx;
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp<Packet2f>(const Packet2f& x)
+{ return pexp_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp<Packet4f>(const Packet4f& x)
+{ return pexp_float(x); }
 
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog<Packet2f>(const Packet2f& x)
+{ return plog_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog<Packet4f>(const Packet4f& x)
+{ return plog_float(x); }
 
-  x = vminq_f32(x, p4f_exp_hi);
-  x = vmaxq_f32(x, p4f_exp_lo);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin<Packet2f>(const Packet2f& x)
+{ return psin_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin<Packet4f>(const Packet4f& x)
+{ return psin_float(x); }
 
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos<Packet2f>(const Packet2f& x)
+{ return pcos_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos<Packet4f>(const Packet4f& x)
+{ return pcos_float(x); }
 
-  /* perform a floorf */
-  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+// Hyperbolic Tangent function.
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh<Packet2f>(const Packet2f& x)
+{ return internal::generic_fast_tanh_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh<Packet4f>(const Packet4f& x)
+{ return internal::generic_fast_tanh_float(x); }
 
-  /* if greater, substract 1 */
-  Packet4ui mask = vcgtq_f32(tmp, fx);
-  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
 
-  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-
-  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
-  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
-  x = vsubq_f32(x, tmp);
-  x = vsubq_f32(x, z);
-
-  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
-  z = vmulq_f32(x, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p1);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p2);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p3);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p4);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p5);
-
-  y = vmulq_f32(y, z);
-  y = vaddq_f32(y, x);
-  y = vaddq_f32(y, p4f_1);
-
-  /* build 2^n */
-  int32x4_t mm;
-  mm = vcvtq_s32_f32(fx);
-  mm = vaddq_s32(mm, p4i_0x7f);
-  mm = vshlq_n_s32(mm, 23);
-  Packet4f pow2n = vreinterpretq_f32_s32(mm);
-
-  y = vmulq_f32(y, pow2n);
-  return y;
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) {
+  Packet4f fexponent;
+  const Packet4bf out = F32ToBf16(pfrexp<Packet4f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) {
+  return F32ToBf16(pldexp<Packet4f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+//---------- double ----------
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& x)
+{ return pexp_double(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog<Packet2d>(const Packet2d& x)
+{ return plog_double(x); }
+
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index bb8bce3..6996cc8 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -17,142 +17,961 @@
 namespace internal {
 
 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
-#endif
-
-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
 #endif
 
-typedef float32x2_t Packet2f;
-typedef float32x4_t Packet4f;
-typedef int32x4_t   Packet4i;
-typedef int32x2_t   Packet2i;
-typedef uint32x4_t  Packet4ui;
+#if EIGEN_COMP_MSVC_STRICT
+
+// In MSVC's arm_neon.h header file, all NEON vector types
+// are aliases to the same underlying type __n128.
+// We thus have to wrap them to make them different C++ types.
+// (See also bug 1428)
+typedef eigen_packet_wrapper<float32x2_t,0>  Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1>  Packet4f;
+typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
+typedef eigen_packet_wrapper<int8x8_t   ,3>  Packet8c;
+typedef eigen_packet_wrapper<int8x16_t  ,4>  Packet16c;
+typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
+typedef eigen_packet_wrapper<uint8x8_t  ,6>  Packet8uc;
+typedef eigen_packet_wrapper<uint8x16_t ,7>  Packet16uc;
+typedef eigen_packet_wrapper<int16x4_t  ,8>  Packet4s;
+typedef eigen_packet_wrapper<int16x8_t  ,9>  Packet8s;
+typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
+typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
+typedef eigen_packet_wrapper<int32x2_t  ,12> Packet2i;
+typedef eigen_packet_wrapper<int32x4_t  ,13> Packet4i;
+typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
+typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
+typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
+typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
+
+#else
+
+typedef float32x2_t                          Packet2f;
+typedef float32x4_t                          Packet4f;
+typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
+typedef int8x8_t                             Packet8c;
+typedef int8x16_t                            Packet16c;
+typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
+typedef uint8x8_t                            Packet8uc;
+typedef uint8x16_t                           Packet16uc;
+typedef int16x4_t                            Packet4s;
+typedef int16x8_t                            Packet8s;
+typedef uint16x4_t                           Packet4us;
+typedef uint16x8_t                           Packet8us;
+typedef int32x2_t                            Packet2i;
+typedef int32x4_t                            Packet4i;
+typedef uint32x2_t                           Packet2ui;
+typedef uint32x4_t                           Packet4ui;
+typedef int64x2_t                            Packet2l;
+typedef uint64x2_t                           Packet2ul;
+
+#endif // EIGEN_COMP_MSVC_STRICT
+
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  return res;
+}
+
+// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
+// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
+// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
+// to enable a shared implementation for fast inversion of matrices of size 4. 
+template<bool interleave> 
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+template<> 
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
+{ 
+  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
+{ 
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+}
+#define vec4f_duplane(a, p) \
+  vdupq_lane_f32(vget_low_f32(a), p)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif
-
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+  // prefetch instructions there are too detailed for __builtin_prefetch to map
+  // meaningfully to them.
+  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
   #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
   #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
-  #define EIGEN_ARM_PREFETCH(ADDR) asm volatile ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
   // by default no explicit prefetching
   #define EIGEN_ARM_PREFETCH(ADDR)
 #endif
 
-template<> struct packet_traits<float>  : default_packet_traits
+template <>
+struct packet_traits<float> : default_packet_traits
 {
   typedef Packet4f type;
-  typedef Packet4f half; // Packet2f intrinsics not implemented yet
-  enum {
+  typedef Packet2f half;
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
+    HasHalfPacket = 1,
 
-    HasDiv  = 1,
-    // FIXME check the Has*
-    HasSin  = 0,
-    HasCos  = 0,
-    HasTanH = 1,
-    HasLog  = 0,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasDiv   = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
     HasExp  = 1,
-    HasSqrt = 0
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf  = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
+
+template <>
+struct packet_traits<int8_t> : default_packet_traits
 {
-  typedef Packet4i type;
-  typedef Packet4i half; // Packet2i intrinsics not implemented yet
-  enum {
+  typedef Packet16c type;
+  typedef Packet8c half;
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
-    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
-    // FIXME check the Has*
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
   };
 };
 
-#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
-// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
+template <>
+struct packet_traits<uint8_t> : default_packet_traits
+{
+  typedef Packet16uc type;
+  typedef Packet8uc half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int16_t> : default_packet_traits
+{
+  typedef Packet8s type;
+  typedef Packet4s half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint16_t> : default_packet_traits
+{
+  typedef Packet8us type;
+  typedef Packet4us half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet2i half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint32_t> : default_packet_traits
+{
+  typedef Packet4ui type;
+  typedef Packet2ui half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits
+{
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint64_t> : default_packet_traits
+{
+  typedef Packet2ul type;
+  typedef Packet2ul half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
+// workaround gcc 4.2, 4.3 and 4.4 compilation issue
 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
-EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
+EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
+EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
+template<> struct unpacket_traits<Packet2f>
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
-  return vaddq_f32(pset1<Packet4f>(a), countdown);
+  typedef float type;
+  typedef Packet2f half;
+  typedef Packet2i integer_packet;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4f>
+{
+  typedef float type;
+  typedef Packet2f half;
+  typedef Packet4i integer_packet;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4c>
+{
+  typedef int8_t type;
+  typedef Packet4c half;
+  enum
+  {
+    size = 4,
+    alignment = Unaligned,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8c>
+{
+  typedef int8_t type;
+  typedef Packet4c half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet16c>
+{
+  typedef int8_t type;
+  typedef Packet8c half;
+  enum
+  {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4uc>
+{
+  typedef uint8_t type;
+  typedef Packet4uc half;
+  enum
+  {
+    size = 4,
+    alignment = Unaligned,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8uc>
+{
+  typedef uint8_t type;
+  typedef Packet4uc half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet16uc>
+{
+  typedef uint8_t type;
+  typedef Packet8uc half;
+  enum
+  {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false};
+};
+template<> struct unpacket_traits<Packet4s>
+{
+  typedef int16_t type;
+  typedef Packet4s half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8s>
+{
+  typedef int16_t type;
+  typedef Packet4s half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4us>
+{
+  typedef uint16_t type;
+  typedef Packet4us half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8us>
+{
+  typedef uint16_t type;
+  typedef Packet4us half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2i>
+{
+  typedef int32_t type;
+  typedef Packet2i half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4i>
+{
+  typedef int32_t type;
+  typedef Packet2i half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2ui>
+{
+  typedef uint32_t type;
+  typedef Packet2ui half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4ui>
+{
+  typedef uint32_t type;
+  typedef Packet2ui half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2l>
+{
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2ul>
+{
+  typedef uint64_t type;
+  typedef Packet2ul half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
+{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
+{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
+{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
+{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
+{
+  const float c[] = {0.0f,1.0f};
+  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
-  return vaddq_s32(pset1<Packet4i>(a), countdown);
+  const float c[] = {0.0f,1.0f,2.0f,3.0f};
+  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
+{
+  const int8_t c[] = {0,1,2,3,4,5,6,7};
+  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
+{
+  const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
+{
+  const uint8_t c[] = {0,1,2,3,4,5,6,7};
+  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
+{
+  const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
+{
+  const int16_t c[] = {0,1,2,3};
+  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
+{
+  const uint16_t c[] = {0,1,2,3};
+  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
+{
+  const int16_t c[] = {0,1,2,3,4,5,6,7};
+  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
+{
+  const uint16_t c[] = {0,1,2,3,4,5,6,7};
+  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
+{
+  const int32_t c[] = {0,1};
+  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
+{
+  const int32_t c[] = {0,1,2,3};
+  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
+{
+  const uint32_t c[] = {0,1};
+  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
+{
+  const uint32_t c[] = {0,1,2,3};
+  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
+{
+  const int64_t c[] = {0,1};
+  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
+{
+  const uint64_t c[] = {0,1};
+  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+  Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
+  return padd(a, pxor(mask, b));
+}
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
+  return padd(a, pxor(mask, b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vnegq_s64(a);
+#else
+  return vcombine_s64(
+      vdup_n_s64(-vgetq_lane_s64(a, 0)),
+      vdup_n_s64(-vgetq_lane_s64(a, 1)));
+#endif
+}
 
+template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
 
+template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pselect<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& false_mask) {
-  return vbslq_f32(vreinterpretq_u32_f32(false_mask), b, a);
+template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+    vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
+    vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
 }
-template<> EIGEN_STRONG_INLINE Packet4i pselect<Packet4i>(const Packet4i& a, const Packet4i& b, const Packet4i& false_mask) {
-  return vbslq_s32(vreinterpretq_u32_s32(false_mask), b, a);
+template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+    vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
+    vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdiv_f32(a,b);
+#else
+  Packet2f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpe_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecps_f32(b, inv);
+  inv = vmul_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmul_f32(a, inv);
+
+  return div;
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
 #if EIGEN_ARCH_ARM64
@@ -179,356 +998,2629 @@
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2i>(0);
+}
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by NEON");
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
   return pset1<Packet4i>(0);
 }
+template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2l>(0LL);
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ul>(0ULL);
+}
 
-// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
-// then implements a slow software scalar fallback calling fmaf()!
-#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
-// See bug 936.
-// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
-// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
-// MLA is not fused i.e. does 2 roundings.
-// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
-// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
+
+#ifdef __ARM_FEATURE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{ return vfmaq_f32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{ return vfma_f32(c,a,b); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
-#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
-  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
-  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
-  // -march=armv7-a, that is a very common case.
-  // See e.g. this thread:
-  //     http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
-  Packet4f r = c;
-  asm volatile(
-    "vmla.f32 %q[r], %q[a], %q[b]"
-    : [r] "+w" (r)
-    : [a] "w" (a),
-      [b] "w" (b)
-    : );
-  return r;
-#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
   return vmlaq_f32(c,a,b);
-#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{
+  return vmla_f32(c,a,b);
 }
 #endif
 
 // No FMA instruction for int, so use MLA unconditionally.
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
+      vreinterpret_s8_s32(vdup_n_s32(c)),
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
+{ return vmla_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
+{ return vmlaq_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
+      vreinterpret_u8_u32(vdup_n_u32(c)),
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
+{ return vmla_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
+{ return vmlaq_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
+{ return vmla_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
+{ return vmlaq_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
+{ return vmla_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
+{ return vmlaq_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
+{ return vmla_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
+{ return vmlaq_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
+{ return vmla_u32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
+{ return vmlaq_u32(c,a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vabd_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vabdq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vabd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vabdq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vabd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vabdq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vabd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vabdq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vabd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vabdq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vabd_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vabdq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vabd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vabdq_u32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
 
+template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
 
-// TODO(ebrevdo): add support for ple, plt, peq using vcle_f32/s32 or
-// vcleq_f32/s32, and their ilk, respectively, once it's clear which condition code to use.
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vcle_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vcleq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vcle_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vcleq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vcle_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vcleq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcleq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vcleq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vclt_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vcltq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vclt_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vcltq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vclt_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vcltq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcltq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vcltq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vceq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vceqq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vceq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vceqq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vceq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vceqq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vceqq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vceqq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
 
 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
+{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a & b; }
+template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vand_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vandq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a & b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vand_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vandq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vand_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vandq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vand_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vandq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vandq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
+{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a | b; }
+template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vorrq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a | b; }
+template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vorr_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vorrq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vorr_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vorrq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vorr_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vorrq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vorr_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vorrq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return vorrq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vorrq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
+{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a ^ b; }
+template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return veor_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return veorq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a ^ b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return veor_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return veorq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return veor_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return veorq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return veor_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return veorq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return veorq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return veorq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a & ~b; }
+template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a & ~b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vbic_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vbicq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vbic_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vbicq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vbic_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vbicq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vbic_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vbicq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vbic_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vbicq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return vbicq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vbicq_u64(a,b); }
+
+
+template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+
+template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
+{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
+{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
+{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
+{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
+{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
+{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
+{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+
+template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
 {
-  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
-  float32x2_t lo, hi;
-  lo = vld1_dup_f32(from);
-  hi = vld1_dup_f32(from+1);
-  return vcombine_f32(lo, hi);
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
-  int32x2_t lo, hi;
-  lo = vld1_dup_s32(from);
-  hi = vld1_dup_s32(from+1);
-  return vcombine_s32(lo, hi);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
-{
-  Packet4f res = pset1<Packet4f>(0);
-  res = vsetq_lane_f32(from[0*stride], res, 0);
-  res = vsetq_lane_f32(from[1*stride], res, 1);
-  res = vsetq_lane_f32(from[2*stride], res, 2);
-  res = vsetq_lane_f32(from[3*stride], res, 3);
+  Packet4c res;
+  memcpy(&res, from, sizeof(Packet4c));
   return res;
 }
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
+template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
 {
-  Packet4i res = pset1<Packet4i>(0);
-  res = vsetq_lane_s32(from[0*stride], res, 0);
-  res = vsetq_lane_s32(from[1*stride], res, 1);
-  res = vsetq_lane_s32(from[2*stride], res, 2);
-  res = vsetq_lane_s32(from[3*stride], res, 3);
+  Packet4uc res;
+  memcpy(&res, from, sizeof(Packet4uc));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
+{
+  Packet4c res;
+  memcpy(&res, from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
+{
+  Packet4uc res;
+  memcpy(&res, from, sizeof(Packet4uc));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
+{ return vld1_dup_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
+{
+  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
+  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
+{
+  const int8x8_t a = vld1_s8(from);
+  return vzip_s8(a,a).val[0];
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
+{
+  const int8x8_t a = vld1_s8(from);
+  const int8x8x2_t b = vzip_s8(a,a);
+  return vcombine_s8(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
+  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vld1_u8(from);
+  return vzip_u8(a,a).val[0];
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vld1_u8(from);
+  const uint8x8x2_t b = vzip_u8(a,a);
+  return vcombine_u8(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
+{
+  return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
+      vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
+{
+  const int16x4_t a = vld1_s16(from);
+  const int16x4x2_t b = vzip_s16(a,a);
+  return vcombine_s16(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
+{
+  return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
+      vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
+{
+  const uint16x4_t a = vld1_u16(from);
+  const uint16x4x2_t b = vzip_u16(a,a);
+  return vcombine_u16(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
+{ return vld1_dup_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
+{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
+{ return vld1_dup_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
+{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
+{ return vld1q_dup_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
+{ return vld1q_dup_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
+{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
+{
+  return vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
+{
+  const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+  const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from+2)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
+  return vcombine_s8(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
+{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
+{
+  return vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+  const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from+2)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
+  return vcombine_u8(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
+{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
+{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
+{
+  Packet2f res = vld1_dup_f32(from);
+  res = vld1_lane_f32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = vld1q_dup_f32(from);
+  res = vld1q_lane_f32(from + 1*stride, res, 1);
+  res = vld1q_lane_f32(from + 2*stride, res, 2);
+  res = vld1q_lane_f32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
+{
+  Packet4c res;
+  for (int i = 0; i != 4; i++)
+    reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
+{
+  Packet8c res = vld1_dup_s8(from);
+  res = vld1_lane_s8(from + 1*stride, res, 1);
+  res = vld1_lane_s8(from + 2*stride, res, 2);
+  res = vld1_lane_s8(from + 3*stride, res, 3);
+  res = vld1_lane_s8(from + 4*stride, res, 4);
+  res = vld1_lane_s8(from + 5*stride, res, 5);
+  res = vld1_lane_s8(from + 6*stride, res, 6);
+  res = vld1_lane_s8(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
+{
+  Packet16c res = vld1q_dup_s8(from);
+  res = vld1q_lane_s8(from + 1*stride, res, 1);
+  res = vld1q_lane_s8(from + 2*stride, res, 2);
+  res = vld1q_lane_s8(from + 3*stride, res, 3);
+  res = vld1q_lane_s8(from + 4*stride, res, 4);
+  res = vld1q_lane_s8(from + 5*stride, res, 5);
+  res = vld1q_lane_s8(from + 6*stride, res, 6);
+  res = vld1q_lane_s8(from + 7*stride, res, 7);
+  res = vld1q_lane_s8(from + 8*stride, res, 8);
+  res = vld1q_lane_s8(from + 9*stride, res, 9);
+  res = vld1q_lane_s8(from + 10*stride, res, 10);
+  res = vld1q_lane_s8(from + 11*stride, res, 11);
+  res = vld1q_lane_s8(from + 12*stride, res, 12);
+  res = vld1q_lane_s8(from + 13*stride, res, 13);
+  res = vld1q_lane_s8(from + 14*stride, res, 14);
+  res = vld1q_lane_s8(from + 15*stride, res, 15);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
+{
+  Packet4uc res;
+  for (int i = 0; i != 4; i++)
+    reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
+{
+  Packet8uc res = vld1_dup_u8(from);
+  res = vld1_lane_u8(from + 1*stride, res, 1);
+  res = vld1_lane_u8(from + 2*stride, res, 2);
+  res = vld1_lane_u8(from + 3*stride, res, 3);
+  res = vld1_lane_u8(from + 4*stride, res, 4);
+  res = vld1_lane_u8(from + 5*stride, res, 5);
+  res = vld1_lane_u8(from + 6*stride, res, 6);
+  res = vld1_lane_u8(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
+{
+  Packet16uc res = vld1q_dup_u8(from);
+  res = vld1q_lane_u8(from + 1*stride, res, 1);
+  res = vld1q_lane_u8(from + 2*stride, res, 2);
+  res = vld1q_lane_u8(from + 3*stride, res, 3);
+  res = vld1q_lane_u8(from + 4*stride, res, 4);
+  res = vld1q_lane_u8(from + 5*stride, res, 5);
+  res = vld1q_lane_u8(from + 6*stride, res, 6);
+  res = vld1q_lane_u8(from + 7*stride, res, 7);
+  res = vld1q_lane_u8(from + 8*stride, res, 8);
+  res = vld1q_lane_u8(from + 9*stride, res, 9);
+  res = vld1q_lane_u8(from + 10*stride, res, 10);
+  res = vld1q_lane_u8(from + 11*stride, res, 11);
+  res = vld1q_lane_u8(from + 12*stride, res, 12);
+  res = vld1q_lane_u8(from + 13*stride, res, 13);
+  res = vld1q_lane_u8(from + 14*stride, res, 14);
+  res = vld1q_lane_u8(from + 15*stride, res, 15);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
+{
+  Packet4s res = vld1_dup_s16(from);
+  res = vld1_lane_s16(from + 1*stride, res, 1);
+  res = vld1_lane_s16(from + 2*stride, res, 2);
+  res = vld1_lane_s16(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
+{
+  Packet8s res = vld1q_dup_s16(from);
+  res = vld1q_lane_s16(from + 1*stride, res, 1);
+  res = vld1q_lane_s16(from + 2*stride, res, 2);
+  res = vld1q_lane_s16(from + 3*stride, res, 3);
+  res = vld1q_lane_s16(from + 4*stride, res, 4);
+  res = vld1q_lane_s16(from + 5*stride, res, 5);
+  res = vld1q_lane_s16(from + 6*stride, res, 6);
+  res = vld1q_lane_s16(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
+{
+  Packet4us res = vld1_dup_u16(from);
+  res = vld1_lane_u16(from + 1*stride, res, 1);
+  res = vld1_lane_u16(from + 2*stride, res, 2);
+  res = vld1_lane_u16(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
+{
+  Packet8us res = vld1q_dup_u16(from);
+  res = vld1q_lane_u16(from + 1*stride, res, 1);
+  res = vld1q_lane_u16(from + 2*stride, res, 2);
+  res = vld1q_lane_u16(from + 3*stride, res, 3);
+  res = vld1q_lane_u16(from + 4*stride, res, 4);
+  res = vld1q_lane_u16(from + 5*stride, res, 5);
+  res = vld1q_lane_u16(from + 6*stride, res, 6);
+  res = vld1q_lane_u16(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
+{
+  Packet2i res = vld1_dup_s32(from);
+  res = vld1_lane_s32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+{
+  Packet4i res = vld1q_dup_s32(from);
+  res = vld1q_lane_s32(from + 1*stride, res, 1);
+  res = vld1q_lane_s32(from + 2*stride, res, 2);
+  res = vld1q_lane_s32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
+{
+  Packet2ui res = vld1_dup_u32(from);
+  res = vld1_lane_u32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
+{
+  Packet4ui res = vld1q_dup_u32(from);
+  res = vld1q_lane_u32(from + 1*stride, res, 1);
+  res = vld1q_lane_u32(from + 2*stride, res, 2);
+  res = vld1q_lane_u32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
+{
+  Packet2l res = vld1q_dup_s64(from);
+  res = vld1q_lane_s64(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
+{
+  Packet2ul res = vld1q_dup_u64(from);
+  res = vld1q_lane_u64(from + 1*stride, res, 1);
   return res;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
 {
-  to[stride*0] = vgetq_lane_f32(from, 0);
-  to[stride*1] = vgetq_lane_f32(from, 1);
-  to[stride*2] = vgetq_lane_f32(from, 2);
-  to[stride*3] = vgetq_lane_f32(from, 3);
+  vst1_lane_f32(to + stride*0, from, 0);
+  vst1_lane_f32(to + stride*1, from, 1);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  to[stride*0] = vgetq_lane_s32(from, 0);
-  to[stride*1] = vgetq_lane_s32(from, 1);
-  to[stride*2] = vgetq_lane_s32(from, 2);
-  to[stride*3] = vgetq_lane_s32(from, 3);
+  vst1q_lane_f32(to + stride*0, from, 0);
+  vst1q_lane_f32(to + stride*1, from, 1);
+  vst1q_lane_f32(to + stride*2, from, 2);
+  vst1q_lane_f32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
+{
+  for (int i = 0; i != 4; i++)
+    *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
+{
+  vst1_lane_s8(to + stride*0, from, 0);
+  vst1_lane_s8(to + stride*1, from, 1);
+  vst1_lane_s8(to + stride*2, from, 2);
+  vst1_lane_s8(to + stride*3, from, 3);
+  vst1_lane_s8(to + stride*4, from, 4);
+  vst1_lane_s8(to + stride*5, from, 5);
+  vst1_lane_s8(to + stride*6, from, 6);
+  vst1_lane_s8(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
+{
+  vst1q_lane_s8(to + stride*0, from, 0);
+  vst1q_lane_s8(to + stride*1, from, 1);
+  vst1q_lane_s8(to + stride*2, from, 2);
+  vst1q_lane_s8(to + stride*3, from, 3);
+  vst1q_lane_s8(to + stride*4, from, 4);
+  vst1q_lane_s8(to + stride*5, from, 5);
+  vst1q_lane_s8(to + stride*6, from, 6);
+  vst1q_lane_s8(to + stride*7, from, 7);
+  vst1q_lane_s8(to + stride*8, from, 8);
+  vst1q_lane_s8(to + stride*9, from, 9);
+  vst1q_lane_s8(to + stride*10, from, 10);
+  vst1q_lane_s8(to + stride*11, from, 11);
+  vst1q_lane_s8(to + stride*12, from, 12);
+  vst1q_lane_s8(to + stride*13, from, 13);
+  vst1q_lane_s8(to + stride*14, from, 14);
+  vst1q_lane_s8(to + stride*15, from, 15);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
+{
+  for (int i = 0; i != 4; i++)
+    *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
+{
+  vst1_lane_u8(to + stride*0, from, 0);
+  vst1_lane_u8(to + stride*1, from, 1);
+  vst1_lane_u8(to + stride*2, from, 2);
+  vst1_lane_u8(to + stride*3, from, 3);
+  vst1_lane_u8(to + stride*4, from, 4);
+  vst1_lane_u8(to + stride*5, from, 5);
+  vst1_lane_u8(to + stride*6, from, 6);
+  vst1_lane_u8(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
+{
+  vst1q_lane_u8(to + stride*0, from, 0);
+  vst1q_lane_u8(to + stride*1, from, 1);
+  vst1q_lane_u8(to + stride*2, from, 2);
+  vst1q_lane_u8(to + stride*3, from, 3);
+  vst1q_lane_u8(to + stride*4, from, 4);
+  vst1q_lane_u8(to + stride*5, from, 5);
+  vst1q_lane_u8(to + stride*6, from, 6);
+  vst1q_lane_u8(to + stride*7, from, 7);
+  vst1q_lane_u8(to + stride*8, from, 8);
+  vst1q_lane_u8(to + stride*9, from, 9);
+  vst1q_lane_u8(to + stride*10, from, 10);
+  vst1q_lane_u8(to + stride*11, from, 11);
+  vst1q_lane_u8(to + stride*12, from, 12);
+  vst1q_lane_u8(to + stride*13, from, 13);
+  vst1q_lane_u8(to + stride*14, from, 14);
+  vst1q_lane_u8(to + stride*15, from, 15);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
+{
+  vst1_lane_s16(to + stride*0, from, 0);
+  vst1_lane_s16(to + stride*1, from, 1);
+  vst1_lane_s16(to + stride*2, from, 2);
+  vst1_lane_s16(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
+{
+  vst1q_lane_s16(to + stride*0, from, 0);
+  vst1q_lane_s16(to + stride*1, from, 1);
+  vst1q_lane_s16(to + stride*2, from, 2);
+  vst1q_lane_s16(to + stride*3, from, 3);
+  vst1q_lane_s16(to + stride*4, from, 4);
+  vst1q_lane_s16(to + stride*5, from, 5);
+  vst1q_lane_s16(to + stride*6, from, 6);
+  vst1q_lane_s16(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
+{
+  vst1_lane_u16(to + stride*0, from, 0);
+  vst1_lane_u16(to + stride*1, from, 1);
+  vst1_lane_u16(to + stride*2, from, 2);
+  vst1_lane_u16(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
+{
+  vst1q_lane_u16(to + stride*0, from, 0);
+  vst1q_lane_u16(to + stride*1, from, 1);
+  vst1q_lane_u16(to + stride*2, from, 2);
+  vst1q_lane_u16(to + stride*3, from, 3);
+  vst1q_lane_u16(to + stride*4, from, 4);
+  vst1q_lane_u16(to + stride*5, from, 5);
+  vst1q_lane_u16(to + stride*6, from, 6);
+  vst1q_lane_u16(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
+{
+  vst1_lane_s32(to + stride*0, from, 0);
+  vst1_lane_s32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+{
+  vst1q_lane_s32(to + stride*0, from, 0);
+  vst1q_lane_s32(to + stride*1, from, 1);
+  vst1q_lane_s32(to + stride*2, from, 2);
+  vst1q_lane_s32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
+{
+  vst1_lane_u32(to + stride*0, from, 0);
+  vst1_lane_u32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
+{
+  vst1q_lane_u32(to + stride*0, from, 0);
+  vst1q_lane_u32(to + stride*1, from, 1);
+  vst1q_lane_u32(to + stride*2, from, 2);
+  vst1q_lane_u32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
+{
+  vst1q_lane_s64(to + stride*0, from, 0);
+  vst1q_lane_s64(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
+{
+  vst1q_lane_u64(to + stride*0, from, 0);
+  vst1q_lane_u64(to + stride*1, from, 1);
 }
 
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
 
-// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
+template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
+template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
+template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
+template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
+template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
+template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
+template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
+template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
-  float32x2_t a_lo, a_hi;
-  Packet4f a_r64;
-
-  a_r64 = vrev64q_f32(a);
-  a_lo = vget_low_f32(a_r64);
-  a_hi = vget_high_f32(a_r64);
-  return vcombine_f32(a_hi, a_lo);
+template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  const float32x4_t a_r64 = vrev64q_f32(a);
+  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
-  int32x2_t a_lo, a_hi;
-  Packet4i a_r64;
-
-  a_r64 = vrev64q_s32(a);
-  a_lo = vget_low_s32(a_r64);
-  a_hi = vget_high_s32(a_r64);
-  return vcombine_s32(a_hi, a_lo);
+template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
+{
+  const int8x16_t a_r64 = vrev64q_s8(a);
+  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
 }
+template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
+template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
+{
+  const uint8x16_t a_r64 = vrev64q_u8(a);
+  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
+{
+  const int16x8_t a_r64 = vrev64q_s16(a);
+  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
+template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
+{
+  const uint16x8_t a_r64 = vrev64q_u16(a);
+  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  const int32x4_t a_r64 = vrev64q_s32(a);
+  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
+template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
+{
+  const uint32x4_t a_r64 = vrev64q_u32(a);
+  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
+{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
+template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
+{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vabsq_s64(a);
+#else
+  return vcombine_s64(
+      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
+      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
 
+template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
+{ return pfrexp_generic(a,exponent); }
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
+{ return pfrexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
+{ return pldexp_generic(a,exponent); }
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
+{ return pldexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
-  float32x2_t a_lo, a_hi, sum;
-
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  sum = vpadd_f32(a_lo, a_hi);
-  sum = vpadd_f32(sum, sum);
-  return vget_lane_f32(sum, 0);
+  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpadd_f32(sum, sum), 0);
 }
-
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
 {
-  float32x4x2_t vtrn1, vtrn2, res1, res2;
-  Packet4f sum1, sum2, sum;
-
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
-  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
-  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
-  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
-
-  // Do the addition of the resulting vectors
-  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
-  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
-  sum = vaddq_f32(sum1, sum2);
-
-  return sum;
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t sum = vpadd_s8(a_dup, a_dup);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
 }
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
 {
-  int32x2_t a_lo, a_hi, sum;
-
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  sum = vpadd_s32(a_lo, a_hi);
-  sum = vpadd_s32(sum, sum);
-  return vget_lane_s32(sum, 0);
+  int8x8_t sum = vpadd_s8(a,a);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
 }
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
 {
-  int32x4x2_t vtrn1, vtrn2, res1, res2;
-  Packet4i sum1, sum2, sum;
-
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
-  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
-  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
-  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
-
-  // Do the addition of the resulting vectors
-  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
-  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
-  sum = vaddq_s32(sum1, sum2);
-
-  return sum;
+  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
 }
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t sum = vpadd_u8(a,a);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t sum = vpadd_s16(a,a);
+  return vget_lane_s16(vpadd_s16(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
+{
+  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
+  sum = vpadd_s16(sum, sum);
+  sum = vpadd_s16(sum, sum);
+  return vget_lane_s16(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t sum = vpadd_u16(a,a);
+  return vget_lane_u16(vpadd_u16(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+  sum = vpadd_u16(sum, sum);
+  sum = vpadd_u16(sum, sum);
+  return vget_lane_u16(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpadd_s32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpadd_u32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
+{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
+template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
+{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
+      vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
+{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
+{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
+{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
+{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
 
 // Other reduction functions:
 // mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
 {
-  float32x2_t a_lo, a_hi, prod;
-
-  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
-  prod = vmul_f32(a_lo, a_hi);
-  // Multiply prod with its swapped value |a2*a4|a1*a3|
-  prod = vmul_f32(prod, vrev64_f32(prod));
-
-  return vget_lane_f32(prod, 0);
+  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
+  prod = vmul_s8(prod, vrev16_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
 {
-  int32x2_t a_lo, a_hi, prod;
-
-  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
-  prod = vmul_s32(a_lo, a_hi);
-  // Multiply prod with its swapped value |a2*a4|a1*a3|
-  prod = vmul_s32(prod, vrev64_s32(prod));
-
-  return vget_lane_s32(prod, 0);
+  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
+  prod = vmul_s8(prod, vrev32_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
 }
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
+{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
+{
+  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
+  prod = vmul_u8(prod, vrev16_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
+  prod = vmul_u8(prod, vrev32_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
+{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
+{
+  int16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_s16(prod, vrev32_s16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_u16(prod, vrev32_u16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
+{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
+{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
+{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
+{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
 
 // min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(vpmin_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
-  float32x2_t a_lo, a_hi, min;
-
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  min = vpmin_f32(a_lo, a_hi);
-  min = vpmin_f32(min, min);
-
-  return vget_lane_f32(min, 0);
+  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmin_f32(min, min), 0);
 }
-
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
 {
-  int32x2_t a_lo, a_hi, min;
-
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  min = vpmin_s32(a_lo, a_hi);
-  min = vpmin_s32(min, min);
-
-  return vget_lane_s32(min, 0);
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t min = vpmin_s8(a_dup, a_dup);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
 }
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
+{
+  int8x8_t min = vpmin_s8(a,a);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
+{
+  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t min = vpmin_u8(a_dup, a_dup);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t min = vpmin_u8(a,a);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t min = vpmin_s16(a,a);
+  return vget_lane_s16(vpmin_s16(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
+{
+  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
+  min = vpmin_s16(min, min);
+  min = vpmin_s16(min, min);
+  return vget_lane_s16(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t min = vpmin_u16(a,a);
+  return vget_lane_u16(vpmin_u16(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
+  min = vpmin_u16(min, min);
+  min = vpmin_u16(min, min);
+  return vget_lane_u16(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(vpmin_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmin_s32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(vpmin_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmin_u32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
+{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
+{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
 
 // max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(vpmax_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
-  float32x2_t a_lo, a_hi, max;
-
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  max = vpmax_f32(a_lo, a_hi);
-  max = vpmax_f32(max, max);
-
-  return vget_lane_f32(max, 0);
+  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmax_f32(max, max), 0);
 }
-
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
 {
-  int32x2_t a_lo, a_hi, max;
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t max = vpmax_s8(a_dup, a_dup);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
+{
+  int8x8_t max = vpmax_s8(a,a);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
+{
+  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t max = vpmax_u8(a_dup, a_dup);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t max = vpmax_u8(a,a);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t max = vpmax_s16(a,a);
+  return vget_lane_s16(vpmax_s16(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
+{
+  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
+  max = vpmax_s16(max, max);
+  max = vpmax_s16(max, max);
+  return vget_lane_s16(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t max = vpmax_u16(a,a);
+  return vget_lane_u16(vpmax_u16(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
+  max = vpmax_u16(max, max);
+  max = vpmax_u16(max, max);
+  return vget_lane_u16(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(vpmax_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmax_s32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(vpmax_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmax_u32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
+{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
+{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
 
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  max = vpmax_s32(a_lo, a_hi);
-  max = vpmax_s32(max, max);
-
-  return vget_lane_s32(max, 0);
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+                            vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
 }
 
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
-    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
-    {\
-        if (Offset!=0)\
-            first = Command(first, second, Offset);\
-    }\
-};\
+// Helpers for ptranspose.
+namespace detail {
+  
+template<typename Packet>
+void zip_in_place(Packet& p1, Packet& p2);
 
-PALIGN_NEON(0,Packet4f,vextq_f32)
-PALIGN_NEON(1,Packet4f,vextq_f32)
-PALIGN_NEON(2,Packet4f,vextq_f32)
-PALIGN_NEON(3,Packet4f,vextq_f32)
-PALIGN_NEON(0,Packet4i,vextq_s32)
-PALIGN_NEON(1,Packet4i,vextq_s32)
-PALIGN_NEON(2,Packet4i,vextq_s32)
-PALIGN_NEON(3,Packet4i,vextq_s32)
-
-#undef PALIGN_NEON
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
-  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
-
-  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
+  const float32x2x2_t tmp = vzip_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
-  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
-  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
+  const float32x4x2_t tmp = vzipq_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
+  const int8x8x2_t tmp = vzip_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
+  const int8x16x2_t tmp = vzipq_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
+  const uint8x8x2_t tmp = vzip_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
+  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
+  const int32x2x2_t tmp = vzip_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
+  const int32x4x2_t tmp = vzipq_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
+  const uint32x2x2_t tmp = vzip_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
+  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
+  const int16x4x2_t tmp = vzip_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
+  const int16x8x2_t tmp = vzipq_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
+  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[4]);
+  zip_in_place(kernel.packet[1], kernel.packet[5]);
+  zip_in_place(kernel.packet[2], kernel.packet[6]);
+  zip_in_place(kernel.packet[3], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[6]);
+  zip_in_place(kernel.packet[5], kernel.packet[7]);
+  
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[5]);
+  zip_in_place(kernel.packet[6], kernel.packet[7]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
+  EIGEN_UNROLL_LOOP
+  for (int i=0; i<4; ++i) {
+    const int m = (1 << i);
+    EIGEN_UNROLL_LOOP
+    for (int j=0; j<m; ++j) {
+      const int n = (1 << (3-i));
+      EIGEN_UNROLL_LOOP
+      for (int k=0; k<n; ++k) {
+        const int idx = 2*j*n+k;
+        zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
+      }
+    }
+  }
+}
+
+} // namespace detail
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
+{
+  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
+  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
+
+  const int8x8x2_t zip8 = vzip_s8(a,b);
+  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
+  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
+
+  const uint8x8x2_t zip8 = vzip_u8(a,b);
+  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+    detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
+  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2l, 2>& kernel)
+{
+#if EIGEN_ARCH_ARM64
+  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const int64x1_t tmp[2][2] = {
+    { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
+    { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
+  };
+
+  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2ul, 2>& kernel)
+{
+#if EIGEN_ARCH_ARM64
+  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const uint64x1_t tmp[2][2] = {
+    { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
+    { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
+  };
+
+  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
+{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
+{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
+{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
+{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
+{ return vbsl_u8(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
+{ return vbslq_u8(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
+{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
+{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
+{ return vbsl_u16(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
+{ return vbslq_u16(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
+{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
+{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
+{ return vbsl_u32(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
+{ return vbslq_u32(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
+{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
+{ return vbslq_u64(mask, a, b); }
+
+// Use armv8 rounding intinsics if available.
+#if EIGEN_ARCH_ARMV8
+template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
+{ return vrndn_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+{ return vrndnq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{ return vrndm_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{ return vrndmq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{ return vrndp_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{ return vrndpq_f32(a); }
+
+#else
+
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f abs_a = pabs(a);
+  Packet2f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
+  Packet2f tmp  = print<Packet2f>(a);
+  // If greater, subtract one.
+  Packet2f mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If smaller, add one.
+  Packet4f mask = pcmp_lt(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0);
+  Packet2f tmp  = print<Packet2f>(a);
+  // If smaller, add one.
+  Packet2f mask = pcmp_lt(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+#endif
+
+/**
+ * Computes the integer square root
+ * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
+ *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
+ *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
+ */
+template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
+  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+  uint8x16_t res = vdupq_n_u8(0);
+  uint8x16_t add = vdupq_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x16_t temp = vorrq_u8(res, add);
+    res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
+    add = vshrq_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
+  uint16x4_t res = vdup_n_u16(0);
+  uint16x4_t add = vdup_n_u16(0x80);
+  for (int i = 0; i < 8; i++)
+  {
+    const uint16x4_t temp = vorr_u16(res, add);
+    res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
+    add = vshr_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+  uint16x8_t res = vdupq_n_u16(0);
+  uint16x8_t add = vdupq_n_u16(0x80);
+  for (int i = 0; i < 8; i++)
+  {
+    const uint16x8_t temp = vorrq_u16(res, add);
+    res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
+    add = vshrq_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
+  uint32x2_t res = vdup_n_u32(0);
+  uint32x2_t add = vdup_n_u32(0x8000);
+  for (int i = 0; i < 16; i++)
+  {
+    const uint32x2_t temp = vorr_u32(res, add);
+    res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
+    add = vshr_n_u32(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+  uint32x4_t res = vdupq_n_u32(0);
+  uint32x4_t add = vdupq_n_u32(0x8000);
+  for (int i = 0; i < 16; i++)
+  {
+    const uint32x4_t temp = vorrq_u32(res, add);
+    res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
+    add = vshrq_n_u32(add, 1);
+  }
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet4f x = vrsqrteq_f32(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
+  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet2f x = vrsqrte_f32(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
+  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
+  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+// Unfortunately vsqrt_f32 is only available for A64.
+#if EIGEN_ARCH_ARM64
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
+  const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
+  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
+}
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
+  const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
+  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
+}
+#endif
+
+//---------- bfloat16 ----------
+// TODO: Add support for native armv8.6-a bfloat16_t
+
+// TODO: Guard if we have native bfloat16 support
+typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
+
+template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
+
+template<> struct packet_traits<bfloat16> : default_packet_traits
+{
+  typedef Packet4bf type;
+  typedef Packet4bf half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+    HasDiv       = 1,
+    HasFloor     = 1,
+    HasCeil      = 1,
+    HasRint      = 1,
+
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf  = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4bf>
+{
+  typedef bfloat16 type;
+  typedef Packet4bf half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+namespace detail {  
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+} // namespace detail
+
+EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
+{
+  // See the scalar implemention in BFloat16.h for a comprehensible explanation
+  // of this fast rounding algorithm
+  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+
+  // lsb = (input >> 16) & 1
+  Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
+
+  // rounding_bias = 0x7fff + lsb
+  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
+
+  // input += rounding_bias
+  input = vaddq_u32(input, rounding_bias);
+
+  // input = input >> 16
+  input = vshrq_n_u32(input, 16);
+
+  // Replace float-nans by bfloat16-nans, that is 0x7fc0
+  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
+  const Packet4ui mask = vceqq_f32(p, p);
+  input = vbslq_u32(mask, input, bf16_nan);
+
+  // output = static_cast<uint16_t>(input)
+  return vmovn_u32(input);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
+{
+  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+}
+
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
+  return vmovn_u32(vreinterpretq_u32_f32(p));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
+  return pset1<Packet4us>(from.value);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
+{
+  return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
+{
+  return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
+{
+  return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
+  return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+                                                                            const Packet4bf &b)
+{
+  return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
+                                                                        const Packet4bf &b)
+{
+  return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
+                                                          const Packet4bf &b)
+{
+  return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+                                                                            const Packet4bf &b)
+{
+  return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
+                                                                        const Packet4bf &b)
+{
+  return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
+                                                          const Packet4bf &b)
+{
+  return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
+{
+  return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
+  return por<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
+  return pxor<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
+  return pand<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
+  return pandnot<Packet4us>(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
+                                                      const Packet4bf& b)
+{
+  return pselect<Packet4us>(mask, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
+{
+  return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
+{
+  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
+{
+  return preverse<Packet4us>(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
+{
+  detail::ptranspose_impl(kernel);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
+{
+  return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
 }
 
 //---------- double ----------
@@ -550,190 +3642,943 @@
 // Defining these functions as templates ensures that if these intrinsics are
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
-template <typename T>
-uint64x2_t vreinterpretq_u64_f64(T a)
-{
-  return (uint64x2_t) a;
-}
+template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
 
-template <typename T>
-float64x2_t vreinterpretq_f64_u64(T a)
-{
-  return (float64x2_t) a;
-}
+template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
 
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
 
+// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// for fast inversion of matrices of size 4.
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
+{
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
+{
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 0);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 3);
+}
+#define vec2d_duplane(a, p) \
+  vdupq_laneq_f64(a, p)
+
 template<> struct packet_traits<double>  : default_packet_traits
 {
   typedef Packet2d type;
   typedef Packet2d half;
-  enum {
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket=0,
-   
-    HasDiv  = 1,
-    // FIXME check the Has*
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasDiv   = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+
     HasSin  = 0,
     HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = 0,
+    HasErf  = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d>
+{
+  typedef double type;
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
 
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
-  return vaddq_f64(pset1<Packet2d>(a), countdown);
+  const double c[] = {0.0,1.0};
+  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+  const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
+  return padd(a, pxor(mask, b));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet2d pselect<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& false_mask) {
-  return vbslq_f64(vreinterpretq_u64_f64(false_mask), b, a);
-}
-
 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
 
 #ifdef __ARM_FEATURE_FMA
 // See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
+{ return vfmaq_f64(c,a,b); }
 #else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
+{ return vmlaq_f64(c,a,b); }
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
 
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
+
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
 
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
+
 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
+{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
 
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
+{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
+{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
+{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
 
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
 
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
-{
-  return vld1q_dup_f64(from);
-}
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
   Packet2d res = pset1<Packet2d>(0.0);
-  res = vsetq_lane_f64(from[0*stride], res, 0);
-  res = vsetq_lane_f64(from[1*stride], res, 1);
+  res = vld1q_lane_f64(from + 0*stride, res, 0);
+  res = vld1q_lane_f64(from + 1*stride, res, 1);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  to[stride*0] = vgetq_lane_f64(from, 0);
-  to[stride*1] = vgetq_lane_f64(from, 1);
+  vst1q_lane_f64(to + stride*0, from, 0);
+  vst1q_lane_f64(to + stride*1, from, 1);
 }
+
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
 
 // FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
 // workaround ICE, see bug 907
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{ return (vget_low_f64(a) + vget_high_f64(a))[0]; }
 #else
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  float64x2_t trn1, trn2;
-
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  trn1 = vzip1q_f64(vecs[0], vecs[1]);
-  trn2 = vzip2q_f64(vecs[0], vecs[1]);
-
-  // Do the addition of the resulting vectors
-  return vaddq_f64(trn1, trn2);
-}
 // Other reduction functions:
 // mul
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
 #endif
 
 // min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{ return vgetq_lane_f64(vpminq_f64(a,a), 0); }
 
 // max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
 
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
-    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
-    {\
-        if (Offset!=0)\
-            first = Command(first, second, Offset);\
-    }\
-};\
 
-PALIGN_NEON(0,Packet2d,vextq_f64)
-PALIGN_NEON(1,Packet2d,vextq_f64)
-#undef PALIGN_NEON
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2d, 2>& kernel)
+{
+  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
-  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
-  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
-
-  kernel.packet[0] = trn1;
-  kernel.packet[1] = trn2;
+  kernel.packet[0] = tmp1;
+  kernel.packet[1] = tmp2;
 }
-#endif // EIGEN_ARCH_ARM64 
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
+{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+{ return vrndnq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{ return vrndmq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{ return vrndpq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
+{ return pldexp_generic(a, exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
+{ return pfrexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
+{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet2d x = vrsqrteq_f64(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
+
+// Do we have an fp16 types and supporting Neon intrinsics?
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+typedef float16x4_t Packet4hf;
+typedef float16x8_t Packet8hf;
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8hf type;
+  typedef Packet4hf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasCmp = 1,
+    HasCast = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasReduxp = 1,
+    HasDiv = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasErf = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4hf> {
+  typedef Eigen::half type;
+  typedef Packet4hf half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8hf> {
+  typedef Eigen::half type;
+  typedef Packet4hf half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
+  return vadd_f16(vget_low_f16(a), vget_high_f16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
+  return vdupq_n_f16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
+  return vdup_n_f16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  Packet8hf countdown = vld1q_f16(f);
+  return vaddq_f16(pset1<Packet8hf>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3};
+  Packet4hf countdown = vld1_f16(f);
+  return vadd_f16(pset1<Packet4hf>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vaddq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vadd_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vsubq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vsub_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
+  return vnegq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
+  return vneg_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmulq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmul_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vdivq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vdiv_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmaq_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfma_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vminq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmin_f16(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmaxq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmax_f16(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
+
+#define EIGEN_MAKE_ARM_FP16_CMP_8(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
+    return vreinterpretq_f16_u16(vc##name##q_f16(a, b));                              \
+  }
+
+#define EIGEN_MAKE_ARM_FP16_CMP_4(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
+    return vreinterpret_f16_u16(vc##name##_f16(a, b));                                \
+  }
+
+EIGEN_MAKE_ARM_FP16_CMP_8(eq)
+EIGEN_MAKE_ARM_FP16_CMP_8(lt)
+EIGEN_MAKE_ARM_FP16_CMP_8(le)
+
+EIGEN_MAKE_ARM_FP16_CMP_4(eq)
+EIGEN_MAKE_ARM_FP16_CMP_4(lt)
+EIGEN_MAKE_ARM_FP16_CMP_4(le)
+
+#undef EIGEN_MAKE_ARM_FP16_CMP_8
+#undef EIGEN_MAKE_ARM_FP16_CMP_4
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
+{ return vrndnq_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
+{ return vrndn_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
+{ return vrndmq_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
+{ return vrndm_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
+{ return vrndpq_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
+{ return vrndp_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
+  return vsqrtq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
+  return vsqrt_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
+  Packet8hf packet;
+  packet[0] = from[0].x;
+  packet[1] = from[0].x;
+  packet[2] = from[1].x;
+  packet[3] = from[1].x;
+  packet[4] = from[2].x;
+  packet[5] = from[2].x;
+  packet[6] = from[3].x;
+  packet[7] = from[3].x;
+  return packet;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
+  float16x4_t packet;
+  float16_t* tmp;
+  tmp = (float16_t*)&packet;
+  tmp[0] = from[0].x;
+  tmp[1] = from[0].x;
+  tmp[2] = from[1].x;
+  tmp[3] = from[1].x;
+  return packet;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
+  Packet4hf lo, hi;
+  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
+  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
+  return vcombine_f16(lo, hi);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
+  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
+  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
+  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
+  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
+  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
+  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
+  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
+  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
+  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
+  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
+  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
+  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
+  res = vset_lane_f16(from[0 * stride].x, res, 0);
+  res = vset_lane_f16(from[1 * stride].x, res, 1);
+  res = vset_lane_f16(from[2 * stride].x, res, 2);
+  res = vset_lane_f16(from[3 * stride].x, res, 3);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
+  to[stride * 0].x = vgetq_lane_f16(from, 0);
+  to[stride * 1].x = vgetq_lane_f16(from, 1);
+  to[stride * 2].x = vgetq_lane_f16(from, 2);
+  to[stride * 3].x = vgetq_lane_f16(from, 3);
+  to[stride * 4].x = vgetq_lane_f16(from, 4);
+  to[stride * 5].x = vgetq_lane_f16(from, 5);
+  to[stride * 6].x = vgetq_lane_f16(from, 6);
+  to[stride * 7].x = vgetq_lane_f16(from, 7);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
+  to[stride * 0].x = vget_lane_f16(from, 0);
+  to[stride * 1].x = vget_lane_f16(from, 1);
+  to[stride * 2].x = vget_lane_f16(from, 2);
+  to[stride * 3].x = vget_lane_f16(from, 3);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
+  float16_t x[8];
+  vst1q_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
+  float16_t x[4];
+  vst1_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi;
+  Packet8hf a_r64;
+
+  a_r64 = vrev64q_f16(a);
+  a_lo = vget_low_f16(a_r64);
+  a_hi = vget_high_f16(a_r64);
+  return vcombine_f16(a_hi, a_lo);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
+  return vrev64_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
+  return vabsq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
+  return vabs_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  sum = vpadd_f16(a_lo, a_hi);
+  sum = vpadd_f16(sum, sum);
+  sum = vpadd_f16(sum, sum);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
+  float16x4_t sum;
+
+  sum = vpadd_f16(a, a);
+  sum = vpadd_f16(sum, sum);
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, prod;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  prod = vmul_f16(a_lo, a_hi);
+  prod = vmul_f16(prod, vrev64_f16(prod));
+
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
+  float16x4_t prod;
+  prod = vmul_f16(a, vrev64_f16(a));
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, min;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  min = vpmin_f16(a_lo, a_hi);
+  min = vpmin_f16(min, min);
+  min = vpmin_f16(min, min);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(min, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
+  Packet4hf tmp;
+  tmp = vpmin_f16(a, a);
+  tmp = vpmin_f16(tmp, tmp);
+  Eigen::half h;
+  h.x = vget_lane_f16(tmp, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, max;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  max = vpmax_f16(a_lo, a_hi);
+  max = vpmax_f16(max, max);
+  max = vpmax_f16(max, max);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(max, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
+  Packet4hf tmp;
+  tmp = vpmax_f16(a, a);
+  tmp = vpmax_f16(tmp, tmp);
+  Eigen::half h;
+  h.x = vget_lane_f16(tmp, 0);
+  return h;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
+{
+  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
+
+  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
+
+  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
+  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
+  float16_t* tmp = (float16_t*)&kernel;
+  tmp_x4 = vld4_f16(tmp);
+
+  kernel.packet[0] = tmp_x4.val[0];
+  kernel.packet[1] = tmp_x4.val[1];
+  kernel.packet[2] = tmp_x4.val[2];
+  kernel.packet[3] = tmp_x4.val[3];
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
+  float16x8x2_t T_1[4];
+
+  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
+  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
+  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
+  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
+
+  float16x8x2_t T_2[4];
+  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
+  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
+  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
+  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
+
+  float16x8x2_t T_3[4];
+  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
+  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
+  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
+  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
+
+  kernel.packet[0] = T_3[0].val[0];
+  kernel.packet[1] = T_3[2].val[0];
+  kernel.packet[2] = T_3[1].val[0];
+  kernel.packet[3] = T_3[3].val[0];
+  kernel.packet[4] = T_3[0].val[1];
+  kernel.packet[5] = T_3[2].val[1];
+  kernel.packet[6] = T_3[1].val[1];
+  kernel.packet[7] = T_3[3].val[1];
+}
+#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#endif // EIGEN_ARCH_ARM64
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
new file mode 100644
index 0000000..54f9733
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h

@@ -0,0 +1,1419 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2020 Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_NEON_H
+#define EIGEN_TYPE_CASTING_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+//==============================================================================
+// pcast, SrcType = float
+//==============================================================================
+template <>
+struct type_casting_traits<float, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4f, Packet4f>(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2f, Packet2f>(const Packet2f& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<float, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+struct type_casting_traits<float, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+// If float64 exists, first convert to that to keep as much precision as possible.
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  // Discard second half of input.
+  return vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  // Discard second half of input.
+  return vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(a)));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  // Discard second half of input.
+  return vmovl_s32(vget_low_s32(vcvtq_s32_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  // Discard second half of input.
+  return vmovl_u32(vget_low_u32(vcvtq_u32_f32(a)));
+}
+#endif  // EIGEN_ARCH_ARM64
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vcvtq_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2f, Packet2i>(const Packet2f& a) {
+  return vcvt_s32_f32(a);
+}
+
+template <>
+struct type_casting_traits<float, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vcvtq_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2f, Packet2ui>(const Packet2f& a) {
+  return vcvt_u32_f32(a);
+}
+
+template <>
+struct type_casting_traits<float, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {
+  return vcombine_s16(vmovn_s32(vcvtq_s32_f32(a)), vmovn_s32(vcvtq_s32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2f, Packet4s>(const Packet2f& a, const Packet2f& b) {
+  return vmovn_s32(vcombine_s32(vcvt_s32_f32(a), vcvt_s32_f32(b)));
+}
+
+template <>
+struct type_casting_traits<float, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {
+  return vcombine_u16(vmovn_u32(vcvtq_u32_f32(a)), vmovn_u32(vcvtq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2f, Packet4us>(const Packet2f& a, const Packet2f& b) {
+  return vmovn_u32(vcombine_u32(vcvt_u32_f32(a), vcvt_u32_f32(b)));
+}
+
+template <>
+struct type_casting_traits<float, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                         const Packet4f& d) {
+  const int16x8_t ab_s16 = pcast<Packet4f, Packet8s>(a, b);
+  const int16x8_t cd_s16 = pcast<Packet4f, Packet8s>(c, d);
+  return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2f, Packet8c>(const Packet2f& a, const Packet2f& b, const Packet2f& c,
+                                                       const Packet2f& d) {
+  const int16x4_t ab_s16 = pcast<Packet2f, Packet4s>(a, b);
+  const int16x4_t cd_s16 = pcast<Packet2f, Packet4s>(c, d);
+  return vmovn_s16(vcombine_s16(ab_s16, cd_s16));
+}
+
+template <>
+struct type_casting_traits<float, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                           const Packet4f& d) {
+  const uint16x8_t ab_u16 = pcast<Packet4f, Packet8us>(a, b);
+  const uint16x8_t cd_u16 = pcast<Packet4f, Packet8us>(c, d);
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2f, Packet8uc>(const Packet2f& a, const Packet2f& b, const Packet2f& c,
+                                                         const Packet2f& d) {
+  const uint16x4_t ab_u16 = pcast<Packet2f, Packet4us>(a, b);
+  const uint16x4_t cd_u16 = pcast<Packet2f, Packet4us>(c, d);
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+
+//==============================================================================
+// pcast, SrcType = int8_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int8_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16c, Packet4f>(const Packet16c& a) {
+  // Discard all but first 4 bytes.
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet8c, Packet2f>(const Packet8c& a) {
+  // Discard all but first 2 bytes.
+  return vcvt_f32_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a)))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16c, Packet2l>(const Packet16c& a) {
+  // Discard all but first two bytes.
+  return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
+  return vreinterpretq_u64_s64(pcast<Packet16c, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16c, Packet4i>(const Packet16c& a) {
+  // Discard all but first 4 bytes.
+  return vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet8c, Packet2i>(const Packet8c& a) {
+  // Discard all but first 2 bytes.
+  return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
+  return vreinterpretq_u32_s32(pcast<Packet16c, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
+  return vreinterpret_u32_s32(pcast<Packet8c, Packet2i>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16c, Packet8s>(const Packet16c& a) {
+  // Discard second half of input.
+  return vmovl_s8(vget_low_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet8c, Packet4s>(const Packet8c& a) {
+  // Discard second half of input.
+  return vget_low_s16(vmovl_s8(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
+  return vreinterpretq_u16_s16(pcast<Packet16c, Packet8s>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
+  return vreinterpret_u16_s16(pcast<Packet8c, Packet4s>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet16c, Packet16c>(const Packet16c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet8c, Packet8c>(const Packet8c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4c, Packet4c>(const Packet4c& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c, Packet16uc>(const Packet16c& a) {
+  return vreinterpretq_u8_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c, Packet8uc>(const Packet8c& a) {
+  return vreinterpret_u8_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c, Packet4uc>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+
+//==============================================================================
+// pcast, SrcType = uint8_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint8_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16uc, Packet4f>(const Packet16uc& a) {
+  // Discard all but first 4 bytes.
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet8uc, Packet2f>(const Packet8uc& a) {
+  // Discard all but first 2 bytes.
+  return vcvt_f32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a)))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16uc, Packet2ul>(const Packet16uc& a) {
+  // Discard all but first two bytes.
+  return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
+  return vreinterpretq_s64_u64(pcast<Packet16uc, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16uc, Packet4ui>(const Packet16uc& a) {
+  // Discard all but first 4 bytes.
+  return vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet8uc, Packet2ui>(const Packet8uc& a) {
+  // Discard all but first 2 bytes.
+  return vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
+  return vreinterpretq_s32_u32(pcast<Packet16uc, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
+  return vreinterpret_s32_u32(pcast<Packet8uc, Packet2ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16uc, Packet8us>(const Packet16uc& a) {
+  // Discard second half of input.
+  return vmovl_u8(vget_low_u8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet8uc, Packet4us>(const Packet8uc& a) {
+  // Discard second half of input.
+  return vget_low_u16(vmovl_u8(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
+  return vreinterpretq_s16_u16(pcast<Packet16uc, Packet8us>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet8uc, Packet4s>(const Packet8uc& a) {
+  return vreinterpret_s16_u16(pcast<Packet8uc, Packet4us>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet16uc, Packet16uc>(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet8uc, Packet8uc>(const Packet8uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4uc, Packet4uc>(const Packet4uc& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc, Packet16c>(const Packet16uc& a) {
+  return vreinterpretq_s8_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc, Packet8c>(const Packet8uc& a) {
+  return vreinterpret_s8_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc, Packet4c>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+
+//==============================================================================
+// pcast, SrcType = int16_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int16_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8s, Packet4f>(const Packet8s& a) {
+  // Discard second half of input.
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet4s, Packet2f>(const Packet4s& a) {
+  // Discard second half of input.
+  return vcvt_f32_s32(vget_low_s32(vmovl_s16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8s, Packet2l>(const Packet8s& a) {
+  // Discard all but first two values.
+  return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(a))));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
+  return vreinterpretq_u64_s64(pcast<Packet8s, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8s, Packet4i>(const Packet8s& a) {
+  // Discard second half of input.
+  return vmovl_s16(vget_low_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet4s, Packet2i>(const Packet4s& a) {
+  // Discard second half of input.
+  return vget_low_s32(vmovl_s16(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
+  return vreinterpretq_u32_s32(pcast<Packet8s, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
+  return vreinterpret_u32_s32(pcast<Packet4s, Packet2i>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8s, Packet8s>(const Packet8s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4s, Packet4s>(const Packet4s& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8s, Packet8us>(const Packet8s& a) {
+  return vreinterpretq_u16_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4s, Packet4us>(const Packet4s& a) {
+  return vreinterpret_u16_s16(a);
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
+  return vcombine_s8(vmovn_s16(a), vmovn_s16(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4s, Packet8c>(const Packet4s& a, const Packet4s& b) {
+  return vmovn_s16(vcombine_s16(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
+  return vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(a)), vmovn_u16(vreinterpretq_u16_s16(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4s, Packet8uc>(const Packet4s& a, const Packet4s& b) {
+  return vmovn_u16(vcombine_u16(vreinterpret_u16_s16(a), vreinterpret_u16_s16(b)));
+}
+
+//==============================================================================
+// pcast, SrcType = uint16_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint16_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8us, Packet4f>(const Packet8us& a) {
+  // Discard second half of input.
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet4us, Packet2f>(const Packet4us& a) {
+  // Discard second half of input.
+  return vcvt_f32_u32(vget_low_u32(vmovl_u16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8us, Packet2ul>(const Packet8us& a) {
+  // Discard all but first two values.
+  return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(a))));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
+  return vreinterpretq_s64_u64(pcast<Packet8us, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8us, Packet4ui>(const Packet8us& a) {
+  // Discard second half of input.
+  return vmovl_u16(vget_low_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet4us, Packet2ui>(const Packet4us& a) {
+  // Discard second half of input.
+  return vget_low_u32(vmovl_u16(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
+  return vreinterpretq_s32_u32(pcast<Packet8us, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
+  return vreinterpret_s32_u32(pcast<Packet4us, Packet2ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8us, Packet8us>(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4us, Packet4us>(const Packet4us& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8us, Packet8s>(const Packet8us& a) {
+  return vreinterpretq_s16_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4us, Packet4s>(const Packet4us& a) {
+  return vreinterpret_s16_u16(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
+  return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4us, Packet8uc>(const Packet4us& a, const Packet4us& b) {
+  return vmovn_u16(vcombine_u16(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
+  return vreinterpretq_s8_u8(pcast<Packet8us, Packet16uc>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
+  return vreinterpret_s8_u8(pcast<Packet4us, Packet8uc>(a, b));
+}
+
+//==============================================================================
+// pcast, SrcType = int32_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vcvtq_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2i, Packet2f>(const Packet2i& a) {
+  return vcvt_f32_s32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4i, Packet2l>(const Packet4i& a) {
+  // Discard second half of input.
+  return vmovl_s32(vget_low_s32(a));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
+  return vreinterpretq_u64_s64(pcast<Packet4i, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4i, Packet4i>(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2i, Packet2i>(const Packet2i& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i, Packet4ui>(const Packet4i& a) {
+  return vreinterpretq_u32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i, Packet2ui>(const Packet2i& a) {
+  return vreinterpret_u32_s32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
+  return vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2i, Packet4s>(const Packet2i& a, const Packet2i& b) {
+  return vmovn_s32(vcombine_s32(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
+  return vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(a)), vmovn_u32(vreinterpretq_u32_s32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2i, Packet4us>(const Packet2i& a, const Packet2i& b) {
+  return vmovn_u32(vreinterpretq_u32_s32(vcombine_s32(a, b)));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                         const Packet4i& d) {
+  const int16x8_t ab_s16 = pcast<Packet4i, Packet8s>(a, b);
+  const int16x8_t cd_s16 = pcast<Packet4i, Packet8s>(c, d);
+  return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2i, Packet8c>(const Packet2i& a, const Packet2i& b, const Packet2i& c,
+                                                       const Packet2i& d) {
+  const int16x4_t ab_s16 = vmovn_s32(vcombine_s32(a, b));
+  const int16x4_t cd_s16 = vmovn_s32(vcombine_s32(c, d));
+  return vmovn_s16(vcombine_s16(ab_s16, cd_s16));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                           const Packet4i& d) {
+  const uint16x8_t ab_u16 = pcast<Packet4i, Packet8us>(a, b);
+  const uint16x8_t cd_u16 = pcast<Packet4i, Packet8us>(c, d);
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2i, Packet8uc>(const Packet2i& a, const Packet2i& b, const Packet2i& c,
+                                                         const Packet2i& d) {
+  const uint16x4_t ab_u16 = pcast<Packet2i, Packet4us>(a, b);
+  const uint16x4_t cd_u16 = pcast<Packet2i, Packet4us>(c, d);
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+
+//==============================================================================
+// pcast, SrcType = uint32_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vcvtq_f32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2ui, Packet2f>(const Packet2ui& a) {
+  return vcvt_f32_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4ui, Packet2ul>(const Packet4ui& a) {
+  // Discard second half of input.
+  return vmovl_u32(vget_low_u32(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
+  return vreinterpretq_s64_u64(pcast<Packet4ui, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4ui, Packet4ui>(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2ui, Packet2ui>(const Packet2ui& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui, Packet4i>(const Packet4ui& a) {
+  return vreinterpretq_s32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui, Packet2i>(const Packet2ui& a) {
+  return vreinterpret_s32_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
+  return vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2ui, Packet4us>(const Packet2ui& a, const Packet2ui& b) {
+  return vmovn_u32(vcombine_u32(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
+  return vreinterpretq_s16_u16(pcast<Packet4ui, Packet8us>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
+  return vreinterpret_s16_u16(pcast<Packet2ui, Packet4us>(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                            const Packet4ui& d) {
+  const uint16x8_t ab_u16 = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+  const uint16x8_t cd_u16 = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2ui, Packet8uc>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
+                                                          const Packet2ui& d) {
+  const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(a, b));
+  const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(c, d));
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                          const Packet4ui& d) {
+  return vreinterpretq_s8_u8(pcast<Packet4ui, Packet16uc>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
+                                                        const Packet2ui& d) {
+  return vreinterpret_s8_u8(pcast<Packet2ui, Packet8uc>(a, b, c, d));
+}
+
+//==============================================================================
+// pcast, SrcType = int64_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int64_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
+  return vcvtq_f32_s32(vcombine_s32(vmovn_s64(a), vmovn_s64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2l, Packet2l>(const Packet2l& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l, Packet2ul>(const Packet2l& a) {
+  return vreinterpretq_u64_s64(a);
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_u32(vmovn_u64(vreinterpretq_u64_s64(a)), vmovn_u64(vreinterpretq_u64_s64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                       const Packet2l& d) {
+  const int32x4_t ab_s32 = pcast<Packet2l, Packet4i>(a, b);
+  const int32x4_t cd_s32 = pcast<Packet2l, Packet4i>(c, d);
+  return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d) {
+  const uint32x4_t ab_u32 = pcast<Packet2l, Packet4ui>(a, b);
+  const uint32x4_t cd_u32 = pcast<Packet2l, Packet4ui>(c, d);
+  return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                         const Packet2l& g, const Packet2l& h) {
+  const int16x8_t abcd_s16 = pcast<Packet2l, Packet8s>(a, b, c, d);
+  const int16x8_t efgh_s16 = pcast<Packet2l, Packet8s>(e, f, g, h);
+  return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                           const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                           const Packet2l& g, const Packet2l& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2l, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2l, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+
+//==============================================================================
+// pcast, SrcType = uint64_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint64_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
+  return vcvtq_f32_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2ul, Packet2ul>(const Packet2ul& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul, Packet2l>(const Packet2ul& a) {
+  return vreinterpretq_s64_u64(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u32(vmovn_u64(a), vmovn_u64(b));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
+  return vreinterpretq_s32_u32(pcast<Packet2ul, Packet4ui>(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d) {
+  const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b)));
+  const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(vmovn_u64(c), vmovn_u64(d)));
+  return vcombine_u16(ab_u16, cd_u16);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                        const Packet2ul& d) {
+  return vreinterpretq_s16_u16(pcast<Packet2ul, Packet8us>(a, b, c, d));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                            const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                            const Packet2ul& g, const Packet2ul& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2ul, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2ul, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                          const Packet2ul& g, const Packet2ul& h) {
+  return vreinterpretq_s8_u8(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
+}
+
+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
+  return vreinterpret_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
+  return vreinterpret_f32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return vreinterpretq_f32_u32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
+  return vreinterpret_s8_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
+  return vreinterpretq_s8_u8(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
+  return vreinterpret_u8_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return vreinterpretq_u8_s8(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
+  return vreinterpret_s16_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return vreinterpretq_s16_u16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
+  return vreinterpret_u16_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return vreinterpretq_u16_s16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
+  return vreinterpret_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
+  return vreinterpret_s32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return vreinterpretq_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return vreinterpretq_s32_u32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
+  return vreinterpret_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
+  return vreinterpret_u32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return vreinterpretq_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_u32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
+  return vreinterpretq_s64_u64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return vreinterpretq_u64_s64(a);
+}
+
+#if EIGEN_ARCH_ARM64
+
+//==============================================================================
+// pcast/preinterpret, Double
+//==============================================================================
+
+template <>
+struct type_casting_traits<double, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2d, Packet2d>(const Packet2d& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<double, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b));
+}
+
+template <>
+struct type_casting_traits<double, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+  return vcvtq_s64_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
+  return vcvtq_u64_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_s32(vmovn_s64(vcvtq_s64_f64(a)), vmovn_s64(vcvtq_s64_f64(b)));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_u32(vmovn_u64(vcvtq_u64_f64(a)), vmovn_u64(vcvtq_u64_f64(b)));
+}
+
+template <>
+struct type_casting_traits<double, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                       const Packet2d& d) {
+  const int32x4_t ab_s32 = pcast<Packet2d, Packet4i>(a, b);
+  const int32x4_t cd_s32 = pcast<Packet2d, Packet4i>(c, d);
+  return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d) {
+  const uint32x4_t ab_u32 = pcast<Packet2d, Packet4ui>(a, b);
+  const uint32x4_t cd_u32 = pcast<Packet2d, Packet4ui>(c, d);
+  return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32));
+}
+
+template <>
+struct type_casting_traits<double, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                         const Packet2d& g, const Packet2d& h) {
+  const int16x8_t abcd_s16 = pcast<Packet2d, Packet8s>(a, b, c, d);
+  const int16x8_t efgh_s16 = pcast<Packet2d, Packet8s>(e, f, g, h);
+  return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                           const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                           const Packet2d& g, const Packet2d& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2d, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2d, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Discard second-half of input.
+  return vcvt_f64_f32(vget_low_f32(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet8c, Packet2f>(vget_low_s8(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet8uc, Packet2f>(vget_low_u8(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet4s, Packet2f>(vget_low_s16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet4us, Packet2f>(vget_low_u16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+  // Discard second half of input.
+  return vcvtq_f64_s64(vmovl_s32(vget_low_s32(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4ui, Packet2d>(const Packet4ui& a) {
+  // Discard second half of input.
+  return vcvtq_f64_u64(vmovl_u32(vget_low_u32(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  return vcvtq_f64_s64(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
+  return vcvtq_f64_u64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return vreinterpretq_f64_s64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return vreinterpretq_f64_u64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_s64_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_u64_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_f64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_s32_f64(a);
+}
+
+#endif  // EIGEN_ARCH_ARM64
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_NEON_H

diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 2722893..215bfd7 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h

@@ -19,7 +19,7 @@
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
-  __m128  v;
+  Packet4f v;
 };
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
@@ -40,20 +40,33 @@
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
     HasSetLinear = 0,
-    HasBlend = 1,
+    HasBlend  = 1
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    size=2,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
 {
   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
@@ -67,7 +80,6 @@
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
                                  _mm_mul_ps(_mm_movehdup_ps(a.v),
@@ -83,30 +95,20 @@
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
-  Packet2cf res;
-#if EIGEN_GNUC_AT_MOST(4,2)
-  // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
-  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
-#elif EIGEN_GNUC_AT_LEAST(4,6)
-  // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wuninitialized"
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-  #pragma GCC diagnostic pop
-#else
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-#endif
-  return Packet2cf(_mm_movelh_ps(res.v,res.v));
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet2cf(_mm_set_ps(im, re, im, re));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
@@ -115,13 +117,13 @@
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
 
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
   return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
                               std::imag(from[0*stride]), std::real(from[0*stride])));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
   to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
                                      _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
@@ -129,7 +131,7 @@
                                      _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -153,113 +155,26 @@
   return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
 {
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = _mm_movehl_ps(first.v, first.v);
-      first.v = _mm_movelh_ps(first.v, second.v);
-    }
-  }
-};
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(a, pconj(b));
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
-                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(pconj(a), b);
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                                _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                                      vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return pconj(internal::pmul(a, b));
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
-                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for SSE3 and 4
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet2cf res = pmul(a, pconj(b));
   __m128 s = _mm_mul_ps(b.v,b.v);
-  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
+  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2))));
 }
 
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
-  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
-}
 
 
 //---------- double ----------
@@ -267,7 +182,7 @@
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
-  __m128d  v;
+  Packet2d v;
 };
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
@@ -288,6 +203,7 @@
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -297,7 +213,18 @@
 };
 #endif
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    size=1,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -310,9 +237,8 @@
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
                                  _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                             vec2d_swizzle1(b.v, 1, 0))));
   #else
@@ -323,10 +249,11 @@
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
 
 // FIXME force unaligned load, this is a temporary fix
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
@@ -342,7 +269,7 @@
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
@@ -358,112 +285,27 @@
   return pfirst(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
-  return vecs[0];
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
   return pfirst(a);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(a, pconj(b));
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
-                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                           vec2d_swizzle1(b.v, 1, 0))));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(pconj(a), b);
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                                _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                                      vec2d_swizzle1(b.v, 1, 0)), mask)));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return pconj(internal::pmul(a, b));
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
-                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                           vec2d_swizzle1(b.v, 1, 0))));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for SSE3 and 4
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet1cd res = pmul(a,pconj(b));
   __m128d s = _mm_mul_pd(b.v,b.v);
   return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
 {
   return Packet1cd(preverse(Packet2d(x.v)));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2cf,2>& kernel) {
   __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
   __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
@@ -473,14 +315,32 @@
   kernel.packet[1].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+  __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+  __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
+
 template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
-  __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
   return Packet2cf(_mm_castpd_ps(result));
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
 
 } // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_COMPLEX_SSE_H

diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index ca3ed54..8736d0d 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h

@@ -8,7 +8,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
+/* The sin and cos and functions of this file come from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
@@ -19,517 +19,58 @@
 
 namespace internal {
 
-// Functions for division.
-// The EIGEN_FAST_MATH version uses the _mm_rcp_ps approximation and one step of
-// Newton's method, at a cost of 1-2 bits of precision as opposed to the exact
-// solution. The main advantage of this approach is not just speed, but also the
-// fact that it can be inlined and pipelined with other computations, further
-// reducing its effective latency.
-#if EIGEN_FAST_DIV
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  _EIGEN_DECLARE_CONST_Packet4f(two, 2.0f);
-
-  /* Start with an estimate of the reciprocal of b. */
-  Packet4f x = _mm_rcp_ps(b);
-
-  /* One step of Newton's method on b - x^-1 == 0. */
-  x = pmul(x, pmadd(-b, x, p4f_two));
-
-  // Multiply the inverse of b with a.
-  return pmul(a, x);
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet4f
-pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  return _mm_div_ps(a, b);
-}
-#endif
-template <>
-EIGEN_STRONG_INLINE Packet2d
-pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  return _mm_div_pd(a, b);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i
-pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
-  eigen_assert(false && "packet integer division are not supported by SSE");
-  return pset1<Packet4i>(0);
-}
-
-// Hyperbolic Tangent function.
-// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
-// fl(tanh(x)) = +/-1.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is +/-1.0f in single-precision.
-  _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
-  const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
-
-  // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
-
-  // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
-
-  // Since the polynomials are odd/even, we need x^2.
-  const Packet4f x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial p.
-  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
-  p = pmadd(x2, p, p4f_alpha_9);
-  p = pmadd(x2, p, p4f_alpha_7);
-  p = pmadd(x2, p, p4f_alpha_5);
-  p = pmadd(x2, p, p4f_alpha_3);
-  p = pmadd(x2, p, p4f_alpha_1);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial p.
-  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
-  q = pmadd(x2, q, p4f_beta_2);
-  q = pmadd(x2, q, p4f_beta_0);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x) {
+  return plog_float(_x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+Packet2d plog<Packet2d>(const Packet2d& _x) {
+  return plog_double(_x);
+}
 
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog2<Packet4f>(const Packet4f& _x) {
+  return plog2_float(_x);
+}
 
-  /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000);//-1.f/0.f);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d plog2<Packet2d>(const Packet2d& _x) {
+  return plog2_double(_x);
+}
 
-  /* natural logarithm computed for 4 simultaneous float
-    return NaN for x <= 0
-  */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog1p<Packet4f>(const Packet4f& _x) {
+  return generic_plog1p(_x);
+}
 
-
-  Packet4i emm0;
-
-  // invalid_mask is set to true when x is NaN
-  Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps());
-  Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
-  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
-  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
-  /* keep only the fractional part */
-  x = _mm_and_ps(x, p4f_inv_mant_mask);
-  x = _mm_or_ps(x, p4f_half);
-
-  emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = pand(x, mask);
-  x = psub(x, p4f_1);
-  e = psub(e, pand(p4f_1, mask));
-  x = padd(x, tmp);
-
-  Packet4f x2 = pmul(x,x);
-  Packet4f x3 = pmul(x2,x);
-
-  Packet4f y, y1, y2;
-  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
-  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
-  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
-  y  = pmadd(y , x, p4f_cephes_log_p2);
-  y1 = pmadd(y1, x, p4f_cephes_log_p5);
-  y2 = pmadd(y2, x, p4f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  y1 = pmul(e, p4f_cephes_log_q1);
-  tmp = pmul(x2, p4f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p4f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-  // negative arg will be NAN, 0 will be -INF
-  return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
-                   _mm_and_ps(iszero_mask, p4f_minus_inf));
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexpm1<Packet4f>(const Packet4f& _x) {
+  return generic_expm1(_x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_ps(fx);
-#else
-  emm0 = _mm_cvttps_epi32(fx);
-  tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, substract 1 */
-  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
-  mask = _mm_and_ps(mask, p4f_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // build 2^n
-  emm0 = _mm_cvttps_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
-  emm0 = _mm_slli_epi32(emm0, 23);
-  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+  return pexp_float(_x);
 }
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
+Packet2d pexp<Packet2d>(const Packet2d& x)
 {
-  Packet2d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
-  Packet2d tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_pd(fx);
-#else
-  emm0 = _mm_cvttpd_epi32(fx);
-  tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, substract 1 */
-  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
-  mask = _mm_and_pd(mask, p2d_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p2d_cephes_exp_C1);
-  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet2d x2 = pmul(x,x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
-
-  // build 2^n
-  emm0 = _mm_cvttpd_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
-  emm0 = _mm_slli_epi32(emm0, 20);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+  return pexp_double(x);
 }
 
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-*/
-
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psin<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
-  Packet4i emm0, emm2;
-  sign_bit = x;
-  /* take the absolute value */
-  x = pabs(x);
-
-  /* take the modulo */
-
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* store the integer part of y in mm0 */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-  /* get the swap sign flag */
-  emm0 = _mm_and_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = _mm_mul_ps(x,x);
-
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = pmul(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmul(y2, x);
-  y2 = padd(y2, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y = _mm_andnot_ps(poly_mask, y);
-  y = _mm_or_ps(y,y2);
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+  return psin_float(_x);
 }
 
-/* almost the same as psin */
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pcos<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, y;
-  Packet4i emm0, emm2;
-
-  x = pabs(x);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* get the integer part of y */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-
-  emm2 = _mm_sub_epi32(emm2, p4i_2);
-
-  /* get the swap sign flag */
-  emm0 = _mm_andnot_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = pmul(x,x);
-
-  y = pmadd(y,z,p4f_coscof_p1);
-  y = pmadd(y,z,p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = _mm_mul_ps(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmadd(y2, x, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y  = _mm_andnot_ps(poly_mask, y);
-  y  = _mm_or_ps(y,y2);
-
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+  return pcos_float(_x);
 }
 
 #if EIGEN_FAST_MATH
@@ -537,79 +78,79 @@
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
-// exact solution. The main advantage of this approach is not just speed, but
-// also the fact that it can be inlined and pipelined with other computations,
-// further reducing its effective latency.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-psqrt<Packet4f>(const Packet4f& _x) {
-  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
-
-  Packet4f neg_half = pmul(_x, p4f_minus_half);
-  Packet4f denormal_mask = _mm_and_ps(_mm_cmpge_ps(_x, _mm_setzero_ps()),
-                                      _mm_cmplt_ps(_x, p4f_flt_min));
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& _x)
+{
+  Packet4f minus_half_x = pmul(_x, pset1<Packet4f>(-0.5f));
+  Packet4f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet4f x = _mm_rsqrt_ps(_x);
-
   // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
-
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  x = pmul(_x, x);
-
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet4f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm_andnot_ps(denormal_mask, x);
+  return pandnot(pmul(_x,x), denormal_mask);
 }
 
 #else
 
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
 
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
 
 #if EIGEN_FAST_MATH
 
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version since there is no instruction for diretly computing the
-// reciprocal square root in AVX/AVX2 (there will be one in AVX-512).
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-prsqrt<Packet4f>(const Packet4f& _x) {
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
   _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
 
-  // Remeber which entries were zero (or almost).
-  Packet4f is_zero = _mm_and_ps(_mm_cmplt_ps(_x, p4f_flt_min),
-                                _mm_cmpge_ps(_x, _mm_setzero_ps()));
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs).
-  Packet4f x = _mm_rsqrt_ps(_x);
-
-  // Do a single step of Newton's iteration.
   Packet4f neg_half = pmul(_x, p4f_minus_half);
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
 
-  // Mask-out the zero terms.
-  return _mm_or_ps(_mm_andnot_ps(is_zero, x), _mm_and_ps(is_zero, p4f_inf));
+  // Identity infinite, zero, negative and denormal arguments.
+  Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
+  Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
+  Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
+
+  // Compute an approximate result using the rsqrt intrinsic.
+  Packet4f y_approx = _mm_rsqrt_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet4f y_newton = pmul(
+      y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive normal arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
+  // x is zero or a positive denormalized float (equivalent to flushing positive
+  // denormalized inputs to zero).
+  return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
 }
 
 #else
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation.
+  // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
   return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
 }
 
@@ -617,36 +158,37 @@
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
   return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
 }
 
-// Identical to the ptanh in GenericPacketMath.h, but for doubles use
-// a small/medium approximation threshold of 0.001.
-template<> EIGEN_STRONG_INLINE Packet2d ptanh_approx_threshold() {
-  return pset1<Packet2d>(0.001);
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
 }
 
 } // end namespace internal
 
-
 namespace numext {
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sqrt(const float &x)
 {
-  return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x)));
+  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
 }
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double sqrt(const double &x)
 {
-#if EIGEN_COMP_GNUC
-  return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x)));
+#if EIGEN_COMP_GNUC_STRICT
+  // This works around a GCC bug generating poor code for _mm_sqrt_pd
+  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
+  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
 #else
-  return internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x)));
+  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
 #endif
 }
 

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
old mode 100644
new mode 100755
index f2bdd58..db102c7
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h

@@ -18,38 +18,93 @@
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+// 32 bits =>  8 registers
+// 64 bits => 16 registers
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 #endif
 
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
+// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
+// have overloads for both types without linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
+// structure:
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
 typedef __m128  Packet4f;
-typedef __m128i Packet4i;
 typedef __m128d Packet2d;
+#endif
+
+typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
 
 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
+template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
 
+template<int p, int q, int r, int s>
+struct shuffle_mask{
+ enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+};
+
+// TODO: change the implementation of all swizzle* ops from macro to template,
 #define vec4f_swizzle1(v,p,q,r,s) \
-  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
 
 #define vec4i_swizzle1(v,p,q,r,s) \
-  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec2d_swizzle1(v,p,q) \
-  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
 
 #define vec4f_swizzle2(a,b,p,q,r,s) \
-  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec4i_swizzle2(a,b,p,q,r,s) \
-  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movelh_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movehl_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpacklo_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpackhi_ps(a,b));
+}
+#define vec4f_duplane(a,p) \
+  vec4f_swizzle2(a,a,p,p,p,p)
+
+#define vec2d_swizzle2(a,b,mask) \
+  Packet2d(_mm_shuffle_pd(a,b,mask))
+
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpacklo_pd(a,b));
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpackhi_pd(a,b));
+}
+#define vec2d_duplane(a,p) \
+  vec2d_swizzle2(a,a,(p<<1)|p)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -58,7 +113,7 @@
   const Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
@@ -66,34 +121,42 @@
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
 // to leverage AVX instructions.
-#ifndef EIGEN_VECTORIZE_AVX512
 #ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<float>  : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
     HasHalfPacket = 0,
 
-    HasDiv  = 1,
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasTanH = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasNdtri = 1,
+    HasExp = 1,
+    HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasBlend = 1,
-    HasSelect = 1,
-    HasEq = 1,
+    HasCeil = 1,
+    HasFloor = 1,
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    HasRound = 1,
+#endif
+    HasRint = 1
   };
 };
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet2d type;
   typedef Packet2d half;
   enum {
@@ -102,15 +165,19 @@
     size=2,
     HasHalfPacket = 0,
 
+    HasCmp  = 1,
     HasDiv  = 1,
-    HasTanH = 1,
+    HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-
     HasBlend = 1,
-    HasSelect = 1,
-    HasEq = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    HasRound = 1,
+#endif
+    HasRint = 1
   };
 };
 #endif
@@ -119,19 +186,65 @@
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
 
-    HasBlend = 1,
+    HasShift = 1,
+    HasBlend = 1
   };
 };
-#endif
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
+template<> struct packet_traits<bool> : default_packet_traits
+{
+  typedef Packet16b type;
+  typedef Packet16b half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasHalfPacket = 0,
+    size=16,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 0,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 0,
+    HasAbs2      = 0,
+    HasMin       = 0,
+    HasMax       = 0,
+    HasConj      = 0,
+    HasSqrt      = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet2d> {
+  typedef double    type;
+  typedef Packet2d  half;
+  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet4i> {
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet16b> {
+  typedef bool       type;
+  typedef Packet16b  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
+#endif
 
 #if EIGEN_COMP_MSVC==1500
 // Workaround MSVC 9 internal compiler error.
@@ -145,6 +258,18 @@
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif
+template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool&    from) { return _mm_set1_epi8(static_cast<char>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
 
 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
 // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
@@ -157,43 +282,40 @@
 }
 #endif
 
-#ifndef EIGEN_VECTORIZE_AVX512
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-#endif
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
-#endif
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ple<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d ple<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f plt<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d plt<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f peq<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d peq<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pselect<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& false_mask) {
-#if defined(EIGEN_VECTORIZE_SSE4_1)
-  return _mm_blendv_ps(a, b, false_mask);
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_ps(a,b);
 #else
-  return _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b));
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+  return padd(a, pxor(mask, b));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2d pselect<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& false_mask) {
-#if defined(EIGEN_VECTORIZE_SSE4_1)
-  return _mm_blendv_pd(a, b, false_mask);
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) 
+{
+#ifdef EIGEN_VECTORIZE_SSE3  
+  return _mm_addsub_pd(a,b); 
 #else
-  return _mm_or_pd(_mm_andnot_pd(false_mask, a), _mm_and_pd(false_mask, b));
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); 
+  return padd(a, pxor(mask, b));
 #endif
 }
 
@@ -212,6 +334,11 @@
   return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
+{
+  return psub(pset1<Packet16b>(false), a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
@@ -234,15 +361,126 @@
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return _mm_blendv_ps(b,a,mask);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {  return _mm_blendv_pd(b,a,mask); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  return _mm_blendv_epi8(b,a,mask);
+}
+#else
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  Packet16b a_part = _mm_and_si128(mask, a);
+  Packet16b b_part = _mm_andnot_si128(mask, b);
+  return _mm_or_si128(a_part, b_part);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+  Packet4i b = _mm_castps_si128(a);
+  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+  Packet4i b = _mm_castpd_si128(a);
+  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet4f res = b;
+  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet2d res = b;
+  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_pd(b, a);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -254,8 +492,45 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet4f res = b;
+  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet2d res = b;
+  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_pd(b, a);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -267,206 +542,61 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-
-#if EIGEN_COMP_MSVC
-  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
-    EIGEN_DEBUG_UNALIGNED_LOAD
-    #if (EIGEN_COMP_MSVC==1600)
-    // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
-    // (i.e., it does not generate an unaligned load!!
-    // TODO On most architectures this version should also be faster than a single _mm_loadu_ps
-    // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
-    __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
-    res = _mm_loadh_pi(res, (const __m64*)(from+2));
-    return res;
-    #else
-    return _mm_loadu_ps(from);
-    #endif
-  }
-  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
-#else
-// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
-// require pointer casting to incompatible pointer types and leads to invalid code
-// because of the strict aliasing rule. The "dummy" stuff are required to enforce
-// a correct instruction dependency.
-// TODO: do the same for MSVC (ICC is compatible)
-// NOTE: with the code below, MSVC's compiler crashes!
-
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8)))
-  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
-#elif EIGEN_COMP_CLANG
-  // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
-#else
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_ps(from);
-#else
-  __m128d res;
-  res =  _mm_load_sd((const double*)(from)) ;
-  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
-  return _mm_castpd_ps(res);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_pd(from);
-#else
-  __m128d res;
-  res = _mm_load_sd(from) ;
-  res = _mm_loadh_pd(res,from+1);
-  return res;
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
-#else
-  __m128d res;
-  res =  _mm_load_sd((const double*)(from)) ;
-  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
-  return _mm_castpd_si128(res);
-#endif
-}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
-  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
-{ return pset1<Packet2d>(from[0]); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
-  Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
-  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(a, b);
+  return pselect<Packet>(not_nan_mask_a, m, b);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
-  _mm_storeu_pd(to, from);
-#else
-  _mm_storel_pd((to), from);
-  _mm_storeh_pd((to+1), from);
-#endif
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
-{
- return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
-{
- return _mm_set_pd(from[1*stride], from[0*stride]);
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
-{
- return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
- }
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
-{
-  to[stride*0] = _mm_cvtss_f32(from);
-  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
-  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
-  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
-{
-  to[stride*0] = _mm_cvtsd_f64(from);
-  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
-{
-  to[stride*0] = _mm_cvtsi128_si32(from);
-  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
-  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
-  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(b, a);
+  return pselect<Packet>(not_nan_mask_a, m, a);
 }
 
-// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
-{
-  Packet4f pa = _mm_set_ss(a);
-  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
 }
-// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
-{
-  Packet2d pa = _mm_set_sd(a);
-  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
+template<>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
 }
 
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-#endif
-
-#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-// Direct of the struct members fixed bug #62.
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif EIGEN_COMP_MSVC_STRICT
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#else
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); }
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right   (const Packet4i& a) { return _mm_srli_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left    (const Packet4i& a) { return _mm_slli_epi32(a,N); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
@@ -488,6 +618,332 @@
   #endif
 }
 
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round.
+  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
+  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
+  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
+  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
+  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+  // Adds and subtracts signum(a) * 2^52 to force rounding.
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d abs_a = pabs(a);
+  Packet2d r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = _mm_cmpgt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet2d tmp  = print<Packet2d>(a);
+  // If greater, subtract one.
+  Packet2d mask = _mm_cmpgt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If smaller, add one.
+  Packet4f mask = _mm_cmplt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet2d tmp  = print<Packet2d>(a);
+  // If smaller, add one.
+  Packet2d mask = _mm_cmplt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool*     from) { EIGEN_DEBUG_ALIGNED_LOAD return  _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+
+#if EIGEN_COMP_MSVC
+  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
+    EIGEN_DEBUG_UNALIGNED_LOAD
+    #if (EIGEN_COMP_MSVC==1600)
+    // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
+    // (i.e., it does not generate an unaligned load!!
+    __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
+    res = _mm_loadh_pi(res, (const __m64*)(from+2));
+    return res;
+    #else
+    return _mm_loadu_ps(from);
+    #endif
+  }
+#else
+// NOTE: with the code below, MSVC's compiler crashes!
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_ps(from);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_pd(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool*     from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
+{ return pset1<Packet2d>(from[0]); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i tmp;
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
+  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+}
+
+// Loads 8 bools from memory and returns the packet
+// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
+template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool*     from)
+{
+  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+  return  _mm_unpacklo_epi8(tmp, tmp);
+}
+
+// Loads 4 bools from memory and returns the packet
+// {b0, b0  b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
+template<> EIGEN_STRONG_INLINE Packet16b
+ploadquad<Packet16b>(const bool* from) {
+  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+  tmp = _mm_unpacklo_epi8(tmp, tmp);
+  return  _mm_unpacklo_epi16(tmp, tmp);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+ return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+ return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
+{
+  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
+                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
+                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
+{
+  to[4*stride*0] = _mm_cvtsi128_si32(from);
+  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+
+
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
+{
+  Packet4f pa = _mm_set_ss(a);
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
+}
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
+{
+  Packet2d pa = _mm_set_sd(a);
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
+}
+
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
+
+#ifndef EIGEN_VECTORIZE_AVX
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+// Direct of the struct members fixed bug #62.
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#elif EIGEN_COMP_MSVC_STRICT
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#else
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+#endif
+template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm_shuffle_epi8(a, mask);
+#else
+  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  const Packet2d cst_exp_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
+  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a,exponent);
+}
+
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
+  
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+  
+  // Split 2^e into four factors and multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  Packet4i b = parithmetic_shift_right<2>(ei);  // floor(e/4)
+  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(ei, b), b), b);  // e - 3b
+  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^(e - 3b)
+  out = pmul(out, c);  // a * 2^e
+  return out;
+}
+
 // with AVX, the default implementations based on pload1 are faster
 #ifndef __AVX__
 template<> EIGEN_STRONG_INLINE void
@@ -528,90 +984,53 @@
   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
 }
 
-#ifdef EIGEN_VECTORIZE_SSE3
-// TODO implement SSE2 versions as well as integer versions
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
-}
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_hadd_pd(vecs[0], vecs[1]);
-}
-// SSSE3 version:
-// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
-// {
-//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-// }
-
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
-  Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
-
-// SSSE3 version:
-// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
-// {
-//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
-//   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
-// }
-#else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
+//   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
+// #else
   Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+// #endif
 }
+
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
-  Packet4f tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
-  tmp0 = _mm_add_ps(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
-  tmp1 = _mm_add_ps(tmp1, tmp2);
-  tmp2 = _mm_movehl_ps(tmp1, tmp0);
-  tmp0 = _mm_movelh_ps(tmp0, tmp1);
-  return _mm_add_ps(tmp0, tmp2);
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);
+  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
-}
-#endif  // SSE3
-
+#else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
+#endif
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  Packet4i tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-  tmp0 = _mm_add_epi32(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
-  tmp1 = _mm_add_epi32(tmp1, tmp2);
-  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
-  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
-  return _mm_add_epi32(tmp0, tmp2);
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
 }
 
 // Other reduction functions:
 
+
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
@@ -629,7 +1048,13 @@
   // TODO try to call _mm_mul_epu32 directly
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
+}
+
+template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
+  return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
+          (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
 }
 
 // min
@@ -684,127 +1109,30 @@
 #endif // EIGEN_VECTORIZE_SSE4_1
 }
 
-#if EIGEN_COMP_GNUC
-// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
 // {
-//   Packet4f res = b;
-//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
-//   return res;
+//   return _mm_movemask_ps(x) == 0xF;
 // }
-// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
-// {
-//   Packet4i res = a;
-//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
-//   return res;
-// }
-#endif
 
-#ifdef EIGEN_VECTORIZE_SSSE3
-// SSSE3 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
 {
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset!=0)
-      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
-  }
-};
+  return _mm_movemask_ps(x) != 0x0;
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset!=0)
-      first = _mm_alignr_epi8(second,first, Offset*4);
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset==1)
-      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
-  }
-};
-#else
-// SSE2 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_move_ss(first,second);
-      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
-    }
-    else if (Offset==2)
-    {
-      first = _mm_movehl_ps(first,first);
-      first = _mm_movelh_ps(first,second);
-    }
-    else if (Offset==3)
-    {
-      first = _mm_move_ss(first,second);
-      first = _mm_shuffle_ps(first,second,0x93);
-    }
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-      first = _mm_shuffle_epi32(first,0x39);
-    }
-    else if (Offset==2)
-    {
-      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
-      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-    }
-    else if (Offset==3)
-    {
-      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
-    }
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
-      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
-    }
-  }
-};
-#endif
-
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4f,4>& kernel) {
   _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2d,2>& kernel) {
   __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
   kernel.packet[1] = tmp;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4i,4>& kernel) {
   __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
   __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
@@ -817,6 +1145,100 @@
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,4>& kernel) {
+  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
+  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
+  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
+  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,16>& kernel) {
+  // If we number the elements in the input thus:
+  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
+  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
+  // ...
+  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
+  //
+  // the desired output is:
+  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
+  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
+  // ...
+  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
+  __m128i t0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  __m128i t1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  __m128i t2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ...                     27 37
+  __m128i t3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ...                     2f 3f
+  __m128i t4 =  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52                         47 57
+  __m128i t5 =  _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
+  __m128i t6 =  _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t7 =  _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t8 =  _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i t9 =  _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i ta =  _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tb =  _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tc =  _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i td =  _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i te =  _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i tf =  _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+
+  __m128i s0 =  _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  __m128i s1 =  _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
+  __m128i s2 =  _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
+  __m128i s3 =  _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
+  __m128i s4 =  _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  __m128i s5 =  _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
+  __m128i s6 =  _mm_unpacklo_epi16(t5, t7);
+  __m128i s7 =  _mm_unpackhi_epi16(t5, t7);
+  __m128i s8 =  _mm_unpacklo_epi16(t8, ta);
+  __m128i s9 =  _mm_unpackhi_epi16(t8, ta);
+  __m128i sa =  _mm_unpacklo_epi16(t9, tb);
+  __m128i sb =  _mm_unpackhi_epi16(t9, tb);
+  __m128i sc =  _mm_unpacklo_epi16(tc, te);
+  __m128i sd =  _mm_unpackhi_epi16(tc, te);
+  __m128i se =  _mm_unpacklo_epi16(td, tf);
+  __m128i sf =  _mm_unpackhi_epi16(td, tf);
+
+  __m128i u0 =  _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  __m128i u1 =  _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  __m128i u2 =  _mm_unpacklo_epi32(s1, s5);
+  __m128i u3 =  _mm_unpackhi_epi32(s1, s5);
+  __m128i u4 =  _mm_unpacklo_epi32(s2, s6);
+  __m128i u5 =  _mm_unpackhi_epi32(s2, s6);
+  __m128i u6 =  _mm_unpacklo_epi32(s3, s7);
+  __m128i u7 =  _mm_unpackhi_epi32(s3, s7);
+  __m128i u8 =  _mm_unpacklo_epi32(s8, sc);
+  __m128i u9 =  _mm_unpackhi_epi32(s8, sc);
+  __m128i ua =  _mm_unpacklo_epi32(s9, sd);
+  __m128i ub =  _mm_unpackhi_epi32(s9, sd);
+  __m128i uc =  _mm_unpacklo_epi32(sa, se);
+  __m128i ud =  _mm_unpackhi_epi32(sa, se);
+  __m128i ue =  _mm_unpacklo_epi32(sb, sf);
+  __m128i uf =  _mm_unpackhi_epi32(sb, sf);
+
+  kernel.packet[0]  = _mm_unpacklo_epi64(u0, u8);
+  kernel.packet[1]  = _mm_unpackhi_epi64(u0, u8);
+  kernel.packet[2]  = _mm_unpacklo_epi64(u1, u9);
+  kernel.packet[3]  = _mm_unpackhi_epi64(u1, u9);
+  kernel.packet[4]  = _mm_unpacklo_epi64(u2, ua);
+  kernel.packet[5]  = _mm_unpackhi_epi64(u2, ua);
+  kernel.packet[6]  = _mm_unpacklo_epi64(u3, ub);
+  kernel.packet[7]  = _mm_unpackhi_epi64(u3, ub);
+  kernel.packet[8]  = _mm_unpacklo_epi64(u4, uc);
+  kernel.packet[9]  = _mm_unpackhi_epi64(u4, uc);
+  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
+  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
+  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
+  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
+  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
+  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
@@ -837,7 +1259,6 @@
   return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
 #endif
 }
-
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
   const __m128d zero = _mm_setzero_pd();
   const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
@@ -849,8 +1270,236 @@
 #endif
 }
 
+// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a,b,c);
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a,b,c);
+}
+#endif
+
+
+// Packet math for Eigen::half
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+typedef struct {
+  __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha - hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha / hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
+{
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
+{
+  __int64_t a = _mm_cvtm64_si64(from.x);
+  to[stride*0].x = static_cast<unsigned short>(a);
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
+}
+
+#endif
+
+
 } // end namespace internal
 
 } // end namespace Eigen
 
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+// PGI++ does not define the following intrinsics in C++ mode.
+static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
+#endif
+
 #endif // EIGEN_PACKET_MATH_SSE_H

diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c848932..d2a0037 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h

@@ -14,6 +14,7 @@
 
 namespace internal {
 
+#ifndef EIGEN_VECTORIZE_AVX
 template <>
 struct type_casting_traits<float, int> {
   enum {
@@ -23,11 +24,6 @@
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
-  return _mm_cvttps_epi32(a);
-}
-
-
 template <>
 struct type_casting_traits<int, float> {
   enum {
@@ -37,11 +33,6 @@
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
-  return _mm_cvtepi32_ps(a);
-}
-
-
 template <>
 struct type_casting_traits<double, float> {
   enum {
@@ -51,10 +42,6 @@
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
-  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
-
 template <>
 struct type_casting_traits<float, double> {
   enum {
@@ -63,12 +50,90 @@
     TgtCoeffRatio = 2
   };
 };
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
   // Simply discard the second half of the input
   return _mm_cvtps_pd(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
+  float f1 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  float f2 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  float f3 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  float f4 = static_cast<float>(h);
+  return _mm_set_ps(f4, f3, f2, f1);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+  EIGEN_ALIGN16 float aux[4];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
+  return result;
+}
+
+#endif
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/arch/SVE/MathFunctions.h b/Eigen/src/Core/arch/SVE/MathFunctions.h
new file mode 100644
index 0000000..b139ea2
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/MathFunctions.h

@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_SVE_H
+#define EIGEN_MATH_FUNCTIONS_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp<PacketXf>(const PacketXf& x) {
+  return pexp_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog<PacketXf>(const PacketXf& x) {
+  return plog_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin<PacketXf>(const PacketXf& x) {
+  return psin_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos<PacketXf>(const PacketXf& x) {
+  return pcos_float(x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh<PacketXf>(const PacketXf& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SVE_H

diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
new file mode 100644
index 0000000..9060b37
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h

@@ -0,0 +1,752 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SVE_H
+#define EIGEN_PACKET_MATH_SVE_H
+
+namespace Eigen
+{
+namespace internal
+{
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
+template <typename Scalar, int SVEVectorLength>
+struct sve_packet_size_selector {
+  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+/********************************* int32 **************************************/
+typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef PacketXi type;
+  typedef PacketXi half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0  // Not implemented in SVE
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXi> {
+  typedef numext::int32_t type;
+  typedef PacketXi half;  // Half not yet implemented
+  enum {
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
+{
+  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
+{
+  return svdup_n_s32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
+{
+  numext::int32_t c[packet_traits<numext::int32_t>::size];
+  for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
+  return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svadd_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svsub_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
+{
+  return svneg_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
+{
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmul_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdiv_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
+{
+  return svmla_s32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmin_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmax_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
+{
+  return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
+{
+  return svdup_n_s32_z(svptrue_b32(), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svand_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svorr_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return sveor_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svbic_s32_z(svptrue_b32(), a, b);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
+{
+  return svasrd_n_s32_z(svptrue_b32(), a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
+{
+  return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
+{
+  return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
+{
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_s32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
+{
+  return svrev_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
+{
+  return svabs_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
+{
+  return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
+{
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+
+  // Multiply the vector by its reverse
+  svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
+  svint32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_s32(prod, svindex_u32(32, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_s32(prod, svindex_u32(16, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_s32(prod, svindex_u32(8, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_s32(prod, svindex_u32(4, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
+  prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXi>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
+{
+  return svminv_s32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
+{
+  return svmaxv_s32(svptrue_b32(), a);
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
+  int buffer[packet_traits<numext::int32_t>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
+  }
+}
+
+/********************************* float32 ************************************/
+
+typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef PacketXf type;
+  typedef PacketXf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,  // Not implemented in SVE
+
+    HasDiv = 1,
+    HasFloor = 1,
+
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXf> {
+  typedef float type;
+  typedef PacketXf half;  // Half not yet implemented
+  typedef PacketXi integer_packet;
+
+  enum {
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
+{
+  return svdup_n_f32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
+{
+  float c[packet_traits<float>::size];
+  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
+  return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svadd_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svsub_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
+{
+  return svneg_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
+{
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmul_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svdiv_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
+{
+  return svmla_f32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmin_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return pmin<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svminnm_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmax_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return pmax<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmaxnm_f32_z(svptrue_b32(), a, b);
+}
+
+// Float comparisons in SVE return svbool (predicate). Use svdup to set active
+// lanes to 1 (0xffffffffu) and inactive lanes to 0.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+// Do a predicate inverse (svnot_b_z) on the predicate resulted from the
+// greater/equal comparison (svcmpge_f32). Then fill a float vector with the
+// active elements.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
+{
+  return svrintm_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
+{
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_f32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
+{
+  return svrev_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
+{
+  return svabs_f32_z(svptrue_b32(), a);
+}
+
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for 
+// all vector extensions and the generic version.
+template <>
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
+{
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
+{
+  return svaddv_f32(svptrue_b32(), a);
+}
+
+// Other reduction functions:
+// mul
+// Only works for SVE Vls multiple of 128
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
+{
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+  // Multiply the vector by its reverse
+  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
+  svfloat32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
+  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXf>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
+{
+  return svminv_f32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
+{
+  return svmaxv_f32(svptrue_b32(), a);
+}
+
+template<int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
+{
+  float buffer[packet_traits<float>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
+  }
+}
+
+template<>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
+{
+  return pldexp_generic(a, exponent);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SVE_H

diff --git a/Eigen/src/Core/arch/SVE/TypeCasting.h b/Eigen/src/Core/arch/SVE/TypeCasting.h
new file mode 100644
index 0000000..7ba5d9c
--- /dev/null
+++ b/Eigen/src/Core/arch/SVE/TypeCasting.h

@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SVE_H
+#define EIGEN_TYPE_CASTING_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+  return svcvt_f32_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+  return svcvt_s32_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+  return svreinterpret_f32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+  return svreinterpret_s32_f32(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SVE_H

diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
new file mode 100644
index 0000000..10856ff
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h

@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * InteropHeaders.h
+ *
+ * \brief:
+ *  InteropHeaders
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
+#define EIGEN_INTEROP_HEADERS_SYCL_H
+
+namespace Eigen {
+
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+
+namespace internal {
+
+template <int has_blend, int lengths>
+struct sycl_packet_traits : default_packet_traits {
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = lengths,
+    HasHalfPacket = 0,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasSin = 1,
+    HasCos = 1,
+    HasTan = 1,
+    HasASin = 1,
+    HasACos = 1,
+    HasATan = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasTanh = 1,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasNdtri = 0,
+    HasIGamma = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,
+    HasBlend = has_blend,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    HasCmp = 1,
+    HasMax = 1,
+    HasMin = 1,
+    HasMul = 1,
+    HasAdd = 1,
+    HasFloor = 1,
+    HasRound = 1,
+    HasRint = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasCeil = 1,
+  };
+};
+
+#ifdef SYCL_DEVICE_ONLY
+#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
+  template <>                                                              \
+  struct packet_traits<unpacket_type>                                      \
+      : sycl_packet_traits<has_blend, lengths> {                           \
+    typedef packet_type type;                                              \
+    typedef packet_type half;                                              \
+  };
+
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
+#undef SYCL_PACKET_TRAITS
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#define SYCL_ARITHMETIC(packet_type)  \
+  template <>                         \
+  struct is_arithmetic<packet_type> { \
+    enum { value = true };            \
+  };
+SYCL_ARITHMETIC(cl::sycl::cl_float4)
+SYCL_ARITHMETIC(cl::sycl::cl_double2)
+#undef SYCL_ARITHMETIC
+
+#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)        \
+  template <>                                                            \
+  struct unpacket_traits<packet_type> {                                  \
+    typedef unpacket_type type;                                          \
+    enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
+    typedef packet_type half;                                            \
+  };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
+
+#undef SYCL_UNPACKET_TRAITS
+#endif
+
+}  // end namespace internal
+
+#endif
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename PacketReturnType, int PacketSize>
+struct PacketWrapper;
+// This function should never get called on the device
+#ifndef SYCL_DEVICE_ONLY
+template <typename PacketReturnType, int PacketSize>
+struct PacketWrapper {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
+    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
+    abort();
+  }
+  EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in,
+                                                                   Scalar) {
+    return ::Eigen::internal::template plset<PacketReturnType>(in);
+  }
+  EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
+    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
+    abort();
+  }
+};
+
+#elif defined(SYCL_DEVICE_ONLY)
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 4> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+    switch (index) {
+      case 0:
+        return in.x();
+      case 1:
+        return in.y();
+      case 2:
+        return in.z();
+      case 3:
+        return in.w();
+      default:
+      //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. 
+      // The code will never reach here
+      __builtin_unreachable();
+    }
+    __builtin_unreachable();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
+      Scalar in, Scalar other) {
+    return PacketReturnType(in, other, other, other);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]);
+  }
+};
+
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 1> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
+    return in;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in,
+                                                                   Scalar) {
+    return PacketReturnType(in);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = rhs[0];
+  }
+};
+
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 2> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+    switch (index) {
+      case 0:
+        return in.x();
+      case 1:
+        return in.y();
+      default:
+        //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. 
+      // The code will never reach here
+        __builtin_unreachable();
+    }
+    __builtin_unreachable();
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
+      Scalar in, Scalar other) {
+    return PacketReturnType(in, other);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = PacketReturnType(rhs[0], rhs[1]);
+  }
+};
+
+#endif
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+}  // end namespace Eigen
+
+#endif  // EIGEN_INTEROP_HEADERS_SYCL_H

diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
new file mode 100644
index 0000000..2ab0f2a
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h

@@ -0,0 +1,301 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * MathFunctions.h
+ *
+ * \brief:
+ *  MathFunctions
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
+#define EIGEN_MATH_FUNCTIONS_SYCL_H
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(SYCL_DEVICE_ONLY)
+#define SYCL_PLOG(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::log(a);                                           \
+  }
+
+SYCL_PLOG(cl::sycl::cl_float4)
+SYCL_PLOG(cl::sycl::cl_double2)
+#undef SYCL_PLOG
+
+#define SYCL_PLOG1P(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::log1p(a);                                           \
+  }
+
+SYCL_PLOG1P(cl::sycl::cl_float4)
+SYCL_PLOG1P(cl::sycl::cl_double2)
+#undef SYCL_PLOG1P
+
+#define SYCL_PLOG10(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::log10(a);                                           \
+  }
+
+SYCL_PLOG10(cl::sycl::cl_float4)
+SYCL_PLOG10(cl::sycl::cl_double2)
+#undef SYCL_PLOG10
+
+#define SYCL_PEXP(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::exp(a);                                           \
+  }
+
+SYCL_PEXP(cl::sycl::cl_float4)
+SYCL_PEXP(cl::sycl::cl_float)
+SYCL_PEXP(cl::sycl::cl_double2)
+#undef SYCL_PEXP
+
+#define SYCL_PEXPM1(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::expm1(a);                                           \
+  }
+
+SYCL_PEXPM1(cl::sycl::cl_float4)
+SYCL_PEXPM1(cl::sycl::cl_double2)
+#undef SYCL_PEXPM1
+
+#define SYCL_PSQRT(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::sqrt(a);                                           \
+  }
+
+SYCL_PSQRT(cl::sycl::cl_float4)
+SYCL_PSQRT(cl::sycl::cl_double2)
+#undef SYCL_PSQRT
+
+#define SYCL_PRSQRT(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::rsqrt(a);                                           \
+  }
+
+SYCL_PRSQRT(cl::sycl::cl_float4)
+SYCL_PRSQRT(cl::sycl::cl_double2)
+#undef SYCL_PRSQRT
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSIN(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::sin(a);                                           \
+  }
+
+SYCL_PSIN(cl::sycl::cl_float4)
+SYCL_PSIN(cl::sycl::cl_double2)
+#undef SYCL_PSIN
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOS(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::cos(a);                                           \
+  }
+
+SYCL_PCOS(cl::sycl::cl_float4)
+SYCL_PCOS(cl::sycl::cl_double2)
+#undef SYCL_PCOS
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTAN(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::tan(a);                                           \
+  }
+
+SYCL_PTAN(cl::sycl::cl_float4)
+SYCL_PTAN(cl::sycl::cl_double2)
+#undef SYCL_PTAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PASIN(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::asin(a);                                           \
+  }
+
+SYCL_PASIN(cl::sycl::cl_float4)
+SYCL_PASIN(cl::sycl::cl_double2)
+#undef SYCL_PASIN
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PACOS(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::acos(a);                                           \
+  }
+
+SYCL_PACOS(cl::sycl::cl_float4)
+SYCL_PACOS(cl::sycl::cl_double2)
+#undef SYCL_PACOS
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PATAN(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::atan(a);                                           \
+  }
+
+SYCL_PATAN(cl::sycl::cl_float4)
+SYCL_PATAN(cl::sycl::cl_double2)
+#undef SYCL_PATAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSINH(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::sinh(a);                                           \
+  }
+
+SYCL_PSINH(cl::sycl::cl_float4)
+SYCL_PSINH(cl::sycl::cl_double2)
+#undef SYCL_PSINH
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOSH(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::cosh(a);                                           \
+  }
+
+SYCL_PCOSH(cl::sycl::cl_float4)
+SYCL_PCOSH(cl::sycl::cl_double2)
+#undef SYCL_PCOSH
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTANH(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::tanh(a);                                           \
+  }
+
+SYCL_PTANH(cl::sycl::cl_float4)
+SYCL_PTANH(cl::sycl::cl_double2)
+#undef SYCL_PTANH
+
+#define SYCL_PCEIL(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::ceil(a);                                           \
+  }
+
+SYCL_PCEIL(cl::sycl::cl_float4)
+SYCL_PCEIL(cl::sycl::cl_double2)
+#undef SYCL_PCEIL
+
+#define SYCL_PROUND(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::round(a);                                           \
+  }
+
+SYCL_PROUND(cl::sycl::cl_float4)
+SYCL_PROUND(cl::sycl::cl_double2)
+#undef SYCL_PROUND
+
+#define SYCL_PRINT(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::rint(a);                                           \
+  }
+
+SYCL_PRINT(cl::sycl::cl_float4)
+SYCL_PRINT(cl::sycl::cl_double2)
+#undef SYCL_PRINT
+
+#define SYCL_FLOOR(packet_type)                                          \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::floor(a);                                           \
+  }
+
+SYCL_FLOOR(cl::sycl::cl_float4)
+SYCL_FLOOR(cl::sycl::cl_double2)
+#undef SYCL_FLOOR
+
+#define SYCL_PMIN(packet_type, expr)                                   \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>( \
+      const packet_type& a, const packet_type& b) {                    \
+    return expr;                                                       \
+  }
+
+SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
+SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
+#undef SYCL_PMIN
+
+#define SYCL_PMAX(packet_type, expr)                                   \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>( \
+      const packet_type& a, const packet_type& b) {                    \
+    return expr;                                                       \
+  }
+
+SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
+SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
+#undef SYCL_PMAX
+
+#define SYCL_PLDEXP(packet_type)                                             \
+  template <>                                                                \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(                  \
+      const packet_type& a, const packet_type& exponent) {                   \
+    return cl::sycl::ldexp(                                                  \
+        a, exponent.template convert<cl::sycl::cl_int,                       \
+                                     cl::sycl::rounding_mode::automatic>()); \
+  }
+
+SYCL_PLDEXP(cl::sycl::cl_float4)
+SYCL_PLDEXP(cl::sycl::cl_double2)
+#undef SYCL_PLDEXP
+
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SYCL_H

diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
new file mode 100644
index 0000000..87badc0
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h

@@ -0,0 +1,670 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * PacketMath.h
+ *
+ * \brief:
+ *  PacketMath
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_PACKET_MATH_SYCL_H
+#define EIGEN_PACKET_MATH_SYCL_H
+#include <type_traits>
+namespace Eigen {
+
+namespace internal {
+#ifdef SYCL_DEVICE_ONLY
+
+#define SYCL_PLOADT_RO(address_space_target)                                 \
+  template <typename packet_type, int Alignment>                             \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro(               \
+      typename cl::sycl::multi_ptr<                                          \
+          const typename unpacket_traits<packet_type>::type,                 \
+          cl::sycl::access::address_space::address_space_target>::pointer_t  \
+          from) {                                                            \
+    typedef typename unpacket_traits<packet_type>::type scalar;              \
+    typedef cl::sycl::multi_ptr<                                             \
+        scalar, cl::sycl::access::address_space::address_space_target>       \
+        multi_ptr;                                                           \
+    auto res = packet_type(                                                  \
+        static_cast<typename unpacket_traits<packet_type>::type>(0));        \
+    res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
+    return res;                                                              \
+  }
+
+SYCL_PLOADT_RO(global_space)
+SYCL_PLOADT_RO(local_space)
+#undef SYCL_PLOADT_RO
+#endif
+
+template <typename packet_type, int Alignment, typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
+          cl::sycl::access::mode::read_write, T>& from) {
+  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
+}
+
+#ifdef SYCL_DEVICE_ONLY
+#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)            \
+  template <typename packet_type>                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType(     \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from) {                                                           \
+    return ploadt_ro<packet_type, Alignment>(from);                         \
+  }
+
+// global space
+SYCL_PLOAD(global_space, Unaligned, u)
+SYCL_PLOAD(global_space, Aligned, )
+// local space
+SYCL_PLOAD(local_space, Unaligned, u)
+SYCL_PLOAD(local_space, Aligned, )
+
+#undef SYCL_PLOAD
+#endif
+
+#define SYCL_PLOAD(Alignment, AlignedType)                              \
+  template <typename packet_type>                                       \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
+      const Eigen::TensorSycl::internal::RangeAccess<                   \
+          cl::sycl::access::mode::read_write,                           \
+          typename unpacket_traits<packet_type>::type>                  \
+          from) {                                                       \
+    return ploadt_ro<packet_type, Alignment>(from);                     \
+  }
+SYCL_PLOAD(Unaligned, u)
+SYCL_PLOAD(Aligned, )
+#undef SYCL_PLOAD
+
+#ifdef SYCL_DEVICE_ONLY
+/** \internal \returns a packet version of \a *from.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+#define SYCL_PLOADT(address_space_target)                                   \
+  template <typename packet_type, int Alignment>                            \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(                 \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from) {                                                           \
+    if (Alignment >= unpacket_traits<packet_type>::alignment)               \
+      return pload<packet_type>(from);                                      \
+    else                                                                    \
+      return ploadu<packet_type>(from);                                     \
+  }
+
+// global space
+SYCL_PLOADT(global_space)
+// local space
+SYCL_PLOADT(local_space)
+#undef SYCL_PLOADT
+#endif
+
+template <typename packet_type, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt(const Eigen::TensorSycl::internal::RangeAccess<
+       cl::sycl::access::mode::read_write,
+       typename unpacket_traits<packet_type>::type>& from) {
+  return ploadt<packet_type, Alignment>(from.get_pointer());
+}
+#ifdef SYCL_DEVICE_ONLY
+
+// private_space
+#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)                 \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type                    \
+  ploadt_ro<packet_type, Alignment>(                                   \
+      const typename unpacket_traits<packet_type>::type* from) {       \
+    typedef typename unpacket_traits<packet_type>::type scalar;        \
+    auto res = packet_type(static_cast<scalar>(0));                    \
+    res.template load<cl::sycl::access::address_space::private_space>( \
+        0, const_cast<scalar*>(from));                                 \
+    return res;                                                        \
+  }
+
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
+
+#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)                    \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
+      const typename unpacket_traits<packet_type>::type* from) {           \
+    typedef typename unpacket_traits<packet_type>::type scalar;            \
+    auto res = packet_type(static_cast<scalar>(0));                        \
+    res.template load<cl::sycl::access::address_space::private_space>(     \
+        0, const_cast<scalar*>(from));                                     \
+    return res;                                                            \
+  }
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
+
+#undef SYCL_PLOAD_SPECIAL
+
+#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)   \
+  template <>                                                               \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(             \
+      typename cl::sycl::multi_ptr<                                         \
+          scalar,                                                           \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          to,                                                               \
+      const packet_type& from) {                                            \
+    typedef cl::sycl::multi_ptr<                                            \
+        scalar, cl::sycl::access::address_space::address_space_target>      \
+        multi_ptr;                                                          \
+    from.store(0, multi_ptr(to));                                           \
+  }
+
+// global space
+SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
+SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
+
+SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
+#undef SYCL_PSTORE
+
+#define SYCL_PSTORE_T(address_space_target)                                 \
+  template <typename scalar, typename packet_type, int Alignment>           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(                       \
+      typename cl::sycl::multi_ptr<                                         \
+          scalar,                                                           \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          to,                                                               \
+      const packet_type& from) {                                            \
+    if (Alignment)                                                          \
+      pstore(to, from);                                                     \
+    else                                                                    \
+      pstoreu(to, from);                                                    \
+  }
+
+SYCL_PSTORE_T(global_space)
+
+SYCL_PSTORE_T(local_space)
+
+#undef SYCL_PSTORE_T
+
+#define SYCL_PSET1(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
+      const typename unpacket_traits<packet_type>::type& from) {        \
+    return packet_type(from);                                           \
+  }
+
+// global space
+SYCL_PSET1(cl::sycl::cl_float4)
+SYCL_PSET1(cl::sycl::cl_double2)
+
+#undef SYCL_PSET1
+
+template <typename packet_type>
+struct get_base_packet {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
+  get_ploaddup(sycl_multi_pointer) {}
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
+  get_pgather(sycl_multi_pointer, Index) {}
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_float4> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_float4(from[0 * stride], from[1 * stride],
+                               from[2 * stride], from[3 * stride]);
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) {
+    auto tmp = stride;
+    to[0] = from.x();
+    to[tmp] = from.y();
+    to[tmp += stride] = from.z();
+    to[tmp += stride] = from.w();
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(
+      const float& a) {
+    return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1),
+                               static_cast<float>(a + 2),
+                               static_cast<float>(a + 3));
+  }
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_double2> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2
+  get_ploaddup(const sycl_multi_pointer from) {
+    return cl::sycl::cl_double2(from[0], from[0]);
+  }
+
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(
+      const sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]);
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) {
+    to[0] = from.x();
+    to[stride] = from.y();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(
+      const double& a) {
+    return cl::sycl::cl_double2(static_cast<double>(a),
+                                static_cast<double>(a + 1));
+  }
+};
+
+#define SYCL_PLOAD_DUP(address_space_target)                                \
+  template <typename packet_type>                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup(               \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from) {                                                           \
+    return get_base_packet<packet_type>::get_ploaddup(from);                \
+  }
+
+// global space
+SYCL_PLOAD_DUP(global_space)
+// local_space
+SYCL_PLOAD_DUP(local_space)
+#undef SYCL_PLOAD_DUP
+
+#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)                              \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
+      const typename unpacket_traits<packet_type>::type* from) {           \
+    return get_base_packet<packet_type>::get_ploaddup(from);               \
+  }
+
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
+
+#undef SYCL_PLOAD_DUP_SPECILIZE
+
+#define SYCL_PLSET(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>( \
+      const typename unpacket_traits<packet_type>::type& a) {           \
+    return get_base_packet<packet_type>::set_plset(a);                  \
+  }
+
+SYCL_PLSET(cl::sycl::cl_float4)
+SYCL_PLSET(cl::sycl::cl_double2)
+
+#undef SYCL_PLSET
+
+#define SYCL_PGATHER(address_space_target)                                  \
+  template <typename Scalar, typename packet_type>                          \
+  EIGEN_DEVICE_FUNC inline packet_type pgather(                             \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from,                                                             \
+      Index stride) {                                                       \
+    return get_base_packet<packet_type>::get_pgather(from, stride);         \
+  }
+
+// global space
+SYCL_PGATHER(global_space)
+// local space
+SYCL_PGATHER(local_space)
+
+#undef SYCL_PGATHER
+
+#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                            \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type                            \
+  pgather<scalar, packet_type>(                                                \
+      const typename unpacket_traits<packet_type>::type* from, Index stride) { \
+    return get_base_packet<packet_type>::get_pgather(from, stride);            \
+  }
+
+SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
+SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PGATHER_SPECILIZE
+
+#define SYCL_PSCATTER(address_space_target)                                 \
+  template <typename Scalar, typename packet_type>                          \
+  EIGEN_DEVICE_FUNC inline void pscatter(                                   \
+      typename cl::sycl::multi_ptr<                                         \
+          typename unpacket_traits<packet_type>::type,                      \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          to,                                                               \
+      const packet_type& from, Index stride) {                              \
+    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
+  }
+
+// global space
+SYCL_PSCATTER(global_space)
+// local space
+SYCL_PSCATTER(local_space)
+
+#undef SYCL_PSCATTER
+
+#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                        \
+  template <>                                                               \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
+      typename unpacket_traits<packet_type>::type * to,                     \
+      const packet_type& from, Index stride) {                              \
+    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
+  }
+
+SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
+SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PSCATTER_SPECILIZE
+
+#define SYCL_PMAD(packet_type)                                            \
+  template <>                                                             \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(                \
+      const packet_type& a, const packet_type& b, const packet_type& c) { \
+    return cl::sycl::mad(a, b, c);                                        \
+  }
+
+SYCL_PMAD(cl::sycl::cl_float4)
+SYCL_PMAD(cl::sycl::cl_double2)
+#undef SYCL_PMAD
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return a.x();
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return a.x();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return a.x() + a.y() + a.z() + a.w();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return a.x() + a.y();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
+                        cl::sycl::fmax(a.z(), a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return cl::sycl::fmax(a.x(), a.y());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
+                        cl::sycl::fmin(a.z(), a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return cl::sycl::fmin(a.x(), a.y());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return a.x() * a.y() * a.z() * a.w();
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return a.x() * a.y();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
+                             cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
+pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a,
+                                                          const Packet &b) {
+  return ((a <= b)
+              .template convert<typename unpacket_traits<Packet>::type,
+                                cl::sycl::rounding_mode::automatic>());
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a,
+                                                          const Packet &b) {
+  return ((a < b)
+              .template convert<typename unpacket_traits<Packet>::type,
+                                cl::sycl::rounding_mode::automatic>());
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
+                                                          const Packet &b) {
+  return ((a == b)
+              .template convert<typename unpacket_traits<Packet>::type,
+                                cl::sycl::rounding_mode::automatic>());
+}
+
+#define SYCL_PCMP(OP, TYPE)                                                    \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE &a,    \
+                                                             const TYPE &b) {  \
+    return sycl_pcmp_##OP<TYPE>(a, b);                                         \
+  }
+
+SYCL_PCMP(le, cl::sycl::cl_float4)
+SYCL_PCMP(lt, cl::sycl::cl_float4)
+SYCL_PCMP(eq, cl::sycl::cl_float4)
+SYCL_PCMP(le, cl::sycl::cl_double2)
+SYCL_PCMP(lt, cl::sycl::cl_double2)
+SYCL_PCMP(eq, cl::sycl::cl_double2)
+#undef SYCL_PCMP
+
+template <typename T> struct convert_to_integer;
+
+template <> struct convert_to_integer<float> {
+  using type = std::int32_t;
+  using packet_type = cl::sycl::cl_int4;
+};
+template <> struct convert_to_integer<double> {
+  using type = std::int64_t;
+  using packet_type = cl::sycl::cl_long2;
+};
+
+template <typename PacketIn>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer<
+    typename unpacket_traits<PacketIn>::type>::packet_type
+vector_as_int(const PacketIn &p) {
+  return (
+      p.template convert<typename convert_to_integer<
+                             typename unpacket_traits<PacketIn>::type>::type,
+                         cl::sycl::rounding_mode::automatic>());
+}
+
+template <typename packetOut, typename PacketIn>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut
+convert_vector(const PacketIn &p) {
+  return (p.template convert<typename unpacket_traits<packetOut>::type,
+                             cl::sycl::rounding_mode::automatic>());
+}
+
+#define SYCL_PAND(TYPE)                                                        \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand<TYPE>(const TYPE &a,         \
+                                                        const TYPE &b) {       \
+    return convert_vector<TYPE>(vector_as_int(a) & vector_as_int(b));          \
+  }
+SYCL_PAND(cl::sycl::cl_float4)
+SYCL_PAND(cl::sycl::cl_double2)
+#undef SYCL_PAND
+
+#define SYCL_POR(TYPE)                                                         \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por<TYPE>(const TYPE &a,          \
+                                                       const TYPE &b) {        \
+    return convert_vector<TYPE>(vector_as_int(a) | vector_as_int(b));          \
+  }
+
+SYCL_POR(cl::sycl::cl_float4)
+SYCL_POR(cl::sycl::cl_double2)
+#undef SYCL_POR
+
+#define SYCL_PXOR(TYPE)                                                        \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor<TYPE>(const TYPE &a,         \
+                                                        const TYPE &b) {       \
+    return convert_vector<TYPE>(vector_as_int(a) ^ vector_as_int(b));          \
+  }
+
+SYCL_PXOR(cl::sycl::cl_float4)
+SYCL_PXOR(cl::sycl::cl_double2)
+#undef SYCL_PXOR
+
+#define SYCL_PANDNOT(TYPE)                                                     \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot<TYPE>(const TYPE &a,      \
+                                                           const TYPE &b) {    \
+    return convert_vector<TYPE>(vector_as_int(a) & (~vector_as_int(b)));       \
+  }
+SYCL_PANDNOT(cl::sycl::cl_float4)
+SYCL_PANDNOT(cl::sycl::cl_double2)
+#undef SYCL_PANDNOT
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
+  float tmp = kernel.packet[0].y();
+  kernel.packet[0].y() = kernel.packet[1].x();
+  kernel.packet[1].x() = tmp;
+
+  tmp = kernel.packet[0].z();
+  kernel.packet[0].z() = kernel.packet[2].x();
+  kernel.packet[2].x() = tmp;
+
+  tmp = kernel.packet[0].w();
+  kernel.packet[0].w() = kernel.packet[3].x();
+  kernel.packet[3].x() = tmp;
+
+  tmp = kernel.packet[1].z();
+  kernel.packet[1].z() = kernel.packet[2].y();
+  kernel.packet[2].y() = tmp;
+
+  tmp = kernel.packet[1].w();
+  kernel.packet[1].w() = kernel.packet[3].y();
+  kernel.packet[3].y() = tmp;
+
+  tmp = kernel.packet[2].w();
+  kernel.packet[2].w() = kernel.packet[3].z();
+  kernel.packet[3].z() = tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
+  double tmp = kernel.packet[0].y();
+  kernel.packet[0].y() = kernel.packet[1].x();
+  kernel.packet[1].x() = tmp;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
+    const cl::sycl::cl_float4& thenPacket,
+    const cl::sycl::cl_float4& elsePacket) {
+  cl::sycl::cl_int4 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template <>
+inline cl::sycl::cl_double2 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
+    const cl::sycl::cl_double2& thenPacket,
+    const cl::sycl::cl_double2& elsePacket) {
+  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
+                               ifPacket.select[1] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+#endif  // SYCL_DEVICE_ONLY
+
+#define SYCL_PSTORE(alignment)                                  \
+  template <typename packet_type>                               \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
+      const Eigen::TensorSycl::internal::RangeAccess<           \
+          cl::sycl::access::mode::read_write,                   \
+          typename unpacket_traits<packet_type>::type>& to,     \
+      const packet_type& from) {                                \
+    pstore##alignment(to.get_pointer(), from);                  \
+  }
+
+// global space
+SYCL_PSTORE()
+SYCL_PSTORE(u)
+
+#undef SYCL_PSTORE
+
+template <typename scalar, typename packet_type, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(
+    Eigen::TensorSycl::internal::RangeAccess<
+        cl::sycl::access::mode::read_write,
+        typename unpacket_traits<packet_type>::type>
+        to,
+    const packet_type& from) {
+  pstoret<scalar, packet_type, Alignment>(to.get_pointer(), from);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SYCL_H

diff --git a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
new file mode 100644
index 0000000..f81e59d
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h

@@ -0,0 +1,694 @@
+/***************************************************************************
+ *  Copyright (C) 2017 Codeplay Software Limited
+ *  This Source Code Form is subject to the terms of the Mozilla
+ *  Public License v. 2.0. If a copy of the MPL was not distributed
+ *  with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ *
+ *  SyclMemoryModel.h
+ *
+ *  Description:
+ *    Interface for SYCL buffers to behave as a non-dereferenceable pointer
+ *    Interface for Placeholder accessor to behave as a pointer on both host
+ *    and device
+ *
+ * Authors:
+ *
+ *    Ruyman Reyes   Codeplay Software Ltd.
+ *    Mehdi Goli     Codeplay Software Ltd.
+ *    Vanya Yaneva   Codeplay Software Ltd.
+ *
+ **************************************************************************/
+
+#if defined(EIGEN_USE_SYCL) && \
+    !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
+
+#include <CL/sycl.hpp>
+#ifdef EIGEN_EXCEPTIONS
+#include <stdexcept>
+#endif
+#include <cstddef>
+#include <queue>
+#include <set>
+#include <unordered_map>
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+using sycl_acc_target = cl::sycl::access::target;
+using sycl_acc_mode = cl::sycl::access::mode;
+
+/**
+ * Default values for template arguments
+ */
+using buffer_data_type_t = uint8_t;
+const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer;
+const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write;
+
+/**
+ * PointerMapper
+ *  Associates fake pointers with buffers.
+ *
+ */
+class PointerMapper {
+ public:
+  using base_ptr_t = std::intptr_t;
+
+  /* Structure of a virtual pointer
+   *
+   * |================================================|
+   * |               POINTER ADDRESS                  |
+   * |================================================|
+   */
+  struct virtual_pointer_t {
+    /* Type for the pointers
+     */
+    base_ptr_t m_contents;
+
+    /** Conversions from virtual_pointer_t to
+     * void * should just reinterpret_cast the integer number
+     */
+    operator void *() const { return reinterpret_cast<void *>(m_contents); }
+
+    /**
+     * Convert back to the integer number.
+     */
+    operator base_ptr_t() const { return m_contents; }
+
+    /**
+     * Add a certain value to the pointer to create a
+     * new pointer to that offset
+     */
+    virtual_pointer_t operator+(size_t off) { return m_contents + off; }
+
+    /* Numerical order for sorting pointers in containers. */
+    bool operator<(virtual_pointer_t rhs) const {
+      return (static_cast<base_ptr_t>(m_contents) <
+              static_cast<base_ptr_t>(rhs.m_contents));
+    }
+
+    bool operator>(virtual_pointer_t rhs) const {
+      return (static_cast<base_ptr_t>(m_contents) >
+              static_cast<base_ptr_t>(rhs.m_contents));
+    }
+
+    /**
+     * Numerical order for sorting pointers in containers
+     */
+    bool operator==(virtual_pointer_t rhs) const {
+      return (static_cast<base_ptr_t>(m_contents) ==
+              static_cast<base_ptr_t>(rhs.m_contents));
+    }
+
+    /**
+     * Simple forward to the equality overload.
+     */
+    bool operator!=(virtual_pointer_t rhs) const {
+      return !(this->operator==(rhs));
+    }
+
+    /**
+     * Converts a void * into a virtual pointer structure.
+     * Note that this will only work if the void * was
+     * already a virtual_pointer_t, but we have no way of
+     * checking
+     */
+    virtual_pointer_t(const void *ptr)
+        : m_contents(reinterpret_cast<base_ptr_t>(ptr)){};
+
+    /**
+     * Creates a virtual_pointer_t from the given integer
+     * number
+     */
+    virtual_pointer_t(base_ptr_t u) : m_contents(u){};
+  };
+
+  /* Definition of a null pointer
+   */
+  const virtual_pointer_t null_virtual_ptr = nullptr;
+
+  /**
+   * Whether if a pointer is null or not.
+   * A pointer is nullptr if the value is of null_virtual_ptr
+   */
+  static inline bool is_nullptr(virtual_pointer_t ptr) {
+    return (static_cast<void *>(ptr) == nullptr);
+  }
+
+  /* basic type for all buffers
+   */
+  using buffer_t = cl::sycl::buffer_mem;
+
+  /**
+   * Node that stores information about a device allocation.
+   * Nodes are sorted by size to organise a free list of nodes
+   * that can be recovered.
+   */
+  struct pMapNode_t {
+    buffer_t m_buffer;
+    size_t m_size;
+    bool m_free;
+
+    pMapNode_t(buffer_t b, size_t size, bool f)
+        : m_buffer{b}, m_size{size}, m_free{f} {
+      m_buffer.set_final_data(nullptr);
+    }
+
+    bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); }
+  };
+
+  /** Storage of the pointer / buffer tree
+   */
+  using pointerMap_t = std::map<virtual_pointer_t, pMapNode_t>;
+
+  /**
+   * Obtain the insertion point in the pointer map for
+   * a pointer of the given size.
+   * \param requiredSize Size attemted to reclaim
+   */
+  typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) {
+    typename pointerMap_t::iterator retVal;
+    bool reuse = false;
+    if (!m_freeList.empty()) {
+      // try to re-use an existing block
+      for (auto freeElem : m_freeList) {
+        if (freeElem->second.m_size >= requiredSize) {
+          retVal = freeElem;
+          reuse = true;
+          // Element is not going to be free anymore
+          m_freeList.erase(freeElem);
+          break;
+        }
+      }
+    }
+    if (!reuse) {
+      retVal = std::prev(m_pointerMap.end());
+    }
+    return retVal;
+  }
+
+  /**
+   * Returns an iterator to the node that stores the information
+   * of the given virtual pointer from the given pointer map structure.
+   * If pointer is not found, throws std::out_of_range.
+   * If the pointer map structure is empty, throws std::out_of_range
+   *
+   * \param pMap the pointerMap_t structure storing all the pointers
+   * \param virtual_pointer_ptr The virtual pointer to obtain the node of
+   * \throws std::out:of_range if the pointer is not found or pMap is empty
+   */
+  typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) {
+    if (this->count() == 0) {
+      m_pointerMap.clear();
+      EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n"));
+
+    }
+    if (is_nullptr(ptr)) {
+      m_pointerMap.clear();
+      EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n"));
+    }
+    // The previous element to the lower bound is the node that
+    // holds this memory address
+    auto node = m_pointerMap.lower_bound(ptr);
+    // If the value of the pointer is not the one of the node
+    // then we return the previous one
+    if (node == std::end(m_pointerMap)) {
+      --node;
+    } else if (node->first != ptr) {
+      if (node == std::begin(m_pointerMap)) {
+        m_pointerMap.clear();
+        EIGEN_THROW_X(
+            std::out_of_range("The pointer is not registered in the map\n"));
+
+      }
+      --node;
+    }
+
+    return node;
+  }
+
+  /* get_buffer.
+   * Returns a buffer from the map using the pointer address
+   */
+  template <typename buffer_data_type = buffer_data_type_t>
+  cl::sycl::buffer<buffer_data_type, 1> get_buffer(
+      const virtual_pointer_t ptr) {
+    using sycl_buffer_t = cl::sycl::buffer<buffer_data_type, 1>;
+
+    // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`.
+    // We can do this without the `buffer_mem` being a pointer, as we
+    // only declare member variables in the base class (`buffer_mem`) and not in
+    // the child class (`buffer<>).
+    auto node = get_node(ptr);
+    eigen_assert(node->first == ptr || node->first < ptr);
+    eigen_assert(ptr < static_cast<virtual_pointer_t>(node->second.m_size +
+                                                      node->first));
+    return *(static_cast<sycl_buffer_t *>(&node->second.m_buffer));
+  }
+
+  /**
+   * @brief Returns an accessor to the buffer of the given virtual pointer
+   * @param accessMode
+   * @param accessTarget
+   * @param ptr The virtual pointer
+   */
+  template <sycl_acc_mode access_mode = default_acc_mode,
+            sycl_acc_target access_target = default_acc_target,
+            typename buffer_data_type = buffer_data_type_t>
+  cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
+  get_access(const virtual_pointer_t ptr) {
+    auto buf = get_buffer<buffer_data_type>(ptr);
+    return buf.template get_access<access_mode, access_target>();
+  }
+
+  /**
+   * @brief Returns an accessor to the buffer of the given virtual pointer
+   *        in the given command group scope
+   * @param accessMode
+   * @param accessTarget
+   * @param ptr The virtual pointer
+   * @param cgh Reference to the command group scope
+   */
+  template <sycl_acc_mode access_mode = default_acc_mode,
+            sycl_acc_target access_target = default_acc_target,
+            typename buffer_data_type = buffer_data_type_t>
+  cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
+  get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) {
+    auto buf = get_buffer<buffer_data_type>(ptr);
+    return buf.template get_access<access_mode, access_target>(cgh);
+  }
+
+  /*
+   * Returns the offset from the base address of this pointer.
+   */
+  inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) {
+    // The previous element to the lower bound is the node that
+    // holds this memory address
+    auto node = get_node(ptr);
+    auto start = node->first;
+    eigen_assert(start == ptr || start < ptr);
+    eigen_assert(ptr < start + node->second.m_size);
+    return (ptr - start);
+  }
+
+  /*
+   * Returns the number of elements by which the given pointer is offset from
+   * the base address.
+   */
+  template <typename buffer_data_type>
+  inline size_t get_element_offset(const virtual_pointer_t ptr) {
+    return get_offset(ptr) / sizeof(buffer_data_type);
+  }
+
+  /**
+   * Constructs the PointerMapper structure.
+   */
+  PointerMapper(base_ptr_t baseAddress = 4096)
+      : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} {
+    if (m_baseAddress == 0) {
+      EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n"));
+    }
+  };
+
+  /**
+   * PointerMapper cannot be copied or moved
+   */
+  PointerMapper(const PointerMapper &) = delete;
+
+  /**
+   * Empty the pointer list
+   */
+  inline void clear() {
+    m_freeList.clear();
+    m_pointerMap.clear();
+  }
+
+  /* add_pointer.
+   * Adds an existing pointer to the map and returns the virtual pointer id.
+   */
+  inline virtual_pointer_t add_pointer(const buffer_t &b) {
+    return add_pointer_impl(b);
+  }
+
+  /* add_pointer.
+   * Adds a pointer to the map and returns the virtual pointer id.
+   */
+  inline virtual_pointer_t add_pointer(buffer_t &&b) {
+    return add_pointer_impl(b);
+  }
+
+  /**
+   * @brief Fuses the given node with the previous nodes in the
+   *        pointer map if they are free
+   *
+   * @param node A reference to the free node to be fused
+   */
+  void fuse_forward(typename pointerMap_t::iterator &node) {
+    while (node != std::prev(m_pointerMap.end())) {
+      // if following node is free
+      // remove it and extend the current node with its size
+      auto fwd_node = std::next(node);
+      if (!fwd_node->second.m_free) {
+        break;
+      }
+      auto fwd_size = fwd_node->second.m_size;
+      m_freeList.erase(fwd_node);
+      m_pointerMap.erase(fwd_node);
+
+      node->second.m_size += fwd_size;
+    }
+  }
+
+  /**
+   * @brief Fuses the given node with the following nodes in the
+   *        pointer map if they are free
+   *
+   * @param node A reference to the free node to be fused
+   */
+  void fuse_backward(typename pointerMap_t::iterator &node) {
+    while (node != m_pointerMap.begin()) {
+      // if previous node is free, extend it
+      // with the size of the current one
+      auto prev_node = std::prev(node);
+      if (!prev_node->second.m_free) {
+        break;
+      }
+      prev_node->second.m_size += node->second.m_size;
+
+      // remove the current node
+      m_freeList.erase(node);
+      m_pointerMap.erase(node);
+
+      // point to the previous node
+      node = prev_node;
+    }
+  }
+
+  /* remove_pointer.
+   * Removes the given pointer from the map.
+   * The pointer is allowed to be reused only if ReUse if true.
+   */
+  template <bool ReUse = true>
+  void remove_pointer(const virtual_pointer_t ptr) {
+    if (is_nullptr(ptr)) {
+      return;
+    }
+    auto node = this->get_node(ptr);
+
+    node->second.m_free = true;
+    m_freeList.emplace(node);
+
+    // Fuse the node
+    // with free nodes before and after it
+    fuse_forward(node);
+    fuse_backward(node);
+
+    // If after fusing the node is the last one
+    // simply remove it (since it is free)
+    if (node == std::prev(m_pointerMap.end())) {
+      m_freeList.erase(node);
+      m_pointerMap.erase(node);
+    }
+  }
+
+  /* count.
+   * Return the number of active pointers (i.e, pointers that
+   * have been malloc but not freed).
+   */
+  size_t count() const { return (m_pointerMap.size() - m_freeList.size()); }
+
+ private:
+  /* add_pointer_impl.
+   * Adds a pointer to the map and returns the virtual pointer id.
+   * BufferT is either a const buffer_t& or a buffer_t&&.
+   */
+  template <class BufferT>
+  virtual_pointer_t add_pointer_impl(BufferT b) {
+    virtual_pointer_t retVal = nullptr;
+    size_t bufSize = b.get_count();
+    pMapNode_t p{b, bufSize, false};
+    // If this is the first pointer:
+    if (m_pointerMap.empty()) {
+      virtual_pointer_t initialVal{m_baseAddress};
+      m_pointerMap.emplace(initialVal, p);
+      return initialVal;
+    }
+
+    auto lastElemIter = get_insertion_point(bufSize);
+    // We are recovering an existing free node
+    if (lastElemIter->second.m_free) {
+      lastElemIter->second.m_buffer = b;
+      lastElemIter->second.m_free = false;
+
+      // If the recovered node is bigger than the inserted one
+      // add a new free node with the remaining space
+      if (lastElemIter->second.m_size > bufSize) {
+        // create a new node with the remaining space
+        auto remainingSize = lastElemIter->second.m_size - bufSize;
+        pMapNode_t p2{b, remainingSize, true};
+
+        // update size of the current node
+        lastElemIter->second.m_size = bufSize;
+
+        // add the new free node
+        auto newFreePtr = lastElemIter->first + bufSize;
+        auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first;
+        m_freeList.emplace(freeNode);
+      }
+
+      retVal = lastElemIter->first;
+    } else {
+      size_t lastSize = lastElemIter->second.m_size;
+      retVal = lastElemIter->first + lastSize;
+      m_pointerMap.emplace(retVal, p);
+    }
+    return retVal;
+  }
+
+  /**
+   * Compare two iterators to pointer map entries according to
+   * the size of the allocation on the device.
+   */
+  struct SortBySize {
+    bool operator()(typename pointerMap_t::iterator a,
+                    typename pointerMap_t::iterator b) const {
+      return ((a->first < b->first) && (a->second <= b->second)) ||
+             ((a->first < b->first) && (b->second <= a->second));
+    }
+  };
+
+  /* Maps the pointer addresses to buffer and size pairs.
+   */
+  pointerMap_t m_pointerMap;
+
+  /* List of free nodes available for re-using
+   */
+  std::set<typename pointerMap_t::iterator, SortBySize> m_freeList;
+
+  /* Base address used when issuing the first virtual pointer, allows users
+   * to specify alignment. Cannot be zero. */
+  std::intptr_t m_baseAddress;
+};
+
+/* remove_pointer.
+ * Removes the given pointer from the map.
+ * The pointer is allowed to be reused only if ReUse if true.
+ */
+template <>
+inline void PointerMapper::remove_pointer<false>(const virtual_pointer_t ptr) {
+  if (is_nullptr(ptr)) {
+    return;
+  }
+  m_pointerMap.erase(this->get_node(ptr));
+}
+
+/**
+ * Malloc-like interface to the pointer-mapper.
+ * Given a size, creates a byte-typed buffer and returns a
+ * fake pointer to keep track of it.
+ * \param size Size in bytes of the desired allocation
+ * \throw cl::sycl::exception if error while creating the buffer
+ */
+inline void *SYCLmalloc(size_t size, PointerMapper &pMap) {
+  if (size == 0) {
+    return nullptr;
+  }
+  // Create a generic buffer of the given size
+  using buffer_t = cl::sycl::buffer<buffer_data_type_t, 1>;
+  auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size}));
+  // Store the buffer on the global list
+  return static_cast<void *>(thePointer);
+}
+
+/**
+ * Free-like interface to the pointer mapper.
+ * Given a fake-pointer created with the virtual-pointer malloc,
+ * destroys the buffer and remove it from the list.
+ * If ReUse is false, the pointer is not added to the freeList,
+ * it should be false only for sub-buffers.
+ */
+template <bool ReUse = true, typename PointerMapper>
+inline void SYCLfree(void *ptr, PointerMapper &pMap) {
+  pMap.template remove_pointer<ReUse>(ptr);
+}
+
+/**
+ * Clear all the memory allocated by SYCL.
+ */
+template <typename PointerMapper>
+inline void SYCLfreeAll(PointerMapper &pMap) {
+  pMap.clear();
+}
+
+template <cl::sycl::access::mode AcMd, typename T>
+struct RangeAccess {
+  static const auto global_access = cl::sycl::access::target::global_buffer;
+  static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+  typedef T scalar_t;
+  typedef scalar_t &ref_t;
+  typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
+
+  // the accessor type does not necessarily the same as T
+  typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
+      accessor;
+
+  typedef RangeAccess<AcMd, T> self_t;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access,
+                                                    size_t offset,
+                                                    std::intptr_t virtual_ptr)
+      : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {}
+
+  RangeAccess(cl::sycl::buffer<scalar_t, 1> buff =
+                  cl::sycl::buffer<scalar_t, 1>(cl::sycl::range<1>(1)))
+      : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {}
+
+  // This should be only used for null constructor on the host side
+  RangeAccess(std::nullptr_t) : RangeAccess() {}
+  // This template parameter must be removed and scalar_t should be replaced
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
+    return (access_.get_pointer().get() + offset_);
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
+    offset_ += (offset);
+    return *this;
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const {
+    return self_t(access_, offset_ + offset, virtual_ptr_);
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const {
+    return self_t(access_, offset_ - offset, virtual_ptr_);
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) {
+    offset_ -= offset;
+    return *this;
+  }
+
+  // THIS IS FOR NULL COMPARISON ONLY
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
+      const RangeAccess &lhs, std::nullptr_t) {
+    return ((lhs.virtual_ptr_ == -1));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
+      const RangeAccess &lhs, std::nullptr_t i) {
+    return !(lhs == i);
+  }
+
+  // THIS IS FOR NULL COMPARISON ONLY
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
+      std::nullptr_t, const RangeAccess &rhs) {
+    return ((rhs.virtual_ptr_ == -1));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
+      std::nullptr_t i, const RangeAccess &rhs) {
+    return !(i == rhs);
+  }
+  // Prefix operator (Increment and return value)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() {
+    offset_++;
+    return (*this);
+  }
+
+  // Postfix operator (Return value and increment)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) {
+    EIGEN_UNUSED_VARIABLE(i);
+    self_t temp_iterator(*this);
+    offset_++;
+    return temp_iterator;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const {
+    return (access_.get_count() - offset_);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const {
+    return offset_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) {
+    offset_ = offset;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const {
+    return *get_pointer();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() {
+    return *get_pointer();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) {
+    return *(get_pointer() + x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const {
+    return *(get_pointer() + x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const {
+    return reinterpret_cast<scalar_t *>(virtual_ptr_ +
+                                        (offset_ * sizeof(scalar_t)));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const {
+    return (virtual_ptr_ != -1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess<AcMd, const T>() {
+    return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  operator RangeAccess<AcMd, const T>() const {
+    return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
+  }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
+      cl::sycl::handler &cgh) const {
+    cgh.require(access_);
+  }
+
+ private:
+  accessor access_;
+  size_t offset_;
+  std::intptr_t virtual_ptr_;  // the location of the buffer in the map
+};
+
+template <cl::sycl::access::mode AcMd, typename T>
+struct RangeAccess<AcMd, const T> : RangeAccess<AcMd, T> {
+  typedef RangeAccess<AcMd, T> Base;
+  using Base::Base;
+};
+
+}  // namespace internal
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H

diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h
new file mode 100644
index 0000000..9208ab2
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h

@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TypeCasting.h
+ *
+ * \brief:
+ *  TypeCasting
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_TYPE_CASTING_SYCL_H
+#define EIGEN_TYPE_CASTING_SYCL_H
+
+namespace Eigen {
+
+namespace internal {
+#ifdef SYCL_DEVICE_ONLY
+template <>
+struct type_casting_traits<float, int> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4
+pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
+  return a
+      .template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<int, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
+  return a.template convert<cl::sycl::cl_float,
+                            cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<double, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
+    const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
+  auto a1 = a.template convert<cl::sycl::cl_float,
+                               cl::sycl::rounding_mode::automatic>();
+  auto b1 = b.template convert<cl::sycl::cl_float,
+                               cl::sycl::rounding_mode::automatic>();
+  return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
+pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
+  // Simply discard the second half of the input
+  return cl::sycl::cl_double2(a.x(), a.y());
+}
+
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_SYCL_H

diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
new file mode 100644
index 0000000..6c67cfe
--- /dev/null
+++ b/Eigen/src/Core/arch/ZVector/Complex.h

@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static Packet4ui  p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+#endif
+
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+  union {
+    Packet4f v;
+    Packet1cd cd[2];
+  };
+#else
+  Packet4f v;
+#endif
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasBlend  = 1,
+    HasSetLinear = 0
+  };
+};
+
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float>  type;
+  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+};
+template<> struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+};
+
+/* Forward declaration */
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+
+/* complex<double> first */
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
+{
+  pstore<std::complex<double> >(to, from);
+}
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d a_re, a_im, v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  // multiply a_re * b
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+
+  return Packet1cd(v1 + v2);
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pand    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por     <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d eq = vec_cmpeq (a.v, b.v);
+  Packet2d tmp = { eq[1], eq[0] };
+  return (Packet1cd)pand<Packet2d>(eq, tmp);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 std::complex<double> res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = pmul(a,pconj(b));
+  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+/* complex<float> follows */
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  pstore<std::complex<float> >(res, a);
+
+  return res[0];
+}
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  if((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float *)&from);
+  else
+    res.v = ploadu<Packet4f>((const float *)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+  return res;
+}
+#endif
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = pcmp_eq<Packet4f> (a.v, b.v);
+  Packet2cf res;
+  Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] };
+  Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] };
+  res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);
+  res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res;
+  res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+  res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  Packet2cf res;
+  res.cd[0] = pcplxflip(x.cd[0]);
+  res.cd[1] = pcplxflip(x.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet1cd tmp = kernel.packet[0].cd[1];
+  kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+  kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+  return result;
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = vec_cmpeq (a.v, b.v);
+  Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
+  return (Packet2cf)pand<Packet4f>(eq, tmp);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f a_re, a_im, prod, prod_im;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+  
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+
+  // multiply a_im * b and get the conjugate result
+  prod_im = a_im * b.v;
+  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
+  // permute back to a proper order
+  prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
+
+  // multiply a_re * b, add prod_im
+  prod = pmadd<Packet4f>(a_re, b.v, prod_im);
+ 
+  return Packet2cf(prod);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+  return pfirst<Packet2cf>(prod);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = pmul(a, pconj(b));
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
+{
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX32_ALTIVEC_H

diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
new file mode 100644
index 0000000..1635e12
--- /dev/null
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h

@@ -0,0 +1,233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+#endif
+
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+  fx = vec_floor(fx);
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = vec_ctsl(fx, 0);
+
+  static const Packet2l p2l_1023 = { 1023, 1023 };
+  static const Packet2ul p2ul_52 = { 52, 52 };
+
+  emm0 = emm0 + p2l_1023;
+  emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52);
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f x = _x;
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  // express exp(x) as exp(g + n*log(2))
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] };
+  emm0 = emm0 + p4i_0x7f;
+  emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
+
+  return pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x);
+#else
+  Packet4f res;
+  res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
+  return res;
+#endif
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = vec_sqrt(x);
+#else
+  res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+#endif
+  return res;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
+#else
+  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+#endif
+  return res;
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H

diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
new file mode 100755
index 0000000..a7b59c8
--- /dev/null
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h

@@ -0,0 +1,1060 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
+#define EIGEN_PACKET_MATH_ZVECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#endif
+
+typedef __vector int                 Packet4i;
+typedef __vector unsigned int        Packet4ui;
+typedef __vector __bool int          Packet4bi;
+typedef __vector short int           Packet8i;
+typedef __vector unsigned char       Packet16uc;
+typedef __vector double              Packet2d;
+typedef __vector unsigned long long  Packet2ul;
+typedef __vector long long           Packet2l;
+
+// Z14 has builtin support for float vectors
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+typedef __vector float               Packet4f;
+#else
+typedef struct {
+	Packet2d  v4f[2];
+} Packet4f;
+#endif
+
+typedef union {
+  numext::int32_t   i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t   l[2];
+  numext::uint64_t ul[2];
+  double    d[2];
+  float     f[4];
+  Packet4i  v4i;
+  Packet4ui v4ui;
+  Packet2l  v2l;
+  Packet2ul v2ul;
+  Packet2d  v2d;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f  v4f;
+#endif
+} Packet;
+
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+// These constants are endian-agnostic
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
+
+static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
+
+static Packet2d p2d_ONE = { 1.0, 1.0 };
+static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull),
+                              numext::bit_cast<double>(0x8000000000000000ull) };
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+#endif
+
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
+
+static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+
+// Mask alignment
+#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+
+#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+
+// Handle endianness properly while loading constants
+// Define global static constants:
+
+static Packet16uc p16uc_FORWARD =   { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
+static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
+static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( "   pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = 1,
+    HasErf = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 1,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+
+/* Forward declaration */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
+ 
+inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
+{
+  Packet vt;
+  vt.v4i = v;
+  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
+{
+  Packet vt;
+  vt.v4ui = v;
+  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  Packet vt;
+  vt.v2l = v;
+  s << vt.l[0] << ", " << vt.l[1];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
+{
+  Packet vt;
+  vt.v2ul = v;
+  s << vt.ul[0] << ", " << vt.ul[1] ;
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
+{
+  Packet vt;
+  vt.v2d = v;
+  s << vt.d[0] << ", " << vt.d[1];
+  return s;
+}
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
+{
+  Packet vt;
+  vt.v4f = v;
+  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
+  return s;
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v4i;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v2d;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v4i = from;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v2d = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
+{
+  return vec_splats(from);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  EIGEN_ALIGN16 int ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4i>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  EIGEN_ALIGN16 double af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+ return pload<Packet2d>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  EIGEN_ALIGN16 int ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  EIGEN_ALIGN16 double af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i p = pload<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p = pload<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int    x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4i>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4i>(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  sum = padd<Packet2d>(a, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+/* z13 has no vector float support so we emulate that with double
+   z14 has proper vector float support.
+*/
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+{
+  Packet4f splat;
+  switch (element) {
+  case 0:
+    splat.v4f[0] = vec_splat(from.v4f[0], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 1:
+    splat.v4f[0] = vec_splat(from.v4f[0], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 2:
+    splat.v4f[0] = vec_splat(from.v4f[1], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 3:
+    splat.v4f[0] = vec_splat(from.v4f[1], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  }
+  return splat;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  EIGEN_ALIGN16 float ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  EIGEN_ALIGN16 float ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v4f;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v4f = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+  return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  EIGEN_ALIGN16 float af[4];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  af[2] = from[2*stride];
+  af[3] = from[3*stride];
+ return pload<Packet4f>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  EIGEN_ALIGN16 float af[4];
+  pstore<float>((float*)af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+  to[2*stride] = af[2];
+  to[3*stride] = af[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>  (const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>  (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>    (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>  (const Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>   (const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4f>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4f>(sum, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b   = pmin<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+#endif
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>  (const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_ZVECTOR_H

diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index ae264aa..bf64ef4 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h

@@ -18,20 +18,24 @@
   * \brief Template functor for scalar/packet assignment
   *
   */
-template<typename Scalar> struct assign_op {
+template<typename DstScalar,typename SrcScalar> struct assign_op {
 
   EIGEN_EMPTY_STRUCT_CTOR(assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a = b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,b); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
 };
-template<typename Scalar>
-struct functor_traits<assign_op<Scalar> > {
+
+// Empty overload for void type (used by PermutationMatrix)
+template<typename DstScalar> struct assign_op<DstScalar,void> {};
+
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost,
-    PacketAccess = packet_traits<Scalar>::IsVectorized
+    Cost = NumTraits<DstScalar>::ReadCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::Vectorizable && packet_traits<SrcScalar>::Vectorizable
   };
 };
 
@@ -39,20 +43,20 @@
   * \brief Template functor for scalar/packet assignment with addition
   *
   */
-template<typename Scalar> struct add_assign_op {
+template<typename DstScalar,typename SrcScalar> struct add_assign_op {
 
   EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a += b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
-struct functor_traits<add_assign_op<Scalar> > {
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd
   };
 };
 
@@ -60,20 +64,20 @@
   * \brief Template functor for scalar/packet assignment with subtraction
   *
   */
-template<typename Scalar> struct sub_assign_op {
+template<typename DstScalar,typename SrcScalar> struct sub_assign_op {
 
   EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a -= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
-struct functor_traits<sub_assign_op<Scalar> > {
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<sub_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasSub
   };
 };
 
@@ -81,20 +85,21 @@
   * \brief Template functor for scalar/packet assignment with multiplication
   *
   */
-template<typename Scalar> struct mul_assign_op {
+template<typename DstScalar, typename SrcScalar=DstScalar>
+struct mul_assign_op {
 
   EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a *= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
-struct functor_traits<mul_assign_op<Scalar> > {
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasMul
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
   };
 };
 
@@ -102,26 +107,25 @@
   * \brief Template functor for scalar/packet assignment with diviving
   *
   */
-template<typename Scalar> struct div_assign_op {
+template<typename DstScalar, typename SrcScalar=DstScalar> struct div_assign_op {
 
   EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a /= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
-struct functor_traits<div_assign_op<Scalar> > {
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<div_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasMul
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasDiv
   };
 };
 
-
 /** \internal
-  * \brief Template functor for scalar/packet assignment with swaping
+  * \brief Template functor for scalar/packet assignment with swapping
   *
   * It works as follow. For a non-vectorized evaluation loop, we have:
   *   for(i) func(A.coeffRef(i), B.coeff(i));
@@ -140,23 +144,29 @@
   EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
   {
+#ifdef EIGEN_GPUCC
+    // FIXME is there some kind of cuda::swap?
+    Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
+#else
     using std::swap;
     swap(a,const_cast<Scalar&>(b));
-  }
-  
-  template<int LhsAlignment, int RhsAlignment, typename Packet>
-  EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const
-  {
-    Packet tmp = internal::ploadt<Packet,RhsAlignment>(b);
-    internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a));
-    internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp);
+#endif
   }
 };
 template<typename Scalar>
 struct functor_traits<swap_assign_op<Scalar> > {
   enum {
     Cost = 3 * NumTraits<Scalar>::ReadCost,
-    PacketAccess = packet_traits<Scalar>::IsVectorized
+    PacketAccess = 
+    #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
+    // This is a partial workaround for a bug in clang generating bad code
+    // when mixing 256/512 bits loads and 128 bits moves.
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
+    //     https://bugs.llvm.org/show_bug.cgi?id=40815
+    0
+    #else
+    packet_traits<Scalar>::Vectorizable
+    #endif
   };
 };
 

diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 94eea1b..63f09ab 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h

@@ -10,47 +10,55 @@
 #ifndef EIGEN_BINARY_FUNCTORS_H
 #define EIGEN_BINARY_FUNCTORS_H
 
-// clang-format off
-
 namespace Eigen {
 
 namespace internal {
 
 //---------- associative binary functors ----------
 
+template<typename Arg1, typename Arg2>
+struct binary_op_base
+{
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+};
+
 /** \internal
   * \brief Template functor to compute the sum of two scalars
   *
   * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
   */
-template<typename Scalar> struct scalar_sum_op {
-//   typedef Scalar result_type;
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+#else
+  scalar_sum_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::padd(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2, // rough estimate!
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
+    // TODO vectorize mixed sum
   };
 };
 
-/** \internal
-  * \brief Template specialization to deprecate the summation of boolean expressions.
-  * This is required to solve Bug 426.
-  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
-  */
-template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
-  EIGEN_DEPRECATED
-  scalar_sum_op() {}
-};
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a || b; }
 
 
 /** \internal
@@ -58,48 +66,59 @@
   *
   * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
   */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+#else
+  scalar_product_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmul(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux_mul(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
+    Cost = (int(NumTraits<LhsScalar>::MulCost) + int(NumTraits<RhsScalar>::MulCost))/2, // rough estimate!
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+    // TODO vectorize mixed product
   };
 };
 
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a && b; }
+
+
 /** \internal
   * \brief Template functor to compute the conjugate product of two scalars
   *
   * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
   */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
 
   enum {
     Conj = NumTraits<LhsScalar>::IsComplex
   };
-
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-
+  
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
+  
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const
   { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
-
+  
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -115,21 +134,31 @@
   *
   * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
   */
-template<typename Scalar> struct scalar_min_op {
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::mini(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmin<NaNPropagation>(a, b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmin(a,b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return internal::pmin<NaNPropagation>(a,b);
+  }
   template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_min(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  {
+    return internal::predux_min<NaNPropagation>(a);
+  }
 };
-template<typename Scalar>
-struct functor_traits<scalar_min_op<Scalar> > {
+
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_min_op<LhsScalar,RhsScalar, NaNPropagation> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMin
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
   };
 };
 
@@ -138,123 +167,176 @@
   *
   * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
   */
-template<typename Scalar> struct scalar_max_op {
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const  { return numext::maxi(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmax<NaNPropagation>(a,b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmax(a,b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return internal::pmax<NaNPropagation>(a,b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_max(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_max_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMax
-  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  {
+    return internal::predux_max<NaNPropagation>(a);
+  }
 };
 
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_max_op<LhsScalar,RhsScalar, NaNPropagation> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
+  };
+};
 
 /** \internal
   * \brief Template functors for comparison of two scalars
   * \todo Implement packet-comparisons
   */
-template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
 
-template<typename Scalar, ComparisonName cmp>
-struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
     PacketAccess = false
   };
 };
 
-template<ComparisonName Cmp, typename Scalar>
-struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
+template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
+struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
   typedef bool type;
 };
 
 
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GT> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GE> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
 };
 
-
 /** \internal
-  * \brief Template functor to compute the hypot of two scalars
+  * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
   *
   * \sa MatrixBase::stableNorm(), class Redux
   */
-template<typename Scalar> struct scalar_hypot_op {
+template<typename Scalar>
+struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
+{
   EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
   {
-    using std::sqrt;
-    Scalar p = numext::maxi(_x, _y);
-    Scalar q = numext::mini(_x, _y);
-    Scalar qp = q/p;
-    return p * sqrt(Scalar(1) + qp*qp);
+    // This functor is used by hypotNorm only for which it is faster to first apply abs
+    // on all coefficients prior to reduction through hypot.
+    // This way we avoid calling abs on positive and real entries, and this also permits
+    // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
+    // through the same functor...
+    return internal::positive_real_hypot(x,y);
   }
 };
 template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
+struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
   enum
   {
     Cost = 3 * NumTraits<Scalar>::AddCost +
            2 * NumTraits<Scalar>::MulCost +
-           2 * NumTraits<Scalar>::template Div<false>::Cost,
+           2 * scalar_div_cost<Scalar,false>::value,
     PacketAccess = false
   };
 };
 
 /** \internal
   * \brief Template functor to compute the pow of two scalars
+  * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow
   */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
-};
-template<typename Scalar, typename OtherScalar>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+template<typename Scalar, typename Exponent>
+struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
+{
+  typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
+#else
+  scalar_pow_op() {
+    typedef Scalar LhsScalar;
+    typedef Exponent RhsScalar;
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC
+  inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return generic_pow(a,b);
+  }
 };
 
-
+template<typename Scalar, typename Exponent>
+struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger &&
+                    packet_traits<Scalar>::HasExp && packet_traits<Scalar>::HasLog &&
+                    packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp &&
+                    // Temporarly disable packet access for half/bfloat16 until
+                    // accuracy is improved.
+                    !is_same<Scalar, half>::value && !is_same<Scalar, bfloat16>::value
+                    )
+  };
+};
 
 //---------- non associative binary functors ----------
 
@@ -263,18 +345,27 @@
   *
   * \sa class CwiseBinaryOp, MatrixBase::operator-
   */
-template<typename Scalar> struct scalar_difference_op {
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+#else
+  scalar_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::psub(a,b); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
   };
 };
 
@@ -283,13 +374,17 @@
   *
   * \sa class CwiseBinaryOp, Cwise::operator/()
   */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_quotient_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+#else
+  scalar_quotient_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
@@ -297,9 +392,10 @@
 };
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
   enum {
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
-    Cost = NumTraits<typename promote_scalar_type<LhsScalar, RhsScalar>::type>::template Div<PacketAccess>::Cost
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
+    Cost = scalar_div_cost<result_type,PacketAccess>::value
   };
 };
 
@@ -313,11 +409,14 @@
 struct scalar_boolean_and_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pand(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_and_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    PacketAccess = true
   };
 };
 
@@ -329,227 +428,110 @@
 struct scalar_boolean_or_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::por(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_or_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    PacketAccess = true
   };
 };
 
 /** \internal
-  * \brief Template functor to compute the xor of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator^
-  */
+ * \brief Template functor to compute the xor of two booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
 struct scalar_boolean_xor_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pxor(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_xor_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    PacketAccess = true
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the absolute difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::absolute_difference
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_absolute_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_absolute_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op)
+#else
+  scalar_absolute_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return numext::absdiff(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pabsdiff(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_absolute_difference_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
+  };
+};
+
+
+
 //---------- binary functors bound to a constant, thus appearing as a unary functor ----------
 
-/** \internal
-  * \brief Template functor to multiply a scalar by a fixed other one
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
-  */
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
- * in the constructor. However, in practice:
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
- *  - simpler code ;)
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
- */
-template<typename Scalar>
-struct scalar_multiple_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
+// They are analogues to std::binder1st/binder2nd but with the following differences:
+//  - they are compatible with packetOp
+//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
+template<typename BinaryOp> struct bind1st_op : BinaryOp {
 
-template<typename Scalar1, typename Scalar2>
-struct scalar_multiple2_op {
-  typedef typename packet_traits<Scalar1>::type Packet1;
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  typedef typename packet_traits<result_type>::type packet_result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const packet_result_type packetOp(const Packet1& a) const
-  { eigen_assert("packetOp is not defined"); }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
-{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
 
-/** \internal
-  * \brief Template functor to divide a scalar by a fixed other one
-  *
-  * This functor is used to implement the quotient of a matrix by
-  * a scalar where the scalar type is not necessarily a floating point type.
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator/
-  */
-template<typename Scalar>
-struct scalar_quotient1_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_quotient1_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+  EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
 
-// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
-// where the mixing of different types is handled by scalar_product_traits
-// In particular, real * complex<real> is allowed.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
 
-
-/** \internal
-  * \brief Template functor to add a scalar to a fixed other one
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
-template<typename Scalar>
-struct scalar_add_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::padd(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to subtract a fixed scalar to another one
-  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_rsub_op
-  */
-template<typename Scalar>
-struct scalar_sub_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC inline scalar_sub_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a - m_other; }
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::psub(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_sub_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to subtract a scalar to fixed another one
-  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_sub_op
-  */
-template<typename Scalar>
-struct scalar_rsub_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other - a; }
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::psub(pset1<Packet>(m_other), a); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_rsub_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to raise a scalar to a power
-  * \sa class CwiseUnaryOp, Cwise::pow
-  */
-template<typename Scalar>
-struct scalar_pow_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
-  EIGEN_DEVICE_FUNC inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
-  const Scalar m_exponent;
-};
-template<typename Scalar>
-struct functor_traits<scalar_pow_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to compute the quotient between a scalar and array entries.
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_mult_op {
-  EIGEN_DEVICE_FUNC scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(m_other),a); }
-  Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_mult_op<Scalar> >
-{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
+  { return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
 
-/** \internal
- * \brief Template functor to compute the modulo between an array and a scalar.
- */
-template <typename Scalar>
-struct scalar_mod_op {
-  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
-  const Scalar m_divisor;
+  first_argument_type m_value;
 };
-template <typename Scalar>
-struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
 
-/** \internal
- * \brief Template functor to compute the float modulo between an array and a scalar.
- */
-template <typename Scalar>
-struct scalar_fmod_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
-  operator()(const Scalar& a, const Scalar& b) const {
-    return numext::fmod(a, b);
-  }
+
+template<typename BinaryOp> struct bind2nd_op : BinaryOp {
+
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
+
+  EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); }
+
+  second_argument_type m_value;
 };
-template <typename Scalar>
-struct functor_traits<scalar_fmod_op<Scalar> > {
-  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
-         PacketAccess = false };
-};
+template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
 
 
 } // end namespace internal

diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index a0040db..192f225 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -16,89 +16,99 @@
 
 template<typename Scalar>
 struct scalar_constant_op {
-  typedef typename packet_traits<Scalar>::type Packet;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; }
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1<PacketType>(m_other); }
   const Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_constant_op<Scalar> >
-// FIXME replace this packet test by a safe one
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+{ enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
+         PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
 
 template<typename Scalar> struct scalar_identity_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_identity_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
 
-template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
+template <typename Scalar, bool IsInteger> struct linspaced_op_impl;
 
-// linear access for packet ops:
-// 1) initialization
-//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
-// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
-//   base += [size*step, ..., size*step]
-//
-// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
-//       in order to avoid the padd() in operator() ?
 template <typename Scalar>
-struct linspaced_op_impl<Scalar,false>
+struct linspaced_op_impl<Scalar,/*IsInteger*/false>
 {
-  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
 
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
-  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))),
+    m_flip(numext::abs(high)<numext::abs(low))
+  {}
 
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const
-  {
-    m_base = padd(m_base, pset1<Packet>(m_step));
-    return m_low+Scalar(i)*m_step;
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+    if(m_flip)
+      return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step);
+    else
+      return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step);
   }
 
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
+  template<typename Packet, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
+  {
+    // Principle:
+    // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+    if(m_flip)
+    {
+      Packet pi = plset<Packet>(Scalar(i-m_size1));
+      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
+      if (EIGEN_PREDICT_TRUE(i != 0)) return res;
+      Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
+      return pselect<Packet>(mask, res, pset1<Packet>(m_low));
+    }
+    else
+    {
+      Packet pi = plset<Packet>(Scalar(i));
+      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
+      if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits<Packet>::size+1)) return res;
+      Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size-1));
+      return pselect<Packet>(mask, res, pset1<Packet>(m_high));
+    }
+  }
 
   const Scalar m_low;
+  const Scalar m_high;
+  const Index m_size1;
   const Scalar m_step;
-  const Packet m_packetStep;
-  mutable Packet m_base;
+  const bool m_flip;
 };
 
-// random access for packet ops:
-// 1) each step
-//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
 template <typename Scalar>
-struct linspaced_op_impl<Scalar,true>
+struct linspaced_op_impl<Scalar,/*IsInteger*/true>
 {
-  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low),
+    m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
+    m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
+    m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
+  {}
 
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
-
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
-  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(i),m_interPacket))); }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar operator() (IndexType i) const
+  {
+    if(m_use_divisor) return m_low + convert_index<Scalar>(i)/m_divisor;
+    else              return m_low + convert_index<Scalar>(i)*m_multiplier;
+  }
 
   const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_lowPacket;
-  const Packet m_stepPacket;
-  const Packet m_interPacket;
+  const Scalar m_multiplier;
+  const Scalar m_divisor;
+  const bool m_use_divisor;
 };
 
 // ----- Linspace functor ----------------------------------------------------------------
@@ -106,50 +116,71 @@
 // Forward declaration (we default to random access which does not really give
 // us a speed gain when using packet access but it allows to use the functor in
 // nested expressions).
-template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
-template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
-template <typename Scalar, bool RandomAccess> struct linspaced_op
+template <typename Scalar> struct linspaced_op;
+template <typename Scalar> struct functor_traits< linspaced_op<Scalar> >
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl(num_steps==1 ? high : low, num_steps==1 ? Scalar() : (high-low)/static_cast<Scalar>(num_steps-1)) {}
-
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const
+  enum
   {
-    eigen_assert(col==0 || row==0);
-    return impl(col + row);
-  }
+    Cost = 1,
+    PacketAccess =   (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
+                  /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is currently disabled
+    IsRepeatable = true
+  };
+};
+template <typename Scalar> struct linspaced_op
+{
+  EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+    : impl((num_steps==1 ? high : low),high,num_steps)
+  {}
 
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
 
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
-  {
-    eigen_assert(col==0 || row==0);
-    return impl.packetOp(col + row);
-  }
+  template<typename Packet,typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp<Packet>(i); }
 
-  // This proxy object handles the actual required temporaries, the different
-  // implementations (random vs. sequential access) as well as the
-  // correct piping to size 2/4 packet operations.
-  const linspaced_op_impl<Scalar,RandomAccess> impl;
+  // This proxy object handles the actual required temporaries and the different
+  // implementations (integer vs. floating point).
+  const linspaced_op_impl<Scalar,NumTraits<Scalar>::IsInteger> impl;
 };
 
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
-// scalar_identity_op.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+// Linear access is automatically determined from the operator() prototypes available for the given functor.
+// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
+// and linear access is not possible. In all other cases, linear access is enabled.
+// Users should not have to deal with this structure.
+template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };
+
+// For unreliable compilers, let's specialize the has_*ary_operator
+// helpers so that at least built-in nullary functors work fine.
+#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600))
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+#endif
 
 } // end namespace internal
 

diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h
index 863fd45..4570c9b 100644
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h

@@ -12,6 +12,28 @@
 
 namespace Eigen {
 
+// Portable replacements for certain functors.
+namespace numext {
+
+template<typename T = void>
+struct equal_to {
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template<typename T = void>
+struct not_equal_to {
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
+    return lhs != rhs;
+  }
+};
+
+}
+
+
 namespace internal {
 
 // default functor traits for STL functors:
@@ -69,24 +91,39 @@
 { enum { Cost = 1, PacketAccess = false }; };
 
 template<typename T>
+struct functor_traits<numext::equal_to<T> >
+  : functor_traits<std::equal_to<T> > {};
+
+template<typename T>
 struct functor_traits<std::not_equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };
 
 template<typename T>
+struct functor_traits<numext::not_equal_to<T> >
+  : functor_traits<std::not_equal_to<T> > {};
+
+#if (EIGEN_COMP_CXXVER < 11)
+// std::binder* are deprecated since c++11 and will be removed in c++17
+template<typename T>
 struct functor_traits<std::binder2nd<T> >
 { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
 
 template<typename T>
 struct functor_traits<std::binder1st<T> >
 { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
 
+#if (EIGEN_COMP_CXXVER < 17)
+// std::unary_negate is deprecated since c++17 and will be removed in c++20
 template<typename T>
 struct functor_traits<std::unary_negate<T> >
 { enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
 
+// std::binary_negate is deprecated since c++17 and will be removed in c++20
 template<typename T>
 struct functor_traits<std::binary_negate<T> >
 { enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
 
 #ifdef EIGEN_STDEXT_SUPPORT
 

diff --git a/Eigen/src/Core/functors/TernaryFunctors.h b/Eigen/src/Core/functors/TernaryFunctors.h
index 665be2d..b254e96 100644
--- a/Eigen/src/Core/functors/TernaryFunctors.h
+++ b/Eigen/src/Core/functors/TernaryFunctors.h

@@ -17,6 +17,7 @@
 //---------- associative ternary functors ----------
 
 
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 8938904..16136d1 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -56,6 +56,34 @@
 };
 
 /** \internal
+  * \brief Template functor to compute the score of a scalar, to chose a pivot
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
+{
+  typedef void Score_is_abs;
+};
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
+/** \internal
   * \brief Template functor to compute the squared absolute value of a scalar
   *
   * \sa class CwiseUnaryOp, Cwise::abs2
@@ -81,7 +109,7 @@
 template<typename Scalar> struct scalar_conjugate_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return Eigen::numext::conj(a); }
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
 };
@@ -89,12 +117,41 @@
 struct functor_traits<scalar_conjugate_op<Scalar> >
 {
   enum {
-    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+    Cost = 0,
+    // Yes the cost is zero even for complexes because in most cases for which
+    // the cost is used, conjugation turns to be a no-op. Some examples:
+    //   cost(a*conj(b)) == cost(a*b)
+    //   cost(a+conj(b)) == cost(a+b)
+    //   <etc.
+    // If we don't set it to zero, then:
+    //   A.conjugate().lazyProduct(B.conjugate())
+    // will bake its operands. We definitely don't want that!
     PacketAccess = packet_traits<Scalar>::HasConj
   };
 };
 
 /** \internal
+  * \brief Template functor to compute the phase angle of a complex
+  *
+  * \sa class CwiseUnaryOp, Cwise::arg
+  */
+template<typename Scalar> struct scalar_arg_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parg(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_arg_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasArg
+  };
+};
+/** \internal
   * \brief Template functor to cast a scalar to another type
   *
   * \sa class CwiseUnaryOp, MatrixBase::cast()
@@ -107,7 +164,45 @@
 };
 template<typename Scalar, typename NewType>
 struct functor_traits<scalar_cast_op<Scalar,NewType> >
-{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<Scalar>::AddCost + NumTraits<NewType>::AddCost, PacketAccess = false }; };
+{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to arithmetically shift a scalar right by a number of bits
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::shift_right()
+  */
+template<typename Scalar, int N>
+struct scalar_shift_right_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
+  { return a >> N; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parithmetic_shift_right<N>(a); }
+};
+template<typename Scalar, int N>
+struct functor_traits<scalar_shift_right_op<Scalar,N> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
+
+/** \internal
+  * \brief Template functor to logically shift a scalar left by a number of bits
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::shift_left()
+  */
+template<typename Scalar, int N>
+struct scalar_shift_left_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
+  { return a << N; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::plogical_shift_left<N>(a); }
+};
+template<typename Scalar, int N>
+struct functor_traits<scalar_shift_left_op<Scalar,N> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
 
 /** \internal
   * \brief Template functor to extract the real part of a complex
@@ -182,7 +277,7 @@
 template<typename Scalar> struct scalar_exp_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
 };
 template <typename Scalar>
@@ -199,7 +294,7 @@
      // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
      : (14 * NumTraits<Scalar>::AddCost +
         6 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)),
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
 #else
     Cost =
     (sizeof(Scalar) == 4
@@ -208,7 +303,7 @@
      // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
      : (23 * NumTraits<Scalar>::AddCost +
         12 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
 #endif
   };
 };
@@ -219,25 +314,20 @@
   *
   * \sa class CwiseUnaryOp, ArrayBase::expm1()
   */
-template <typename Scalar> struct scalar_expm1_op {
+template<typename Scalar> struct scalar_expm1_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    return numext::expm1(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
-    return internal::pexpm1(a);
-  }
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); }
 };
 template <typename Scalar>
 struct functor_traits<scalar_expm1_op<Scalar> > {
   enum {
     PacketAccess = packet_traits<Scalar>::HasExpm1,
-    Cost = functor_traits<scalar_exp_op<Scalar> >::Cost  // TODO measure cost of expm1.
+    Cost = functor_traits<scalar_exp_op<Scalar> >::Cost // TODO measure cost of expm1
   };
 };
 
-
 /** \internal
   *
   * \brief Template functor to compute the logarithm of a scalar
@@ -247,7 +337,7 @@
 template<typename Scalar> struct scalar_log_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
 };
 template <typename Scalar>
@@ -275,33 +365,60 @@
   *
   * \sa class CwiseUnaryOp, ArrayBase::log1p()
   */
-template <typename Scalar> struct scalar_log1p_op {
+template<typename Scalar> struct scalar_log1p_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    return numext::log1p(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
-    return internal::plog1p(a);
-  }
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); }
 };
 template <typename Scalar>
 struct functor_traits<scalar_log1p_op<Scalar> > {
   enum {
     PacketAccess = packet_traits<Scalar>::HasLog1p,
-    Cost = functor_traits<scalar_log_op<Scalar> >::Cost  // TODO measure cost of
-                                                         // log1p
+    Cost = functor_traits<scalar_log_op<Scalar> >::Cost // TODO measure cost of log1p
   };
 };
 
 /** \internal
+  *
+  * \brief Template functor to compute the base-10 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log10()
+  */
+template<typename Scalar> struct scalar_log10_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log10_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the base-2 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log2()
+  */
+template<typename Scalar> struct scalar_log2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log2_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+
+/** \internal
   * \brief Template functor to compute the square root of a scalar
   * \sa class CwiseUnaryOp, Cwise::sqrt()
   */
 template<typename Scalar> struct scalar_sqrt_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
 };
 template <typename Scalar>
@@ -321,14 +438,26 @@
   };
 };
 
+// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings.
+template<> struct scalar_sqrt_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; }
+};
+template <>
+struct functor_traits<scalar_sqrt_op<bool> > {
+  enum { Cost = 1, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
 /** \internal
   * \brief Template functor to compute the reciprocal square root of a scalar
   * \sa class CwiseUnaryOp, Cwise::rsqrt()
   */
 template<typename Scalar> struct scalar_rsqrt_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); }
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
 };
 
@@ -340,7 +469,6 @@
   };
 };
 
-
 /** \internal
   * \brief Template functor to compute the cosine of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::cos()
@@ -348,7 +476,7 @@
 template<typename Scalar> struct scalar_cos_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
   EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
 };
 template<typename Scalar>
@@ -367,7 +495,7 @@
 template<typename Scalar> struct scalar_sin_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
 };
 template<typename Scalar>
@@ -387,7 +515,7 @@
 template<typename Scalar> struct scalar_tan_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
 };
 template<typename Scalar>
@@ -405,8 +533,8 @@
   */
 template<typename Scalar> struct scalar_acos_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return acos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); }
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
 };
 template<typename Scalar>
@@ -424,8 +552,8 @@
   */
 template<typename Scalar> struct scalar_asin_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return asin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); }
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
 };
 template<typename Scalar>
@@ -444,8 +572,8 @@
   */
 template<typename Scalar> struct scalar_atan_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return atan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); }
+  template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
 };
 template<typename Scalar>
@@ -457,138 +585,132 @@
   };
 };
 
- /** \internal
+/** \internal
   * \brief Template functor to compute the tanh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::tanh()
   */
-template<typename Scalar> struct scalar_tanh_op {
+template <typename Scalar>
+struct scalar_tanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_tanh_op<Scalar> >
-{
-  enum {
-    PacketAccess = packet_traits<Scalar>::HasTanH,
-    Cost =
-    (PacketAccess
-     // The following numbers are based on the AVX implementation,
-#ifdef EIGEN_VECTORIZE_FMA
-     // Haswell can issue 2 add/mul/madd per cycle.
-     // 9 pmadd, 2 pmul, 1 div, 2 other
-     ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
-     NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
-#else
-     ? (11 * NumTraits<Scalar>::AddCost +
-        11 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
-#endif
-     // These number are based on the tanh implementation in
-     // GenericPacketMath.h.
-     // 3 padd/psub, 3 pmul, 2 pdiv, 1 pexp, 3 other
-     : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
-        2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost +
-        functor_traits<scalar_exp_op<Scalar> >::Cost))
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the sigmoid of a scalar
- * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
- */
-template <typename Scalar>
-struct scalar_sigmoid_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
-  operator()(const Scalar& x) const {
-    const Scalar one = Scalar(1);
-    return one / (one + numext::exp(-x));
-  }
-
-  // Doesn't do anything fancy, just a 9/10-degree rational interpolant which
-  // interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulp in the range
-  // [-18, 18], outside of which the fl(sigmoid(x)) = {0|1}. The shifted
-  // sigmoid is interpolated because it was easier to make the fit converge.
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& _x) const {
-    // Clamp the inputs to the range [-18, 18] since anything outside
-    // this range is 0.0f or 1.0f in single-precision.
-    const Packet x = pmax(pset1<Packet>(Scalar(-18.0)),
-                          pmin(pset1<Packet>(Scalar(18.0)), _x));
-
-    // The monomial coefficients of the numerator polynomial (odd).
-    const Packet alpha_1 = pset1<Packet>(Scalar(2.48287947061529e-01));
-    const Packet alpha_3 = pset1<Packet>(Scalar(8.51377133304701e-03));
-    const Packet alpha_5 = pset1<Packet>(Scalar(6.08574864600143e-05));
-    const Packet alpha_7 = pset1<Packet>(Scalar(1.15627324459942e-07));
-    const Packet alpha_9 = pset1<Packet>(Scalar(4.37031012579801e-11));
-
-    // The monomial coefficients of the denominator polynomial (even).
-    const Packet beta_0 = pset1<Packet>(Scalar(9.93151921023180e-01));
-    const Packet beta_2 = pset1<Packet>(Scalar(1.16817656904453e-01));
-    const Packet beta_4 = pset1<Packet>(Scalar(1.70198817374094e-03));
-    const Packet beta_6 = pset1<Packet>(Scalar(6.29106785017040e-06));
-    const Packet beta_8 = pset1<Packet>(Scalar(5.76102136993427e-09));
-    const Packet beta_10 = pset1<Packet>(Scalar(6.10247389755681e-13));
-
-    // Since the polynomials are odd/even, we need x^2.
-    const Packet x2 = pmul(x, x);
-
-    // Evaluate the numerator polynomial p.
-    Packet p = pmadd(x2, alpha_9, alpha_7);
-    p = pmadd(x2, p, alpha_5);
-    p = pmadd(x2, p, alpha_3);
-    p = pmadd(x2, p, alpha_1);
-    p = pmul(x, p);
-
-    // Evaluate the denominator polynomial p.
-    Packet q = pmadd(x2, beta_10, beta_8);
-    q = pmadd(x2, q, beta_6);
-    q = pmadd(x2, q, beta_4);
-    q = pmadd(x2, q, beta_2);
-    q = pmadd(x2, q, beta_0);
-
-    // Divide the numerator by the denominator and shift it up.
-    return pmax(pset1<Packet>(Scalar(0.0)),
-                pmin(pset1<Packet>(Scalar(1.0)),
-                     padd(pdiv(p, q), pset1<Packet>(Scalar(0.5)))));
-  }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); }
 };
 
 template <typename Scalar>
-struct functor_traits<scalar_sigmoid_op<Scalar> > {
+struct functor_traits<scalar_tanh_op<Scalar> > {
   enum {
-    PacketAccess =
-        packet_traits<Scalar>::HasAdd && packet_traits<Scalar>::HasDiv &&
-        packet_traits<Scalar>::HasMul && packet_traits<Scalar>::HasMin &&
-        packet_traits<Scalar>::HasMax,
-    Cost = (PacketAccess
+    PacketAccess = packet_traits<Scalar>::HasTanh,
+    Cost = ( (EIGEN_FAST_MATH && is_same<Scalar,float>::value)
+// The following numbers are based on the AVX implementation,
 #ifdef EIGEN_VECTORIZE_FMA
                 // Haswell can issue 2 add/mul/madd per cycle.
                 // 9 pmadd, 2 pmul, 1 div, 2 other
                 ? (2 * NumTraits<Scalar>::AddCost +
                    6 * NumTraits<Scalar>::MulCost +
-                   NumTraits<Scalar>::template Div<
-                       packet_traits<Scalar>::HasDiv>::Cost)
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
 #else
                 ? (11 * NumTraits<Scalar>::AddCost +
                    11 * NumTraits<Scalar>::MulCost +
-                   NumTraits<Scalar>::template Div<
-                       packet_traits<Scalar>::HasDiv>::Cost)
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
 #endif
-                // These number are based on the tanh implementation in
-                // GenericPacketMath.h.
-                // 1 padd/psub, 1 pdiv, 1 pexp, 1 other
-                : (1 * NumTraits<Scalar>::AddCost +
+                // This number assumes a naive implementation of tanh
+                : (6 * NumTraits<Scalar>::AddCost +
                    3 * NumTraits<Scalar>::MulCost +
-                   1 * NumTraits<Scalar>::template Div<
-                           packet_traits<Scalar>::HasDiv>::Cost +
+                   2 * scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value +
                    functor_traits<scalar_exp_op<Scalar> >::Cost))
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the atanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atanh()
+  */
+template <typename Scalar>
+struct scalar_atanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
+/** \internal
+  * \brief Template functor to compute the sinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sinh()
+  */
+template<typename Scalar> struct scalar_sinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sinh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSinh
+  };
+};
+
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the asinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asinh()
+  */
+template <typename Scalar>
+struct scalar_asinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
+/** \internal
+  * \brief Template functor to compute the cosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cosh()
+  */
+template<typename Scalar> struct scalar_cosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cosh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCosh
+  };
+};
+
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the acosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acosh()
+  */
+template <typename Scalar>
+struct scalar_acosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the inverse of a scalar
   * \sa class CwiseUnaryOp, Cwise::inverse()
@@ -601,9 +723,13 @@
   EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
   { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasDiv,
+    Cost = scalar_div_cost<Scalar, PacketAccess>::value
+  };
+};
 
 /** \internal
   * \brief Template functor to compute the square of a scalar
@@ -621,6 +747,19 @@
 struct functor_traits<scalar_square_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
 
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_square_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template<typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return a; }
+};
+template<>
+struct functor_traits<scalar_square_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
 /** \internal
   * \brief Template functor to compute the cube of a scalar
   * \sa class CwiseUnaryOp, Cwise::cube()
@@ -637,6 +776,19 @@
 struct functor_traits<scalar_cube_op<Scalar> >
 { enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
 
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_cube_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template<typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return a; }
+};
+template<>
+struct functor_traits<scalar_cube_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
 /** \internal
   * \brief Template functor to compute the rounded value of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::round()
@@ -676,6 +828,25 @@
 };
 
 /** \internal
+  * \brief Template functor to compute the rounded (with current rounding mode)  value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::rint()
+  */
+template<typename Scalar> struct scalar_rint_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_rint_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRint
+  };
+};
+
+/** \internal
   * \brief Template functor to compute the ceil of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::ceil()
   */
@@ -701,7 +872,13 @@
 template<typename Scalar> struct scalar_isnan_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isnan(a);
+#else
+    return (numext::isnan)(a);
+#endif
+  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isnan_op<Scalar> >
@@ -719,7 +896,13 @@
 template<typename Scalar> struct scalar_isinf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isinf(a);
+#else
+    return (numext::isinf)(a);
+#endif
+  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isinf_op<Scalar> >
@@ -737,7 +920,13 @@
 template<typename Scalar> struct scalar_isfinite_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isfinite(a);
+#else
+    return (numext::isfinite)(a);
+#endif
+  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isfinite_op<Scalar> >
@@ -753,41 +942,62 @@
   *
   * \sa class CwiseUnaryOp, ArrayBase::operator!
   */
-template <typename Scalar>
-struct scalar_boolean_not_op {
+template<typename Scalar> struct scalar_boolean_not_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const bool& a) const {
-    return !a;
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
 };
-template <typename Scalar>
+template<typename Scalar>
 struct functor_traits<scalar_boolean_not_op<Scalar> > {
-  enum { Cost = NumTraits<bool>::AddCost, PacketAccess = false };
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
 };
 
 /** \internal
   * \brief Template functor to compute the signum of a scalar
   * \sa class CwiseUnaryOp, Cwise::sign()
   */
-template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
+template<typename Scalar,bool is_complex=(NumTraits<Scalar>::IsComplex!=0), bool is_integer=(NumTraits<Scalar>::IsInteger!=0) > struct scalar_sign_op;
 template<typename Scalar>
-struct scalar_sign_op<Scalar,false> {
+struct scalar_sign_op<Scalar, false, true> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
   {
-    return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+      return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
   }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
 };
+
 template<typename Scalar>
-struct scalar_sign_op<Scalar,true> {
+struct scalar_sign_op<Scalar, false, false> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
   {
-    typedef typename NumTraits<Scalar>::Real Real;
-    Real aa = numext::abs(a);
-    const Real divisor = (aa == 0) ? Real(0) : Real(1) / aa;
-    return Scalar(real(a) * divisor, imag(a) * divisor);
+    return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
   }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+
+template<typename Scalar, bool is_integer>
+struct scalar_sign_op<Scalar,true, is_integer> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    typedef typename NumTraits<Scalar>::Real real_type;
+    real_type aa = numext::abs(a);
+    if (aa==real_type(0))
+      return Scalar(0);
+    aa = real_type(1)/aa;
+    return Scalar(a.real()*aa, a.imag()*aa );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_sign_op<Scalar> >
@@ -796,7 +1006,121 @@
         NumTraits<Scalar>::IsComplex
         ? ( 8*NumTraits<Scalar>::MulCost  ) // roughly
         : ( 3*NumTraits<Scalar>::AddCost),
-    PacketAccess = false,
+    PacketAccess = packet_traits<Scalar>::HasSign
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the logistic function of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::logistic()
+  */
+template <typename T>
+struct scalar_logistic_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    return packetOp(x);
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
+/** \internal
+  * \brief Template specialization of the logistic function for float.
+  *
+  *  Uses just a 9/10-degree rational interpolant which
+  *  interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range
+  *  [-9, 18]. Below -9 we use the more accurate approximation
+  *  1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing
+  *  one ulp. The shifted logistic is interpolated because it was easier to
+  *  make the fit converge.
+  *
+  */
+template <>
+struct scalar_logistic_op<float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const {
+    return packetOp(x);
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& _x) const {
+    const Packet cutoff_lower = pset1<Packet>(-9.f);
+    const Packet lt_mask = pcmp_lt<Packet>(_x, cutoff_lower);
+    const bool any_small = predux_any(lt_mask);
+
+    // The upper cut-off is the smallest x for which the rational approximation evaluates to 1.
+    // Choosing this value saves us a few instructions clamping the results at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+    const Packet cutoff_upper = pset1<Packet>(15.7243833541870117f);
+#else
+    const Packet cutoff_upper = pset1<Packet>(15.6437711715698242f);
+#endif
+    const Packet x = pmin(_x, cutoff_upper);
+
+    // The monomial coefficients of the numerator polynomial (odd).
+    const Packet alpha_1 = pset1<Packet>(2.48287947061529e-01f);
+    const Packet alpha_3 = pset1<Packet>(8.51377133304701e-03f);
+    const Packet alpha_5 = pset1<Packet>(6.08574864600143e-05f);
+    const Packet alpha_7 = pset1<Packet>(1.15627324459942e-07f);
+    const Packet alpha_9 = pset1<Packet>(4.37031012579801e-11f);
+
+    // The monomial coefficients of the denominator polynomial (even).
+    const Packet beta_0 = pset1<Packet>(9.93151921023180e-01f);
+    const Packet beta_2 = pset1<Packet>(1.16817656904453e-01f);
+    const Packet beta_4 = pset1<Packet>(1.70198817374094e-03f);
+    const Packet beta_6 = pset1<Packet>(6.29106785017040e-06f);
+    const Packet beta_8 = pset1<Packet>(5.76102136993427e-09f);
+    const Packet beta_10 = pset1<Packet>(6.10247389755681e-13f);
+
+    // Since the polynomials are odd/even, we need x^2.
+    const Packet x2 = pmul(x, x);
+
+    // Evaluate the numerator polynomial p.
+    Packet p = pmadd(x2, alpha_9, alpha_7);
+    p = pmadd(x2, p, alpha_5);
+    p = pmadd(x2, p, alpha_3);
+    p = pmadd(x2, p, alpha_1);
+    p = pmul(x, p);
+
+    // Evaluate the denominator polynomial q.
+    Packet q = pmadd(x2, beta_10, beta_8);
+    q = pmadd(x2, q, beta_6);
+    q = pmadd(x2, q, beta_4);
+    q = pmadd(x2, q, beta_2);
+    q = pmadd(x2, q, beta_0);
+    // Divide the numerator by the denominator and shift it up.
+    const Packet logistic = padd(pdiv(p, q), pset1<Packet>(0.5f));
+    if (EIGEN_PREDICT_FALSE(any_small)) {
+      const Packet exponential = pexp(_x);
+      return pselect(lt_mask, exponential, logistic);
+    } else {
+      return logistic;
+    }
+  }
+};
+#endif  // #ifndef EIGEN_GPU_COMPILE_PHASE
+
+template <typename T>
+struct functor_traits<scalar_logistic_op<T> > {
+  enum {
+    // The cost estimate for float here here is for the common(?) case where
+    // all arguments are greater than -9.
+    Cost = scalar_div_cost<T, packet_traits<T>::HasDiv>::value +
+           (internal::is_same<T, float>::value
+                ? NumTraits<T>::AddCost * 15 + NumTraits<T>::MulCost * 11
+                : NumTraits<T>::AddCost * 2 +
+                      functor_traits<scalar_exp_op<T> >::Cost),
+    PacketAccess =
+        packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+        (internal::is_same<T, float>::value
+             ? packet_traits<T>::HasMul && packet_traits<T>::HasMax &&
+                   packet_traits<T>::HasMin
+             : packet_traits<T>::HasNegate && packet_traits<T>::HasExp)
   };
 };
 

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 77fcf54..f35b760 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h

@@ -15,7 +15,13 @@
 
 namespace internal {
 
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+enum GEBPPacketSizeType {
+  GEBPPacketFull = 0,
+  GEBPPacketHalf,
+  GEBPPacketQuarter
+};
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
 class gebp_traits;
 
 
@@ -25,16 +31,42 @@
   return a<=0 ? b : a;
 }
 
-#if EIGEN_ARCH_i386_OR_x86_64
-const std::ptrdiff_t defaultL1CacheSize = 32*1024;
-const std::ptrdiff_t defaultL2CacheSize = 256*1024;
-const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
 #else
-const std::ptrdiff_t defaultL1CacheSize = 16*1024;
-const std::ptrdiff_t defaultL2CacheSize = 512*1024;
-const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+  
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
+#else
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
 #endif
 
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
 /** \internal */
 struct CacheSizes {
   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
@@ -50,7 +82,6 @@
   std::ptrdiff_t m_l3;
 };
 
-
 /** \internal */
 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
@@ -77,8 +108,6 @@
   }
 }
 
-#define CEIL(a, b) ((a)+(b)-1)/(b)
-
 /* Helper for computeProductBlockingSizes.
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
@@ -90,73 +119,196 @@
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes */
+
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  typedef typename Traits::ResScalar ResScalar;
-  enum {
-    kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
-    ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
-    mr = Traits::mr,
-    nr = Traits::nr,
-  };
 
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  #ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, like -20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want that 1 lhs panel fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+  //   k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+  #endif
 
-  // Increasing k gives us more time to prefetch the content of the "C"
-  // registers. However once the latency is hidden there is no point in
-  // increasing the value of k, so we'll cap it at 320 (value determined
-  // experimentally).
-  const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
-  if (k_cache < k) {
-    k = k_cache - (k_cache % 8);
-    eigen_assert(k > 0);
-  }
-
-  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-  Index n_per_thread = CEIL(n, num_threads);
-  if (n_cache <= n_per_thread) {
-    // Don't exceed the capacity of the l2 cache.
-    if (n_cache < nr) {
-      n = nr;
-    } else {
-      n = n_cache - (n_cache % nr);
-      eigen_assert(n > 0);
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      kr = 8,
+      mr = Traits::mr,
+      nr = Traits::nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    // To avoid that k vanishes, we make k_cache at least as big as kr
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
+    if (k_cache < k) {
+      k = k_cache - (k_cache % kr);
+      eigen_internal_assert(k > 0);
     }
-  } else {
-    n = (std::min<Index>)(n, n_per_thread + nr - 1 - ((n_per_thread + nr - 1) % nr));
-  }
 
-  if (l3 > l2) {
-    // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-    const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-    const Index m_per_thread = CEIL(m, num_threads);
-    if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
-      m = m_cache - (m_cache % mr);
-      eigen_assert(m > 0);
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache - (n_cache % nr);
+      eigen_internal_assert(n > 0);
     } else {
-      m = (std::min<Index>)(m, m_per_thread + mr - 1 - ((m_per_thread + mr - 1) % mr));
+      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache - (m_cache % mr);
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
+#endif
+
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
+      return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to growth by a factor x1.5.
+    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1- k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = (numext::mini<Index>)(576,max_mc);
+      }
+      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      else if (mc==0) return;
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
     }
   }
 }
 
 template <typename Index>
-bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
 {
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
   if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
-    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
-    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
-    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
     return true;
   }
 #else
@@ -186,98 +338,102 @@
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  if (!k || !m || !n) {
-    return;
-  }
-
   if (!useSpecificBlockingSizes(k, m, n)) {
-    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
   }
-
-#if !EIGEN_ARCH_i386_OR_x86_64
-  // The following code rounds k,m,n down to the nearest multiple of register-level blocking sizes.
-  // We should always do that, and in upstream Eigen we always do that.
-  // Unfortunately, we can't do that in Google3 on x86[-64] because this makes tiny differences in results and
-  // we have some unfortunate tests require very specific relative errors which fail because of that,
-  // at least //learning/laser/algorithms/wals:wals_batch_solver_test.
-  // Note that this wouldn't make any difference if we had been using only correctly rounded values,
-  // but we've not! See how in evaluateProductBlockingSizesHeuristic, we do the rounding down by
-  // bit-masking, e.g. mr_mask = (0xffffffff/mr)*mr, implicitly assuming that mr is always a power of
-  // two, which is not the case with the 3px4 kernel.
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kr = 8,
-    mr = Traits::mr,
-    nr = Traits::nr
-  };
-  if (k > kr) k -= k % kr;
-  if (m > mr) m -= m % mr;
-  if (n > nr) n -= n % nr;
-#endif
 }
 
 template<typename LhsScalar, typename RhsScalar, typename Index>
-inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads)
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
-#else
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
+};
 
-  // FIXME (a bit overkill maybe ?)
+template <typename Packet>
+struct QuadPacket
+{
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
 
-  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
-    {
-      c = cj.pmadd(a,b,c);
-    }
-  };
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional { typedef T3 type; };
 
-  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
-    {
-      t = b; t = cj.pmul(a,t); c = padd(c,t);
-    }
-  };
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
 
-  template<typename CJ, typename A, typename B, typename C, typename T>
-  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
-  {
-    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
-  }
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
 
-  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
-#endif
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)         \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+#define PACKET_DECL_COND(name, packet_size)                        \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  name ## Packet
+
+#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)        \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  prefix ## ScalarPacket
+
+#define PACKET_DECL_COND_SCALAR(packet_size)                       \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  ScalarPacket
 
 /* Vectorization logic
  *  real*real: unpack rhs to constant packets, ...
- *
+ * 
  *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
  *          storing each res packet into two packets (2x2),
- *          at the end combine them: swap the second and addsub them
+ *          at the end combine them: swap the second and addsub them 
  *  cf*cf : same but with 2x4 blocks
  *  cplx*real : unpack rhs to constant packets, ...
  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
  */
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
 class gebp_traits
 {
 public:
   typedef _LhsScalar LhsScalar;
   typedef _RhsScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
 
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
     // register block size along the N direction must be 1 or 4
@@ -285,48 +441,56 @@
 
     // register block size along the M direction (currently, this one cannot be modified)
     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
-    // we assume 16 registers
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
+    && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
+    // we assume 16 registers or more
+    // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
     mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
     mr = default_mr,
 #endif
-
+    
     LhsProgress = LhsPacketSize,
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
 
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
-
+  
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     pbroadcast2(b, b0, b1);
-//   }
-
   template<typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
     dest = pset1<RhsPacketType>(*b);
   }
 
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {
+  }
+
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = ploadquad<RhsPacket>(b);
@@ -344,53 +508,61 @@
     dest = ploadu<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
   }
-
+  
   template<typename ResPacketHalf>
   EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
   {
     r = pmadd(c,alpha,r);
   }
 
-protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 
-template<typename RealScalar, bool _ConjLhs>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
 {
 public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
 
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = false,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     nr = 4,
 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
@@ -404,13 +576,12 @@
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
 
   typedef ResPacket AccPacket;
 
@@ -419,13 +590,42 @@
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    dest = pset1<RhsPacketType>(*b);
   }
 
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+  
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
+    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
+  }
+
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
+  {
+    // FIXME we can do better!
+    // what we want here is a ploadheight
+    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
+    dest = ploadquad<RhsPacket>(tmp);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
+  {
+    eigen_internal_assert(RhsPacketSize<=8);
     dest = pset1<RhsPacket>(*b);
   }
 
@@ -434,27 +634,20 @@
     dest = pload<LhsPacket>(a);
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploadu<LhsPacket>(a);
+    dest = ploadu<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     pbroadcast2(b, b0, b1);
-//   }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -469,13 +662,20 @@
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
   {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
     r = cj.pmadd(c,alpha,r);
   }
 
 protected:
-  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
 };
 
 template<typename Packet>
@@ -494,13 +694,57 @@
   return res;
 }
 
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complexes, so it means "8"
+// it terms of real coefficients.
+
 template<typename Packet>
-const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
+const DoublePacket<Packet>&
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
 {
   return a;
 }
 
-template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+template<typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half>
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish :(
+  DoublePacket<typename unpacket_traits<Packet>::half> res;
+  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+  typedef typename packet_traits<Cplx>::type CplxPacket;
+  res.first  = predux_half_dowto4(CplxPacket(a.first)).v;
+  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+  return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
+{
+  dest.first  = pset1<RealPacket>(numext::real(*b));
+  dest.second = pset1<RealPacket>(numext::imag(*b));
+}
+
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish too :(
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
+  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
+  dest.first  = ploadquad<RealPacket>(r);
+  dest.second = ploadquad<RealPacket>(i);
+}
+
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
+  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+};
 // template<typename Packet>
 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
 // {
@@ -510,24 +754,30 @@
 //   return res;
 // }
 
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
 {
 public:
   typedef std::complex<RealScalar>  Scalar;
   typedef std::complex<RealScalar>  LhsScalar;
   typedef std::complex<RealScalar>  RhsScalar;
   typedef std::complex<RealScalar>  ResScalar;
+  
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND(Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR(_PacketSize);
 
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
-    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    Vectorizable = unpacket_traits<RealPacket>::vectorizable
+                && unpacket_traits<ScalarPacket>::vectorizable,
+    ResPacketSize   = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+    RealPacketSize  = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
 
     // FIXME: should depend on NumberOfRegisters
     nr = 4,
@@ -536,16 +786,18 @@
     LhsProgress = ResPacketSize,
     RhsProgress = 1
   };
+  
+  typedef DoublePacket<RealPacket>                 DoublePacketType;
 
-  typedef typename packet_traits<RealScalar>::type RealPacket;
-  typedef typename packet_traits<Scalar>::type     ScalarPacket;
-  typedef DoublePacket<RealPacket> DoublePacketType;
-
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
 
+  // this actualy holds 8 packets!
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+  
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
   EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
@@ -555,51 +807,49 @@
   }
 
   // Scalar path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
   {
-    dest = pset1<ResPacket>(*b);
+    dest = pset1<ScalarPacket>(*b);
   }
 
   // Vectorized path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
   {
-    dest.first  = pset1<RealPacket>(real(*b));
-    dest.second = pset1<RealPacket>(imag(*b));
+    dest.first  = pset1<RealPacketType>(numext::real(*b));
+    dest.second = pset1<RealPacketType>(numext::imag(*b));
   }
 
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    loadRhs(b, dest.B_0);
+    loadRhs(b + 1, dest.B1);
+    loadRhs(b + 2, dest.B2);
+    loadRhs(b + 3, dest.B3);
+  }
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  // Vectorized path
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+  
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
   {
     loadRhs(b,dest);
   }
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
   {
-    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
-    loadRhs(b+2, b2);
-    loadRhs(b+3, b3);
-  }
-
-  // Vectorized path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
-  {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
-  }
-
-  // Scalar path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
-  {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadQuadToDoublePacket(b,dest);
   }
 
   // nothing special here
@@ -608,49 +858,61 @@
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
+  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+  EIGEN_STRONG_INLINE
+  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+  template<typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
   {
     c = cj.pmadd(a,b,c);
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+  
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
-
-  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
+  
+  template<typename RealPacketType, typename ResPacketType>
+  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
   {
     // assemble c
-    ResPacket tmp;
+    ResPacketType tmp;
     if((!ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(pconj(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+      tmp = pcplxflip(pconj(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
     }
     else if((!ConjLhs)&&(ConjRhs))
     {
-      tmp = pconj(pcplxflip(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+      tmp = pconj(pcplxflip(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
     }
     else if((ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = padd(pconj(ResPacket(c.first)),tmp);
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = padd(pconj(ResPacketType(c.first)),tmp);
     }
     else if((ConjLhs)&&(ConjRhs))
     {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = psub(pconj(ResPacket(c.first)),tmp);
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = psub(pconj(ResPacketType(c.first)),tmp);
     }
-
+    
     r = pmadd(tmp,alpha,r);
   }
 
@@ -658,8 +920,8 @@
   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
 };
 
-template<typename RealScalar, bool _ConjRhs>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
 {
 public:
   typedef std::complex<RealScalar>  Scalar;
@@ -667,15 +929,26 @@
   typedef Scalar      RhsScalar;
   typedef Scalar      ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
+
+#undef PACKET_DECL_COND_SCALAR_PREFIX
+#undef PACKET_DECL_COND_PREFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
+
   enum {
     ConjLhs = false,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-
+    Vectorizable = unpacket_traits<_RealPacket>::vectorizable
+                && unpacket_traits<_ScalarPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     // FIXME: should depend on NumberOfRegisters
     nr = 4,
@@ -685,14 +958,11 @@
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-
+  typedef LhsPacket LhsPacket4Packing;
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -700,45 +970,50 @@
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    dest = pset1<RhsPacketType>(*b);
   }
 
-  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-    pbroadcast4(b, b0, b1, b2, b3);
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
 
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     // FIXME not sure that's the best way to implement it!
-//     b0 = pload1<RhsPacket>(b+0);
-//     b1 = pload1<RhsPacket>(b+1);
-//   }
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
     dest = ploaddup<LhsPacket>(a);
   }
-
+  
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
-    loadRhs(b,dest);
+    dest = ploadquad<RhsPacket>(b);
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploaddup<LhsPacket>(a);
+    dest = ploaddup<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -746,7 +1021,7 @@
 #else
     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
 #endif
-
+    
   }
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -754,16 +1029,24 @@
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
   {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
     r = cj.pmadd(alpha,c,r);
   }
 
 protected:
-  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+
 };
 
-/* optimized GEneral packed Block * packed Panel product kernel
+/* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
  *  |  A  |  B  | comments
@@ -773,26 +1056,47 @@
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
+  
   typedef typename Traits::ResScalar ResScalar;
   typedef typename Traits::LhsPacket LhsPacket;
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
+  typedef typename Traits::RhsPacketx4 RhsPacketx4;
 
-  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
   typedef typename SwappedTraits::ResScalar SResScalar;
   typedef typename SwappedTraits::LhsPacket SLhsPacket;
   typedef typename SwappedTraits::RhsPacket SRhsPacket;
   typedef typename SwappedTraits::ResPacket SResPacket;
   typedef typename SwappedTraits::AccPacket SAccPacket;
 
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+  typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
     Vectorizable  = Traits::Vectorizable,
     LhsProgress   = Traits::LhsProgress,
+    LhsProgressHalf      = HalfTraits::LhsProgress,
+    LhsProgressQuarter   = QuarterTraits::LhsProgress,
     RhsProgress   = Traits::RhsProgress,
+    RhsProgressHalf      = HalfTraits::RhsProgress,
+    RhsProgressQuarter   = QuarterTraits::RhsProgress,
     ResPacketSize = Traits::ResPacketSize
   };
 
@@ -802,26 +1106,321 @@
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
+int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
+struct last_row_process_16_packets
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                  ResScalar alpha, SAccPacket &C0)
+    {
+      EIGEN_UNUSED_VARIABLE(res);
+      EIGEN_UNUSED_VARIABLE(straits);
+      EIGEN_UNUSED_VARIABLE(blA);
+      EIGEN_UNUSED_VARIABLE(blB);
+      EIGEN_UNUSED_VARIABLE(depth);
+      EIGEN_UNUSED_VARIABLE(endk);
+      EIGEN_UNUSED_VARIABLE(i);
+      EIGEN_UNUSED_VARIABLE(j2);
+      EIGEN_UNUSED_VARIABLE(alpha);
+      EIGEN_UNUSED_VARIABLE(C0);
+    }
+};
+
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,  mr,  nr, ConjugateLhs,  ConjugateRhs, 16> {
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                  ResScalar alpha, SAccPacket &C0)
+  {
+    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+    if (depth - endk > 0)
+      {
+	// We have to handle the last row(s) of the rhs, which
+	// correspond to a half-packet
+	SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+	for (Index kk = endk; kk < depth; kk++)
+	  {
+	    SLhsPacketQuarter a0;
+	    SRhsPacketQuarter b0;
+	    straits.loadLhsUnaligned(blB, a0);
+	    straits.loadRhs(blA, b0);
+	    straits.madd(a0,b0,c0,b0, fix<0>);
+	    blB += SwappedTraits::LhsProgress/4;
+	    blA += 1;
+	  }
+	straits.acc(c0, alphav, R);
+      }
+    else
+      {
+	straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+      }
+    res.scatterPacket(i, j2, R);
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet
+{
+  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+    #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+    __asm__  ("" : "+x,m" (*A0));
+    #endif
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+
+  EIGEN_STRONG_INLINE void operator()(
+    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+  {
+    GEBPTraits traits;
+
+    // loops on each largest micro horizontal panel of lhs
+    // (LhsProgress x depth)
+    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
+    {
+      // loops on each largest micro vertical panel of rhs (depth * nr)
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
+      {
+        // We select a LhsProgress x nr micro block of res
+        // which is entirely stored into 1 x nr registers.
+
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        traits.initAcc(C2);
+        traits.initAcc(C3);
+        // To improve instruction pipelining, let's double the accumulation registers:
+        //  even k will accumulate in C*, while odd k will accumulate in D*.
+        // This trick is crutial to get good performance with FMA, otherwise it is 
+        // actually faster to perform separated MUL+ADD because of a naturally
+        // better instruction-level parallelism.
+        AccPacket D0, D1, D2, D3;
+        traits.initAcc(D0);
+        traits.initAcc(D1);
+        traits.initAcc(D2);
+        traits.initAcc(D3);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+        r0.prefetch(prefetch_res_offset);
+        r1.prefetch(prefetch_res_offset);
+        r2.prefetch(prefetch_res_offset);
+        r3.prefetch(prefetch_res_offset);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+        prefetch(&blB[0]);
+        LhsPacket A0, A1;
+
+        for(Index k=0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+
+          internal::prefetch(blB+(48+0));
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          internal::prefetch(blB+(48+16));
+          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+          blB += pk*4*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+        }
+        C0 = padd(C0,D0);
+        C1 = padd(C1,D1);
+        C2 = padd(C2,D2);
+        C3 = padd(C3,D3);
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          blB += 4*RhsProgress;
+          blA += LhsProgress;
+        }
+
+        ResPacket R0, R1;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        R0 = r0.template loadPacket<ResPacket>(0);
+        R1 = r1.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        traits.acc(C1,  alphav, R1);
+        r0.storePacket(0, R0);
+        r1.storePacket(0, R1);
+
+        R0 = r2.template loadPacket<ResPacket>(0);
+        R1 = r3.template loadPacket<ResPacket>(0);
+        traits.acc(C2,  alphav, R0);
+        traits.acc(C3,  alphav, R1);
+        r2.storePacket(0, R0);
+        r3.storePacket(0, R1);
+      }
+
+      // Deal with remaining columns of the rhs
+      for(Index j2=packet_cols4; j2<cols; j2++)
+      {
+        // One column at a time
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+        LhsPacket A0;
+
+        for(Index k= 0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+          RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K)                                          \
+	      do {                                                      \
+		EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+		EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+    /* FIXME: why unaligned???? */ \
+		traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
+		traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);		\
+		traits.madd(A0, B_0, C0, B_0, fix<0>);				\
+		EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+	      } while(false);
+
+          EIGEN_GEBGP_ONESTEP(0);
+          EIGEN_GEBGP_ONESTEP(1);
+          EIGEN_GEBGP_ONESTEP(2);
+          EIGEN_GEBGP_ONESTEP(3);
+          EIGEN_GEBGP_ONESTEP(4);
+          EIGEN_GEBGP_ONESTEP(5);
+          EIGEN_GEBGP_ONESTEP(6);
+          EIGEN_GEBGP_ONESTEP(7);
+
+          blB += pk*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+        }
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacket B_0;
+          EIGEN_GEBGP_ONESTEP(0);
+          blB += RhsProgress;
+          blA += LhsProgress;
+        }
+#undef EIGEN_GEBGP_ONESTEP
+        ResPacket R0;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        R0 = r0.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        r0.storePacket(0, R0);
+      }
+    }
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
+{
+
+EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+        EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+        EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+        traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
+        traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
+        traits.madd(*A0, *B_0, *C0, *B_0);
+        traits.madd(*A0, *B1,  *C1, *B1);
+        traits.madd(*A0, *B2,  *C2, *B2);
+        traits.madd(*A0, *B3,  *C3, *B3);
+        EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+};
+
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
   ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                Index rows, Index depth, Index cols, ResScalar alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
     Traits traits;
     SwappedTraits straits;
-
+    
     if(strideA==-1) strideA = depth;
     if(strideB==-1) strideB = depth;
     conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
     Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
     const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
     const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
-    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
+    const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
+    const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
     enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
     const Index peeled_kc  = depth & ~(pk-1);
-    const Index prefetch_res_offset = 0;
+    const int prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
 
     //---------- Process 3 * LhsProgress rows at once ----------
@@ -829,20 +1428,29 @@
     // Usually, make sense only with FMA
     if(mr>=3*Traits::LhsProgress)
     {
-      // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
-      for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
+      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
+      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+      // This actual number of rows is computed as follow:
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
+      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
       {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
         for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
           // stored into 3 x nr registers.
-
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
           prefetch(&blA[0]);
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-          prefetch(&blB[0]);
-          LhsPacket A0, A1;
 
           // gets res block as register
           AccPacket C0, C1, C2,  C3,
@@ -863,39 +1471,55 @@
           r3.prefetch(0);
 
           // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
+
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
-            RhsPacket B_0, T0;
+            // 15 registers are taken (12 for acc, 2 for lhs).
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
             LhsPacket A2;
-
-#define EIGEN_GEBP_ONESTEP(K) \
-            do { \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+            // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+            // without this workaround A0, A1, and A2 are loaded in the same register,
+            // which is not good for pipelining
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+            #else
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+            #endif
+#define EIGEN_GEBP_ONESTEP(K)                                                     \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");          \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              internal::prefetch(blA+(3*K+16)*LhsProgress); \
-              if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
-              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
-              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
-              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C0, T0); \
-              traits.madd(A1, B_0, C4, T0); \
-              traits.madd(A2, B_0, C8, B_0); \
-              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C1, T0); \
-              traits.madd(A1, B_0, C5, T0); \
-              traits.madd(A2, B_0, C9, B_0); \
-              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C2,  T0); \
-              traits.madd(A1, B_0, C6,  T0); \
-              traits.madd(A2, B_0, C10, B_0); \
-              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C3 , T0); \
-              traits.madd(A1, B_0, C7,  T0); \
-              traits.madd(A2, B_0, C11, B_0); \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
-            } while(false)
+              internal::prefetch(blA + (3 * K + 16) * LhsProgress);               \
+              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                            \
+                internal::prefetch(blB + (4 * K + 16) * RhsProgress);             \
+              } /* Bug 953 */                                                     \
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel);     \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                         \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                         \
+              traits.madd(A2, rhs_panel, C8, T0, fix<0>);                         \
+              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                         \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                         \
+              traits.madd(A2, rhs_panel, C9, T0, fix<1>);                         \
+              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                         \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                         \
+              traits.madd(A2, rhs_panel, C10, T0, fix<2>);                        \
+              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                         \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                         \
+              traits.madd(A2, rhs_panel, C11, T0, fix<3>);                        \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");            \
+            } while (false)
 
             internal::prefetch(blB);
             EIGEN_GEBP_ONESTEP(0);
@@ -915,20 +1539,22 @@
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacket B_0, T0;
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
             LhsPacket A2;
             EIGEN_GEBP_ONESTEP(0);
             blB += 4*RhsProgress;
             blA += 3*Traits::LhsProgress;
           }
+
 #undef EIGEN_GEBP_ONESTEP
 
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -936,9 +1562,9 @@
           r0.storePacket(1 * Traits::ResPacketSize, R1);
           r0.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C1, alphav, R0);
           traits.acc(C5, alphav, R1);
           traits.acc(C9, alphav, R2);
@@ -946,9 +1572,9 @@
           r1.storePacket(1 * Traits::ResPacketSize, R1);
           r1.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C2, alphav, R0);
           traits.acc(C6, alphav, R1);
           traits.acc(C10, alphav, R2);
@@ -956,25 +1582,27 @@
           r2.storePacket(1 * Traits::ResPacketSize, R1);
           r2.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C3, alphav, R0);
           traits.acc(C7, alphav, R1);
           traits.acc(C11, alphav, R2);
           r3.storePacket(0 * Traits::ResPacketSize, R0);
           r3.storePacket(1 * Traits::ResPacketSize, R1);
-          r3.storePacket(2 * Traits::ResPacketSize, R2);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
 
         // Deal with remaining columns of the rhs
         for(Index j2=packet_cols4; j2<cols; j2++)
         {
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
           // One column at a time
           const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
           prefetch(&blA[0]);
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-          prefetch(&blB[0]);
+
           // gets res block as register
           AccPacket C0, C4, C8;
           traits.initAcc(C0);
@@ -983,26 +1611,28 @@
 
           LinearMapper r0 = res.getLinearMapper(i, j2);
           r0.prefetch(0);
-          LhsPacket A0, A1, A2;
 
           // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1, A2;
+          
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
             RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do { \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+#define EIGEN_GEBGP_ONESTEP(K)                                                    \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
-              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
-              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
-              traits.madd(A0, B_0, C0, B_0); \
-              traits.madd(A1, B_0, C4, B_0); \
-              traits.madd(A2, B_0, C8, B_0); \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
-            } while(false)
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
+              traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+              traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
+            } while (false)
 
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
@@ -1013,8 +1643,8 @@
             EIGEN_GEBGP_ONESTEP(6);
             EIGEN_GEBGP_ONESTEP(7);
 
-            blB += pk*RhsProgress;
-            blA += pk*3*Traits::LhsProgress;
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 3 * int(Traits::LhsProgress);
 
             EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
           }
@@ -1031,15 +1661,16 @@
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
           r0.storePacket(1 * Traits::ResPacketSize, R1);
-          r0.storePacket(2 * Traits::ResPacketSize, R2);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
       }
     }
@@ -1047,19 +1678,25 @@
     //---------- Process 2 * LhsProgress rows at once ----------
     if(mr>=2*Traits::LhsProgress)
     {
-      // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth)
-      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*LhsProgress)
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
       {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
         for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
           // stored into 2 x nr registers.
-
+          
           const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
           prefetch(&blA[0]);
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-          prefetch(&blB[0]);
 
           // gets res block as register
           AccPacket C0, C1, C2, C3,
@@ -1077,40 +1714,48 @@
           r2.prefetch(prefetch_res_offset);
           r3.prefetch(prefetch_res_offset);
 
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
           LhsPacket A0, A1;
 
-          // performs "inner" products
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
-            RhsPacket B_0, B1, B2, B3, T0;
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
 
-            // The 2 ASM comments in the #define are intended to prevent gcc
-            // from optimizing the code accross steps since it ends up spilling
-            // registers in this case.
-   #define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
-              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
-              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
-              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
-              traits.madd(A0, B_0, C0, T0);                                     \
-              traits.madd(A1, B_0, C4, B_0);                                    \
-              traits.madd(A0, B1,  C1, T0);                                     \
-              traits.madd(A1, B1,  C5, B1);                                     \
-              traits.madd(A0, B2,  C2, T0);                                     \
-              traits.madd(A1, B2,  C6, B2);                                     \
-              traits.madd(A0, B3,  C3, T0);                                     \
-              traits.madd(A1, B3,  C7, B3);                                     \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
-            } while(false)
+          // NOTE: the begin/end asm comments below work around bug 935!
+          // but they are not enough for gcc>=6 without FMA (bug 1637)
+          #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+            #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
+          #else
+            #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
+          #endif
+#define EIGEN_GEBGP_ONESTEP(K)                                            \
+            do {                                                          \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
+              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
+              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
+              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
+              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
+            } while (false)
 
-            prefetch(&blB[pk*4*RhsProgress]);
+            internal::prefetch(blB+(48+0));
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
             EIGEN_GEBGP_ONESTEP(2);
             EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
             EIGEN_GEBGP_ONESTEP(4);
             EIGEN_GEBGP_ONESTEP(5);
             EIGEN_GEBGP_ONESTEP(6);
@@ -1124,7 +1769,8 @@
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacket B_0, B1, B2, B3, T0;
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
             EIGEN_GEBGP_ONESTEP(0);
             blB += 4*RhsProgress;
             blA += 2*Traits::LhsProgress;
@@ -1134,10 +1780,10 @@
           ResPacket R0, R1, R2, R3;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
-          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C1, alphav, R2);
@@ -1147,10 +1793,10 @@
           r1.storePacket(0 * Traits::ResPacketSize, R2);
           r1.storePacket(1 * Traits::ResPacketSize, R3);
 
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
-          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C2,  alphav, R0);
           traits.acc(C6,  alphav, R1);
           traits.acc(C3,  alphav, R2);
@@ -1159,16 +1805,17 @@
           r2.storePacket(1 * Traits::ResPacketSize, R1);
           r3.storePacket(0 * Traits::ResPacketSize, R2);
           r3.storePacket(1 * Traits::ResPacketSize, R3);
+          }
         }
-
+      
         // Deal with remaining columns of the rhs
         for(Index j2=packet_cols4; j2<cols; j2++)
         {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
           // One column at a time
           const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
           prefetch(&blA[0]);
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-          prefetch(&blB[0]);
 
           // gets res block as register
           AccPacket C0, C4;
@@ -1177,14 +1824,16 @@
 
           LinearMapper r0 = res.getLinearMapper(i, j2);
           r0.prefetch(prefetch_res_offset);
-          LhsPacket A0, A1;
 
           // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1;
+
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
             RhsPacket B_0, B1;
-
+        
 #define EIGEN_GEBGP_ONESTEP(K) \
             do {                                                                  \
               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
@@ -1192,11 +1841,11 @@
               traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
               traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
-              traits.madd(A0, B_0, C0, B1);                                       \
-              traits.madd(A1, B_0, C4, B_0);                                      \
+              traits.madd(A0, B_0, C0, B1, fix<0>);                               \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
               EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
             } while(false)
-
+        
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
             EIGEN_GEBGP_ONESTEP(2);
@@ -1206,8 +1855,8 @@
             EIGEN_GEBGP_ONESTEP(6);
             EIGEN_GEBGP_ONESTEP(7);
 
-            blB += pk*RhsProgress;
-            blA += pk*2*Traits::LhsProgress;
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 2 * int(Traits::LhsProgress);
 
             EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
           }
@@ -1224,335 +1873,204 @@
           ResPacket R0, R1;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
           r0.storePacket(1 * Traits::ResPacketSize, R1);
+          }
         }
       }
     }
     //---------- Process 1 * LhsProgress rows at once ----------
     if(mr>=1*Traits::LhsProgress)
     {
-      // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
-      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
-      {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
-        for(Index j2=0; j2<packet_cols4; j2+=nr)
-        {
-          // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
-          // stored into 1 x nr registers.
-
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
-          prefetch(&blA[0]);
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-          prefetch(&blB[0]);
-
-          // gets res block as register
-          AccPacket C0, C1, C2, C3;
-          traits.initAcc(C0);
-          traits.initAcc(C1);
-          traits.initAcc(C2);
-          traits.initAcc(C3);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
-          r0.prefetch(prefetch_res_offset);
-          r1.prefetch(prefetch_res_offset);
-          r2.prefetch(prefetch_res_offset);
-          r3.prefetch(prefetch_res_offset);
-          LhsPacket A0;
-
-          // performs "inner" products
-          for(Index k=0; k<peeled_kc; k+=pk)
-          {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
-            RhsPacket B_0, B1, B2, B3;
-
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");        \
-              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
-              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
-              traits.madd(A0, B_0, C0, B_0);                                    \
-              traits.madd(A0, B1,  C1, B1);                                     \
-              traits.madd(A0, B2,  C2, B2);                                     \
-              traits.madd(A0, B3,  C3, B3);                                     \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");          \
-            } while(false)
-
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += pk*4*RhsProgress;
-            blA += pk*1*LhsProgress;
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
-          }
-          // process remaining peeled loop
-          for(Index k=peeled_kc; k<depth; k++)
-          {
-            RhsPacket B_0, B1, B2, B3;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += 4*RhsProgress;
-            blA += 1*LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-
-          ResPacket R0, R1;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C1,  alphav, R1);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-          r1.storePacket(0 * Traits::ResPacketSize, R1);
-
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
-          traits.acc(C2,  alphav, R0);
-          traits.acc(C3,  alphav, R1);
-          r2.storePacket(0 * Traits::ResPacketSize, R0);
-          r3.storePacket(0 * Traits::ResPacketSize, R1);
-        }
-
-        // Deal with remaining columns of the rhs
-        for(Index j2=packet_cols4; j2<cols; j2++)
-        {
-          // One column at a time
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
-          prefetch(&blA[0]);
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-          prefetch(&blB[0]);
-
-          // gets res block as register
-          AccPacket C0;
-          traits.initAcc(C0);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2);
-          LhsPacket A0;
-
-          // performs "inner" products
-          for(Index k=0; k<peeled_kc; k+=pk)
-          {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
-            RhsPacket B_0;
-
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");        \
-              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
-              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
-              traits.madd(A0, B_0, C0, B_0);                                    \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");          \
-            } while(false)
-
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += pk*RhsProgress;
-            blA += pk*1*Traits::LhsProgress;
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
-          }
-
-          // process remaining peeled loop
-          for(Index k=peeled_kc; k<depth; k++)
-          {
-            RhsPacket B_0;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += RhsProgress;
-            blA += 1*Traits::LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-          ResPacket R0;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-        }
-      }
+      lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
     }
-    //---------- Process remaining rows, 1 by 1 ----------
-    for(Index i=peeled_mc1; i<rows; i+=1)
+    //---------- Process LhsProgressHalf rows at once ----------
+    if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
+    {
+      lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process LhsProgressQuarter rows at once ----------
+    if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
+    {
+      lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process remaining rows, 1 at once ----------
+    if(peeled_mc_quarter<rows)
     {
       // loop on each panel of the rhs
       for(Index j2=0; j2<packet_cols4; j2+=nr)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        prefetch(&blA[0]);
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-        prefetch(&blB[0]);
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc_quarter; i<rows; i+=1)
+        {
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 
-        typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
-        if ((SwappedTraits::LhsProgress % 4) == 0 &&
-            (SwappedTraits::LhsProgress <= 8) &&
-            unpacket_traits<SResPacketHalf>::size==4) {
-          // NOTE The following piece of code wont work for 512 bit registers
-          SAccPacket C0, C1, C2, C3;
-          straits.initAcc(C0);
-          straits.initAcc(C1);
-          straits.initAcc(C2);
-          straits.initAcc(C3);
-
-          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
-          const Index endk  = (depth/spk)*spk;
-          const Index endk4 = (depth/(spk*4))*(spk*4);
-
-          Index k=0;
-          for(; k<endk4; k+=4*spk)
+          // If LhsProgress is 8 or 16, it assumes that there is a
+          // half or quarter packet, respectively, of the same size as
+          // nr (which is currently 4) for the return type.
+          const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
+          const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
+          if ((SwappedTraits::LhsProgress % 4) == 0 &&
+              (SwappedTraits::LhsProgress<=16) &&
+              (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
+              (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
           {
-            prefetch(&blB[4*SwappedTraits::LhsProgress]);
+            SAccPacket C0, C1, C2, C3;
+            straits.initAcc(C0);
+            straits.initAcc(C1);
+            straits.initAcc(C2);
+            straits.initAcc(C3);
 
-            SLhsPacket A0,A1,A2,A3;
-            SRhsPacket B_0,B_1,B_2,B_3;
+            const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
+            const Index endk  = (depth/spk)*spk;
+            const Index endk4 = (depth/(spk*4))*(spk*4);
 
-            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
-            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
-            straits.loadRhsQuad(blA+0*spk, B_0);
-            straits.loadRhsQuad(blA+1*spk, B_1);
-            straits.madd(A0,B_0,C0,B_0);
-            straits.madd(A1,B_1,C1,B_1);
-
-            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A2);
-            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A3);
-            straits.loadRhsQuad(blA+2*spk, B_2);
-            straits.loadRhsQuad(blA+3*spk, B_3);
-            straits.madd(A2,B_2,C2,B_2);
-            straits.madd(A3,B_3,C3,B_3);
-
-            blB += 4*SwappedTraits::LhsProgress;
-            blA += 4*spk;
-          }
-          C0 = padd(padd(C0,C1),padd(C2,C3));
-          for(; k<endk; k+=spk)
-          {
-            SLhsPacket A0;
-            SRhsPacket B_0;
-
-            straits.loadLhsUnaligned(blB, A0);
-            straits.loadRhsQuad(blA, B_0);
-            straits.madd(A0,B_0,C0,B_0);
-
-            blB += SwappedTraits::LhsProgress;
-            blA += spk;
-          }
-
-          if(SwappedTraits::LhsProgress==8)
-          {
-            // Special case where we have to first reduce the accumulation register C0
-#ifndef __AVX512F__
-	    typedef typename conditional<SwappedTraits::LhsProgress==8,
-	      typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
-            typedef typename conditional<SwappedTraits::LhsProgress==8,
-	      typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
-            typedef typename conditional<SwappedTraits::LhsProgress==8,
-	      typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
-            typedef typename conditional<SwappedTraits::LhsProgress==8,
-	      typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
-#else
-            typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
-            typedef typename unpacket_traits<SLhsPacket>::half SLhsPacketHalf;
-            typedef typename unpacket_traits<SLhsPacket>::half SRhsPacketHalf;
-            typedef typename unpacket_traits<SAccPacket>::half SAccPacketHalf;
-#endif
-
-            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
-            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
-
-            if(depth-endk>0)
+            Index k=0;
+            for(; k<endk4; k+=4*spk)
             {
-              // We have to handle the last row of the rhs which corresponds to a half-packet
-              SLhsPacketHalf a0;
-              SRhsPacketHalf b0;
-              straits.loadLhsUnaligned(blB, a0);
-              straits.loadRhs(blA, b0);
-              SAccPacketHalf c0 = predux4(C0);
-              straits.madd(a0,b0,c0,b0);
-              straits.acc(c0, alphav, R);
+              SLhsPacket A0,A1;
+              SRhsPacket B_0,B_1;
+
+              straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+
+              straits.loadRhsQuad(blA+0*spk, B_0);
+              straits.loadRhsQuad(blA+1*spk, B_1);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
+              straits.madd(A1,B_1,C1,B_1, fix<0>);
+
+              straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
+              straits.loadRhsQuad(blA+2*spk, B_0);
+              straits.loadRhsQuad(blA+3*spk, B_1);
+              straits.madd(A0,B_0,C2,B_0, fix<0>);
+              straits.madd(A1,B_1,C3,B_1, fix<0>);
+
+              blB += 4*SwappedTraits::LhsProgress;
+              blA += 4*spk;
+            }
+            C0 = padd(padd(C0,C1),padd(C2,C3));
+            for(; k<endk; k+=spk)
+            {
+              SLhsPacket A0;
+              SRhsPacket B_0;
+
+              straits.loadLhsUnaligned(blB, A0);
+              straits.loadRhsQuad(blA, B_0);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
+
+              blB += SwappedTraits::LhsProgress;
+              blA += spk;
+            }
+            if(SwappedTraits::LhsProgress==8)
+            {
+              // Special case where we have to first reduce the accumulation register C0
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+              SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+              SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+              if(depth-endk>0)
+              {
+                // We have to handle the last row of the rhs which corresponds to a half-packet
+                SLhsPacketHalf a0;
+                SRhsPacketHalf b0;
+                straits.loadLhsUnaligned(blB, a0);
+                straits.loadRhs(blA, b0);
+                SAccPacketHalf c0 = predux_half_dowto4(C0);
+                straits.madd(a0,b0,c0,b0, fix<0>);
+                straits.acc(c0, alphav, R);
+              }
+              else
+              {
+                straits.acc(predux_half_dowto4(C0), alphav, R);
+              }
+              res.scatterPacket(i, j2, R);
+            }
+            else if (SwappedTraits::LhsProgress==16)
+            {
+              // Special case where we have to first reduce the
+              // accumulation register C0. We specialize the block in
+              // template form, so that LhsProgress < 16 paths don't
+              // fail to compile
+              last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
+	            p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
             }
             else
             {
-                straits.acc(predux4(C0), alphav, R);
+              SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+              SResPacket alphav = pset1<SResPacket>(alpha);
+              straits.acc(C0, alphav, R);
+              res.scatterPacket(i, j2, R);
             }
-            res.scatterPacket(i, j2, R);
           }
-          else
+          else // scalar path
           {
-            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
-            SResPacket alphav = pset1<SResPacket>(alpha);
-            straits.acc(C0, alphav, R);
-            res.scatterPacket(i, j2, R);
-          }
-        }
-        else // scalar path
-        {
-          // get a 1 x 4 res block as registers
-          ResScalar C0(0), C1(0), C2(0), C3(0);
+            // get a 1 x 4 res block as registers
+            ResScalar C0(0), C1(0), C2(0), C3(0);
 
-          for(Index k=0; k<depth; k++)
-          {
-            LhsScalar A0 = blA[k];
-            RhsScalar B_0 = blB[0];
-            RhsScalar B_1 = blB[1];
-            CJMADD(cj,A0,B_0,C0, B_0);
-            CJMADD(cj,A0,B_1,C1, B_1);
-            RhsScalar B_2 = blB[2];
-            RhsScalar B_3 = blB[3];
-            CJMADD(cj,A0,B_2,C2, B_2);
-            CJMADD(cj,A0,B_3,C3, B_3);
+            for(Index k=0; k<depth; k++)
+            {
+              LhsScalar A0;
+              RhsScalar B_0, B_1;
 
-            blB += 4;
+              A0 = blA[k];
+
+              B_0 = blB[0];
+              B_1 = blB[1];
+              C0 = cj.pmadd(A0,B_0,C0);
+              C1 = cj.pmadd(A0,B_1,C1);
+
+              B_0 = blB[2];
+              B_1 = blB[3];
+              C2 = cj.pmadd(A0,B_0,C2);
+              C3 = cj.pmadd(A0,B_1,C3);
+
+              blB += 4;
+            }
+            res(i, j2 + 0) += alpha * C0;
+            res(i, j2 + 1) += alpha * C1;
+            res(i, j2 + 2) += alpha * C2;
+            res(i, j2 + 3) += alpha * C3;
           }
-          res(i, j2 + 0) += alpha * C0;
-          res(i, j2 + 1) += alpha * C1;
-          res(i, j2 + 2) += alpha * C2;
-          res(i, j2 + 3) += alpha * C3;
         }
       }
-
       // remaining columns
       for(Index j2=packet_cols4; j2<cols; j2++)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        //          prefetch(blA);
-        // gets a 1 x 1 res block as registers
-        ResScalar C0(0);
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-        for(Index k=0; k<depth; k++)
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc_quarter; i<rows; i+=1)
         {
-          LhsScalar A0 = blA[k];
-          RhsScalar B_0 = blB[k];
-          CJMADD(cj, A0, B_0, C0, B_0);
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          // gets a 1 x 1 res block as registers
+          ResScalar C0(0);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          for(Index k=0; k<depth; k++)
+          {
+            LhsScalar A0 = blA[k];
+            RhsScalar B_0 = blB[k];
+            C0 = cj.pmadd(A0, B_0, C0);
+          }
+          res(i, j2) += alpha * C0;
         }
-        res(i, j2) += alpha * C0;
       }
     }
   }
 
 
-#undef CJMADD
-
 // pack a block of the lhs
 // The traversal is as follow (mr==4):
 //   0  4  8 12 ...
@@ -1567,19 +2085,24 @@
 //
 //  32 33 34 35 ...
 //  36 36 38 39 ...
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1587,220 +2110,207 @@
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
 
   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
-                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;
+  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
+  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
+                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
 
   Index i=0;
 
   // Pack 3 packets
   if(Pack1>=3*PacketSize)
   {
-    if(PanelMode)
+    for(; i<peeled_mc3; i+=3*PacketSize)
     {
-      for(; i<peeled_mc3; i+=3*PacketSize)
-      {
-        blockA += (3*PacketSize) * offset;
+      if(PanelMode) count += (3*PacketSize) * offset;
 
-        for(Index k=0; k<depth; k++)
-        {
-          Packet A, B, C;
-          A = lhs.loadPacket(i+0*PacketSize, k);
-          B = lhs.loadPacket(i+1*PacketSize, k);
-          C = lhs.loadPacket(i+2*PacketSize, k);
-          pstore(blockA+0*PacketSize, cj.pconj(A));
-          pstore(blockA+1*PacketSize, cj.pconj(B));
-          pstore(blockA+2*PacketSize, cj.pconj(C));
-          blockA += 3*PacketSize;
-        }
-        blockA += (3*PacketSize) * (stride-offset-depth);
-      }
-    }
-    else
-    {
-      // Read the data from DRAM as sequentially as possible. We're writing to
-      // SRAM so the order of the writes shouldn't impact performance.
       for(Index k=0; k<depth; k++)
       {
-        Scalar* localBlockA = blockA + 3*PacketSize*k;
-        for(Index local_i = i; local_i<peeled_mc3; local_i+=3*PacketSize)
-        {
-          Packet A, B, C;
-          A = lhs.loadPacket(local_i+0*PacketSize, k);
-          B = lhs.loadPacket(local_i+1*PacketSize, k);
-          C = lhs.loadPacket(local_i+2*PacketSize, k);
-          pstore(localBlockA+0*PacketSize, cj.pconj(A));
-          pstore(localBlockA+1*PacketSize, cj.pconj(B));
-          pstore(localBlockA+2*PacketSize, cj.pconj(C));
-          localBlockA += 3*PacketSize*depth;
-        }
+        Packet A, B, C;
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
       }
-      blockA += depth*peeled_mc3;
-      i = peeled_mc3;
+      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
     }
   }
   // Pack 2 packets
   if(Pack1>=2*PacketSize)
   {
-    if(PanelMode)
+    for(; i<peeled_mc2; i+=2*PacketSize)
     {
-      for(; i<peeled_mc2; i+=2*PacketSize)
-      {
-        blockA += (2*PacketSize) * offset;
+      if(PanelMode) count += (2*PacketSize) * offset;
 
-        for(Index k=0; k<depth; k++)
-        {
-          Packet A, B;
-          A = lhs.loadPacket(i+0*PacketSize, k);
-          B = lhs.loadPacket(i+1*PacketSize, k);
-          pstore(blockA+0*PacketSize, cj.pconj(A));
-          pstore(blockA+1*PacketSize, cj.pconj(B));
-          blockA += 2*PacketSize;
-        }
-        blockA += (2*PacketSize) * (stride-offset-depth);
-      }
-    }
-    else
-    {
-      // Read the data from RAM as sequentially as possible.
       for(Index k=0; k<depth; k++)
       {
-        Scalar* localBlockA = blockA + 2*PacketSize*k;
-        for(Index local_i = i; local_i<peeled_mc2; local_i+=2*PacketSize)
-        {
-          Packet A, B;
-          A = lhs.loadPacket(local_i+0*PacketSize, k);
-          B = lhs.loadPacket(local_i+1*PacketSize, k);
-          pstore(localBlockA+0*PacketSize, cj.pconj(A));
-          pstore(localBlockA+1*PacketSize, cj.pconj(B));
-          localBlockA += 2*PacketSize*depth;
-        }
+        Packet A, B;
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
       }
-      blockA += depth*(peeled_mc2-i);
-      i = peeled_mc2;
+      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
     }
   }
   // Pack 1 packets
   if(Pack1>=1*PacketSize)
   {
-    if(PanelMode)
+    for(; i<peeled_mc1; i+=1*PacketSize)
     {
-      for(; i<peeled_mc1; i+=1*PacketSize)
-      {
-        blockA += (1*PacketSize) * offset;
+      if(PanelMode) count += (1*PacketSize) * offset;
 
-        for(Index k=0; k<depth; k++)
-        {
-          Packet A;
-          A = lhs.loadPacket(i+0*PacketSize, k);
-          pstore(blockA, cj.pconj(A));
-          blockA+=PacketSize;
-        }
-        blockA += (1*PacketSize) * (stride-offset-depth);
-      }
-    }
-    else
-    {
-      // Read the data from RAM as sequentially as possible.
       for(Index k=0; k<depth; k++)
       {
-        Scalar* localBlockA = blockA + PacketSize*k;
-        for(Index local_i = i; local_i<peeled_mc1; local_i+=1*PacketSize)
-        {
-          Packet A;
-          A = lhs.loadPacket(local_i+0*PacketSize, k);
-          pstore(localBlockA, cj.pconj(A));
-          localBlockA += PacketSize*depth;
-        }
+        Packet A;
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A));
+        count+=PacketSize;
       }
-      blockA += depth*(peeled_mc1-i);
-      i = peeled_mc1;
+      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack half packets
+  if(HasHalf && Pack1>=HalfPacketSize)
+  {
+    for(; i<peeled_mc_half; i+=HalfPacketSize)
+    {
+      if(PanelMode) count += (HalfPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        HalfPacket A;
+        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=HalfPacketSize;
+      }
+      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack quarter packets
+  if(HasQuarter && Pack1>=QuarterPacketSize)
+  {
+    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
+    {
+      if(PanelMode) count += (QuarterPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        QuarterPacket A;
+        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=QuarterPacketSize;
+      }
+      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack2 may be *smaller* than PacketSize—that happens for
+  // products like real * complex, where we have to go half the
+  // progress on the lhs in order to duplicate those operands to
+  // address both real & imaginary parts on the rhs. This portion will
+  // pack those half ones until they match the number expected on the
+  // last peeling loop at this point (for the rhs).
+  if(Pack2<PacketSize && Pack2>1)
+  {
+    for(; i<peeled_mc0; i+=last_lhs_progress)
+    {
+      if(PanelMode) count += last_lhs_progress * offset;
+
+      for(Index k=0; k<depth; k++)
+        for(Index w=0; w<last_lhs_progress; w++)
+          blockA[count++] = cj(lhs(i+w, k));
+
+      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
     }
   }
   // Pack scalars
-  if(Pack2<PacketSize && Pack2>1)
-  {
-    for(; i<peeled_mc0; i+=Pack2)
-    {
-      if (PanelMode) {
-        blockA += Pack2 * offset;
-      }
-
-      for(Index k=0; k<depth; k++) {
-        const LinearMapper dm0 = lhs.getLinearMapper(i, k);
-        for(Index w=0; w<Pack2; w++) {
-          *blockA = cj(dm0(w));
-          blockA += 1;
-        }
-      }
-
-      if(PanelMode) blockA += Pack2 * (stride-offset-depth);
-    }
-  }
   for(; i<rows; i++)
   {
-    if(PanelMode) blockA += offset;
-    for(Index k=0; k<depth; k++) {
-      *blockA = cj(lhs(i, k));
-      blockA += 1;
-    }
-    if(PanelMode) blockA += (stride-offset-depth);
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
   }
 }
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
   EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
+  bool gone_half = false, gone_quarter = false, gone_last = false;
 
-//   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
-//   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-//   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
-  int pack = Pack1;
   Index i = 0;
+  int pack = Pack1;
+  int psize = PacketSize;
   while(pack>0)
   {
     Index remaining_rows = rows-i;
-    Index peeled_mc = i+(remaining_rows/pack)*pack;
+    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
+    Index starting_pos = i;
     for(; i<peeled_mc; i+=pack)
     {
-      if(PanelMode) blockA += pack * offset;
+      if(PanelMode) count += pack * offset;
 
-      const Index peeled_k = (depth/PacketSize)*PacketSize;
       Index k=0;
-      if(pack>=PacketSize)
+      if(pack>=psize && psize >= QuarterPacketSize)
       {
-        for(; k<peeled_k; k+=PacketSize)
+        const Index peeled_k = (depth/psize)*psize;
+        for(; k<peeled_k; k+=psize)
         {
-          for (Index m = 0; m < pack; m += PacketSize)
+          for (Index m = 0; m < pack; m += psize)
           {
-            PacketBlock<Packet> kernel;
-            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
-            ptranspose(kernel);
-            for (int p = 0; p < PacketSize; ++p) pstore(blockA+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            if (psize == PacketSize) {
+              PacketBlock<Packet> kernel;
+              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+              ptranspose(kernel);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            } else if (HasHalf && psize == HalfPacketSize) {
+              gone_half = true;
+              PacketBlock<HalfPacket> kernel_half;
+              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+              ptranspose(kernel_half);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+            } else if (HasQuarter && psize == QuarterPacketSize) {
+              gone_quarter = true;
+              PacketBlock<QuarterPacket> kernel_quarter;
+              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+              ptranspose(kernel_quarter);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+	    }
           }
-          blockA += PacketSize*pack;
+          count += psize*pack;
         }
       }
+
       for(; k<depth; k++)
       {
         Index w=0;
@@ -1810,35 +2320,49 @@
                  b(cj(lhs(i+w+1, k))),
                  c(cj(lhs(i+w+2, k))),
                  d(cj(lhs(i+w+3, k)));
-          blockA[0] = a;
-          blockA[1] = b;
-          blockA[2] = c;
-          blockA[3] = d;
-          blockA += 4;
+          blockA[count++] = a;
+          blockA[count++] = b;
+          blockA[count++] = c;
+          blockA[count++] = d;
         }
         if(pack%4)
-          for(;w<pack;++w) {
-            *blockA = cj(lhs(i+w, k));
-            blockA += 1;
-          }
+          for(;w<pack;++w)
+            blockA[count++] = cj(lhs(i+w, k));
       }
 
-      if(PanelMode) blockA += pack * (stride-offset-depth);
+      if(PanelMode) count += pack * (stride-offset-depth);
     }
 
-    pack -= PacketSize;
-    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
-      pack = Pack2;
+    pack -= psize;
+    Index left = rows - i;
+    if (pack <= 0) {
+      if (!gone_last &&
+          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
+          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
+           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+        psize /= 2;
+        pack = psize;
+        continue;
+      }
+      // Pack2 may be *smaller* than PacketSize—that happens for
+      // products like real * complex, where we have to go half the
+      // progress on the lhs in order to duplicate those operands to
+      // address both real & imaginary parts on the rhs. This portion will
+      // pack those half ones until they match the number expected on the
+      // last peeling loop at this point (for the rhs).
+      if (Pack2 < PacketSize && !gone_last) {
+        gone_last = true;
+        psize = pack = left & ~1;
+      }
+    }
   }
 
   for(; i<rows; i++)
   {
-    if(PanelMode) blockA += offset;
-    for(Index k=0; k<depth; k++) {
-      *blockA = cj(lhs(i, k));
-      blockA += 1;
-    }
-    if(PanelMode) blockA += (stride-offset-depth);
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
   }
 }
 
@@ -1860,7 +2384,7 @@
 
 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1869,6 +2393,7 @@
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
   Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+  Index count = 0;
   const Index peeled_k = (depth/PacketSize)*PacketSize;
 //   if(nr>=8)
 //   {
@@ -1885,7 +2410,7 @@
 //       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
 //       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
 //       Index k=0;
-//       if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
+//       if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
 //       {
 //         for(; k<peeled_k; k+=PacketSize) {
 //           PacketBlock<Packet> kernel;
@@ -1921,9 +2446,7 @@
     for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
     {
       // skip what we have before
-      if(PanelMode) blockB += 4 * offset;
-
-      // TODO: each of these makes a copy of the stride :(
+      if(PanelMode) count += 4 * offset;
       const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
       const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
       const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
@@ -1933,43 +2456,43 @@
       if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
       {
         for(; k<peeled_k; k+=PacketSize) {
-          PacketBlock<Packet, 4> kernel;
-          kernel.packet[0] = dm0.loadPacket(k);
-          kernel.packet[1] = dm1.loadPacket(k);
-          kernel.packet[2] = dm2.loadPacket(k);
-          kernel.packet[3] = dm3.loadPacket(k);
+          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
+          kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
           ptranspose(kernel);
-          pstoreu(blockB+0*PacketSize, cj.pconj(kernel.packet[0]));
-          pstoreu(blockB+1*PacketSize, cj.pconj(kernel.packet[1]));
-          pstoreu(blockB+2*PacketSize, cj.pconj(kernel.packet[2]));
-          pstoreu(blockB+3*PacketSize, cj.pconj(kernel.packet[3]));
-          blockB+=4*PacketSize;
+          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
+          count+=4*PacketSize;
         }
       }
       for(; k<depth; k++)
       {
-        blockB[0] = cj(dm0(k));
-        blockB[1] = cj(dm1(k));
-        blockB[2] = cj(dm2(k));
-        blockB[3] = cj(dm3(k));
-        blockB += 4;
+        blockB[count+0] = cj(dm0(k));
+        blockB[count+1] = cj(dm1(k));
+        blockB[count+2] = cj(dm2(k));
+        blockB[count+3] = cj(dm3(k));
+        count += 4;
       }
       // skip what we have after
-      if(PanelMode) blockB += 4 * (stride-offset-depth);
+      if(PanelMode) count += 4 * (stride-offset-depth);
     }
   }
 
   // copy the remaining columns one at a time (nr==1)
   for(Index j2=packet_cols4; j2<cols; ++j2)
   {
+    if(PanelMode) count += offset;
     const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
-    if(PanelMode) blockB += offset;
     for(Index k=0; k<depth; k++)
     {
-      *blockB = cj(dm0(k));
-      blockB += 1;
+      blockB[count] = cj(dm0(k));
+      count += 1;
     }
-    if(PanelMode) blockB += (stride-offset-depth);
+    if(PanelMode) count += (stride-offset-depth);
   }
 }
 
@@ -1978,103 +2501,104 @@
 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   typedef typename DataMapper::LinearMapper LinearMapper;
-  enum {
-    PacketSize = packet_traits<Scalar>::size,
-    HalfPacketSize = packet_traits<Scalar>::HasHalfPacket ? unpacket_traits<typename packet_traits<Scalar>::half>::size : 0
-  };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
-};
-
-template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
-  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride);
-  EIGEN_UNUSED_VARIABLE(offset);
-  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
-  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
-
-//   if(nr>=8)
-//   {
-//     for(Index j2=0; j2<packet_cols8; j2+=8)
-//     {
-//       // skip what we have before
-//       if(PanelMode) count += 8 * offset;
-//       for(Index k=0; k<depth; k++)
-//       {
-//         if (PacketSize==8) {
-//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
-//           pstoreu(blockB+count, cj.pconj(A));
-//         } else if (PacketSize==4) {
-//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
-//           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
-//           pstoreu(blockB+count, cj.pconj(A));
-//           pstoreu(blockB+count+PacketSize, cj.pconj(B));
-//         } else {
-//           const Scalar* b0 = &rhs[k*rhsStride + j2];
-//           blockB[count+0] = cj(b0[0]);
-//           blockB[count+1] = cj(b0[1]);
-//           blockB[count+2] = cj(b0[2]);
-//           blockB[count+3] = cj(b0[3]);
-//           blockB[count+4] = cj(b0[4]);
-//           blockB[count+5] = cj(b0[5]);
-//           blockB[count+6] = cj(b0[6]);
-//           blockB[count+7] = cj(b0[7]);
-//         }
-//         count += 8;
-//       }
-//       // skip what we have after
-//       if(PanelMode) count += 8 * (stride-offset-depth);
-//     }
-//   }
-  if(nr>=4)
+  enum { PacketSize = packet_traits<Scalar>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+		 QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
   {
-    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    Index count = 0;
+
+  //   if(nr>=8)
+  //   {
+  //     for(Index j2=0; j2<packet_cols8; j2+=8)
+  //     {
+  //       // skip what we have before
+  //       if(PanelMode) count += 8 * offset;
+  //       for(Index k=0; k<depth; k++)
+  //       {
+  //         if (PacketSize==8) {
+  //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+  //           pstoreu(blockB+count, cj.pconj(A));
+  //         } else if (PacketSize==4) {
+  //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+  //           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+  //           pstoreu(blockB+count, cj.pconj(A));
+  //           pstoreu(blockB+count+PacketSize, cj.pconj(B));
+  //         } else {
+  //           const Scalar* b0 = &rhs[k*rhsStride + j2];
+  //           blockB[count+0] = cj(b0[0]);
+  //           blockB[count+1] = cj(b0[1]);
+  //           blockB[count+2] = cj(b0[2]);
+  //           blockB[count+3] = cj(b0[3]);
+  //           blockB[count+4] = cj(b0[4]);
+  //           blockB[count+5] = cj(b0[5]);
+  //           blockB[count+6] = cj(b0[6]);
+  //           blockB[count+7] = cj(b0[7]);
+  //         }
+  //         count += 8;
+  //       }
+  //       // skip what we have after
+  //       if(PanelMode) count += 8 * (stride-offset-depth);
+  //     }
+  //   }
+    if(nr>=4)
     {
-      // skip what we have before
-      if(PanelMode) blockB += 4 * offset;
+      for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+      {
+        // skip what we have before
+        if(PanelMode) count += 4 * offset;
+        for(Index k=0; k<depth; k++)
+        {
+          if (PacketSize==4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize==4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize==4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count+0] = cj(dm0(0));
+            blockB[count+1] = cj(dm0(1));
+            blockB[count+2] = cj(dm0(2));
+            blockB[count+3] = cj(dm0(3));
+            count += 4;
+          }
+        }
+        // skip what we have after
+        if(PanelMode) count += 4 * (stride-offset-depth);
+      }
+    }
+    // copy the remaining columns one at a time (nr==1)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
+    {
+      if(PanelMode) count += offset;
       for(Index k=0; k<depth; k++)
       {
-        if (PacketSize==4) {
-          Packet A = rhs.loadPacket(k, j2);
-          pstore(blockB, cj.pconj(A));
-          blockB += PacketSize;
-        }
-        else if (HalfPacketSize==4) {
-          HalfPacket A = rhs.loadHalfPacket(k, j2);
-          pstore<Scalar, HalfPacket>(blockB, cj.pconj(A));
-          blockB += HalfPacketSize;
-        }
-        else {
-          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
-          blockB[0] = cj(dm0(0));
-          blockB[1] = cj(dm0(1));
-          blockB[2] = cj(dm0(2));
-          blockB[3] = cj(dm0(3));
-          blockB += 4;
-        }
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
       }
-      // skip what we have after
-      if(PanelMode) blockB += 4 * (stride-offset-depth);
+      if(PanelMode) count += stride-offset-depth;
     }
   }
-  // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols4; j2<cols; ++j2)
-  {
-    if(PanelMode) blockB += offset;
-    for(Index k=0; k<depth; k++)
-    {
-      *blockB = cj(rhs(k, j2));
-      blockB += 1;
-    }
-    if(PanelMode) blockB += stride-offset-depth;
-  }
-}
+};
 
 } // end namespace internal
 
@@ -2096,8 +2620,9 @@
   return l2;
 }
 
-/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
-  * \sa setCpuCacheSize */
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
+rs.                                                                                                                
+* \sa setCpuCacheSize */
 inline std::ptrdiff_t l3CacheSize()
 {
   std::ptrdiff_t l1, l2, l3;

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 9bed9af..caa65fc 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h

@@ -20,17 +20,18 @@
 template<
   typename Index,
   typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
-struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  int ResInnerStride>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
 {
   typedef gebp_traits<RhsScalar,LhsScalar> Traits;
 
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols, Index depth,
     const LhsScalar* lhs, Index lhsStride,
     const RhsScalar* rhs, Index rhsStride,
-    ResScalar* res, Index resStride,
+    ResScalar* res, Index resIncr, Index resStride,
     ResScalar alpha,
     level3_blocking<RhsScalar,LhsScalar>& blocking,
     GemmParallelInfo<Index>* info = 0)
@@ -39,8 +40,8 @@
     general_matrix_matrix_product<Index,
       RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
       LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
-      ColMajor>
-    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
+      ColMajor,ResInnerStride>
+    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
   }
 };
 
@@ -49,33 +50,34 @@
 template<
   typename Index,
   typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
-struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  int ResInnerStride>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
 {
 
 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
   const LhsScalar* _lhs, Index lhsStride,
   const RhsScalar* _rhs, Index rhsStride,
-  ResScalar* _res, Index resStride,
+  ResScalar* _res, Index resIncr, Index resStride,
   ResScalar alpha,
   level3_blocking<LhsScalar,RhsScalar>& blocking,
   GemmParallelInfo<Index>* info = 0)
 {
   typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
   typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
-  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
-  LhsMapper lhs(_lhs,lhsStride);
-  RhsMapper rhs(_rhs,rhsStride);
-  ResMapper res(_res, resStride);
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
+  LhsMapper lhs(_lhs, lhsStride);
+  RhsMapper rhs(_rhs, rhsStride);
+  ResMapper res(_res, resStride, resIncr);
 
   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
   Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
   gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
   gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
@@ -83,8 +85,8 @@
   if(info)
   {
     // this is the parallel version!
-    Index tid = omp_get_thread_num();
-    Index threads = omp_get_num_threads();
+    int tid = omp_get_thread_num();
+    int threads = omp_get_num_threads();
 
     LhsScalar* blockA = blocking.blockA();
     eigen_internal_assert(blockA!=0);
@@ -108,7 +110,7 @@
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
-      info[tid].users += threads;
+      info[tid].users = threads;
 
       pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
@@ -116,15 +118,16 @@
       info[tid].sync = k;
 
       // Computes C_i += A' * B' per A'_i
-      for(Index shift=0; shift<threads; ++shift)
+      for(int shift=0; shift<threads; ++shift)
       {
-        Index i = (tid+shift)%threads;
+        int i = (tid+shift)%threads;
 
         // At this point we have to make sure that A'_i has been updated by the thread i,
         // we use testAndSetOrdered to mimic a volatile access.
         // However, no need to wait for the B' part which has been updated by the current thread!
         if (shift>0) {
-          while(info[i].sync!=k) {}
+          while(info[i].sync!=k) {
+          }
         }
 
         gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
@@ -144,12 +147,11 @@
 
       // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
-      #pragma omp critical
-      {
       for(Index i=0; i<threads; ++i)
+#if !EIGEN_HAS_CXX11_ATOMIC
         #pragma omp atomic
-        --(info[i].users);
-      }
+#endif
+        info[i].users -= 1;
     }
   }
   else
@@ -203,15 +205,10 @@
 };
 
 /*********************************************************************************
-*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+*  Specialization of generic_product_impl for "large" GEMM, i.e.,
 *  implementation of the high level wrapper to general_matrix_matrix_product
 **********************************************************************************/
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
-{};
-
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
@@ -219,8 +216,9 @@
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}
 
-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
   {
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
     m_blocking.allocateA();
   }
 
@@ -230,9 +228,9 @@
       cols = m_rhs.cols();
 
     Gemm::run(rows, cols, m_lhs.cols(),
-              /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
-              /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
-              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
+              &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+              &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
               m_actualAlpha, m_blocking, info);
   }
 
@@ -259,9 +257,9 @@
     LhsScalar* m_blockA;
     RhsScalar* m_blockB;
 
-    DenseIndex m_mc;
-    DenseIndex m_nc;
-    DenseIndex m_kc;
+    Index m_mc;
+    Index m_nc;
+    Index m_kc;
 
   public:
 
@@ -269,16 +267,16 @@
       : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
     {}
 
-    inline DenseIndex mc() const { return m_mc; }
-    inline DenseIndex nc() const { return m_nc; }
-    inline DenseIndex kc() const { return m_kc; }
+    inline Index mc() const { return m_mc; }
+    inline Index nc() const { return m_nc; }
+    inline Index kc() const { return m_kc; }
 
     inline LhsScalar* blockA() { return m_blockA; }
     inline RhsScalar* blockB() { return m_blockB; }
 };
 
 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
   : public level3_blocking<
       typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
       typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
@@ -296,20 +294,33 @@
       SizeB = ActualCols * MaxDepth
     };
 
-    EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+    EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+    EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif
 
   public:
 
-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
       this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
+#else
+      this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
     }
 
+    void initParallel(Index, Index, Index, Index)
+    {}
+
     inline void allocateA() {}
     inline void allocateB() {}
     inline void allocateAll() {}
@@ -328,12 +339,12 @@
     typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-    DenseIndex m_sizeA;
-    DenseIndex m_sizeB;
+    Index m_sizeA;
+    Index m_sizeB;
 
   public:
 
-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, DenseIndex num_threads, bool l3_blocking)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
@@ -341,20 +352,31 @@
 
       if(l3_blocking)
       {
-        DenseIndex m = this->m_mc;
-        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
       }
-      else // no l3 blocking
+      else  // no l3 blocking
       {
-        DenseIndex m = this->m_mc;
-        DenseIndex n = this->m_nc;
-        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
+        Index n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
       }
 
       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
     }
 
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
+
     void allocateA()
     {
       if(this->m_blockA==0)
@@ -382,183 +404,114 @@
 
 } // end namespace internal
 
+namespace internal {
+
 template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemmProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
 {
-    enum {
-      MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
-    };
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
 
-    typedef typename  Lhs::Scalar LhsScalar;
-    typedef typename  Rhs::Scalar RhsScalar;
-    typedef           Scalar      ResScalar;
-    typedef typename  NumTraits<Scalar>::Real RealResScalar;
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
 
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+  enum {
+    MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+  };
+
+  typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
+
+  template<typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+    // to determine the following heuristic.
+    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+    // unless it has been specialized by the user or for a given architecture.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
+    // I'm not sure it is still required.
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
+    else
     {
-      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
+      dst.setZero();
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
     }
+  }
 
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const
+  template<typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
+    else
+      scaleAndAddTo(dst,lhs, rhs, Scalar(1));
+  }
+
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;
+
+    if (dst.cols() == 1)
     {
-      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
-        dst.noalias() = m_lhs .lazyProduct( m_rhs );
-      else
-      {
-        dst.setZero();
-        scaleAndAddTo(dst,Scalar(1));
-      }
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::ColXpr dst_vec(dst.col(0));
+      return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct>
+        ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
     }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const
+    else if (dst.rows() == 1)
     {
-      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
-        dst.noalias() += m_lhs .lazyProduct( m_rhs );
-      else
-        scaleAndAddTo(dst,Scalar(1));
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::RowXpr dst_vec(dst.row(0));
+      return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct>
+        ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
     }
 
-    template<typename Dest>
-    inline void subTo(Dest& dst) const
-    {
-      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
-        dst.noalias() -= m_lhs .lazyProduct( m_rhs );
-      else
-        scaleAndAddTo(dst,Scalar(-1));
-    }
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    template <typename Dest>
-    void scaleAndAddTo(Dest& dst, const Scalar& alpha) const {
-      eigen_assert(dst.rows() == m_lhs.rows() && dst.cols() == m_rhs.cols());
+    Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
 
-      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) *
-                           RhsBlasTraits::extractScalarFactor(m_rhs);
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+            Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
 
-      typename internal::add_const_on_value_type<ActualLhsType>::type lhs =
-          LhsBlasTraits::extract(m_lhs);
-      typename internal::add_const_on_value_type<ActualRhsType>::type rhs =
-          RhsBlasTraits::extract(m_rhs);
-      const int lhs_storage_order =
-          _ActualLhsType::Flags & RowMajorBit ? RowMajor : ColMajor;
-      const int rhs_storage_order =
-          _ActualRhsType::Flags & RowMajorBit ? RowMajor : ColMajor;
+    typedef internal::gemm_functor<
+      Scalar, Index,
+      internal::general_matrix_matrix_product<
+        Index,
+        LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+        RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+        (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
+        Dest::InnerStrideAtCompileTime>,
+      ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
 
-      // Call the specialized matrix-vector kernel if the lhs or rhs is actually
-      // a vector, since it is up to 4x faster.
-      if (rhs.cols() == 1) {
-        const int dst_storage_order =
-            Dest::Flags & RowMajorBit ? RowMajor : ColMajor;
-        const Index dst_stride = dst_storage_order == RowMajor ? dst.outerStride()
-             : dst.innerStride();
-        const Index rhs_stride = rhs_storage_order == RowMajor ? rhs.outerStride()
-            : rhs.innerStride();
-        const Index lhs_stride = lhs.outerStride();
-
-        // Check that we don't violate one of the limitations of the
-        // general_matrix_vector_product implementation.
-        if (!(lhs_storage_order == RowMajor && rhs_stride != 1) &&
-            !(lhs_storage_order == ColMajor &&
-              (dst_stride != 1 || (NumTraits<Scalar>::IsComplex &&
-                numext::imag(actualAlpha) != RealResScalar(0) &&
-                !NumTraits<RhsScalar>::IsComplex)))) {
-          typedef typename internal::const_blas_data_mapper<LhsScalar, Index,
-                                                            lhs_storage_order>
-              LhsMapper;
-          typedef typename internal::const_blas_data_mapper<RhsScalar, Index,
-                                                            rhs_storage_order>
-              RhsMapper;
-          internal::general_matrix_vector_product<
-              Index, LhsScalar, LhsMapper, lhs_storage_order,
-              LhsBlasTraits::NeedToConjugate, RhsScalar, RhsMapper,
-              RhsBlasTraits::NeedToConjugate>::run(lhs.rows(), lhs.cols(),
-                                                   LhsMapper(lhs.data(),
-                                                             lhs_stride),
-                                                   RhsMapper(rhs.data(),
-                                                             rhs_stride),
-                                                   dst.data(),
-                                                   dst_stride,
-                                                   actualAlpha);
-          return;
-        }
-      }
-
-      if (lhs.rows() == 1) {
-          // Matrix is on the right c = v * A. Use transposition and compute
-          // c' = A' * v'.
-        const int lhs_transposed_storage_order =
-            _ActualLhsType::Flags & RowMajorBit ? ColMajor : RowMajor;
-        const int rhs_transposed_storage_order =
-            _ActualRhsType::Flags & RowMajorBit ? ColMajor : RowMajor;
-        const int dst_transposed_storage_order =
-            Dest::Flags & RowMajorBit ? ColMajor : RowMajor;
-        const Index dst_stride = dst_transposed_storage_order == RowMajor
-                               ? dst.outerStride()
-                               : dst.innerStride();
-        const Index rhs_stride = rhs.outerStride();
-        const Index lhs_stride = lhs_transposed_storage_order == RowMajor
-                               ? lhs.outerStride()
-                               : lhs.innerStride();
-
-        // Check that we don't violate one of the limitations of the
-        // general_matrix_vector_product implementation.
-        if (!(rhs_transposed_storage_order == RowMajor && lhs_stride != 1) &&
-            !(rhs_transposed_storage_order == ColMajor &&
-              (dst_stride != 1 ||
-               (NumTraits<Scalar>::IsComplex &&
-                numext::imag(actualAlpha) != RealResScalar(0) &&
-                !NumTraits<LhsScalar>::IsComplex)))) {
-          typedef typename internal::const_blas_data_mapper<
-              LhsScalar, Index, lhs_transposed_storage_order>
-              LhsMapper;
-          typedef typename internal::const_blas_data_mapper<
-              RhsScalar, Index, rhs_transposed_storage_order>
-              RhsMapper;
-          internal::general_matrix_vector_product<
-              Index, RhsScalar, RhsMapper, rhs_transposed_storage_order,
-              RhsBlasTraits::NeedToConjugate, LhsScalar, LhsMapper,
-              LhsBlasTraits::NeedToConjugate>::run(rhs.cols(), rhs.rows(),
-                                                   RhsMapper(rhs.data(),
-                                                             rhs_stride),
-                                                   LhsMapper(lhs.data(),
-                                                             lhs_stride),
-                                                   dst.data(),
-                                                   dst_stride,
-                                                   actualAlpha);
-          return;
-        }
-      }
-
-      typedef internal::gemm_blocking_space<
-          (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar,
-          RhsScalar, Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime,
-          MaxDepthAtCompileTime>
-          BlockingType;
-
-      typedef internal::gemm_functor<
-          Scalar, Index,
-          internal::general_matrix_matrix_product<
-              Index, LhsScalar, lhs_storage_order,
-              bool(LhsBlasTraits::NeedToConjugate), RhsScalar,
-              rhs_storage_order, bool(RhsBlasTraits::NeedToConjugate),
-              (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor>,
-          _ActualLhsType, _ActualRhsType, Dest, BlockingType>
-          GemmFunctor;
-
-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
-
-      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 ||
-                                  Dest::MaxRowsAtCompileTime == Dynamic)>(
-          GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(),
-          this->cols(), lhs.cols(), Dest::Flags & RowMajorBit);
-    }
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
+        (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
+  }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_GENERAL_MATRIX_MATRIX_H

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 86733ff..6ba0d9b 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h

@@ -20,70 +20,77 @@
 /**********************************************************************
 * This file implements a general A * B product while
 * evaluating only one triangular part of the product.
-* This is more general version of self adjoint product (C += A A^T)
+* This is a more general version of self adjoint product (C += A A^T)
 * as the level 3 SYRK Blas routine.
 **********************************************************************/
 
 // forward declarations (defined at the end of this file)
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
 struct tribb_kernel;
   
 /* Optimized matrix-matrix product evaluating only one triangular half */
 template <typename Index,
           typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
           typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
-                              int ResStorageOrder, int  UpLo, int Version = Specialized>
+                              int ResStorageOrder, int ResInnerStride, int  UpLo, int Version = Specialized>
 struct general_matrix_matrix_triangular_product;
 
 // as usual if the result is row major => we transpose the product
 template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
-struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
+                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+                          int ResInnerStride, int  UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
-                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
+                                      const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
   {
     general_matrix_matrix_triangular_product<Index,
         RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
         LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
-        ColMajor, UpLo==Lower?Upper:Lower>
-      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+        ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
+      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
   }
 };
 
 template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
-struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
+                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+                          int ResInnerStride, int  UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* _rhs, Index rhsStride,
+                                      ResScalar* _res, Index resIncr, Index resStride,
+                                      const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
   {
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
     typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
     typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     LhsMapper lhs(_lhs,lhsStride);
     RhsMapper rhs(_rhs,rhsStride);
-    ResMapper res(_res, resStride);
+    ResMapper res(_res, resStride, resIncr);
 
-    Index kc = depth; // cache block size along the K direction
-    Index mc = size;  // cache block size along the M direction
-    Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, Index(1));
+    Index kc = blocking.kc();
+    Index mc = (std::min)(size,blocking.mc());
+
     // !!! mc must be a multiple of nr:
     if(mc > Traits::nr)
       mc = (mc/Traits::nr)*Traits::nr;
 
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*size;
 
-    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
-    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
+    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
 
     for(Index k2=0; k2<depth; k2+=kc)
     {
@@ -106,8 +113,7 @@
           gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
                (std::min)(size,i2), alpha, -1, -1, 0, 0);
 
-
-        sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+        sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
 
         if (UpLo==Upper)
         {
@@ -129,7 +135,7 @@
 //   while the triangular block overlapping the diagonal is evaluated into a
 //   small temporary buffer which is then accumulated into the result using a
 //   triangular traversal.
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
 struct tribb_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
@@ -138,13 +144,15 @@
   enum {
     BlockSize  = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
   };
-  void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+  void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
   {
-    typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
-    ResMapper res(_res, resStride);
-    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
+    ResMapper res(_res, resStride, resIncr);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
+    gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
 
-    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
 
     // let's process the block per panel of actual_mc x BlockSize,
     // again, each is split into three parts, etc.
@@ -154,31 +162,32 @@
       const RhsScalar* actual_b = blockB+j*depth;
 
       if(UpLo==Upper)
-        gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0);
-
+        gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+                     -1, -1, 0, 0);
+      
       // selfadjoint micro block
       {
         Index i = j;
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0);
+        gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                     -1, -1, 0, 0);
+
         // 2 - triangular accumulation
         for(Index j1=0; j1<actualBlockSize; ++j1)
         {
-          ResScalar* r = &res(i, j + j1);
+          typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
           for(Index i1=UpLo==Lower ? j1 : 0;
               UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
-            r[i1] += buffer(i1,j1);
+            r(i1) += buffer(i1,j1);
         }
       }
 
       if(UpLo==Lower)
       {
         Index i = j+actualBlockSize;
-        gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, 
-                    depth, actualBlockSize, alpha, -1, -1, 0, 0);
+        gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, 
+                     depth, actualBlockSize, alpha, -1, -1, 0, 0);
       }
     }
   }
@@ -195,10 +204,9 @@
 template<typename MatrixType, typename ProductType, int UpLo>
 struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
 {
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     
     typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
@@ -214,6 +222,9 @@
 
     Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
 
+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
     enum {
       StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
       UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
@@ -241,10 +252,8 @@
 template<typename MatrixType, typename ProductType, int UpLo>
 struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 {
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
   {
-    typedef typename MatrixType::Index Index;
-    
     typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
@@ -259,26 +268,48 @@
 
     typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
 
+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
+      RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
+      SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
+    };
+
+    Index size = mat.cols();
+    if(SkipDiag)
+      size--;
+    Index depth = actualLhs.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
+          MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
     internal::general_matrix_matrix_triangular_product<Index,
-      typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualLhs.cols(),
-            &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+      typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
+      ::run(size, depth,
+            &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
+            &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
+            mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
+            mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
 template<typename MatrixType, unsigned int UpLo>
-template<typename ProductDerived, typename _Lhs, typename _Rhs>
-EIGEN_DEVICE_FUNC
-TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+template<typename ProductType>
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
-  eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
+  EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
 
-  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
-  
-  return *this;
+  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
+
+  return derived();
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
new file mode 100644
index 0000000..9a650ec
--- /dev/null
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h

@@ -0,0 +1,145 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+struct general_matrix_matrix_rankupdate :
+       general_matrix_matrix_triangular_product<
+         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
+                          int RhsStorageOrder, bool ConjugateRhs, int  UpLo> \
+struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
+               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
+                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
+  { \
+    if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
+      general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
+    } else { \
+      general_matrix_matrix_triangular_product<Index, \
+        Scalar, LhsStorageOrder, ConjugateLhs, \
+        Scalar, RhsStorageOrder, ConjugateRhs, \
+        ColMajor, 1, UpLo, BuiltIn> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
+    } \
+  } \
+};
+
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+  enum { \
+    IsLower = (UpLo&Lower) == Lower, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
+  }; \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
+\
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
+   EIGTYPE beta(1); \
+   BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
+  } \
+};
+
+// HERK for complex data
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+  enum { \
+    IsLower = (UpLo&Lower) == Lower, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
+  }; \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
+\
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
+   RTYPE alpha_, beta_; \
+   const EIGTYPE* a_ptr; \
+\
+   alpha_ = alpha.real(); \
+   beta_ = 1.0; \
+/* Copy with conjugation in some cases*/ \
+   MatrixType a; \
+   if (conjA) { \
+     Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
+     a = mapA.conjugate(); \
+     lda = a.outerStride(); \
+     a_ptr = a.data(); \
+   } else a_ptr=lhs; \
+   BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk)
+#else
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk_)
+#endif
+
+// TODO hanlde complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float, cherk_)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
new file mode 100644
index 0000000..71abf40
--- /dev/null
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h

@@ -0,0 +1,124 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-matrix multiplication using BLAS
+* gemm function via partial specialization of
+* general_matrix_matrix_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
+template< \
+  typename Index, \
+  int LhsStorageOrder, bool ConjugateLhs, \
+  int RhsStorageOrder, bool ConjugateRhs> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
+{ \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
+static void run(Index rows, Index cols, Index depth, \
+  const EIGTYPE* _lhs, Index lhsStride, \
+  const EIGTYPE* _rhs, Index rhsStride, \
+  EIGTYPE* res, Index resIncr, Index resStride, \
+  EIGTYPE alpha, \
+  level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
+  GemmParallelInfo<Index>* /*info = 0*/) \
+{ \
+  using std::conj; \
+\
+  EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+  eigen_assert(resIncr == 1); \
+  char transa, transb; \
+  BlasIndex m, n, k, lda, ldb, ldc; \
+  const EIGTYPE *a, *b; \
+  EIGTYPE beta(1); \
+  MatrixX##EIGPREFIX a_tmp, b_tmp; \
+\
+/* Set transpose options */ \
+  transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+  transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set m, n, k */ \
+  m = convert_index<BlasIndex>(rows);  \
+  n = convert_index<BlasIndex>(cols);  \
+  k = convert_index<BlasIndex>(depth); \
+\
+/* Set lda, ldb, ldc */ \
+  lda = convert_index<BlasIndex>(lhsStride); \
+  ldb = convert_index<BlasIndex>(rhsStride); \
+  ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+  if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
+    a_tmp = lhs.conjugate(); \
+    a = a_tmp.data(); \
+    lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+  } else a = _lhs; \
+\
+  if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
+    b_tmp = rhs.conjugate(); \
+    b = b_tmp.data(); \
+    ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+  } else b = _rhs; \
+\
+  BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+}};
+
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double,   d,  double, dgemm)
+GEMM_SPECIALIZATION(float,    f,  float,  sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  cgemm)
+#else
+GEMM_SPECIALIZATION(double,   d,  double, dgemm_)
+GEMM_SPECIALIZATION(float,    f,  float,  sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float,  cgemm_)
+#endif
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H

diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 323d75b..dfb6aeb 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,11 +14,57 @@
 
 namespace internal {
 
+enum GEMVPacketSizeType {
+  GEMVPacketFull = 0,
+  GEMVPacketHalf,
+  GEMVPacketQuarter
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
+class gemv_traits
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
+  typedef typename gemv_packet_cond<packet_size,                                  \
+                                    typename packet_traits<name ## Scalar>::type, \
+                                    typename packet_traits<name ## Scalar>::half, \
+                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+#undef PACKET_DECL_COND_PREFIX
+
+public:
+  enum {
+        Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
+        unpacket_traits<_RhsPacket>::vectorizable &&
+        int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
+        LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+        RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+        ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
+  };
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+};
+
+
 /* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments.
  *
  * Mixing type logic: C += alpha * A * B
  *  |  A  |  B  |alpha| comments
@@ -27,305 +73,219 @@
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  *
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- *   - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- *   - if even columns have the same alignment then
- *     // odd columns are guaranteed to have the same alignment too
- *     - if even or odd columns have the same alignment as the result, then
- *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
- *       - perform half aligned and half unaligned loads (-> EvenAligned case)
- *     - otherwise perform unaligned loads only (-> NoneAligned case)
- *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
- *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
- *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- *   - otherwise,
- *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- *     // we currently fall back to the NoneAligned case
- *
  * The same reasoning apply for the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
- * compared to unaligned loads on a 4 byte boundary.
- *
  */
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
 
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
 
-EIGEN_DONT_INLINE static void run(
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
         ResScalar* res, Index resIncr,
-  ResScalar alpha_in);
+  RhsScalar alpha);
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
         ResScalar* res, Index resIncr,
-  ResScalar alpha_in)
+  RhsScalar alpha)
 {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
-    pstore(&res[j], \
-      padd(pload<ResPacket>(&res[j]), \
-        padd( \
-      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
-      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
-      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
-      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
 
-  typedef typename LhsMapper::VectorMapper LhsScalars;
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate propoer code.
+  LhsMapper lhs(alhs);
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-
-  RhsScalar alpha = get_factor<ResScalar, RhsScalar>::run(alpha_in);
-
-  if(ConjugateRhs)
-    alpha = numext::conj(alpha);
-
-  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
-  const Index columnsAtOnce = 4;
-  const Index peels = 2;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index ResPacketAlignedMask = ResPacketSize-1;
-//  const Index PeelAlignedMask = ResPacketSize*peels-1;
-  const Index size = rows;
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
 
   const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
 
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
-  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+  const Index n8 = rows-8*ResPacketSize+1;
+  const Index n4 = rows-4*ResPacketSize+1;
+  const Index n3 = rows-3*ResPacketSize+1;
+  const Index n2 = rows-2*ResPacketSize+1;
+  const Index n1 = rows-1*ResPacketSize+1;
+  const Index n_half = rows-1*ResPacketSizeHalf+1;
+  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
 
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                       : FirstAligned;
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
 
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(size);
-
-  // find how many columns do we have to skip to be aligned with the result (if possible)
-  Index skipColumns = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
+  for(Index j2=0; j2<cols; j2+=block_cols)
   {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    // Currently, it seems to be better to perform unaligned loads anyway
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
-  {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
-
-    while (skipColumns<LhsPacketSize &&
-          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
-      ++skipColumns;
-    if (skipColumns==LhsPacketSize)
+    Index jend = numext::mini(j2+block_cols,cols);
+    Index i=0;
+    for(; i<n8; i+=ResPacketSize*8)
     {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipColumns = 0;
-    }
-    else
-    {
-      skipColumns = (std::min)(skipColumns,cols);
-      // note that the skiped columns are processed later.
-    }
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)),
+                c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)),
+                c7 = pset1<ResPacket>(ResScalar(0));
 
-    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
-                      || (skipColumns + columnsAtOnce >= cols)
-                      || LhsPacketSize > size
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = size;
-    alignmentPattern = AllAligned;
-  }
-
-  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
-
-  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
-  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
-  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
-
-    // this helps a lot generating better binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
-                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
-
-    if (Vectorizable)
-    {
-      /* explicit vectorization */
-      // process initial unaligned coeffs
-      for (Index j=0; j<alignedStart; ++j)
+      for(Index j=j2; j<jend; j+=1)
       {
-        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
       }
-
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if(peels>1)
-            {
-              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
-              ResPacket T0, T1;
-
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
-              for (; j<peeledSize; j+=peels*ResPacketSize)
-              {
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-
-                A00 = lhs0.template load<LhsPacket, Aligned>(j);
-                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
-                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
-                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
-
-                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                T0  = pcj.pmadd(A03, ptmp3, T0);
-                pstore(&res[j],T0);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-                T1  = pcj.pmadd(A11, ptmp1, T1);
-                T1  = pcj.pmadd(A12, ptmp2, T1);
-                T1  = pcj.pmadd(A13, ptmp3, T1);
-                pstore(&res[j+ResPacketSize],T1);
-              }
-            }
-            for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
-      }
-    } // end explicit vectorization
-
-    /* process remaining coeffs (or all if there is no explicit vectorization) */
-    for (Index j=alignedSize; j<size; ++j)
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
+    }
+    if(i<n4)
     {
-      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+
+      i+=ResPacketSize*4;
+    }
+    if(i<n3)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+
+      i+=ResPacketSize*3;
+    }
+    if(i<n2)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      i+=ResPacketSize*2;
+    }
+    if(i<n1)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      i+=ResPacketSize;
+    }
+    if(HasHalf && i<n_half)
+    {
+      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
+        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
+      i+=ResPacketSizeHalf;
+    }
+    if(HasQuarter && i<n_quarter)
+    {
+      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
+        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
+      i+=ResPacketSizeQuarter;
+    }
+    for(;i<rows;++i)
+    {
+      ResScalar c0(0);
+      for(Index j=j2; j<jend; j+=1)
+        c0 += cj.pmul(lhs(i,j), rhs(j,0));
+      res[i] += alpha*c0;
     }
   }
-
-  // process remaining first and last columns (at most columnsAtOnce-1)
-  Index end = cols;
-  Index start = columnBound;
-  do
-  {
-    for (Index k=start; k<end; ++k)
-    {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
-      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
-
-      if (Vectorizable)
-      {
-        /* explicit vectorization */
-        // process first unaligned result's coeffs
-        for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
-        // process aligned result's coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-        else
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-      }
-
-      // process remaining scalars (or all if no explicit vectorization)
-      for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
-    }
-    if (skipColumns)
-    {
-      start = 0;
-      end = skipColumns;
-      skipColumns = 0;
-    }
-    else
-      break;
-  } while(Vectorizable);
-  #undef _EIGEN_ACCUMULATE_PACKETS
 }
 
 /* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
@@ -337,25 +297,25 @@
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
 
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
 
-EIGEN_DONT_INLINE static void run(
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -364,254 +324,191 @@
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   ResScalar alpha)
 {
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate propoer code.
+  LhsMapper lhs(alhs);
+
   eigen_internal_assert(rhs.stride()==1);
-
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
-    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
-    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
-    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
-    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
-    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
-
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
 
-  typedef typename LhsMapper::VectorMapper LhsScalars;
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
+  const Index n4 = rows-3;
+  const Index n2 = rows-1;
 
-  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
-  const Index rowsAtOnce = 4;
-  const Index peels = 2;
-  const Index RhsPacketAlignedMask = RhsPacketSize-1;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index depth = cols;
-  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
 
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type
-  // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = rhs.firstAligned(depth);
-  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                           : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
-  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
-
-  // find how many rows do we have to skip to be aligned with rhs (if possible)
-  Index skipRows = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar))
-      || (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth)
-      || (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows))
+  Index i=0;
+  for(; i<n8; i+=8)
   {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
-  {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0)),
+              c4 = pset1<ResPacket>(ResScalar(0)),
+              c5 = pset1<ResPacket>(ResScalar(0)),
+              c6 = pset1<ResPacket>(ResScalar(0)),
+              c7 = pset1<ResPacket>(ResScalar(0));
 
-    while (skipRows<LhsPacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
-      ++skipRows;
-    if (skipRows==LhsPacketSize)
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
     {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipRows = 0;
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
+      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
+      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
+      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
     }
-    else
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    ResScalar cc4 = predux(c4);
+    ResScalar cc5 = predux(c5);
+    ResScalar cc6 = predux(c6);
+    ResScalar cc7 = predux(c7);
+    for(; j<cols; ++j)
     {
-      skipRows = (std::min)(skipRows,Index(rows));
-      // note that the skiped columns are processed later.
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+      cc2 += cj.pmul(lhs(i+2,j), b0);
+      cc3 += cj.pmul(lhs(i+3,j), b0);
+      cc4 += cj.pmul(lhs(i+4,j), b0);
+      cc5 += cj.pmul(lhs(i+5,j), b0);
+      cc6 += cj.pmul(lhs(i+6,j), b0);
+      cc7 += cj.pmul(lhs(i+7,j), b0);
     }
-    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
-                      || LhsPacketSize==1
-                      || (skipRows + rowsAtOnce >= rows)
-                      || LhsPacketSize > depth
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+    res[(i+2)*resIncr] += alpha*cc2;
+    res[(i+3)*resIncr] += alpha*cc3;
+    res[(i+4)*resIncr] += alpha*cc4;
+    res[(i+5)*resIncr] += alpha*cc5;
+    res[(i+6)*resIncr] += alpha*cc6;
+    res[(i+7)*resIncr] += alpha*cc7;
   }
-  else if(Vectorizable)
+  for(; i<n4; i+=4)
   {
-    alignedStart = 0;
-    alignedSize = depth;
-    alignmentPattern = AllAligned;
-  }
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0));
 
-  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
-
-  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
-  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
-  {
-    EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
-    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
-
-    // this helps the compiler generating good binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
-                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);
-
-    if (Vectorizable)
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
     {
-      /* explicit vectorization */
-      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
-                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
 
-      // process initial unaligned coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-      {
-        RhsScalar b = rhs(j, 0);
-        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
-        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
-      }
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    for(; j<cols; ++j)
+    {
+      RhsScalar b0 = rhs(j,0);
 
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+      cc2 += cj.pmul(lhs(i+2,j), b0);
+      cc3 += cj.pmul(lhs(i+3,j), b0);
+    }
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+    res[(i+2)*resIncr] += alpha*cc2;
+    res[(i+3)*resIncr] += alpha*cc3;
+  }
+  for(; i<n2; i+=2)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0));
+
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    for(; j<cols; ++j)
+    {
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+    }
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+  }
+  for(; i<rows; ++i)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
+    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
+    }
+    ResScalar cc0 = predux(c0);
+    if (HasHalf) {
+      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
         {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if (peels>1)
-            {
-              /* Here we proccess 4 rows with with two peeled iterations to hide
-               * the overhead of unaligned loads. Moreover unaligned loads are handled
-               * using special shift/move operations between the two aligned packets
-               * overlaping the desired unaligned packet. This is *much* more efficient
-               * than basic unaligned loads.
-               */
-              LhsPacket A01, A02, A03, A11, A12, A13;
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
-              for (; j<peeledSize; j+=peels*RhsPacketSize)
-              {
-                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-
-                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
-                ptmp1 = pcj.pmadd(A01, b, ptmp1);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                ptmp2 = pcj.pmadd(A02, b, ptmp2);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                ptmp3 = pcj.pmadd(A03, b, ptmp3);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-
-                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
-                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
-                ptmp1 = pcj.pmadd(A11, b, ptmp1);
-                ptmp2 = pcj.pmadd(A12, b, ptmp2);
-                ptmp3 = pcj.pmadd(A13, b, ptmp3);
-              }
-            }
-            for (; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
+          RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
+          c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
         }
-        tmp0 += predux(ptmp0);
-        tmp1 += predux(ptmp1);
-        tmp2 += predux(ptmp2);
-        tmp3 += predux(ptmp3);
-      }
-    } // end explicit vectorization
-
-    // process remaining coeffs (or all if no explicit vectorization)
-    // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<depth; ++j)
-    {
-      RhsScalar b = rhs(j, 0);
-      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
-      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+      cc0 += predux(c0_h);
     }
-    res[i*resIncr]            += alpha*tmp0;
-    res[(i+offset1)*resIncr]  += alpha*tmp1;
-    res[(i+2)*resIncr]        += alpha*tmp2;
-    res[(i+offset3)*resIncr]  += alpha*tmp3;
+    if (HasQuarter) {
+      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
+        {
+          RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
+          c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
+        }
+      cc0 += predux(c0_q);
+    }
+    for(; j<cols; ++j)
+    {
+      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
+    }
+    res[i*resIncr] += alpha*cc0;
   }
-
-  // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = rows;
-  Index start = rowBound;
-  do
-  {
-    for (Index i=start; i<end; ++i)
-    {
-      EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
-      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
-      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
-      // process first unaligned result's coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
-
-      if (alignedSize>alignedStart)
-      {
-        // process aligned rhs coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
-        else
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
-        tmp0 += predux(ptmp0);
-      }
-
-      // process remaining scalars
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<depth; ++j)
-        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
-      res[i*resIncr] += alpha*tmp0;
-    }
-    if (skipRows)
-    {
-      start = 0;
-      end = skipRows;
-      skipRows = 0;
-    }
-    else
-      break;
-  } while(Vectorizable);
-
-  #undef _EIGEN_ACCUMULATE_PACKETS
 }
 
 } // end namespace internal

diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
new file mode 100644
index 0000000..6e36c2b
--- /dev/null
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h

@@ -0,0 +1,136 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-vector multiplication using BLAS
+* gemv function via partial specialization of
+* general_matrix_vector_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemv specialization
+
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;
+
+#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
+static void run( \
+  Index rows, Index cols, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
+  Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+  if (ConjugateLhs) { \
+    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, rhs, res, resIncr, alpha); \
+  } else { \
+    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
+  } \
+} \
+}; \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
+static void run( \
+  Index rows, Index cols, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
+  Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
+} \
+}; \
+
+EIGEN_BLAS_GEMV_SPECIALIZE(double)
+EIGEN_BLAS_GEMV_SPECIALIZE(float)
+EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
+template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
+\
+static void run( \
+  Index rows, Index cols, \
+  const EIGTYPE* lhs, Index lhsStride, \
+  const EIGTYPE* rhs, Index rhsIncr, \
+  EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
+{ \
+  BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \
+            lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \
+  const EIGTYPE beta(1); \
+  const EIGTYPE *x_ptr; \
+  char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
+  if (LhsStorageOrder==RowMajor) { \
+    m = convert_index<BlasIndex>(cols); \
+    n = convert_index<BlasIndex>(rows); \
+  }\
+  GEMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+    incx=1; \
+  } else x_ptr=rhs; \
+  BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float,  cgemv_)
+#endif
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H

diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 3749939..8f91879 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h

@@ -10,89 +10,108 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
-#ifdef EIGEN_HAS_OPENMP
-#ifndef EIGEN_OPENMP_ENV_GET_MAX_THREADS
-// By default, call the OpenMP runtime to get the maximum number of threads.
-#define MAX_OPENMP_THREADS_FUNC omp_get_max_threads
-#else
-// We have been given an external function to call to get the maximum number
-// of threads.
-#ifdef EIGEN_OPENMP_ENV_NAMESPACE
-namespace EIGEN_OPENMP_ENV_NAMESPACE {
-  extern int EIGEN_OPENMP_ENV_GET_MAX_THREADS();
-}
-#define MAX_OPENMP_THREADS_FUNC ::EIGEN_OPENMP_ENV_NAMESPACE::EIGEN_OPENMP_ENV_GET_MAX_THREADS
-#else
-extern int EIGEN_OPENMP_ENV_GET_MAX_THREADS();
-#define MAX_OPENMP_THREADS_FUNC EIGEN_OPENMP_ENV_GET_MAX_THREADS
-#endif  // EIGEN_OPENMP_ENV_NAMESPACE
-#endif  // !EIGEN_OPENMP_ENV_GET_MAX_THREADS
-#endif  // EIGEN_HAS_OPENMP
+#if EIGEN_HAS_CXX11_ATOMIC
+#include <atomic>
+#endif
 
 namespace Eigen {
+
 namespace internal {
 
-inline int getAndMaybeSetMaxThreads(int new_max, bool set) {
-#ifdef EIGEN_HAS_OPENMP
-  static int m_maxThreads_ = MAX_OPENMP_THREADS_FUNC();
-  if (set) {
-    m_maxThreads_ = new_max;
+/** \internal */
+inline void manage_multi_threading(Action action, int* v)
+{
+  static int m_maxThreads = -1;
+  EIGEN_UNUSED_VARIABLE(m_maxThreads)
+
+  if(action==SetAction)
+  {
+    eigen_internal_assert(v!=0);
+    m_maxThreads = *v;
   }
-  return m_maxThreads_;
-#else
-  EIGEN_UNUSED_VARIABLE(set);
-  EIGEN_UNUSED_VARIABLE(new_max);
-  return 1;
-#endif
+  else if(action==GetAction)
+  {
+    eigen_internal_assert(v!=0);
+    #ifdef EIGEN_HAS_OPENMP
+    if(m_maxThreads>0)
+      *v = m_maxThreads;
+    else
+      *v = omp_get_max_threads();
+    #else
+    *v = 1;
+    #endif
+  }
+  else
+  {
+    eigen_internal_assert(false);
+  }
 }
 
-}  // namespace internal
-
-/** \returns the max number of threads reserved for Eigen. Always returns 1
-    unless EIGEN_HAS_OPENMP is set.
-  * \sa setNbThreads */
-inline int nbThreads() {
-  return internal::getAndMaybeSetMaxThreads(0, false);
 }
 
-/** Sets the max number of threads reserved for Eigen. A noop unless
-    EIGEN_HAS_OPENMP is set.
-  * \sa nbThreads */
-inline void setNbThreads(int new_max_threads) {
-  internal::getAndMaybeSetMaxThreads(new_max_threads, true);
-}
-
-inline void initParallel() {
-  internal::getAndMaybeSetMaxThreads(0, false);
+/** Must be call first when calling Eigen from multiple threads */
+inline void initParallel()
+{
+  int nbt;
+  internal::manage_multi_threading(GetAction, &nbt);
   std::ptrdiff_t l1, l2, l3;
   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 }
 
+/** \returns the max number of threads reserved for Eigen
+  * \sa setNbThreads */
+inline int nbThreads()
+{
+  int ret;
+  internal::manage_multi_threading(GetAction, &ret);
+  return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+  * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+  internal::manage_multi_threading(SetAction, &v);
+}
+
 namespace internal {
 
-template <typename Index>
-struct GemmParallelInfo {
+template<typename Index> struct GemmParallelInfo
+{
   GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
 
-  int volatile sync;
+  // volatile is not enough on all architectures (see bug 1572)
+  // to guarantee that when thread A says to thread B that it is
+  // done with packing a block, then all writes have been really
+  // carried out... C++11 memory model+atomic guarantees this.
+#if EIGEN_HAS_CXX11_ATOMIC
+  std::atomic<Index> sync;
+  std::atomic<int> users;
+#else
+  Index volatile sync;
   int volatile users;
+#endif
 
   Index lhs_start;
   Index lhs_length;
 };
 
-template <bool Condition, typename Functor, typename Index>
-void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
-                      bool transpose) {
-// TODO when EIGEN_USE_BLAS is defined,
-// we should still enable OMP for other scalar types
-#if !(defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS)
+template<bool Condition, typename Functor, typename Index>
+void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
+{
+  // TODO when EIGEN_USE_BLAS is defined,
+  // we should still enable OMP for other scalar types
+  // Without C++11, we have to disable GEMM's parallelization on
+  // non x86 architectures because there volatile is not enough for our purpose.
+  // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
   // FIXME the transpose variable is only needed to properly split
   // the matrix product when multithreading is enabled. This is a temporary
   // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redisigned anyway.
+  // parallelizer mechanism has to be redesigned anyway.
+  EIGEN_UNUSED_VARIABLE(depth);
   EIGEN_UNUSED_VARIABLE(transpose);
-  func(0, rows, 0, cols);
+  func(0,rows, 0,cols);
 #else
 
   // Dynamically check whether we should enable or disable OpenMP.
@@ -101,64 +120,61 @@
   // - we are not already in a parallel code
   // - the sizes are large enough
 
-  // 1- are we already in a parallel session?
-  if ((!Condition) || (omp_get_num_threads() > 1))
-    return func(0, rows, 0, cols);
-
+  // compute the maximal number of threads from the size of the product:
+  // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
   Index size = transpose ? rows : cols;
+  Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
 
-  // 2- compute the maximal number of threads from the size of the product:
-  // FIXME this has to be fine tuned
-  Index max_threads = std::max<Index>(1, size / 32);
-
-  // 3- compute a maximum number of threads based on the total amount of work.
+  // compute the maximal number of threads from the total amount of work:
   double work = static_cast<double>(rows) * static_cast<double>(cols) *
-                static_cast<double>(depth);
-  double kMinTaskSize = 50000;  // Heuristic.
-  max_threads =
-      std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
+      static_cast<double>(depth);
+  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
 
-  // 4 - compute the number of threads we are going to use
-  Index threads = std::min<Index>(nbThreads(), max_threads);
+  // compute the number of threads we are going to use
+  Index threads = std::min<Index>(nbThreads(), pb_max_threads);
 
-  if (threads == 1) return func(0, rows, 0, cols);
+  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
+  // then abort multi-threading
+  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
+  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
+    return func(0,rows, 0,cols);
 
-  func.initParallelSession();
+  Eigen::initParallel();
+  func.initParallelSession(threads);
 
-  if (transpose) std::swap(rows, cols);
+  if(transpose)
+    std::swap(rows,cols);
 
-  Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads);
-  blockRows = (blockRows / Functor::Traits::mr) * Functor::Traits::mr;
+  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
 
-  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[ threads ];
-
-#pragma omp parallel num_threads(threads)
+  #pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
-    Index r0 = i * blockRows;
-    Index actualBlockRows = (i + 1 == threads) ? rows - r0 : blockRows;
+    // Note that the actual number of threads might be lower than the number of request ones.
+    Index actual_threads = omp_get_num_threads();
 
-    Index c0 = i * blockCols;
-    Index actualBlockCols = (i + 1 == threads) ? cols - c0 : blockCols;
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+    Index r0 = i*blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
+
+    Index c0 = i*blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
 
     info[i].lhs_start = r0;
     info[i].lhs_length = actualBlockRows;
 
-    if (transpose)
-      func(c0, actualBlockCols, 0, rows, info);
-    else
-      func(0, rows, c0, actualBlockCols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
   }
-
-  delete[] info;
-
-#undef MAX_OPENMP_THREADS_FUNC
 #endif
 }
 
-}  // end namespace internal
+} // end namespace internal
 
-}  // end namespace Eigen
+} // end namespace Eigen
 
-#endif  // EIGEN_PARALLELIZER_H
+#endif // EIGEN_PARALLELIZER_H

diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 4a60ef7..33ecf10 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
 #define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
 
@@ -45,29 +45,46 @@
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    enum { PacketSize = packet_traits<Scalar>::size };
+    typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+    typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+    enum { PacketSize = packet_traits<Scalar>::size,
+           HalfPacketSize = unpacket_traits<HalfPacket>::size,
+           QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+           HasHalf = (int)HalfPacketSize < (int)PacketSize,
+           HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
     //Index peeled_mc3 = (rows/Pack1)*Pack1;
-
+    
     const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
     const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+    const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+    const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+    
     if(Pack1>=3*PacketSize)
       for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
         pack<3*PacketSize>(blockA, lhs, cols, i, count);
-
+    
     if(Pack1>=2*PacketSize)
       for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
         pack<2*PacketSize>(blockA, lhs, cols, i, count);
-
+    
     if(Pack1>=1*PacketSize)
       for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
         pack<1*PacketSize>(blockA, lhs, cols, i, count);
 
+    if(HasHalf && Pack1>=HalfPacketSize)
+      for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+        pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+    if(HasQuarter && Pack1>=QuarterPacketSize)
+      for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+        pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
     // do the same with mr==1
-    for(Index i=peeled_mc1; i<rows; i++)
+    for(Index i=peeled_mc_quarter; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
         blockA[count++] = lhs(i, k);                   // normal
@@ -277,55 +294,58 @@
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
           int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
-          int ResStorageOrder>
+          int ResStorageOrder, int ResInnerStride>
 struct product_selfadjoint_matrix;
 
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
-          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
 {
 
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols,
     const Scalar* lhs, Index lhsStride,
     const Scalar* rhs, Index rhsStride,
-    Scalar* res,       Index resStride,
-    const Scalar& alpha)
+    Scalar* res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     product_selfadjoint_matrix<Scalar, Index,
       EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
       RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
       EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
       LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
-      ColMajor>
-      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha);
+      ColMajor,ResInnerStride>
+      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resIncr, resStride,  alpha, blocking);
   }
 };
 
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
 {
 
   static EIGEN_DONT_INLINE void run(
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha);
+    Scalar* res,        Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
 
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* _res,        Index resStride,
-    const Scalar& alpha)
+    Scalar* _res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     Index size = rows;
 
@@ -334,28 +354,25 @@
     typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
     typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
     typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     LhsMapper lhs(_lhs,lhsStride);
     LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
     RhsMapper rhs(_rhs,rhsStride);
-    ResMapper res(_res, resStride);
+    ResMapper res(_res, resStride, resIncr);
 
-    Index kc = size;  // cache block size along the K direction
-    Index mc = rows;  // cache block size along the M direction
-    Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, Index(1));
-    // kc must smaller than mc
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    // kc must be smaller than mc
     kc = (std::min)(kc,mc);
-
+    std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
 
     for(Index k2=0; k2<size; k2+=kc)
     {
@@ -390,7 +407,7 @@
       for(Index i2=k2+kc; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
           (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@@ -401,48 +418,48 @@
 // matrix * selfadjoint product
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
 {
 
   static EIGEN_DONT_INLINE void run(
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha);
+    Scalar* res,        Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
 
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* _res,        Index resStride,
-    const Scalar& alpha)
+    Scalar* _res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     Index size = cols;
 
     typedef gebp_traits<Scalar,Scalar> Traits;
 
     typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     LhsMapper lhs(_lhs,lhsStride);
-    ResMapper res(_res,resStride);
+    ResMapper res(_res,resStride, resIncr);
 
-    Index kc = size; // cache block size along the K direction
-    Index mc = rows;  // cache block size along the M direction
-    Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, Index(1));
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=0; k2<size; k2+=kc)
@@ -469,55 +486,59 @@
 ***************************************************************************/
 
 namespace internal {
+  
 template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
-template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
   enum {
     LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
     LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
     RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
     RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
   };
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  
+  template<typename Dest>
+  static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
+
+    BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
 
     internal::product_selfadjoint_matrix<Scalar, Index,
-      EIGEN_LOGICAL_XOR(LhsIsUpper,
-                        internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+      EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
       NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
-      EIGEN_LOGICAL_XOR(RhsIsUpper,
-                        internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+      EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
       NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
-      internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor>
+      internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor,
+      Dest::InnerStrideAtCompileTime>
       ::run(
-        lhs.rows(), rhs.cols(),                     // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(),   // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(),   // rhs info
-        &dst.coeffRef(0,0), dst.outerStride(),      // result info
-        actualAlpha                                 // alpha
+        lhs.rows(), rhs.cols(),                 // sizes
+        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
+        &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(),  // result info
+        actualAlpha, blocking                   // alpha
       );
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H

diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
new file mode 100644
index 0000000..61396db
--- /dev/null
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h

@@ -0,0 +1,295 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
+{\
+\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='L', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows);  \
+    n = convert_index<BlasIndex>(cols);  \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (LhsStorageOrder==RowMajor) uplo='U'; \
+    a = _lhs; \
+\
+    if (RhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+      b_tmp = rhs.adjoint(); \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } else b = _rhs; \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+  } \
+};
+
+
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
+{\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='L', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+    Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
+      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
+      a_tmp = lhs.conjugate(); \
+      a = a_tmp.data(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+    } else a = _lhs; \
+    if (LhsStorageOrder==RowMajor) uplo='U'; \
+\
+    if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
+       b = _rhs; } \
+    else { \
+      if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.conjugate(); \
+      } else \
+      if (ConjugateRhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.adjoint(); \
+      } else { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.transpose(); \
+      } \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
+#endif
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
+{\
+\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='R', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+\
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows);  \
+    n = convert_index<BlasIndex>(cols);  \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (RhsStorageOrder==RowMajor) uplo='U'; \
+    a = _rhs; \
+\
+    if (LhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
+      b_tmp = lhs.adjoint(); \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } else b = _lhs; \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+  } \
+};
+
+
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
+{\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='R', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+    Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
+\
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
+      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
+      a_tmp = rhs.conjugate(); \
+      a = a_tmp.data(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+    } else a = _rhs; \
+    if (RhsStorageOrder==RowMajor) uplo='U'; \
+\
+    if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
+       b = _lhs; } \
+    else { \
+      if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.conjugate(); \
+      } else \
+      if (ConjugateLhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.adjoint(); \
+      } else { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.transpose(); \
+      } \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H

diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index fdc8120..d38fd72 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h

@@ -15,7 +15,7 @@
 namespace internal {
 
 /* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at onces that allows to both reduce
+ * This algorithm processes 2 columns at once that allows to both reduce
  * the number of load/stores of the result by a factor 2 and to reduce
  * the instruction dependency.
  */
@@ -27,23 +27,26 @@
 struct selfadjoint_matrix_vector_product
 
 {
-static EIGEN_DONT_INLINE void run(
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha);
 };
 
 template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha)
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
 
   enum {
@@ -54,25 +57,14 @@
 
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+  conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd;
 
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
-  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
-  // this is because we need to extract packets
-  ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);  
-  if (rhsIncr!=1)
-  {
-    const Scalar* it = _rhs;
-    for (Index i=0; i<size; ++i, it+=rhsIncr)
-      rhs[i] = *it;
-  }
-
-  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+  Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
   if (FirstTriangular)
     bound = size - bound;
 
@@ -92,12 +84,11 @@
     Scalar t3(0);
     Packet ptmp3 = pset1<Packet>(t3);
 
-    size_t starti = FirstTriangular ? 0 : j+2;
-    size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
-    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+    Index starti = FirstTriangular ? 0 : j+2;
+    Index endi   = FirstTriangular ? j : size;
+    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
+    Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
 
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j]   += cjd.pmul(numext::real(A0[j]), t0);
     res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
     if(FirstTriangular)
@@ -111,7 +102,7 @@
       t2 += cj1.pmul(A0[j+1], rhs[j+1]);
     }
 
-    for (size_t i=starti; i<alignedStart; ++i)
+    for (Index i=starti; i<alignedStart; ++i)
     {
       res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
       t2 += cj1.pmul(A0[i], rhs[i]);
@@ -123,7 +114,7 @@
     const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
     const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
           Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+    for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
     {
       Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
       Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
@@ -135,7 +126,7 @@
       ptmp3 = pcj1.pmadd(A1i,  Bi, ptmp3);
       pstore(resIt,Xi); resIt += PacketSize;
     }
-    for (size_t i=alignedEnd; i<endi; i++)
+    for (Index i=alignedEnd; i<endi; i++)
     {
       res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
       t2 += cj1.pmul(A0[i], rhs[i]);
@@ -151,7 +142,6 @@
 
     Scalar t1 = cjAlpha * rhs[j];
     Scalar t2(0);
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j] += cjd.pmul(numext::real(A0[j]), t1);
     for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
     {
@@ -169,45 +159,45 @@
 ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, int LhsMode, typename Rhs>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
-{};
-}
 
 template<typename Lhs, int LhsMode, typename Rhs>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+  
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
 
-  enum {
-    LhsUpLo = LhsMode&(Upper|Lower)
-  };
+  enum { LhsUpLo = LhsMode&(Upper|Lower) };
 
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
-    typedef typename Base::RhsScalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
     
-    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+    eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     enum {
       EvalToDest = (Dest::InnerStrideAtCompileTime==1),
-      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+      UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
     };
     
     internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
-    internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+    internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
                                                   EvalToDest ? dest.data() : static_dest.data());
@@ -218,7 +208,7 @@
     if(!EvalToDest)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
       MappedDest(actualDestPtr, dest.size()) = dest;
@@ -227,18 +217,19 @@
     if(!UseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = rhs.size();
+      Index size = rhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
     }
       
       
-    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+                                                int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
       (
         lhs.rows(),                             // size
         &lhs.coeffRef(0,0),  lhs.outerStride(), // lhs info
-        actualRhsPtr, 1,                        // rhs info
+        actualRhsPtr,                           // rhs info
         actualDestPtr,                          // result info
         actualAlpha                             // scale factor
       );
@@ -248,34 +239,24 @@
   }
 };
 
-namespace internal {
 template<typename Lhs, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
-template<typename Lhs, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { RhsUpLo = RhsMode&(Upper|Lower)  };
 
-  enum {
-    RhsUpLo = RhsMode&(Upper|Lower)
-  };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     // let's simply transpose the product
     Transpose<Dest> destT(dest);
-    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
-                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+    selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+                             Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H

diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
new file mode 100644
index 0000000..1238345
--- /dev/null
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h

@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements selfadjoint matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// symv/hemv specialization
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv :
+  selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
+static void run( \
+  Index size, const Scalar*  lhs, Index lhsStride, \
+  const Scalar* _rhs, Scalar* res, Scalar alpha) { \
+    enum {\
+      IsColMajor = StorageOrder==ColMajor \
+    }; \
+    if (IsColMajor == ConjugateLhs) {\
+      selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
+    } else {\
+      selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
+    }\
+  } \
+}; \
+
+EIGEN_BLAS_SYMV_SPECIALIZE(double)
+EIGEN_BLAS_SYMV_SPECIALIZE(float)
+EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
+\
+static void run( \
+Index size, const EIGTYPE*  lhs, Index lhsStride, \
+const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
+{ \
+  enum {\
+    IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
+    IsLower = UpLo == Lower ? 1 : 0 \
+  }; \
+  BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \
+  EIGTYPE beta(1); \
+  const EIGTYPE *x_ptr; \
+  char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
+  SYMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const SYMVVector, 0 > map_x(_rhs,size,1); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+  } else x_ptr=_rhs; \
+  BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double,   double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float,    float,  ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8,  chemv)
+#else
+EIGEN_BLAS_SYMV_SPECIALIZATION(double,   double, dsymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float,    float,  ssymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float,  chemv_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H

diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index 32750e1..a21be80 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h

@@ -53,7 +53,6 @@
   static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
     typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
@@ -86,7 +85,6 @@
   static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
     typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
@@ -94,15 +92,27 @@
 
     Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
 
-    enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0
+    };
+
+    Index size = mat.cols();
+    Index depth = actualOther.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar,
+              MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
 
     internal::general_matrix_matrix_triangular_product<Index,
-      Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
-      Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
-      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualOther.cols(),
-            &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+      Scalar, OtherIsRowMajor ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+      Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+      IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
+      ::run(size, depth,
+            actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(),
+            mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
@@ -110,8 +120,7 @@
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU>
-EIGEN_DEVICE_FUNC
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
 {
   selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);

diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index 42af3f1..f752a0b 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h

@@ -24,7 +24,8 @@
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
@@ -57,8 +58,7 @@
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU, typename DerivedV>
-EIGEN_DEVICE_FUNC
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
 {
   typedef internal::blas_traits<DerivedU> UBlasTraits;
@@ -80,11 +80,11 @@
   if (IsRowMajor)
     actualAlpha = numext::conj(actualAlpha);
 
-  internal::selfadjoint_rank2_update_selector<Scalar, Index,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
+  typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType;
+  typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType;
+  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
     (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
-    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
 
   return *this;
 }

diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 4cbb79d..f0c6050 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h

@@ -45,22 +45,24 @@
           int Mode, bool LhsIsTriangular,
           int LhsStorageOrder, bool ConjugateLhs,
           int RhsStorageOrder, bool ConjugateRhs,
-          int ResStorageOrder, int Version = Specialized>
+          int ResStorageOrder, int ResInnerStride,
+          int Version = Specialized>
 struct product_triangular_matrix_matrix;
 
 template <typename Scalar, typename Index,
           int Mode, bool LhsIsTriangular,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
                                            LhsStorageOrder,ConjugateLhs,
-                                           RhsStorageOrder,ConjugateRhs,RowMajor,Version>
+                                           RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
 {
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols, Index depth,
     const Scalar* lhs, Index lhsStride,
     const Scalar* rhs, Index rhsStride,
-    Scalar* res,       Index resStride,
+    Scalar* res,       Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     product_triangular_matrix_matrix<Scalar, Index,
@@ -70,18 +72,19 @@
       ConjugateRhs,
       LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
       ConjugateLhs,
-      ColMajor>
-      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
+      ColMajor, ResInnerStride>
+      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
   }
 };
 
 // implements col-major += alpha * op(triangular) * op(general)
 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
                                            LhsStorageOrder,ConjugateLhs,
-                                           RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+                                           RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
 {
   
   typedef gebp_traits<Scalar,Scalar> Traits;
@@ -95,20 +98,21 @@
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* res,        Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
 
 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
                                                         LhsStorageOrder,ConjugateLhs,
-                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* _res,        Index resStride,
+    Scalar* _res,       Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     // strip zeros
@@ -119,13 +123,17 @@
     
     typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
     typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     LhsMapper lhs(_lhs,lhsStride);
     RhsMapper rhs(_rhs,rhsStride);
-    ResMapper res(_res, resStride);
+    ResMapper res(_res, resStride, resIncr);
 
     Index kc = blocking.kc();                   // cache block size along the K direction
     Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    // The small panel size must not be larger than blocking size.
+    // Usually this should never be the case because SmallPanelWidth^2 is very small
+    // compared to L2 cache size, but let's be safe:
+    Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc));
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
@@ -133,7 +141,13 @@
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
-    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+    // To work around an "error: member reference base type 'Matrix<...>
+    // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+    // not a structure or union" compilation error in nvcc (tested V8.0.61),
+    // create a dummy internal::constructor_without_unaligned_array_assert
+    // object to pass to the Matrix constructor.
+    internal::constructor_without_unaligned_array_assert a;
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
     triangularBuffer.setZero();
     if((Mode&ZeroDiag)==ZeroDiag)
       triangularBuffer.diagonal().setZero();
@@ -141,7 +155,7 @@
       triangularBuffer.diagonal().setOnes();
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=IsLower ? depth : 0;
@@ -169,9 +183,9 @@
       if(IsLower || actual_k2<rows)
       {
         // for each small vertical panels of lhs
-        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+        for (Index k1=0; k1<actual_kc; k1+=panelWidth)
         {
-          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+          Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth);
           Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
           Index startBlock   = actual_k2+k1;
           Index blockBOffset = k1;
@@ -212,7 +226,7 @@
         for(Index i2=start; i2<end; i2+=mc)
         {
           const Index actual_mc = (std::min)(i2+mc,end)-i2;
-          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
             (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
           gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
@@ -225,10 +239,11 @@
 // implements col-major += alpha * op(general) * op(triangular)
 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                                         LhsStorageOrder,ConjugateLhs,
-                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
 {
   typedef gebp_traits<Scalar,Scalar> Traits;
   enum {
@@ -241,22 +256,24 @@
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* res,        Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
 
 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                                                         LhsStorageOrder,ConjugateLhs,
-                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* _res,        Index resStride,
+    Scalar* _res,       Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
+    const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
     // strip zeros
     Index diagSize  = (std::min)(_cols,_depth);
     Index rows      = _rows;
@@ -265,21 +282,22 @@
     
     typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
     typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     LhsMapper lhs(_lhs,lhsStride);
     RhsMapper rhs(_rhs,rhsStride);
-    ResMapper res(_res, resStride);
+    ResMapper res(_res, resStride, resIncr);
 
     Index kc = blocking.kc();                   // cache block size along the K direction
     Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
 
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);
+    std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
-    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+    internal::constructor_without_unaligned_array_assert a;
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
     triangularBuffer.setZero();
     if((Mode&ZeroDiag)==ZeroDiag)
       triangularBuffer.diagonal().setZero();
@@ -287,7 +305,7 @@
       triangularBuffer.diagonal().setOnes();
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
 
@@ -311,7 +329,7 @@
       Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
 
       Scalar* geb = blockB+ts*ts;
-      geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
+      geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
 
       pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
 
@@ -380,28 +398,31 @@
 * Wrapper to product_triangular_matrix_matrix
 ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
-  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
-{};
-
 } // end namespace internal
 
+namespace internal {
 template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
   {
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typedef typename Lhs::Scalar  LhsScalar;
+    typedef typename Rhs::Scalar  RhsScalar;
+    typedef typename Dest::Scalar Scalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+    
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
+    Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
 
     typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -416,19 +437,36 @@
 
     internal::product_triangular_matrix_matrix<Scalar, Index,
       Mode, LhsIsTriangular,
-      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
+      (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
       ::run(
         stripedRows, stripedCols, stripedDepth,   // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(), // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(), // rhs info
-        &dst.coeffRef(0,0), dst.outerStride(),    // result info
+        &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
+        &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(),    // result info
         actualAlpha, blocking
       );
+
+    // Apply correction if the diagonal is unit and a scalar factor was nested:
+    if ((Mode&UnitDiag)==UnitDiag)
+    {
+      if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
+      {
+        Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+        dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
+      }
+      else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
+      {
+        Index diagSize = (std::min)(rhs.rows(),rhs.cols());
+        dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
+      }
+    }
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H

diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
new file mode 100644
index 0000000..a98d12e
--- /dev/null
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h

@@ -0,0 +1,317 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm :
+       product_triangular_matrix_matrix<Scalar,Index,Mode,
+          LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
+          RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
+           LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
+  static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
+    const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+      eigen_assert(resIncr == 1); \
+      product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
+        LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
+        RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
+          _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+  } \
+};
+
+EIGEN_BLAS_TRMM_SPECIALIZE(double, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
+         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
+  }; \
+\
+  static void run( \
+    Index _rows, Index _cols, Index _depth, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+  { \
+   Index diagSize  = (std::min)(_rows,_depth); \
+   Index rows      = IsLower ? _rows : diagSize; \
+   Index depth     = IsLower ? diagSize : _depth; \
+   Index cols      = _cols; \
+\
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \
+   if (rows != depth) { \
+\
+     /* FIXME handle mkl_domain_get_max_threads */ \
+     /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\
+\
+     if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
+     /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
+       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
+       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
+           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
+     /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+     } else { \
+     /* Make sense to call GEMM */ \
+       Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+       MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
+       BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
+       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
+       rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
+\
+     /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
+     } \
+     return; \
+   } \
+   char side = 'L', transa, uplo, diag = 'N'; \
+   EIGTYPE *b; \
+   const EIGTYPE *a; \
+   BlasIndex m, n, lda, ldb; \
+\
+/* Set m, n */ \
+   m = convert_index<BlasIndex>(diagSize); \
+   n = convert_index<BlasIndex>(cols); \
+\
+/* Set trans */ \
+   transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
+   MatrixX##EIGPREFIX b_tmp; \
+\
+   if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
+   b = b_tmp.data(); \
+   ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+\
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+   MatrixLhs a_tmp; \
+\
+   if ((conjA!=0) || (SetDiag==0)) { \
+     if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
+     if (IsZeroDiag) \
+       a_tmp.diagonal().setZero(); \
+     else if (IsUnitDiag) \
+       a_tmp.diagonal().setOnes();\
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _lhs; \
+     lda = convert_index<BlasIndex>(lhsStride); \
+   } \
+   /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+   res_tmp=res_tmp+b_tmp; \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm)
+EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
+#endif
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
+         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
+  }; \
+\
+  static void run( \
+    Index _rows, Index _cols, Index _depth, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+  { \
+   Index diagSize  = (std::min)(_cols,_depth); \
+   Index rows      = _rows; \
+   Index depth     = IsLower ? _depth : diagSize; \
+   Index cols      = IsLower ? diagSize : _cols; \
+\
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \
+   if (cols != depth) { \
+\
+     int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \
+\
+     if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
+     /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
+       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
+       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
+           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
+       /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+     } else { \
+     /* Make sense to call GEMM */ \
+       Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+       MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
+       BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
+       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
+       rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
+\
+     /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
+     } \
+     return; \
+   } \
+   char side = 'R', transa, uplo, diag = 'N'; \
+   EIGTYPE *b; \
+   const EIGTYPE *a; \
+   BlasIndex m, n, lda, ldb; \
+\
+/* Set m, n */ \
+   m = convert_index<BlasIndex>(rows); \
+   n = convert_index<BlasIndex>(diagSize); \
+\
+/* Set trans */ \
+   transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+   MatrixX##EIGPREFIX b_tmp; \
+\
+   if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
+   b = b_tmp.data(); \
+   ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+\
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+   MatrixRhs a_tmp; \
+\
+   if ((conjA!=0) || (SetDiag==0)) { \
+     if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
+     if (IsZeroDiag) \
+       a_tmp.diagonal().setZero(); \
+     else if (IsUnitDiag) \
+       a_tmp.diagonal().setOnes();\
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _rhs; \
+     lda = convert_index<BlasIndex>(rhsStride); \
+   } \
+   /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+   res_tmp=res_tmp+b_tmp; \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm)
+EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H

diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 9863076..76bfa15 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h

@@ -20,20 +20,20 @@
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower = ((Mode&Lower)==Lower),
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
     HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
   };
   static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
 };
 
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
   ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
   {
     static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
     Index size = (std::min)(_rows,_cols);
@@ -91,7 +91,7 @@
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower = ((Mode&Lower)==Lower),
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@@ -163,83 +163,67 @@
 * Wrapper to product_triangular_vector
 ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
-{};
-
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
-{};
-
-
-template<int StorageOrder>
+template<int Mode,int StorageOrder>
 struct trmv_selector;
 
 } // end namespace internal
 
+namespace internal {
+
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
-  : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
+struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
-
-    internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
+  
+    internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(lhs, rhs, dst, alpha);
   }
 };
 
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
 
-    typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
     Transpose<Dest> dstT(dst);
-    internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
-      TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+    internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+                            (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>
+            ::run(rhs.transpose(),lhs.transpose(), dstT, alpha);
   }
 };
 
+} // end namespace internal
+
 namespace internal {
 
 // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
-
-template<> struct trmv_selector<ColMajor>
+  
+template<int Mode> struct trmv_selector<Mode,ColMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    typedef typename Dest::RealScalar RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -291,36 +275,42 @@
       else
         dest = MappedDest(actualDestPtr, dest.size());
     }
+
+    if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+    }
   }
 };
 
-template<> struct trmv_selector<RowMajor>
+template<int Mode> struct trmv_selector<Mode,RowMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
 
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
 
     enum {
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -328,10 +318,10 @@
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
 
     internal::triangular_matrix_vector_product
@@ -344,6 +334,12 @@
             actualRhsPtr,1,
             dest.data(),dest.innerStride(),
             actualAlpha);
+
+    if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+    }
   }
 };
 

diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
new file mode 100644
index 0000000..3d47a2b
--- /dev/null
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h

@@ -0,0 +1,255 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements triangular matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// trmv/hemv specialization
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct triangular_matrix_vector_product_trmv :
+  triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
+
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
+        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+  } \
+}; \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
+        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+  } \
+};
+
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper \
+  }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+   if (ConjLhs || IsZeroDiag) { \
+     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
+       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+     return; \
+   }\
+   Index size = (std::min)(_rows,_cols); \
+   Index rows = IsLower ? _rows : size; \
+   Index cols = IsLower ? size : _cols; \
+\
+   typedef VectorX##EIGPREFIX VectorRhs; \
+   EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+   VectorRhs x_tmp; \
+   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+   x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+   char trans, uplo, diag; \
+   BlasIndex m, n, lda, incx, incy; \
+   EIGTYPE const *a; \
+   EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
+   incx = 1; \
+   incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+   trans = 'N'; \
+   uplo = IsLower ? 'L' : 'U'; \
+   diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+   BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+   BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
+   if (size<(std::max)(rows,cols)) { \
+     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+     x = x_tmp.data(); \
+     if (size<rows) { \
+       y = _res + size*resIncr; \
+       a = _lhs + size; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
+     } \
+     else { \
+       x += size; \
+       y = _res; \
+       a = _lhs + size*lda; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
+     } \
+     BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+   } \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double,   double, d,  d,)
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_CM(float,    float,  f,  s,)
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8,  cf, c,)
+#else
+EIGEN_BLAS_TRMV_CM(double,   double, d,  d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float,    float,  f,  s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float,  cf, c, _)
+#endif
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper \
+  }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+   if (IsZeroDiag) { \
+     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
+       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+     return; \
+   }\
+   Index size = (std::min)(_rows,_cols); \
+   Index rows = IsLower ? _rows : size; \
+   Index cols = IsLower ? size : _cols; \
+\
+   typedef VectorX##EIGPREFIX VectorRhs; \
+   EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+   VectorRhs x_tmp; \
+   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+   x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+   char trans, uplo, diag; \
+   BlasIndex m, n, lda, incx, incy; \
+   EIGTYPE const *a; \
+   EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
+   incx = 1; \
+   incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+   trans = ConjLhs ? 'C' : 'T'; \
+   uplo = IsLower ? 'U' : 'L'; \
+   diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+   BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+   BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
+   if (size<(std::max)(rows,cols)) { \
+     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+     x = x_tmp.data(); \
+     if (size<rows) { \
+       y = _res + size*resIncr; \
+       a = _lhs + size*lda; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
+     } \
+     else { \
+       x += size; \
+       y = _res; \
+       a = _lhs + size; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
+     } \
+     BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+   } \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double,   double, d,  d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float,    float,  f,  s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8,  cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double,   double, d,  d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float,    float,  f,  s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float,  cf, c,_)
+#endif
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H

diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index f5de67c..6d879ba 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h

@@ -15,48 +15,48 @@
 namespace internal {
 
 // if the rhs is row major, let's transpose the product
-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
 {
   static void run(
     Index size, Index cols,
     const Scalar*  tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
   {
     triangular_solve_matrix<
       Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
       (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
       NumTraits<Scalar>::IsComplex && Conjugate,
-      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
-      ::run(size, cols, tri, triStride, _other, otherStride, blocking);
+      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
+      ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
   }
 };
 
 /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
 {
   static EIGEN_DONT_INLINE void run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking);
 };
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index cols = otherSize;
 
     typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
-    typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
     TriMapper tri(_tri, triStride);
-    OtherMapper other(_other, otherStride);
+    OtherMapper other(_other, otherStride, otherIncr);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
 
@@ -76,14 +76,14 @@
 
     conj_if<Conjugate> conj;
     gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
     // coherence when accessing the rhs elements
     std::ptrdiff_t l1, l2, l3;
     manage_caching_sizes(GetAction, &l1, &l2, &l3);
-    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
     subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
 
     for(Index k2=IsLower ? 0 : size;
@@ -117,8 +117,9 @@
           {
             // TODO write a small kernel handling this (can be shared with trsv)
             Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
-            Index s  = IsLower ? k2+k1 : i+1;
             Index rs = actualPanelWidth - k - 1; // remaining size
+            Index s  = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+                                                 :  IsLower ? i+1 : i-rs;
 
             Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
             for (Index j=j2; j<j2+actual_cols; ++j)
@@ -127,20 +128,21 @@
               {
                 Scalar b(0);
                 const Scalar* l = &tri(i,s);
-                Scalar* r = &other(s,j);
+                typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
                 for (Index i3=0; i3<k; ++i3)
-                  b += conj(l[i3]) * r[i3];
+                  b += conj(l[i3]) * r(i3);
 
                 other(i,j) = (other(i,j) - b)*a;
               }
               else
               {
-                Index s = IsLower ? i+1 : i-rs;
-                Scalar b = (other(i,j) *= a);
-                Scalar* r = &other(s,j);
-                const Scalar* l = &tri(s,i);
+                Scalar& otherij = other(i,j);
+                otherij *= a;
+                Scalar b = otherij;
+                typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
+                typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
                 for (Index i3=0;i3<rs;++i3)
-                  r[i3] -= b * conj(l[i3]);
+                  r(i3) -= b * conj(l(i3));
               }
             }
           }
@@ -183,29 +185,30 @@
     }
   }
 
-/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
 {
   static EIGEN_DONT_INLINE void run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking);
 };
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index rows = otherSize;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
     typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
-    LhsMapper lhs(_other, otherStride);
+    LhsMapper lhs(_other, otherStride, otherIncr);
     RhsMapper rhs(_tri, triStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
@@ -228,7 +231,7 @@
     gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
@@ -296,21 +299,24 @@
             {
               Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
 
-              Scalar* r = &lhs(i2,j);
+              typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
               for (Index k3=0; k3<k; ++k3)
               {
                 Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
-                Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+                typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
                 for (Index i=0; i<actual_mc; ++i)
-                  r[i] -= a[i] * b;
+                  r(i) -= a(i) * b;
               }
-              Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
-              for (Index i=0; i<actual_mc; ++i)
-                r[i] *= b;
+              if((Mode & UnitDiag)==0)
+              {
+                Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
+                for (Index i=0; i<actual_mc; ++i)
+                  r(i) *= inv_rjj;
+              }
             }
 
             // pack the just computed part of lhs to A
-            pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
+            pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
                            actualPanelWidth, actual_mc,
                            actual_kc, j2);
           }

diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
new file mode 100644
index 0000000..621194c
--- /dev/null
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h

@@ -0,0 +1,167 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+  }; \
+  static void run( \
+      Index size, Index otherSize, \
+      const EIGTYPE* _tri, Index triStride, \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+  { \
+   EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+   eigen_assert(otherIncr == 1); \
+   BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
+   char side = 'L', uplo, diag='N', transa; \
+   /* Set alpha_ */ \
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
+\
+   const EIGTYPE *a; \
+/* Set trans */ \
+   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+   MatrixTri a_tmp; \
+\
+   if (conjA) { \
+     a_tmp = tri.conjugate(); \
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _tri; \
+     lda = convert_index<BlasIndex>(triStride); \
+   } \
+   if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double,   double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float,    float,  strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double,   double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float,    float,  strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float,  ctrsm_)
+#endif
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+  }; \
+  static void run( \
+      Index size, Index otherSize, \
+      const EIGTYPE* _tri, Index triStride, \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+  { \
+   EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+   eigen_assert(otherIncr == 1); \
+   BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
+   char side = 'R', uplo, diag='N', transa; \
+   /* Set alpha_ */ \
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
+\
+   const EIGTYPE *a; \
+/* Set trans */ \
+   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+   MatrixTri a_tmp; \
+\
+   if (conjA) { \
+     a_tmp = tri.conjugate(); \
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _tri; \
+     lda = convert_index<BlasIndex>(triStride); \
+   } \
+   if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+   /*std::cout << "TRMS_L specialization!\n";*/ \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double,   double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float,    float,  strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8,  ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double,   double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float,    float,  strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float,  ctrsm_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H

diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index b994759..6473170 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h

@@ -58,7 +58,7 @@
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
@@ -77,7 +77,7 @@
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if(!(Mode & UnitDiag))
+        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
           rhs[i] /= cjLhs(i,i);
       }
     }
@@ -114,20 +114,23 @@
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs.coeff(i,i);
+        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        {
+          if(!(Mode & UnitDiag))
+            rhs[i] /= cjLhs.coeff(i,i);
 
-        Index r = actualPanelWidth - k - 1; // remaining size
-        Index s = IsLower ? i+1 : i-r;
-        if (r>0)
-          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+          Index r = actualPanelWidth - k - 1; // remaining size
+          Index s = IsLower ? i+1 : i-r;
+          if (r>0)
+            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        }
       }
       Index r = IsLower ? size - endBlock : startBlock; // remaining size
       if (r > 0)
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
             LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),

diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
old mode 100644
new mode 100755
index b42a4a6..e16a564
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h

@@ -24,92 +24,23 @@
 template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
 template<
   typename Index,
   typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
   typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
-  int ResStorageOrder>
+  int ResStorageOrder, int ResInnerStride>
 struct general_matrix_matrix_product;
 
-template<typename Index, typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
+template<typename Index,
+         typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
+         typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
 struct general_matrix_vector_product;
 
-
-template<bool Conjugate> struct conj_if;
-
-template<> struct conj_if<true> {
-  template<typename T>
-  inline T operator()(const T& x) { return numext::conj(x); }
-  template<typename T>
-  inline T pconj(const T& x) { return internal::pconj(x); }
-};
-
-template<> struct conj_if<false> {
-  template<typename T>
-  inline const T& operator()(const T& x) { return x; }
-  template<typename T>
-  inline const T& pconj(const T& x) { return x; }
-};
-
-template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
-{
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
-};
-
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); }
-};
-
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,false>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
-};
-
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,true>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
-};
-
-template<typename RealScalar,bool Conj> struct conj_helper<std::complex<RealScalar>, RealScalar, Conj,false>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const
-  { return conj_if<Conj>()(x)*y; }
-};
-
-template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::complex<RealScalar>, false,Conj>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const
-  { return x*conj_if<Conj>()(y); }
-};
-
 template<typename From,typename To> struct get_factor {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
 
 template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
@@ -118,6 +49,353 @@
 };
 
 
+template<typename Scalar, typename Index>
+class BlasVectorMapper {
+  public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    return m_data[i];
+  }
+  template <typename Packet, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC bool aligned(Index i) const {
+    return (UIntPtr(m_data+i)%sizeof(Packet))==0;
+  }
+
+  protected:
+  Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
+class BlasLinearMapper;
+
+template<typename Scalar, typename Index, int AlignmentType>
+class BlasLinearMapper<Scalar,Index,AlignmentType>
+{
+public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
+    : m_data(data)
+  {
+    EIGEN_ONLY_USED_FOR_DEBUG(incr);
+    eigen_assert(incr==1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+    internal::prefetch(&operator()(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return ploadt<PacketType, AlignmentType>(m_data + i);
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+    pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
+  }
+
+protected:
+  Scalar *m_data;
+};
+
+// Lightweight helper class to access matrix coefficients.
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
+class blas_data_mapper;
+
+// TMP to help PacketBlock store implementation.
+// There's currently no known use case for PacketBlock load.
+// The default implementation assumes ColMajor order.
+// It always store each packet sequentially one `stride` apart.
+template<typename Index, typename Scalar, typename Packet, int n, int idx, int StorageOrder>
+struct PacketBlockManagement
+{
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, StorageOrder> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + i + (j + idx)*stride, block.packet[idx]);
+  }
+};
+
+// PacketBlockManagement specialization to take care of RowMajor order without ifs.
+template<typename Index, typename Scalar, typename Packet, int n, int idx>
+struct PacketBlockManagement<Index, Scalar, Packet, n, idx, RowMajor>
+{
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, RowMajor> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + j + (i + idx)*stride, block.packet[idx]);
+  }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n, int StorageOrder>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, StorageOrder>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, RowMajor>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
+class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
+{
+public:
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
+  typedef BlasVectorMapper<Scalar, Index> VectorMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1)
+   : m_data(data), m_stride(stride)
+  {
+    EIGEN_ONLY_USED_FOR_DEBUG(incr);
+    eigen_assert(incr==1);
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
+  getSubMapper(Index i, Index j) const {
+    return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j));
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(&operator()(i, j));
+  }
+
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return ploadt<PacketType, AlignmentType>(&operator()(i, j));
+  }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return ploadt<PacketT, AlignmentT>(&operator()(i, j));
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
+    if (UIntPtr(m_data)%sizeof(Scalar)) {
+      return -1;
+    }
+    return internal::first_default_aligned(m_data, size);
+  }
+
+  template<typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n> &block) const {
+    PacketBlockManagement<Index, Scalar, SubPacket, n, n-1, StorageOrder> pbm;
+    pbm.store(m_data, m_stride, i, j, block);
+  }
+protected:
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
+};
+
+// Implementation of non-natural increment (i.e. inner-stride != 1)
+// The exposed API is not complete yet compared to the Incr==1 case
+// because some features makes less sense in this case.
+template<typename Scalar, typename Index, int AlignmentType, int Incr>
+class BlasLinearMapper
+{
+public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+    internal::prefetch(&operator()(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i*m_incr.value()];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+    pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
+  }
+
+protected:
+  Scalar *m_data;
+  const internal::variable_if_dynamic<Index,Incr> m_incr;
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,int Incr>
+class blas_data_mapper
+{
+public:
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE blas_data_mapper
+  getSubMapper(Index i, Index j) const {
+    return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value());
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j), m_incr.value());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
+  }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types.
+  template<typename SubPacket, typename ScalarT, int n, int idx>
+  struct storePacketBlock_helper
+  {
+    storePacketBlock_helper<SubPacket, ScalarT, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup, i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        ScalarT *v = &sup->operator()(i+l, j+idx);
+        *v = block.packet[idx][l];
+      }
+    }
+  };
+
+  template<typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, idx>
+  {
+    storePacketBlock_helper<SubPacket, std::complex<float>, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup,i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        std::complex<float> *v = &sup->operator()(i+l, j+idx);
+        v->real(block.packet[idx].v[2*l+0]);
+        v->imag(block.packet[idx].v[2*l+1]);
+      }
+    }
+  };
+
+  template<typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, idx>
+  {
+    storePacketBlock_helper<SubPacket, std::complex<double>, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup,i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        std::complex<double> *v = &sup->operator()(i+l, j+idx);
+        v->real(block.packet[idx].v[2*l+0]);
+        v->imag(block.packet[idx].v[2*l+1]);
+      }
+    }
+  };
+
+  template<typename SubPacket, typename ScalarT, int n>
+  struct storePacketBlock_helper<SubPacket, ScalarT, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+
+  template<typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+
+  template<typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+  // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be avoided when possible.
+  template<typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n>&block) const {
+    storePacketBlock_helper<SubPacket, Scalar, n, n-1> spb;
+    spb.store(this, i,j,block);
+  }
+protected:
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
+  const internal::variable_if_dynamic<Index,Incr> m_incr;
+};
+
+// lightweight helper class to access matrix coefficients (const version)
+template<typename Scalar, typename Index, int StorageOrder>
+class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
+  public:
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
+
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
+    return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
+  }
+};
+
+
 /* Helper class to analyze the factors of a Product expression.
  * In particular it allows to pop out operator-, scalar multiples,
  * and conjugate */
@@ -133,24 +411,24 @@
     HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
                               && (   bool(XprType::IsVectorAtCompileTime)
                                   || int(inner_stride_at_compile_time<XprType>::ret) == 1)
-                             ) ?  1 : 0
+                             ) ?  1 : 0,
+    HasScalarFactor = false
   };
   typedef typename conditional<bool(HasUsableDirectAccess),
     ExtractType,
     typename _ExtractType::PlainObject
     >::type DirectLinearAccessType;
-  static inline ExtractType extract(const XprType& x) { return x; }
-  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
 };
 
 // pop conjugate
-template<typename Scalar, typename Xpr>
-struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, Xpr> >
- : blas_traits<typename internal::remove_all<typename Xpr::Nested>::type>
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
 {
-  typedef typename internal::remove_all<typename Xpr::Nested>::type NestedXpr;
   typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_conjugate_op<Scalar>, Xpr> XprType;
+  typedef CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
 
   enum {
@@ -162,27 +440,50 @@
 };
 
 // pop scalar multiple
-template<typename Scalar, typename Xpr>
-struct blas_traits<CwiseUnaryOp<scalar_multiple_op<Scalar>, Xpr> >
- : blas_traits<typename internal::remove_all<typename Xpr::Nested>::type>
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
+ : blas_traits<NestedXpr>
 {
-  typedef typename internal::remove_all<typename Xpr::Nested>::type NestedXpr;
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_multiple_op<Scalar>, Xpr> XprType;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
-  { return x.functor().m_other * Base::extractScalarFactor(x.nestedExpression()); }
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+  static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
+  { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
+ : blas_traits<NestedXpr>
+{
+  enum {
+    HasScalarFactor = true
+  };
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
+};
+template<typename Scalar, typename Plain1, typename Plain2>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1>,
+                                                            const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain2> > >
+ : blas_traits<CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1> >
+{};
 
 // pop opposite
-template<typename Scalar, typename Xpr>
-struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, Xpr> >
- : blas_traits<typename internal::remove_all<typename Xpr::Nested>::type>
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
 {
-  typedef typename internal::remove_all<typename Xpr::Nested>::type NestedXpr;
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, Xpr> XprType;
+  typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
   static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
   static inline Scalar extractScalarFactor(const XprType& x)
@@ -190,14 +491,13 @@
 };
 
 // pop/push transpose
-template<typename Xpr>
-struct blas_traits<Transpose<Xpr> >
- : blas_traits<typename internal::remove_all<typename Xpr::Nested>::type>
+template<typename NestedXpr>
+struct blas_traits<Transpose<NestedXpr> >
+ : blas_traits<NestedXpr>
 {
-  typedef typename internal::remove_all<typename Xpr::Nested>::type NestedXpr;
   typedef typename NestedXpr::Scalar Scalar;
   typedef blas_traits<NestedXpr> Base;
-  typedef Transpose<Xpr> XprType;
+  typedef Transpose<NestedXpr> XprType;
   typedef Transpose<const typename Base::_ExtractType>  ExtractType; // const to get rid of a compile error; anyway blas traits are only used on the RHS
   typedef Transpose<const typename Base::_ExtractType> _ExtractType;
   typedef typename conditional<bool(Base::HasUsableDirectAccess),
@@ -207,7 +507,7 @@
   enum {
     IsTransposed = Base::IsTransposed ? 0 : 1
   };
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
   static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
 };
 
@@ -218,7 +518,7 @@
 
 template<typename T, bool HasUsableDirectAccess=blas_traits<T>::HasUsableDirectAccess>
 struct extract_data_selector {
-  static const typename T::Scalar* run(const T& m)
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m)
   {
     return blas_traits<T>::extract(m).data();
   }
@@ -229,11 +529,53 @@
   static typename T::Scalar* run(const T&) { return 0; }
 };
 
-template<typename T> const typename T::Scalar* extract_data(const T& m)
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m)
 {
   return extract_data_selector<T>::run(m);
 }
 
+/**
+ * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products.
+ * There is a specialization for booleans
+ */
+template<typename ResScalar, typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha * blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+template<typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl<bool, Lhs, Rhs>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha && blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(alpha, lhs, rhs);
+}
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(lhs, rhs);
+}
+
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
new file mode 100644
index 0000000..2d12e1d
--- /dev/null
+++ b/Eigen/src/Core/util/ConfigureVectorization.h

@@ -0,0 +1,514 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
+#define EIGEN_CONFIGURE_VECTORIZATION_H
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ * 
+ * FIXME: this code can be cleaned up once we switch to proper C++11 only.
+ */
+#if (defined EIGEN_CUDACC)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_HAS_ALIGNAS
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+  #define EIGEN_ALIGNOF(x) alignof(x)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_MSVC
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_SUNCC
+  // FIXME not sure about this one:
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#else
+  #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
+#endif
+
+// If the user explicitly disable vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #if defined(EIGEN_GPUCC)
+    // GPU code is always vectorized and requires memory alignment for
+    // statically allocated buffers.
+    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+  #else
+    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+  #endif
+#elif defined(__AVX512F__)
+  // 64 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+  // 32 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+  // try to keep heap alignment even when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+  // 4.8 and newer seem definitely unaffected.
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+#endif
+
+
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyways.
+#if EIGEN_COMP_MSVC
+  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+    #endif
+  #endif
+#else
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #endif
+#endif
+
+#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
+
+  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+    // Defines symbols for compile-time detection of which instructions are
+    // used.
+    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SSE
+    #define EIGEN_VECTORIZE_SSE2
+
+    // Detect sse3/ssse3/sse4:
+    // gcc and icc defines __SSE3__, ...
+    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+    // want to force the use of those instructions with msvc.
+    #ifdef __SSE3__
+      #define EIGEN_VECTORIZE_SSE3
+    #endif
+    #ifdef __SSSE3__
+      #define EIGEN_VECTORIZE_SSSE3
+    #endif
+    #ifdef __SSE4_1__
+      #define EIGEN_VECTORIZE_SSE4_1
+    #endif
+    #ifdef __SSE4_2__
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX__
+      #ifndef EIGEN_USE_SYCL 
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #ifndef EIGEN_USE_SYCL 
+        #define EIGEN_VECTORIZE_AVX2
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
+      // MSVC does not expose a switch dedicated for FMA
+      // For MSVC, AVX2 => FMA
+      #define EIGEN_VECTORIZE_FMA
+    #endif
+    #if defined(__AVX512F__)
+      #ifndef EIGEN_VECTORIZE_FMA
+      #if EIGEN_COMP_GNUC
+      #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
+      #else
+      #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
+      #endif
+      #endif
+      #ifndef EIGEN_USE_SYCL
+        #define EIGEN_VECTORIZE_AVX512
+        #define EIGEN_VECTORIZE_AVX2
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_FMA
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+      #ifndef EIGEN_USE_SYCL
+        #ifdef __AVX512DQ__
+          #define EIGEN_VECTORIZE_AVX512DQ
+        #endif
+        #ifdef __AVX512ER__
+          #define EIGEN_VECTORIZE_AVX512ER
+        #endif
+        #ifdef __AVX512BF16__
+          #define EIGEN_VECTORIZE_AVX512BF16
+        #endif
+      #endif
+    #endif
+
+    // Disable AVX support on broken xcode versions
+    #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
+      // A nasty bug in the clang compiler shipped with xcode in a common compilation situation
+      // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
+      #ifdef EIGEN_VECTORIZE_AVX
+        #undef EIGEN_VECTORIZE_AVX
+        #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
+        #ifdef EIGEN_VECTORIZE_AVX2
+          #undef EIGEN_VECTORIZE_AVX2
+        #endif
+        #ifdef EIGEN_VECTORIZE_FMA
+          #undef EIGEN_VECTORIZE_FMA
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512
+          #undef EIGEN_VECTORIZE_AVX512
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512DQ
+          #undef EIGEN_VECTORIZE_AVX512DQ
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512ER
+          #undef EIGEN_VECTORIZE_AVX512ER
+        #endif
+      #endif
+      // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with  -macosx-version-min=10.15 and AVX
+      // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests
+      // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases
+      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)"  XCode 11.0 <- Produces many segfault and core dumping tests
+      //                                                                    with  -macosx-version-min=10.15 and AVX
+      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with  
+      //                                                                    -macosx-version-min=10.15 and AVX
+    #endif
+
+    // include files
+
+    // This extern "C" works around a MINGW-w64 compilation issue
+    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+    extern "C" {
+      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
+      #if EIGEN_COMP_ICC >= 1110
+        #include <immintrin.h>
+      #else
+        #include <mmintrin.h>
+        #include <emmintrin.h>
+        #include <xmmintrin.h>
+        #ifdef  EIGEN_VECTORIZE_SSE3
+        #include <pmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSSE3
+        #include <tmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_1
+        #include <smmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_2
+        #include <nmmintrin.h>
+        #endif
+        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+        #include <immintrin.h>
+        #endif
+      #endif
+    } // end extern "C"
+
+  #elif defined __VSX__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif defined __ALTIVEC__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ALTIVEC
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif ((defined  __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_NEON
+    #include <arm_neon.h>
+
+  // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
+  // will not select the backend automatically
+  #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SVE
+    #include <arm_sve.h>
+
+    // Since we depend on knowing SVE vector lengths at compile-time, we need
+    // to ensure a fixed lengths is set
+    #if defined __ARM_FEATURE_SVE_BITS
+      #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
+    #else
+#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
+#endif
+
+#elif (defined __s390x__ && defined __VEC__)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ZVECTOR
+#include <vecintrin.h>
+
+#elif defined __mips_msa
+
+// Limit MSA optimizations to little-endian CPUs for now.
+// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__LP64__)
+#define EIGEN_MIPS_64
+#else
+#define EIGEN_MIPS_32
+#endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_MSA
+#include <msa.h>
+#endif
+
+#endif
+#endif
+
+// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
+// compilers seem to follow this. We therefore include it explicitly.
+// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  #include <arm_fp16.h>
+#endif
+
+#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
+  // We can use the optimized fp16 to float and float to fp16 conversion routines
+  #define EIGEN_HAS_FP16_C
+
+  #if EIGEN_COMP_GNUC
+    // Make sure immintrin.h is included, even if e.g. vectorization is
+    // explicitly disabled (see also issue #2395).
+    // Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
+    // as opposed to emmintrin.h as suggested by Intel:
+    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
+    #include <immintrin.h>
+  #endif
+#endif
+
+#if defined EIGEN_CUDACC
+  #define EIGEN_VECTORIZE_GPU
+  #include <vector_types.h>
+  #if EIGEN_CUDA_SDK_VER >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+  #endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+  #include <cuda_runtime_api.h>
+  #include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIPCC)
+  #define EIGEN_VECTORIZE_GPU
+  #include <hip/hip_vector_types.h>
+  #define EIGEN_HAS_HIP_FP16
+  #include <hip/hip_fp16.h>
+#endif
+
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_SVE)
+  return "ARM SVE";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+  return "MIPS MSA";
+#else
+  return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_CONFIGURE_VECTORIZATION_H

diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 75b91cd..35dcaa7 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h

@@ -1,8 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2020, Arm Limited and Contributors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -25,11 +26,23 @@
   */
 const int DynamicIndex = 0xffffff;
 
+/** This value means that the increment to go from one value to another in a sequence is not constant for each step.
+  */
+const int UndefinedIncr = 0xfffffe;
+
 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
   * The value Infinity there means the L-infinity norm.
   */
 const int Infinity = -1;
 
+/** This value means that the cost to evaluate an expression coefficient is either very expensive or
+  * cannot be known at compile time.
+  *
+  * This value has to be positive to (1) simplify cost computation, and (2) allow to distinguish between a very expensive and very very expensive expressions.
+  * It thus must also be large enough to make sure unrolling won't happen and that sub expressions will be evaluated, but not too large to avoid overflow.
+  */
+const int HugeCost = 10000;
+
 /** \defgroup flags Flags
   * \ingroup Core_Module
   *
@@ -49,18 +62,18 @@
   * If this bit is not set, the storage order is column-major.
   * For an expression, this determines the storage order of
   * the matrix created by evaluation of that expression.
-  * \sa \ref TopicStorageOrders */
+  * \sa \blank  \ref TopicStorageOrders */
 const unsigned int RowMajorBit = 0x1;
 
 /** \ingroup flags
-  *
   * means the expression should be evaluated by the calling expression */
 const unsigned int EvalBeforeNestingBit = 0x2;
 
 /** \ingroup flags
-  *
+  * \deprecated
   * means the expression should be evaluated before any assignment */
-const unsigned int EvalBeforeAssigningBit = 0x4;
+EIGEN_DEPRECATED
+const unsigned int EvalBeforeAssigningBit = 0x4; // FIXME deprecated
 
 /** \ingroup flags
   *
@@ -141,24 +154,46 @@
   */
 const unsigned int DirectAccessBit = 0x40;
 
-/** \ingroup flags
+/** \deprecated \ingroup flags
   *
   * means the first coefficient packet is guaranteed to be aligned.
-  * An expression cannot has the AlignedBit without the PacketAccessBit flag.
+  * An expression cannot have the AlignedBit without the PacketAccessBit flag.
   * In other words, this means we are allow to perform an aligned packet access to the first element regardless
   * of the expression kind:
   * \code
   * expression.packet<Aligned>(0);
   * \endcode
   */
-const unsigned int AlignedBit = 0x80;
+EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80;
 
 const unsigned int NestByRefBit = 0x100;
 
+/** \ingroup flags
+  *
+  * for an expression, this means that the storage order
+  * can be either row-major or column-major.
+  * The precise choice will be decided at evaluation time or when
+  * combined with other expressions.
+  * \sa \blank  \ref RowMajorBit, \ref TopicStorageOrders */
+const unsigned int NoPreferredStorageOrderBit = 0x200;
+
+/** \ingroup flags
+  *
+  * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
+  * that is, the expression provides:
+  * \code
+    inline const Scalar* valuePtr() const;
+    inline const Index* innerIndexPtr() const;
+    inline const Index* outerIndexPtr() const;
+    inline const Index* innerNonZeroPtr() const;
+    \endcode
+  */
+const unsigned int CompressedAccessBit = 0x400;
+
+
 // list of flags that are inherited by default
 const unsigned int HereditaryBits = RowMajorBit
-                                  | EvalBeforeNestingBit
-                                  | EvalBeforeAssigningBit;
+                                  | EvalBeforeNestingBit;
 
 /** \defgroup enums Enumerations
   * \ingroup Core_Module
@@ -167,23 +202,23 @@
   */
 
 /** \ingroup enums
-  * Enum containing possible values for the \p Mode parameter of
-  * MatrixBase::selfadjointView() and MatrixBase::triangularView(). */
-enum {
+  * Enum containing possible values for the \c Mode or \c UpLo parameter of
+  * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
+enum UpLoType {
   /** View matrix as a lower triangular matrix. */
-  Lower=0x1,
+  Lower=0x1,                      
   /** View matrix as an upper triangular matrix. */
-  Upper=0x2,
+  Upper=0x2,                      
   /** %Matrix has ones on the diagonal; to be used in combination with #Lower or #Upper. */
-  UnitDiag=0x4,
+  UnitDiag=0x4, 
   /** %Matrix has zeros on the diagonal; to be used in combination with #Lower or #Upper. */
   ZeroDiag=0x8,
   /** View matrix as a lower triangular matrix with ones on the diagonal. */
-  UnitLower=UnitDiag|Lower,
+  UnitLower=UnitDiag|Lower, 
   /** View matrix as an upper triangular matrix with ones on the diagonal. */
   UnitUpper=UnitDiag|Upper,
   /** View matrix as a lower triangular matrix with zeros on the diagonal. */
-  StrictlyLower=ZeroDiag|Lower,
+  StrictlyLower=ZeroDiag|Lower, 
   /** View matrix as an upper triangular matrix with zeros on the diagonal. */
   StrictlyUpper=ZeroDiag|Upper,
   /** Used in BandMatrix and SelfAdjointView to indicate that the matrix is self-adjoint. */
@@ -193,38 +228,51 @@
 };
 
 /** \ingroup enums
-  * Enum for indicating whether an object is aligned or not. */
-enum {
-  /** Object is not correctly aligned for vectorization. */
-  Unaligned=0,
-  /** Object is aligned for vectorization. */
-  Aligned=1
+  * Enum for indicating whether a buffer is aligned or not. */
+enum AlignmentType {
+  Unaligned=0,        /**< Data pointer has no specific alignment. */
+  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32=32,       /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64=64,       /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128=128,     /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask=255,
+  Aligned=16,         /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES==128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES==64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES==32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES==16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES==8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES==0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
 };
 
 /** \ingroup enums
- * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */
-// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox
-// TODO: find out what to do with that. Adapt the AlignedBox API ?
-enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
-
-/** \ingroup enums
   * Enum containing possible values for the \p Direction parameter of
   * Reverse, PartialReduxExpr and VectorwiseOp. */
-enum DirectionType {
-  /** For Reverse, all columns are reversed;
+enum DirectionType { 
+  /** For Reverse, all columns are reversed; 
     * for PartialReduxExpr and VectorwiseOp, act on columns. */
-  Vertical,
-  /** For Reverse, all rows are reversed;
+  Vertical, 
+  /** For Reverse, all rows are reversed; 
     * for PartialReduxExpr and VectorwiseOp, act on rows. */
-  Horizontal,
-  /** For Reverse, both rows and columns are reversed;
+  Horizontal, 
+  /** For Reverse, both rows and columns are reversed; 
     * not used for PartialReduxExpr and VectorwiseOp. */
-  BothDirections
+  BothDirections 
 };
 
 /** \internal \ingroup enums
   * Enum to specify how to traverse the entries of a matrix. */
-enum {
+enum TraversalType {
   /** \internal Default traversal, no vectorization, no index-based access */
   DefaultTraversal,
   /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
@@ -246,19 +294,19 @@
 
 /** \internal \ingroup enums
   * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
-enum {
+enum UnrollingType {
   /** \internal Do not unroll loops. */
   NoUnrolling,
   /** \internal Unroll only the inner loop, but not the outer loop. */
   InnerUnrolling,
-  /** \internal Unroll both the inner and the outer loop. If there is only one loop,
+  /** \internal Unroll both the inner and the outer loop. If there is only one loop, 
     * because linear traversal is used, then unroll that loop. */
   CompleteUnrolling
 };
 
 /** \internal \ingroup enums
   * Enum to specify whether to use the default (built-in) implementation or the specialization. */
-enum {
+enum SpecializedType {
   Specialized,
   BuiltIn
 };
@@ -266,7 +314,7 @@
 /** \ingroup enums
   * Enum containing possible values for the \p _Options template parameter of
   * Matrix, Array and BandMatrix. */
-enum {
+enum StorageOptions {
   /** Storage order is column major (see \ref TopicStorageOrders). */
   ColMajor = 0,
   /** Storage order is row major (see \ref TopicStorageOrders). */
@@ -274,20 +322,29 @@
   /** Align the matrix itself if it is vectorizable fixed-size */
   AutoAlign = 0,
   /** Don't require alignment for the matrix itself (the array of coefficients, if dynamically allocated, may still be requested to be aligned) */ // FIXME --- clarify the situation
-  DontAlign = 0x2,
-  AllocateDefault = 0,
-  AllocateUVM = 0x8
+  DontAlign = 0x2
 };
 
 /** \ingroup enums
   * Enum for specifying whether to apply or solve on the left or right. */
-enum {
+enum SideType {
   /** Apply transformation on the left. */
   OnTheLeft = 1,
   /** Apply transformation on the right. */
   OnTheRight = 2
 };
 
+/** \ingroup enums
+ * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */
+enum NaNPropagationOptions {
+  /**  Implementation defined behavior if NaNs are present. */
+  PropagateFast = 0,
+  /**  Always propagate NaNs. */
+  PropagateNaN,
+  /**  Always propagate not-NaNs. */
+  PropagateNumbers
+};
+
 /* the following used to be written as:
  *
  *   struct NoChange_t {};
@@ -295,7 +352,7 @@
  *     EIGEN_UNUSED NoChange_t NoChange;
  *   }
  *
- * on the ground that it feels dangerous to disambiguate overloaded functions on enum/integer types.
+ * on the ground that it feels dangerous to disambiguate overloaded functions on enum/integer types.  
  * However, this leads to "variable declared but never referenced" warnings on Intel Composer XE,
  * and we do not know how to get rid of them (bug 450).
  */
@@ -306,21 +363,21 @@
 
 /** \internal \ingroup enums
   * Used in AmbiVector. */
-enum {
+enum AmbiVectorMode {
   IsDense         = 0,
   IsSparse
 };
 
 /** \ingroup enums
-  * Used as template parameter in DenseCoeffBase and MapBase to indicate
+  * Used as template parameter in DenseCoeffBase and MapBase to indicate 
   * which accessors should be provided. */
 enum AccessorLevels {
   /** Read-only access via a member function. */
-  ReadOnlyAccessors,
+  ReadOnlyAccessors, 
   /** Read/write access via member functions. */
-  WriteAccessors,
+  WriteAccessors, 
   /** Direct read-only access to the coefficients. */
-  DirectAccessors,
+  DirectAccessors, 
   /** Direct read/write access to the coefficients. */
   DirectWriteAccessors
 };
@@ -329,9 +386,9 @@
   * Enum with options to give to various decompositions. */
 enum DecompositionOptions {
   /** \internal Not used (meant for LDLT?). */
-  Pivoting            = 0x01,
+  Pivoting            = 0x01, 
   /** \internal Not used (meant for LDLT?). */
-  NoPivoting          = 0x02,
+  NoPivoting          = 0x02, 
   /** Used in JacobiSVD to indicate that the square matrix U is to be computed. */
   ComputeFullU        = 0x04,
   /** Used in JacobiSVD to indicate that the thin matrix U is to be computed. */
@@ -382,9 +439,9 @@
   * Enum for reporting the status of a computation. */
 enum ComputationInfo {
   /** Computation was successful. */
-  Success = 0,
+  Success = 0,        
   /** The provided data did not satisfy the prerequisites. */
-  NumericalIssue = 1,
+  NumericalIssue = 1, 
   /** Iterative procedure did not converge. */
   NoConvergence = 2,
   /** The inputs are invalid, or the algorithm has been improperly called.
@@ -398,7 +455,7 @@
 enum TransformTraits {
   /** Transformation is an isometry. */
   Isometry      = 0x1,
-  /** Transformation is an affine transformation stored as a (Dim+1)^2 matrix whose last row is
+  /** Transformation is an affine transformation stored as a (Dim+1)^2 matrix whose last row is 
     * assumed to be [0 ... 0 1]. */
   Affine        = 0x2,
   /** Transformation is an affine transformation stored as a (Dim) x (Dim+1) matrix. */
@@ -417,6 +474,8 @@
     AltiVec = 0x2,
     VSX = 0x3,
     NEON = 0x4,
+    MSA = 0x5,
+    SVE = 0x6,
 #if defined EIGEN_VECTORIZE_SSE
     Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -425,6 +484,10 @@
     Target = VSX
 #elif defined EIGEN_VECTORIZE_NEON
     Target = NEON
+#elif defined EIGEN_VECTORIZE_SVE
+    Target = SVE
+#elif defined EIGEN_VECTORIZE_MSA
+    Target = MSA
 #else
     Target = Generic
 #endif
@@ -432,8 +495,9 @@
 }
 
 /** \internal \ingroup enums
-  * Enum used as template parameter in GeneralProduct. */
-enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+  * Enum used as template parameter in Product and product evaluators. */
+enum ProductImplType
+{ DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
 
 /** \internal \ingroup enums
   * Enum used in experimental parallel implementation. */
@@ -442,14 +506,44 @@
 /** The type used to identify a dense storage. */
 struct Dense {};
 
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+/** The type used to identify a general solver (factored) storage. */
+struct SolverStorage {};
+
+/** The type used to identify a permutation storage. */
+struct PermutationStorage {};
+
+/** The type used to identify a permutation storage. */
+struct TranspositionsStorage {};
+
 /** The type used to identify a matrix expression */
 struct MatrixXpr {};
 
 /** The type used to identify an array expression */
 struct ArrayXpr {};
 
+// An evaluator must define its shape. By default, it can be one of the following:
+struct DenseShape             { static std::string debugName() { return "DenseShape"; } };
+struct SolverShape            { static std::string debugName() { return "SolverShape"; } };
+struct HomogeneousShape       { static std::string debugName() { return "HomogeneousShape"; } };
+struct DiagonalShape          { static std::string debugName() { return "DiagonalShape"; } };
+struct BandShape              { static std::string debugName() { return "BandShape"; } };
+struct TriangularShape        { static std::string debugName() { return "TriangularShape"; } };
+struct SelfAdjointShape       { static std::string debugName() { return "SelfAdjointShape"; } };
+struct PermutationShape       { static std::string debugName() { return "PermutationShape"; } };
+struct TranspositionsShape    { static std::string debugName() { return "TranspositionsShape"; } };
+struct SparseShape            { static std::string debugName() { return "SparseShape"; } };
+
 namespace internal {
 
+  // random access iterators based on coeff*() accessors.
+struct IndexBased {};
+
+// evaluator based on iterators to access coefficients. 
+struct IteratorBased {};
+
 /** \internal
  * Constants for comparison functors
  */
@@ -462,7 +556,7 @@
   cmp_GT = 5,
   cmp_GE = 6
 };
-}  // end namespace internal
+} // end namespace internal
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
old mode 100644
new mode 100755
index dd44c7c..e950749
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h

@@ -14,12 +14,13 @@
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
   // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
   // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -41,28 +42,73 @@
     #pragma clang diagnostic push
   #endif
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+  #if __clang_major__ >= 3 && __clang_minor__ >= 5
+    #pragma clang diagnostic ignored "-Wabsolute-value"
+  #endif
+  #if __clang_major__ >= 10
+    #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+  #endif
+  #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
+    // warning: generic selections are a C11-specific feature
+    // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+    #pragma clang diagnostic ignored "-Wc11-extensions"
+  #endif
 
-#elif defined __GNUC__ && __GNUC__>=6
+#elif defined __GNUC__
 
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+  #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
     #pragma GCC diagnostic push
   #endif
-  #pragma GCC diagnostic ignored "-Wignored-attributes"
-
+  // g++ warns about local variables shadowing member functions, which is too strict
+  #pragma GCC diagnostic ignored "-Wshadow"
+  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+    #pragma GCC diagnostic ignored "-Wtype-limits"
+  #endif
+  #if __GNUC__>=6
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+  #endif
+  #if __GNUC__==7
+    // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+    #pragma GCC diagnostic ignored "-Wattributes"
+  #endif
 #endif
 
 #if defined __NVCC__
+  #pragma diag_suppress boolean_controlling_expr_is_constant
   // Disable the "statement is unreachable" message
   #pragma diag_suppress code_is_unreachable
   // Disable the "dynamic initialization in unreachable code" message
   #pragma diag_suppress initialization_not_reachable
-  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them)
+  // Disable the "invalid error number" message that we get with older versions of nvcc
+  #pragma diag_suppress 1222
+  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
+  #pragma diag_suppress 2527
+  #pragma diag_suppress 2529
   #pragma diag_suppress 2651
   #pragma diag_suppress 2653
   #pragma diag_suppress 2668
   #pragma diag_suppress 2669
   #pragma diag_suppress 2670
   #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
+  #pragma diag_suppress 2739
+  #pragma diag_suppress 2976
+  #pragma diag_suppress 2979
+  // Disable the "// __device__ annotation is ignored on a function(...) that is
+  //              explicitly defaulted on its first declaration" message.
+  // The __device__ annotation seems to actually be needed in some cases,
+  // otherwise resulting in kernel runtime errors.
+  #pragma diag_suppress 2977
 #endif
 
+#else
+// warnings already disabled:
+# ifndef EIGEN_WARNINGS_DISABLED_2
+#  define EIGEN_WARNINGS_DISABLED_2
+# elif defined(EIGEN_INTERNAL_DEBUGGING)
+#  error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
+# endif
+
 #endif // not EIGEN_WARNINGS_DISABLED

diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 5aed959..2f9cc44 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h

@@ -36,6 +36,10 @@
   };
 };
 
+template<typename T> struct evaluator_traits;
+
+template< typename T> struct evaluator;
+
 } // end namespace internal
 
 template<typename T> struct NumTraits;
@@ -43,11 +47,7 @@
 template<typename Derived> struct EigenBase;
 template<typename Derived> class DenseBase;
 template<typename Derived> class PlainObjectBase;
-
-
-template<typename Derived,
-         int Level = internal::accessors_level<Derived>::value >
-class DenseCoeffsBase;
+template<typename Derived, int Level> class DenseCoeffsBase;
 
 template<typename _Scalar, int _Rows, int _Cols,
          int _Options = AutoAlign |
@@ -79,6 +79,8 @@
 template<typename ExpressionType> class SwapWrapper;
 
 template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false> class Block;
+template<typename XprType, typename RowIndices, typename ColIndices> class IndexedView;
+template<typename XprType, int Rows=Dynamic, int Cols=Dynamic, int Order=0> class Reshaped;
 
 template<typename MatrixType, int Size=Dynamic> class VectorBlock;
 template<typename MatrixType> class Transpose;
@@ -88,11 +90,10 @@
 template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
 template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>  class CwiseTernaryOp;
-template<typename BinOp,     typename Lhs, typename Rhs>  class SelfCwiseBinaryOp;
-template<typename Derived,   typename Lhs, typename Rhs>  class ProductBase;
-template<typename Lhs, typename Rhs>                      class Product;
-template<typename Lhs, typename Rhs, int Mode>            class GeneralProduct;
-template<typename Lhs, typename Rhs, int NestingFlags>    class CoeffBasedProduct;
+template<typename Decomposition, typename Rhstype>        class Solve;
+template<typename XprType>                                class Inverse;
+
+template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;
 
 template<typename Derived> class DiagonalBase;
 template<typename _DiagonalVectorType> class DiagonalWrapper;
@@ -109,8 +110,13 @@
 template<typename Derived,
          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
 > class MapBase;
-template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
+template<int OuterStrideAtCompileTime, int InnerStrideAtCompileTime> class Stride;
+template<int Value = Dynamic> class InnerStride;
+template<int Value = Dynamic> class OuterStride;
 template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
+template<typename Derived> class RefBase;
+template<typename PlainObjectType, int Options = 0,
+         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
 
 template<typename Derived> class TriangularBase;
 template<typename MatrixType, unsigned int Mode> class TriangularView;
@@ -121,10 +127,14 @@
 template<typename Derived> class ReturnByValue;
 template<typename ExpressionType> class ArrayWrapper;
 template<typename ExpressionType> class MatrixWrapper;
+template<typename Derived> class SolverBase;
+template<typename XprType> class InnerIterator;
 
 namespace internal {
-template<typename DecompositionType, typename Rhs> struct solve_retval_base;
-template<typename DecompositionType, typename Rhs> struct solve_retval;
+template<typename XprType> class generic_randaccess_stl_iterator;
+template<typename XprType> class pointer_based_stl_iterator;
+template<typename XprType, DirectionType Direction> class subvector_stl_iterator;
+template<typename XprType, DirectionType Direction> class subvector_stl_reverse_iterator;
 template<typename DecompositionType> struct kernel_retval_base;
 template<typename DecompositionType> struct kernel_retval;
 template<typename DecompositionType> struct image_retval_base;
@@ -137,6 +147,21 @@
 
 namespace internal {
 template<typename Lhs, typename Rhs> struct product_type;
+
+template<bool> struct EnableIf;
+
+/** \internal
+  * \class product_evaluator
+  * Products need their own evaluator with more template arguments allowing for
+  * easier partial template specializations.
+  */
+template< typename T,
+          int ProductTag = internal::product_type<typename T::Lhs,typename T::Rhs>::ret,
+          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar
+        > struct product_evaluator;
 }
 
 template<typename Lhs, typename Rhs,
@@ -152,15 +177,18 @@
 // with optional conjugation of the arguments.
 template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
 
-template<typename Scalar> struct scalar_sum_op;
-template<typename Scalar> struct scalar_difference_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_sum_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_difference_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar, int NaNPropagation=PropagateFast> struct scalar_min_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar, int NaNPropagation=PropagateFast> struct scalar_max_op;
 template<typename Scalar> struct scalar_opposite_op;
 template<typename Scalar> struct scalar_conjugate_op;
 template<typename Scalar> struct scalar_real_op;
 template<typename Scalar> struct scalar_imag_op;
 template<typename Scalar> struct scalar_abs_op;
 template<typename Scalar> struct scalar_abs2_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_absolute_difference_op;
 template<typename Scalar> struct scalar_sqrt_op;
 template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
@@ -170,25 +198,44 @@
 template<typename Scalar> struct scalar_acos_op;
 template<typename Scalar> struct scalar_asin_op;
 template<typename Scalar> struct scalar_tan_op;
-template<typename Scalar> struct scalar_pow_op;
 template<typename Scalar> struct scalar_inverse_op;
 template<typename Scalar> struct scalar_square_op;
 template<typename Scalar> struct scalar_cube_op;
 template<typename Scalar, typename NewType> struct scalar_cast_op;
-template<typename Scalar> struct scalar_multiple_op;
-template<typename Scalar> struct scalar_quotient1_op;
-template<typename Scalar> struct scalar_min_op;
-template<typename Scalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_random_op;
-template<typename Scalar> struct scalar_add_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
+template<typename Scalar,bool is_complex, bool is_integer> struct scalar_sign_op;
+template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+
+// SpecialFunctions module
+template<typename Scalar> struct scalar_lgamma_op;
+template<typename Scalar> struct scalar_digamma_op;
+template<typename Scalar> struct scalar_erf_op;
+template<typename Scalar> struct scalar_erfc_op;
+template<typename Scalar> struct scalar_ndtri_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
+template<typename Scalar> struct scalar_zeta_op;
+template<typename Scalar> struct scalar_betainc_op;
 
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+// Bessel functions in SpecialFunctions module
+template<typename Scalar> struct scalar_bessel_i0_op;
+template<typename Scalar> struct scalar_bessel_i0e_op;
+template<typename Scalar> struct scalar_bessel_i1_op;
+template<typename Scalar> struct scalar_bessel_i1e_op;
+template<typename Scalar> struct scalar_bessel_j0_op;
+template<typename Scalar> struct scalar_bessel_y0_op;
+template<typename Scalar> struct scalar_bessel_j1_op;
+template<typename Scalar> struct scalar_bessel_y1_op;
+template<typename Scalar> struct scalar_bessel_k0_op;
+template<typename Scalar> struct scalar_bessel_k0e_op;
+template<typename Scalar> struct scalar_bessel_k1_op;
+template<typename Scalar> struct scalar_bessel_k1e_op;
+
 
 } // end namespace internal
 
@@ -227,6 +274,7 @@
 template<typename MatrixType> class ColPivHouseholderQR;
 template<typename MatrixType> class FullPivHouseholderQR;
 template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType> class SVDBase;
 template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
 template<typename MatrixType> class BDCSVD;
 template<typename MatrixType, int UpLo = Lower> class LLT;
@@ -241,36 +289,16 @@
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
 template<typename Scalar,int Dim> class Translation;
-
-#ifdef EIGEN2_SUPPORT
-template<typename Derived, int _Dim> class eigen2_RotationBase;
-template<typename Lhs, typename Rhs> class eigen2_Cross;
-template<typename Scalar> class eigen2_Quaternion;
-template<typename Scalar> class eigen2_Rotation2D;
-template<typename Scalar> class eigen2_AngleAxis;
-template<typename Scalar,int Dim> class eigen2_Transform;
-template <typename _Scalar, int _AmbientDim> class eigen2_ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class eigen2_Hyperplane;
-template<typename Scalar,int Dim> class eigen2_Translation;
-template<typename Scalar,int Dim> class eigen2_Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-template<typename Scalar> class Quaternion;
-template<typename Scalar,int Dim> class Transform;
-template <typename _Scalar, int _AmbientDim> class ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class Hyperplane;
-template<typename Scalar,int Dim> class Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+template<typename Scalar,int Dim> class AlignedBox;
 template<typename Scalar, int Options = AutoAlign> class Quaternion;
 template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
 template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class ParametrizedLine;
 template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class Hyperplane;
 template<typename Scalar> class UniformScaling;
 template<typename MatrixType,int Direction> class Homogeneous;
-#endif
+
+// Sparse module:
+template<typename Derived> class SparseMatrixBase;
 
 // MatrixFunctions module
 template<typename Derived> struct MatrixExponentialReturnValue;
@@ -289,18 +317,6 @@
 };
 }
 
-
-#ifdef EIGEN2_SUPPORT
-template<typename ExpressionType> class Cwise;
-template<typename MatrixType> class Minor;
-template<typename MatrixType> class LU;
-template<typename MatrixType> class QR;
-template<typename MatrixType> class SVD;
-namespace internal {
-template<typename MatrixType, unsigned int Mode> struct eigen2_part_return_type;
-}
-#endif
-
 } // end namespace Eigen
 
 #endif // EIGEN_FORWARDDECLARATIONS_H

diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
new file mode 100644
index 0000000..f85de30
--- /dev/null
+++ b/Eigen/src/Core/util/IndexedViewHelper.h

@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_INDEXED_VIEW_HELPER_H
+#define EIGEN_INDEXED_VIEW_HELPER_H
+
+namespace Eigen {
+
+namespace internal {
+struct symbolic_last_tag {};
+}
+
+/** \var last
+  * \ingroup Core_Module
+  *
+  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns
+  * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * This symbolic placeholder supports standard arithmetic operations.
+  *
+  * A typical usage example would be:
+  * \code
+  * using namespace Eigen;
+  * using Eigen::last;
+  * VectorXd v(n);
+  * v(seq(2,last-2)).setOnes();
+  * \endcode
+  *
+  * \sa end
+  */
+static const symbolic::SymbolExpr<internal::symbolic_last_tag> last; // PLEASE use Eigen::last   instead of Eigen::placeholders::last
+
+/** \var lastp1
+  * \ingroup Core_Module
+  *
+  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
+  * reference the last+1 element/row/columns of the underlying vector or matrix once
+  * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * This symbolic placeholder supports standard arithmetic operations.
+  * It is essentially an alias to last+fix<1>.
+  *
+  * \sa last
+  */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+static const auto lastp1 = last+fix<1>;
+#else
+// Using a FixedExpr<1> expression is important here to make sure the compiler
+// can fully optimize the computation starting indices with zero overhead.
+static const symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > lastp1(last+fix<1>());
+#endif
+
+namespace internal {
+
+ // Replace symbolic last/end "keywords" by their true runtime value
+inline Index eval_expr_given_size(Index x, Index /* size */)   { return x; }
+
+template<int N>
+FixedInt<N> eval_expr_given_size(FixedInt<N> x, Index /*size*/)   { return x; }
+
+template<typename Derived>
+Index eval_expr_given_size(const symbolic::BaseExpr<Derived> &x, Index size)
+{
+  return x.derived().eval(last=size-1);
+}
+
+// Extract increment/step at compile time
+template<typename T, typename EnableIf = void> struct get_compile_time_incr {
+  enum { value = UndefinedIncr };
+};
+
+// Analogue of std::get<0>(x), but tailored for our needs.
+template<typename T>
+EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT { return x.first(); }
+
+// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice
+// The generic implementation is a no-op
+template<typename T,int XprSize,typename EnableIf=void>
+struct IndexedViewCompatibleType {
+  typedef T type;
+};
+
+template<typename T,typename Q>
+const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; }
+
+//--------------------------------------------------------------------------------
+// Handling of a single Index
+//--------------------------------------------------------------------------------
+
+struct SingleRange {
+  enum {
+    SizeAtCompileTime = 1
+  };
+  SingleRange(Index val) : m_value(val) {}
+  Index operator[](Index) const { return m_value; }
+  static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; }
+  Index first() const EIGEN_NOEXCEPT { return m_value; }
+  Index m_value;
+};
+
+template<> struct get_compile_time_incr<SingleRange> {
+  enum { value = 1 }; // 1 or 0 ??
+};
+
+// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods)
+template<typename T, int XprSize>
+struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
+  // Here we could simply use Array, but maybe it's less work for the compiler to use
+  // a simpler wrapper as SingleRange
+  //typedef Eigen::Array<Index,1,1> type;
+  typedef SingleRange type;
+};
+
+template<typename T, int XprSize>
+struct IndexedViewCompatibleType<T, XprSize, typename enable_if<symbolic::is_symbolic<T>::value>::type> {
+  typedef SingleRange type;
+};
+
+
+template<typename T>
+typename enable_if<symbolic::is_symbolic<T>::value,SingleRange>::type
+makeIndexedViewCompatible(const T& id, Index size, SpecializedType) {
+  return eval_expr_given_size(id,size);
+}
+
+//--------------------------------------------------------------------------------
+// Handling of all
+//--------------------------------------------------------------------------------
+
+struct all_t { all_t() {} };
+
+// Convert a symbolic 'all' into a usable range type
+template<int XprSize>
+struct AllRange {
+  enum { SizeAtCompileTime = XprSize };
+  AllRange(Index size = XprSize) : m_size(size) {}
+  EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; }
+  EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); }
+  EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; }
+  variable_if_dynamic<Index,XprSize> m_size;
+};
+
+template<int XprSize>
+struct IndexedViewCompatibleType<all_t,XprSize> {
+  typedef AllRange<XprSize> type;
+};
+
+template<typename XprSizeType>
+inline AllRange<get_fixed_value<XprSizeType>::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) {
+  return AllRange<get_fixed_value<XprSizeType>::value>(size);
+}
+
+template<int Size> struct get_compile_time_incr<AllRange<Size> > {
+  enum { value = 1 };
+};
+
+} // end namespace internal
+
+
+/** \var all
+  * \ingroup Core_Module
+  * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns
+  */
+static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all
+
+
+namespace placeholders {
+  typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
+  typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end_t;
+  typedef Eigen::internal::all_t all_t;
+
+  EIGEN_DEPRECATED static const all_t  all  = Eigen::all;    // PLEASE use Eigen::all    instead of Eigen::placeholders::all
+  EIGEN_DEPRECATED static const last_t last = Eigen::last;   // PLEASE use Eigen::last   instead of Eigen::placeholders::last
+  EIGEN_DEPRECATED static const end_t  end  = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_INDEXED_VIEW_HELPER_H

diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
new file mode 100644
index 0000000..e0092f6
--- /dev/null
+++ b/Eigen/src/Core/util/IntegralConstant.h

@@ -0,0 +1,272 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_INTEGRAL_CONSTANT_H
+#define EIGEN_INTEGRAL_CONSTANT_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<int N> class FixedInt;
+template<int N> class VariableAndFixedInt;
+
+/** \internal
+  * \class FixedInt
+  *
+  * This class embeds a compile-time integer \c N.
+  *
+  * It is similar to c++11 std::integral_constant<int,N> but with some additional features
+  * such as:
+  *  - implicit conversion to int
+  *  - arithmetic and some bitwise operators: -, +, *, /, %, &, |
+  *  - c++98/14 compatibility with fix<N> and fix<N>() syntax to define integral constants.
+  *
+  * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to
+  * be created by the user using Eigen::fix<N> or Eigen::fix<N>(). In C++98-11, the former syntax does
+  * not create a FixedInt<N> instance but rather a point to function that needs to be \em cleaned-up
+  * using the generic helper:
+  * \code
+  * internal::cleanup_index_type<T>::type
+  * internal::cleanup_index_type<T,DynamicKey>::type
+  * \endcode
+  * where T can a FixedInt<N>, a pointer to function FixedInt<N> (*)(), or numerous other integer-like representations.
+  * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
+  *
+  * For convenience, you can extract the compile-time value \c N in a generic way using the following helper:
+  * \code
+  * internal::get_fixed_value<T,DefaultVal>::value
+  * \endcode
+  * that will give you \c N if T equals FixedInt<N> or FixedInt<N> (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int).
+  *
+  * \sa fix<N>, class VariableAndFixedInt
+  */
+template<int N> class FixedInt
+{
+public:
+  static const int value = N;
+  EIGEN_CONSTEXPR operator int() const { return value; }
+  FixedInt() {}
+  FixedInt( VariableAndFixedInt<N> other) {
+    #ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(other);
+    #endif
+    eigen_internal_assert(int(other)==N);
+  }
+
+  FixedInt<-N> operator-() const { return FixedInt<-N>(); }
+  template<int M>
+  FixedInt<N+M> operator+( FixedInt<M>) const { return FixedInt<N+M>(); }
+  template<int M>
+  FixedInt<N-M> operator-( FixedInt<M>) const { return FixedInt<N-M>(); }
+  template<int M>
+  FixedInt<N*M> operator*( FixedInt<M>) const { return FixedInt<N*M>(); }
+  template<int M>
+  FixedInt<N/M> operator/( FixedInt<M>) const { return FixedInt<N/M>(); }
+  template<int M>
+  FixedInt<N%M> operator%( FixedInt<M>) const { return FixedInt<N%M>(); }
+  template<int M>
+  FixedInt<N|M> operator|( FixedInt<M>) const { return FixedInt<N|M>(); }
+  template<int M>
+  FixedInt<N&M> operator&( FixedInt<M>) const { return FixedInt<N&M>(); }
+
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+  // Needed in C++14 to allow fix<N>():
+  FixedInt operator() () const { return *this; }
+
+  VariableAndFixedInt<N> operator() (int val) const { return VariableAndFixedInt<N>(val); }
+#else
+  FixedInt ( FixedInt<N> (*)() ) {}
+#endif
+
+#if EIGEN_HAS_CXX11
+  FixedInt(std::integral_constant<int,N>) {}
+#endif
+};
+
+/** \internal
+  * \class VariableAndFixedInt
+  *
+  * This class embeds both a compile-time integer \c N and a runtime integer.
+  * Both values are supposed to be equal unless the compile-time value \c N has a special
+  * value meaning that the runtime-value should be used. Depending on the context, this special
+  * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for
+  * quantities that can be negative).
+  *
+  * It is the return-type of the function Eigen::fix<N>(int), and most of the time this is the only
+  * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt.
+  * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert
+  * it to either a true compile-time quantity (i.e. a FixedInt<N>), or to a runtime quantity (e.g., an Index)
+  * using the following generic helper:
+  * \code
+  * internal::cleanup_index_type<T>::type
+  * internal::cleanup_index_type<T,DynamicKey>::type
+  * \endcode
+  * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations.
+  * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
+  *
+  * For convenience, you can also extract the compile-time value \c N using the following helper:
+  * \code
+  * internal::get_fixed_value<T,DefaultVal>::value
+  * \endcode
+  * that will give you \c N if T equals VariableAndFixedInt<N>, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int).
+  *
+  * \sa fix<N>(int), class FixedInt
+  */
+template<int N> class VariableAndFixedInt
+{
+public:
+  static const int value = N;
+  operator int() const { return m_value; }
+  VariableAndFixedInt(int val) { m_value = val; }
+protected:
+  int m_value;
+};
+
+template<typename T, int Default=Dynamic> struct get_fixed_value {
+  static const int value = Default;
+};
+
+template<int N,int Default> struct get_fixed_value<FixedInt<N>,Default> {
+  static const int value = N;
+};
+
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+template<int N,int Default> struct get_fixed_value<FixedInt<N> (*)(),Default> {
+  static const int value = N;
+};
+#endif
+
+template<int N,int Default> struct get_fixed_value<VariableAndFixedInt<N>,Default> {
+  static const int value = N ;
+};
+
+template<typename T, int N, int Default>
+struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
+  static const int value = N;
+};
+
+template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
+#endif
+
+// Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
+
+// By default, no cleanup:
+template<typename T, int DynamicKey=Dynamic, typename EnableIf=void> struct cleanup_index_type { typedef T type; };
+
+// Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index
+template<typename T, int DynamicKey> struct cleanup_index_type<T,DynamicKey,typename internal::enable_if<internal::is_integral<T>::value>::type> { typedef Index type; };
+
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+// In c++98/c++11, fix<N> is a pointer to function that we better cleanup to a true FixedInt<N>:
+template<int N, int DynamicKey> struct cleanup_index_type<FixedInt<N> (*)(), DynamicKey> { typedef FixedInt<N> type; };
+#endif
+
+// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value:
+template<int N, int DynamicKey> struct cleanup_index_type<VariableAndFixedInt<N>, DynamicKey> { typedef FixedInt<N> type; };
+// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index):
+template<int DynamicKey> struct cleanup_index_type<VariableAndFixedInt<DynamicKey>, DynamicKey> { typedef Index type; };
+
+#if EIGEN_HAS_CXX11
+template<int N, int DynamicKey> struct cleanup_index_type<std::integral_constant<int,N>, DynamicKey> { typedef FixedInt<N> type; };
+#endif
+
+} // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+template<int N>
+static const internal::FixedInt<N> fix{};
+#else
+template<int N>
+inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
+
+// The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
+// This way a code like fix<N> can only refer to the previous function.
+template<int N,typename T>
+inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
+#endif
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+/** \var fix<N>()
+  * \ingroup Core_Module
+  *
+  * This \em identifier permits to construct an object embedding a compile-time integer \c N.
+  *
+  * \tparam N the compile-time integer value
+  *
+  * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them:
+  * \code
+  * seqN(10,fix<4>,fix<-3>)   // <=> [10 7 4 1]
+  * \endcode
+  *
+  * See also the function fix(int) to pass both a compile-time and runtime value.
+  *
+  * In c++14, it is implemented as:
+  * \code
+  * template<int N> static const internal::FixedInt<N> fix{};
+  * \endcode
+  * where internal::FixedInt<N> is an internal template class similar to
+  * <a href="http://en.cppreference.com/w/cpp/types/integral_constant">\c std::integral_constant </a><tt> <int,N> </tt>
+  * Here, \c fix<N> is thus an object of type \c internal::FixedInt<N>.
+  *
+  * In c++98/11, it is implemented as a function:
+  * \code
+  * template<int N> inline internal::FixedInt<N> fix();
+  * \endcode
+  * Here internal::FixedInt<N> is thus a pointer to function.
+  *
+  * If for some reason you want a true object in c++98 then you can write: \code fix<N>() \endcode which is also valid in c++14.
+  *
+  * \sa fix<N>(int), seq, seqN
+  */
+template<int N>
+static const auto fix();
+
+/** \fn fix<N>(int)
+  * \ingroup Core_Module
+  *
+  * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val.
+  *
+  * \tparam N the compile-time integer value
+  * \param  val the fallback runtime integer value
+  *
+  * This function is a more general version of the \ref fix identifier/function that can be used in template code
+  * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers
+  * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers
+  * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val
+  * will be used as a fallback.
+  *
+  * A typical use case would be:
+  * \code
+  * template<typename Derived> void foo(const MatrixBase<Derived> &mat) {
+  *   const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2;
+  *   const int n = mat.rows()/2;
+  *   ... mat( seqN(0,fix<N>(n) ) ...;
+  * }
+  * \endcode
+  * In this example, the function Eigen::seqN knows that the second argument is expected to be a size.
+  * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n.
+  * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as <tt> seqN(0,fix<N>) </tt>.
+  *
+  * \sa fix, seqN, class ArithmeticSequence
+  */
+template<int N>
+static const auto fix(int val);
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+} // end namespace Eigen
+
+#endif // EIGEN_INTEGRAL_CONSTANT_H

diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
old mode 100644
new mode 100755
index 8acca9c..17963fa
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h

@@ -49,12 +49,17 @@
   #define EIGEN_USE_LAPACKE
 #endif
 
-#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
   #define EIGEN_USE_MKL
 #endif
 
+
 #if defined EIGEN_USE_MKL
-#   include <mkl.h> 
+#   if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+#       define MKL_DIRECT_CALL
+#       define MKL_DIRECT_CALL_JUST_SET
+#   endif
+#   include <mkl.h>
 /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
 #   ifndef INTEL_MKL_VERSION
 #       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
@@ -64,63 +69,69 @@
 #   ifndef EIGEN_USE_MKL
     /*If the MKL version is too old, undef everything*/
 #       undef   EIGEN_USE_MKL_ALL
-#       undef   EIGEN_USE_BLAS
 #       undef   EIGEN_USE_LAPACKE
 #       undef   EIGEN_USE_MKL_VML
 #       undef   EIGEN_USE_LAPACKE_STRICT
 #       undef   EIGEN_USE_LAPACKE
+#       ifdef   MKL_DIRECT_CALL_JUST_SET
+#           undef MKL_DIRECT_CALL
+#       endif
 #   endif
 #endif
 
 #if defined EIGEN_USE_MKL
-#include <mkl_lapacke.h>
+
 #define EIGEN_MKL_VML_THRESHOLD 128
 
+/* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
+/* MKL_BLAS, etc are not defined in 11.2 */
+#ifdef MKL_DOMAIN_ALL
+#define EIGEN_MKL_DOMAIN_ALL MKL_DOMAIN_ALL
+#else
+#define EIGEN_MKL_DOMAIN_ALL MKL_ALL
+#endif
+
+#ifdef MKL_DOMAIN_BLAS
+#define EIGEN_MKL_DOMAIN_BLAS MKL_DOMAIN_BLAS
+#else
+#define EIGEN_MKL_DOMAIN_BLAS MKL_BLAS
+#endif
+
+#ifdef MKL_DOMAIN_FFT
+#define EIGEN_MKL_DOMAIN_FFT MKL_DOMAIN_FFT
+#else
+#define EIGEN_MKL_DOMAIN_FFT MKL_FFT
+#endif
+
+#ifdef MKL_DOMAIN_VML
+#define EIGEN_MKL_DOMAIN_VML MKL_DOMAIN_VML
+#else
+#define EIGEN_MKL_DOMAIN_VML MKL_VML
+#endif
+
+#ifdef MKL_DOMAIN_PARDISO
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_DOMAIN_PARDISO
+#else
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO
+#endif
+#endif
+
+#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
+#include "../../misc/blas.h"
+#endif
+
 namespace Eigen {
 
 typedef std::complex<double> dcomplex;
 typedef std::complex<float>  scomplex;
 
-namespace internal {
-
-template<typename MKLType, typename EigenType>
-static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
-  mklScalar=eigenScalar;
-}
-
-template<typename MKLType, typename EigenType>
-static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
-  mklScalar=eigenScalar;
-}
-
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=eigenScalar.imag();
-}
-
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=eigenScalar.imag();
-}
-
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=-eigenScalar.imag();
-}
-
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=-eigenScalar.imag();
-}
-
-} // end namespace internal
+#if defined(EIGEN_USE_MKL)
+typedef MKL_INT BlasIndex;
+#else
+typedef int BlasIndex;
+#endif
 
 } // end namespace Eigen
 
-#endif
 
 #endif // EIGEN_MKL_SUPPORT_H

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index b2aab32..b436dfa 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,29 +11,73 @@
 #ifndef EIGEN_MACROS_H
 #define EIGEN_MACROS_H
 
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
+
 #define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 90
+#define EIGEN_MAJOR_VERSION 4
+#define EIGEN_MINOR_VERSION 0
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                                                  EIGEN_MINOR_VERSION>=z))))
 
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upperbound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
+//------------------------------------------------------------------------------------------
 // Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
+
 /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_COMP_GNUC 1
+  #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
 #else
   #define EIGEN_COMP_GNUC 0
 #endif
 
-/// \internal EIGEN_COMP_CLANG set to 1 if the compiler is clang (alias for __clang__)
+/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
 #if defined(__clang__)
-  #define EIGEN_COMP_CLANG 1
+  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
 #else
   #define EIGEN_COMP_CLANG 0
 #endif
 
+/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML
+#if defined(__castxml__)
+  #define EIGEN_COMP_CASTXML 1
+#else
+  #define EIGEN_COMP_CASTXML 0
+#endif
 
 /// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm
 #if defined(__llvm__)
@@ -70,23 +114,67 @@
   #define EIGEN_COMP_MSVC 0
 #endif
 
-/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC
-#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
-  #define EIGEN_COMP_MSVC_STRICT 1
+#if defined(__NVCC__)
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  #define EIGEN_COMP_NVCC  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+  #define EIGEN_COMP_NVCC __CUDACC_VER__
+#else
+  #error "NVCC did not define compiler version."
+#endif
+#else
+  #define EIGEN_COMP_NVCC 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
+//  name        ver   MSC_VER
+//  2008         9      1500
+//  2010        10      1600
+//  2012        11      1700
+//  2013        12      1800
+//  2015        14      1900
+//  "15"        15      1900
+//  2017-14.1   15.0    1910
+//  2017-14.11  15.3    1911
+//  2017-14.12  15.5    1912
+//  2017-14.13  15.6    1913
+//  2017-14.14  15.7    1914
+
+/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSVC_LANG)
+  #define EIGEN_COMP_MSVC_LANG _MSVC_LANG
+#else
+  #define EIGEN_COMP_MSVC_LANG 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG:
+// MSVC option                          Standard  MSVC_LANG
+// /std:c++14 (default as of VS 2019)   C++14     201402L
+// /std:c++17                           C++17     201703L
+// /std:c++latest                       >C++17    >201703L
+
+/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl
+#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
+  #define EIGEN_COMP_MSVC_STRICT _MSC_VER
 #else
   #define EIGEN_COMP_MSVC_STRICT 0
 #endif
 
-/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
-#if defined(__IBMCPP__) || defined(__xlc__)
-  #define EIGEN_COMP_IBM 1
+/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++
+// XLC   version
+// 3.1   0x0301
+// 4.5   0x0405
+// 5.0   0x0500
+// 12.1  0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+  #define EIGEN_COMP_IBM __xlC__
 #else
   #define EIGEN_COMP_IBM 0
 #endif
 
-/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler
 #if defined(__PGI)
-  #define EIGEN_COMP_PGI 1
+  #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
 #else
   #define EIGEN_COMP_PGI 0
 #endif
@@ -98,9 +186,16 @@
   #define EIGEN_COMP_ARM 0
 #endif
 
+/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler
+#if defined(__EMSCRIPTEN__)
+  #define EIGEN_COMP_EMSCRIPTEN 1
+#else
+  #define EIGEN_COMP_EMSCRIPTEN 0
+#endif
+
 
 /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
-#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_CLANG || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM )
+#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN)
   #define EIGEN_COMP_GNUC_STRICT 1
 #else
   #define EIGEN_COMP_GNUC_STRICT 0
@@ -125,9 +220,13 @@
 #endif
 
 
-// Architecture identification, EIGEN_ARCH_*
 
-#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
+//------------------------------------------------------------------------------------------
+// Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
+
+
+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64)
   #define EIGEN_ARCH_x86_64 1
 #else
   #define EIGEN_ARCH_x86_64 0
@@ -153,18 +252,61 @@
 #endif
 
 /// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
   #define EIGEN_ARCH_ARM64 1
 #else
   #define EIGEN_ARCH_ARM64 0
 #endif
 
+/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64
 #if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
   #define EIGEN_ARCH_ARM_OR_ARM64 1
 #else
   #define EIGEN_ARCH_ARM_OR_ARM64 0
 #endif
 
+/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater.
+#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8
+#define EIGEN_ARCH_ARMV8 1
+#else
+#define EIGEN_ARCH_ARMV8 0
+#endif
+
+
+/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
+/// compliant Arm fp16 type
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16
+    #if defined(__ARM_FP16_FORMAT_IEEE)
+      #define EIGEN_HAS_ARM64_FP16 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+      #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+    #endif
+  #endif
+#endif
+
 /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
 #if defined(__mips__) || defined(__mips)
   #define EIGEN_ARCH_MIPS 1
@@ -195,7 +337,9 @@
 
 
 
+//------------------------------------------------------------------------------------------
 // Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
 #if defined(__unix__) || defined(__unix)
@@ -282,8 +426,154 @@
   #define EIGEN_OS_WIN_STRICT 0
 #endif
 
+/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
+// compiler  solaris   __SUNPRO_C
+// version   studio
+// 5.7       10        0x570
+// 5.8       11        0x580
+// 5.9       12        0x590
+// 5.10	     12.1      0x5100
+// 5.11	     12.2      0x5110
+// 5.12	     12.3      0x5120
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SUN __SUNPRO_C
+#else
+  #define EIGEN_OS_SUN 0
+#endif
+
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SOLARIS 1
+#else
+  #define EIGEN_OS_SOLARIS 0
+#endif
 
 
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
+
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+  #error "NVCC as the target platform for HIPCC is currently not supported."
+#endif
+
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  // Means we are generating code for the device
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+  #define EIGEN_CUDA_SDK_VER 0
+#endif
+
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+  // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+  #define EIGEN_HIPCC __HIPCC__
+
+  // We need to include hip_runtime.h here because it pulls in
+  // ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__
+  // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+  #include <hip/hip_runtime.h>
+
+  #if defined(__HIP_DEVICE_COMPILE__)
+    // analogous to EIGEN_CUDA_ARCH, but for HIP
+    #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+  #endif
+
+  // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+  // value to 1024. The compiler assigns a default value of 256 when the attribute is not
+  // specified. This results in failures on the HIP platform, for cases when a GPU kernel
+  // without an explicit launch_bounds attribute is called with a threads_per_block value
+  // greater than 256.
+  //
+  // This is a regression in functioanlity and is expected to be fixed within the next
+  // couple of ROCm releases (compiler will go back to using 1024 value as the default)
+  //
+  // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
+  // attribute.
+
+  #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
+#endif
+
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+//      #if defined(EIGEN_GPUCC)
+// instead of
+//      #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+//   + one to compile the source for the "host" (ie CPU)
+//   + another to compile the source for the "device" (ie. GPU)
+//
+// Code that needs to enabled only during the either the "host" or "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+//       #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+//       #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
+
+#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
+// In most cases we want to check if both macros are defined which can be done using the define below.
+#define SYCL_DEVICE_ONLY
+#endif
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
 
 #if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
   // see bug 89
@@ -292,84 +582,6 @@
   #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
 #endif
 
-// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-// certain common platform (compiler+architecture combinations) to avoid these problems.
-// Only static alignment is really problematic (relies on nonstandard compiler extensions),
-// try to keep heap alignment even when we have to disable static alignment.
-#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-#elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
-// Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
-// Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
-// 4.8 and newer seem definitely unaffected.
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-#else
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
-#endif
-
-// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !EIGEN_COMP_SUNCC \
- && !EIGEN_OS_QNX
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
-#else
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
-#endif
-
-// Defined the boundary (in bytes) on which the data needs to be aligned. Note
-// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
-// aligned at all regardless of the value of this #define.
-#define EIGEN_ALIGN_BYTES 16
-
-#ifdef EIGEN_DONT_ALIGN
-  #ifndef EIGEN_DONT_ALIGN_STATICALLY
-    #define EIGEN_DONT_ALIGN_STATICALLY
-  #endif
-  #define EIGEN_ALIGN 0
-#elif !defined(EIGEN_DONT_VECTORIZE)
-#if defined(__AVX512F__)
-#undef EIGEN_ALIGN_BYTES
-#define EIGEN_ALIGN_BYTES 64
-#elif defined(__AVX__)
-#undef EIGEN_ALIGN_BYTES
-#define EIGEN_ALIGN_BYTES 32
-#endif
-#define EIGEN_ALIGN 1
-#else
-  #define EIGEN_ALIGN 0
-#endif
-
-#define EIGEN_MAX_ALIGN_BYTES EIGEN_ALIGN_BYTES
-
-
-// This macro can be used to prevent from macro expansion, e.g.:
-//   std::max EIGEN_NOT_A_MACRO(a,b)
-#define EIGEN_NOT_A_MACRO
-
-// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
-// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
-#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
-  #define EIGEN_ALIGN_STATICALLY 1
-#else
-  #define EIGEN_ALIGN_STATICALLY 0
-  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-  #endif
-#endif
-
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
-#else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
-#endif
-
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
-#endif
-
 // Cross compiler wrapper around LLVM's __has_builtin
 #ifdef __has_builtin
 #  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
@@ -383,45 +595,314 @@
 # define __has_feature(x) 0
 #endif
 
+// Some old compilers do not support template specializations like:
+// template<typename T,int N> void foo(const T x[N]);
+#if !(   EIGEN_COMP_CLANG && (   (EIGEN_COMP_CLANG<309)                                                       \
+                              || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))  \
+      || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49)
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1
+#else
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
+#endif
+
+// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that
+// works for both platforms, indicating the C++ standard version number.
+//
+// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will
+// report 199711L regardless of the language standard specified via /std.
+// We need to rely on _MSVC_LANG instead, which is only available after
+// VS2015.3.
+#if EIGEN_COMP_MSVC_LANG > 0
+#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG
+#elif EIGEN_COMP_MSVC >= 1900
+#define EIGEN_CPLUSPLUS 201103L
+#elif defined(__cplusplus)
+#define EIGEN_CPLUSPLUS __cplusplus
+#else
+#define EIGEN_CPLUSPLUS 0
+#endif
+
+// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler.
+// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
+// is defined to 17.
+#if EIGEN_CPLUSPLUS > 201703L
+  #define EIGEN_COMP_CXXVER 20
+#elif EIGEN_CPLUSPLUS > 201402L
+  #define EIGEN_COMP_CXXVER 17
+#elif EIGEN_CPLUSPLUS > 201103L
+  #define EIGEN_COMP_CXXVER 14
+#elif EIGEN_CPLUSPLUS >= 201103L
+  #define EIGEN_COMP_CXXVER 11
+#else
+  #define EIGEN_COMP_CXXVER 03
+#endif
+
+#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+  #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14
+    #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1
+  #else
+    #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0
+  #endif
+#endif
+
+
+// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features
+// but in practice we should not rely on them but rather on the availabilty of
+// individual features as defined later.
+// This is why there is no EIGEN_HAS_CXX17.
+// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11.
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11
+#define EIGEN_HAS_CXX11 1
+#else
+#define EIGEN_HAS_CXX11 0
+#endif
+
+#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14
+#define EIGEN_HAS_CXX14 1
+#else
+#define EIGEN_HAS_CXX14 0
+#endif
+
 // Do we support r-value references?
-#if (__has_feature(cxx_rvalue_references) || \
-     (defined(__cplusplus) && __cplusplus >= 201103L) || \
-     (defined(_MSC_VER) && _MSC_VER >= 1600))
-  #define EIGEN_HAVE_RVALUE_REFERENCES
+#ifndef EIGEN_HAS_RVALUE_REFERENCES
+#if EIGEN_MAX_CPP_VER>=11 && \
+    (__has_feature(cxx_rvalue_references) || \
+     (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600))
+  #define EIGEN_HAS_RVALUE_REFERENCES 1
+#else
+  #define EIGEN_HAS_RVALUE_REFERENCES 0
+#endif
 #endif
 
 // Does the compiler support C99?
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
-|| (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
-    || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-#define EIGEN_HAS_C99_MATH 1
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
+#ifndef EIGEN_HAS_C99_MATH
+#if EIGEN_MAX_CPP_VER>=11 && \
+    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
+  || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
+  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
+  || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY))
+  #define EIGEN_HAS_C99_MATH 1
+#else
+  #define EIGEN_HAS_C99_MATH 0
+#endif
+#endif
+
+// Does the compiler support result_of?
+// result_of was deprecated in c++17 and removed in c++ 20
+#ifndef EIGEN_HAS_STD_RESULT_OF
+#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17
+#define EIGEN_HAS_STD_RESULT_OF 1
+#else
+#define EIGEN_HAS_STD_RESULT_OF 0
+#endif
+#endif
+
+// Does the compiler support std::hash?
+#ifndef EIGEN_HAS_STD_HASH
+// The std::hash struct is defined in C++11 but is not labelled as a __device__
+// function and is not constexpr, so cannot be used on device.
+#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_STD_HASH 1
+#else
+#define EIGEN_HAS_STD_HASH 0
+#endif
+#endif  // EIGEN_HAS_STD_HASH
+
+#ifndef EIGEN_HAS_STD_INVOKE_RESULT
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
+#define EIGEN_HAS_STD_INVOKE_RESULT 1
+#else
+#define EIGEN_HAS_STD_INVOKE_RESULT 0
+#endif
+#endif
+
+#ifndef EIGEN_HAS_ALIGNAS
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 &&   \
+      (     __has_feature(cxx_alignas)            \
+        ||  EIGEN_HAS_CXX14                       \
+        || (EIGEN_COMP_MSVC >= 1800)              \
+        || (EIGEN_GNUC_AT_LEAST(4,8))             \
+        || (EIGEN_COMP_CLANG>=305)                \
+        || (EIGEN_COMP_ICC>=1500)                 \
+        || (EIGEN_COMP_PGI>=1500)                 \
+        || (EIGEN_COMP_SUNCC>=0x5130))
+#define EIGEN_HAS_ALIGNAS 1
+#else
+#define EIGEN_HAS_ALIGNAS 0
+#endif
+#endif
+
+// Does the compiler support type_traits?
+// - full support of type traits was added only to GCC 5.1.0.
+// - 20150626 corresponds to the last release of 4.x libstdc++
+#ifndef EIGEN_HAS_TYPE_TRAITS
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
+  && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
+  && ((!defined(__GLIBCXX__))   || __GLIBCXX__ > 20150626)
+#define EIGEN_HAS_TYPE_TRAITS 1
+#define EIGEN_INCLUDE_TYPE_TRAITS
+#else
+#define EIGEN_HAS_TYPE_TRAITS 0
+#endif
 #endif
 
 // Does the compiler support variadic templates?
-#if __cplusplus > 199711L
+#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
+    // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
+    //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#elif  EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY)
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#else
+#define EIGEN_HAS_VARIADIC_TEMPLATES 0
+#endif
 #endif
 
-// Does the compiler support const expressions?
-#if __cplusplus > 199711L && !defined(__NVCC__) && !defined(GOOGLE_LIBCXX) && !defined(__APPLE__)
-#define EIGEN_HAS_CONSTEXPR 1
+// Does the compiler fully support const expressions? (as in c++14)
+#ifndef EIGEN_HAS_CONSTEXPR
+  #if defined(EIGEN_CUDACC)
+  // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+    #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
+      #define EIGEN_HAS_CONSTEXPR 1
+    #endif
+  #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \
+    (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \
+    (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11)))
+    #define EIGEN_HAS_CONSTEXPR 1
+  #endif
+
+  #ifndef EIGEN_HAS_CONSTEXPR
+    #define EIGEN_HAS_CONSTEXPR 0
+  #endif
+
+#endif // EIGEN_HAS_CONSTEXPR
+
+#if EIGEN_HAS_CONSTEXPR
+#define EIGEN_CONSTEXPR constexpr
+#else
+#define EIGEN_CONSTEXPR
 #endif
 
-/** Allows to disable some optimizations which might affect the accuracy of the result.
-  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
-  * They currently include:
-  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
-  */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
+// Does the compiler support C++11 math?
+// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
+#ifndef EIGEN_HAS_CXX11_MATH
+  #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC))
+    #define EIGEN_HAS_CXX11_MATH 1
+  #else
+    #define EIGEN_HAS_CXX11_MATH 0
+  #endif
 #endif
 
+// Does the compiler support proper C++11 containers?
+#ifndef EIGEN_HAS_CXX11_CONTAINERS
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         ((EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400)))
+    #define EIGEN_HAS_CXX11_CONTAINERS 1
+  #else
+    #define EIGEN_HAS_CXX11_CONTAINERS 0
+  #endif
+#endif
+
+// Does the compiler support C++11 noexcept?
+#ifndef EIGEN_HAS_CXX11_NOEXCEPT
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_noexcept) \
+      || (EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400)))
+    #define EIGEN_HAS_CXX11_NOEXCEPT 1
+  #else
+    #define EIGEN_HAS_CXX11_NOEXCEPT 0
+  #endif
+#endif
+
+#ifndef EIGEN_HAS_CXX11_ATOMIC
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_atomic) \
+      || (EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
+    #define EIGEN_HAS_CXX11_ATOMIC 1
+  #else
+    #define EIGEN_HAS_CXX11_ATOMIC 0
+  #endif
+#endif
+
+#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+       (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700)
+    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1
+  #else
+    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0
+  #endif
+#endif
+
+// NOTE: the required Apple's clang version is very conservative
+//       and it could be that XCode 9 works just fine.
+// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
+//       and not tested.
+#ifndef EIGEN_HAS_CXX17_OVERALIGN
+#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                                 \
+           (EIGEN_COMP_MSVC >= 1912)                                                    \
+        || (EIGEN_GNUC_AT_LEAST(7,0))                                                   \
+        || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500))             \
+        || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
+      )
+#define EIGEN_HAS_CXX17_OVERALIGN 1
+#else
+#define EIGEN_HAS_CXX17_OVERALIGN 0
+#endif
+#endif
+
+#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
+  // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+  #if defined(__NVCC__)
+    // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+    #ifdef __CUDACC_RELAXED_CONSTEXPR__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
+  #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr)
+    // clang++ always considers constexpr functions as implicitly __host__ __device__
+    #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+  #endif
+#endif
+
+// Does the compiler support the __int128 and __uint128_t extensions for 128-bit
+// integer arithmetic?
+//
+// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported,
+// but we avoid using them in certain cases:
+//
+// * Building using Clang for Windows, where the Clang runtime library has
+//   128-bit support only on LP64 architectures, but Windows is LLP64.
+#ifndef EIGEN_HAS_BUILTIN_INT128
+#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG)
+#define EIGEN_HAS_BUILTIN_INT128 1
+#else
+#define EIGEN_HAS_BUILTIN_INT128 0
+#endif
+#endif
+
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent from macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
 #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
 
 // concatenate two tokens
 #define EIGEN_CAT2(a,b) a ## b
 #define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
 
+#define EIGEN_COMMA ,
+
 // convert a token to a string
 #define EIGEN_MAKESTRING2(a) #a
 #define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
@@ -429,20 +910,23 @@
 // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
-#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
+#ifndef EIGEN_STRONG_INLINE
+#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC)
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
 #endif
+#endif
 
 // EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
 // it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times.
 // FIXME with the always_inline attribute,
-// gcc 3.4.x reports the following compilation error:
+// gcc 3.4.x and 4.1 reports the following compilation error:
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
 //    : function body not available
-#if EIGEN_GNUC_AT_LEAST(4,0)
+//   See also bug 1367
+#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY)
 #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
 #else
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
@@ -462,20 +946,43 @@
 #define EIGEN_PERMISSIVE_EXPR
 #endif
 
-#if EIGEN_COMP_GNUC
-#define EIGEN_LIKELY(x) __builtin_expect((x), 1)
-#define EIGEN_UNLIKELY(x) __builtin_expect((x), 0)
-#else
-#define EIGEN_LIKELY(x) (x)
-#define EIGEN_UNLIKELY(x) (x)
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
+#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC)
+  // Do not try asserts on device code
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
 #endif
 
+#if defined(SYCL_DEVICE_ONLY)
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+  #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline))
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#elif defined(EIGEN_GPUCC)
+    #define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+  #define EIGEN_DEVICE_FUNC
+#endif
+
+
 // this macro allows to get rid of linking errors about multiply defined functions.
 //  - static is not very good because it prevents definitions from different object files to be merged.
 //           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
 //  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
 
 #ifdef NDEBUG
 # ifndef EIGEN_NO_DEBUG
@@ -483,51 +990,13 @@
 # endif
 #endif
 
-#if !defined(EIGEN_NO_CHECK) || (!defined(EIGEN_NO_DEBUG) && !EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO)
-  // Custom assertion code that works regardless of the compilation mode.
-  #include <cstdlib>   // for abort
-  #include <iostream>  // for std::cerr
-using ::std::cout;
-using ::std::cin;
-using ::std::cerr;
-using ::std::ios;
-using ::std::endl;
-using ::std::iostream;
-using ::std::ios_base;
-using ::std::ostream;
-using ::std::istream;
-
-  namespace Eigen {
-  namespace internal {
-  // trivial function copying a bool. Must be EIGEN_DONT_INLINE, so we implement it after including Eigen headers.
-  // see bug 89.
-  namespace {
-  EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; }
-  }
-  inline void assert_fail(const char *condition, const char *function, const char *file, int line)
-  {
-    copy_bool(true);  // dummy call to avoid warnings about unused functions.
-    std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl;
-    abort();
-  }
-  }
-  }
-  #define eigen_internal_check(x) \
-    do { \
-      if(!Eigen::internal::copy_bool(x)) \
-        Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, "", 0); \
-    } while(false)
-#endif
-
-#ifdef EIGEN_NO_CHECK
-  #define eigen_check(x)
-#else
-  #define eigen_check(x) eigen_internal_check(x)
-#endif
-
 // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
 #ifdef EIGEN_NO_DEBUG
-  #define eigen_plain_assert(x)
+  #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device
+    #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x)
+  #else
+    #define eigen_plain_assert(x)
+  #endif
 #else
   #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO
     namespace Eigen {
@@ -538,7 +1007,28 @@
     #define eigen_plain_assert(x) assert(x)
   #else
     // work around bug 89
-    #define eigen_plain_assert(x) eigen_internal_check(x)
+    #include <cstdlib>   // for abort
+    #include <iostream>  // for std::cerr
+
+    namespace Eigen {
+    namespace internal {
+    // trivial function copying a bool. Must be EIGEN_DONT_INLINE, so we implement it after including Eigen headers.
+    // see bug 89.
+    namespace {
+    EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; }
+    }
+    inline void assert_fail(const char *condition, const char *function, const char *file, int line)
+    {
+      std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl;
+      abort();
+    }
+    }
+    }
+    #define eigen_plain_assert(x) \
+      do { \
+        if(!Eigen::internal::copy_bool(x)) \
+          Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, __FILE__, __LINE__); \
+      } while(false)
   #endif
 #endif
 
@@ -554,7 +1044,7 @@
 #endif
 
 #ifdef EIGEN_NO_DEBUG
-#define EIGEN_ONLY_USED_FOR_DEBUG(x) (void)x
+#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
 #else
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
 #endif
@@ -562,7 +1052,7 @@
 #ifndef EIGEN_NO_DEPRECATED_WARNING
   #if EIGEN_COMP_GNUC
     #define EIGEN_DEPRECATED __attribute__((deprecated))
-  #elif (defined _MSC_VER)
+  #elif EIGEN_COMP_MSVC
     #define EIGEN_DEPRECATED __declspec(deprecated)
   #else
     #define EIGEN_DEPRECATED
@@ -580,55 +1070,96 @@
 // Suppresses 'unused variable' warnings.
 namespace Eigen {
   namespace internal {
-    template<typename T> void ignore_unused_variable(const T&) {}
+    template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {}
   }
 }
 #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
 
 #if !defined(EIGEN_ASM_COMMENT)
   #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
-    #define EIGEN_ASM_COMMENT(X)  asm("#" X)
+    #define EIGEN_ASM_COMMENT(X)  __asm__("#" X)
   #else
     #define EIGEN_ASM_COMMENT(X)
   #endif
 #endif
 
-/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
- * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
- * so that vectorization doesn't affect binary compatibility.
- *
- * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
- * vectorized and non-vectorized code.
- */
-#if (defined __CUDACC__)
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
-#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif EIGEN_COMP_MSVC
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif EIGEN_COMP_SUNCC
-  // FIXME not sure about this one:
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#else
-  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
+
+// Acts as a barrier preventing operations involving `X` from crossing. This
+// occurs, for example, in the fast rounding trick where a magic constant is
+// added then subtracted, which is otherwise compiled away with -ffast-math.
+//
+// See bug 1674
+#if !defined(EIGEN_OPTIMIZATION_BARRIER)
+  #if EIGEN_COMP_GNUC
+    // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+    //   X: Any operand whatsoever.
+    //   r: A register operand is allowed provided that it is in a general
+    //      register.
+    //   g: Any register, memory or immediate integer operand is allowed, except
+    //      for registers that are not general registers.
+    //   w: (AArch32/AArch64) Floating point register, Advanced SIMD vector
+    //      register or SVE vector register.
+    //   x: (SSE) Any SSE register.
+    //      (AArch64) Like w, but restricted to registers 0 to 15 inclusive.
+    //   v: (PowerPC) An Altivec vector register.
+    //   wa:(PowerPC) A VSX register.
+    //
+    // "X" (uppercase) should work for all cases, though this seems to fail for
+    // some versions of GCC for arm/aarch64 with
+    //   "error: inconsistent operand constraints in an 'asm'"
+    // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and
+    // vectors, otherwise
+    //   "error: non-trivial scalar-to-vector conversion, possible invalid
+    //    constraint for vector type"
+    //
+    // GCC for ppc64le generates an internal compiler error with x/X/g.
+    // GCC for AVX generates an internal compiler error with X.
+    //
+    // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq
+    //           gcc for arm, aarch64,
+    //           gcc for ppc64le,
+    // both vectors and scalars.
+    //
+    // Note that this is restricted to plain types - this will not work
+    // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
+    // you will need to apply to the underlying POD type.
+    #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
+      // This seems to be broken on clang.  Packet4f is loaded into a single
+      //   register rather than a vector, zeroing out some entries.  Integer
+      //   types also generate a compile error.
+      // General, Altivec, VSX.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
+    #elif EIGEN_ARCH_ARM_OR_ARM64
+      // General, NEON.
+      // Clang doesn't like "r",
+      //    error: non-trivial scalar-to-vector conversion, possible invalid
+      //           constraint for vector type
+      // GCC < 5 doesn't like "g",
+      //    error: 'asm' operand requires impossible reload
+      #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0)
+        #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,w" (X));
+      #else
+        #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+      #endif
+    #elif EIGEN_ARCH_i386_OR_x86_64
+      // General, SSE.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,x" (X));
+    #else
+      // Not implemented for other architectures.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)
+    #endif
+  #else
+    // Not implemented for other compilers.
+    #define EIGEN_OPTIMIZATION_BARRIER(X)
+  #endif
 #endif
 
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
-#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
-#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)
-#define EIGEN_ALIGN_MAX EIGEN_ALIGN_DEFAULT
-
-#if EIGEN_ALIGN_STATICALLY
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
-#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
-#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
+#if EIGEN_COMP_MSVC
+  // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
+  // This workaround is ugly, but it does the job.
+#  define EIGEN_CONST_CONDITIONAL(cond)  (void)0, cond
 #else
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16
-#define EIGEN_USER_ALIGN32
-#define EIGEN_USER_ALIGN_DEFAULT
+#  define EIGEN_CONST_CONDITIONAL(cond)  cond
 #endif
 
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
@@ -638,21 +1169,11 @@
   #define EIGEN_RESTRICT __restrict
 #endif
 
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-#if defined(__AVX512F__)
-// When compiled with avx512f option, stack allocation limit 20000 is too small
-// and causes static assert.
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
-#else
-#define EIGEN_STACK_ALLOCATION_LIMIT 20000
-#endif
-#endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
 #ifdef EIGEN_MAKING_DOCS
 // format used in Eigen's documentation
-// needed to define it here as escaping characters in CMake add_definition's
-// argument seems very problematic.
+// needed to define it here as escaping characters in CMake add_definition's argument seems very problematic.
 #define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat(3, 0, " ", "\n", "", "")
 #else
 #define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat()
@@ -662,7 +1183,23 @@
 // just an empty macro !
 #define EIGEN_EMPTY
 
-#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1800 // for older MSVC versions using the base operator is sufficient (cf Bug 1000)
+
+// When compiling CUDA/HIP device code with NVCC or HIPCC
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_USING_STD(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD(FUNC) using std::FUNC;
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC))
+  // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary,
+  //   otherwise we get duplicate definition errors
+  // For later MSVC versions, we require explicit operator= definition, otherwise we get
+  //   use of implicitly deleted operator errors.
+  // (cf Bugs 920, 1000, 1324, 2291)
   #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =;
 #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -681,7 +1218,49 @@
     }
 #endif
 
-#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+
+/**
+ * \internal
+ * \brief Macro to explicitly define the default copy constructor.
+ * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
+ */
+#if EIGEN_HAS_CXX11
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
+#else
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
+#endif
+
+
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * With C++11 or later this also default-implements the copy-constructor
+ */
+#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived)  \
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
+
+/** \internal
+ * \brief Macro to manually define default constructors and destructors.
+ * This is necessary when the copy constructor is re-defined.
+ * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
+ *
+ * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision
+ */
+#if EIGEN_HAS_CXX11
+#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
+    EIGEN_DEVICE_FUNC Derived() = default; \
+    EIGEN_DEVICE_FUNC ~Derived() = default;
+#else
+#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
+    EIGEN_DEVICE_FUNC Derived() {}; \
+    /* EIGEN_DEVICE_FUNC ~Derived() {}; */
+#endif
+
+
+
+
 
 /**
 * Just a side note. Commenting within defines works only by documenting
@@ -695,32 +1274,13 @@
   typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
   typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
+  typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
   typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
+  typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
+  enum CompileTimeTraits \
+      { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
         ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
         Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime };
-
-
-#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
-  typedef typename Base::PacketScalar PacketScalar; \
-  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
-  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
-        MaxRowsAtCompileTime = Eigen::internal::traits<Derived>::MaxRowsAtCompileTime, \
-        MaxColsAtCompileTime = Eigen::internal::traits<Derived>::MaxColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
         SizeAtCompileTime = Base::SizeAtCompileTime, \
         MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
         IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
@@ -728,6 +1288,12 @@
   using Base::const_cast_derived;
 
 
+// FIXME Maybe the EIGEN_DENSE_PUBLIC_INTERFACE could be removed as importing PacketScalar is rarely needed
+#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
+  typedef typename Base::PacketScalar PacketScalar;
+
+
 #define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
 #define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)
 
@@ -757,18 +1323,18 @@
 
 #define EIGEN_IMPLIES(a,b) (!(a) || (b))
 
-#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
-  template<typename OtherDerived> \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
-  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
-  { \
-    return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
-  }
+#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC
+#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false))
+#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
+#else
+#define EIGEN_PREDICT_FALSE(x) (x)
+#define EIGEN_PREDICT_TRUE(x) (x)
+#endif
 
-// the expression type of a cwise product
-#define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
+// the expression type of a standard coefficient wise binary operation
+#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
     CwiseBinaryOp< \
-      internal::scalar_product_op< \
+      EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)< \
           typename internal::traits<LHS>::Scalar, \
           typename internal::traits<RHS>::Scalar \
       >, \
@@ -776,15 +1342,132 @@
       const RHS \
     >
 
-// Expression to disable exact quality warnings on floats with gcc
-#if EIGEN_COMP_GNUC && __cplusplus > 199711L && !defined(__CUDACC__)
-#define EIGEN_DISABLE_FLOAT_EQUALITY_WARNING \
-  _Pragma("GCC diagnostic push")             \
-  _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
-#define EIGEN_ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop")
+#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,OPNAME) \
+  template<typename OtherDerived> \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) \
+  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+  { \
+    return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \
+  }
+
+#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \
+  (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits<TYPEA,TYPEB,EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_,OPNAME),_op)<TYPEA,TYPEB>  > >::value)
+
+#define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<typename internal::traits<EXPR>::Scalar,SCALAR>, const EXPR, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type>
+
+#define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR,EXPR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<SCALAR,typename internal::traits<EXPR>::Scalar>, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
+
+// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
 #else
-#define EIGEN_DISABLE_FLOAT_EQUALITY_WARNING
-#define EIGEN_ENABLE_FLOAT_EQUALITY_WARNING
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
+#endif
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
+  (METHOD)(const T& scalar) const { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar))); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
+  (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)), matrix.derived()); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
+
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+
+#ifdef EIGEN_EXCEPTIONS
+#  define EIGEN_THROW_X(X) throw X
+#  define EIGEN_THROW throw
+#  define EIGEN_TRY try
+#  define EIGEN_CATCH(X) catch (X)
+#else
+#  if defined(EIGEN_CUDA_ARCH)
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
+#  elif defined(EIGEN_HIP_DEVICE_COMPILE)
+#    define EIGEN_THROW_X(X) asm("s_trap 0")
+#    define EIGEN_THROW asm("s_trap 0")
+#  else
+#    define EIGEN_THROW_X(X) std::abort()
+#    define EIGEN_THROW std::abort()
+#  endif
+#  define EIGEN_TRY if (true)
+#  define EIGEN_CATCH(X) else
+#endif
+
+
+#if EIGEN_HAS_CXX11_NOEXCEPT
+#   define EIGEN_INCLUDE_TYPE_TRAITS
+#   define EIGEN_NOEXCEPT noexcept
+#   define EIGEN_NOEXCEPT_IF(x) noexcept(x)
+#   define EIGEN_NO_THROW noexcept(true)
+#   define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
+#else
+#   define EIGEN_NOEXCEPT
+#   define EIGEN_NOEXCEPT_IF(x)
+#   define EIGEN_NO_THROW throw()
+#   if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17
+      // MSVC does not support exception specifications (warning C4290),
+      // and they are deprecated in c++11 anyway. This is even an error in c++17.
+#     define EIGEN_EXCEPTION_SPEC(X) throw()
+#   else
+#     define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#   endif
+#endif
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+inline bool all(){ return true; }
+
+template<typename T, typename ...Ts>
+bool all(T t, Ts ... ts){ return t && all(ts...); }
+
+}
+}
+#endif
+
+#if EIGEN_HAS_CXX11_OVERRIDE_FINAL
+// provide override and final specifiers if they are available:
+#   define EIGEN_OVERRIDE override
+#   define EIGEN_FINAL final
+#else
+#   define EIGEN_OVERRIDE
+#   define EIGEN_FINAL
+#endif
+
+// Wrapping #pragma unroll in a macro since it is required for SYCL
+#if defined(SYCL_DEVICE_ONLY)
+  #if defined(_MSC_VER)
+    #define EIGEN_UNROLL_LOOP __pragma(unroll)
+  #else
+    #define EIGEN_UNROLL_LOOP _Pragma("unroll")
+  #endif
+#else
+  #define EIGEN_UNROLL_LOOP
 #endif
 
 #endif // EIGEN_MACROS_H

diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 4f3f672..875318c 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
 // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
@@ -20,40 +20,72 @@
 #ifndef EIGEN_MEMORY_H
 #define EIGEN_MEMORY_H
 
-// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
-// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
-// Currently, let's include it only on unix systems:
-#if defined(__unix__) || defined(__unix)
-  #include <unistd.h>
-  #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
-    #define EIGEN_HAS_POSIX_MEMALIGN 1
-  #endif
-#endif
+#ifndef EIGEN_MALLOC_ALREADY_ALIGNED
 
-#ifndef EIGEN_HAS_POSIX_MEMALIGN
-  #define EIGEN_HAS_POSIX_MEMALIGN 0
-#endif
+// Try to determine automatically if malloc is already aligned.
 
-#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
-  #define EIGEN_HAS_MM_MALLOC 1
+// On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see:
+//   http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html
+// This is true at least since glibc 2.8.
+// This leaves the question how to detect 64-bit. According to this document,
+//   http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
+// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
+// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
+  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
-  #define EIGEN_HAS_MM_MALLOC 0
+  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+// FreeBSD 6 seems to have 16-byte aligned malloc
+//   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
+// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
+//   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
+#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
+  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16))     \
+ || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16))   \
+ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED              \
+ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
+  #define EIGEN_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_MALLOC_ALREADY_ALIGNED 0
+#endif
+
 #endif
 
 namespace Eigen {
 
 namespace internal {
 
-EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc()
+EIGEN_DEVICE_FUNC
+inline void throw_std_bad_alloc()
 {
-#ifndef __CUDA_ARCH__
   #ifdef EIGEN_EXCEPTIONS
     throw std::bad_alloc();
   #else
     std::size_t huge = static_cast<std::size_t>(-1);
+    #if defined(EIGEN_HIPCC)
+    //
+    // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
+    // and as a consequence the code in the #else block triggers the hipcc warning :
+    // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+    //
+    // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+    // the same on "operator new"
+    // Reverting code back to the old version in this #if block for the hipcc compiler
+    //
     new int[huge];
+    #else
+    void* unused = ::operator new(huge);
+    EIGEN_UNUSED_VARIABLE(unused);
+    #endif
   #endif
-#endif
 }
 
 /*****************************************************************************
@@ -65,24 +97,31 @@
 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
   * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
   */
-inline void* handmade_aligned_malloc(std::size_t size)
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
-  void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
+  eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2");
+
+  EIGEN_USING_STD(malloc)
+  void *original = malloc(size+alignment);
+  
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
-inline void handmade_aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
 {
-  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+  if (ptr) {
+    EIGEN_USING_STD(free)
+    free(*(reinterpret_cast<void**>(ptr) - 1));
+  }
 }
 
 /** \internal
   * \brief Reallocates aligned memory.
-  * Since we know that our handmade version is based on std::realloc
+  * Since we know that our handmade version is based on std::malloc
   * we can use std::realloc to implement efficient reallocation.
   */
 inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
@@ -90,9 +129,9 @@
   if (ptr == 0) return handmade_aligned_malloc(size);
   void *original = *(reinterpret_cast<void**>(ptr) - 1);
   std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
+  original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
@@ -102,47 +141,6 @@
 }
 
 /*****************************************************************************
-*** Implementation of generic aligned realloc (when no realloc can be used)***
-*****************************************************************************/
-
-EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
-EIGEN_DEVICE_FUNC void  aligned_free(void *ptr);
-
-/** \internal
-  * \brief Reallocates aligned memory.
-  * Allows reallocation with aligned ptr types. This implementation will
-  * always create a new memory chunk and copy the old data.
-  */
-inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
-{
-  if (ptr==0)
-    return aligned_malloc(size);
-
-  if (size==0)
-  {
-    aligned_free(ptr);
-    return 0;
-  }
-
-  void* newptr = aligned_malloc(size);
-  if (newptr == 0)
-  {
-    #ifdef EIGEN_HAS_ERRNO
-    errno = ENOMEM; // according to the standard
-    #endif
-    return 0;
-  }
-
-  if (ptr != 0)
-  {
-    std::memcpy(newptr, ptr, (std::min)(size,old_size));
-    aligned_free(ptr);
-  }
-
-  return newptr;
-}
-
-/*****************************************************************************
 *** Implementation of portable aligned versions of malloc/free/realloc     ***
 *****************************************************************************/
 
@@ -173,20 +171,19 @@
 /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
-EIGEN_DEVICE_FUNC
-inline void* aligned_malloc(size_t size)
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 {
   check_that_malloc_is_allowed();
 
   void *result;
-  #if !EIGEN_ALIGN
-    result = std::malloc(size);
-  #elif EIGEN_HAS_POSIX_MEMALIGN
-    if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
-  #elif EIGEN_HAS_MM_MALLOC
-    result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
+  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    EIGEN_USING_STD(malloc)
+    result = malloc(size);
+
+    #if EIGEN_DEFAULT_ALIGN_BYTES==16
+    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
+    #endif
   #else
     result = handmade_aligned_malloc(size);
   #endif
@@ -198,47 +195,30 @@
 }
 
 /** \internal Frees memory allocated with aligned_malloc. */
-EIGEN_DEVICE_FUNC
-inline void aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
-  #if !EIGEN_ALIGN
-    std::free(ptr);
-  #elif EIGEN_HAS_POSIX_MEMALIGN
-    std::free(ptr);
-  #elif EIGEN_HAS_MM_MALLOC
-    _mm_free(ptr);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    _aligned_free(ptr);
+  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    EIGEN_USING_STD(free)
+    free(ptr);
+
   #else
     handmade_aligned_free(ptr);
   #endif
 }
 
 /**
-* \internal
-* \brief Reallocates an aligned block of memory.
-* \throws std::bad_alloc on allocation failure
-**/
-inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
+  * \internal
+  * \brief Reallocates an aligned block of memory.
+  * \throws std::bad_alloc on allocation failure
+  */
+inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
 {
-  EIGEN_UNUSED_VARIABLE(old_size);
+  EIGEN_UNUSED_VARIABLE(old_size)
 
   void *result;
-#if !EIGEN_ALIGN
+#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
   result = std::realloc(ptr,new_size);
-#elif EIGEN_HAS_POSIX_MEMALIGN
-  result = generic_aligned_realloc(ptr,new_size,old_size);
-#elif EIGEN_HAS_MM_MALLOC
-  // The defined(_mm_free) is just here to verify that this MSVC version
-  // implements _mm_malloc/_mm_free based on the corresponding _aligned_
-  // functions. This may not always be the case and we just try to be safe.
-  #if EIGEN_OS_WIN_STRICT && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
-  #else
-    result = generic_aligned_realloc(ptr,new_size,old_size);
-  #endif
-#elif EIGEN_OS_WIN_STRICT
-  result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@@ -256,16 +236,18 @@
 /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
   * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
   */
-template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size)
 {
   return aligned_malloc(size);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
 {
   check_that_malloc_is_allowed();
 
-  void *result = std::malloc(size);
+  EIGEN_USING_STD(malloc)
+  void *result = malloc(size);
+
   if(!result && size)
     throw_std_bad_alloc();
   return result;
@@ -279,15 +261,16 @@
 
 template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
-  std::free(ptr);
+  EIGEN_USING_STD(free)
+  free(ptr);
 }
 
-template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
+template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
 {
   return aligned_realloc(ptr, new_size, old_size);
 }
 
-template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
+template<> inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
 {
   return std::realloc(ptr, new_size);
 }
@@ -296,33 +279,43 @@
 *** Construction/destruction of array elements                             ***
 *****************************************************************************/
 
-/** \internal Constructs the elements of an array.
-  * The \a size parameter tells on how many objects to call the constructor of T.
-  */
-template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
-{
-  for (size_t i=0; i < size; ++i) ::new (ptr + i) T;
-  return ptr;
-}
-
 /** \internal Destructs the elements of an array.
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size)
 {
   // always destruct an array starting from the end.
   if(ptr)
     while(size) ptr[--size].~T();
 }
 
+/** \internal Constructs the elements of an array.
+  * The \a size parameter tells on how many objects to call the constructor of T.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
+{
+  std::size_t i;
+  EIGEN_TRY
+  {
+      for (i = 0; i < size; ++i) ::new (ptr + i) T;
+      return ptr;
+  }
+  EIGEN_CATCH(...)
+  {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return NULL;
+}
+
 /*****************************************************************************
 *** Implementation of aligned new/delete-like functions                    ***
 *****************************************************************************/
 
 template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size)
 {
-  if(size > size_t(-1) / sizeof(T))
+  if(size > std::size_t(-1) / sizeof(T))
     throw_std_bad_alloc();
 }
 
@@ -330,67 +323,57 @@
   * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
   * The default constructor of T is called.
   */
-template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    aligned_free(result);
+    EIGEN_THROW;
+  }
+  return result;
 }
 
-template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
-}
-
-template<typename T> EIGEN_DEVICE_FUNC inline T* allocate_uvm(size_t size)
-{
-#if defined(EIGEN_USE_GPU) && defined(__CUDA_ARCH__)
-  return (T*)malloc(size);
-#elif defined(EIGEN_USE_GPU) && defined(__NVCC__)
-  T* result = NULL;
-  if (cudaMallocManaged(&result, size) != cudaSuccess) {
-    throw_std_bad_alloc();
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    conditional_aligned_free<Align>(result);
+    EIGEN_THROW;
   }
   return result;
-#else
-  return reinterpret_cast<T*>(conditional_aligned_malloc<true>(sizeof(T)*size));
-#endif
-}
-
-template<typename T> EIGEN_DEVICE_FUNC void deallocate_uvm(T* ptr)
-{
-#if defined(EIGEN_USE_GPU) && defined(__CUDA_ARCH__)
-  free(ptr);
-#elif defined(EIGEN_USE_GPU) && defined(__NVCC__)
-  if (cudaFree(ptr) != cudaSuccess) {
-    throw_std_bad_alloc();
-  }
-#else
-  return conditional_aligned_free<true>(ptr);
-#endif
 }
 
 /** \internal Deletes objects constructed with aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> EIGEN_DEVICE_FUNC  inline void aligned_delete(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
-  internal::aligned_free(ptr);
+  Eigen::internal::aligned_free(ptr);
 }
 
 /** \internal Deletes objects constructed with conditional_aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
   conditional_aligned_free<Align>(ptr);
 }
 
-template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size)
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
@@ -398,48 +381,43 @@
     destruct_elements_of_array(pts+new_size, old_size-new_size);
   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
   if(new_size > old_size)
-    construct_elements_of_array(result+old_size, new_size-old_size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
 
-template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size)
 {
+  if(size==0)
+    return 0; // short-cut. Also fixes Bug 884
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
   if(NumTraits<T>::RequireInitialization)
-    construct_elements_of_array(result, size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result, size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
-template<typename T, bool Align, bool UseUVM> EIGEN_DEVICE_FUNC inline T* conditional_managed_new_auto(size_t size)
-{
-  check_size_for_overflow<T>(size);
-  T *result;
-  if (UseUVM) {
-    result = allocate_uvm<T>(size*sizeof(T));
-  }
-  else {
-    result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
-  }
-  if(NumTraits<T>::RequireInitialization)
-    construct_elements_of_array(result, size);
-  return result;
-}
-
-template<typename T, bool Align, bool UseUVM> EIGEN_DEVICE_FUNC inline void conditional_managed_delete_auto(T* ptr, size_t size)
-{
-  if(NumTraits<T>::RequireInitialization)
-    destruct_elements_of_array<T>(ptr, size);
-  if (UseUVM) {
-    deallocate_uvm(ptr);
-  }
-  else {
-    conditional_aligned_free<Align>(ptr);
-  }
-}
-
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
@@ -447,11 +425,21 @@
     destruct_elements_of_array(pts+new_size, old_size-new_size);
   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
   if(NumTraits<T>::RequireInitialization && (new_size > old_size))
-    construct_elements_of_array(result+old_size, new_size-old_size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
-template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size)
 {
   if(NumTraits<T>::RequireInitialization)
     destruct_elements_of_array<T>(ptr, size);
@@ -460,48 +448,58 @@
 
 /****************************************************************************/
 
-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
   *
+  * \tparam Alignment requested alignment in Bytes.
   * \param array the address of the start of the array
   * \param size the size of the array
   *
-  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
-  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+  * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
   * packet size for the given scalar type is 1, then everything is considered well-aligned.
   *
-  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
-  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
-  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
+  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
   * example with Scalar=double on certain 32-bit platforms, see bug #79.
   *
   * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  * \sa first_default_aligned()
   */
-template<typename Scalar, typename Index>
-inline Index first_aligned(const Scalar* array, Index size)
+template<int Alignment, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
 {
-  enum { PacketSize = packet_traits<Scalar>::size,
-         PacketAlignedMask = PacketSize-1
-  };
+  const Index ScalarSize = sizeof(Scalar);
+  const Index AlignmentSize = Alignment / ScalarSize;
+  const Index AlignmentMask = AlignmentSize-1;
 
-  if(PacketSize==1)
+  if(AlignmentSize<=1)
   {
-    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
-    // of the array have the same alignment.
+    // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
+    // so that all elements of the array have the same alignment.
     return 0;
   }
-  else if(size_t(array) & (sizeof(Scalar)-1))
+  else if( (UIntPtr(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
   {
-    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
     // Consequently, no element of the array is well aligned.
     return size;
   }
   else
   {
-    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, size);
+    Index first = (AlignmentSize - (Index((UIntPtr(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
+    return (first < size) ? first : size;
   }
 }
 
+/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
+   * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size)
+{
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
+}
+
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
   */
 template<typename Index>
@@ -520,12 +518,18 @@
 }
 
 template<typename T> struct smart_copy_helper<T,true> {
-  static inline EIGEN_DEVICE_FUNC void run(const T* start, const T* end, T* target)
-  { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
+  {
+    IntPtr size = IntPtr(end)-IntPtr(start);
+    if(size==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    EIGEN_USING_STD(memcpy)
+    memcpy(target, start, size);
+  }
 };
 
 template<typename T> struct smart_copy_helper<T,false> {
-  static inline EIGEN_DEVICE_FUNC void run(const T* start, const T* end, T* target)
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
   { std::copy(start, end, target); }
 };
 
@@ -534,29 +538,45 @@
 
 template<typename T> void smart_memmove(const T* start, const T* end, T* target)
 {
-    smart_memmove_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+  smart_memmove_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
 }
 
 template<typename T> struct smart_memmove_helper<T,true> {
-    static inline void run(const T* start, const T* end, T* target)
-    { std::memmove(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+  static inline void run(const T* start, const T* end, T* target)
+  {
+    IntPtr size = IntPtr(end)-IntPtr(start);
+    if(size==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    std::memmove(target, start, size);
+  }
 };
 
 template<typename T> struct smart_memmove_helper<T,false> {
-    static inline void run(const T* start, const T* end, T* target)
+  static inline void run(const T* start, const T* end, T* target)
+  {
+    if (UIntPtr(target) < UIntPtr(start))
     {
-        if (uintptr_t(target) < uintptr_t(start))
-        {
-            std::copy(start, end, target);
-        }
-        else
-        {
-            std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
-            std::copy_backward(start, end, target + count);
-        }
+      std::copy(start, end, target);
     }
+    else
+    {
+      std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
+      std::copy_backward(start, end, target + count);
+    }
+  }
 };
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::move(start, end, target);
+}
+#else
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::copy(start, end, target);
+}
+#endif
 
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
@@ -564,17 +584,26 @@
 
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
-  #if (defined __linux__) || (defined __APPLE__)
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
+  #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
-  #elif defined(_MSC_VER)
+  #elif EIGEN_COMP_MSVC
     #define EIGEN_ALLOCA _alloca
   #endif
 #endif
 
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
+// the compiler still emits bad code because stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+  #undef EIGEN_ALLOCA
+#endif
+
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
-template<typename T> class aligned_stack_memory_handler
+template<typename T> class aligned_stack_memory_handler : noncopyable
 {
   public:
     /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
@@ -583,12 +612,14 @@
      * In this case, the buffer elements will also be destructed when this handler will be destructed.
      * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
      **/
-    aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc)
+    EIGEN_DEVICE_FUNC
+    aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
       : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
         Eigen::internal::construct_elements_of_array(m_ptr, size);
     }
+    EIGEN_DEVICE_FUNC
     ~aligned_stack_memory_handler()
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -598,16 +629,96 @@
     }
   protected:
     T* m_ptr;
-    size_t m_size;
+    std::size_t m_size;
     bool m_deallocate;
 };
 
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+         >
+struct local_nested_eval_wrapper
+{
+  static const bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+  {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr==0);
+  }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+  static const bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+      m_deallocate(ptr==0)
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
+
+  EIGEN_DEVICE_FUNC
+  ~local_nested_eval_wrapper()
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if(m_deallocate)
+      Eigen::internal::aligned_free(object.data());
+  }
+
+private:
+  bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
+template<typename T> class scoped_array : noncopyable
+{
+  T* m_ptr;
+public:
+  explicit scoped_array(std::ptrdiff_t size)
+  {
+    m_ptr = new T[size];
+  }
+  ~scoped_array()
+  {
+    delete[] m_ptr;
+  }
+  T& operator[](std::ptrdiff_t i) { return m_ptr[i]; }
+  const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; }
+  T* &ptr() { return m_ptr; }
+  const T* ptr() const { return m_ptr; }
+  operator const T*() const { return m_ptr; }
+};
+
+template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
+{
+  std::swap(a.ptr(),b.ptr());
+}
+
 } // end namespace internal
 
 /** \internal
-  * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
-  * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
-  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+  *
+  * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+  * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
   * The allocated buffer is automatically deleted when exiting the scope of this declaration.
   * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
   * Here is an example:
@@ -618,13 +729,23 @@
   * }
   * \endcode
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+  *
+  * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
+  * \code
+  *   typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
+  * \endcode
+  * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+  * This is accomplished through alloca if this later is supported and if the required number of bytes
+  * is below EIGEN_STACK_ALLOCATION_LIMIT.
   */
 #ifdef EIGEN_ALLOCA
-  // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled.
-#if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
+
+  #if EIGEN_DEFAULT_ALIGN_BYTES>0
+    // We always manually re-align the result of EIGEN_ALLOCA.
+    // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((internal::UIntPtr(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)))
   #else
-    #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
   #endif
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -635,6 +756,13 @@
                     : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
 
+
+  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
 #else
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -642,6 +770,9 @@
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
 
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
+
 #endif
 
 
@@ -649,40 +780,57 @@
 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
 *****************************************************************************/
 
-#if EIGEN_ALIGN
-  #ifdef EIGEN_EXCEPTIONS
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        try { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
-        catch (...) { return 0; } \
-        return 0; \
-      }
-  #else
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
-      }
-  #endif
+#if EIGEN_HAS_CXX17_OVERALIGN
 
+// C++17 -> no need to bother about alignment anymore :)
+
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)
+
+#else
+
+// HIP does not support new/delete on device.
+#if EIGEN_MAX_ALIGN_BYTES!=0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
+      void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
+        EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
+        EIGEN_CATCH (...) { return 0; } \
+      }
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
-      void *operator new(size_t size) { \
+      EIGEN_DEVICE_FUNC \
+      void *operator new(std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
-      void *operator new[](size_t size) { \
+      EIGEN_DEVICE_FUNC \
+      void *operator new[](std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
-      void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
-      void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
       /* in-place new and delete. since (at least afaik) there is no actual   */ \
       /* memory allocated we can safely let the default implementation handle */ \
       /* this particular case. */ \
-      static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
-      static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \
-      void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
-      void operator delete[](void * memory, void *ptr) throw() { return ::operator delete[](memory,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
       /* nothrow-new (returns zero instead of std::bad_alloc) */ \
       EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void operator delete(void *ptr, const std::nothrow_t&) throw() { \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
         Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
       } \
       typedef void eigen_aligned_operator_new_marker_type;
@@ -691,15 +839,29 @@
 #endif
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)                        \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(                                                             \
+        ((Size)!=Eigen::Dynamic) &&                                                                    \
+        (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES  )==0)) ||    \
+         ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) ||    \
+         ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0))   )))
+
+#endif
 
 /****************************************************************************/
 
 /** \class aligned_allocator
 * \ingroup Core_Module
 *
-* \brief STL compatible allocator to use with with 16 byte aligned types
+* \brief STL compatible allocator to use with types requiring a non standrad alignment.
+*
+* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+* By default, it will thus provide at least 16 bytes alignment and more in following cases:
+*  - 32 bytes alignment if AVX is enabled.
+*  - 64 bytes alignment if AVX512 is enabled.
+*
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* \link TopicPreprocessorDirectivesPerformance there \endlink.
 *
 * Example:
 * \code
@@ -710,13 +872,13 @@
 * std::map< int, Vector3f > my_map_vec3;
 * \endcode
 *
-* \sa \ref TopicStlContainers.
+* \sa \blank \ref TopicStlContainers.
 */
 template<class T>
 class aligned_allocator : public std::allocator<T>
 {
 public:
-  typedef size_t          size_type;
+  typedef std::size_t     size_type;
   typedef std::ptrdiff_t  difference_type;
   typedef T*              pointer;
   typedef const T*        const_pointer;
@@ -739,6 +901,15 @@
 
   ~aligned_allocator() {}
 
+  #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
+  // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
+  // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
+  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
+  size_type max_size() const {
+    return (std::numeric_limits<std::ptrdiff_t>::max)()/sizeof(T);
+  }
+  #endif
+
   pointer allocate(size_type num, const void* /*hint*/ = 0)
   {
     internal::check_size_for_overflow<T>(num);
@@ -780,9 +951,9 @@
 
 #ifdef EIGEN_CPUID
 
-inline bool cpuid_is_vendor(int abcd[4], const char* vendor)
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
 {
-  return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2];
+  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
 }
 
 inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
@@ -901,20 +1072,32 @@
 {
   if(max_std_funcs>=4)
     queryCacheSizes_intel_direct(l1,l2,l3);
-  else
+  else if(max_std_funcs>=2)
     queryCacheSizes_intel_codes(l1,l2,l3);
+  else
+    l1 = l2 = l3 = 0;
 }
 
 inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
 {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000005,0);
-  l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
-  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000006,0);
-  l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
-  l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  
+  // First query the max supported function.
+  EIGEN_CPUID(abcd,0x80000000,0);
+  if(static_cast<numext::uint32_t>(abcd[0]) >= static_cast<numext::uint32_t>(0x80000006))
+  {
+    EIGEN_CPUID(abcd,0x80000005,0);
+    l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd,0x80000006,0);
+    l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
+    l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  }
+  else
+  {
+    l1 = l2 = l3 = 0;
+  }
 }
 #endif
 
@@ -924,13 +1107,16 @@
 {
   #ifdef EIGEN_CPUID
   int abcd[4];
+  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
+  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"
 
   // identify the CPU vendor
   EIGEN_CPUID(abcd,0x0,0);
-  int max_std_funcs = abcd[1];
-  if(cpuid_is_vendor(abcd,"GenuineIntel"))
+  int max_std_funcs = abcd[0];
+  if(cpuid_is_vendor(abcd,GenuineIntel))
     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
-  else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!"))
+  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
     queryCacheSizes_amd(l1,l2,l3);
   else
     // by default let's use Intel's API
@@ -961,13 +1147,6 @@
   return l1;
 }
 
-inline int queryL2CacheSize()
-{
-  int l1, l2(-1), l3;
-  queryCacheSizes(l1,l2,l3);
-  return l2;
-}
-
 /** \internal
  * \returns the size in Bytes of the L2 or L3 cache if this later is present */
 inline int queryTopLevelCacheSize()

diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
old mode 100644
new mode 100755
index 1f67c01..81ae2a3
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,12 +11,60 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
-#if defined(__CUDA_ARCH__)
-#include <math_constants.h>
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+ #include <cfloat>
+
+ #if defined(EIGEN_CUDA_ARCH)
+  #include <math_constants.h>
+ #endif
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+  #endif
+
+#endif
+
+// Recent versions of ICC require <cstdint> for pointer types below.
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11)
+
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT
+#include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint8_t  uint8_t;
+typedef std::int8_t   int8_t;
+typedef std::uint16_t uint16_t;
+typedef std::int16_t  int16_t;
+typedef std::uint32_t uint32_t;
+typedef std::int32_t  int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t  int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint8_t  uint8_t;
+typedef ::int8_t   int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t  int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t  int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t  int64_t;
+}
+}
 #endif
 
 namespace Eigen {
 
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+
 /**
  * \brief The Index type as used for the API.
  * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
@@ -25,7 +73,6 @@
 
 typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
 
-
 namespace internal {
 
 /** \internal
@@ -35,18 +82,35 @@
   * we however don't want to add a dependency to Boost.
   */
 
+// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
+// and older versions do not provide *intptr_t types.
+#if EIGEN_ICC_NEEDS_CSTDINT
+typedef std::intptr_t  IntPtr;
+typedef std::uintptr_t UIntPtr;
+#else
+typedef std::ptrdiff_t IntPtr;
+typedef std::size_t UIntPtr;
+#endif
+#undef EIGEN_ICC_NEEDS_CSTDINT
+
 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
 
+template<bool Condition>
+struct bool_constant;
+
+template<>
+struct bool_constant<true> : true_type {};
+
+template<>
+struct bool_constant<false> : false_type {};
+
 template<bool Condition, typename Then, typename Else>
 struct conditional { typedef Then type; };
 
 template<typename Then, typename Else>
 struct conditional <false, Then, Else> { typedef Else type; };
 
-template<typename T, typename U> struct is_same { enum { value = 0 }; };
-template<typename T> struct is_same<T,T> { enum { value = 1 }; };
-
 template<typename T> struct remove_reference { typedef T type; };
 template<typename T> struct remove_reference<T&> { typedef T type; };
 
@@ -81,6 +145,66 @@
 template<> struct is_arithmetic<signed long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
 
+template<typename T, typename U> struct is_same { enum { value = 0 }; };
+template<typename T> struct is_same<T,T> { enum { value = 1 }; };
+
+template< class T >
+struct is_void : is_same<void, typename remove_const<T>::type> {};
+
+#if EIGEN_HAS_CXX11
+template<> struct is_arithmetic<signed long long>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
+using std::is_integral;
+#else
+template<typename T> struct is_integral               { enum { value = false }; };
+template<> struct is_integral<bool>                   { enum { value = true }; };
+template<> struct is_integral<char>                   { enum { value = true }; };
+template<> struct is_integral<signed char>            { enum { value = true }; };
+template<> struct is_integral<unsigned char>          { enum { value = true }; };
+template<> struct is_integral<signed short>           { enum { value = true }; };
+template<> struct is_integral<unsigned short>         { enum { value = true }; };
+template<> struct is_integral<signed int>             { enum { value = true }; };
+template<> struct is_integral<unsigned int>           { enum { value = true }; };
+template<> struct is_integral<signed long>            { enum { value = true }; };
+template<> struct is_integral<unsigned long>          { enum { value = true }; };
+#if EIGEN_COMP_MSVC
+template<> struct is_integral<signed __int64>         { enum { value = true }; };
+template<> struct is_integral<unsigned __int64>       { enum { value = true }; };
+#endif
+#endif
+
+#if EIGEN_HAS_CXX11
+using std::make_unsigned;
+#else
+// TODO: Possibly improve this implementation of make_unsigned.
+// It is currently used only by
+// template<typename Scalar> struct random_default_impl<Scalar, false, true>.
+template<typename> struct make_unsigned;
+template<> struct make_unsigned<char>             { typedef unsigned char type; };
+template<> struct make_unsigned<signed char>      { typedef unsigned char type; };
+template<> struct make_unsigned<unsigned char>    { typedef unsigned char type; };
+template<> struct make_unsigned<signed short>     { typedef unsigned short type; };
+template<> struct make_unsigned<unsigned short>   { typedef unsigned short type; };
+template<> struct make_unsigned<signed int>       { typedef unsigned int type; };
+template<> struct make_unsigned<unsigned int>     { typedef unsigned int type; };
+template<> struct make_unsigned<signed long>      { typedef unsigned long type; };
+template<> struct make_unsigned<unsigned long>    { typedef unsigned long type; };
+#if EIGEN_COMP_MSVC
+template<> struct make_unsigned<signed __int64>   { typedef unsigned __int64 type; };
+template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
+#endif
+
+// Some platforms define int64_t as `long long` even for C++03, where
+// `long long` is not guaranteed by the standard. In this case we are missing
+// the definition for make_unsigned. If we just define it, we run into issues
+// where `long long` doesn't exist in some compilers for C++03. We therefore add
+// the specialization for these platforms only.
+#if EIGEN_OS_MAC || EIGEN_COMP_MINGW
+template<> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
+template<> struct make_unsigned<long long>          { typedef unsigned long long type; };
+#endif
+#endif
+
 template <typename T> struct add_const { typedef const T type; };
 template <typename T> struct add_const<T&> { typedef T& type; };
 
@@ -93,6 +217,56 @@
 template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
 template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
 
+#if EIGEN_HAS_CXX11
+
+using std::is_convertible;
+
+#else
+
+template<typename From, typename To>
+struct is_convertible_impl
+{
+private:
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no  {int a[2];};
+
+  template<typename T>
+  static yes test(T, int);
+
+  template<typename T>
+  static no  test(any_conversion, ...);
+
+public:
+  static typename internal::remove_reference<From>::type* ms_from;
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+};
+
+template<typename From, typename To>
+struct is_convertible
+{
+  enum { value = is_convertible_impl<From,To>::value };
+};
+
+template<typename T>
+struct is_convertible<T,T&> { enum { value = false }; };
+
+template<typename T>
+struct is_convertible<const T,const T&> { enum { value = true }; };
+
+#endif
+
 /** \internal Allows to enable/disable an overload
   * according to a compile time condition.
   */
@@ -101,105 +275,271 @@
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
-#if defined(__CUDA_ARCH__)
- 
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
+#if !defined(__FLT_EPSILON__)
+#define __FLT_EPSILON__ FLT_EPSILON
+#define __DBL_EPSILON__ DBL_EPSILON
+#endif
+
 namespace device {
 
 template<typename T> struct numeric_limits
 {
   EIGEN_DEVICE_FUNC
-  static T epsilon() { return 0; }
-  static T max() { assert(false && "Max not suppoted for this type"); }
-  static T lowest() { assert(false && "Lowest not suppoted for this type"); }
+  static EIGEN_CONSTEXPR T epsilon() { return 0; }
+  static T (max)() { assert(false && "Highest not supported for this type"); }
+  static T (min)() { assert(false && "Lowest not supported for this type"); }
   static T infinity() { assert(false && "Infinity not supported for this type"); }
   static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
 };
 template<> struct numeric_limits<float>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static float epsilon() { return __FLT_EPSILON__; }
   EIGEN_DEVICE_FUNC
-  static float max() { return CUDART_MAX_NORMAL_F; }
+  static float (max)() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_MAX_NORMAL_F;
+  #else
+    return HIPRT_MAX_NORMAL_F;
+  #endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static float (min)() { return FLT_MIN; }
   EIGEN_DEVICE_FUNC
-  static float lowest() { return -CUDART_MAX_NORMAL_F; }
+  static float infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF_F;
+  #else
+    return HIPRT_INF_F;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
-  static float infinity() { return CUDART_INF_F; }
-  EIGEN_DEVICE_FUNC
-  static float quiet_NaN() { return CUDART_NAN_F; }
+  static float quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN_F;
+  #else
+    return HIPRT_NAN_F;
+  #endif
+  }
 };
 template<> struct numeric_limits<double>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double epsilon() { return __DBL_EPSILON__; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static double (max)() { return DBL_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static double (min)() { return DBL_MIN; }
   EIGEN_DEVICE_FUNC
-  static double max() { return CUDART_INF; }
+  static double infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF;
+  #else
+    return HIPRT_INF;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
-  static double lowest() { return -CUDART_INF; }
-  EIGEN_DEVICE_FUNC
-  static double infinity() { return CUDART_INF; }
-  EIGEN_DEVICE_FUNC
-  static double quiet_NaN() { return CUDART_NAN; }
+  static double quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN;
+  #else
+    return HIPRT_NAN;
+  #endif
+  }
 };
 template<> struct numeric_limits<int>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
-  static int max() { return INT_MAX; }
-  EIGEN_DEVICE_FUNC
-  static int lowest() { return INT_MIN; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int (max)() { return INT_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int (min)() { return INT_MIN; }
+};
+template<> struct numeric_limits<unsigned int>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned int (max)() { return UINT_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned int (min)() { return 0; }
 };
 template<> struct numeric_limits<long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
-  static long max() { return LONG_MAX; }
-  EIGEN_DEVICE_FUNC
-  static long lowest() { return LONG_MIN; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long (max)() { return LONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long (min)() { return LONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long (max)() { return ULONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long (min)() { return 0; }
 };
 template<> struct numeric_limits<long long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
-  static long long max() { return LLONG_MAX; }
-  EIGEN_DEVICE_FUNC
-  static long long lowest() { return LLONG_MIN; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long long (max)() { return LLONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long long (min)() { return LLONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long long>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long long (max)() { return ULLONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long long (min)() { return 0; }
+};
+template<> struct numeric_limits<bool>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static bool epsilon() { return false; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static bool (max)() { return true; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
+  static bool (min)() { return false; }
 };
 
 }
 
-#endif
+#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 
 /** \internal
-  * A base class do disable default copy ctor and copy assignement operator.
+  * A base class do disable default copy ctor and copy assignment operator.
   */
 class noncopyable
 {
-  noncopyable(const noncopyable&);
-  const noncopyable& operator=(const noncopyable&);
+  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
+  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
 protected:
-  noncopyable() {}
-  ~noncopyable() {}
+  EIGEN_DEVICE_FUNC noncopyable() {}
+  EIGEN_DEVICE_FUNC ~noncopyable() {}
 };
 
+/** \internal
+  * Provides access to the number of elements in the object of as a compile-time constant expression.
+  * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
+  *
+  * Similar to std::tuple_size, but more general.
+  *
+  * It currently supports:
+  *  - any types T defining T::SizeAtCompileTime
+  *  - plain C arrays as T[N]
+  *  - std::array (c++11)
+  *  - some internal types such as SingleRange and AllRange
+  *
+  * The second template parameter eases SFINAE-based specializations.
+  */
+template<typename T, typename EnableIf = void> struct array_size {
+  enum { value = Dynamic };
+};
+
+template<typename T> struct array_size<T,typename internal::enable_if<((T::SizeAtCompileTime&0)==0)>::type> {
+  enum { value = T::SizeAtCompileTime };
+};
+
+template<typename T, int N> struct array_size<const T (&)[N]> {
+  enum { value = N };
+};
+template<typename T, int N> struct array_size<T (&)[N]> {
+  enum { value = N };
+};
+
+#if EIGEN_HAS_CXX11
+template<typename T, std::size_t N> struct array_size<const std::array<T,N> > {
+  enum { value = N };
+};
+template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
+  enum { value = N };
+};
+#endif
 
 /** \internal
-  * Convenient struct to get the result type of a unary or binary functor.
+  * Analogue of the std::size free function.
+  * It returns the size of the container or view \a x of type \c T
   *
-  * It supports both the current STL mechanism (using the result_type member) as well as
-  * upcoming next STL generation (using a templated result member).
-  * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
+  * It currently supports:
+  *  - any types T defining a member T::size() const
+  *  - plain C arrays as T[N]
+  *
   */
-template<typename T> struct result_of {};
+template<typename T>
+EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); }
+
+template<typename T,std::size_t N>
+EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; }
+
+/** \internal
+  * Convenient struct to get the result type of a nullary, unary, binary, or
+  * ternary functor.
+  * 
+  * Pre C++11:
+  * Supports both a Func::result_type member and templated
+  * Func::result<Func(ArgTypes...)>::type member.
+  * 
+  * If none of these members is provided, then the type of the first
+  * argument is returned.
+  * 
+  * Post C++11:
+  * This uses std::result_of. However, note the `type` member removes
+  * const and converts references/pointers to their corresponding value type.
+  */
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template<typename T> struct result_of;
+
+template<typename F, typename... ArgTypes>
+struct result_of<F(ArgTypes...)> {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#elif EIGEN_HAS_STD_RESULT_OF
+template<typename T> struct result_of {
+  typedef typename std::result_of<T>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename T> struct result_of { };
 
 struct has_none {int a[1];};
 struct has_std_result_type {int a[2];};
 struct has_tr1_result {int a[3];};
 
+template<typename Func, int SizeOf>
+struct nullary_result_of_select {};
+
+template<typename Func>
+struct nullary_result_of_select<Func, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
+
+template<typename Func>
+struct nullary_result_of_select<Func, sizeof(has_tr1_result)> {typedef typename Func::template result<Func()>::type type;};
+
+template<typename Func>
+struct result_of<Func()> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T()>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename nullary_result_of_select<Func, FunctorType>::type type;
+};
+
 template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
-struct unary_result_of_select {typedef ArgType type;};
+struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
 
 template<typename Func, typename ArgType>
 struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
@@ -210,10 +550,10 @@
 template<typename Func, typename ArgType>
 struct result_of<Func(ArgType)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
@@ -221,7 +561,7 @@
 };
 
 template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
-struct binary_result_of_select {typedef ArgType0 type;};
+struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
 
 template<typename Func, typename ArgType0, typename ArgType1>
 struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
@@ -234,10 +574,10 @@
 template<typename Func, typename ArgType0, typename ArgType1>
 struct result_of<Func(ArgType0,ArgType1)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
@@ -268,6 +608,46 @@
     typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
 };
 
+#endif
+
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template<typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#elif EIGEN_HAS_CXX11
+template<typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename result_of<F(ArgTypes...)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename F, typename ArgType0 = void, typename ArgType1 = void, typename ArgType2 = void>
+struct invoke_result {
+  typedef typename result_of<F(ArgType0, ArgType1, ArgType2)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F>
+struct invoke_result<F, void, void, void> {
+  typedef typename result_of<F()>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F, typename ArgType0>
+struct invoke_result<F, ArgType0, void, void> {
+  typedef typename result_of<F(ArgType0)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F, typename ArgType0, typename ArgType1>
+struct invoke_result<F, ArgType0, ArgType1, void> {
+  typedef typename result_of<F(ArgType0, ArgType1)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#endif
+
 struct meta_yes { char a[1]; };
 struct meta_no  { char a[2]; };
 
@@ -275,42 +655,41 @@
 template <typename T>
 struct has_ReturnType
 {
-  template <typename C> static meta_yes testFunctor(typename C::ReturnType const *);
-  template <typename C> static meta_no testFunctor(...);
+  template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
+  template <typename C> static meta_no  testFunctor(...);
 
-  enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
+  enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-template<typename T> const T& return_ref();
+template<typename T> const T* return_ptr();
 
-template <typename T>
+template <typename T, typename IndexType=Index>
 struct has_nullary_operator
 {
-  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()())>0)>::type * = 0);
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()())>0)>::type * = 0);
   static meta_no testFunctor(...);
 
   enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-template <typename T>
+template <typename T, typename IndexType=Index>
 struct has_unary_operator
 {
-  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()(Index(0)))>0)>::type * = 0);
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0)))>0)>::type * = 0);
   static meta_no testFunctor(...);
 
   enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-template <typename T>
+template <typename T, typename IndexType=Index>
 struct has_binary_operator
 {
-  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()(Index(0),Index(0)))>0)>::type * = 0);
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
   static meta_no testFunctor(...);
 
   enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
   * Usage example: \code meta_sqrt<1023>::ret \endcode
   */
@@ -334,100 +713,98 @@
 template<int Y, int InfX, int SupX>
 class meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
 
+
+/** \internal Computes the least common multiple of two positive integer A and B
+  * at compile-time. 
+  */
+template<int A, int B, int K=1, bool Done = ((A*K)%B)==0, bool Big=(A>=B)>
+struct meta_least_common_multiple
+{
+  enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
+};
+template<int A, int B, int K, bool Done>
+struct meta_least_common_multiple<A,B,K,Done,false>
+{
+  enum { ret = meta_least_common_multiple<B,A,K>::ret };
+};
+template<int A, int B, int K>
+struct meta_least_common_multiple<A,B,K,true,true>
+{
+  enum { ret = A*K };
+};
+
+
 /** \internal determines whether the product of two numeric types is allowed and what the return type is */
 template<typename T, typename U> struct scalar_product_traits
 {
   enum { Defined = 0 };
 };
 
-template<typename T> struct scalar_product_traits<T,T>
-{
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef T ReturnType;
-};
-
-template<typename T> struct scalar_product_traits<T, const T>
-{
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef T ReturnType;
-};
-
-template<typename T> struct scalar_product_traits<const T, T>
-{
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef T ReturnType;
-};
-
-template<typename T> struct scalar_product_traits<T,std::complex<T> >
-{
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
-};
-
-template<typename T> struct scalar_product_traits<std::complex<T>, T>
-{
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
-};
-
 // FIXME quick workaround around current limitation of result_of
 // template<typename Scalar, typename ArgType0, typename ArgType1>
 // struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
 // typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
 // };
 
-template<typename T> struct is_diagonal
-{ enum { ret = false }; };
-
-template<typename T> struct is_diagonal<DiagonalBase<T> >
-{ enum { ret = true }; };
-
-template<typename T> struct is_diagonal<DiagonalWrapper<T> >
-{ enum { ret = true }; };
-
-template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
-{ enum { ret = true }; };
-
-/** \internal Computes the least common multiple of two positive integer A and B
-  * at compile-time. It implements a naive algorithm testing all multiples of A.
-  * It thus works better if A>=B.
+/** \internal Obtains a POD type suitable to use as storage for an object of a size
+  * of at most Len bytes, aligned as specified by \c Align.
   */
-template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
-struct meta_least_common_multiple
-{
-  enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
-};
-template<int A, int B, int K>
-struct meta_least_common_multiple<A,B,K,true>
-{
-  enum { ret = A*K };
+template<unsigned Len, unsigned Align>
+struct aligned_storage {
+  struct type {
+    EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
+  };
 };
 
 } // end namespace internal
 
 namespace numext {
-  
-#if defined(__CUDA_ARCH__)
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
 #else
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif
 
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
+using internal::device::numeric_limits;
+#else
+using std::numeric_limits;
+#endif
+
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+EIGEN_DEVICE_FUNC
+T div_ceil(const T &a, const T &b)
+{
+  return (a+b-1) / b;
+}
+
+// The aim of the following functions is to bypass -Wfloat-equal warnings
+// when we really want a strict equality comparison on floating points.
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const X& x,const Y& y) { return x == y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
+#endif
+
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const X& x,const Y& y) { return x != y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
+#endif
+
 } // end namespace numext
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/util/NonMPL2.h b/Eigen/src/Core/util/NonMPL2.h
new file mode 100644
index 0000000..1af67cf
--- /dev/null
+++ b/Eigen/src/Core/util/NonMPL2.h

@@ -0,0 +1,3 @@
+#ifdef EIGEN_MPL2_ONLY
+#error Including non-MPL2 code in EIGEN_MPL2_ONLY mode
+#endif

diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 86b60f5..1ce6fd1 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h

@@ -1,4 +1,8 @@
-#ifdef EIGEN_WARNINGS_DISABLED
+#ifdef EIGEN_WARNINGS_DISABLED_2
+// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet!
+#  undef EIGEN_WARNINGS_DISABLED_2
+
+#elif defined(EIGEN_WARNINGS_DISABLED)
 #undef EIGEN_WARNINGS_DISABLED
 
 #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
@@ -8,7 +12,7 @@
     #pragma warning pop
   #elif defined __clang__
     #pragma clang diagnostic pop
-  #elif defined __GNUC__ && __GNUC__>=6
+  #elif defined __GNUC__  &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
     #pragma GCC diagnostic pop
   #endif
 

diff --git a/Eigen/src/Core/util/ReshapedHelper.h b/Eigen/src/Core/util/ReshapedHelper.h
new file mode 100644
index 0000000..4124321
--- /dev/null
+++ b/Eigen/src/Core/util/ReshapedHelper.h

@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_RESHAPED_HELPER_H
+#define EIGEN_RESHAPED_HELPER_H
+
+namespace Eigen {
+
+enum AutoSize_t   { AutoSize };
+const int AutoOrder = 2;
+
+namespace internal {
+
+template<typename SizeType,typename OtherSize, int TotalSize>
+struct get_compiletime_reshape_size {
+  enum { value = get_fixed_value<SizeType>::value };
+};
+
+template<typename SizeType>
+Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) {
+  return internal::get_runtime_value(size);
+}
+
+template<typename OtherSize, int TotalSize>
+struct get_compiletime_reshape_size<AutoSize_t,OtherSize,TotalSize> {
+  enum {
+    other_size = get_fixed_value<OtherSize>::value,
+    value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size };
+};
+
+inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) {
+  return total/other;
+}
+
+template<int Flags, int Order>
+struct get_compiletime_reshape_order {
+  enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order };
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_RESHAPED_HELPER_H

diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 461c52f..c45de59 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h

@@ -24,9 +24,10 @@
  *
  */
 
+#ifndef EIGEN_STATIC_ASSERT
 #ifndef EIGEN_NO_STATIC_ASSERT
 
-  #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600)
+  #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600))
 
     // if native static_assert is enabled, let's use it
     #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
@@ -44,54 +45,68 @@
     struct static_assertion<true>
     {
       enum {
-        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX,
-        YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES,
-        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES,
-        THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
-        THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
-        THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
-        YOU_MADE_A_PROGRAMMING_MISTAKE,
-        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
-        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,
-        YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR,
-        YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR,
-        UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC,
-        THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES,
-        FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED,
-        NUMERIC_TYPE_MUST_BE_REAL,
-        COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED,
-        WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED,
-        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE,
-        INVALID_MATRIX_PRODUCT,
-        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS,
-        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION,
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY,
-        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES,
-        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES,
-        INVALID_MATRIX_TEMPLATE_PARAMETERS,
-        INVALID_MATRIXBASE_TEMPLATE_PARAMETERS,
-        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
-        THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
-        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
-        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES,
-        YOU_ALREADY_SPECIFIED_THIS_STRIDE,
-        INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION,
-        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD,
-        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1,
-        THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS,
-        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES,
-        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION,
-        THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
-        YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
-        THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
-        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
-        THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
-        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
-        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED,
-        THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE,
-        THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
-        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
-        THIS_TYPE_IS_NOT_SUPPORTED
+        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1,
+        YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1,
+        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1,
+        THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1,
+        THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1,
+        THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1,
+        OUT_OF_RANGE_ACCESS=1,
+        YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1,
+        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+        YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1,
+        YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1,
+        UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1,
+        THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1,
+        FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1,
+        NUMERIC_TYPE_MUST_BE_REAL=1,
+        COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1,
+        WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1,
+        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1,
+        INVALID_MATRIX_PRODUCT=1,
+        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1,
+        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1,
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1,
+        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1,
+        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1,
+        INVALID_MATRIX_TEMPLATE_PARAMETERS=1,
+        INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1,
+        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1,
+        THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1,
+        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1,
+        YOU_ALREADY_SPECIFIED_THIS_STRIDE=1,
+        INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1,
+        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1,
+        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1,
+        THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1,
+        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1,
+        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1,
+        THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1,
+        YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1,
+        THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1,
+        THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1,
+        THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1,
+        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1,
+        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1,
+        THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1,
+        THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1,
+        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1,
+        IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1,
+        STORAGE_LAYOUT_DOES_NOT_MATCH=1,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1,
+        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1,
+        MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1,
+        THIS_TYPE_IS_NOT_SUPPORTED=1,
+        STORAGE_KIND_MUST_MATCH=1,
+        STORAGE_INDEX_MUST_MATCH=1,
+        CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
+        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
+        INVALID_TEMPLATE_PARAMETER=1,
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1,
+        THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1
       };
     };
 
@@ -121,7 +136,7 @@
   #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
 
 #endif // EIGEN_NO_STATIC_ASSERT
-
+#endif // EIGEN_STATIC_ASSERT
 
 // static assertion failing if the type \a TYPE is not a vector type
 #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
@@ -158,7 +173,7 @@
 
 #define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
      ( \
-        (int(TYPE0::SizeAtCompileTime)==0 && int(TYPE1::SizeAtCompileTime)==0) \
+        (int(Eigen::internal::size_of_xpr_at_compile_time<TYPE0>::ret)==0 && int(Eigen::internal::size_of_xpr_at_compile_time<TYPE1>::ret)==0) \
     || (\
           (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
         || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
@@ -169,13 +184,8 @@
        ) \
      )
 
-#ifdef EIGEN2_SUPPORT
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    eigen_assert(!NumTraits<Scalar>::IsInteger);
-#else
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
-#endif
+#define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+    EIGEN_STATIC_ASSERT(!Eigen::NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
 
 
 // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
@@ -185,23 +195,27 @@
     YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
 
 #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \
-      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \
-                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \
+      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \
+                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \
                           THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
 
 #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
-      EIGEN_STATIC_ASSERT(internal::is_lvalue<Derived>::value, \
+      EIGEN_STATIC_ASSERT(Eigen::internal::is_lvalue<Derived>::value, \
                           THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
 
 #define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value), \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived>::XprKind, ArrayXpr>::value), \
                           THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
 
 #define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived1>::XprKind, \
-                                             typename internal::traits<Derived2>::XprKind \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived1>::XprKind, \
+                                             typename Eigen::internal::traits<Derived2>::XprKind \
                                             >::value), \
                           YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
 
+// Check that a cost value is positive, and that is stay within a reasonable range
+// TODO this check could be enabled for internal debugging only
+#define EIGEN_INTERNAL_CHECK_COST_VALUE(C) \
+      EIGEN_STATIC_ASSERT((C)>=0 && (C)<=HugeCost*HugeCost, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE);
 
 #endif // EIGEN_STATIC_ASSERT_H

diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h
new file mode 100644
index 0000000..354dd9a
--- /dev/null
+++ b/Eigen/src/Core/util/SymbolicIndex.h

@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SYMBOLIC_INDEX_H
+#define EIGEN_SYMBOLIC_INDEX_H
+
+namespace Eigen {
+
+/** \namespace Eigen::symbolic
+  * \ingroup Core_Module
+  *
+  * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index.
+  * Here is a simple example:
+  *
+  * \code
+  * // First step, defines symbols:
+  * struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
+  * struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
+  * struct z_tag {};  static const symbolic::SymbolExpr<z_tag> z;
+  *
+  * // Defines an expression:
+  * auto expr = (x+3)/y+z;
+  *
+  * // And evaluate it: (c++14)
+  * std::cout << expr.eval(x=6,y=3,z=-13) << "\n";
+  *
+  * // In c++98/11, only one symbol per expression is supported for now:
+  * auto expr98 = (3-x)/2;
+  * std::cout << expr98.eval(x=6) << "\n";
+  * \endcode
+  *
+  * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN.
+  *
+  */
+namespace symbolic {
+
+template<typename Tag> class Symbol;
+template<typename Arg0> class NegateExpr;
+template<typename Arg1,typename Arg2> class AddExpr;
+template<typename Arg1,typename Arg2> class ProductExpr;
+template<typename Arg1,typename Arg2> class QuotientExpr;
+
+// A simple wrapper around an integral value to provide the eval method.
+// We could also use a free-function symbolic_eval...
+template<typename IndexType=Index>
+class ValueExpr {
+public:
+  ValueExpr(IndexType val) : m_value(val) {}
+  template<typename T>
+  IndexType eval_impl(const T&) const { return m_value; }
+protected:
+  IndexType m_value;
+};
+
+// Specialization for compile-time value,
+// It is similar to ValueExpr(N) but this version helps the compiler to generate better code.
+template<int N>
+class ValueExpr<internal::FixedInt<N> > {
+public:
+  ValueExpr() {}
+  template<typename T>
+  EIGEN_CONSTEXPR Index eval_impl(const T&) const { return N; }
+};
+
+
+/** \class BaseExpr
+  * \ingroup Core_Module
+  * Common base class of any symbolic expressions
+  */
+template<typename Derived>
+class BaseExpr
+{
+public:
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** Evaluate the expression given the \a values of the symbols.
+    *
+    * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue
+    *               as constructed by SymbolExpr::operator= operator.
+    *
+    */
+  template<typename T>
+  Index eval(const T& values) const { return derived().eval_impl(values); }
+
+#if EIGEN_HAS_CXX14
+  template<typename... Types>
+  Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); }
+#endif
+
+  NegateExpr<Derived> operator-() const { return NegateExpr<Derived>(derived()); }
+
+  AddExpr<Derived,ValueExpr<> > operator+(Index b) const
+  { return AddExpr<Derived,ValueExpr<> >(derived(),  b); }
+  AddExpr<Derived,ValueExpr<> > operator-(Index a) const
+  { return AddExpr<Derived,ValueExpr<> >(derived(), -a); }
+  ProductExpr<Derived,ValueExpr<> > operator*(Index a) const
+  { return ProductExpr<Derived,ValueExpr<> >(derived(),a); }
+  QuotientExpr<Derived,ValueExpr<> > operator/(Index a) const
+  { return QuotientExpr<Derived,ValueExpr<> >(derived(),a); }
+
+  friend AddExpr<Derived,ValueExpr<> > operator+(Index a, const BaseExpr& b)
+  { return AddExpr<Derived,ValueExpr<> >(b.derived(), a); }
+  friend AddExpr<NegateExpr<Derived>,ValueExpr<> > operator-(Index a, const BaseExpr& b)
+  { return AddExpr<NegateExpr<Derived>,ValueExpr<> >(-b.derived(), a); }
+  friend ProductExpr<ValueExpr<>,Derived> operator*(Index a, const BaseExpr& b)
+  { return ProductExpr<ValueExpr<>,Derived>(a,b.derived()); }
+  friend QuotientExpr<ValueExpr<>,Derived> operator/(Index a, const BaseExpr& b)
+  { return QuotientExpr<ValueExpr<>,Derived>(a,b.derived()); }
+
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N>) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >()); }
+  template<int N>
+  ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N>) const
+  { return ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N>) const
+  { return QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+
+  template<int N>
+  friend AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>, const BaseExpr& b)
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N>, const BaseExpr& b)
+  { return AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > >(-b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator*(internal::FixedInt<N>, const BaseExpr& b)
+  { return ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+  template<int N>
+  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator/(internal::FixedInt<N>, const BaseExpr& b)
+  { return QuotientExpr<ValueExpr<internal::FixedInt<N> > ,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+
+#if (!EIGEN_HAS_CXX14)
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N> (*)()) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N> (*)()) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >()); }
+  template<int N>
+  ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N> (*)()) const
+  { return ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N> (*)()) const
+  { return QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+
+  template<int N>
+  friend AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > >(-b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator*(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+  template<int N>
+  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator/(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return QuotientExpr<ValueExpr<internal::FixedInt<N> > ,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+#endif
+
+
+  template<typename OtherDerived>
+  AddExpr<Derived,OtherDerived> operator+(const BaseExpr<OtherDerived> &b) const
+  { return AddExpr<Derived,OtherDerived>(derived(),  b.derived()); }
+
+  template<typename OtherDerived>
+  AddExpr<Derived,NegateExpr<OtherDerived> > operator-(const BaseExpr<OtherDerived> &b) const
+  { return AddExpr<Derived,NegateExpr<OtherDerived> >(derived(), -b.derived()); }
+
+  template<typename OtherDerived>
+  ProductExpr<Derived,OtherDerived> operator*(const BaseExpr<OtherDerived> &b) const
+  { return ProductExpr<Derived,OtherDerived>(derived(), b.derived()); }
+
+  template<typename OtherDerived>
+  QuotientExpr<Derived,OtherDerived> operator/(const BaseExpr<OtherDerived> &b) const
+  { return QuotientExpr<Derived,OtherDerived>(derived(), b.derived()); }
+};
+
+template<typename T>
+struct is_symbolic {
+  // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr<T>.
+  enum { value = internal::is_convertible<T,BaseExpr<T> >::value };
+};
+
+/** Represents the actual value of a symbol identified by its tag
+  *
+  * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used.
+  */
+template<typename Tag>
+class SymbolValue
+{
+public:
+  /** Default constructor from the value \a val */
+  SymbolValue(Index val) : m_value(val) {}
+
+  /** \returns the stored value of the symbol */
+  Index value() const { return m_value; }
+protected:
+  Index m_value;
+};
+
+/** Expression of a symbol uniquely identified by the template parameter type \c tag */
+template<typename tag>
+class SymbolExpr : public BaseExpr<SymbolExpr<tag> >
+{
+public:
+  /** Alias to the template parameter \c tag */
+  typedef tag Tag;
+
+  SymbolExpr() {}
+
+  /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag.
+    *
+    * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value.
+    */
+  SymbolValue<Tag> operator=(Index val) const {
+    return SymbolValue<Tag>(val);
+  }
+
+  Index eval_impl(const SymbolValue<Tag> &values) const { return values.value(); }
+
+#if EIGEN_HAS_CXX14
+  // C++14 versions suitable for multiple symbols
+  template<typename... Types>
+  Index eval_impl(const std::tuple<Types...>& values) const { return std::get<SymbolValue<Tag> >(values).value(); }
+#endif
+};
+
+template<typename Arg0>
+class NegateExpr : public BaseExpr<NegateExpr<Arg0> >
+{
+public:
+  NegateExpr(const Arg0& arg0) : m_arg0(arg0) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+};
+
+template<typename Arg0, typename Arg1>
+class AddExpr : public BaseExpr<AddExpr<Arg0,Arg1> >
+{
+public:
+  AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+template<typename Arg0, typename Arg1>
+class ProductExpr : public BaseExpr<ProductExpr<Arg0,Arg1> >
+{
+public:
+  ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+template<typename Arg0, typename Arg1>
+class QuotientExpr : public BaseExpr<QuotientExpr<Arg0,Arg1> >
+{
+public:
+  QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+} // end namespace symbolic
+
+} // end namespace Eigen
+
+#endif // EIGEN_SYMBOLIC_INDEX_H

diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 5003eb0..71c32b8 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h

@@ -24,23 +24,95 @@
 
 namespace Eigen {
 
-typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
-
-/**
- * \brief The Index type as used for the API.
- * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
- * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
- */
-typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
-
-
 namespace internal {
 
+template<typename IndexDest, typename IndexSrc>
+EIGEN_DEVICE_FUNC
+inline IndexDest convert_index(const IndexSrc& idx) {
+  // for sizeof(IndexDest)>=sizeof(IndexSrc) compilers should be able to optimize this away:
+  eigen_internal_assert(idx <= NumTraits<IndexDest>::highest() && "Index value to big for target type");
+  return IndexDest(idx);
+}
+
+// true if T can be considered as an integral index (i.e., and integral type or enum)
+template<typename T> struct is_valid_index_type
+{
+  enum { value =
+#if EIGEN_HAS_TYPE_TRAITS
+    internal::is_integral<T>::value || std::is_enum<T>::value
+#elif EIGEN_COMP_MSVC
+    internal::is_integral<T>::value || __is_enum(T)
+#else
+    // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+    internal::is_convertible<T,Index>::value && !internal::is_same<T,float>::value && !is_same<T,double>::value
+#endif
+  };
+};
+
+// true if both types are not valid index types
+template<typename RowIndices, typename ColIndices>
+struct valid_indexed_view_overload {
+  enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
+};
+
+// promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
+//    expression * scalar
+// Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
+// The IsSupported template parameter must be provided by the caller as: internal::has_ReturnType<ScalarBinaryOpTraits<ExprScalar,T,op> >::value using the proper order for ExprScalar and T.
+// Then the logic is as follows:
+//  - if the operation is natively supported as defined by IsSupported, then the scalar type is not promoted, and T is returned.
+//  - otherwise, NumTraits<ExprScalar>::Literal is returned if T is implicitly convertible to NumTraits<ExprScalar>::Literal AND that this does not imply a float to integer conversion.
+//  - otherwise, ExprScalar is returned if T is implicitly convertible to ExprScalar AND that this does not imply a float to integer conversion.
+//  - In all other cases, the promoted type is not defined, and the respective operation is thus invalid and not available (SFINAE).
+template<typename ExprScalar,typename T, bool IsSupported>
+struct promote_scalar_arg;
+
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,true>
+{
+  typedef T type;
+};
+
+// Recursively check safe conversion to PromotedType, and then ExprScalar if they are different.
+template<typename ExprScalar,typename T,typename PromotedType,
+  bool ConvertibleToLiteral = internal::is_convertible<T,PromotedType>::value,
+  bool IsSafe = NumTraits<T>::IsInteger || !NumTraits<PromotedType>::IsInteger>
+struct promote_scalar_arg_unsupported;
+
+// Start recursion with NumTraits<ExprScalar>::Literal
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,false> : promote_scalar_arg_unsupported<S,T,typename NumTraits<S>::Literal> {};
+
+// We found a match!
+template<typename S,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,true,true>
+{
+  typedef PromotedType type;
+};
+
+// No match, but no real-to-integer issues, and ExprScalar and current PromotedType are different,
+// so let's try to promote to ExprScalar
+template<typename ExprScalar,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<ExprScalar,T,PromotedType,false,true>
+   : promote_scalar_arg_unsupported<ExprScalar,T,ExprScalar>
+{};
+
+// Unsafe real-to-integer, let's stop.
+template<typename S,typename T, typename PromotedType, bool ConvertibleToLiteral>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,ConvertibleToLiteral,false> {};
+
+// T is not even convertible to ExprScalar, let's stop.
+template<typename S,typename T>
+struct promote_scalar_arg_unsupported<S,T,S,false,true> {};
+
 //classes inheriting no_assignment_operator don't generate a default operator=.
 class no_assignment_operator
 {
   private:
     no_assignment_operator& operator=(const no_assignment_operator&);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator)
 };
 
 /** \internal return the index type with the largest number of bits */
@@ -50,19 +122,6 @@
   typedef typename conditional<(sizeof(I1)<sizeof(I2)), I2, I1>::type type;
 };
 
-/** \internal return the scalar type according to C promotion rules */
-template<typename T1, typename T2>
-struct promote_scalar_type
-{
-  // If both types are integer or both are fp, use the larger type.
-  // Otherwise, use the fp type.
-  // Note: don't support signedness yet.
-  typedef typename conditional<(sizeof(T1)>sizeof(T2)), T1, T2>::type max_type;
-  typedef typename conditional<std::numeric_limits<T1>::is_integer, T2, T1>::type fp_type;
-  typedef typename conditional<std::numeric_limits<T1>::is_integer ^
-      std::numeric_limits<T2>::is_integer, fp_type, max_type>::type type;
-};
-
 /** \internal If the template parameter Value is Dynamic, this class is just a wrapper around a T variable that
   * can be accessed using value() and setValue().
   * Otherwise, this class is an empty structure and value() just returns the template parameter Value.
@@ -70,20 +129,24 @@
 template<typename T, int Value> class variable_if_dynamic
 {
   public:
-    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC void setValue(T) {}
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    operator T() const { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
 {
     T m_value;
-    EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
   public:
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamic(T value) : m_value(value) {}
-    EIGEN_DEVICE_FUNC T value() const { return m_value; }
-    EIGEN_DEVICE_FUNC void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 /** \internal like variable_if_dynamic but for DynamicIndex
@@ -92,9 +155,11 @@
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC void setValue(T) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
@@ -102,9 +167,9 @@
     T m_value;
     EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
   public:
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamicindex(T value) : m_value(value) {}
-    EIGEN_DEVICE_FUNC T value() const { return m_value; }
-    EIGEN_DEVICE_FUNC void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC T EIGEN_STRONG_INLINE value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 template<typename T> struct functor_traits
@@ -119,11 +184,67 @@
 
 template<typename T> struct packet_traits;
 
-template<typename T> struct unpacket_traits
+template<typename T> struct unpacket_traits;
+
+template<int Size, typename PacketType,
+         bool Stop = Size==Dynamic || (Size%unpacket_traits<PacketType>::size)==0 || is_same<PacketType,typename unpacket_traits<PacketType>::half>::value>
+struct find_best_packet_helper;
+
+template< int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,true>
 {
-  typedef T type;
-  typedef T half;
-  enum {size=1};
+  typedef PacketType type;
+};
+
+template<int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,false>
+{
+  typedef typename find_best_packet_helper<Size,typename unpacket_traits<PacketType>::half>::type type;
+};
+
+template<typename T, int Size>
+struct find_best_packet
+{
+  typedef typename find_best_packet_helper<Size,typename packet_traits<T>::type>::type type;
+};
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+template<int ArrayBytes, int AlignmentBytes,
+         bool Match     =  bool((ArrayBytes%AlignmentBytes)==0),
+         bool TryHalf   =  bool(EIGEN_MIN_ALIGN_BYTES<AlignmentBytes) >
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+
+template<int ArrayBytes, int AlignmentBytes, bool TryHalf>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, true, TryHalf> // Match
+{
+  enum { value = AlignmentBytes };
+};
+
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, false, true> // Try-half
+{
+  // current packet too large, try with an half-packet
+  enum { value = compute_default_alignment_helper<ArrayBytes, AlignmentBytes/2>::value };
+};
+#else
+// If static alignment is disabled, no need to bother.
+// This also avoids a division by zero in "bool Match =  bool((ArrayBytes%AlignmentBytes)==0)"
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+#endif
+
+template<typename T, int Size> struct compute_default_alignment {
+  enum { value = compute_default_alignment_helper<Size*sizeof(T),EIGEN_MAX_STATIC_ALIGN_BYTES>::value };
+};
+
+template<typename T> struct compute_default_alignment<T,Dynamic> {
+  enum { value = EIGEN_MAX_ALIGN_BYTES };
 };
 
 template<typename _Scalar, int _Rows, int _Cols,
@@ -149,35 +270,12 @@
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
 class compute_matrix_flags
 {
-    enum {
-      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
-
-      aligned_bit =
-      (
-            ((Options&DontAlign)==0)
-        && (
-#if EIGEN_ALIGN_STATICALLY
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
-#else
-             0
-#endif
-
-          ||
-
-#if EIGEN_ALIGN
-             is_dynamic_size_storage
-#else
-             0
-#endif
-
-          )
-      ) ? AlignedBit : 0,
-      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
-    };
-
+    enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 };
   public:
-    enum { ret = LinearAccessBit | LvalueBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit };
+    // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
+    // and then propagate this information to the evaluator's flags.
+    // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage.
+    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
 };
 
 template<int _Rows, int _Cols> struct size_at_compile_time
@@ -185,34 +283,43 @@
   enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
 };
 
+template<typename XprType> struct size_of_xpr_at_compile_time
+{
+  enum { ret = size_at_compile_time<traits<XprType>::RowsAtCompileTime,traits<XprType>::ColsAtCompileTime>::ret };
+};
+
 /* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type,
  * whereas eval is a const reference in the case of a matrix
  */
 
 template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_matrix_type;
-template<typename T, typename BaseClassType> struct plain_matrix_type_dense;
+template<typename T, typename BaseClassType, int Flags> struct plain_matrix_type_dense;
 template<typename T> struct plain_matrix_type<T,Dense>
 {
-  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind>::type type;
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, traits<T>::Flags>::type type;
+};
+template<typename T> struct plain_matrix_type<T,DiagonalShape>
+{
+  typedef typename T::PlainObject type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,MatrixXpr>
+template<typename T, int Flags> struct plain_matrix_type_dense<T,MatrixXpr,Flags>
 {
   typedef Matrix<typename traits<T>::Scalar,
                 traits<T>::RowsAtCompileTime,
                 traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
                 traits<T>::MaxRowsAtCompileTime,
                 traits<T>::MaxColsAtCompileTime
           > type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,ArrayXpr>
+template<typename T, int Flags> struct plain_matrix_type_dense<T,ArrayXpr,Flags>
 {
   typedef Array<typename traits<T>::Scalar,
                 traits<T>::RowsAtCompileTime,
                 traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
                 traits<T>::MaxRowsAtCompileTime,
                 traits<T>::MaxColsAtCompileTime
           > type;
@@ -237,6 +344,11 @@
 //           > type;
 };
 
+template<typename T> struct eval<T,DiagonalShape>
+{
+  typedef typename plain_matrix_type<T>::type type;
+};
+
 // for matrices, no need to evaluate, just use a const reference to avoid a useless copy
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 struct eval<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
@@ -251,6 +363,15 @@
 };
 
 
+/* similar to plain_matrix_type, but using the evaluator's Flags */
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_object_eval;
+
+template<typename T>
+struct plain_object_eval<T,Dense>
+{
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, evaluator<T>::Flags>::type type;
+};
+
 
 /* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
  */
@@ -282,15 +403,12 @@
   typedef Matrix<typename traits<T>::Scalar,
                 Rows,
                 Cols,
-                (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
+                (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor,
                 MaxRows,
                 MaxCols
           > type;
 };
 
-// we should be able to get rid of this one too
-template<typename T> struct must_nest_by_value { enum { ret = false }; };
-
 /** \internal The reference selector for template expressions. The idea is that we don't
   * need to use references for expressions since they are light weight proxy
   * objects which should generate no copying overhead. */
@@ -302,6 +420,12 @@
     T const&,
     const T
   >::type type;
+
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T &,
+    T
+  >::type non_const_type;
 };
 
 /** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
@@ -315,56 +439,42 @@
   >::type type;
 };
 
-/** \internal Determines how a given expression should be nested into another one.
+
+// However, we still need a mechanism to detect whether an expression which is evaluated multiple time
+// has to be evaluated into a temporary.
+// That's the purpose of this new nested_eval helper:
+/** \internal Determines how a given expression should be nested when evaluated multiple times.
   * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
-  * nested into the bigger product expression. The choice is between nesting the expression b+c as-is, or
+  * evaluated into the bigger product expression. The choice is between nesting the expression b+c as-is, or
   * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
   * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
   * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
   *
-  * \param T the type of the expression being nested
-  * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
-  *
-  * Note that if no evaluation occur, then the constness of T is preserved.
-  *
-  * Example. Suppose that a, b, and c are of type Matrix3d. The user forms the expression a*(b+c).
-  * b+c is an expression "sum of matrices", which we will denote by S. In order to determine how to nest it,
-  * the Product expression uses: nested<S, 3>::type, which turns out to be Matrix3d because the internal logic of
-  * nested determined that in this case it was better to evaluate the expression b+c into a temporary. On the other hand,
-  * since a is of type Matrix3d, the Product expression nests it as nested<Matrix3d, 3>::type, which turns out to be
-  * const Matrix3d&, because the internal logic of nested determined that since a was already a matrix, there was no point
-  * in copying it into another matrix.
+  * \tparam T the type of the expression being nested.
+  * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
+  * \tparam PlainObject the type of the temporary if needed.
   */
-template<typename T, int n=1, typename PlainObject = typename eval<T>::type> struct nested
+template<typename T, int n, typename PlainObject = typename plain_object_eval<T>::type> struct nested_eval
 {
   enum {
-    // for the purpose of this test, to keep it reasonably simple, we arbitrarily choose a value of Dynamic values.
-    // the choice of 10000 makes it larger than any practical fixed value and even most dynamic values.
-    // in extreme cases where these assumptions would be wrong, we would still at worst suffer performance issues
-    // (poor choice of temporaries).
-    // it's important that this value can still be squared without integer overflowing.
-    DynamicAsInteger = 10000,
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    ScalarReadCostAsInteger = ScalarReadCost == Dynamic ? int(DynamicAsInteger) : int(ScalarReadCost),
-    CoeffReadCost = traits<T>::CoeffReadCost,
-    CoeffReadCostAsInteger = CoeffReadCost == Dynamic ? int(DynamicAsInteger) : int(CoeffReadCost),
-    NAsInteger = n == Dynamic ? int(DynamicAsInteger) : n,
-    CostEvalAsInteger   = (NAsInteger+1) * ScalarReadCostAsInteger + CoeffReadCostAsInteger,
-    CostNoEvalAsInteger = NAsInteger * CoeffReadCostAsInteger
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
+                                                  //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
+                                                  //      This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
+                                                  //      for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
+                                                  //      Another solution could be to count the number of temps?
+    NAsInteger = n == Dynamic ? HugeCost : n,
+    CostEval   = (NAsInteger+1) * ScalarReadCost + CoeffReadCost,
+    CostNoEval = NAsInteger * CoeffReadCost,
+    Evaluate = (int(evaluator<T>::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval))
   };
 
-  typedef typename conditional<
-      ( (int(traits<T>::Flags) & EvalBeforeNestingBit) ||
-        int(CostEvalAsInteger) < int(CostNoEvalAsInteger)
-      ),
-      PlainObject,
-      typename ref_selector<T>::type
-  >::type type;
+  typedef typename conditional<Evaluate, PlainObject, typename ref_selector<T>::type>::type type;
 };
 
 template<typename T>
 EIGEN_DEVICE_FUNC
-T* const_cast_ptr(const T* ptr)
+inline T* const_cast_ptr(const T* ptr)
 {
   return const_cast<T*>(ptr);
 }
@@ -387,30 +497,13 @@
   typedef ArrayBase<Derived> type;
 };
 
-/** \internal Helper base class to add a scalar multiple operator
-  * overloads for complex types */
-template<typename Derived,typename Scalar,typename OtherScalar,
-         bool EnableIt = !is_same<Scalar,OtherScalar>::value >
-struct special_scalar_op_base : public DenseCoeffsBase<Derived>
-{
-  // dummy operator* so that the
-  // "using special_scalar_op_base::operator*" compiles
-  void operator*() const;
-};
+template<typename Derived, typename XprKind = typename traits<Derived>::XprKind, typename StorageKind = typename traits<Derived>::StorageKind>
+struct generic_xpr_base;
 
-template<typename Derived,typename Scalar,typename OtherScalar>
-struct special_scalar_op_base<Derived,Scalar,OtherScalar,true>  : public DenseCoeffsBase<Derived>
+template<typename Derived, typename XprKind>
+struct generic_xpr_base<Derived, XprKind, Dense>
 {
-  const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-  operator*(const OtherScalar& scalar) const
-  {
-    return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-      (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
-  }
-
-  inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-  operator*(const OtherScalar& scalar, const Derived& matrix)
-  { return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar); }
+  typedef typename dense_xpr_base<Derived,XprKind>::type type;
 };
 
 template<typename XprType, typename CastType> struct cast_return_type
@@ -437,23 +530,83 @@
   typedef A ret;
 };
 
+/** \internal Specify the "storage kind" of applying a coefficient-wise
+  * binary operations between two expressions of kinds A and B respectively.
+  * The template parameter Functor permits to specialize the resulting storage kind wrt to
+  * the functor.
+  * The default rules are as follows:
+  * \code
+  * A      op A      -> A
+  * A      op dense  -> dense
+  * dense  op B      -> dense
+  * sparse op dense  -> sparse
+  * dense  op sparse -> sparse
+  * \endcode
+  */
+template <typename A, typename B, typename Functor> struct cwise_promote_storage_type;
 
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,A,Functor>                                      { typedef A      ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Dense,Functor>                              { typedef Dense  ret; };
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,Dense,Functor>                                  { typedef Dense  ret; };
+template <typename B, typename Functor>                   struct cwise_promote_storage_type<Dense,B,Functor>                                  { typedef Dense  ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Sparse,Dense,Functor>                             { typedef Sparse ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Sparse,Functor>                             { typedef Sparse ret; };
+
+template <typename LhsKind, typename RhsKind, int LhsOrder, int RhsOrder> struct cwise_promote_storage_order {
+  enum { value = LhsOrder };
+};
+
+template <typename LhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<LhsKind,Sparse,LhsOrder,RhsOrder>                { enum { value = RhsOrder }; };
+template <typename RhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<Sparse,RhsKind,LhsOrder,RhsOrder>                { enum { value = LhsOrder }; };
+template <int Order>                                      struct cwise_promote_storage_order<Sparse,Sparse,Order,Order>                       { enum { value = Order }; };
+
+
+/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
+  * The template parameter ProductTag permits to specialize the resulting storage kind wrt to
+  * some compile-time properties of the product: GemmProduct, GemvProduct, OuterProduct, InnerProduct.
+  * The default rules are as follows:
+  * \code
+  *  K * K            -> K
+  *  dense * K        -> dense
+  *  K * dense        -> dense
+  *  diag * K         -> K
+  *  K * diag         -> K
+  *  Perm * K         -> K
+  * K * Perm          -> K
+  * \endcode
+  */
+template <typename A, typename B, int ProductTag> struct product_promote_storage_type;
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  A,                  ProductTag> { typedef A     ret;};
+template <int ProductTag>             struct product_promote_storage_type<Dense,              Dense,              ProductTag> { typedef Dense ret;};
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  Dense,              ProductTag> { typedef Dense ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<Dense,              B,                  ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  DiagonalShape,      ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<DiagonalShape,      B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              DiagonalShape,      ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<DiagonalShape,      Dense,              ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  PermutationStorage, ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<PermutationStorage, B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              PermutationStorage, ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<PermutationStorage, Dense,              ProductTag> { typedef Dense ret; };
 
 /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
-  * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
+  * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
   */
 template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
 struct plain_row_type
 {
   typedef Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
   typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
 
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixRowType,
-    ArrayRowType 
+    ArrayRowType
   >::type type;
 };
 
@@ -468,7 +621,7 @@
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixColType,
-    ArrayColType 
+    ArrayColType
   >::type type;
 };
 
@@ -484,19 +637,220 @@
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixDiagType,
-    ArrayDiagType 
+    ArrayDiagType
   >::type type;
 };
 
+template<typename Expr,typename Scalar = typename Expr::Scalar>
+struct plain_constant_type
+{
+  enum { Options = (traits<Expr>::Flags&RowMajorBit)?RowMajor:0 };
+
+  typedef Array<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> array_type;
+
+  typedef Matrix<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                 Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> matrix_type;
+
+  typedef CwiseNullaryOp<scalar_constant_op<Scalar>, const typename conditional<is_same< typename traits<Expr>::XprKind, MatrixXpr >::value, matrix_type, array_type>::type > type;
+};
+
 template<typename ExpressionType>
 struct is_lvalue
 {
-  enum { value = !bool(is_const<ExpressionType>::value) &&
+  enum { value = (!bool(is_const<ExpressionType>::value)) &&
                  bool(traits<ExpressionType>::Flags & LvalueBit) };
 };
 
+template<typename T> struct is_diagonal
+{ enum { ret = false }; };
+
+template<typename T> struct is_diagonal<DiagonalBase<T> >
+{ enum { ret = true }; };
+
+template<typename T> struct is_diagonal<DiagonalWrapper<T> >
+{ enum { ret = true }; };
+
+template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
+{ enum { ret = true }; };
+
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
+template<typename S1, typename S2> struct glue_shapes;
+template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
+
+template<typename T1, typename T2>
+struct possibly_same_dense {
+  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
+{
+  return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
+}
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
+{
+  return false;
+}
+
+// Internal helper defining the cost of a scalar division for the type T.
+// The default heuristic can be specialized for each scalar type and architecture.
+template<typename T,bool Vectorized=false,typename EnableIf = void>
+struct scalar_div_cost {
+  enum { value = 8*NumTraits<T>::MulCost };
+};
+
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
+
+template<bool Vectorized>
+struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
+template<bool Vectorized>
+struct scalar_div_cost<unsigned long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 21 }; };
+
+
+#ifdef EIGEN_DEBUG_ASSIGN
+std::string demangle_traversal(int t)
+{
+  if(t==DefaultTraversal) return "DefaultTraversal";
+  if(t==LinearTraversal) return "LinearTraversal";
+  if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal";
+  if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal";
+  if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal";
+  return "?";
+}
+std::string demangle_unrolling(int t)
+{
+  if(t==NoUnrolling) return "NoUnrolling";
+  if(t==InnerUnrolling) return "InnerUnrolling";
+  if(t==CompleteUnrolling) return "CompleteUnrolling";
+  return "?";
+}
+std::string demangle_flags(int f)
+{
+  std::string res;
+  if(f&RowMajorBit)                 res += " | RowMajor";
+  if(f&PacketAccessBit)             res += " | Packet";
+  if(f&LinearAccessBit)             res += " | Linear";
+  if(f&LvalueBit)                   res += " | Lvalue";
+  if(f&DirectAccessBit)             res += " | Direct";
+  if(f&NestByRefBit)                res += " | NestByRef";
+  if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
+
+  return res;
+}
+#endif
+
 } // end namespace internal
 
+
+/** \class ScalarBinaryOpTraits
+  * \ingroup Core_Module
+  *
+  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type is.
+  *
+  * This class permits to control the scalar return type of any binary operation performed on two different scalar types through (partial) template specializations.
+  *
+  * For instance, let \c U1, \c U2 and \c U3 be three user defined scalar types for which most operations between instances of \c U1 and \c U2 returns an \c U3.
+  * You can let %Eigen knows that by defining:
+    \code
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U1,U2,BinaryOp> { typedef U3 ReturnType;  };
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U2,U1,BinaryOp> { typedef U3 ReturnType;  };
+    \endcode
+  * You can then explicitly disable some particular operations to get more explicit error messages:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_max_op<U1,U2> > {};
+    \endcode
+  * Or customize the return type for individual operation:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_sum_op<U1,U2> > { typedef U1 ReturnType; };
+    \endcode
+  *
+  * By default, the following generic combinations are supported:
+  <table class="manual">
+  <tr><th>ScalarA</th><th>ScalarB</th><th>BinaryOp</th><th>ReturnType</th><th>Note</th></tr>
+  <tr            ><td>\c T </td><td>\c T </td><td>\c * </td><td>\c T </td><td></td></tr>
+  <tr class="alt"><td>\c NumTraits<T>::Real </td><td>\c T </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  <tr            ><td>\c T </td><td>\c NumTraits<T>::Real </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  </table>
+  *
+  * \sa CwiseBinaryOp
+  */
+template<typename ScalarA, typename ScalarB, typename BinaryOp=internal::scalar_product_op<ScalarA,ScalarB> >
+struct ScalarBinaryOpTraits
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
+  : internal::scalar_product_traits<ScalarA,ScalarB>
+#endif // EIGEN_PARSED_BY_DOXYGEN
+{};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, BinaryOp>
+{
+  typedef T ReturnType;
+};
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, T, BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Matrix * Permutation
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,void,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Permutation * Matrix
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<void,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// for Permutation*Permutation
+template<typename BinaryOp>
+struct ScalarBinaryOpTraits<void,void,BinaryOp>
+{
+  typedef void ReturnType;
+};
+
+// We require Lhs and Rhs to have "compatible" scalar types.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+  EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
 } // end namespace Eigen
 
 #endif // EIGEN_XPRHELPER_H

diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index af434bc..081e918 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h

@@ -60,7 +60,7 @@
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType.
       *
@@ -104,7 +104,7 @@
       * according to the specified problem \a size.
       * \sa ComplexEigenSolver()
       */
-    ComplexEigenSolver(Index size)
+    explicit ComplexEigenSolver(Index size)
             : m_eivec(size, size),
               m_eivalues(size),
               m_schur(size),
@@ -122,7 +122,8 @@
       *
       * This constructor calls compute() to compute the eigendecomposition.
       */
-      ComplexEigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+    template<typename InputType>
+    explicit ComplexEigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
             : m_eivec(matrix.rows(),matrix.cols()),
               m_eivalues(matrix.cols()),
               m_schur(matrix.rows()),
@@ -130,7 +131,7 @@
               m_eigenvectorsOk(false),
               m_matX(matrix.rows(),matrix.cols())
     {
-      compute(matrix, computeEigenvectors);
+      compute(matrix.derived(), computeEigenvectors);
     }
 
     /** \brief Returns the eigenvectors of given matrix.
@@ -208,11 +209,12 @@
       * Example: \include ComplexEigenSolver_compute.cpp
       * Output: \verbinclude ComplexEigenSolver_compute.out
       */
-    ComplexEigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+    template<typename InputType>
+    ComplexEigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
@@ -234,6 +236,12 @@
     }
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     EigenvectorType m_eivec;
     EigenvalueType m_eivalues;
     ComplexSchur<MatrixType> m_schur;
@@ -242,27 +250,30 @@
     EigenvectorType m_matX;
 
   private:
-    void doComputeEigenvectors(const RealScalar& matrixnorm);
+    void doComputeEigenvectors(RealScalar matrixnorm);
     void sortEigenvalues(bool computeEigenvectors);
 };
 
 
 template<typename MatrixType>
+template<typename InputType>
 ComplexEigenSolver<MatrixType>& 
-ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+ComplexEigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   // this code is inspired from Jampack
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Do a complex Schur decomposition, A = U T U^*
   // The eigenvalues are on the diagonal of T.
-  m_schur.compute(matrix, computeEigenvectors);
+  m_schur.compute(matrix.derived(), computeEigenvectors);
 
   if(m_schur.info() == Success)
   {
     m_eivalues = m_schur.matrixT().diagonal();
     if(computeEigenvectors)
-      doComputeEigenvectors(matrix.norm());
+      doComputeEigenvectors(m_schur.matrixT().norm());
     sortEigenvalues(computeEigenvectors);
   }
 
@@ -273,10 +284,12 @@
 
 
 template<typename MatrixType>
-void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(const RealScalar& matrixnorm)
+void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm)
 {
   const Index n = m_eivalues.size();
 
+  matrixnorm = numext::maxi(matrixnorm,(std::numeric_limits<RealScalar>::min)());
+
   // Compute X such that T = X D X^(-1), where D is the diagonal of T.
   // The matrix X is unit triangular.
   m_matX = EigenvectorType::Zero(n, n);

diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 89e6cad..fc71468 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h

@@ -63,7 +63,7 @@
     /** \brief Scalar type for matrices of type \p _MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for \p _MatrixType. 
       *
@@ -91,7 +91,7 @@
       *
       * \sa compute() for an example.
       */
-    ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+    explicit ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
       : m_matT(size,size),
         m_matU(size,size),
         m_hess(size),
@@ -109,7 +109,8 @@
       *
       * \sa matrixT() and matrixU() for examples.
       */
-    ComplexSchur(const MatrixType& matrix, bool computeU = true)
+    template<typename InputType>
+    explicit ComplexSchur(const EigenBase<InputType>& matrix, bool computeU = true)
       : m_matT(matrix.rows(),matrix.cols()),
         m_matU(matrix.rows(),matrix.cols()),
         m_hess(matrix.rows()),
@@ -117,7 +118,7 @@
         m_matUisUptodate(false),
         m_maxIters(-1)
     {
-      compute(matrix, computeU);
+      compute(matrix.derived(), computeU);
     }
 
     /** \brief Returns the unitary matrix in the Schur decomposition. 
@@ -186,7 +187,8 @@
       *
       * \sa compute(const MatrixType&, bool, Index)
       */
-    ComplexSchur& compute(const MatrixType& matrix, bool computeU = true);
+    template<typename InputType>
+    ComplexSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
     
     /** \brief Compute Schur decomposition from a given Hessenberg matrix
      *  \param[in] matrixH Matrix in Hessenberg form H
@@ -210,7 +212,7 @@
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
@@ -298,10 +300,13 @@
   ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1);
   ComplexScalar eival1 = (trace + disc) / RealScalar(2);
   ComplexScalar eival2 = (trace - disc) / RealScalar(2);
-
-  if(numext::norm1(eival1) > numext::norm1(eival2))
+  RealScalar eival1_norm = numext::norm1(eival1);
+  RealScalar eival2_norm = numext::norm1(eival2);
+  // A division by zero can only occur if eival1==eival2==0.
+  // In this case, det==0, and all we have to do is checking that eival2_norm!=0
+  if(eival1_norm > eival2_norm)
     eival2 = det / eival1;
-  else
+  else if(eival2_norm!=RealScalar(0))
     eival1 = det / eival2;
 
   // choose the eigenvalue closest to the bottom entry of the diagonal
@@ -313,14 +318,15 @@
 
 
 template<typename MatrixType>
-ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+template<typename InputType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
 {
   m_matUisUptodate = false;
   eigen_assert(matrix.cols() == matrix.rows());
 
   if(matrix.cols() == 1)
   {
-    m_matT = matrix.template cast<ComplexScalar>();
+    m_matT = matrix.derived().template cast<ComplexScalar>();
     if(computeU)  m_matU = ComplexMatrixType::Identity(1,1);
     m_info = Success;
     m_isInitialized = true;
@@ -328,7 +334,7 @@
     return *this;
   }
 
-  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix, computeU);
+  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix.derived(), computeU);
   computeFromHessenberg(m_matT, m_matU, computeU);
   return *this;
 }

diff --git a/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
new file mode 100644
index 0000000..4980a3e
--- /dev/null
+++ b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h

@@ -0,0 +1,91 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_COMPLEX_SCHUR_LAPACKE_H
+#define EIGEN_COMPLEX_SCHUR_LAPACKE_H
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_SCHUR_COMPLEX(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
+template<> template<typename InputType> inline \
+ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
+ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
+{ \
+  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
+  typedef MatrixType::RealScalar RealScalar; \
+  typedef std::complex<RealScalar> ComplexScalar; \
+\
+  eigen_assert(matrix.cols() == matrix.rows()); \
+\
+  m_matUisUptodate = false; \
+  if(matrix.cols() == 1) \
+  { \
+    m_matT = matrix.derived().template cast<ComplexScalar>(); \
+    if(computeU)  m_matU = ComplexMatrixType::Identity(1,1); \
+      m_info = Success; \
+      m_isInitialized = true; \
+      m_matUisUptodate = computeU; \
+      return *this; \
+  } \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
+  char jobvs, sort='N'; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT1 select = 0; \
+  jobvs = (computeU) ? 'V' : 'N'; \
+  m_matU.resize(n, n); \
+  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
+  m_matT = matrix; \
+  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
+  Matrix<EIGTYPE, Dynamic, Dynamic> w; \
+  w.resize(n, 1);\
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)w.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
+  if(info == 0) \
+    m_info = Success; \
+  else \
+    m_info = NoConvergence; \
+\
+  m_isInitialized = true; \
+  m_matUisUptodate = computeU; \
+  return *this; \
+\
+}
+
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_SCHUR_LAPACKE_H

diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index 1763fed..572b29e 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h

@@ -79,7 +79,7 @@
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType. 
       *
@@ -110,7 +110,7 @@
       *
       * \sa compute() for an example.
       */
- EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
+    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -118,7 +118,7 @@
       * according to the specified problem \a size.
       * \sa EigenSolver()
       */
-    EigenSolver(Index size)
+    explicit EigenSolver(Index size)
       : m_eivec(size, size),
         m_eivalues(size),
         m_isInitialized(false),
@@ -143,7 +143,8 @@
       *
       * \sa compute()
       */
-    EigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+    template<typename InputType>
+    explicit EigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_isInitialized(false),
@@ -152,7 +153,7 @@
         m_matT(matrix.rows(), matrix.cols()), 
         m_tmp(matrix.cols())
     {
-      compute(matrix, computeEigenvectors);
+      compute(matrix.derived(), computeEigenvectors);
     }
 
     /** \brief Returns the eigenvectors of given matrix. 
@@ -273,9 +274,10 @@
       * Example: \include EigenSolver_compute.cpp
       * Output: \verbinclude EigenSolver_compute.out
       */
-    EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+    template<typename InputType>
+    EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
-    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */
+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
@@ -299,6 +301,13 @@
     void doComputeEigenvectors();
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
     MatrixType m_eivec;
     EigenvalueType m_eivalues;
     bool m_isInitialized;
@@ -315,11 +324,12 @@
 MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const
 {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   Index n = m_eivalues.rows();
   MatrixType matD = MatrixType::Zero(n,n);
   for (Index i=0; i<n; ++i)
   {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i))))
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i)), precision))
       matD.coeffRef(i,i) = numext::real(m_eivalues.coeff(i));
     else
     {
@@ -336,11 +346,12 @@
 {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
   eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   Index n = m_eivec.cols();
   EigenvectorsType matV(n,n);
   for (Index j=0; j<n; ++j)
   {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j))) || j+1==n)
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j)), precision) || j+1==n)
     {
       // we have a real eigen value
       matV.col(j) = m_eivec.col(j).template cast<ComplexScalar>();
@@ -363,17 +374,19 @@
 }
 
 template<typename MatrixType>
+template<typename InputType>
 EigenSolver<MatrixType>& 
-EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+EigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   using std::sqrt;
   using std::abs;
-  using std::max;
   using numext::isfinite;
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Reduce to real Schur form.
-  m_realSchur.compute(matrix, computeEigenvectors);
+  m_realSchur.compute(matrix.derived(), computeEigenvectors);
   
   m_info = m_realSchur.info();
 
@@ -391,7 +404,7 @@
       if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) 
       {
         m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
-        if(!isfinite(m_eivalues.coeffRef(i)))
+        if(!(isfinite)(m_eivalues.coeffRef(i)))
         {
           m_isInitialized = true;
           m_eigenvectorsOk = false;
@@ -409,7 +422,7 @@
         {
           Scalar t0 = m_matT.coeff(i+1, i);
           Scalar t1 = m_matT.coeff(i, i+1);
-          Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1)));
+          Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1)));
           t0 /= maxval;
           t1 /= maxval;
           Scalar p0 = p/maxval;
@@ -418,7 +431,7 @@
         
         m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
         m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
-        if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1))))
+        if(!((isfinite)(m_eivalues.coeffRef(i)) && (isfinite)(m_eivalues.coeffRef(i+1))))
         {
           m_isInitialized = true;
           m_eigenvectorsOk = false;
@@ -440,26 +453,6 @@
   return *this;
 }
 
-// Complex scalar division.
-template<typename Scalar>
-std::complex<Scalar> cdiv(const Scalar& xr, const Scalar& xi, const Scalar& yr, const Scalar& yi)
-{
-  using std::abs;
-  Scalar r,d;
-  if (abs(yr) > abs(yi))
-  {
-      r = yi/yr;
-      d = yr + r*yi;
-      return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
-  }
-  else
-  {
-      r = yr/yi;
-      d = yi + r*yr;
-      return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
-  }
-}
-
 
 template<typename MatrixType>
 void EigenSolver<MatrixType>::doComputeEigenvectors()
@@ -476,7 +469,7 @@
   }
   
   // Backsubstitute to find vectors of upper triangular form
-  if (norm == 0.0)
+  if (norm == Scalar(0))
   {
     return;
   }
@@ -492,13 +485,13 @@
       Scalar lastr(0), lastw(0);
       Index l = n;
 
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n) = Scalar(1);
       for (Index i = n-1; i >= 0; i--)
       {
         Scalar w = m_matT.coeff(i,i) - p;
         Scalar r = m_matT.row(i).segment(l,n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
         {
           lastw = w;
           lastr = r;
@@ -506,9 +499,9 @@
         else
         {
           l = i;
-          if (m_eivalues.coeff(i).imag() == 0.0)
+          if (m_eivalues.coeff(i).imag() == Scalar(0))
           {
-            if (w != 0.0)
+            if (w != Scalar(0))
               m_matT.coeffRef(i,n) = -r / w;
             else
               m_matT.coeffRef(i,n) = -r / (eps * norm);
@@ -546,19 +539,19 @@
       }
       else
       {
-        std::complex<Scalar> cc = cdiv<Scalar>(0.0,-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        ComplexScalar cc = ComplexScalar(Scalar(0),-m_matT.coeff(n-1,n)) / ComplexScalar(m_matT.coeff(n-1,n-1)-p,q);
         m_matT.coeffRef(n-1,n-1) = numext::real(cc);
         m_matT.coeffRef(n-1,n) = numext::imag(cc);
       }
-      m_matT.coeffRef(n,n-1) = 0.0;
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n-1) = Scalar(0);
+      m_matT.coeffRef(n,n) = Scalar(1);
       for (Index i = n-2; i >= 0; i--)
       {
         Scalar ra = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n-1).segment(l, n-l+1));
         Scalar sa = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
         Scalar w = m_matT.coeff(i,i) - p;
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
         {
           lastw = w;
           lastra = ra;
@@ -569,7 +562,7 @@
           l = i;
           if (m_eivalues.coeff(i).imag() == RealScalar(0))
           {
-            std::complex<Scalar> cc = cdiv(-ra,-sa,w,q);
+            ComplexScalar cc = ComplexScalar(-ra,-sa) / ComplexScalar(w,q);
             m_matT.coeffRef(i,n-1) = numext::real(cc);
             m_matT.coeffRef(i,n) = numext::imag(cc);
           }
@@ -580,10 +573,10 @@
             Scalar y = m_matT.coeff(i+1,i);
             Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
             Scalar vi = (m_eivalues.coeff(i).real() - p) * Scalar(2) * q;
-            if ((vr == 0.0) && (vi == 0.0))
+            if ((vr == Scalar(0)) && (vi == Scalar(0)))
               vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));
 
-            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
+            ComplexScalar cc = ComplexScalar(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra) / ComplexScalar(vr,vi);
             m_matT.coeffRef(i,n-1) = numext::real(cc);
             m_matT.coeffRef(i,n) = numext::imag(cc);
             if (abs(x) > (abs(lastw) + abs(q)))
@@ -593,14 +586,14 @@
             }
             else
             {
-              cc = cdiv(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n),lastw,q);
+              cc = ComplexScalar(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n)) / ComplexScalar(lastw,q);
               m_matT.coeffRef(i+1,n-1) = numext::real(cc);
               m_matT.coeffRef(i+1,n) = numext::imag(cc);
             }
           }
 
           // Overflow control
-          Scalar t = numext::maxi(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
+          Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
           if ((eps * t) * t > Scalar(1))
             m_matT.block(i, n-1, size-i, 2) /= t;
 

diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index dc240e1..87d789b 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h

@@ -1,8 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2016 Tobias Wood <tobias@spinicist.org.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -72,7 +73,7 @@
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType. 
       *
@@ -89,7 +90,7 @@
       */
     typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> VectorType;
 
-    /** \brief Type for vector of complex scalar values eigenvalues as returned by betas().
+    /** \brief Type for vector of complex scalar values eigenvalues as returned by alphas().
       *
       * This is a column vector with entries of type #ComplexScalar.
       * The length of the vector is the size of #MatrixType.
@@ -114,7 +115,14 @@
       *
       * \sa compute() for an example.
       */
-    GeneralizedEigenSolver() : m_eivec(), m_alphas(), m_betas(), m_isInitialized(false), m_realQZ(), m_matS(), m_tmp() {}
+    GeneralizedEigenSolver()
+      : m_eivec(),
+        m_alphas(),
+        m_betas(),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
+        m_realQZ()
+    {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -122,14 +130,13 @@
       * according to the specified problem \a size.
       * \sa GeneralizedEigenSolver()
       */
-    GeneralizedEigenSolver(Index size)
+    explicit GeneralizedEigenSolver(Index size)
       : m_eivec(size, size),
         m_alphas(size),
         m_betas(size),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
         m_realQZ(size),
-        m_matS(size, size),
         m_tmp(size)
     {}
 
@@ -149,10 +156,9 @@
       : m_eivec(A.rows(), A.cols()),
         m_alphas(A.cols()),
         m_betas(A.cols()),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
         m_realQZ(A.cols()),
-        m_matS(A.rows(), A.cols()),
         m_tmp(A.cols())
     {
       compute(A, B, computeEigenvectors);
@@ -160,22 +166,20 @@
 
     /* \brief Returns the computed generalized eigenvectors.
       *
-      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
+      * \returns  %Matrix whose columns are the (possibly complex) right eigenvectors.
+      * i.e. the eigenvectors that solve (A - l*B)x = 0. The ordering matches the eigenvalues.
       *
       * \pre Either the constructor 
       * GeneralizedEigenSolver(const MatrixType&,const MatrixType&, bool) or the member function
       * compute(const MatrixType&, const MatrixType& bool) has been called before, and
       * \p computeEigenvectors was set to true (the default).
       *
-      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
-      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
-      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
-      * matrix returned by this function is the matrix \f$ V \f$ in the
-      * generalized eigendecomposition \f$ A = B V D V^{-1} \f$, if it exists.
-      *
       * \sa eigenvalues()
       */
-//    EigenvectorsType eigenvectors() const;
+    EigenvectorsType eigenvectors() const {
+      eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated.");
+      return m_eivec;
+    }
 
     /** \brief Returns an expression of the computed generalized eigenvalues.
       *
@@ -197,7 +201,7 @@
       */
     EigenvalueType eigenvalues() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return EigenvalueType(m_alphas,m_betas);
     }
 
@@ -208,7 +212,7 @@
       * \sa betas(), eigenvalues() */
     ComplexVectorType alphas() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return m_alphas;
     }
 
@@ -219,7 +223,7 @@
       * \sa alphas(), eigenvalues() */
     VectorType betas() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return m_betas;
     }
 
@@ -250,7 +254,7 @@
 
     ComputationInfo info() const
     {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "EigenSolver is not initialized.");
       return m_realQZ.info();
     }
 
@@ -263,76 +267,149 @@
     }
 
   protected:
-    MatrixType m_eivec;
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
+    EigenvectorsType m_eivec;
     ComplexVectorType m_alphas;
     VectorType m_betas;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
+    bool m_valuesOkay, m_vectorsOkay;
     RealQZ<MatrixType> m_realQZ;
-    MatrixType m_matS;
-
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-    ColumnVectorType m_tmp;
+    ComplexVectorType m_tmp;
 };
 
-//template<typename MatrixType>
-//typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType GeneralizedEigenSolver<MatrixType>::eigenvectors() const
-//{
-//  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-//  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-//  Index n = m_eivec.cols();
-//  EigenvectorsType matV(n,n);
-//  // TODO
-//  return matV;
-//}
-
 template<typename MatrixType>
 GeneralizedEigenSolver<MatrixType>&
 GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   using std::sqrt;
   using std::abs;
   eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
-
+  Index size = A.cols();
+  m_valuesOkay = false;
+  m_vectorsOkay = false;
   // Reduce to generalized real Schur form:
   // A = Q S Z and B = Q T Z
   m_realQZ.compute(A, B, computeEigenvectors);
-
   if (m_realQZ.info() == Success)
   {
-    m_matS = m_realQZ.matrixS();
+    // Resize storage
+    m_alphas.resize(size);
+    m_betas.resize(size);
     if (computeEigenvectors)
-      m_eivec = m_realQZ.matrixZ().transpose();
-  
-    // Compute eigenvalues from matS
-    m_alphas.resize(A.cols());
-    m_betas.resize(A.cols());
-    Index i = 0;
-    while (i < A.cols())
     {
-      if (i == A.cols() - 1 || m_matS.coeff(i+1, i) == Scalar(0))
+      m_eivec.resize(size,size);
+      m_tmp.resize(size);
+    }
+
+    // Aliases:
+    Map<VectorType> v(reinterpret_cast<Scalar*>(m_tmp.data()), size);
+    ComplexVectorType &cv = m_tmp;
+    const MatrixType &mS = m_realQZ.matrixS();
+    const MatrixType &mT = m_realQZ.matrixT();
+
+    Index i = 0;
+    while (i < size)
+    {
+      if (i == size - 1 || mS.coeff(i+1, i) == Scalar(0))
       {
-        m_alphas.coeffRef(i) = m_matS.coeff(i, i);
-        m_betas.coeffRef(i)  = m_realQZ.matrixT().coeff(i,i);
+        // Real eigenvalue
+        m_alphas.coeffRef(i) = mS.diagonal().coeff(i);
+        m_betas.coeffRef(i)  = mT.diagonal().coeff(i);
+        if (computeEigenvectors)
+        {
+          v.setConstant(Scalar(0.0));
+          v.coeffRef(i) = Scalar(1.0);
+          // For singular eigenvalues do nothing more
+          if(abs(m_betas.coeffRef(i)) >= (std::numeric_limits<RealScalar>::min)())
+          {
+            // Non-singular eigenvalue
+            const Scalar alpha = real(m_alphas.coeffRef(i));
+            const Scalar beta = m_betas.coeffRef(i);
+            for (Index j = i-1; j >= 0; j--)
+            {
+              const Index st = j+1;
+              const Index sz = i-j;
+              if (j > 0 && mS.coeff(j, j-1) != Scalar(0))
+              {
+                // 2x2 block
+                Matrix<Scalar, 2, 1> rhs = (alpha*mT.template block<2,Dynamic>(j-1,st,2,sz) - beta*mS.template block<2,Dynamic>(j-1,st,2,sz)) .lazyProduct( v.segment(st,sz) );
+                Matrix<Scalar, 2, 2> lhs = beta * mS.template block<2,2>(j-1,j-1) - alpha * mT.template block<2,2>(j-1,j-1);
+                v.template segment<2>(j-1) = lhs.partialPivLu().solve(rhs);
+                j--;
+              }
+              else
+              {
+                v.coeffRef(j) = -v.segment(st,sz).transpose().cwiseProduct(beta*mS.block(j,st,1,sz) - alpha*mT.block(j,st,1,sz)).sum() / (beta*mS.coeffRef(j,j) - alpha*mT.coeffRef(j,j));
+              }
+            }
+          }
+          m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v;
+          m_eivec.col(i).real().normalize();
+          m_eivec.col(i).imag().setConstant(0);
+        }
         ++i;
       }
       else
       {
-        Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z);
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal 2x2 block T
+        // Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00):
 
-        m_betas.coeffRef(i)   = m_realQZ.matrixT().coeff(i,i);
-        m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i);
+        // T =  [a 0]
+        //      [0 b]
+        RealScalar a = mT.diagonal().coeff(i),
+                   b = mT.diagonal().coeff(i+1);
+        const RealScalar beta = m_betas.coeffRef(i) = m_betas.coeffRef(i+1) = a*b;
+
+        // ^^ NOTE: using diagonal()(i) instead of coeff(i,i) workarounds a MSVC bug.
+        Matrix<RealScalar,2,2> S2 = mS.template block<2,2>(i,i) * Matrix<Scalar,2,1>(b,a).asDiagonal();
+
+        Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
+        Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1)));
+        const ComplexScalar alpha = ComplexScalar(S2.coeff(1,1) + p, (beta > 0) ? z : -z);
+        m_alphas.coeffRef(i)   = conj(alpha);
+        m_alphas.coeffRef(i+1) = alpha;
+
+        if (computeEigenvectors) {
+          // Compute eigenvector in position (i+1) and then position (i) is just the conjugate
+          cv.setZero();
+          cv.coeffRef(i+1) = Scalar(1.0);
+          // here, the "static_cast" workaound expression template issues.
+          cv.coeffRef(i) = -(static_cast<Scalar>(beta*mS.coeffRef(i,i+1)) - alpha*mT.coeffRef(i,i+1))
+                          / (static_cast<Scalar>(beta*mS.coeffRef(i,i))   - alpha*mT.coeffRef(i,i));
+          for (Index j = i-1; j >= 0; j--)
+          {
+            const Index st = j+1;
+            const Index sz = i+1-j;
+            if (j > 0 && mS.coeff(j, j-1) != Scalar(0))
+            {
+              // 2x2 block
+              Matrix<ComplexScalar, 2, 1> rhs = (alpha*mT.template block<2,Dynamic>(j-1,st,2,sz) - beta*mS.template block<2,Dynamic>(j-1,st,2,sz)) .lazyProduct( cv.segment(st,sz) );
+              Matrix<ComplexScalar, 2, 2> lhs = beta * mS.template block<2,2>(j-1,j-1) - alpha * mT.template block<2,2>(j-1,j-1);
+              cv.template segment<2>(j-1) = lhs.partialPivLu().solve(rhs);
+              j--;
+            } else {
+              cv.coeffRef(j) =  cv.segment(st,sz).transpose().cwiseProduct(beta*mS.block(j,st,1,sz) - alpha*mT.block(j,st,1,sz)).sum()
+                              / (alpha*mT.coeffRef(j,j) - static_cast<Scalar>(beta*mS.coeffRef(j,j)));
+            }
+          }
+          m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv);
+          m_eivec.col(i+1).normalize();
+          m_eivec.col(i) = m_eivec.col(i+1).conjugate();
+        }
         i += 2;
       }
     }
+
+    m_valuesOkay = true;
+    m_vectorsOkay = computeEigenvectors;
   }
-
-  m_isInitialized = true;
-  m_eigenvectorsOk = false;//computeEigenvectors;
-
   return *this;
 }
 

diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 07bf1ea..d0f9091 100644
--- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h

@@ -50,7 +50,6 @@
     typedef SelfAdjointEigenSolver<_MatrixType> Base;
   public:
 
-    typedef typename Base::Index Index;
     typedef _MatrixType MatrixType;
 
     /** \brief Default constructor for fixed-size matrices.
@@ -74,7 +73,7 @@
       *
       * \sa compute() for an example
       */
-    GeneralizedSelfAdjointEigenSolver(Index size)
+    explicit GeneralizedSelfAdjointEigenSolver(Index size)
         : Base(size)
     {}
 
@@ -122,7 +121,7 @@
       *
       * \returns    Reference to \c *this
       *
-      * Accoring to \p options, this function computes eigenvalues and (if requested)
+      * According to \p options, this function computes eigenvalues and (if requested)
       * the eigenvectors of one of the following three generalized eigenproblems:
       * - \c Ax_lBx: \f$ Ax = \lambda B x \f$
       * - \c ABx_lx: \f$ ABx = \lambda x \f$

diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index 3db0c01..1f21139 100644
--- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h

@@ -71,7 +71,7 @@
 
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Type for vector of Householder coefficients.
       *
@@ -97,7 +97,7 @@
       *
       * \sa compute() for an example.
       */
-    HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
+    explicit HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
       : m_matrix(size,size),
         m_temp(size),
         m_isInitialized(false)
@@ -115,8 +115,9 @@
       *
       * \sa matrixH() for an example.
       */
-    HessenbergDecomposition(const MatrixType& matrix)
-      : m_matrix(matrix),
+    template<typename InputType>
+    explicit HessenbergDecomposition(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
         m_temp(matrix.rows()),
         m_isInitialized(false)
     {
@@ -147,9 +148,10 @@
       * Example: \include HessenbergDecomposition_compute.cpp
       * Output: \verbinclude HessenbergDecomposition_compute.out
       */
-    HessenbergDecomposition& compute(const MatrixType& matrix)
+    template<typename InputType>
+    HessenbergDecomposition& compute(const EigenBase<InputType>& matrix)
     {
-      m_matrix = matrix;
+      m_matrix = matrix.derived();
       if(matrix.rows()<2)
       {
         m_isInitialized = true;
@@ -265,7 +267,7 @@
 
   private:
 
-    typedef Matrix<Scalar, 1, Size, Options | RowMajor, 1, MaxSize> VectorType;
+    typedef Matrix<Scalar, 1, Size, int(Options) | int(RowMajor), 1, MaxSize> VectorType;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp);
 
@@ -313,7 +315,7 @@
 
     // A = A H'
     matA.rightCols(remainingSize)
-        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
+        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0));
   }
 }
 
@@ -337,7 +339,6 @@
 template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType
 : public ReturnByValue<HessenbergDecompositionMatrixHReturnType<MatrixType> >
 {
-    typedef typename MatrixType::Index Index;
   public:
     /** \brief Constructor.
       *

diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index b9632ef..66e5a3d 100644
--- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h

@@ -66,7 +66,6 @@
 inline typename MatrixBase<Derived>::EigenvaluesReturnType
 MatrixBase<Derived>::eigenvalues() const
 {
-  typedef typename internal::traits<Derived>::Scalar Scalar;
   return internal::eigenvalues_selector<Derived, NumTraits<Scalar>::IsComplex>::run(derived());
 }
 
@@ -84,12 +83,10 @@
   *
   * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
   */
-template<typename MatrixType, unsigned int UpLo>
-EIGEN_DEVICE_FUNC
-inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+template<typename MatrixType, unsigned int UpLo> 
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
 SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
 {
-  typedef typename SelfAdjointView<MatrixType, UpLo>::PlainObject PlainObject;
   PlainObject thisAsMatrix(*this);
   return SelfAdjointEigenSolver<PlainObject>(thisAsMatrix, false).eigenvalues();
 }
@@ -150,8 +147,7 @@
   * \sa eigenvalues(), MatrixBase::operatorNorm()
   */
 template<typename MatrixType, unsigned int UpLo>
-EIGEN_DEVICE_FUNC
-inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
 SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
 {
   return eigenvalues().cwiseAbs().maxCoeff();

diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index 5706eee..5091301 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h

@@ -67,7 +67,7 @@
       };
       typedef typename MatrixType::Scalar Scalar;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef typename MatrixType::Index Index;
+      typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
       typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
       typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
@@ -83,15 +83,16 @@
        *
        * \sa compute() for an example.
        */
-      RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) : 
+      explicit RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) :
         m_S(size, size),
         m_T(size, size),
         m_Q(size, size),
         m_Z(size, size),
         m_workspace(size*2),
         m_maxIters(400),
-        m_isInitialized(false)
-        { }
+        m_isInitialized(false),
+        m_computeQZ(true)
+      {}
 
       /** \brief Constructor; computes real QZ decomposition of given matrices
        * 
@@ -108,9 +109,11 @@
         m_Z(A.rows(),A.cols()),
         m_workspace(A.rows()*2),
         m_maxIters(400),
-        m_isInitialized(false) {
-          compute(A, B, computeQZ);
-        }
+        m_isInitialized(false),
+        m_computeQZ(true)
+      {
+        compute(A, B, computeQZ);
+      }
 
       /** \brief Returns matrix Q in the QZ decomposition. 
        *
@@ -161,7 +164,7 @@
 
       /** \brief Reports whether previous computation was successful.
        *
-       * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+       * \returns \c Success if computation was successful, \c NoConvergence otherwise.
        */
       ComputationInfo info() const
       {
@@ -240,10 +243,10 @@
             m_S.coeffRef(i,j) = Scalar(0.0);
             m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint());
             m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint());
+            // update Q
+            if (m_computeQZ)
+              m_Q.applyOnTheRight(i-1,i,G);
           }
-          // update Q
-          if (m_computeQZ)
-            m_Q.applyOnTheRight(i-1,i,G);
           // kill T(i,i-1)
           if(m_T.coeff(i,i-1)!=Scalar(0))
           {
@@ -251,10 +254,10 @@
             m_T.coeffRef(i,i-1) = Scalar(0.0);
             m_S.applyOnTheRight(i,i-1,G);
             m_T.topRows(i).applyOnTheRight(i,i-1,G);
+            // update Z
+            if (m_computeQZ)
+              m_Z.applyOnTheLeft(i,i-1,G.adjoint());
           }
-          // update Z
-          if (m_computeQZ)
-            m_Z.applyOnTheLeft(i,i-1,G.adjoint());
         }
       }
     }
@@ -276,7 +279,7 @@
 
   /** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0) */
   template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
+    inline Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
     {
       using std::abs;
       Index res = iu;
@@ -294,7 +297,7 @@
 
   /** \internal Look for single small diagonal element T(res, res) for res between f and l, and return res (or f-1)  */
   template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
+    inline Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
     {
       using std::abs;
       Index res = l;
@@ -313,10 +316,10 @@
       using std::abs;
       using std::sqrt;
       const Index dim=m_S.cols();
-      if (abs(m_S.coeff(i+1,i)==Scalar(0)))
+      if (abs(m_S.coeff(i+1,i))==Scalar(0))
         return;
-      Index z = findSmallDiagEntry(i,i+1);
-      if (z==i-1)
+      Index j = findSmallDiagEntry(i,i+1);
+      if (j==i-1)
       {
         // block of (S T^{-1})
         Matrix2s STi = m_T.template block<2,2>(i,i).template triangularView<Upper>().
@@ -352,7 +355,7 @@
       }
       else
       {
-        pushDownZero(z,i,i+1);
+        pushDownZero(j,i,i+1);
       }
     }
 
@@ -552,7 +555,6 @@
       m_T.coeffRef(l,l-1) = Scalar(0.0);
     }
 
-
   template<typename MatrixType>
     RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
     {
@@ -616,6 +618,37 @@
       }
       // check if we converged before reaching iterations limit
       m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
+
+      // For each non triangular 2x2 diagonal block of S,
+      //    reduce the respective 2x2 diagonal block of T to positive diagonal form using 2x2 SVD.
+      // This step is not mandatory for QZ, but it does help further extraction of eigenvalues/eigenvectors,
+      // and is in par with Lapack/Matlab QZ.
+      if(m_info==Success)
+      {
+        for(Index i=0; i<dim-1; ++i)
+        {
+          if(m_S.coeff(i+1, i) != Scalar(0))
+          {
+            JacobiRotation<Scalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right);
+
+            // Apply resulting Jacobi rotations
+            m_S.applyOnTheLeft(i,i+1,j_left);
+            m_S.applyOnTheRight(i,i+1,j_right);
+            m_T.applyOnTheLeft(i,i+1,j_left);
+            m_T.applyOnTheRight(i,i+1,j_right);
+            m_T(i+1,i) = m_T(i,i+1) = Scalar(0);
+
+            if(m_computeQZ) {
+              m_Q.applyOnTheRight(i,i+1,j_left.transpose());
+              m_Z.applyOnTheLeft(i,i+1,j_right.transpose());
+            }
+
+            i++;
+          }
+        }
+      }
+
       return *this;
     } // end compute
 

diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 64d1363..7304ef3 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h

@@ -64,7 +64,7 @@
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
     typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
@@ -80,7 +80,7 @@
       *
       * \sa compute() for an example.
       */
-    RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+    explicit RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
             : m_matT(size, size),
               m_matU(size, size),
               m_workspaceVector(size),
@@ -100,7 +100,8 @@
       * Example: \include RealSchur_RealSchur_MatrixType.cpp
       * Output: \verbinclude RealSchur_RealSchur_MatrixType.out
       */
-    RealSchur(const MatrixType& matrix, bool computeU = true)
+    template<typename InputType>
+    explicit RealSchur(const EigenBase<InputType>& matrix, bool computeU = true)
             : m_matT(matrix.rows(),matrix.cols()),
               m_matU(matrix.rows(),matrix.cols()),
               m_workspaceVector(matrix.rows()),
@@ -109,7 +110,7 @@
               m_matUisUptodate(false),
               m_maxIters(-1)
     {
-      compute(matrix, computeU);
+      compute(matrix.derived(), computeU);
     }
 
     /** \brief Returns the orthogonal matrix in the Schur decomposition. 
@@ -165,7 +166,8 @@
       *
       * \sa compute(const MatrixType&, bool, Index)
       */
-    RealSchur& compute(const MatrixType& matrix, bool computeU = true);
+    template<typename InputType>
+    RealSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
 
     /** \brief Computes Schur decomposition of a Hessenberg matrix H = Z T Z^T
      *  \param[in] matrixH Matrix in Hessenberg form H
@@ -188,7 +190,7 @@
     RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU);
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
@@ -234,7 +236,7 @@
     typedef Matrix<Scalar,3,1> Vector3s;
 
     Scalar computeNormOfT();
-    Index findSmallSubdiagEntry(Index iu, const Scalar& norm);
+    Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero);
     void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
     void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
     void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
@@ -243,33 +245,57 @@
 
 
 template<typename MatrixType>
-RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+template<typename InputType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
 {
+  const Scalar considerAsZero = (std::numeric_limits<Scalar>::min)();
+
   eigen_assert(matrix.cols() == matrix.rows());
   Index maxIters = m_maxIters;
   if (maxIters == -1)
     maxIters = m_maxIterationsPerRow * matrix.rows();
 
-  // Step 1. Reduce to Hessenberg form
-  m_hess.compute(matrix);
+  Scalar scale = matrix.derived().cwiseAbs().maxCoeff();
+  if(scale<considerAsZero)
+  {
+    m_matT.setZero(matrix.rows(),matrix.cols());
+    if(computeU)
+      m_matU.setIdentity(matrix.rows(),matrix.cols());
+    m_info = Success;
+    m_isInitialized = true;
+    m_matUisUptodate = computeU;
+    return *this;
+  }
 
-  // Step 2. Reduce to real Schur form  
-  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+  // Step 1. Reduce to Hessenberg form
+  m_hess.compute(matrix.derived()/scale);
+
+  // Step 2. Reduce to real Schur form
+  // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+  //       to be able to pass our working-space buffer for the Householder to Dense evaluation.
+  m_workspaceVector.resize(matrix.cols());
+  if(computeU)
+    m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+  computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);
+
+  m_matT *= scale;
   
   return *this;
 }
 template<typename MatrixType>
 template<typename HessMatrixType, typename OrthMatrixType>
 RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU)
-{  
-  m_matT = matrixH; 
-  if(computeU)
+{
+  using std::abs;
+
+  m_matT = matrixH;
+  m_workspaceVector.resize(m_matT.cols());
+  if(computeU && !internal::is_same_dense(m_matU,matrixQ))
     m_matU = matrixQ;
   
   Index maxIters = m_maxIters;
   if (maxIters == -1)
     maxIters = m_maxIterationsPerRow * matrixH.rows();
-  m_workspaceVector.resize(m_matT.cols());
   Scalar* workspace = &m_workspaceVector.coeffRef(0);
 
   // The matrix m_matT is divided in three parts. 
@@ -281,12 +307,16 @@
   Index totalIter = 0; // iteration count for whole matrix
   Scalar exshift(0);   // sum of exceptional shifts
   Scalar norm = computeNormOfT();
+  // sub-diagonal entries smaller than considerAsZero will be treated as zero.
+  // We use eps^2 to enable more precision in small eigenvalues.
+  Scalar considerAsZero = numext::maxi<Scalar>( norm * numext::abs2(NumTraits<Scalar>::epsilon()),
+                                                (std::numeric_limits<Scalar>::min)() );
 
-  if(norm!=0)
+  if(norm!=Scalar(0))
   {
     while (iu >= 0)
     {
-      Index il = findSmallSubdiagEntry(iu, norm);
+      Index il = findSmallSubdiagEntry(iu,considerAsZero);
 
       // Check for convergence
       if (il == iu) // One root found
@@ -306,7 +336,7 @@
       else // No convergence yet
       {
         // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG )
-        Vector3s firstHouseholderVector(0,0,0), shiftInfo;
+        Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo;
         computeShift(iu, iter, exshift, shiftInfo);
         iter = iter + 1;
         totalIter = totalIter + 1;
@@ -343,16 +373,17 @@
 
 /** \internal Look for single small sub-diagonal element and returns its index */
 template<typename MatrixType>
-inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm)
+inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero)
 {
   using std::abs;
   Index res = iu;
   while (res > 0)
   {
     Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
-    if (s == 0.0)
-      s = norm;
-    if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
+
+    s = numext::maxi<Scalar>(s * NumTraits<Scalar>::epsilon(), considerAsZero);
+    
+    if (abs(m_matT.coeff(res,res-1)) <= s)
       break;
     res--;
   }
@@ -457,9 +488,7 @@
     const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
     const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1)));
     if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs)
-    {
       break;
-    }
   }
 }
 

diff --git a/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
new file mode 100644
index 0000000..2c22517
--- /dev/null
+++ b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h

@@ -0,0 +1,77 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_REAL_SCHUR_LAPACKE_H
+#define EIGEN_REAL_SCHUR_LAPACKE_H
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_SCHUR_REAL(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
+template<> template<typename InputType> inline \
+RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
+RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
+{ \
+  eigen_assert(matrix.cols() == matrix.rows()); \
+\
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
+  char jobvs, sort='N'; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT2 select = 0; \
+  jobvs = (computeU) ? 'V' : 'N'; \
+  m_matU.resize(n, n); \
+  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
+  m_matT = matrix; \
+  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
+  Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
+  wr.resize(n, 1); wi.resize(n, 1); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)wr.data(), (LAPACKE_TYPE*)wi.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
+  if(info == 0) \
+    m_info = Success; \
+  else \
+    m_info = NoConvergence; \
+\
+  m_isInitialized = true; \
+  m_matUisUptodate = computeU; \
+  return *this; \
+\
+}
+
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_REAL_SCHUR_LAPACKE_H

diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index d97d905..1469236 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h

@@ -20,8 +20,10 @@
 
 namespace internal {
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+
 template<typename MatrixType, typename DiagType, typename SubDiagType>
-ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const typename MatrixType::Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
+EIGEN_DEVICE_FUNC
+ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
 }
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
@@ -42,10 +44,14 @@
   * \f$ v \f$ such that \f$ Av = \lambda v \f$.  The eigenvalues of a
   * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with
   * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the
-  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint
-  * matrices, the matrix \f$ V \f$ is always invertible). This is called the
+  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the
   * eigendecomposition.
   *
+  * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+  * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+  * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+  * equal to its transpose, \f$ V^{-1} = V^T \f$.
+  *
   * The algorithm exploits the fact that the matrix is selfadjoint, making it
   * faster and more accurate than the general purpose eigenvalue algorithms
   * implemented in EigenSolver and ComplexEigenSolver.
@@ -81,7 +87,9 @@
     
     /** \brief Scalar type for matrices of type \p _MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    
+    typedef Matrix<Scalar,Size,Size,ColMajor,MaxColsAtCompileTime,MaxColsAtCompileTime> EigenvectorsType;
 
     /** \brief Real scalar type for \p _MatrixType.
       *
@@ -117,7 +125,10 @@
         : m_eivec(),
           m_eivalues(),
           m_subdiag(),
-          m_isInitialized(false)
+          m_hcoeffs(),
+          m_info(InvalidInput),
+          m_isInitialized(false),
+          m_eigenvectorsOk(false)
     { }
 
     /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
@@ -133,11 +144,13 @@
       * \sa compute() for an example
       */
     EIGEN_DEVICE_FUNC
-    SelfAdjointEigenSolver(Index size)
+    explicit SelfAdjointEigenSolver(Index size)
         : m_eivec(size, size),
           m_eivalues(size),
           m_subdiag(size > 1 ? size - 1 : 1),
-          m_isInitialized(false)
+          m_hcoeffs(size > 1 ? size - 1 : 1),
+          m_isInitialized(false),
+          m_eigenvectorsOk(false)
     {}
 
     /** \brief Constructor; computes eigendecomposition of given matrix.
@@ -155,14 +168,17 @@
       *
       * \sa compute(const MatrixType&, int)
       */
+    template<typename InputType>
     EIGEN_DEVICE_FUNC
-    SelfAdjointEigenSolver(const MatrixType& matrix, int options = ComputeEigenvectors)
+    explicit SelfAdjointEigenSolver(const EigenBase<InputType>& matrix, int options = ComputeEigenvectors)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
+        m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false)
     {
-      compute(matrix, options);
+      compute(matrix.derived(), options);
     }
 
     /** \brief Computes eigendecomposition of given matrix.
@@ -195,20 +211,25 @@
       *
       * \sa SelfAdjointEigenSolver(const MatrixType&, int)
       */
+    template<typename InputType>
     EIGEN_DEVICE_FUNC
-    SelfAdjointEigenSolver& compute(const MatrixType& matrix, int options = ComputeEigenvectors);
+    SelfAdjointEigenSolver& compute(const EigenBase<InputType>& matrix, int options = ComputeEigenvectors);
     
-    /** \brief Computes eigendecomposition of given matrix using a direct algorithm
+    /** \brief Computes eigendecomposition of given matrix using a closed-form algorithm
       *
       * This is a variant of compute(const MatrixType&, int options) which
       * directly solves the underlying polynomial equation.
       * 
-      * Currently only 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
+      * Currently only 2x2 and 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
       * 
-      * This method is usually significantly faster than the QR algorithm
+      * This method is usually significantly faster than the QR iterative algorithm
       * but it might also be less accurate. It is also worth noting that
       * for 3x3 matrices it involves trigonometric operations which are
       * not necessarily available for all scalar types.
+      * 
+      * For the 3x3 case, we observed the following worst case relative error regarding the eigenvalues:
+      *   - double: 1e-8
+      *   - float:  1e-3
       *
       * \sa compute(const MatrixType&, int options)
       */
@@ -220,6 +241,7 @@
       *
       * \param[in] diag The vector containing the diagonal of the matrix.
       * \param[in] subdiag The subdiagonal of the matrix.
+      * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
       * \returns Reference to \c *this
       *
       * This function assumes that the matrix has been reduced to tridiagonal form.
@@ -241,13 +263,18 @@
       * matrix \f$ A \f$, then the matrix returned by this function is the
       * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$.
       *
+      * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+      * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+      * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+      * equal to its transpose, \f$ V^{-1} = V^T \f$.
+      *
       * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
       *
       * \sa eigenvalues()
       */
     EIGEN_DEVICE_FUNC
-    const MatrixType& eigenvectors() const
+    const EigenvectorsType& eigenvectors() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
       eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
@@ -291,8 +318,7 @@
       * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
       *
-      * \sa operatorInverseSqrt(),
-      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+      * \sa operatorInverseSqrt(), <a href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
       */
     EIGEN_DEVICE_FUNC
     MatrixType operatorSqrt() const
@@ -317,8 +343,7 @@
       * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
       *
-      * \sa operatorSqrt(), MatrixBase::inverse(),
-      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+      * \sa operatorSqrt(), MatrixBase::inverse(), <a href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
       */
     EIGEN_DEVICE_FUNC
     MatrixType operatorInverseSqrt() const
@@ -330,7 +355,7 @@
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     EIGEN_DEVICE_FUNC
     ComputationInfo info() const
@@ -346,49 +371,23 @@
       */
     static const int m_maxIterations = 30;
 
-    #ifdef EIGEN2_SUPPORT
-    EIGEN_DEVICE_FUNC
-    SelfAdjointEigenSolver(const MatrixType& matrix, bool computeEigenvectors)
-      : m_eivec(matrix.rows(), matrix.cols()),
-        m_eivalues(matrix.cols()),
-        m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
-    {
-      compute(matrix, computeEigenvectors);
-    }
-    
-    EIGEN_DEVICE_FUNC
-    SelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-        : m_eivec(matA.cols(), matA.cols()),
-          m_eivalues(matA.cols()),
-          m_subdiag(matA.cols() > 1 ? matA.cols() - 1 : 1),
-          m_isInitialized(false)
-    {
-      static_cast<GeneralizedSelfAdjointEigenSolver<MatrixType>*>(this)->compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    
-    EIGEN_DEVICE_FUNC
-    void compute(const MatrixType& matrix, bool computeEigenvectors)
-    {
-      compute(matrix, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-
-    EIGEN_DEVICE_FUNC
-    void compute(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-    {
-      compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    #endif // EIGEN2_SUPPORT
-
   protected:
-    MatrixType m_eivec;
+    static EIGEN_DEVICE_FUNC
+    void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
+    EigenvectorsType m_eivec;
     RealVectorType m_eivalues;
     typename TridiagonalizationType::SubDiagonalType m_subdiag;
+    typename TridiagonalizationType::CoeffVectorType m_hcoeffs;
     ComputationInfo m_info;
     bool m_isInitialized;
     bool m_eigenvectorsOk;
 };
 
+namespace internal {
 /** \internal
   *
   * \eigenvalues_module \ingroup Eigenvalues_Module
@@ -396,8 +395,12 @@
   * Performs a QR step on a tridiagonal symmetric matrix represented as a
   * pair of two vectors \a diag and \a subdiag.
   *
-  * \param matA the input selfadjoint matrix
-  * \param hCoeffs returned Householder coefficients
+  * \param diag the diagonal part of the input selfadjoint tridiagonal matrix
+  * \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix
+  * \param start starting index of the submatrix to work on
+  * \param end last+1 index of the submatrix to work on
+  * \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0
+  * \param n size of the input matrix
   *
   * For compilation efficiency reasons, this procedure does not use eigen expression
   * for its arguments.
@@ -405,18 +408,22 @@
   * Implemented from Golub's "Matrix Computations", algorithm 8.3.2:
   * "implicit symmetric QR step with Wilkinson shift"
   */
-namespace internal {
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
 EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n);
 }
 
 template<typename MatrixType>
+template<typename InputType>
 EIGEN_DEVICE_FUNC
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
-::compute(const MatrixType& matrix, int options)
+::compute(const EigenBase<InputType>& a_matrix, int options)
 {
-  using std::abs;
+  check_template_parameters();
+  
+  const InputType &matrix(a_matrix.derived());
+  
+  EIGEN_USING_STD(abs);
   eigen_assert(matrix.cols() == matrix.rows());
   eigen_assert((options&~(EigVecMask|GenEigMask))==0
           && (options&EigVecMask)!=EigVecMask
@@ -427,7 +434,8 @@
 
   if(n==1)
   {
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0));
+    m_eivec = matrix;
+    m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0));
     if(computeEigenvectors)
       m_eivec.setOnes(n,n);
     m_info = Success;
@@ -438,7 +446,7 @@
 
   // declare some aliases
   RealVectorType& diag = m_eivalues;
-  MatrixType& mat = m_eivec;
+  EigenvectorsType& mat = m_eivec;
 
   // map the matrix coefficients to [-1:1] to avoid over- and underflow.
   mat = matrix.template triangularView<Lower>();
@@ -446,7 +454,8 @@
   if(scale==RealScalar(0)) scale = RealScalar(1);
   mat.template triangularView<Lower>() /= scale;
   m_subdiag.resize(n-1);
-  internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors);
+  m_hcoeffs.resize(n-1);
+  internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors);
 
   m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
   
@@ -471,7 +480,7 @@
   {
     m_eivec.setIdentity(diag.size(), diag.size());
   }
-  m_info = computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
 
   m_isInitialized = true;
   m_eigenvectorsOk = computeEigenvectors;
@@ -484,33 +493,44 @@
   * \brief Compute the eigendecomposition from a tridiagonal matrix
   *
   * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
-  * \param[in] subdiag : The subdiagonal part of the matrix.
-  * \param[in,out] : On input, the maximum number of iterations, on output, the effective number of iterations.
-  * \param[out] eivec : The matrix to store the eigenvectors... if needed. allocated on input
+  * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
+  * \param[in] maxIterations : the maximum number of iterations
+  * \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not
+  * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
   * \returns \c Success or \c NoConvergence
   */
 template<typename MatrixType, typename DiagType, typename SubDiagType>
-ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const typename MatrixType::Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
+EIGEN_DEVICE_FUNC
+ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
 {
-  using std::abs;
-
   ComputationInfo info;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index n = diag.size();
   Index end = n-1;
   Index start = 0;
   Index iter = 0; // total number of iterations
-
+  
+  typedef typename DiagType::RealScalar RealScalar;
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  const RealScalar precision_inv = RealScalar(1)/NumTraits<RealScalar>::epsilon();
   while (end>0)
   {
-    for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))))
-        subdiag[i] = 0;
+    for (Index i = start; i<end; ++i) {
+      if (numext::abs(subdiag[i]) < considerAsZero) {
+        subdiag[i] = RealScalar(0);
+      } else {
+        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
+        // Scaled to prevent underflows.
+        const RealScalar scaled_subdiag = precision_inv * subdiag[i];
+        if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i])+numext::abs(diag[i+1]))) {
+          subdiag[i] = RealScalar(0);
+        }
+      }
+    }
 
-    // find the largest unreduced block
-    while (end>0 && subdiag[end-1]==0)
+    // find the largest unreduced block at the end of the matrix.
+    while (end>0 && subdiag[end-1]==RealScalar(0))
     {
       end--;
     }
@@ -543,7 +563,7 @@
       diag.segment(i,n-i).minCoeff(&k);
       if (k > 0)
       {
-        std::swap(diag[i], diag[k+i]);
+        numext::swap(diag[i], diag[k+i]);
         if(computeEigenvectors)
           eivec.col(i).swap(eivec.col(k+i));
       }
@@ -564,16 +584,22 @@
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
+  typedef typename SolverType::EigenvectorsType EigenvectorsType;
   
+
+  /** \internal
+   * Computes the roots of the characteristic polynomial of \a m.
+   * For numerical stability m.trace() should be near zero and to avoid over- or underflow m should be normalized.
+   */
   EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    EIGEN_USING_STD_MATH(sqrt)
-    EIGEN_USING_STD_MATH(atan2)
-    EIGEN_USING_STD_MATH(cos)
-    EIGEN_USING_STD_MATH(sin)
-    const Scalar s_inv3 = Scalar(1.0)/Scalar(3.0);
-    const Scalar s_sqrt3 = sqrt(Scalar(3.0));
+    EIGEN_USING_STD(sqrt)
+    EIGEN_USING_STD(atan2)
+    EIGEN_USING_STD(cos)
+    EIGEN_USING_STD(sin)
+    const Scalar s_inv3 = Scalar(1)/Scalar(3);
+    const Scalar s_sqrt3 = sqrt(Scalar(3));
 
     // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
     // eigenvalues are the roots to this equation, all guaranteed to be
@@ -585,149 +611,124 @@
     // Construct the parameters used in classifying the roots of the equation
     // and in solving the equation for the roots in closed form.
     Scalar c2_over_3 = c2*s_inv3;
-    Scalar a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
-    if (a_over_3 > Scalar(0))
-      a_over_3 = Scalar(0);
+    Scalar a_over_3 = (c2*c2_over_3 - c1)*s_inv3;
+    a_over_3 = numext::maxi(a_over_3, Scalar(0));
 
     Scalar half_b = Scalar(0.5)*(c0 + c2_over_3*(Scalar(2)*c2_over_3*c2_over_3 - c1));
 
-    Scalar q = half_b*half_b + a_over_3*a_over_3*a_over_3;
-    if (q > Scalar(0))
-      q = Scalar(0);
+    Scalar q = a_over_3*a_over_3*a_over_3 - half_b*half_b;
+    q = numext::maxi(q, Scalar(0));
 
     // Compute the eigenvalues by solving for the roots of the polynomial.
-    Scalar rho = sqrt(-a_over_3);
-    Scalar theta = atan2(sqrt(-q),half_b)*s_inv3;
+    Scalar rho = sqrt(a_over_3);
+    Scalar theta = atan2(sqrt(q),half_b)*s_inv3;  // since sqrt(q) > 0, atan2 is in [0, pi] and theta is in [0, pi/3]
     Scalar cos_theta = cos(theta);
     Scalar sin_theta = sin(theta);
-    roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta;
-    roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
-    roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
-
-    // Sort in increasing order.
-    if (roots(0) >= roots(1))
-      numext::swap(roots(0),roots(1));
-    if (roots(1) >= roots(2))
-    {
-      numext::swap(roots(1),roots(2));
-      if (roots(0) >= roots(1))
-        numext::swap(roots(0),roots(1));
-    }
+    // roots are already sorted, since cos is monotonically decreasing on [0, pi]
+    roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta); // == 2*rho*cos(theta+2pi/3)
+    roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta); // == 2*rho*cos(theta+ pi/3)
+    roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
   }
-  
+
+  EIGEN_DEVICE_FUNC
+  static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
+  {
+    EIGEN_USING_STD(abs);
+    EIGEN_USING_STD(sqrt);
+    Index i0;
+    // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
+    mat.diagonal().cwiseAbs().maxCoeff(&i0);
+    // mat.col(i0) is a good candidate for an orthogonal vector to the current eigenvector,
+    // so let's save it:
+    representative = mat.col(i0);
+    Scalar n0, n1;
+    VectorType c0, c1;
+    n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
+    n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
+    if(n0>n1) res = c0/sqrt(n0);
+    else      res = c1/sqrt(n1);
+
+    return true;
+  }
+
   EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
-    using std::sqrt;
     eigen_assert(mat.cols() == 3 && mat.cols() == mat.rows());
     eigen_assert((options&~(EigVecMask|GenEigMask))==0
             && (options&EigVecMask)!=EigVecMask
             && "invalid option parameter");
     bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
     
-    MatrixType& eivecs = solver.m_eivec;
+    EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    MatrixType scaledMat = mat / scale;
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(3);
+    // TODO Avoid this copy. Currently it is necessary to suppress bogus values when determining maxCoeff and for computing the eigenvectors later
+    MatrixType scaledMat = mat.template selfadjointView<Lower>();
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if(scale > 0) scaledMat /= scale;   // TODO for scale==0 we could save the remaining operations
 
     // compute the eigenvalues
     computeRoots(scaledMat,eivals);
 
-    // compute the eigen vectors
+    // compute the eigenvectors
     if(computeEigenvectors)
     {
-      Scalar safeNorm2 = Eigen::NumTraits<Scalar>::epsilon();
-      safeNorm2 *= safeNorm2;
       if((eivals(2)-eivals(0))<=Eigen::NumTraits<Scalar>::epsilon())
       {
+        // All three eigenvalues are numerically the same
         eivecs.setIdentity();
       }
       else
       {
-        scaledMat = scaledMat.template selfadjointView<Lower>();
         MatrixType tmp;
         tmp = scaledMat;
 
+        // Compute the eigenvector of the most distinct eigenvalue
         Scalar d0 = eivals(2) - eivals(1);
         Scalar d1 = eivals(1) - eivals(0);
-        int k =  d0 > d1 ? 2 : 0;
-        d0 = d0 > d1 ? d1 : d0;
-
-        tmp.diagonal().array () -= eivals(k);
-        VectorType cross;
-        Scalar n;
-        n = (cross = tmp.row(0).cross(tmp.row(1))).squaredNorm();
-
-        if(n>safeNorm2)
-          eivecs.col(k) = cross / sqrt(n);
-        else
+        Index k(0), l(2);
+        if(d0 > d1)
         {
-          n = (cross = tmp.row(0).cross(tmp.row(2))).squaredNorm();
-
-          if(n>safeNorm2)
-            eivecs.col(k) = cross / sqrt(n);
-          else
-          {
-            n = (cross = tmp.row(1).cross(tmp.row(2))).squaredNorm();
-
-            if(n>safeNorm2)
-              eivecs.col(k) = cross / sqrt(n);
-            else
-            {
-              // the input matrix and/or the eigenvaues probably contains some inf/NaN,
-              // => exit
-              // scale back to the original size.
-              eivals *= scale;
-
-              solver.m_info = NumericalIssue;
-              solver.m_isInitialized = true;
-              solver.m_eigenvectorsOk = computeEigenvectors;
-              return;
-            }
-          }
+          numext::swap(k,l);
+          d0 = d1;
         }
 
-        tmp = scaledMat;
-        tmp.diagonal().array() -= eivals(1);
-
-        if(d0<=Eigen::NumTraits<Scalar>::epsilon())
-          eivecs.col(1) = eivecs.col(k).unitOrthogonal();
-        else
+        // Compute the eigenvector of index k
         {
-          n = (cross = eivecs.col(k).cross(tmp.row(0).normalized())).squaredNorm();
-          if(n>safeNorm2)
-            eivecs.col(1) = cross / sqrt(n);
-          else
-          {
-            n = (cross = eivecs.col(k).cross(tmp.row(1))).squaredNorm();
-            if(n>safeNorm2)
-              eivecs.col(1) = cross / sqrt(n);
-            else
-            {
-              n = (cross = eivecs.col(k).cross(tmp.row(2))).squaredNorm();
-              if(n>safeNorm2)
-                eivecs.col(1) = cross / sqrt(n);
-              else
-              {
-                // we should never reach this point,
-                // if so the last two eigenvalues are likely to ve very closed to each other
-                eivecs.col(1) = eivecs.col(k).unitOrthogonal();
-              }
-            }
-          }
-
-          // make sure that eivecs[1] is orthogonal to eivecs[2]
-          Scalar d = eivecs.col(1).dot(eivecs.col(k));
-          eivecs.col(1) = (eivecs.col(1) - d * eivecs.col(k)).normalized();
+          tmp.diagonal().array () -= eivals(k);
+          // By construction, 'tmp' is of rank 2, and its kernel corresponds to the respective eigenvector.
+          extract_kernel(tmp, eivecs.col(k), eivecs.col(l));
         }
 
-        eivecs.col(k==2 ? 0 : 2) = eivecs.col(k).cross(eivecs.col(1)).normalized();
+        // Compute eigenvector of index l
+        if(d0<=2*Eigen::NumTraits<Scalar>::epsilon()*d1)
+        {
+          // If d0 is too small, then the two other eigenvalues are numerically the same,
+          // and thus we only have to ortho-normalize the near orthogonal vector we saved above.
+          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l))*eivecs.col(l);
+          eivecs.col(l).normalize();
+        }
+        else
+        {
+          tmp = scaledMat;
+          tmp.diagonal().array () -= eivals(l);
+
+          VectorType dummy;
+          extract_kernel(tmp, eivecs.col(l), dummy);
+        }
+
+        // Compute last eigenvector from the other two
+        eivecs.col(1) = eivecs.col(2).cross(eivecs.col(0)).normalized();
       }
     }
+
     // Rescale back to the original size.
     eivals *= scale;
+    eivals.array() += shift;
     
     solver.m_info = Success;
     solver.m_isInitialized = true;
@@ -742,12 +743,13 @@
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
+  typedef typename SolverType::EigenvectorsType EigenvectorsType;
   
   EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    using std::sqrt;
-    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*m(1,0)*m(1,0));
+    EIGEN_USING_STD(sqrt);
+    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
     const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
     roots(0) = t1 - t0;
     roots(1) = t1 + t0;
@@ -756,7 +758,8 @@
   EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
-    EIGEN_USING_STD_MATH(sqrt);
+    EIGEN_USING_STD(sqrt);
+    EIGEN_USING_STD(abs);
     
     eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
     eigen_assert((options&~(EigVecMask|GenEigMask))==0
@@ -764,41 +767,53 @@
             && "invalid option parameter");
     bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
     
-    MatrixType& eivecs = solver.m_eivec;
+    EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    scale = numext::maxi(scale,Scalar(1));
-    MatrixType scaledMat = mat / scale;
-    
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(2);
+    MatrixType scaledMat = mat;
+    scaledMat.coeffRef(0,1) = mat.coeff(1,0);
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if(scale > Scalar(0))
+      scaledMat /= scale;
+
     // Compute the eigenvalues
     computeRoots(scaledMat,eivals);
-    
+
     // compute the eigen vectors
     if(computeEigenvectors)
     {
-      scaledMat.diagonal().array () -= eivals(1);
-      Scalar a2 = numext::abs2(scaledMat(0,0));
-      Scalar c2 = numext::abs2(scaledMat(1,1));
-      Scalar b2 = numext::abs2(scaledMat(1,0));
-      if(a2>c2)
+      if((eivals(1)-eivals(0))<=abs(eivals(1))*Eigen::NumTraits<Scalar>::epsilon())
       {
-        eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
-        eivecs.col(1) /= sqrt(a2+b2);
+        eivecs.setIdentity();
       }
       else
       {
-        eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
-        eivecs.col(1) /= sqrt(c2+b2);
-      }
+        scaledMat.diagonal().array () -= eivals(1);
+        Scalar a2 = numext::abs2(scaledMat(0,0));
+        Scalar c2 = numext::abs2(scaledMat(1,1));
+        Scalar b2 = numext::abs2(scaledMat(1,0));
+        if(a2>c2)
+        {
+          eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
+          eivecs.col(1) /= sqrt(a2+b2);
+        }
+        else
+        {
+          eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
+          eivecs.col(1) /= sqrt(c2+b2);
+        }
 
-      eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+        eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+      }
     }
-    
+
     // Rescale back to the original size.
     eivals *= scale;
-    
+    eivals.array() += shift;
+
     solver.m_info = Success;
     solver.m_isInitialized = true;
     solver.m_eigenvectorsOk = computeEigenvectors;
@@ -817,32 +832,38 @@
 }
 
 namespace internal {
+
+// Francis implicit QR step.
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
 EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
 {
-  using std::abs;
+  // Wilkinson Shift.
   RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
   RealScalar e = subdiag[end-1];
   // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
   // underflow thus leading to inf/NaN values when using the following commented code:
-//   RealScalar e2 = numext::abs2(subdiag[end-1]);
-//   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
+  //   RealScalar e2 = numext::abs2(subdiag[end-1]);
+  //   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
   // This explain the following, somewhat more complicated, version:
   RealScalar mu = diag[end];
-  if(td==0)
-    mu -= abs(e);
-  else
-  {
-    RealScalar e2 = numext::abs2(subdiag[end-1]);
-    RealScalar h = numext::hypot(td,e);
-    if(e2==0)  mu -= (e / (td + (td>0 ? 1 : -1))) * (e / h);
-    else       mu -= e2 / (td + (td>0 ? h : -h));
+  if(td==RealScalar(0)) {
+    mu -= numext::abs(e);
+  } else if (e != RealScalar(0)) {
+    const RealScalar e2 = numext::abs2(e);
+    const RealScalar h = numext::hypot(td,e);
+    if(e2 == RealScalar(0)) {
+      mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e);
+    } else {
+      mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); 
+    }
   }
-  
+
   RealScalar x = diag[start] - mu;
   RealScalar z = subdiag[start];
-  for (Index k = start; k < end; ++k)
+  // If z ever becomes zero, the Givens rotation will be the identity and
+  // z will stay zero for all future iterations.
+  for (Index k = start; k < end && z != RealScalar(0); ++k)
   {
     JacobiRotation<RealScalar> rot;
     rot.makeGivens(x, z);
@@ -855,12 +876,11 @@
     diag[k+1] = rot.s() * sdk + rot.c() * dkp1;
     subdiag[k] = rot.c() * sdk - rot.s() * dkp1;
     
-
     if (k > start)
       subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z;
 
+    // "Chasing the bulge" to return to triangular form.
     x = subdiag[k];
-
     if (k < end - 1)
     {
       z = -rot.s() * subdiag[k+1];

diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
new file mode 100644
index 0000000..b0c947d
--- /dev/null
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h

@@ -0,0 +1,87 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Self-adjoint eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SAEIGENSOLVER_LAPACKE_H
+#define EIGEN_SAEIGENSOLVER_LAPACKE_H
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \
+template<> template<typename InputType> inline \
+SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
+SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
+{ \
+  eigen_assert(matrix.cols() == matrix.rows()); \
+  eigen_assert((options&~(EigVecMask|GenEigMask))==0 \
+          && (options&EigVecMask)!=EigVecMask \
+          && "invalid option parameter"); \
+  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, info; \
+  m_eivalues.resize(n,1); \
+  m_subdiag.resize(n-1); \
+  m_eivec = matrix; \
+\
+  if(n==1) \
+  { \
+    m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); \
+    if(computeEigenvectors) m_eivec.setOnes(n,n); \
+    m_info = Success; \
+    m_isInitialized = true; \
+    m_eigenvectorsOk = computeEigenvectors; \
+    return *this; \
+  } \
+\
+  lda = internal::convert_index<lapack_int>(m_eivec.outerStride()); \
+  char jobz, uplo='L'/*, range='A'*/; \
+  jobz = computeEigenvectors ? 'V' : 'N'; \
+\
+  info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
+  m_info = (info==0) ? Success : NoConvergence; \
+  m_isInitialized = true; \
+  m_eigenvectorsOk = computeEigenvectors; \
+  return *this; \
+}
+
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME )              \
+        EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor )  \
+        EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor ) 
+
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev)
+
+} // end namespace Eigen
+
+#endif // EIGEN_SAEIGENSOLVER_H

diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 192278d..eda8279 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h

@@ -11,18 +11,21 @@
 #ifndef EIGEN_TRIDIAGONALIZATION_H
 #define EIGEN_TRIDIAGONALIZATION_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-  
+
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
 template<typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
+  : public traits<typename MatrixType::PlainObject>
 {
-  typedef typename MatrixType::PlainObject ReturnType;
+  typedef typename MatrixType::PlainObject ReturnType; // FIXME shall it be a BandMatrix?
+  enum { Flags = 0 };
 };
 
 template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
 }
 
@@ -67,7 +70,7 @@
 
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     enum {
       Size = MatrixType::RowsAtCompileTime,
@@ -89,10 +92,8 @@
             >::type DiagonalReturnType;
 
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-              typename internal::add_const_on_value_type<typename Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >::RealReturnType>::type,
-              const Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >
+              typename internal::add_const_on_value_type<typename Diagonal<const MatrixType, -1>::RealReturnType>::type,
+              const Diagonal<const MatrixType, -1>
             >::type SubDiagonalReturnType;
 
     /** \brief Return type of matrixQ() */
@@ -110,7 +111,7 @@
       *
       * \sa compute() for an example.
       */
-    Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
+    explicit Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
       : m_matrix(size,size),
         m_hCoeffs(size > 1 ? size-1 : 1),
         m_isInitialized(false)
@@ -126,8 +127,9 @@
       * Example: \include Tridiagonalization_Tridiagonalization_MatrixType.cpp
       * Output: \verbinclude Tridiagonalization_Tridiagonalization_MatrixType.out
       */
-    Tridiagonalization(const MatrixType& matrix)
-      : m_matrix(matrix),
+    template<typename InputType>
+    explicit Tridiagonalization(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
         m_hCoeffs(matrix.cols() > 1 ? matrix.cols()-1 : 1),
         m_isInitialized(false)
     {
@@ -152,9 +154,10 @@
       * Example: \include Tridiagonalization_compute.cpp
       * Output: \verbinclude Tridiagonalization_compute.out
       */
-    Tridiagonalization& compute(const MatrixType& matrix)
+    template<typename InputType>
+    Tridiagonalization& compute(const EigenBase<InputType>& matrix)
     {
-      m_matrix = matrix;
+      m_matrix = matrix.derived();
       m_hCoeffs.resize(matrix.rows()-1, 1);
       internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
       m_isInitialized = true;
@@ -305,7 +308,7 @@
 Tridiagonalization<MatrixType>::diagonal() const
 {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  return m_matrix.diagonal();
+  return m_matrix.diagonal().real();
 }
 
 template<typename MatrixType>
@@ -313,8 +316,7 @@
 Tridiagonalization<MatrixType>::subDiagonal() const
 {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  Index n = m_matrix.rows();
-  return Block<const MatrixType,SizeMinusOne,SizeMinusOne>(m_matrix, 1, 0, n-1,n-1).diagonal();
+  return m_matrix.template diagonal<-1>().real();
 }
 
 namespace internal {
@@ -343,16 +345,16 @@
   * \sa Tridiagonalization::packedMatrix()
   */
 template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
 {
   using numext::conj;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   Index n = matA.rows();
   eigen_assert(n==matA.cols());
   eigen_assert(n==hCoeffs.size()+1 || n==1);
-  
+
   for (Index i = 0; i<n-1; ++i)
   {
     Index remainingSize = n-i-1;
@@ -367,10 +369,10 @@
     hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
                                   * (conj(h) * matA.col(i).tail(remainingSize)));
 
-    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+    hCoeffs.tail(n-i-1) += (conj(h)*RealScalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
 
     matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
-      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), Scalar(-1));
 
     matA.col(i).coeffRef(i+1) = beta;
     hCoeffs.coeffRef(i) = h;
@@ -423,11 +425,13 @@
   *
   * \sa class Tridiagonalization
   */
-template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
-void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+template<typename MatrixType, typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
+void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag,
+                                CoeffVectorType& hcoeffs, bool extractQ)
 {
   eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
-  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, extractQ);
+  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, hcoeffs, extractQ);
 }
 
 /** \internal
@@ -436,14 +440,12 @@
 template<typename MatrixType, int Size, bool IsComplex>
 struct tridiagonalization_inplace_selector
 {
-  typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
-  typedef typename MatrixType::Index Index;
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+  static EIGEN_DEVICE_FUNC
+      void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ)
   {
-    CoeffVectorType hCoeffs(mat.cols()-1);
-    tridiagonalization_inplace(mat,hCoeffs);
+    tridiagonalization_inplace(mat, hCoeffs);
     diag = mat.diagonal().real();
     subdiag = mat.template diagonal<-1>().real();
     if(extractQ)
@@ -463,13 +465,14 @@
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
 
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ)
   {
     using std::sqrt;
+    const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
     diag[0] = mat(0,0);
     RealScalar v1norm2 = numext::abs2(mat(2,0));
-    if(v1norm2 == RealScalar(0))
+    if(v1norm2 <= tol)
     {
       diag[1] = mat(1,1);
       diag[2] = mat(2,2);
@@ -507,8 +510,9 @@
 {
   typedef typename MatrixType::Scalar Scalar;
 
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+  static EIGEN_DEVICE_FUNC
+  void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ)
   {
     diag(0,0) = numext::real(mat(0,0));
     if(extractQ)
@@ -526,7 +530,6 @@
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType
 : public ReturnByValue<TridiagonalizationMatrixTReturnType<MatrixType> >
 {
-    typedef typename MatrixType::Index Index;
   public:
     /** \brief Constructor.
       *
@@ -543,8 +546,8 @@
       result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
     }
 
-    Index rows() const { return m_matrix.rows(); }
-    Index cols() const { return m_matrix.cols(); }
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
   protected:
     typename MatrixType::Nested m_matrix;

diff --git a/Eigen/src/Geometry/AlignedBox.h b/Eigen/src/Geometry/AlignedBox.h
index b6a2f0e..55a9d0a 100644
--- a/Eigen/src/Geometry/AlignedBox.h
+++ b/Eigen/src/Geometry/AlignedBox.h

@@ -7,10 +7,46 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+// Function void Eigen::AlignedBox::transform(const Transform& transform)
+// is provided under the following license agreement:
+//
+// Software License Agreement (BSD License)
+//
+// Copyright (c) 2011-2014, Willow Garage, Inc.
+// Copyright (c) 2014-2015, Open Source Robotics Foundation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//  * Neither the name of Open Source Robotics Foundation nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
 #ifndef EIGEN_ALIGNEDBOX_H
 #define EIGEN_ALIGNEDBOX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
   *
@@ -19,10 +55,12 @@
   *
   * \brief An axis aligned box
   *
-  * \param _Scalar the type of the scalar coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  * \tparam _Scalar the type of the scalar coefficients
+  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
   *
   * This class represents an axis aligned box as a pair of the minimal and maximal corners.
+  * \warning The result of most methods is undefined when applied to an empty box. You can check for empty boxes using isEmpty().
+  * \sa alignedboxtypedefs
   */
 template <typename _Scalar, int _AmbientDim>
 class AlignedBox
@@ -32,102 +70,104 @@
   enum { AmbientDimAtCompileTime = _AmbientDim };
   typedef _Scalar                                   Scalar;
   typedef NumTraits<Scalar>                         ScalarTraits;
-  typedef DenseIndex                                Index;
+  typedef Eigen::Index                              Index; ///< \deprecated since Eigen 3.3
   typedef typename ScalarTraits::Real               RealScalar;
-  typedef typename ScalarTraits::NonInteger      NonInteger;
+  typedef typename ScalarTraits::NonInteger         NonInteger;
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1>  VectorType;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> VectorTypeSum;
 
   /** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
   enum CornerType
   {
-    /** 1D names */
+    /** 1D names @{ */
     Min=0, Max=1,
+    /** @} */
 
-    /** Added names for 2D */
+    /** Identifier for 2D corner @{ */
     BottomLeft=0, BottomRight=1,
     TopLeft=2, TopRight=3,
+    /** @} */
 
-    /** Added names for 3D */
+    /** Identifier for 3D corner  @{ */
     BottomLeftFloor=0, BottomRightFloor=1,
     TopLeftFloor=2, TopRightFloor=3,
     BottomLeftCeil=4, BottomRightCeil=5,
     TopLeftCeil=6, TopRightCeil=7
+    /** @} */
   };
 
 
   /** Default constructor initializing a null box. */
-  inline AlignedBox()
-  { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); }
+  EIGEN_DEVICE_FUNC inline AlignedBox()
+  { if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime!=Dynamic)) setEmpty(); }
 
   /** Constructs a null box with \a _dim the dimension of the ambient space. */
-  inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
   { setEmpty(); }
 
-  /** Constructs a box with extremities \a _min and \a _max. */
+  /** Constructs a box with extremities \a _min and \a _max.
+   * \warning If either component of \a _min is larger than the same component of \a _max, the constructed box is empty. */
   template<typename OtherVectorType1, typename OtherVectorType2>
-  inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
+  EIGEN_DEVICE_FUNC inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
 
   /** Constructs a box containing a single point \a p. */
   template<typename Derived>
-  inline explicit AlignedBox(const MatrixBase<Derived>& a_p)
-  {
-    typename internal::nested<Derived,2>::type p(a_p.derived());
-    m_min = p;
-    m_max = p;
-  }
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min)
+  { }
 
-  ~AlignedBox() {}
+  EIGEN_DEVICE_FUNC ~AlignedBox() {}
 
   /** \returns the dimension in which the box holds */
-  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
 
-  /** \deprecated use isEmpty */
-  inline bool isNull() const { return isEmpty(); }
+  /** \deprecated use isEmpty() */
+  EIGEN_DEVICE_FUNC inline bool isNull() const { return isEmpty(); }
 
-  /** \deprecated use setEmpty */
-  inline void setNull() { setEmpty(); }
+  /** \deprecated use setEmpty() */
+  EIGEN_DEVICE_FUNC inline void setNull() { setEmpty(); }
 
-  /** \returns true if the box is empty. */
-  inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
+  /** \returns true if the box is empty.
+   * \sa setEmpty */
+  EIGEN_DEVICE_FUNC inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
 
-  /** Makes \c *this an empty box. */
-  inline void setEmpty()
+  /** Makes \c *this an empty box.
+   * \sa isEmpty */
+  EIGEN_DEVICE_FUNC inline void setEmpty()
   {
     m_min.setConstant( ScalarTraits::highest() );
     m_max.setConstant( ScalarTraits::lowest() );
   }
 
   /** \returns the minimal corner */
-  inline const VectorType& (min)() const { return m_min; }
+  EIGEN_DEVICE_FUNC inline const VectorType& (min)() const { return m_min; }
   /** \returns a non const reference to the minimal corner */
-  inline VectorType& (min)() { return m_min; }
+  EIGEN_DEVICE_FUNC inline VectorType& (min)() { return m_min; }
   /** \returns the maximal corner */
-  inline const VectorType& (max)() const { return m_max; }
+  EIGEN_DEVICE_FUNC inline const VectorType& (max)() const { return m_max; }
   /** \returns a non const reference to the maximal corner */
-  inline VectorType& (max)() { return m_max; }
+  EIGEN_DEVICE_FUNC inline VectorType& (max)() { return m_max; }
 
   /** \returns the center of the box */
-  inline const CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>,
-                            const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> >
+  EIGEN_DEVICE_FUNC inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(VectorTypeSum, RealScalar, quotient)
   center() const
-  { return (m_min+m_max)/2; }
+  { return (m_min+m_max)/RealScalar(2); }
 
   /** \returns the lengths of the sides of the bounding box.
     * Note that this function does not get the same
     * result for integral or floating scalar types: see
     */
-  inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> sizes() const
+  EIGEN_DEVICE_FUNC inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar,Scalar>, const VectorType, const VectorType> sizes() const
   { return m_max - m_min; }
 
   /** \returns the volume of the bounding box */
-  inline Scalar volume() const
+  EIGEN_DEVICE_FUNC inline Scalar volume() const
   { return sizes().prod(); }
 
   /** \returns an expression for the bounding box diagonal vector
     * if the length of the diagonal is needed: diagonal().norm()
     * will provide it.
     */
-  inline CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> diagonal() const
+  EIGEN_DEVICE_FUNC inline CwiseBinaryOp< internal::scalar_difference_op<Scalar,Scalar>, const VectorType, const VectorType> diagonal() const
   { return sizes(); }
 
   /** \returns the vertex of the bounding box at the corner defined by
@@ -139,7 +179,7 @@
     * For 3D bounding boxes, the following names are added:
     * BottomLeftCeil, BottomRightCeil, TopLeftCeil, TopRightCeil.
     */
-  inline VectorType corner(CornerType corner) const
+  EIGEN_DEVICE_FUNC inline VectorType corner(CornerType corner) const
   {
     EIGEN_STATIC_ASSERT(_AmbientDim <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE);
 
@@ -157,9 +197,9 @@
 
   /** \returns a random point inside the bounding box sampled with
    * a uniform distribution */
-  inline VectorType sample() const
+  EIGEN_DEVICE_FUNC inline VectorType sample() const
   {
-    VectorType r;
+    VectorType r(dim());
     for(Index d=0; d<dim(); ++d)
     {
       if(!ScalarTraits::IsInteger)
@@ -175,91 +215,158 @@
 
   /** \returns true if the point \a p is inside the box \c *this. */
   template<typename Derived>
-  inline bool contains(const MatrixBase<Derived>& a_p) const
+  EIGEN_DEVICE_FUNC inline bool contains(const MatrixBase<Derived>& p) const
   {
-    typename internal::nested<Derived,2>::type p(a_p.derived());
-    return (m_min.array()<=p.array()).all() && (p.array()<=m_max.array()).all();
+    typename internal::nested_eval<Derived,2>::type p_n(p.derived());
+    return (m_min.array()<=p_n.array()).all() && (p_n.array()<=m_max.array()).all();
   }
 
   /** \returns true if the box \a b is entirely inside the box \c *this. */
-  inline bool contains(const AlignedBox& b) const
+  EIGEN_DEVICE_FUNC inline bool contains(const AlignedBox& b) const
   { return (m_min.array()<=(b.min)().array()).all() && ((b.max)().array()<=m_max.array()).all(); }
 
-  /** \returns true if the box \a b is intersecting the box \c *this. */
-  inline bool intersects(const AlignedBox& b) const
+  /** \returns true if the box \a b is intersecting the box \c *this.
+   * \sa intersection, clamp */
+  EIGEN_DEVICE_FUNC inline bool intersects(const AlignedBox& b) const
   { return (m_min.array()<=(b.max)().array()).all() && ((b.min)().array()<=m_max.array()).all(); }
 
-  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
+  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this.
+   * \sa extend(const AlignedBox&) */
   template<typename Derived>
-  inline AlignedBox& extend(const MatrixBase<Derived>& a_p)
+  EIGEN_DEVICE_FUNC inline AlignedBox& extend(const MatrixBase<Derived>& p)
   {
-    typename internal::nested<Derived,2>::type p(a_p.derived());
-    m_min = m_min.cwiseMin(p);
-    m_max = m_max.cwiseMax(p);
+    typename internal::nested_eval<Derived,2>::type p_n(p.derived());
+    m_min = m_min.cwiseMin(p_n);
+    m_max = m_max.cwiseMax(p_n);
     return *this;
   }
 
-  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& extend(const AlignedBox& b)
+  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this.
+   * \sa merged, extend(const MatrixBase&) */
+  EIGEN_DEVICE_FUNC inline AlignedBox& extend(const AlignedBox& b)
   {
     m_min = m_min.cwiseMin(b.m_min);
     m_max = m_max.cwiseMax(b.m_max);
     return *this;
   }
 
-  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& clamp(const AlignedBox& b)
+  /** Clamps \c *this by the box \a b and returns a reference to \c *this.
+   * \note If the boxes don't intersect, the resulting box is empty.
+   * \sa intersection(), intersects() */
+  EIGEN_DEVICE_FUNC inline AlignedBox& clamp(const AlignedBox& b)
   {
     m_min = m_min.cwiseMax(b.m_min);
     m_max = m_max.cwiseMin(b.m_max);
     return *this;
   }
 
-  /** Returns an AlignedBox that is the intersection of \a b and \c *this */
-  inline AlignedBox intersection(const AlignedBox& b) const
+  /** Returns an AlignedBox that is the intersection of \a b and \c *this
+   * \note If the boxes don't intersect, the resulting box is empty.
+   * \sa intersects(), clamp, contains()  */
+  EIGEN_DEVICE_FUNC inline AlignedBox intersection(const AlignedBox& b) const
   {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
 
-  /** Returns an AlignedBox that is the union of \a b and \c *this */
-  inline AlignedBox merged(const AlignedBox& b) const
+  /** Returns an AlignedBox that is the union of \a b and \c *this.
+   * \note Merging with an empty box may result in a box bigger than \c *this.
+   * \sa extend(const AlignedBox&) */
+  EIGEN_DEVICE_FUNC inline AlignedBox merged(const AlignedBox& b) const
   { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
 
   /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
   template<typename Derived>
-  inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
+  EIGEN_DEVICE_FUNC inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
   {
-    const typename internal::nested<Derived,2>::type t(a_t.derived());
+    const typename internal::nested_eval<Derived,2>::type t(a_t.derived());
     m_min += t;
     m_max += t;
     return *this;
   }
 
+  /** \returns a copy of \c *this translated by the vector \a t. */
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC inline AlignedBox translated(const MatrixBase<Derived>& a_t) const
+  {
+    AlignedBox result(m_min, m_max);
+    result.translate(a_t);
+    return result;
+  }
+
   /** \returns the squared distance between the point \a p and the box \c *this,
     * and zero if \a p is inside the box.
-    * \sa exteriorDistance()
+    * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
     */
   template<typename Derived>
-  inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& a_p) const;
+  EIGEN_DEVICE_FUNC inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
 
   /** \returns the squared distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
-    * \sa exteriorDistance()
+    * \sa exteriorDistance(const AlignedBox&), squaredExteriorDistance(const MatrixBase&)
     */
-  inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
+  EIGEN_DEVICE_FUNC inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
 
   /** \returns the distance between the point \a p and the box \c *this,
     * and zero if \a p is inside the box.
-    * \sa squaredExteriorDistance()
+    * \sa squaredExteriorDistance(const MatrixBase&), exteriorDistance(const AlignedBox&)
     */
   template<typename Derived>
-  inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
-  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(p))); }
+  EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
+  { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); }
 
   /** \returns the distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
-    * \sa squaredExteriorDistance()
+    * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
     */
-  inline NonInteger exteriorDistance(const AlignedBox& b) const
-  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(b))); }
+  EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const AlignedBox& b) const
+  { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); }
+
+  /**
+   * Specialization of transform for pure translation.
+   */
+  template<int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline void transform(
+      const typename Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>::TranslationType& translation)
+  {
+    this->translate(translation);
+  }
+
+  /**
+   * Transforms this box by \a transform and recomputes it to
+   * still be an axis-aligned box.
+   *
+   * \note This method is provided under BSD license (see the top of this file).
+   */
+  template<int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline void transform(const Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>& transform)
+  {
+    // Only Affine and Isometry transforms are currently supported.
+    EIGEN_STATIC_ASSERT(Mode == Affine || Mode == AffineCompact || Mode == Isometry, THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS);
+
+    // Method adapted from FCL src/shape/geometric_shapes_utility.cpp#computeBV<AABB, Box>(...)
+    // https://github.com/flexible-collision-library/fcl/blob/fcl-0.4/src/shape/geometric_shapes_utility.cpp#L292
+    //
+    // Here's a nice explanation why it works: https://zeuxcg.org/2010/10/17/aabb-from-obb-with-component-wise-abs/
+
+    // two times rotated extent
+    const VectorType rotated_extent_2 = transform.linear().cwiseAbs() * sizes();
+    // two times new center
+    const VectorType rotated_center_2 = transform.linear() * (this->m_max + this->m_min) +
+        Scalar(2) * transform.translation();
+
+    this->m_max = (rotated_center_2 + rotated_extent_2) / Scalar(2);
+    this->m_min = (rotated_center_2 - rotated_extent_2) / Scalar(2);
+  }
+
+  /**
+   * \returns a copy of \c *this transformed by \a transform and recomputed to
+   * still be an axis-aligned box.
+   */
+  template<int Mode, int Options>
+  EIGEN_DEVICE_FUNC AlignedBox transformed(const Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>& transform) const
+  {
+    AlignedBox result(m_min, m_max);
+    result.transform(transform);
+    return result;
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -267,7 +374,7 @@
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<AlignedBox,
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<AlignedBox,
            AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
   {
     return typename internal::cast_return_type<AlignedBox,
@@ -276,7 +383,7 @@
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
   {
     m_min = (other.min)().template cast<Scalar>();
     m_max = (other.max)().template cast<Scalar>();
@@ -286,7 +393,7 @@
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const AlignedBox& other, const RealScalar& prec = ScalarTraits::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const AlignedBox& other, const RealScalar& prec = ScalarTraits::dummy_precision()) const
   { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
 
 protected:
@@ -298,9 +405,9 @@
 
 template<typename Scalar,int AmbientDim>
 template<typename Derived>
-inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
+EIGEN_DEVICE_FUNC inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
 {
-  typename internal::nested<Derived,2*AmbientDim>::type p(a_p.derived());
+  typename internal::nested_eval<Derived,2*AmbientDim>::type p(a_p.derived());
   Scalar dist2(0);
   Scalar aux;
   for (Index k=0; k<dim(); ++k)
@@ -320,7 +427,7 @@
 }
 
 template<typename Scalar,int AmbientDim>
-inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const
+EIGEN_DEVICE_FUNC inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const
 {
   Scalar dist2(0);
   Scalar aux;

diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 636712c..78328b6 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h

@@ -69,52 +69,61 @@
 public:
 
   /** Default constructor without initialization. */
-  AngleAxis() {}
+  EIGEN_DEVICE_FUNC AngleAxis() {}
   /** Constructs and initialize the angle-axis rotation from an \a angle in radian
     * and an \a axis which \b must \b be \b normalized.
     *
     * \warning If the \a axis vector is not normalized, then the angle-axis object
     *          represents an invalid rotation. */
   template<typename Derived>
+  EIGEN_DEVICE_FUNC 
   inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
   /** Constructs and initialize the angle-axis rotation from a quaternion \a q.
     * This function implicitly normalizes the quaternion \a q.
     */
-  template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
+  template<typename QuatDerived> 
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
   /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
   template<typename Derived>
-  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
 
-  Scalar angle() const { return m_angle; }
-  Scalar& angle() { return m_angle; }
+  /** \returns the value of the rotation angle in radian */
+  EIGEN_DEVICE_FUNC Scalar angle() const { return m_angle; }
+  /** \returns a read-write reference to the stored angle in radian */
+  EIGEN_DEVICE_FUNC Scalar& angle() { return m_angle; }
 
-  const Vector3& axis() const { return m_axis; }
-  Vector3& axis() { return m_axis; }
+  /** \returns the rotation axis */
+  EIGEN_DEVICE_FUNC const Vector3& axis() const { return m_axis; }
+  /** \returns a read-write reference to the stored rotation axis.
+    *
+    * \warning The rotation axis must remain a \b unit vector.
+    */
+  EIGEN_DEVICE_FUNC Vector3& axis() { return m_axis; }
 
   /** Concatenates two rotations */
-  inline QuaternionType operator* (const AngleAxis& other) const
+  EIGEN_DEVICE_FUNC inline QuaternionType operator* (const AngleAxis& other) const
   { return QuaternionType(*this) * QuaternionType(other); }
 
   /** Concatenates two rotations */
-  inline QuaternionType operator* (const QuaternionType& other) const
+  EIGEN_DEVICE_FUNC inline QuaternionType operator* (const QuaternionType& other) const
   { return QuaternionType(*this) * other; }
 
   /** Concatenates two rotations */
-  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
+  friend EIGEN_DEVICE_FUNC inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
   { return a * QuaternionType(b); }
 
   /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
-  AngleAxis inverse() const
+  EIGEN_DEVICE_FUNC AngleAxis inverse() const
   { return AngleAxis(-m_angle, m_axis); }
 
   template<class QuatDerived>
-  AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
+  EIGEN_DEVICE_FUNC AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
   template<typename Derived>
-  AngleAxis& operator=(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC AngleAxis& operator=(const MatrixBase<Derived>& m);
 
   template<typename Derived>
-  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix3 toRotationMatrix(void) const;
+  EIGEN_DEVICE_FUNC AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix(void) const;
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -122,24 +131,24 @@
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
   { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
   {
     m_axis = other.axis().template cast<Scalar>();
     m_angle = Scalar(other.angle());
   }
 
-  static inline const AngleAxis Identity() { return AngleAxis(0, Vector3::UnitX()); }
+  EIGEN_DEVICE_FUNC static inline const AngleAxis Identity() { return AngleAxis(Scalar(0), Vector3::UnitX()); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_axis.isApprox(other.m_axis, prec) && internal::isApprox(m_angle,other.m_angle, prec); }
 };
 
@@ -151,27 +160,32 @@
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   * 
   * This function implicitly normalizes the quaternion \a q.
   */
 template<typename Scalar>
 template<typename QuatDerived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
-  using std::atan2;
+  EIGEN_USING_STD(atan2)
+  EIGEN_USING_STD(abs)
   Scalar n = q.vec().norm();
   if(n<NumTraits<Scalar>::epsilon())
     n = q.vec().stableNorm();
-  if (n > Scalar(0))
+
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < Scalar(0))
+      n = -n;
     m_axis  = q.vec() / n;
   }
   else
   {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
+    m_angle = Scalar(0);
+    m_axis << Scalar(1), Scalar(0), Scalar(0);
   }
   return *this;
 }
@@ -180,7 +194,7 @@
   */
 template<typename Scalar>
 template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
 {
   // Since a direct conversion would not be really faster,
   // let's use the robust Quaternion implementation:
@@ -192,7 +206,7 @@
 **/
 template<typename Scalar>
 template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
 {
   return *this = QuaternionType(mat);
 }
@@ -201,10 +215,10 @@
   */
 template<typename Scalar>
 typename AngleAxis<Scalar>::Matrix3
-AngleAxis<Scalar>::toRotationMatrix(void) const
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>::toRotationMatrix(void) const
 {
-  using std::sin;
-  using std::cos;
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   Matrix3 res;
   Vector3 sin_axis  = sin(m_angle) * m_axis;
   Scalar c = cos(m_angle);

diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index 82802fb..19b734c 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h

@@ -33,12 +33,12 @@
   * \sa class AngleAxis
   */
 template<typename Derived>
-inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
+EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
 MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
 {
-  using std::atan2;
-  using std::sin;
-  using std::cos;
+  EIGEN_USING_STD(atan2)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   /* Implemented from Graphics Gems IV */
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3)
 
@@ -55,7 +55,12 @@
     res[0] = atan2(coeff(j,i), coeff(k,i));
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
     {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
       res[1] = -atan2(s2, coeff(i,i));
     }
@@ -84,7 +89,12 @@
     res[0] = atan2(coeff(j,k), coeff(k,k));
     Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       res[1] = atan2(-coeff(i,k), -c2);
     }
     else

diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 00e71d1..94083ac 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_HOMOGENEOUS_H
 #define EIGEN_HOMOGENEOUS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
   *
@@ -34,7 +34,7 @@
  : traits<MatrixType>
 {
   typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
@@ -48,8 +48,7 @@
     TmpFlags = _MatrixTypeNested::Flags & HereditaryBits,
     Flags = ColsAtCompileTime==1 ? (TmpFlags & ~RowMajorBit)
           : RowsAtCompileTime==1 ? (TmpFlags | RowMajorBit)
-          : TmpFlags,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+          : TmpFlags
   };
 };
 
@@ -59,102 +58,119 @@
 } // end namespace internal
 
 template<typename MatrixType,int _Direction> class Homogeneous
-  : internal::no_assignment_operator, public MatrixBase<Homogeneous<MatrixType,_Direction> >
+  : public MatrixBase<Homogeneous<MatrixType,_Direction> >, internal::no_assignment_operator
 {
   public:
 
+    typedef MatrixType NestedExpression;
     enum { Direction = _Direction };
 
     typedef MatrixBase<Homogeneous> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous)
 
-    inline Homogeneous(const MatrixType& matrix)
+    EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix)
       : m_matrix(matrix)
     {}
 
-    inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
-    inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
 
-    inline Scalar coeff(Index row, Index col) const
-    {
-      if(  (int(Direction)==Vertical   && row==m_matrix.rows())
-        || (int(Direction)==Horizontal && col==m_matrix.cols()))
-        return 1;
-      return m_matrix.coeff(row, col);
-    }
+    EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
 
     template<typename Rhs>
-    inline const internal::homogeneous_right_product_impl<Homogeneous,Rhs>
+    EIGEN_DEVICE_FUNC inline const Product<Homogeneous,Rhs>
     operator* (const MatrixBase<Rhs>& rhs) const
     {
       eigen_assert(int(Direction)==Horizontal);
-      return internal::homogeneous_right_product_impl<Homogeneous,Rhs>(m_matrix,rhs.derived());
+      return Product<Homogeneous,Rhs>(*this,rhs.derived());
     }
 
     template<typename Lhs> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Lhs>
+    EIGEN_DEVICE_FUNC inline const Product<Lhs,Homogeneous>
     operator* (const MatrixBase<Lhs>& lhs, const Homogeneous& rhs)
     {
       eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Lhs>(lhs.derived(),rhs.m_matrix);
+      return Product<Lhs,Homogeneous>(lhs.derived(),rhs);
     }
 
     template<typename Scalar, int Dim, int Mode, int Options> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >
+    EIGEN_DEVICE_FUNC inline const Product<Transform<Scalar,Dim,Mode,Options>, Homogeneous >
     operator* (const Transform<Scalar,Dim,Mode,Options>& lhs, const Homogeneous& rhs)
     {
       eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >(lhs,rhs.m_matrix);
+      return Product<Transform<Scalar,Dim,Mode,Options>, Homogeneous>(lhs,rhs);
+    }
+
+    template<typename Func>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::result_of<Func(Scalar,Scalar)>::type
+    redux(const Func& func) const
+    {
+      return func(m_matrix.redux(func), Scalar(1));
     }
 
   protected:
     typename MatrixType::Nested m_matrix;
 };
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \return an expression of the equivalent homogeneous vector
+  * \returns a vector expression that is one longer than the vector argument, with the value 1 symbolically appended as the last coefficient.
+  *
+  * This can be used to convert affine coordinates to homogeneous coordinates.
   *
   * \only_for_vectors
   *
   * Example: \include MatrixBase_homogeneous.cpp
   * Output: \verbinclude MatrixBase_homogeneous.out
   *
-  * \sa class Homogeneous
+  * \sa VectorwiseOp::homogeneous(), class Homogeneous
   */
 template<typename Derived>
-inline typename MatrixBase<Derived>::HomogeneousReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::HomogeneousReturnType
 MatrixBase<Derived>::homogeneous() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return derived();
+  return HomogeneousReturnType(derived());
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \returns a matrix expression of homogeneous column (or row) vectors
+  * \returns an expression where the value 1 is symbolically appended as the final coefficient to each column (or row) of the matrix.
+  *
+  * This can be used to convert affine coordinates to homogeneous coordinates.
   *
   * Example: \include VectorwiseOp_homogeneous.cpp
   * Output: \verbinclude VectorwiseOp_homogeneous.out
   *
-  * \sa MatrixBase::homogeneous() */
+  * \sa MatrixBase::homogeneous(), class Homogeneous */
 template<typename ExpressionType, int Direction>
-inline Homogeneous<ExpressionType,Direction>
+EIGEN_DEVICE_FUNC inline Homogeneous<ExpressionType,Direction>
 VectorwiseOp<ExpressionType,Direction>::homogeneous() const
 {
-  return _expression();
+  return HomogeneousReturnType(_expression());
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \returns an expression of the homogeneous normalized vector of \c *this
+  * \brief homogeneous normalization
+  *
+  * \returns a vector expression of the N-1 first coefficients of \c *this divided by that last coefficient.
+  *
+  * This can be used to convert homogeneous coordinates to affine coordinates.
+  *
+  * It is essentially a shortcut for:
+  * \code
+    this->head(this->size()-1)/this->coeff(this->size()-1);
+    \endcode
   *
   * Example: \include MatrixBase_hnormalized.cpp
   * Output: \verbinclude MatrixBase_hnormalized.out
   *
   * \sa VectorwiseOp::hnormalized() */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::HNormalizedReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::HNormalizedReturnType
 MatrixBase<Derived>::hnormalized() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
@@ -163,16 +179,22 @@
     ColsAtCompileTime==1?1:size()-1) / coeff(size()-1);
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \returns an expression of the homogeneous normalized vector of \c *this
+  * \brief column or row-wise homogeneous normalization
+  *
+  * \returns an expression of the first N-1 coefficients of each column (or row) of \c *this divided by the last coefficient of each column (or row).
+  *
+  * This can be used to convert homogeneous coordinates to affine coordinates.
+  *
+  * It is conceptually equivalent to calling MatrixBase::hnormalized() to each column (or row) of \c *this.
   *
   * Example: \include DirectionWise_hnormalized.cpp
   * Output: \verbinclude DirectionWise_hnormalized.out
   *
   * \sa MatrixBase::hnormalized() */
 template<typename ExpressionType, int Direction>
-inline const typename VectorwiseOp<ExpressionType,Direction>::HNormalizedReturnType
+EIGEN_DEVICE_FUNC inline const typename VectorwiseOp<ExpressionType,Direction>::HNormalizedReturnType
 VectorwiseOp<ExpressionType,Direction>::hnormalized() const
 {
   return HNormalized_Block(_expression(),0,0,
@@ -196,7 +218,7 @@
 struct take_matrix_for_product
 {
   typedef MatrixOrTransformType type;
-  static const type& run(const type &x) { return x; }
+  EIGEN_DEVICE_FUNC static const type& run(const type &x) { return x; }
 };
 
 template<typename Scalar, int Dim, int Mode,int Options>
@@ -204,7 +226,7 @@
 {
   typedef Transform<Scalar, Dim, Mode, Options> TransformType;
   typedef typename internal::add_const<typename TransformType::ConstAffinePart>::type type;
-  static type run (const TransformType& x) { return x.affine(); }
+  EIGEN_DEVICE_FUNC static type run (const TransformType& x) { return x.affine(); }
 };
 
 template<typename Scalar, int Dim, int Options>
@@ -212,7 +234,7 @@
 {
   typedef Transform<Scalar, Dim, Projective, Options> TransformType;
   typedef typename TransformType::MatrixType type;
-  static const type& run (const TransformType& x) { return x.matrix(); }
+  EIGEN_DEVICE_FUNC static const type& run (const TransformType& x) { return x.matrix(); }
 };
 
 template<typename MatrixType,typename Lhs>
@@ -237,16 +259,17 @@
   typedef typename traits<homogeneous_left_product_impl>::LhsMatrixType LhsMatrixType;
   typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
   typedef typename remove_all<typename LhsMatrixTypeCleaned::Nested>::type LhsMatrixTypeNested;
-  typedef typename MatrixType::Index Index;
-  homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
+  EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
     : m_lhs(take_matrix_for_product<Lhs>::run(lhs)),
       m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_lhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
     // FIXME investigate how to allow lazy evaluation of this product when possible
     dst = Block<const LhsMatrixTypeNested,
@@ -277,15 +300,14 @@
   : public ReturnByValue<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
 {
   typedef typename remove_all<typename Rhs::Nested>::type RhsNested;
-  typedef typename MatrixType::Index Index;
-  homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
+  EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
     : m_lhs(lhs), m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_lhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
     // FIXME investigate how to allow lazy evaluation of this product when possible
     dst = m_lhs * Block<const RhsNested,
@@ -300,6 +322,178 @@
   typename Rhs::Nested m_rhs;
 };
 
+template<typename ArgType,int Direction>
+struct evaluator_traits<Homogeneous<ArgType,Direction> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
+  typedef HomogeneousShape Shape;
+};
+
+template<> struct AssignmentKind<DenseShape,HomogeneousShape> { typedef Dense2Dense Kind; };
+
+
+template<typename ArgType,int Direction>
+struct unary_evaluator<Homogeneous<ArgType,Direction>, IndexBased>
+  : evaluator<typename Homogeneous<ArgType,Direction>::PlainObject >
+{
+  typedef Homogeneous<ArgType,Direction> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+    : Base(), m_temp(op)
+  {
+    ::new (static_cast<Base*>(this)) Base(m_temp);
+  }
+
+protected:
+  PlainObject m_temp;
+};
+
+// dense = homogeneous
+template< typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
+{
+  typedef Homogeneous<ArgType,Vertical> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst.template topRows<ArgType::RowsAtCompileTime>(src.nestedExpression().rows()) = src.nestedExpression();
+    dst.row(dst.rows()-1).setOnes();
+  }
+};
+
+// dense = homogeneous
+template< typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
+{
+  typedef Homogeneous<ArgType,Horizontal> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst.template leftCols<ArgType::ColsAtCompileTime>(src.nestedExpression().cols()) = src.nestedExpression();
+    dst.col(dst.cols()-1).setOnes();
+  }
+};
+
+template<typename LhsArg, typename Rhs, int ProductTag>
+struct generic_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs, HomogeneousShape, DenseShape, ProductTag>
+{
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const Homogeneous<LhsArg,Horizontal>& lhs, const Rhs& rhs)
+  {
+    homogeneous_right_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs>(lhs.nestedExpression(), rhs).evalTo(dst);
+  }
+};
+
+template<typename Lhs,typename Rhs>
+struct homogeneous_right_product_refactoring_helper
+{
+  enum {
+    Dim  = Lhs::ColsAtCompileTime,
+    Rows = Lhs::RowsAtCompileTime
+  };
+  typedef typename Rhs::template ConstNRowsBlockXpr<Dim>::Type          LinearBlockConst;
+  typedef typename remove_const<LinearBlockConst>::type                 LinearBlock;
+  typedef typename Rhs::ConstRowXpr                                     ConstantColumn;
+  typedef Replicate<const ConstantColumn,Rows,1>                        ConstantBlock;
+  typedef Product<Lhs,LinearBlock,LazyProduct>                          LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar,typename Rhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, HomogeneousShape, DenseShape>
+ : public evaluator<typename homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs>::Xpr>
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(  xpr.lhs().nestedExpression() .lazyProduct(  xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols()) )
+            + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) )
+  {}
+};
+
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    homogeneous_left_product_impl<Homogeneous<RhsArg,Vertical>, Lhs>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
+
+template<typename Lhs,typename Rhs>
+struct homogeneous_left_product_refactoring_helper
+{
+  enum {
+    Dim = Rhs::RowsAtCompileTime,
+    Cols = Rhs::ColsAtCompileTime
+  };
+  typedef typename Lhs::template ConstNColsBlockXpr<Dim>::Type          LinearBlockConst;
+  typedef typename remove_const<LinearBlockConst>::type                 LinearBlock;
+  typedef typename Lhs::ConstColXpr                                     ConstantColumn;
+  typedef Replicate<const ConstantColumn,1,Cols>                        ConstantBlock;
+  typedef Product<LinearBlock,Rhs,LazyProduct>                          LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar,typename Rhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, HomogeneousShape>
+ : public evaluator<typename homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression>::Xpr>
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(   xpr.lhs().template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() )
+            + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) )
+  {}
+};
+
+template<typename Scalar, int Dim, int Mode,int Options, typename RhsArg, int ProductTag>
+struct generic_product_impl<Transform<Scalar,Dim,Mode,Options>, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
+{
+  typedef Transform<Scalar,Dim,Mode,Options> TransformType;
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const TransformType& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    homogeneous_left_product_impl<Homogeneous<RhsArg,Vertical>, TransformType>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, HomogeneousShape>
+  : public permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{};
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h
index aeff43f..cebe035 100644
--- a/Eigen/src/Geometry/Hyperplane.h
+++ b/Eigen/src/Geometry/Hyperplane.h

@@ -22,8 +22,8 @@
   * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
   * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
   *             Notice that the dimension of the hyperplane is _AmbientDim-1.
   *
   * This class represents an hyperplane as the zero set of the implicit equation
@@ -41,7 +41,7 @@
   };
   typedef _Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
   typedef Matrix<Scalar,Index(AmbientDimAtCompileTime)==Dynamic
                         ? Dynamic
@@ -50,21 +50,21 @@
   typedef const Block<const Coefficients,AmbientDimAtCompileTime,1> ConstNormalReturnType;
 
   /** Default constructor without initialization */
-  inline Hyperplane() {}
+  EIGEN_DEVICE_FUNC inline Hyperplane() {}
   
   template<int OtherOptions>
-  Hyperplane(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC Hyperplane(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
    : m_coeffs(other.coeffs())
   {}
 
   /** Constructs a dynamic-size hyperplane with \a _dim the dimension
     * of the ambient space */
-  inline explicit Hyperplane(Index _dim) : m_coeffs(_dim+1) {}
+  EIGEN_DEVICE_FUNC inline explicit Hyperplane(Index _dim) : m_coeffs(_dim+1) {}
 
   /** Construct a plane from its normal \a n and a point \a e onto the plane.
     * \warning the vector normal is assumed to be normalized.
     */
-  inline Hyperplane(const VectorType& n, const VectorType& e)
+  EIGEN_DEVICE_FUNC inline Hyperplane(const VectorType& n, const VectorType& e)
     : m_coeffs(n.size()+1)
   {
     normal() = n;
@@ -75,7 +75,7 @@
     * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
     * \warning the vector normal is assumed to be normalized.
     */
-  inline Hyperplane(const VectorType& n, const Scalar& d)
+  EIGEN_DEVICE_FUNC inline Hyperplane(const VectorType& n, const Scalar& d)
     : m_coeffs(n.size()+1)
   {
     normal() = n;
@@ -85,7 +85,7 @@
   /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
     * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
     */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
+  EIGEN_DEVICE_FUNC static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
   {
     Hyperplane result(p0.size());
     result.normal() = (p1 - p0).unitOrthogonal();
@@ -96,11 +96,21 @@
   /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
     * is required to be exactly 3.
     */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
+  EIGEN_DEVICE_FUNC static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
   {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
     Hyperplane result(p0.size());
-    result.normal() = (p2 - p0).cross(p1 - p0).normalized();
+    VectorType v0(p2 - p0), v1(p1 - p0);
+    result.normal() = v0.cross(v1);
+    RealScalar norm = result.normal().norm();
+    if(norm <= v0.norm() * v1.norm() * NumTraits<RealScalar>::epsilon())
+    {
+      Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
+      JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
+      result.normal() = svd.matrixV().col(2);
+    }
+    else
+      result.normal() /= norm;
     result.offset() = -p0.dot(result.normal());
     return result;
   }
@@ -109,20 +119,20 @@
     * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
     * so an arbitrary choice is made.
     */
-  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
-  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
+  // FIXME to be consistent with the rest this could be implemented as a static Through function ??
+  EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
   {
     normal() = parametrized.direction().unitOrthogonal();
     offset() = -parametrized.origin().dot(normal());
   }
 
-  ~Hyperplane() {}
+  EIGEN_DEVICE_FUNC ~Hyperplane() {}
 
   /** \returns the dimension in which the plane holds */
-  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_coeffs.size()-1 : Index(AmbientDimAtCompileTime); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_coeffs.size()-1 : Index(AmbientDimAtCompileTime); }
 
   /** normalizes \c *this */
-  void normalize(void)
+  EIGEN_DEVICE_FUNC void normalize(void)
   {
     m_coeffs /= normal().norm();
   }
@@ -130,45 +140,45 @@
   /** \returns the signed distance between the plane \c *this and a point \a p.
     * \sa absDistance()
     */
-  inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
+  EIGEN_DEVICE_FUNC inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
 
   /** \returns the absolute distance between the plane \c *this and a point \a p.
     * \sa signedDistance()
     */
-  inline Scalar absDistance(const VectorType& p) const { using std::abs; return abs(signedDistance(p)); }
+  EIGEN_DEVICE_FUNC inline Scalar absDistance(const VectorType& p) const { return numext::abs(signedDistance(p)); }
 
   /** \returns the projection of a point \a p onto the plane \c *this.
     */
-  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
+  EIGEN_DEVICE_FUNC inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
 
   /** \returns a constant reference to the unit normal vector of the plane, which corresponds
     * to the linear part of the implicit equation.
     */
-  inline ConstNormalReturnType normal() const { return ConstNormalReturnType(m_coeffs,0,0,dim(),1); }
+  EIGEN_DEVICE_FUNC inline ConstNormalReturnType normal() const { return ConstNormalReturnType(m_coeffs,0,0,dim(),1); }
 
   /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
     * to the linear part of the implicit equation.
     */
-  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
+  EIGEN_DEVICE_FUNC inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
 
   /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
     * \warning the vector normal is assumed to be normalized.
     */
-  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
+  EIGEN_DEVICE_FUNC inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
 
   /** \returns a non-constant reference to the distance to the origin, which is also the constant part
     * of the implicit equation */
-  inline Scalar& offset() { return m_coeffs(dim()); }
+  EIGEN_DEVICE_FUNC inline Scalar& offset() { return m_coeffs(dim()); }
 
   /** \returns a constant reference to the coefficients c_i of the plane equation:
     * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
     */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
   /** \returns a non-constant reference to the coefficients c_i of the plane equation:
     * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
     */
-  inline Coefficients& coeffs() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
 
   /** \returns the intersection of *this with \a other.
     *
@@ -176,16 +186,15 @@
     *
     * \note If \a other is approximately parallel to *this, this method will return any point on *this.
     */
-  VectorType intersection(const Hyperplane& other) const
+  EIGEN_DEVICE_FUNC VectorType intersection(const Hyperplane& other) const
   {
-    using std::abs;
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
     Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
     // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
     // whether the two lines are approximately parallel.
     if(internal::isMuchSmallerThan(det, Scalar(1)))
     {   // special case where the two lines are approximately parallel. Pick any point on the first line.
-        if(abs(coeffs().coeff(1))>abs(coeffs().coeff(0)))
+        if(numext::abs(coeffs().coeff(1))>numext::abs(coeffs().coeff(0)))
             return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
         else
             return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
@@ -205,10 +214,13 @@
     *               or a more generic #Affine transformation. The default is #Affine.
     */
   template<typename XprType>
-  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
+  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
   {
     if (traits==Affine)
+    {
       normal() = mat.inverse().transpose() * normal();
+      m_coeffs /= normal().norm();
+    }
     else if (traits==Isometry)
       normal() = mat * normal();
     else
@@ -226,7 +238,7 @@
     *               Other kind of transformations are not supported.
     */
   template<int TrOptions>
-  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
+  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
                                 TransformTraits traits = Affine)
   {
     transform(t.linear(), traits);
@@ -240,7 +252,7 @@
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Hyperplane,
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Hyperplane,
            Hyperplane<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
   {
     return typename internal::cast_return_type<Hyperplane,
@@ -249,7 +261,7 @@
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType,int OtherOptions>
-  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
@@ -257,7 +269,7 @@
     *
     * \sa MatrixBase::isApprox() */
   template<int OtherOptions>
-  bool isApprox(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_coeffs.isApprox(other.m_coeffs, prec); }
 
 protected:

diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h
index 46458f5..524aebe 100644
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h

@@ -13,17 +13,25 @@
 
 namespace Eigen { 
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
   * \returns the cross product of \c *this and \a other
   *
   * Here is a very good explanation of cross-product: http://xkcd.com/199/
+  * 
+  * With complex numbers, the cross product is implemented as
+  * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} - \mathbf{b} \times \mathbf{c})\f$
+  * 
   * \sa MatrixBase::cross3()
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+#else
+typename MatrixBase<Derived>::PlainObject
+#endif
 MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
@@ -31,8 +39,8 @@
 
   // Note that there is no need for an expression here since the compiler
   // optimize such a small temporary very well (even within a complex expression)
-  typename internal::nested<Derived,2>::type lhs(derived());
-  typename internal::nested<OtherDerived,2>::type rhs(other.derived());
+  typename internal::nested_eval<Derived,2>::type lhs(derived());
+  typename internal::nested_eval<OtherDerived,2>::type rhs(other.derived());
   return typename cross_product_return_type<OtherDerived>::type(
     numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
     numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
@@ -46,7 +54,7 @@
           typename Scalar = typename VectorLhs::Scalar,
           bool Vectorizable = bool((VectorLhs::Flags&VectorRhs::Flags)&PacketAccessBit)>
 struct cross3_impl {
-  static inline typename internal::plain_matrix_type<VectorLhs>::type
+  EIGEN_DEVICE_FUNC static inline typename internal::plain_matrix_type<VectorLhs>::type
   run(const VectorLhs& lhs, const VectorRhs& rhs)
   {
     return typename internal::plain_matrix_type<VectorLhs>::type(
@@ -60,7 +68,7 @@
 
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
   * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients
   *
@@ -71,15 +79,14 @@
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,4)
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,4)
 
-  typedef typename internal::nested<Derived,2>::type DerivedNested;
-  typedef typename internal::nested<OtherDerived,2>::type OtherDerivedNested;
+  typedef typename internal::nested_eval<Derived,2>::type DerivedNested;
+  typedef typename internal::nested_eval<OtherDerived,2>::type OtherDerivedNested;
   DerivedNested lhs(derived());
   OtherDerivedNested rhs(other.derived());
 
@@ -88,38 +95,42 @@
                         typename internal::remove_all<OtherDerivedNested>::type>::run(lhs,rhs);
 }
 
-/** \returns a matrix expression of the cross product of each column or row
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \returns a matrix expression of the cross product of each column or row
   * of the referenced expression with the \a other vector.
   *
   * The referenced matrix must have one dimension equal to 3.
   * The result matrix has the same dimensions than the referenced one.
   *
-  * \geometry_module
-  *
   * \sa MatrixBase::cross() */
 template<typename ExpressionType, int Direction>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC 
 const typename VectorwiseOp<ExpressionType,Direction>::CrossReturnType
 VectorwiseOp<ExpressionType,Direction>::cross(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
     YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  
+  typename internal::nested_eval<ExpressionType,2>::type mat(_expression());
+  typename internal::nested_eval<OtherDerived,2>::type vec(other.derived());
 
   CrossReturnType res(_expression().rows(),_expression().cols());
   if(Direction==Vertical)
   {
     eigen_assert(CrossReturnType::RowsAtCompileTime==3 && "the matrix must have exactly 3 rows");
-    res.row(0) = (_expression().row(1) * other.coeff(2) - _expression().row(2) * other.coeff(1)).conjugate();
-    res.row(1) = (_expression().row(2) * other.coeff(0) - _expression().row(0) * other.coeff(2)).conjugate();
-    res.row(2) = (_expression().row(0) * other.coeff(1) - _expression().row(1) * other.coeff(0)).conjugate();
+    res.row(0) = (mat.row(1) * vec.coeff(2) - mat.row(2) * vec.coeff(1)).conjugate();
+    res.row(1) = (mat.row(2) * vec.coeff(0) - mat.row(0) * vec.coeff(2)).conjugate();
+    res.row(2) = (mat.row(0) * vec.coeff(1) - mat.row(1) * vec.coeff(0)).conjugate();
   }
   else
   {
     eigen_assert(CrossReturnType::ColsAtCompileTime==3 && "the matrix must have exactly 3 columns");
-    res.col(0) = (_expression().col(1) * other.coeff(2) - _expression().col(2) * other.coeff(1)).conjugate();
-    res.col(1) = (_expression().col(2) * other.coeff(0) - _expression().col(0) * other.coeff(2)).conjugate();
-    res.col(2) = (_expression().col(0) * other.coeff(1) - _expression().col(1) * other.coeff(0)).conjugate();
+    res.col(0) = (mat.col(1) * vec.coeff(2) - mat.col(2) * vec.coeff(1)).conjugate();
+    res.col(1) = (mat.col(2) * vec.coeff(0) - mat.col(0) * vec.coeff(2)).conjugate();
+    res.col(2) = (mat.col(0) * vec.coeff(1) - mat.col(1) * vec.coeff(0)).conjugate();
   }
   return res;
 }
@@ -132,7 +143,6 @@
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
   typedef Matrix<Scalar,2,1> Vector2;
   EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
@@ -203,7 +213,9 @@
 
 } // end namespace internal
 
-/** \returns a unit vector which is orthogonal to \c *this
+/** \geometry_module \ingroup Geometry_Module
+  *
+  * \returns a unit vector which is orthogonal to \c *this
   *
   * The size of \c *this must be at least 2. If the size is exactly 2,
   * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized().
@@ -211,8 +223,7 @@
   * \sa cross()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
-typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::unitOrthogonal() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)

diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h
index 77fa228..584f500 100644
--- a/Eigen/src/Geometry/ParametrizedLine.h
+++ b/Eigen/src/Geometry/ParametrizedLine.h

@@ -23,8 +23,8 @@
   * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
   * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$.
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 class ParametrizedLine
@@ -37,49 +37,49 @@
   };
   typedef _Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1,Options> VectorType;
 
   /** Default constructor without initialization */
-  inline ParametrizedLine() {}
+  EIGEN_DEVICE_FUNC inline ParametrizedLine() {}
   
   template<int OtherOptions>
-  ParametrizedLine(const ParametrizedLine<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC ParametrizedLine(const ParametrizedLine<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
    : m_origin(other.origin()), m_direction(other.direction())
   {}
 
   /** Constructs a dynamic-size line with \a _dim the dimension
     * of the ambient space */
-  inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
+  EIGEN_DEVICE_FUNC inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
 
   /** Initializes a parametrized line of direction \a direction and origin \a origin.
     * \warning the vector direction is assumed to be normalized.
     */
-  ParametrizedLine(const VectorType& origin, const VectorType& direction)
+  EIGEN_DEVICE_FUNC ParametrizedLine(const VectorType& origin, const VectorType& direction)
     : m_origin(origin), m_direction(direction) {}
 
   template <int OtherOptions>
-  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane);
+  EIGEN_DEVICE_FUNC explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane);
 
   /** Constructs a parametrized line going from \a p0 to \a p1. */
-  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
+  EIGEN_DEVICE_FUNC static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
   { return ParametrizedLine(p0, (p1-p0).normalized()); }
 
-  ~ParametrizedLine() {}
+  EIGEN_DEVICE_FUNC ~ParametrizedLine() {}
 
   /** \returns the dimension in which the line holds */
-  inline Index dim() const { return m_direction.size(); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return m_direction.size(); }
 
-  const VectorType& origin() const { return m_origin; }
-  VectorType& origin() { return m_origin; }
+  EIGEN_DEVICE_FUNC const VectorType& origin() const { return m_origin; }
+  EIGEN_DEVICE_FUNC VectorType& origin() { return m_origin; }
 
-  const VectorType& direction() const { return m_direction; }
-  VectorType& direction() { return m_direction; }
+  EIGEN_DEVICE_FUNC const VectorType& direction() const { return m_direction; }
+  EIGEN_DEVICE_FUNC VectorType& direction() { return m_direction; }
 
   /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
     * \sa distance()
     */
-  RealScalar squaredDistance(const VectorType& p) const
+  EIGEN_DEVICE_FUNC RealScalar squaredDistance(const VectorType& p) const
   {
     VectorType diff = p - origin();
     return (diff - direction().dot(diff) * direction()).squaredNorm();
@@ -87,30 +87,67 @@
   /** \returns the distance of a point \a p to its projection onto the line \c *this.
     * \sa squaredDistance()
     */
-  RealScalar distance(const VectorType& p) const { using std::sqrt; return sqrt(squaredDistance(p)); }
+  EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD(sqrt) return sqrt(squaredDistance(p)); }
 
   /** \returns the projection of a point \a p onto the line \c *this. */
-  VectorType projection(const VectorType& p) const
+  EIGEN_DEVICE_FUNC VectorType projection(const VectorType& p) const
   { return origin() + direction().dot(p-origin()) * direction(); }
 
-  VectorType pointAt(const Scalar& t) const;
+  EIGEN_DEVICE_FUNC VectorType pointAt(const Scalar& t) const;
   
   template <int OtherOptions>
-  Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
  
   template <int OtherOptions>
-  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
   
   template <int OtherOptions>
-  VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
 
-  /** \returns \c *this with scalar type casted to \a NewScalarType
+  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
+    *
+    * \param mat the Dim x Dim transformation matrix
+    * \param traits specifies whether the matrix \a mat represents an #Isometry
+    *               or a more generic #Affine transformation. The default is #Affine.
+    */
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
+  {
+    if (traits==Affine)
+      direction() = (mat * direction()).normalized();
+    else if (traits==Isometry)
+      direction() = mat * direction();
+    else
+    {
+      eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()");
+    }
+    origin() = mat * origin();
+    return *this;
+  }
+
+  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
+    *
+    * \param t the transformation of dimension Dim
+    * \param traits specifies whether the transformation \a t represents an #Isometry
+    *               or a more generic #Affine transformation. The default is #Affine.
+    *               Other kind of transformations are not supported.
+    */
+  template<int TrOptions>
+  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
+                                                       TransformTraits traits = Affine)
+  {
+    transform(t.linear(), traits);
+    origin() += t.translation();
+    return *this;
+  }
+
+/** \returns \c *this with scalar type casted to \a NewScalarType
     *
     * Note that if \a NewScalarType is equal to the current scalar type of \c *this
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<ParametrizedLine,
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<ParametrizedLine,
            ParametrizedLine<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
   {
     return typename internal::cast_return_type<ParametrizedLine,
@@ -119,7 +156,7 @@
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType,int OtherOptions>
-  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
   {
     m_origin = other.origin().template cast<Scalar>();
     m_direction = other.direction().template cast<Scalar>();
@@ -129,7 +166,7 @@
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const ParametrizedLine& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
 
 protected:
@@ -143,7 +180,7 @@
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane)
+EIGEN_DEVICE_FUNC inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
   direction() = hyperplane.normal().unitOrthogonal();
@@ -153,7 +190,7 @@
 /** \returns the point at \a t along this line
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
-inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
+EIGEN_DEVICE_FUNC inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
 ParametrizedLine<_Scalar, _AmbientDim,_Options>::pointAt(const _Scalar& t) const
 {
   return origin() + (direction()*t); 
@@ -163,7 +200,7 @@
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
 {
   return -(hyperplane.offset()+hyperplane.normal().dot(origin()))
           / hyperplane.normal().dot(direction());
@@ -175,7 +212,7 @@
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
+EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
 {
   return intersectionParameter(hyperplane);
 }
@@ -184,7 +221,7 @@
   */
 template <typename _Scalar, int _AmbientDim, int _Options>
 template <int OtherOptions>
-inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
+EIGEN_DEVICE_FUNC inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
 ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
 {
   return pointAt(intersectionParameter(hyperplane));

diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 8524bef..3259e59 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h

@@ -43,6 +43,11 @@
   typedef typename internal::traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::traits<Derived>::Coefficients Coefficients;
+  typedef typename Coefficients::CoeffReturnType CoeffReturnType;
+  typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
+                                        Scalar&, CoeffReturnType>::type NonConstCoeffReturnType;
+
+
   enum {
     Flags = Eigen::internal::traits<Derived>::Flags
   };
@@ -58,37 +63,37 @@
 
 
   /** \returns the \c x coefficient */
-  inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+  EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
   /** \returns the \c y coefficient */
-  inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+  EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
   /** \returns the \c z coefficient */
-  inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+  EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
   /** \returns the \c w coefficient */
-  inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+  EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
 
-  /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+  /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+  /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+  /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+  /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
+  EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
 
   /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline VectorBlock<Coefficients,3> vec() { return coeffs().template head<3>(); }
+  EIGEN_DEVICE_FUNC inline VectorBlock<Coefficients,3> vec() { return coeffs().template head<3>(); }
 
   /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const typename internal::traits<Derived>::Coefficients& coeffs() const { return derived().coeffs(); }
+  EIGEN_DEVICE_FUNC inline const typename internal::traits<Derived>::Coefficients& coeffs() const { return derived().coeffs(); }
 
   /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
 
-  EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
-  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
+  template<class OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
 
 // disabled this copy operator as it is giving very strange compilation errors when compiling
 // test_stdvector with GCC 4.4.2. This looks like a GCC bug though, so feel free to re-enable it if it's
@@ -97,87 +102,128 @@
 //  Derived& operator=(const QuaternionBase& other)
 //  { return operator=<Derived>(other); }
 
-  Derived& operator=(const AngleAxisType& aa);
-  template<class OtherDerived> Derived& operator=(const MatrixBase<OtherDerived>& m);
+  EIGEN_DEVICE_FUNC Derived& operator=(const AngleAxisType& aa);
+  template<class OtherDerived> EIGEN_DEVICE_FUNC Derived& operator=(const MatrixBase<OtherDerived>& m);
 
   /** \returns a quaternion representing an identity rotation
     * \sa MatrixBase::Identity()
     */
-  static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(1, 0, 0, 0); }
+  EIGEN_DEVICE_FUNC static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(Scalar(1), Scalar(0), Scalar(0), Scalar(0)); }
 
   /** \sa QuaternionBase::Identity(), MatrixBase::setIdentity()
     */
-  inline QuaternionBase& setIdentity() { coeffs() << 0, 0, 0, 1; return *this; }
+  EIGEN_DEVICE_FUNC inline QuaternionBase& setIdentity() { coeffs() << Scalar(0), Scalar(0), Scalar(0), Scalar(1); return *this; }
 
   /** \returns the squared norm of the quaternion's coefficients
     * \sa QuaternionBase::norm(), MatrixBase::squaredNorm()
     */
-  inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
+  EIGEN_DEVICE_FUNC inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
 
   /** \returns the norm of the quaternion's coefficients
     * \sa QuaternionBase::squaredNorm(), MatrixBase::norm()
     */
-  inline Scalar norm() const { return coeffs().norm(); }
+  EIGEN_DEVICE_FUNC inline Scalar norm() const { return coeffs().norm(); }
 
   /** Normalizes the quaternion \c *this
     * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { coeffs().normalize(); }
+  EIGEN_DEVICE_FUNC inline void normalize() { coeffs().normalize(); }
   /** \returns a normalized copy of \c *this
     * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
+  EIGEN_DEVICE_FUNC inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
 
     /** \returns the dot product of \c *this and \a other
     * Geometrically speaking, the dot product of two unit quaternions
     * corresponds to the cosine of half the angle between the two rotations.
     * \sa angularDistance()
     */
-  template<class OtherDerived> inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
+  template<class OtherDerived> EIGEN_DEVICE_FUNC inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
 
-  template<class OtherDerived> Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
+  template<class OtherDerived> EIGEN_DEVICE_FUNC Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns an equivalent 3x3 rotation matrix */
-  Matrix3 toRotationMatrix() const;
+  EIGEN_DEVICE_FUNC inline Matrix3 toRotationMatrix() const;
 
   /** \returns the quaternion which transform \a a into \a b through a rotation */
   template<typename Derived1, typename Derived2>
-  Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+  EIGEN_DEVICE_FUNC Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  template<class OtherDerived> EIGEN_STRONG_INLINE Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
-  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator*= (const QuaternionBase<OtherDerived>& q);
+  template<class OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
+  template<class OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*= (const QuaternionBase<OtherDerived>& q);
 
   /** \returns the quaternion describing the inverse rotation */
-  Quaternion<Scalar> inverse() const;
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> inverse() const;
 
   /** \returns the conjugated quaternion */
-  Quaternion<Scalar> conjugate() const;
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> conjugate() const;
 
-  template<class OtherDerived> Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
+  template<class OtherDerived> EIGEN_DEVICE_FUNC Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
+
+  /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
+    * \warning When using floating point scalar values you probably should rather use a
+    *          fuzzy comparison such as isApprox()
+    * \sa isApprox(), operator!= */
+  template<class OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator==(const QuaternionBase<OtherDerived>& other) const
+  { return coeffs() == other.coeffs(); }
+
+  /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
+    * \warning When using floating point scalar values you probably should rather use a
+    *          fuzzy comparison such as isApprox()
+    * \sa isApprox(), operator== */
+  template<class OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator!=(const QuaternionBase<OtherDerived>& other) const
+  { return coeffs() != other.coeffs(); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
   template<class OtherDerived>
-  bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
   { return coeffs().isApprox(other.coeffs(), prec); }
 
-	/** return the result vector of \a v through the rotation*/
-  EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const;
+  /** return the result vector of \a v through the rotation*/
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
 
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
     * Note that if \a NewScalarType is equal to the current scalar type of \c *this
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const;
+
+  #else
+
+  template<typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline
+  typename internal::enable_if<internal::is_same<Scalar,NewScalarType>::value,const Derived&>::type cast() const
   {
-    return typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type(derived());
+    return derived();
   }
 
+  template<typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline
+  typename internal::enable_if<!internal::is_same<Scalar,NewScalarType>::value,Quaternion<NewScalarType> >::type cast() const
+  {
+    return Quaternion<NewScalarType>(coeffs().template cast<NewScalarType>());
+  }
+  #endif
+
+#ifndef EIGEN_NO_IO
+  friend std::ostream& operator<<(std::ostream& s, const QuaternionBase<Derived>& q) {
+    s << q.x() << "i + " << q.y() << "j + " << q.z() << "k" << " + " << q.w();
+    return s;
+  }
+#endif
+
 #ifdef EIGEN_QUATERNIONBASE_PLUGIN
 # include EIGEN_QUATERNIONBASE_PLUGIN
 #endif
+protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase)
 };
 
 /***************************************************************************
@@ -217,8 +263,8 @@
   typedef _Scalar Scalar;
   typedef Matrix<_Scalar,4,1,_Options> Coefficients;
   enum{
-    IsAligned = internal::traits<Coefficients>::Flags & AlignedBit,
-    Flags = IsAligned ? (AlignedBit | LvalueBit) : LvalueBit
+    Alignment = internal::traits<Coefficients>::Alignment,
+    Flags = LvalueBit
   };
 };
 }
@@ -228,18 +274,18 @@
 {
 public:
   typedef QuaternionBase<Quaternion<_Scalar,_Options> > Base;
-  enum { IsAligned = internal::traits<Quaternion>::IsAligned };
+  enum { NeedsAlignment = internal::traits<Quaternion>::Alignment>0 };
 
   typedef _Scalar Scalar;
 
-  EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Quaternion)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Quaternion)
   using Base::operator*=;
 
   typedef typename internal::traits<Quaternion>::Coefficients Coefficients;
   typedef typename Base::AngleAxisType AngleAxisType;
 
   /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
+  EIGEN_DEVICE_FUNC inline Quaternion() {}
 
   /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
     * its four coefficients \a w, \a x, \a y and \a z.
@@ -248,36 +294,57 @@
     * while internally the coefficients are stored in the following order:
     * [\c x, \c y, \c z, \c w]
     */
-  inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
+  EIGEN_DEVICE_FUNC inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
 
   /** Constructs and initialize a quaternion from the array data */
-  inline Quaternion(const Scalar* data) : m_coeffs(data) {}
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const Scalar* data) : m_coeffs(data) {}
 
   /** Copy constructor */
-  template<class Derived> EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
+  template<class Derived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
 
   /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
 
   /** Constructs and initializes a quaternion from either:
     *  - a rotation matrix expression,
     *  - a 4D vector expression representing quaternion coefficients.
     */
   template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
 
   /** Explicit copy constructor with scalar conversion */
   template<typename OtherScalar, int OtherOptions>
-  explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+  // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
+  /** Default move constructor */
+  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+    : m_coeffs(std::move(other.coeffs()))
+  {}
+
+  /** Default move assignment operator */
+  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+  {
+    m_coeffs = std::move(other.coeffs());
+    return *this;
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
+
   template<typename Derived1, typename Derived2>
-  static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+  EIGEN_DEVICE_FUNC static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  inline Coefficients& coeffs() { return m_coeffs;}
-  inline const Coefficients& coeffs() const { return m_coeffs;}
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs;}
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs;}
 
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(IsAligned)
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment))
+  
+#ifdef EIGEN_QUATERNION_PLUGIN
+# include EIGEN_QUATERNION_PLUGIN
+#endif
 
 protected:
   Coefficients m_coeffs;
@@ -342,7 +409,7 @@
 
     typedef _Scalar Scalar;
     typedef typename internal::traits<Map>::Coefficients Coefficients;
-    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
     using Base::operator*=;
 
     /** Constructs a Mapped Quaternion object from the pointer \a coeffs
@@ -351,9 +418,9 @@
       * \code *coeffs == {x, y, z, w} \endcode
       *
       * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
+    EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
 
-    inline const Coefficients& coeffs() const { return m_coeffs;}
+    EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs;}
 
   protected:
     const Coefficients m_coeffs;
@@ -379,7 +446,7 @@
 
     typedef _Scalar Scalar;
     typedef typename internal::traits<Map>::Coefficients Coefficients;
-    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
     using Base::operator*=;
 
     /** Constructs a Mapped Quaternion object from the pointer \a coeffs
@@ -388,10 +455,10 @@
       * \code *coeffs == {x, y, z, w} \endcode
       *
       * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
+    EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
 
-    inline Coefficients& coeffs() { return m_coeffs; }
-    inline const Coefficients& coeffs() const { return m_coeffs; }
+    EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
+    EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
   protected:
     Coefficients m_coeffs;
@@ -417,9 +484,9 @@
 // Generic Quaternion * Quaternion product
 // This product can be specialized for a given architecture via the Arch template argument.
 namespace internal {
-template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options> struct quat_product
+template<int Arch, class Derived1, class Derived2, typename Scalar> struct quat_product
 {
-  static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
     return Quaternion<Scalar>
     (
       a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
@@ -434,20 +501,19 @@
 /** \returns the concatenation of two rotations as a quaternion-quaternion product */
 template <class Derived>
 template <class OtherDerived>
-EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
   return internal::quat_product<Architecture::Target, Derived, OtherDerived,
-                         typename internal::traits<Derived>::Scalar,
-                         internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned>::run(*this, other);
+                         typename internal::traits<Derived>::Scalar>::run(*this, other);
 }
 
 /** \sa operator*(Quaternion) */
 template <class Derived>
 template <class OtherDerived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
 {
   derived() = derived() * other.derived();
   return derived();
@@ -461,8 +527,8 @@
   *   - Via a Matrix3: 24 + 15n
   */
 template <class Derived>
-EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
-QuaternionBase<Derived>::_transformVector(Vector3 v) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
+QuaternionBase<Derived>::_transformVector(const Vector3& v) const
 {
     // Note that this algorithm comes from the optimization by hand
     // of the conversion to a Matrix followed by a Matrix/Vector product.
@@ -475,7 +541,7 @@
 }
 
 template<class Derived>
-EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<Derived>& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<Derived>& other)
 {
   coeffs() = other.coeffs();
   return derived();
@@ -483,7 +549,7 @@
 
 template<class Derived>
 template<class OtherDerived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
 {
   coeffs() = other.coeffs();
   return derived();
@@ -492,10 +558,10 @@
 /** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
   */
 template<class Derived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
 {
-  using std::cos;
-  using std::sin;
+  EIGEN_USING_STD(cos)
+  EIGEN_USING_STD(sin)
   Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
   this->w() = cos(ha);
   this->vec() = sin(ha) * aa.axis();
@@ -510,7 +576,7 @@
 
 template<class Derived>
 template<class MatrixDerived>
-inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
+EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
 {
   EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename MatrixDerived::Scalar>::value),
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
@@ -522,7 +588,7 @@
   * be normalized, otherwise the result is undefined.
   */
 template<class Derived>
-inline typename QuaternionBase<Derived>::Matrix3
+EIGEN_DEVICE_FUNC inline typename QuaternionBase<Derived>::Matrix3
 QuaternionBase<Derived>::toRotationMatrix(void) const
 {
   // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
@@ -569,9 +635,9 @@
   */
 template<class Derived>
 template<typename Derived1, typename Derived2>
-inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
 {
-  using std::sqrt;
+  EIGEN_USING_STD(sqrt)
   Vector3 v0 = a.normalized();
   Vector3 v1 = b.normalized();
   Scalar c = v1.dot(v0);
@@ -605,6 +671,24 @@
   return derived();
 }
 
+/** \returns a random unit quaternion following a uniform distribution law on SO(3)
+  *
+  * \note The implementation is based on http://planning.cs.uiuc.edu/node198.html
+  */
+template<typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRandom()
+{
+  EIGEN_USING_STD(sqrt)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
+  const Scalar u1 = internal::random<Scalar>(0, 1),
+               u2 = internal::random<Scalar>(0, 2*EIGEN_PI),
+               u3 = internal::random<Scalar>(0, 2*EIGEN_PI);
+  const Scalar a = sqrt(Scalar(1) - u1),
+               b = sqrt(u1);
+  return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
+}
+
 
 /** Returns a quaternion representing a rotation between
   * the two arbitrary vectors \a a and \a b. In other words, the built
@@ -618,7 +702,7 @@
   */
 template<typename Scalar, int Options>
 template<typename Derived1, typename Derived2>
-Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
 {
     Quaternion quat;
     quat.setFromTwoVectors(a, b);
@@ -633,11 +717,11 @@
   * \sa QuaternionBase::conjugate()
   */
 template <class Derived>
-inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse() const
+EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse() const
 {
   // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
   Scalar n2 = this->squaredNorm();
-  if (n2 > 0)
+  if (n2 > Scalar(0))
     return Quaternion<Scalar>(conjugate().coeffs() / n2);
   else
   {
@@ -646,6 +730,16 @@
   }
 }
 
+// Generic conjugate of a Quaternion
+namespace internal {
+template<int Arch, class Derived, typename Scalar> struct quat_conj
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q){
+    return Quaternion<Scalar>(q.w(),-q.x(),-q.y(),-q.z());
+  }
+};
+}
+                         
 /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
   * if the quaternion is normalized.
   * The conjugate of a quaternion represents the opposite rotation.
@@ -653,10 +747,12 @@
   * \sa Quaternion2::inverse()
   */
 template <class Derived>
-inline Quaternion<typename internal::traits<Derived>::Scalar>
+EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::conjugate() const
 {
-  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+  return internal::quat_conj<Architecture::Target, Derived,
+                         typename internal::traits<Derived>::Scalar>::run(*this);
+                         
 }
 
 /** \returns the angle (in radian) between two rotations
@@ -664,15 +760,12 @@
   */
 template <class Derived>
 template <class OtherDerived>
-inline typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar
 QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
 {
-  using std::acos;
-  using std::abs;
-  Scalar d = abs(this->dot(other));
-  if (d>=Scalar(1))
-    return Scalar(0);
-  return Scalar(2) * acos(d);
+  EIGEN_USING_STD(atan2)
+  Quaternion<Scalar> d = (*this) * other.conjugate();
+  return Scalar(2) * atan2( d.vec().norm(), numext::abs(d.w()) );
 }
 
  
@@ -685,15 +778,14 @@
   */
 template <class Derived>
 template <class OtherDerived>
-Quaternion<typename internal::traits<Derived>::Scalar>
+EIGEN_DEVICE_FUNC Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const
 {
-  using std::acos;
-  using std::sin;
-  using std::abs;
-  static const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
+  EIGEN_USING_STD(acos)
+  EIGEN_USING_STD(sin)
+  const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
   Scalar d = this->dot(other);
-  Scalar absD = abs(d);
+  Scalar absD = numext::abs(d);
 
   Scalar scale0;
   Scalar scale1;
@@ -712,7 +804,7 @@
     scale0 = sin( ( Scalar(1) - t ) * theta) / sinTheta;
     scale1 = sin( ( t * theta) ) / sinTheta;
   }
-  if(d<0) scale1 = -scale1;
+  if(d<Scalar(0)) scale1 = -scale1;
 
   return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
 }
@@ -724,10 +816,10 @@
 struct quaternionbase_assign_impl<Other,3,3>
 {
   typedef typename Other::Scalar Scalar;
-  typedef DenseIndex Index;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& mat)
+  template<class Derived> EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& a_mat)
   {
-    using std::sqrt;
+    const typename internal::nested_eval<Other,2>::type mat(a_mat);
+    EIGEN_USING_STD(sqrt)
     // This algorithm comes from  "Quaternion Calculus and Fast Animation",
     // Ken Shoemake, 1987 SIGGRAPH course notes
     Scalar t = mat.trace();
@@ -742,13 +834,13 @@
     }
     else
     {
-      DenseIndex i = 0;
+      Index i = 0;
       if (mat.coeff(1,1) > mat.coeff(0,0))
         i = 1;
       if (mat.coeff(2,2) > mat.coeff(i,i))
         i = 2;
-      DenseIndex j = (i+1)%3;
-      DenseIndex k = (j+1)%3;
+      Index j = (i+1)%3;
+      Index k = (j+1)%3;
 
       t = sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
       q.coeffs().coeffRef(i) = Scalar(0.5) * t;
@@ -765,7 +857,7 @@
 struct quaternionbase_assign_impl<Other,4,1>
 {
   typedef typename Other::Scalar Scalar;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& vec)
+  template<class Derived> EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& vec)
   {
     q.coeffs() = vec;
   }

diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h
index 1cac343..d0bd575 100644
--- a/Eigen/src/Geometry/Rotation2D.h
+++ b/Eigen/src/Geometry/Rotation2D.h

@@ -18,7 +18,7 @@
   *
   * \brief Represents a rotation/orientation in a 2 dimensional space.
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
   *
   * This class is equivalent to a single scalar representing a counter clock wise rotation
   * as a single angle in radian. It provides some additional features such as the automatic
@@ -59,38 +59,79 @@
 public:
 
   /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(const Scalar& a) : m_angle(a) {}
+  EIGEN_DEVICE_FUNC explicit inline Rotation2D(const Scalar& a) : m_angle(a) {}
+  
+  /** Default constructor wihtout initialization. The represented rotation is undefined. */
+  EIGEN_DEVICE_FUNC Rotation2D() {}
+
+  /** Construct a 2D rotation from a 2x2 rotation matrix \a mat.
+    *
+    * \sa fromRotationMatrix()
+    */
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC explicit Rotation2D(const MatrixBase<Derived>& m)
+  {
+    fromRotationMatrix(m.derived());
+  }
 
   /** \returns the rotation angle */
-  inline Scalar angle() const { return m_angle; }
+  EIGEN_DEVICE_FUNC inline Scalar angle() const { return m_angle; }
 
   /** \returns a read-write reference to the rotation angle */
-  inline Scalar& angle() { return m_angle; }
+  EIGEN_DEVICE_FUNC inline Scalar& angle() { return m_angle; }
+  
+  /** \returns the rotation angle in [0,2pi] */
+  EIGEN_DEVICE_FUNC inline Scalar smallestPositiveAngle() const {
+    Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+    return tmp<Scalar(0) ? tmp + Scalar(2*EIGEN_PI) : tmp;
+  }
+  
+  /** \returns the rotation angle in [-pi,pi] */
+  EIGEN_DEVICE_FUNC inline Scalar smallestAngle() const {
+    Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+    if(tmp>Scalar(EIGEN_PI))       tmp -= Scalar(2*EIGEN_PI);
+    else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2*EIGEN_PI);
+    return tmp;
+  }
 
   /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
+  EIGEN_DEVICE_FUNC inline Rotation2D inverse() const { return Rotation2D(-m_angle); }
 
   /** Concatenates two rotations */
-  inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
+  EIGEN_DEVICE_FUNC inline Rotation2D operator*(const Rotation2D& other) const
+  { return Rotation2D(m_angle + other.m_angle); }
 
   /** Concatenates two rotations */
-  inline Rotation2D& operator*=(const Rotation2D& other)
+  EIGEN_DEVICE_FUNC inline Rotation2D& operator*=(const Rotation2D& other)
   { m_angle += other.m_angle; return *this; }
 
   /** Applies the rotation to a 2D vector */
-  Vector2 operator* (const Vector2& vec) const
+  EIGEN_DEVICE_FUNC Vector2 operator* (const Vector2& vec) const
   { return toRotationMatrix() * vec; }
-
+  
   template<typename Derived>
-  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix(void) const;
+  EIGEN_DEVICE_FUNC Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC Matrix2 toRotationMatrix() const;
+
+  /** Set \c *this from a 2x2 rotation matrix \a mat.
+    * In other words, this function extract the rotation angle from the rotation matrix.
+    *
+    * This method is an alias for fromRotationMatrix()
+    *
+    * \sa fromRotationMatrix()
+    */
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC Rotation2D& operator=(const  MatrixBase<Derived>& m)
+  { return fromRotationMatrix(m.derived()); }
 
   /** \returns the spherical interpolation between \c *this and \a other using
     * parameter \a t. It is in fact equivalent to a linear interpolation.
     */
-  inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
+  EIGEN_DEVICE_FUNC inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
+  {
+    Scalar dist = Rotation2D(other.m_angle-m_angle).smallestAngle();
+    return Rotation2D(m_angle + dist*t);
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -98,24 +139,25 @@
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
   { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
+  EIGEN_DEVICE_FUNC inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
   {
     m_angle = Scalar(other.angle());
   }
 
-  static inline Rotation2D Identity() { return Rotation2D(0); }
+  EIGEN_DEVICE_FUNC static inline Rotation2D Identity() { return Rotation2D(0); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return internal::isApprox(m_angle,other.m_angle, prec); }
+  
 };
 
 /** \ingroup Geometry_Module
@@ -131,9 +173,9 @@
   */
 template<typename Scalar>
 template<typename Derived>
-Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
+EIGEN_DEVICE_FUNC Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
 {
-  using std::atan2;
+  EIGEN_USING_STD(atan2)
   EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
   m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0));
   return *this;
@@ -143,10 +185,10 @@
   */
 template<typename Scalar>
 typename Rotation2D<Scalar>::Matrix2
-Rotation2D<Scalar>::toRotationMatrix(void) const
+EIGEN_DEVICE_FUNC Rotation2D<Scalar>::toRotationMatrix(void) const
 {
-  using std::sin;
-  using std::cos;
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   Scalar sinA = sin(m_angle);
   Scalar cosA = cos(m_angle);
   return (Matrix2() << cosA, -sinA, sinA, cosA).finished();

diff --git a/Eigen/src/Geometry/RotationBase.h b/Eigen/src/Geometry/RotationBase.h
index 4ef0288..f0ee0bd 100644
--- a/Eigen/src/Geometry/RotationBase.h
+++ b/Eigen/src/Geometry/RotationBase.h

@@ -22,8 +22,8 @@
   *
   * \brief Common base class for compact rotation representations
   *
-  * \param Derived is the derived type, i.e., a rotation type
-  * \param _Dim the dimension of the space
+  * \tparam Derived is the derived type, i.e., a rotation type
+  * \tparam _Dim the dimension of the space
   */
 template<typename Derived, int _Dim>
 class RotationBase
@@ -38,26 +38,26 @@
     typedef Matrix<Scalar,Dim,1> VectorType;
 
   public:
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
 
     /** \returns an equivalent rotation matrix */
-    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
+    EIGEN_DEVICE_FUNC inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
 
     /** \returns an equivalent rotation matrix 
       * This function is added to be conform with the Transform class' naming scheme.
       */
-    inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
+    EIGEN_DEVICE_FUNC inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
 
     /** \returns the inverse rotation */
-    inline Derived inverse() const { return derived().inverse(); }
+    EIGEN_DEVICE_FUNC inline Derived inverse() const { return derived().inverse(); }
 
     /** \returns the concatenation of the rotation \c *this with a translation \a t */
-    inline Transform<Scalar,Dim,Isometry> operator*(const Translation<Scalar,Dim>& t) const
+    EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Isometry> operator*(const Translation<Scalar,Dim>& t) const
     { return Transform<Scalar,Dim,Isometry>(*this) * t; }
 
     /** \returns the concatenation of the rotation \c *this with a uniform scaling \a s */
-    inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
+    EIGEN_DEVICE_FUNC inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
     { return toRotationMatrix() * s.factor(); }
 
     /** \returns the concatenation of the rotation \c *this with a generic expression \a e
@@ -67,17 +67,17 @@
       *  - a vector of size Dim
       */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE typename internal::rotation_base_generic_product_selector<Derived,OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::rotation_base_generic_product_selector<Derived,OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
     operator*(const EigenBase<OtherDerived>& e) const
     { return internal::rotation_base_generic_product_selector<Derived,OtherDerived>::run(derived(), e.derived()); }
 
     /** \returns the concatenation of a linear transformation \a l with the rotation \a r */
     template<typename OtherDerived> friend
-    inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r)
+    EIGEN_DEVICE_FUNC inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r)
     { return l.derived() * r.toRotationMatrix(); }
 
     /** \returns the concatenation of a scaling \a l with the rotation \a r */
-    friend inline Transform<Scalar,Dim,Affine> operator*(const DiagonalMatrix<Scalar,Dim>& l, const Derived& r)
+    EIGEN_DEVICE_FUNC friend inline Transform<Scalar,Dim,Affine> operator*(const DiagonalMatrix<Scalar,Dim>& l, const Derived& r)
     { 
       Transform<Scalar,Dim,Affine> res(r);
       res.linear().applyOnTheLeft(l);
@@ -86,11 +86,11 @@
 
     /** \returns the concatenation of the rotation \c *this with a transformation \a t */
     template<int Mode, int Options>
-    inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode,Options>& t) const
+    EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode,Options>& t) const
     { return toRotationMatrix() * t; }
 
     template<typename OtherVectorType>
-    inline VectorType _transformVector(const OtherVectorType& v) const
+    EIGEN_DEVICE_FUNC inline VectorType _transformVector(const OtherVectorType& v) const
     { return toRotationMatrix() * v; }
 };
 
@@ -102,7 +102,7 @@
 {
   enum { Dim = RotationDerived::Dim };
   typedef Matrix<typename RotationDerived::Scalar,Dim,Dim> ReturnType;
-  static inline ReturnType run(const RotationDerived& r, const MatrixType& m)
+  EIGEN_DEVICE_FUNC static inline ReturnType run(const RotationDerived& r, const MatrixType& m)
   { return r.toRotationMatrix() * m; }
 };
 
@@ -110,7 +110,7 @@
 struct rotation_base_generic_product_selector< RotationDerived, DiagonalMatrix<Scalar,Dim,MaxDim>, false >
 {
   typedef Transform<Scalar,Dim,Affine> ReturnType;
-  static inline ReturnType run(const RotationDerived& r, const DiagonalMatrix<Scalar,Dim,MaxDim>& m)
+  EIGEN_DEVICE_FUNC static inline ReturnType run(const RotationDerived& r, const DiagonalMatrix<Scalar,Dim,MaxDim>& m)
   {
     ReturnType res(r);
     res.linear() *= m;
@@ -123,7 +123,7 @@
 {
   enum { Dim = RotationDerived::Dim };
   typedef Matrix<typename RotationDerived::Scalar,Dim,1> ReturnType;
-  static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v)
   {
     return r._transformVector(v);
   }
@@ -137,8 +137,7 @@
   */
 template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
+EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
 ::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
 {
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
@@ -151,8 +150,7 @@
   */
 template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
+EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
 Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
 ::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
 {
@@ -166,8 +164,8 @@
   *
   * Helper function to return an arbitrary rotation object to a rotation matrix.
   *
-  * \param Scalar the numeric type of the matrix coefficients
-  * \param Dim the dimension of the current space
+  * \tparam Scalar the numeric type of the matrix coefficients
+  * \tparam Dim the dimension of the current space
   *
   * It returns a Dim x Dim fixed size matrix.
   *
@@ -181,20 +179,20 @@
   * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
   */
 template<typename Scalar, int Dim>
-static inline Matrix<Scalar,2,2> toRotationMatrix(const Scalar& s)
+EIGEN_DEVICE_FUNC static inline Matrix<Scalar,2,2> toRotationMatrix(const Scalar& s)
 {
   EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
   return Rotation2D<Scalar>(s).toRotationMatrix();
 }
 
 template<typename Scalar, int Dim, typename OtherDerived>
-static inline Matrix<Scalar,Dim,Dim> toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
+EIGEN_DEVICE_FUNC static inline Matrix<Scalar,Dim,Dim> toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
 {
   return r.toRotationMatrix();
 }
 
 template<typename Scalar, int Dim, typename OtherDerived>
-static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat)
+EIGEN_DEVICE_FUNC static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat)
 {
   EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
     YOU_MADE_A_PROGRAMMING_MISTAKE)

diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
index 023fba2..d352f1f 100644
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h

@@ -14,11 +14,11 @@
 
 /** \geometry_module \ingroup Geometry_Module
   *
-  * \class Scaling
+  * \class UniformScaling
   *
   * \brief Represents a generic uniform scaling transformation
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients.
   *
   * This class represent a uniform scaling transformation. It is the return
   * type of Scaling(Scalar), and most of the time this is the only way it
@@ -29,6 +29,22 @@
   *
   * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
   */
+
+namespace internal
+{
+  // This helper helps nvcc+MSVC to properly parse this file.
+  // See bug 1412.
+  template <typename Scalar, int Dim, int Mode>
+  struct uniformscaling_times_affine_returntype
+  {
+    enum
+    {
+      NewMode = int(Mode) == int(Isometry) ? Affine : Mode
+    };
+    typedef Transform <Scalar, Dim, NewMode> type;
+  };
+}
+
 template<typename _Scalar>
 class UniformScaling
 {
@@ -60,9 +76,11 @@
 
   /** Concatenates a uniform scaling and an affine transformation */
   template<int Dim, int Mode, int Options>
-  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
+  inline typename
+	internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type
+	operator* (const Transform<Scalar, Dim, Mode, Options>& t) const
   {
-    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
+    typename internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type res = t;
     res.prescale(factor());
     return res;
   }
@@ -70,7 +88,7 @@
   /** Concatenates a uniform scaling and a linear transformation matrix */
   // TODO returns an expression
   template<typename Derived>
-  inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
+  inline typename Eigen::internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
   { return other * m_factor; }
 
   template<typename Derived,int Dim>
@@ -104,40 +122,44 @@
 
 };
 
-/** Concatenates a linear transformation matrix and a uniform scaling */
-// NOTE this operator is defiend in MatrixBase and not as a friend function
+/** \addtogroup Geometry_Module */
+//@{
+
+/** Concatenates a linear transformation matrix and a uniform scaling
+  * \relates UniformScaling
+  */
+// NOTE this operator is defined in MatrixBase and not as a friend function
 // of UniformScaling to fix an internal crash of Intel's ICC
-template<typename Derived> typename MatrixBase<Derived>::ScalarMultipleReturnType
-MatrixBase<Derived>::operator*(const UniformScaling<Scalar>& s) const
-{ return derived() * s.factor(); }
+template<typename Derived,typename Scalar>
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
+operator*(const MatrixBase<Derived>& matrix, const UniformScaling<Scalar>& s)
+{ return matrix.derived() * s.factor(); }
 
 /** Constructs a uniform scaling from scale factor \a s */
-static inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
+inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
 /** Constructs a uniform scaling from scale factor \a s */
-static inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
+inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
 /** Constructs a uniform scaling from scale factor \a s */
 template<typename RealScalar>
-static inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
+inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
 { return UniformScaling<std::complex<RealScalar> >(s); }
 
 /** Constructs a 2D axis aligned scaling */
 template<typename Scalar>
-static inline DiagonalMatrix<Scalar,2> Scaling(const Scalar& sx, const Scalar& sy)
+inline DiagonalMatrix<Scalar,2> Scaling(const Scalar& sx, const Scalar& sy)
 { return DiagonalMatrix<Scalar,2>(sx, sy); }
 /** Constructs a 3D axis aligned scaling */
 template<typename Scalar>
-static inline DiagonalMatrix<Scalar,3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+inline DiagonalMatrix<Scalar,3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
 { return DiagonalMatrix<Scalar,3>(sx, sy, sz); }
 
 /** Constructs an axis aligned scaling expression from vector expression \a coeffs
   * This is an alias for coeffs.asDiagonal()
   */
 template<typename Derived>
-static inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs)
+inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs)
 { return coeffs.asDiagonal(); }
 
-/** \addtogroup Geometry_Module */
-//@{
 /** \deprecated */
 typedef DiagonalMatrix<float, 2> AlignedScaling2f;
 /** \deprecated */

diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index b44c032..52b8c2a 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h

@@ -12,7 +12,7 @@
 #ifndef EIGEN_TRANSFORM_H
 #define EIGEN_TRANSFORM_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -32,7 +32,8 @@
           typename MatrixType,
           int Case = transform_traits<TransformType>::IsProjective ? 0
                    : int(MatrixType::RowsAtCompileTime) == int(transform_traits<TransformType>::HDim) ? 1
-                   : 2>
+                   : 2,
+          int RhsCols = MatrixType::ColsAtCompileTime>
 struct transform_right_product_impl;
 
 template< typename Other,
@@ -46,7 +47,7 @@
 
 template< typename Lhs,
           typename Rhs,
-          bool AnyProjective = 
+          bool AnyProjective =
             transform_traits<Lhs>::IsProjective ||
             transform_traits<Rhs>::IsProjective>
 struct transform_transform_product_impl;
@@ -62,6 +63,24 @@
 
 template<typename TransformType> struct transform_take_affine_part;
 
+template<typename _Scalar, int _Dim, int _Mode, int _Options>
+struct traits<Transform<_Scalar,_Dim,_Mode,_Options> >
+{
+  typedef _Scalar Scalar;
+  typedef Eigen::Index StorageIndex;
+  typedef Dense StorageKind;
+  enum {
+    Dim1 = _Dim==Dynamic ? _Dim : _Dim + 1,
+    RowsAtCompileTime = _Mode==Projective ? Dim1 : _Dim,
+    ColsAtCompileTime = Dim1,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = 0
+  };
+};
+
+template<int Mode> struct transform_make_affine;
+
 } // end namespace internal
 
 /** \geometry_module \ingroup Geometry_Module
@@ -78,6 +97,9 @@
   *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
   *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
   *                             without any assumption.
+  *              - #Isometry: same as #Affine with the additional assumption that
+  *                           the linear part represents a rotation. This assumption is exploited
+  *                           to speed up some functions such as inverse() and rotation().
   * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
   *                  These Options are passed directly to the underlying matrix type.
   *
@@ -96,19 +118,19 @@
   * \end{array} \right) \f$
   *
   * Note that for a projective transformation the last row can be anything,
-  * and then the interpretation of different parts might be sightly different.
+  * and then the interpretation of different parts might be slightly different.
   *
   * However, unlike a plain matrix, the Transform class provides many features
   * simplifying both its assembly and usage. In particular, it can be composed
-  * with any other transformations (Transform,Translation,RotationBase,Matrix)
+  * with any other transformations (Transform,Translation,RotationBase,DiagonalMatrix)
   * and can be directly used to transform implicit homogeneous vectors. All these
   * operations are handled via the operator*. For the composition of transformations,
   * its principle consists to first convert the right/left hand sides of the product
   * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product.
   * Of course, internally, operator* tries to perform the minimal number of operations
   * according to the nature of each terms. Likewise, when applying the transform
-  * to non homogeneous vectors, the latters are automatically promoted to homogeneous
-  * one before doing the matrix product. The convertions to homogeneous representations
+  * to points, the latters are automatically promoted to homogeneous vectors
+  * before doing the matrix product. The conventions to homogeneous representations
   * are performed as follow:
   *
   * \b Translation t (Dim)x(1):
@@ -122,7 +144,7 @@
   * R & 0\\
   * 0\,...\,0 & 1
   * \end{array} \right) \f$
-  *
+  *<!--
   * \b Linear \b Matrix L (Dim)x(Dim):
   * \f$ \left( \begin{array}{cc}
   * L & 0\\
@@ -134,14 +156,20 @@
   * A\\
   * 0\,...\,0\,1
   * \end{array} \right) \f$
+  *-->
+  * \b Scaling \b DiagonalMatrix S (Dim)x(Dim):
+  * \f$ \left( \begin{array}{cc}
+  * S & 0\\
+  * 0\,...\,0 & 1
+  * \end{array} \right) \f$
   *
-  * \b Column \b vector v (Dim)x(1):
+  * \b Column \b point v (Dim)x(1):
   * \f$ \left( \begin{array}{c}
   * v\\
   * 1
   * \end{array} \right) \f$
   *
-  * \b Set \b of \b column \b vectors V1...Vn (Dim)x(n):
+  * \b Set \b of \b column \b points V1...Vn (Dim)x(n):
   * \f$ \left( \begin{array}{ccc}
   * v_1 & ... & v_n\\
   * 1 & ... & 1
@@ -168,7 +196,7 @@
   * preprocessor token EIGEN_QT_SUPPORT is defined.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
   *
   * \sa class Matrix, class Quaternion
   */
@@ -186,7 +214,8 @@
   };
   /** the scalar type of the coefficients */
   typedef _Scalar Scalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   /** type of the matrix used to represent the transformation */
   typedef typename internal::make_proper_matrix_type<Scalar,Rows,HDim,Options>::type MatrixType;
   /** constified MatrixType */
@@ -194,9 +223,9 @@
   /** type of the matrix used to represent the linear part of the transformation */
   typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
   /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
+  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (int(Options)&RowMajor)==0> LinearPart;
   /** type of read reference to the linear part of the transformation */
-  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
+  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (int(Options)&RowMajor)==0> ConstLinearPart;
   /** type of read/write reference to the affine part of the transformation */
   typedef typename internal::conditional<int(Mode)==int(AffineCompact),
                               MatrixType&,
@@ -213,7 +242,7 @@
   typedef const Block<ConstMatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> ConstTranslationPart;
   /** corresponding translation type */
   typedef Translation<Scalar,Dim> TranslationType;
-  
+
   // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0
   enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) };
   /** The return type of the product between a diagonal matrix and a transform */
@@ -226,45 +255,35 @@
 public:
 
   /** Default constructor without initialization of the meaningful coefficients.
-    * If Mode==Affine, then the last row is set to [0 ... 0 1] */
-  inline Transform()
+    * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */
+  EIGEN_DEVICE_FUNC inline Transform()
   {
     check_template_params();
-    if (int(Mode)==Affine)
-      makeAffine();
+    internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix);
   }
 
-  inline Transform(const Transform& other)
-  {
-    check_template_params();
-    m_matrix = other.m_matrix;
-  }
-
-  inline explicit Transform(const TranslationType& t)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const TranslationType& t)
   {
     check_template_params();
     *this = t;
   }
-  inline explicit Transform(const UniformScaling<Scalar>& s)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const UniformScaling<Scalar>& s)
   {
     check_template_params();
     *this = s;
   }
   template<typename Derived>
-  inline explicit Transform(const RotationBase<Derived, Dim>& r)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const RotationBase<Derived, Dim>& r)
   {
     check_template_params();
     *this = r;
   }
 
-  inline Transform& operator=(const Transform& other)
-  { m_matrix = other.m_matrix; return *this; }
-
   typedef internal::transform_take_affine_part<Transform> take_affine_part;
 
   /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
   template<typename OtherDerived>
-  inline explicit Transform(const EigenBase<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const EigenBase<OtherDerived>& other)
   {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
       YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
@@ -275,7 +294,7 @@
 
   /** Set \c *this from a Dim^2 or (Dim+1)^2 matrix. */
   template<typename OtherDerived>
-  inline Transform& operator=(const EigenBase<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const EigenBase<OtherDerived>& other)
   {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
       YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
@@ -283,9 +302,9 @@
     internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
     return *this;
   }
-  
+
   template<int OtherOptions>
-  inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
   {
     check_template_params();
     // only the options change, we can directly copy the matrices
@@ -293,7 +312,7 @@
   }
 
   template<int OtherMode,int OtherOptions>
-  inline Transform(const Transform<Scalar,Dim,OtherMode,OtherOptions>& other)
+  EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar,Dim,OtherMode,OtherOptions>& other)
   {
     check_template_params();
     // prevent conversions as:
@@ -310,7 +329,7 @@
            OtherModeIsAffineCompact = OtherMode == int(AffineCompact)
     };
 
-    if(ModeIsAffineCompact == OtherModeIsAffineCompact)
+    if(EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact))
     {
       // We need the block expression because the code is compiled for all
       // combinations of transformations and will trigger a compile time error
@@ -318,7 +337,7 @@
       m_matrix.template block<Dim,Dim+1>(0,0) = other.matrix().template block<Dim,Dim+1>(0,0);
       makeAffine();
     }
-    else if(OtherModeIsAffineCompact)
+    else if(EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact))
     {
       typedef typename Transform<Scalar,Dim,OtherMode,OtherOptions>::MatrixType OtherMatrixType;
       internal::transform_construct_from_matrix<OtherMatrixType,Mode,Options,Dim,HDim>::run(this, other.matrix());
@@ -334,14 +353,14 @@
   }
 
   template<typename OtherDerived>
-  Transform(const ReturnByValue<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC Transform(const ReturnByValue<OtherDerived>& other)
   {
     check_template_params();
     other.evalTo(*this);
   }
 
   template<typename OtherDerived>
-  Transform& operator=(const ReturnByValue<OtherDerived>& other)
+  EIGEN_DEVICE_FUNC Transform& operator=(const ReturnByValue<OtherDerived>& other)
   {
     other.evalTo(*this);
     return *this;
@@ -356,104 +375,120 @@
   inline QTransform toQTransform(void) const;
   #endif
 
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) const */
-  inline Scalar operator() (Index row, Index col) const { return m_matrix(row,col); }
+  EIGEN_DEVICE_FUNC inline Scalar operator() (Index row, Index col) const { return m_matrix(row,col); }
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) */
-  inline Scalar& operator() (Index row, Index col) { return m_matrix(row,col); }
+  EIGEN_DEVICE_FUNC inline Scalar& operator() (Index row, Index col) { return m_matrix(row,col); }
 
   /** \returns a read-only expression of the transformation matrix */
-  inline const MatrixType& matrix() const { return m_matrix; }
+  EIGEN_DEVICE_FUNC inline const MatrixType& matrix() const { return m_matrix; }
   /** \returns a writable expression of the transformation matrix */
-  inline MatrixType& matrix() { return m_matrix; }
+  EIGEN_DEVICE_FUNC inline MatrixType& matrix() { return m_matrix; }
 
   /** \returns a read-only expression of the linear part of the transformation */
-  inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix,0,0); }
+  EIGEN_DEVICE_FUNC inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix,0,0); }
   /** \returns a writable expression of the linear part of the transformation */
-  inline LinearPart linear() { return LinearPart(m_matrix,0,0); }
+  EIGEN_DEVICE_FUNC inline LinearPart linear() { return LinearPart(m_matrix,0,0); }
 
   /** \returns a read-only expression of the Dim x HDim affine part of the transformation */
-  inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
+  EIGEN_DEVICE_FUNC inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
   /** \returns a writable expression of the Dim x HDim affine part of the transformation */
-  inline AffinePart affine() { return take_affine_part::run(m_matrix); }
+  EIGEN_DEVICE_FUNC inline AffinePart affine() { return take_affine_part::run(m_matrix); }
 
   /** \returns a read-only expression of the translation vector of the transformation */
-  inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix,0,Dim); }
+  EIGEN_DEVICE_FUNC inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix,0,Dim); }
   /** \returns a writable expression of the translation vector of the transformation */
-  inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
+  EIGEN_DEVICE_FUNC inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
 
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
+  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other.
     *
-    * The right hand side \a other might be either:
-    * \li a vector of size Dim,
+    * The right-hand-side \a other can be either:
     * \li an homogeneous vector of size Dim+1,
-    * \li a set of vectors of size Dim x Dynamic,
-    * \li a set of homogeneous vectors of size Dim+1 x Dynamic,
-    * \li a linear transformation matrix of size Dim x Dim,
-    * \li an affine transformation matrix of size Dim x Dim+1,
+    * \li a set of homogeneous vectors of size Dim+1 x N,
     * \li a transformation matrix of size Dim+1 x Dim+1.
+    *
+    * Moreover, if \c *this represents an affine transformation (i.e., Mode!=Projective), then \a other can also be:
+    * \li a point of size Dim (computes: \code this->linear() * other + this->translation()\endcode),
+    * \li a set of N points as a Dim x N matrix (computes: \code (this->linear() * other).colwise() + this->translation()\endcode),
+    *
+    * In all cases, the return type is a matrix or vector of same sizes as the right-hand-side \a other.
+    *
+    * If you want to interpret \a other as a linear or affine transformation, then first convert it to a Transform<> type,
+    * or do your own cooking.
+    *
+    * Finally, if you want to apply Affine transformations to vectors, then explicitly apply the linear part only:
+    * \code
+    * Affine3f A;
+    * Vector3f v1, v2;
+    * v2 = A.linear() * v1;
+    * \endcode
+    *
     */
   // note: this function is defined here because some compilers cannot find the respective declaration
   template<typename OtherDerived>
-  EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
   operator * (const EigenBase<OtherDerived> &other) const
   { return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this,other.derived()); }
 
   /** \returns the product expression of a transformation matrix \a a times a transform \a b
     *
-    * The left hand side \a other might be either:
+    * The left hand side \a other can be either:
     * \li a linear transformation matrix of size Dim x Dim,
     * \li an affine transformation matrix of size Dim x Dim+1,
     * \li a general transformation matrix of size Dim+1 x Dim+1.
     */
   template<typename OtherDerived> friend
-  inline const typename internal::transform_left_product_impl<OtherDerived,Mode,Options,_Dim,_Dim+1>::ResultType
+  EIGEN_DEVICE_FUNC inline const typename internal::transform_left_product_impl<OtherDerived,Mode,Options,_Dim,_Dim+1>::ResultType
     operator * (const EigenBase<OtherDerived> &a, const Transform &b)
   { return internal::transform_left_product_impl<OtherDerived,Mode,Options,Dim,HDim>::run(a.derived(),b); }
 
   /** \returns The product expression of a transform \a a times a diagonal matrix \a b
     *
     * The rhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
-  inline const TransformTimeDiagonalReturnType
+  EIGEN_DEVICE_FUNC inline const TransformTimeDiagonalReturnType
     operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
 
   /** \returns The product expression of a diagonal matrix \a a times a transform \a b
     *
     * The lhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
-  friend inline TransformTimeDiagonalReturnType
+  EIGEN_DEVICE_FUNC friend inline TransformTimeDiagonalReturnType
     operator * (const DiagonalBase<DiagonalDerived> &a, const Transform &b)
   {
     TransformTimeDiagonalReturnType res;
     res.linear().noalias() = a*b.linear();
     res.translation().noalias() = a*b.translation();
-    if (Mode!=int(AffineCompact))
+    if (EIGEN_CONST_CONDITIONAL(Mode!=int(AffineCompact)))
       res.matrix().row(Dim) = b.matrix().row(Dim);
     return res;
   }
 
   template<typename OtherDerived>
-  inline Transform& operator*=(const EigenBase<OtherDerived>& other) { return *this = *this * other; }
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const EigenBase<OtherDerived>& other) { return *this = *this * other; }
 
   /** Concatenates two transformations */
-  inline const Transform operator * (const Transform& other) const
+  EIGEN_DEVICE_FUNC inline const Transform operator * (const Transform& other) const
   {
     return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
   }
-  
+
   #if EIGEN_COMP_ICC
 private:
   // this intermediate structure permits to workaround a bug in ICC 11:
@@ -462,13 +497,13 @@
   //  (the meaning of a name may have changed since the template declaration -- the type of the template is:
   // "Eigen::internal::transform_transform_product_impl<Eigen::Transform<double, 3, 32, 0>,
   //     Eigen::Transform<double, 3, Mode, Options>, <expression>>::ResultType (const Eigen::Transform<double, 3, Mode, Options> &) const")
-  // 
+  //
   template<int OtherMode,int OtherOptions> struct icc_11_workaround
   {
     typedef internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> > ProductType;
     typedef typename ProductType::ResultType ResultType;
   };
-  
+
 public:
   /** Concatenates two different transformations */
   template<int OtherMode,int OtherOptions>
@@ -481,7 +516,7 @@
   #else
   /** Concatenates two different transformations */
   template<int OtherMode,int OtherOptions>
-  inline typename internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::ResultType
+  EIGEN_DEVICE_FUNC inline typename internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::ResultType
     operator * (const Transform<Scalar,Dim,OtherMode,OtherOptions>& other) const
   {
     return internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::run(*this,other);
@@ -489,47 +524,61 @@
   #endif
 
   /** \sa MatrixBase::setIdentity() */
-  void setIdentity() { m_matrix.setIdentity(); }
+  EIGEN_DEVICE_FUNC void setIdentity() { m_matrix.setIdentity(); }
 
   /**
    * \brief Returns an identity transformation.
    * \todo In the future this function should be returning a Transform expression.
    */
-  static const Transform Identity()
+  EIGEN_DEVICE_FUNC static const Transform Identity()
   {
     return Transform(MatrixType::Identity());
   }
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& scale(const MatrixBase<OtherDerived> &other);
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& prescale(const MatrixBase<OtherDerived> &other);
 
-  inline Transform& scale(const Scalar& s);
-  inline Transform& prescale(const Scalar& s);
+  EIGEN_DEVICE_FUNC inline Transform& scale(const Scalar& s);
+  EIGEN_DEVICE_FUNC inline Transform& prescale(const Scalar& s);
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& translate(const MatrixBase<OtherDerived> &other);
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
 
   template<typename RotationType>
+  EIGEN_DEVICE_FUNC
   inline Transform& rotate(const RotationType& rotation);
 
   template<typename RotationType>
+  EIGEN_DEVICE_FUNC
   inline Transform& prerotate(const RotationType& rotation);
 
-  Transform& shear(const Scalar& sx, const Scalar& sy);
-  Transform& preshear(const Scalar& sx, const Scalar& sy);
+  EIGEN_DEVICE_FUNC Transform& shear(const Scalar& sx, const Scalar& sy);
+  EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy);
 
-  inline Transform& operator=(const TranslationType& t);
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t);
+
+  EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  inline Transform operator*(const TranslationType& t) const;
 
+  EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const;
+
+  EIGEN_DEVICE_FUNC
   inline Transform& operator=(const UniformScaling<Scalar>& t);
+
+  EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
+
+  EIGEN_DEVICE_FUNC
   inline TransformTimeDiagonalReturnType operator*(const UniformScaling<Scalar>& s) const
   {
     TransformTimeDiagonalReturnType res = *this;
@@ -537,31 +586,38 @@
     return res;
   }
 
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  EIGEN_DEVICE_FUNC
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
 
   template<typename Derived>
-  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const RotationBase<Derived,Dim>& r);
   template<typename Derived>
-  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
   template<typename Derived>
-  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
+  EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
 
-  const LinearMatrixType rotation() const;
+  typedef typename internal::conditional<int(Mode)==Isometry,ConstLinearPart,const LinearMatrixType>::type RotationReturnType;
+  EIGEN_DEVICE_FUNC RotationReturnType rotation() const;
+
   template<typename RotationMatrixType, typename ScalingMatrixType>
+  EIGEN_DEVICE_FUNC
   void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
   template<typename ScalingMatrixType, typename RotationMatrixType>
+  EIGEN_DEVICE_FUNC
   void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
 
   template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
+  EIGEN_DEVICE_FUNC
   Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
     const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
 
+  EIGEN_DEVICE_FUNC
   inline Transform inverse(TransformTraits traits = (TransformTraits)Mode) const;
 
   /** \returns a const pointer to the column major internal matrix */
-  const Scalar* data() const { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_matrix.data(); }
   /** \returns a non-const pointer to the column major internal matrix */
-  Scalar* data() { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() { return m_matrix.data(); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -569,12 +625,12 @@
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type cast() const
   { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit Transform(const Transform<OtherScalarType,Dim,Mode,Options>& other)
+  EIGEN_DEVICE_FUNC inline explicit Transform(const Transform<OtherScalarType,Dim,Mode,Options>& other)
   {
     check_template_params();
     m_matrix = other.matrix().template cast<Scalar>();
@@ -584,54 +640,50 @@
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_matrix.isApprox(other.m_matrix, prec); }
 
   /** Sets the last row to [0 ... 0 1]
     */
-  void makeAffine()
+  EIGEN_DEVICE_FUNC void makeAffine()
   {
-    if(int(Mode)!=int(AffineCompact))
-    {
-      matrix().template block<1,Dim>(Dim,0).setZero();
-      matrix().coeffRef(Dim,Dim) = Scalar(1);
-    }
+    internal::transform_make_affine<int(Mode)>::run(m_matrix);
   }
 
   /** \internal
     * \returns the Dim x Dim linear part if the transformation is affine,
     *          and the HDim x Dim part for projective transformations.
     */
-  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt()
+  EIGEN_DEVICE_FUNC inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt()
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
   /** \internal
     * \returns the Dim x Dim linear part if the transformation is affine,
     *          and the HDim x Dim part for projective transformations.
     */
-  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt() const
+  EIGEN_DEVICE_FUNC inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt() const
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
 
   /** \internal
     * \returns the translation part if the transformation is affine,
     *          and the last column for projective transformations.
     */
-  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt()
+  EIGEN_DEVICE_FUNC inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt()
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
   /** \internal
     * \returns the translation part if the transformation is affine,
     *          and the last column for projective transformations.
     */
-  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt() const
+  EIGEN_DEVICE_FUNC inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt() const
   { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
 
 
   #ifdef EIGEN_TRANSFORM_PLUGIN
   #include EIGEN_TRANSFORM_PLUGIN
   #endif
-  
+
 protected:
   #ifndef EIGEN_PARSED_BY_DOXYGEN
-    static EIGEN_STRONG_INLINE void check_template_params()
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params()
     {
       EIGEN_STATIC_ASSERT((Options & (DontAlign|RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
     }
@@ -699,7 +751,7 @@
 Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
     m_matrix << other.m11(), other.m21(), other.dx(),
                 other.m12(), other.m22(), other.dy();
   else
@@ -745,7 +797,7 @@
 {
   check_template_params();
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
     m_matrix << other.m11(), other.m21(), other.dx(),
                 other.m12(), other.m22(), other.dy();
   else
@@ -763,7 +815,7 @@
 QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
     return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
                       m_matrix.coeff(0,1), m_matrix.coeff(1,1),
                       m_matrix.coeff(0,2), m_matrix.coeff(1,2));
@@ -784,7 +836,7 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::scale(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
@@ -798,7 +850,7 @@
   * \sa prescale(Scalar)
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::scale(const Scalar& s)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::scale(const Scalar& s)
 {
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   linearExt() *= s;
@@ -811,12 +863,12 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }
 
@@ -825,7 +877,7 @@
   * \sa scale(Scalar)
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::prescale(const Scalar& s)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::prescale(const Scalar& s)
 {
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   m_matrix.template topRows<Dim>() *= s;
@@ -838,7 +890,7 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::translate(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
@@ -852,11 +904,11 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  if(int(Mode)==int(Projective))
+  if(EIGEN_CONST_CONDITIONAL(int(Mode)==int(Projective)))
     affine() += other * m_matrix.row(Dim);
   else
     translation() += other;
@@ -882,7 +934,7 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationType>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::rotate(const RotationType& rotation)
 {
   linearExt() *= internal::toRotationMatrix<Scalar,Dim>(rotation);
@@ -898,7 +950,7 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationType>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::prerotate(const RotationType& rotation)
 {
   m_matrix.template block<Dim,HDim>(0,0) = internal::toRotationMatrix<Scalar,Dim>(rotation)
@@ -912,7 +964,7 @@
   * \sa preshear()
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::shear(const Scalar& sx, const Scalar& sy)
 {
   EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -928,7 +980,7 @@
   * \sa shear()
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
 {
   EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -942,7 +994,7 @@
 ******************************************************/
 
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const TranslationType& t)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const TranslationType& t)
 {
   linear().setIdentity();
   translation() = t.vector();
@@ -951,7 +1003,7 @@
 }
 
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const TranslationType& t) const
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const TranslationType& t) const
 {
   Transform res = *this;
   res.translate(t.vector());
@@ -959,7 +1011,7 @@
 }
 
 template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const UniformScaling<Scalar>& s)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const UniformScaling<Scalar>& s)
 {
   m_matrix.setZero();
   linear().diagonal().fill(s.factor());
@@ -969,7 +1021,7 @@
 
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename Derived>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const RotationBase<Derived,Dim>& r)
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const RotationBase<Derived,Dim>& r)
 {
   linear() = internal::toRotationMatrix<Scalar,Dim>(r);
   translation().setZero();
@@ -979,7 +1031,7 @@
 
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename Derived>
-inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const RotationBase<Derived,Dim>& r) const
+EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const RotationBase<Derived,Dim>& r) const
 {
   Transform res = *this;
   res.rotate(r.derived());
@@ -990,20 +1042,43 @@
 *** Special functions ***
 ************************/
 
+namespace internal {
+template<int Mode> struct transform_rotation_impl {
+  template<typename TransformType>
+  EIGEN_DEVICE_FUNC static inline
+  const typename TransformType::LinearMatrixType run(const TransformType& t)
+  {
+    typedef typename TransformType::LinearMatrixType LinearMatrixType;
+    LinearMatrixType result;
+    t.computeRotationScaling(&result, (LinearMatrixType*)0);
+    return result;
+  }
+};
+template<> struct transform_rotation_impl<Isometry> {
+  template<typename TransformType>
+  EIGEN_DEVICE_FUNC static inline
+  typename TransformType::ConstLinearPart run(const TransformType& t)
+  {
+    return t.linear();
+  }
+};
+}
 /** \returns the rotation part of the transformation
   *
+  * If Mode==Isometry, then this method is an alias for linear(),
+  * otherwise it calls computeRotationScaling() to extract the rotation
+  * through a SVD decomposition.
   *
   * \svd_module
   *
   * \sa computeRotationScaling(), computeScalingRotation(), class SVD
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
+EIGEN_DEVICE_FUNC
+typename Transform<Scalar,Dim,Mode,Options>::RotationReturnType
 Transform<Scalar,Dim,Mode,Options>::rotation() const
 {
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
+  return internal::transform_rotation_impl<Mode>::run(*this);
 }
 
 
@@ -1020,23 +1095,24 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationMatrixType, typename ScalingMatrixType>
-void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
+EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
 {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint());
+  sv.coeffRef(Dim-1) *= x;
+  if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->lazyAssign(m * svd.matrixV().adjoint());
+    m.col(Dim-1) *= x;
+    *rotation = m * svd.matrixV().adjoint();
   }
 }
 
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+/** decomposes the linear part of the transformation as a product scaling x rotation, the scaling being
   * not necessarily positive.
   *
   * If either pointer is zero, the corresponding computation is skipped.
@@ -1049,19 +1125,20 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename ScalingMatrixType, typename RotationMatrixType>
-void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
+EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
 {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint());
+  sv.coeffRef(Dim-1) *= x;
+  if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->lazyAssign(m * svd.matrixV().adjoint());
+    m.col(Dim-1) *= x;
+    *rotation = m * svd.matrixV().adjoint();
   }
 }
 
@@ -1070,7 +1147,7 @@
   */
 template<typename Scalar, int Dim, int Mode, int Options>
 template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-Transform<Scalar,Dim,Mode,Options>&
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
   const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
 {
@@ -1083,18 +1160,36 @@
 
 namespace internal {
 
+template<int Mode>
+struct transform_make_affine
+{
+  template<typename MatrixType>
+  EIGEN_DEVICE_FUNC static void run(MatrixType &mat)
+  {
+    static const int Dim = MatrixType::ColsAtCompileTime-1;
+    mat.template block<1,Dim>(Dim,0).setZero();
+    mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1);
+  }
+};
+
+template<>
+struct transform_make_affine<AffineCompact>
+{
+  template<typename MatrixType> EIGEN_DEVICE_FUNC static void run(MatrixType &) { }
+};
+
 // selector needed to avoid taking the inverse of a 3x4 matrix
 template<typename TransformType, int Mode=TransformType::Mode>
 struct projective_transform_inverse
 {
-  static inline void run(const TransformType&, TransformType&)
+  EIGEN_DEVICE_FUNC static inline void run(const TransformType&, TransformType&)
   {}
 };
 
 template<typename TransformType>
 struct projective_transform_inverse<TransformType, Projective>
 {
-  static inline void run(const TransformType& m, TransformType& res)
+  EIGEN_DEVICE_FUNC static inline void run(const TransformType& m, TransformType& res)
   {
     res.matrix() = m.matrix().inverse();
   }
@@ -1124,7 +1219,7 @@
   * \sa MatrixBase::inverse()
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>
+EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>
 Transform<Scalar,Dim,Mode,Options>::inverse(TransformTraits hint) const
 {
   Transform res;
@@ -1223,8 +1318,8 @@
 template<int LhsMode,int RhsMode>
 struct transform_product_result
 {
-  enum 
-  { 
+  enum
+  {
     Mode =
       (LhsMode == (int)Projective    || RhsMode == (int)Projective    ) ? Projective :
       (LhsMode == (int)Affine        || RhsMode == (int)Affine        ) ? Affine :
@@ -1233,22 +1328,22 @@
   };
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 0 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols>
 {
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     return T.matrix() * other;
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 1 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 {
-  enum { 
-    Dim = TransformType::Dim, 
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1256,7 +1351,7 @@
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
@@ -1265,16 +1360,16 @@
     ResultType res(other.rows(),other.cols());
     TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other;
     res.row(OtherRows-1) = other.row(OtherRows-1);
-    
+
     return res;
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 2 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols>
 {
-  enum { 
-    Dim = TransformType::Dim, 
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1282,7 +1377,7 @@
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
@@ -1294,6 +1389,30 @@
   }
 };
 
+template< typename TransformType, typename MatrixType >
+struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is a vector of size Dim
+{
+  typedef typename TransformType::MatrixType TransformMatrix;
+  enum {
+    Dim = TransformType::Dim,
+    HDim = TransformType::HDim,
+    OtherRows = MatrixType::RowsAtCompileTime,
+    WorkingRows = EIGEN_PLAIN_ENUM_MIN(TransformMatrix::RowsAtCompileTime,HDim)
+  };
+
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  {
+    EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+
+    Matrix<typename ResultType::Scalar, Dim+1, 1> rhs;
+    rhs.template head<Dim>() = other; rhs[Dim] = typename ResultType::Scalar(1);
+    Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
+    return res.template head<Dim>();
+  }
+};
+
 /**********************************************************
 ***   Specializations of operator* with lhs EigenBase   ***
 **********************************************************/

diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 7fda179..8c22901 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h

@@ -18,8 +18,8 @@
   *
   * \brief Represents a translation transformation
   *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
+  * \tparam _Scalar the scalar type, i.e., the type of the coefficients.
+  * \tparam _Dim the  dimension of the space, can be a compile time value or Dynamic
   *
   * \note This class is not aimed to be used to store a translation transformation,
   * but rather to make easier the constructions and updates of Transform objects.
@@ -51,16 +51,16 @@
 public:
 
   /** Default constructor without initialization. */
-  Translation() {}
+  EIGEN_DEVICE_FUNC Translation() {}
   /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy)
+  EIGEN_DEVICE_FUNC inline Translation(const Scalar& sx, const Scalar& sy)
   {
     eigen_assert(Dim==2);
     m_coeffs.x() = sx;
     m_coeffs.y() = sy;
   }
   /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+  EIGEN_DEVICE_FUNC inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
   {
     eigen_assert(Dim==3);
     m_coeffs.x() = sx;
@@ -68,48 +68,48 @@
     m_coeffs.z() = sz;
   }
   /** Constructs and initialize the translation transformation from a vector of translation coefficients */
-  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
+  EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
-  /** \brief Retruns the x-translation by value. **/
-  inline Scalar x() const { return m_coeffs.x(); }
-  /** \brief Retruns the y-translation by value. **/
-  inline Scalar y() const { return m_coeffs.y(); }
-  /** \brief Retruns the z-translation by value. **/
-  inline Scalar z() const { return m_coeffs.z(); }
+  /** \brief Returns the x-translation by value. **/
+  EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); }
+  /** \brief Returns the y-translation by value. **/
+  EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); }
+  /** \brief Returns the z-translation by value. **/
+  EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); }
 
-  /** \brief Retruns the x-translation as a reference. **/
-  inline Scalar& x() { return m_coeffs.x(); }
-  /** \brief Retruns the y-translation as a reference. **/
-  inline Scalar& y() { return m_coeffs.y(); }
-  /** \brief Retruns the z-translation as a reference. **/
-  inline Scalar& z() { return m_coeffs.z(); }
+  /** \brief Returns the x-translation as a reference. **/
+  EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); }
+  /** \brief Returns the y-translation as a reference. **/
+  EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); }
+  /** \brief Returns the z-translation as a reference. **/
+  EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); }
 
-  const VectorType& vector() const { return m_coeffs; }
-  VectorType& vector() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
 
-  const VectorType& translation() const { return m_coeffs; }
-  VectorType& translation() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC const VectorType& translation() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC VectorType& translation() { return m_coeffs; }
 
   /** Concatenates two translation */
-  inline Translation operator* (const Translation& other) const
+  EIGEN_DEVICE_FUNC inline Translation operator* (const Translation& other) const
   { return Translation(m_coeffs + other.m_coeffs); }
 
   /** Concatenates a translation and a uniform scaling */
-  inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;
 
   /** Concatenates a translation and a linear transformation */
   template<typename OtherDerived>
-  inline AffineTransformType operator* (const EigenBase<OtherDerived>& linear) const;
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator* (const EigenBase<OtherDerived>& linear) const;
 
   /** Concatenates a translation and a rotation */
   template<typename Derived>
-  inline IsometryTransformType operator*(const RotationBase<Derived,Dim>& r) const
+  EIGEN_DEVICE_FUNC inline IsometryTransformType operator*(const RotationBase<Derived,Dim>& r) const
   { return *this * IsometryTransformType(r); }
 
   /** \returns the concatenation of a linear transformation \a l with the translation \a t */
   // its a nightmare to define a templated friend function outside its declaration
   template<typename OtherDerived> friend
-  inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear, const Translation& t)
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear, const Translation& t)
   {
     AffineTransformType res;
     res.matrix().setZero();
@@ -122,7 +122,7 @@
 
   /** Concatenates a translation and a transformation */
   template<int Mode, int Options>
-  inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode,Options>& t) const
+  EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode,Options>& t) const
   {
     Transform<Scalar,Dim,Mode> res = t;
     res.pretranslate(m_coeffs);
@@ -130,18 +130,14 @@
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
 
-  Translation& operator=(const Translation& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
   static const Translation Identity() { return Translation(VectorType::Zero()); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
@@ -150,19 +146,19 @@
     * then this function smartly returns a const reference to \c *this.
     */
   template<typename NewScalarType>
-  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
   { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
 
   /** Copy constructor with scalar type conversion */
   template<typename OtherScalarType>
-  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
+  EIGEN_DEVICE_FUNC inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
   { m_coeffs = other.vector().template cast<Scalar>(); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+  EIGEN_DEVICE_FUNC bool isApprox(const Translation& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return m_coeffs.isApprox(other.m_coeffs, prec); }
 
 };
@@ -176,7 +172,7 @@
 //@}
 
 template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::AffineTransformType
+EIGEN_DEVICE_FUNC inline typename Translation<Scalar,Dim>::AffineTransformType
 Translation<Scalar,Dim>::operator* (const UniformScaling<Scalar>& other) const
 {
   AffineTransformType res;
@@ -189,7 +185,7 @@
 
 template<typename Scalar, int Dim>
 template<typename OtherDerived>
-inline typename Translation<Scalar,Dim>::AffineTransformType
+EIGEN_DEVICE_FUNC inline typename Translation<Scalar,Dim>::AffineTransformType
 Translation<Scalar,Dim>::operator* (const EigenBase<OtherDerived>& linear) const
 {
   AffineTransformType res;

diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index 5e20662..6b75500 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h

@@ -87,7 +87,7 @@
 * \f{align*}
 *   T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix}
 * \f}
-* minimizing the resudiual above. This transformation is always returned as an 
+* minimizing the residual above. This transformation is always returned as an 
 * Eigen::Matrix.
 */
 template <typename Derived, typename OtherDerived>
@@ -97,7 +97,6 @@
   typedef typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type TransformationMatrixType;
   typedef typename internal::traits<TransformationMatrixType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
 
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename internal::traits<OtherDerived>::Scalar>::value),
@@ -136,22 +135,12 @@
 
   // Eq. (39)
   VectorType S = VectorType::Ones(m);
-  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);
+
+  if  ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 )
+    S(m-1) = -1;
 
   // Eq. (40) and (43)
-  const VectorType& d = svd.singularValues();
-  Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
-  if (rank == m-1) {
-    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
-      Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
-    } else {
-      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
-      Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
-      S(m-1) = s;
-    }
-  } else {
-    Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
-  }
+  Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
 
   if (with_scaling)
   {

diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h
new file mode 100644
index 0000000..9af6a9a
--- /dev/null
+++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h

@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GEOMETRY_SIMD_H
+#define EIGEN_GEOMETRY_SIMD_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::Target, Derived, OtherDerived, float>
+{
+  enum {
+    AAlignment = traits<Derived>::Alignment,
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<float> >::Alignment
+  };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+    evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+    Quaternion<float> res;
+    const float neg_zero = numext::bit_cast<float>(0x80000000u);
+    const float arr[4] = {0.f, 0.f, 0.f, neg_zero};
+    const Packet4f mask = ploadu<Packet4f>(arr);
+    Packet4f a = ae.template packet<AAlignment,Packet4f>(0);
+    Packet4f b = be.template packet<BAlignment,Packet4f>(0);
+    Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+    pstoret<float,Packet4f,ResAlignment>(
+              &res.x(),
+              padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
+                                    pmul(vec4f_swizzle1(a,2,0,1,0),
+                                               vec4f_swizzle1(b,1,2,0,0))),
+                         pxor(mask,padd(s1,s2))));
+    
+    return res;
+  }
+};
+
+template<class Derived>
+struct quat_conj<Architecture::Target, Derived, float>
+{
+  enum {
+    ResAlignment = traits<Quaternion<float> >::Alignment
+  };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
+  {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<float> res;
+    const float neg_zero = numext::bit_cast<float>(0x80000000u);
+    const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f};
+    const Packet4f mask = ploadu<Packet4f>(arr);
+    pstoret<float,Packet4f,ResAlignment>(&res.x(), pxor(mask, qe.template packet<traits<Derived>::Alignment,Packet4f>(0)));
+    return res;
+  }
+};
+
+
+template<typename VectorLhs,typename VectorRhs>
+struct cross3_impl<Architecture::Target,VectorLhs,VectorRhs,float,true>
+{
+  enum {
+    ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
+  };
+  static inline typename plain_matrix_type<VectorLhs>::type
+  run(const VectorLhs& lhs, const VectorRhs& rhs)
+  {
+    evaluator<VectorLhs> lhs_eval(lhs);
+    evaluator<VectorRhs> rhs_eval(rhs);
+    Packet4f a = lhs_eval.template packet<traits<VectorLhs>::Alignment,Packet4f>(0);
+    Packet4f b = rhs_eval.template packet<traits<VectorRhs>::Alignment,Packet4f>(0);
+    Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
+    Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
+    typename plain_matrix_type<VectorLhs>::type res;
+    pstoret<float,Packet4f,ResAlignment>(&res.x(),psub(mul1,mul2));
+    return res;
+  }
+};
+
+
+
+#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64)
+
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::Target, Derived, OtherDerived, double>
+{
+  enum {
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<double> >::Alignment
+  };
+
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+  Quaternion<double> res;
+
+  evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+  evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+
+  const double* a = _a.coeffs().data();
+  Packet2d b_xy = be.template packet<BAlignment,Packet2d>(0);
+  Packet2d b_zw = be.template packet<BAlignment,Packet2d>(2);
+  Packet2d a_xx = pset1<Packet2d>(a[0]);
+  Packet2d a_yy = pset1<Packet2d>(a[1]);
+  Packet2d a_zz = pset1<Packet2d>(a[2]);
+  Packet2d a_ww = pset1<Packet2d>(a[3]);
+
+  // two temporaries:
+  Packet2d t1, t2;
+
+  /*
+   * t1 = ww*xy + yy*zw
+   * t2 = zz*xy - xx*zw
+   * res.xy = t1 +/- swap(t2)
+   */
+  t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
+  t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
+  pstoret<double,Packet2d,ResAlignment>(&res.x(), paddsub(t1, preverse(t2)));
+  
+  /*
+   * t1 = ww*zw - yy*xy
+   * t2 = zz*zw + xx*xy
+   * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
+   */
+  t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
+  t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
+  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(paddsub(preverse(t1), t2)));
+
+  return res;
+}
+};
+
+template<class Derived>
+struct quat_conj<Architecture::Target, Derived, double>
+{
+  enum {
+    ResAlignment = traits<Quaternion<double> >::Alignment
+  };
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
+  {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<double> res;
+    const double neg_zero = numext::bit_cast<double>(0x8000000000000000ull);
+    const double arr1[2] = {neg_zero, neg_zero};
+    const double arr2[2] = {neg_zero,  0.0};
+    const Packet2d mask0 = ploadu<Packet2d>(arr1);
+    const Packet2d mask2 = ploadu<Packet2d>(arr2);
+    pstoret<double,Packet2d,ResAlignment>(&res.x(), pxor(mask0, qe.template packet<traits<Derived>::Alignment,Packet2d>(0)));
+    pstoret<double,Packet2d,ResAlignment>(&res.z(), pxor(mask2, qe.template packet<traits<Derived>::Alignment,Packet2d>(2)));
+    return res;
+  }
+};
+
+#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GEOMETRY_SIMD_H

diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index f7e9939..39ce1c2 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h

@@ -13,10 +13,10 @@
 
 // This file contains some helper function to deal with block householder reflectors
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
-
+  
 /** \internal */
 // template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
 // void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
@@ -24,12 +24,12 @@
 //   typedef typename VectorsType::Scalar Scalar;
 //   const Index nbVecs = vectors.cols();
 //   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
-//
+// 
 //   for(Index i = 0; i < nbVecs; i++)
 //   {
 //     Index rs = vectors.rows() - i;
 //     // Warning, note that hCoeffs may alias with vectors.
-//     // It is then necessary to copy it before modifying vectors(i,i).
+//     // It is then necessary to copy it before modifying vectors(i,i). 
 //     typename CoeffsType::Scalar h = hCoeffs(i);
 //     // This hack permits to pass trough nested Block<> and Transpose<> expressions.
 //     Scalar *Vii_ptr = const_cast<Scalar*>(vectors.data() + vectors.outerStride()*i + vectors.innerStride()*i);
@@ -62,10 +62,17 @@
     {
       triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
                                                         * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
-
-      // FIXME add .noalias() once the triangular product can work inplace
-      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
-
+            
+      // FIXME use the following line with .noalias() once the triangular product can work inplace
+      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      for(Index j=nbVecs-1; j>i; --j)
+      {
+        typename TriangularFactorType::Scalar z = triFactor(i,j);
+        triFactor(i,j) = z * triFactor(j,j);
+        if(nbVecs-j-1>0)
+          triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
+      }
+      
     }
     triFactor(i,i) = hCoeffs(i);
   }
@@ -81,14 +88,15 @@
   enum { TFactorSize = MatrixType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
   Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, RowMajor> T(nbVecs,nbVecs);
-
+  
   if(forward) make_block_householder_triangular_factor(T, vectors, hCoeffs);
-  else        make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate());
+  else        make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate());  
   const TriangularView<const VectorsType, UnitLower> V(vectors);
 
   // A -= V T V^* A
-  Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0,
-         VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = (V.adjoint() * mat).eval();
+  Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,
+         (VectorsType::MaxColsAtCompileTime==1 && MatrixType::MaxColsAtCompileTime!=1)?RowMajor:ColMajor,
+         VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat;
   // FIXME add .noalias() once the triangular product can work inplace
   if(forward) tmp = T.template triangularView<Upper>()           * tmp;
   else        tmp = T.template triangularView<Upper>().adjoint() * tmp;

diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 32112af..5bc037f 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h

@@ -39,6 +39,7 @@
   *     MatrixBase::applyHouseholderOnTheRight()
   */
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
 {
   VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
@@ -62,6 +63,7 @@
   */
 template<typename Derived>
 template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::makeHouseholder(
   EssentialPart& essential,
   Scalar& tau,
@@ -75,8 +77,9 @@
   
   RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm();
   Scalar c0 = coeff(0);
+  const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
-  if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0))
+  if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol)
   {
     tau = RealScalar(0);
     beta = numext::real(c0);
@@ -102,13 +105,14 @@
   * \param essential the essential part of the vector \c v
   * \param tau the scaling factor of the Householder transformation
   * \param workspace a pointer to working space with at least
-  *                  this->cols() * essential.size() entries
+  *                  this->cols() entries
   *
   * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
   *     MatrixBase::applyHouseholderOnTheRight()
   */
 template<typename Derived>
 template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   const EssentialPart& essential,
   const Scalar& tau,
@@ -118,7 +122,7 @@
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -139,13 +143,14 @@
   * \param essential the essential part of the vector \c v
   * \param tau the scaling factor of the Householder transformation
   * \param workspace a pointer to working space with at least
-  *                  this->cols() * essential.size() entries
+  *                  this->rows() entries
   *
   * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
   *     MatrixBase::applyHouseholderOnTheLeft()
   */
 template<typename Derived>
 template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::applyHouseholderOnTheRight(
   const EssentialPart& essential,
   const Scalar& tau,
@@ -155,14 +160,14 @@
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
-    tmp.noalias() = right * essential.conjugate();
+    tmp.noalias() = right * essential;
     tmp += this->col(0);
     this->col(0) -= tau * tmp;
-    right.noalias() -= tau * tmp * essential.transpose();
+    right.noalias() -= tau * tmp * essential.adjoint();
   }
 }
 

diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 340e89f..022f6c3 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h

@@ -60,7 +60,7 @@
 struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
 {
   typedef typename VectorsType::Scalar Scalar;
-  typedef typename VectorsType::Index Index;
+  typedef typename VectorsType::StorageIndex StorageIndex;
   typedef typename VectorsType::StorageKind StorageKind;
   enum {
     RowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::RowsAtCompileTime
@@ -73,13 +73,21 @@
   };
 };
 
+struct HouseholderSequenceShape {};
+
+template<typename VectorsType, typename CoeffsType, int Side>
+struct evaluator_traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
+  : public evaluator_traits_base<HouseholderSequence<VectorsType,CoeffsType,Side> >
+{
+  typedef HouseholderSequenceShape Shape;
+};
+
 template<typename VectorsType, typename CoeffsType, int Side>
 struct hseq_side_dependent_impl
 {
   typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
-  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+  static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
     return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
@@ -91,7 +99,6 @@
 {
   typedef Transpose<Block<const VectorsType, 1, Dynamic> > EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheRight> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
   static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
@@ -101,7 +108,7 @@
 
 template<typename OtherScalarType, typename MatrixType> struct matrix_type_times_scalar_type
 {
-  typedef typename scalar_product_traits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
+  typedef typename ScalarBinaryOpTraits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
     ResultScalar;
   typedef Matrix<ResultScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
                  0, MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime> Type;
@@ -122,7 +129,6 @@
       MaxColsAtCompileTime = internal::traits<HouseholderSequence>::MaxColsAtCompileTime
     };
     typedef typename internal::traits<HouseholderSequence>::Scalar Scalar;
-    typedef typename VectorsType::Index Index;
 
     typedef HouseholderSequence<
       typename internal::conditional<NumTraits<Scalar>::IsComplex,
@@ -134,6 +140,28 @@
       Side
     > ConjugateReturnType;
 
+    typedef HouseholderSequence<
+      VectorsType,
+      typename internal::conditional<NumTraits<Scalar>::IsComplex,
+        typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
+        CoeffsType>::type,
+      Side
+    > AdjointReturnType;
+
+    typedef HouseholderSequence<
+      typename internal::conditional<NumTraits<Scalar>::IsComplex,
+        typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
+        VectorsType>::type,
+      CoeffsType,
+      Side
+    > TransposeReturnType;
+
+    typedef HouseholderSequence<
+      typename internal::add_const<VectorsType>::type,
+      typename internal::add_const<CoeffsType>::type,
+      Side
+    > ConstHouseholderSequence;
+
     /** \brief Constructor.
       * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
       * \param[in]  h      Vector containing the Householder coefficients
@@ -151,17 +179,19 @@
       *
       * \sa setLength(), setShift()
       */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence(const VectorsType& v, const CoeffsType& h)
-      : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
+      : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()),
         m_shift(0)
     {
     }
 
     /** \brief Copy constructor. */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence(const HouseholderSequence& other)
       : m_vectors(other.m_vectors),
         m_coeffs(other.m_coeffs),
-        m_trans(other.m_trans),
+        m_reverse(other.m_reverse),
         m_length(other.m_length),
         m_shift(other.m_shift)
     {
@@ -171,13 +201,15 @@
       * \returns Number of rows
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
 
     /** \brief Number of columns of transformation viewed as a matrix.
       * \returns Number of columns
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    Index cols() const { return rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return rows(); }
 
     /** \brief Essential part of a Householder vector.
       * \param[in]  k  Index of Householder reflection
@@ -193,6 +225,7 @@
       *
       * \sa setShift(), shift()
       */
+    EIGEN_DEVICE_FUNC
     const EssentialVectorType essentialVector(Index k) const
     {
       eigen_assert(k >= 0 && k < m_length);
@@ -200,31 +233,51 @@
     }
 
     /** \brief %Transpose of the Householder sequence. */
-    HouseholderSequence transpose() const
+    TransposeReturnType transpose() const
     {
-      return HouseholderSequence(*this).setTrans(!m_trans);
+      return TransposeReturnType(m_vectors.conjugate(), m_coeffs)
+              .setReverseFlag(!m_reverse)
+              .setLength(m_length)
+              .setShift(m_shift);
     }
 
     /** \brief Complex conjugate of the Householder sequence. */
     ConjugateReturnType conjugate() const
     {
       return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
-             .setTrans(m_trans)
+             .setReverseFlag(m_reverse)
              .setLength(m_length)
              .setShift(m_shift);
     }
 
-    /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
-    ConjugateReturnType adjoint() const
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type
+    conjugateIf() const
     {
-      return conjugate().setTrans(!m_trans);
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type ReturnType;
+      return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>());
+    }
+
+    /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
+    AdjointReturnType adjoint() const
+    {
+      return AdjointReturnType(m_vectors, m_coeffs.conjugate())
+              .setReverseFlag(!m_reverse)
+              .setLength(m_length)
+              .setShift(m_shift);
     }
 
     /** \brief Inverse of the Householder sequence (equals the adjoint). */
-    ConjugateReturnType inverse() const { return adjoint(); }
+    AdjointReturnType inverse() const { return adjoint(); }
 
     /** \internal */
-    template<typename DestType> inline void evalTo(DestType& dst) const
+    template<typename DestType>
+    inline EIGEN_DEVICE_FUNC
+    void evalTo(DestType& dst) const
     {
       Matrix<Scalar, DestType::RowsAtCompileTime, 1,
              AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
@@ -233,12 +286,12 @@
 
     /** \internal */
     template<typename Dest, typename Workspace>
+    EIGEN_DEVICE_FUNC
     void evalTo(Dest& dst, Workspace& workspace) const
     {
       workspace.resize(rows());
       Index vecs = m_length;
-      if(    internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value
-          && internal::extract_data(dst) == internal::extract_data(m_vectors))
+      if(internal::is_same_dense(dst,m_vectors))
       {
         // in-place
         dst.diagonal().setOnes();
@@ -246,7 +299,7 @@
         for(Index k = vecs-1; k >= 0; --k)
         {
           Index cornerSize = rows() - k - m_shift;
-          if(m_trans)
+          if(m_reverse)
             dst.bottomRightCorner(cornerSize, cornerSize)
                .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
           else
@@ -260,18 +313,26 @@
         for(Index k = 0; k<cols()-vecs ; ++k)
           dst.col(k).tail(rows()-k-1).setZero();
       }
+      else if(m_length>BlockSize)
+      {
+        dst.setIdentity(rows(), rows());
+        if(m_reverse)
+          applyThisOnTheLeft(dst,workspace,true);
+        else
+          applyThisOnTheLeft(dst,workspace,true);
+      }
       else
       {
         dst.setIdentity(rows(), rows());
         for(Index k = vecs-1; k >= 0; --k)
         {
           Index cornerSize = rows() - k - m_shift;
-          if(m_trans)
+          if(m_reverse)
             dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
           else
             dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
         }
       }
     }
@@ -290,32 +351,34 @@
       workspace.resize(dst.rows());
       for(Index k = 0; k < m_length; ++k)
       {
-        Index actual_k = m_trans ? m_length-k-1 : k;
+        Index actual_k = m_reverse ? m_length-k-1 : k;
         dst.rightCols(rows()-m_shift-actual_k)
            .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
       }
     }
 
     /** \internal */
-    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const
     {
-      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace(dst.cols());
-      applyThisOnTheLeft(dst, workspace);
+      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
+      applyThisOnTheLeft(dst, workspace, inputIsIdentity);
     }
 
     /** \internal */
     template<typename Dest, typename Workspace>
-    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
+    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const
     {
-#if defined(NDEBUG) || !defined(EIGEN_COMP_MSVC_STRICT)
-      const Index BlockSize = 48;
+      if(inputIsIdentity && m_reverse)
+        inputIsIdentity = false;
       // if the entries are large enough, then apply the reflectors by block
       if(m_length>=BlockSize && dst.cols()>1)
       {
-        for(Index i = 0; i < m_length; i+=BlockSize)
+        // Make sure we have at least 2 useful blocks, otherwise it is point-less:
+        Index blockSize = m_length<Index(2*BlockSize) ? (m_length+1)/2 : Index(BlockSize);
+        for(Index i = 0; i < m_length; i+=blockSize)
         {
-          Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
-          Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
+          Index end = m_reverse ? (std::min)(m_length,i+blockSize) : m_length-i;
+          Index k = m_reverse ? i : (std::max)(Index(0),end-blockSize);
           Index bs = end-k;
           Index start = k + m_shift;
 
@@ -325,8 +388,15 @@
                                                                    Side==OnTheRight ? bs : m_vectors.rows()-start,
                                                                    Side==OnTheRight ? m_vectors.cols()-start : bs);
           typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
-          Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
-          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
+
+          Index dstStart = dst.rows()-rows()+m_shift+k;
+          Index dstRows  = rows()-m_shift-k;
+          Block<Dest,Dynamic,Dynamic> sub_dst(dst,
+                                              dstStart,
+                                              inputIsIdentity ? dstStart : 0,
+                                              dstRows,
+                                              inputIsIdentity ? dstRows : dst.cols());
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
         }
       }
       else
@@ -334,21 +404,12 @@
         workspace.resize(dst.cols());
         for(Index k = 0; k < m_length; ++k)
         {
-          Index actual_k = m_trans ? k : m_length-k-1;
-          dst.bottomRows(rows()-m_shift-actual_k)
+          Index actual_k = m_reverse ? k : m_length-k-1;
+          Index dstStart = rows()-m_shift-actual_k;
+          dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols())
             .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
         }
       }
-#else
-      // The blocked code above does not compile in debug mode with MSVC.
-      workspace.resize(dst.cols());
-      for(Index k = 0; k < m_length; ++k)
-      {
-        Index actual_k = m_trans ? k : m_length-k-1;
-        dst.bottomRows(rows()-m_shift-actual_k)
-            .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
-      }
-#endif
     }
 
     /** \brief Computes the product of a Householder sequence with a matrix.
@@ -363,7 +424,7 @@
     {
       typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
         res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
-      applyThisOnTheLeft(res);
+      applyThisOnTheLeft(res, internal::is_identity<OtherDerived>::value && res.rows()==res.cols());
       return res;
     }
 
@@ -378,6 +439,7 @@
       *
       * \sa length()
       */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence& setLength(Index length)
     {
       m_length = length;
@@ -395,13 +457,17 @@
       *
       * \sa shift()
       */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence& setShift(Index shift)
     {
       m_shift = shift;
       return *this;
     }
 
+    EIGEN_DEVICE_FUNC
     Index length() const { return m_length; }  /**< \brief Returns the length of the Householder sequence. */
+
+    EIGEN_DEVICE_FUNC
     Index shift() const { return m_shift; }    /**< \brief Returns the shift of the Householder sequence. */
 
     /* Necessary for .adjoint() and .conjugate() */
@@ -409,27 +475,30 @@
 
   protected:
 
-    /** \brief Sets the transpose flag.
-      * \param [in]  trans  New value of the transpose flag.
+    /** \internal
+      * \brief Sets the reverse flag.
+      * \param [in]  reverse  New value of the reverse flag.
       *
-      * By default, the transpose flag is not set. If the transpose flag is set, then this object represents
-      * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+      * By default, the reverse flag is not set. If the reverse flag is set, then this object represents
+      * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+      * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$.
       *
-      * \sa trans()
+      * \sa reverseFlag(), transpose(), adjoint()
       */
-    HouseholderSequence& setTrans(bool trans)
+    HouseholderSequence& setReverseFlag(bool reverse)
     {
-      m_trans = trans;
+      m_reverse = reverse;
       return *this;
     }
 
-    bool trans() const { return m_trans; }     /**< \brief Returns the transpose flag. */
+    bool reverseFlag() const { return m_reverse; }     /**< \internal \brief Returns the reverse flag. */
 
     typename VectorsType::Nested m_vectors;
     typename CoeffsType::Nested m_coeffs;
-    bool m_trans;
+    bool m_reverse;
     Index m_length;
     Index m_shift;
+    enum { BlockSize = 48 };
 };
 
 /** \brief Computes the product of a matrix with a Householder sequence.

diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 1f3c060..a117fc1 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,53 +10,57 @@
 #ifndef EIGEN_BASIC_PRECONDITIONERS_H
 #define EIGEN_BASIC_PRECONDITIONERS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A preconditioner based on the digonal entries
   *
   * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix.
   * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
-  * \code
-  * A.diagonal().asDiagonal() . x = b
-  * \endcode
+    \code
+    A.diagonal().asDiagonal() . x = b
+    \endcode
   *
   * \tparam _Scalar the type of the scalar.
   *
+  * \implsparsesolverconcept
+  *
   * This preconditioner is suitable for both selfadjoint and general problems.
   * The diagonal entries are pre-inverted and stored into a dense vector.
   *
   * \note A variant that has yet to be implemented would attempt to preserve the norm of each column.
   *
+  * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient
   */
 template <typename _Scalar>
 class DiagonalPreconditioner
 {
     typedef _Scalar Scalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef typename Vector::Index Index;
-
   public:
-    // this typedef is only to export the scalar type and compile-time dimensions to solve_retval
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+    typedef typename Vector::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
 
     DiagonalPreconditioner() : m_isInitialized(false) {}
 
     template<typename MatType>
-    DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
+    explicit DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
     {
       compute(mat);
     }
 
-    Index rows() const { return m_invdiag.size(); }
-    Index cols() const { return m_invdiag.size(); }
-    
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+
     template<typename MatType>
     DiagonalPreconditioner& analyzePattern(const MatType& )
     {
       return *this;
     }
-    
+
     template<typename MatType>
     DiagonalPreconditioner& factorize(const MatType& mat)
     {
@@ -73,53 +77,124 @@
       m_isInitialized = true;
       return *this;
     }
-    
+
     template<typename MatType>
     DiagonalPreconditioner& compute(const MatType& mat)
     {
       return factorize(mat);
     }
 
+    /** \internal */
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
       x = m_invdiag.array() * b.array() ;
     }
 
-    template<typename Rhs> inline const internal::solve_retval<DiagonalPreconditioner, Rhs>
+    template<typename Rhs> inline const Solve<DiagonalPreconditioner, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
       eigen_assert(m_invdiag.size()==b.rows()
                 && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<DiagonalPreconditioner, Rhs>(*this, b.derived());
+      return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
     }
 
+    ComputationInfo info() { return Success; }
+
   protected:
     Vector m_invdiag;
     bool m_isInitialized;
 };
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<DiagonalPreconditioner<_MatrixType>, Rhs>
-  : solve_retval_base<DiagonalPreconditioner<_MatrixType>, Rhs>
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief Jacobi preconditioner for LeastSquaresConjugateGradient
+  *
+  * This class allows to approximately solve for A' A x  = A' b problems assuming A' A is a diagonal matrix.
+  * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
+    \code
+    (A.adjoint() * A).diagonal().asDiagonal() * x = b
+    \endcode
+  *
+  * \tparam _Scalar the type of the scalar.
+  *
+  * \implsparsesolverconcept
+  *
+  * The diagonal entries are pre-inverted and stored into a dense vector.
+  *
+  * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
+  */
+template <typename _Scalar>
+class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
 {
-  typedef DiagonalPreconditioner<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DiagonalPreconditioner<_Scalar> Base;
+    using Base::m_invdiag;
+  public:
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
+    LeastSquareDiagonalPreconditioner() : Base() {}
+
+    template<typename MatType>
+    explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base()
+    {
+      compute(mat);
+    }
+
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& analyzePattern(const MatType& )
+    {
+      return *this;
+    }
+
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& factorize(const MatType& mat)
+    {
+      // Compute the inverse squared-norm of each column of mat
+      m_invdiag.resize(mat.cols());
+      if(MatType::IsRowMajor)
+      {
+        m_invdiag.setZero();
+        for(Index j=0; j<mat.outerSize(); ++j)
+        {
+          for(typename MatType::InnerIterator it(mat,j); it; ++it)
+            m_invdiag(it.index()) += numext::abs2(it.value());
+        }
+        for(Index j=0; j<mat.cols(); ++j)
+          if(numext::real(m_invdiag(j))>RealScalar(0))
+            m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j));
+      }
+      else
+      {
+        for(Index j=0; j<mat.outerSize(); ++j)
+        {
+          RealScalar sum = mat.col(j).squaredNorm();
+          if(sum>RealScalar(0))
+            m_invdiag(j) = RealScalar(1)/sum;
+          else
+            m_invdiag(j) = RealScalar(1);
+        }
+      }
+      Base::m_isInitialized = true;
+      return *this;
+    }
+
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& compute(const MatType& mat)
+    {
+      return factorize(mat);
+    }
+
+    ComputationInfo info() { return Success; }
+
+  protected:
 };
 
-}
-
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A naive preconditioner which approximates any matrix as the identity matrix
   *
+  * \implsparsesolverconcept
+  *
   * \sa class DiagonalPreconditioner
   */
 class IdentityPreconditioner
@@ -129,19 +204,21 @@
     IdentityPreconditioner() {}
 
     template<typename MatrixType>
-    IdentityPreconditioner(const MatrixType& ) {}
-    
+    explicit IdentityPreconditioner(const MatrixType& ) {}
+
     template<typename MatrixType>
     IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
-    
+
     template<typename MatrixType>
     IdentityPreconditioner& factorize(const MatrixType& ) { return *this; }
 
     template<typename MatrixType>
     IdentityPreconditioner& compute(const MatrixType& ) { return *this; }
-    
+
     template<typename Rhs>
     inline const Rhs& solve(const Rhs& b) const { return b; }
+
+    ComputationInfo info() { return Success; }
 };
 
 } // end namespace Eigen

diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 7a46b51..153acef 100644
--- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -27,7 +27,7 @@
   */
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
-              const Preconditioner& precond, int& iters,
+              const Preconditioner& precond, Index& iters,
               typename Dest::RealScalar& tol_error)
 {
   using std::sqrt;
@@ -36,10 +36,9 @@
   typedef typename Dest::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,1> VectorType;
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
 
-  int n = mat.cols();
-  x = precond.solve(x);
+  Index n = mat.cols();
   VectorType r  = rhs - mat * x;
   VectorType r0 = r;
   
@@ -60,19 +59,21 @@
 
   VectorType s(n), t(n);
 
-  RealScalar tol2 = tol*tol;
-  int i = 0;
-  int restarts = 0;
+  RealScalar tol2 = tol*tol*rhs_sqnorm;
+  RealScalar eps2 = NumTraits<Scalar>::epsilon()*NumTraits<Scalar>::epsilon();
+  Index i = 0;
+  Index restarts = 0;
 
-  while ( r.squaredNorm()/rhs_sqnorm > tol2 && i<maxIters )
+  while ( r.squaredNorm() > tol2 && i<maxIters )
   {
     Scalar rho_old = rho;
 
     rho = r0.dot(r);
-    if (internal::isMuchSmallerThan(rho,r0_sqnorm))
+    if (abs(rho) < eps2*r0_sqnorm)
     {
-      // The new residual vector became too orthogonal to the arbitrarily choosen direction r0
+      // The new residual vector became too orthogonal to the arbitrarily chosen direction r0
       // Let's restart with a new r0:
+      r  = rhs - mat * x;
       r0 = r;
       rho = r0_sqnorm = r.squaredNorm();
       if(restarts++ == 0)
@@ -131,27 +132,33 @@
   * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
   * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
   *
+  * \implsparsesolverconcept
+  *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  * 
+  * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+  * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
   * \include BiCGSTAB_simple.cpp
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
-  * One can control the start using the solveWithGuess() method. Here is a step by
-  * step execution example starting with a random guess and printing the evolution
-  * of the estimated error:
-  * \include BiCGSTAB_step_by_step.cpp
-  * Note that such a step by step excution is slightly slower.
+  * One can control the start using the solveWithGuess() method.
   * 
+  * BiCGSTAB can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, typename _Preconditioner>
 class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner> >
 {
   typedef IterativeSolverBase<BiCGSTAB> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
@@ -159,7 +166,6 @@
 public:
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -178,77 +184,29 @@
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  BiCGSTAB(const MatrixType& A) : Base(A) {}
+  template<typename MatrixDerived>
+  explicit BiCGSTAB(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~BiCGSTAB() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<BiCGSTAB, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "BiCGSTAB is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "BiCGSTAB::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <BiCGSTAB, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
-  {    
-    bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
-        failed = true;
-    }
-    m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
-    m_isInitialized = true;
-  }
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
-  {
-//     x.setZero();
-  x = b;
-    _solveWithGuess(b,x);
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  {    
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    
+    bool ret = internal::bicgstab(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
+
+    m_info = (!ret) ? NumericalIssue
+           : m_error <= Base::m_tolerance ? Success
+           : NoConvergence;
   }
 
 protected:
 
 };
 
-
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef BiCGSTAB<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_BICGSTAB_H

diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 3ce5179..5d8c6b4 100644
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,7 +26,7 @@
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 EIGEN_DONT_INLINE
 void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
-                        const Preconditioner& precond, int& iters,
+                        const Preconditioner& precond, Index& iters,
                         typename Dest::RealScalar& tol_error)
 {
   using std::sqrt;
@@ -36,9 +36,9 @@
   typedef Matrix<Scalar,Dynamic,1> VectorType;
   
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
   
-  int n = mat.cols();
+  Index n = mat.cols();
 
   VectorType residual = rhs - mat * x; //initial residual
 
@@ -50,7 +50,8 @@
     tol_error = 0;
     return;
   }
-  RealScalar threshold = tol*tol*rhsNorm2;
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar threshold = numext::maxi(RealScalar(tol*tol*rhsNorm2),considerAsZero);
   RealScalar residualNorm2 = residual.squaredNorm();
   if (residualNorm2 < threshold)
   {
@@ -58,31 +59,31 @@
     tol_error = sqrt(residualNorm2 / rhsNorm2);
     return;
   }
-  
+
   VectorType p(n);
-  p = precond.solve(residual);      //initial search direction
+  p = precond.solve(residual);      // initial search direction
 
   VectorType z(n), tmp(n);
   RealScalar absNew = numext::real(residual.dot(p));  // the square of the absolute value of r scaled by invM
-  int i = 0;
+  Index i = 0;
   while(i < maxIters)
   {
-    tmp.noalias() = mat * p;              // the bottleneck of the algorithm
+    tmp.noalias() = mat * p;                    // the bottleneck of the algorithm
 
-    Scalar alpha = absNew / p.dot(tmp);   // the amount we travel on dir
-    x += alpha * p;                       // update solution
-    residual -= alpha * tmp;              // update residue
+    Scalar alpha = absNew / p.dot(tmp);         // the amount we travel on dir
+    x += alpha * p;                             // update solution
+    residual -= alpha * tmp;                    // update residual
     
     residualNorm2 = residual.squaredNorm();
     if(residualNorm2 < threshold)
       break;
     
-    z = precond.solve(residual);          // approximately solve for "A z = residual"
+    z = precond.solve(residual);                // approximately solve for "A z = residual"
 
     RealScalar absOld = absNew;
     absNew = numext::real(residual.dot(z));     // update the absolute value of r
-    RealScalar beta = absNew / absOld;            // calculate the Gram-Schmidt value used to create the new search direction
-    p = z + beta * p;                             // update search direction
+    RealScalar beta = absNew / absOld;          // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                           // update search direction
     i++;
   }
   tol_error = sqrt(residualNorm2 / rhsNorm2);
@@ -113,52 +114,51 @@
   * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse.
   *
   * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
+  *               \c Upper, or \c Lower|Upper in which the full matrix entries will be considered.
+  *               Default is \c Lower, best performance is \c Lower|Upper.
   * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
   *
+  * \implsparsesolverconcept
+  *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  * 
+  * \b Performance: Even though the default value of \c _UpLo is \c Lower, significantly higher performance is
+  * achieved when using a complete matrix and \b Lower|Upper as the \a _UpLo template parameter. Moreover, in this
+  * case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * ConjugateGradient<SparseMatrix<double> > cg;
-  * cg.compute(A);
-  * x = cg.solve(b);
-  * std::cout << "#iterations:     " << cg.iterations() << std::endl;
-  * std::cout << "estimated error: " << cg.error()      << std::endl;
-  * // update b, and solve again
-  * x = cg.solve(b);
-  * \endcode
+    \code
+    int n = 10000;
+    VectorXd x(n), b(n);
+    SparseMatrix<double> A(n,n);
+    // fill A and b
+    ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "#iterations:     " << cg.iterations() << std::endl;
+    std::cout << "estimated error: " << cg.error()      << std::endl;
+    // update b, and solve again
+    x = cg.solve(b);
+    \endcode
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
-  * One can control the start using the solveWithGuess() method. Here is a step by
-  * step execution example starting with a random guess and printing the evolution
-  * of the estimated error:
-  * * \code
-  * x = VectorXd::Random(n);
-  * cg.setMaxIterations(1);
-  * int i = 0;
-  * do {
-  *   x = cg.solveWithGuess(b,x);
-  *   std::cout << i << " : " << cg.error() << std::endl;
-  *   ++i;
-  * } while (cg.info()!=Success && i<100);
-  * \endcode
-  * Note that such a step by step excution is slightly slower.
+  * One can control the start using the solveWithGuess() method.
   * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  * ConjugateGradient can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, int _UpLo, typename _Preconditioner>
 class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
 {
   typedef IterativeSolverBase<ConjugateGradient> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
@@ -166,7 +166,6 @@
 public:
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -189,77 +188,42 @@
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  ConjugateGradient(const MatrixType& A) : Base(A) {}
+  template<typename MatrixDerived>
+  explicit ConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~ConjugateGradient() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<ConjugateGradient, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "ConjugateGradient::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <ConjugateGradient, Rhs, Guess>(*this, b.derived(), x0);
-  }
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
+    typedef typename Base::MatrixWrapper MatrixWrapper;
+    typedef typename Base::ActualMatrixType ActualMatrixType;
+    enum {
+      TransposeInput  =   (!MatrixWrapper::MatrixFree)
+                      &&  (UpLo==(Lower|Upper))
+                      &&  (!MatrixType::IsRowMajor)
+                      &&  (!NumTraits<Scalar>::IsComplex)
+    };
+    typedef typename internal::conditional<TransposeInput,Transpose<const ActualMatrixType>, ActualMatrixType const&>::type RowMajorWrapper;
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
+    typedef typename internal::conditional<UpLo==(Lower|Upper),
+                                           RowMajorWrapper,
+                                           typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
+                                          >::type SelfAdjointWrapper;
+
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-
-      typename Dest::ColXpr xj(x,j);
-      internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
-                                   Base::m_preconditioner, m_iterations, m_error);
-    }
-
-    m_isInitialized = true;
+    RowMajorWrapper row_mat(matrix());
+    internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b, x, Base::m_preconditioner, m_iterations, m_error);
     m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
-  
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
-  {
-    x.setOnes();
-    _solveWithGuess(b,x);
-  }
 
 protected:
 
 };
 
-
-namespace internal {
-
-template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
-struct solve_retval<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-  : solve_retval_base<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-{
-  typedef ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_CONJUGATE_GRADIENT_H

diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
new file mode 100644
index 0000000..7803fd8
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h

@@ -0,0 +1,394 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
+#define EIGEN_INCOMPLETE_CHOlESKY_H
+
+#include <vector>
+#include <list>
+
+namespace Eigen {
+/**
+  * \brief Modified Incomplete Cholesky with dual threshold
+  *
+  * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+  *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+  *
+  * \tparam Scalar the scalar type of the input matrices
+  * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower
+    *               or Upper. Default is Lower.
+  * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<int>,
+  *                       unless EIGEN_MPL2_ONLY is defined, in which case the default is NaturalOrdering<int>.
+  *
+  * \implsparsesolverconcept
+  *
+  * It performs the following incomplete factorization: \f$ S P A P' S \approx L L' \f$
+  * where L is a lower triangular factor, S is a diagonal scaling matrix, and P is a
+  * fill-in reducing permutation as computed by the ordering method.
+  *
+  * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$  be the scaled matrix on which the factorization is carried out,
+  * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly performed
+  * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta| I \f$ where
+  * \f$ \sigma \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$.
+  * If the factorization fails, then the shift in doubled until it succeed or a maximum of ten attempts. If it still fails, as returned by
+  * the info() method, then you can either increase the initial shift, or better use another preconditioning technique.
+  *
+  */
+template <typename Scalar, int _UpLo = Lower, typename _OrderingType = AMDOrdering<int> >
+class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> >
+{
+  protected:
+    typedef SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> > Base;
+    using Base::m_isInitialized;
+  public:
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef _OrderingType OrderingType;
+    typedef typename OrderingType::PermutationType PermutationType;
+    typedef typename PermutationType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> FactorType;
+    typedef Matrix<Scalar,Dynamic,1> VectorSx;
+    typedef Matrix<RealScalar,Dynamic,1> VectorRx;
+    typedef Matrix<StorageIndex,Dynamic, 1> VectorIx;
+    typedef std::vector<std::list<StorageIndex> > VectorList;
+    enum { UpLo = _UpLo };
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
+  public:
+
+    /** Default constructor leaving the object in a partly non-initialized stage.
+      *
+      * You must call compute() or the pair analyzePattern()/factorize() to make it valid.
+      *
+      * \sa IncompleteCholesky(const MatrixType&)
+      */
+    IncompleteCholesky() : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) {}
+
+    /** Constructor computing the incomplete factorization for the given matrix \a matrix.
+      */
+    template<typename MatrixType>
+    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false)
+    {
+      compute(matrix);
+    }
+
+    /** \returns number of rows of the factored matrix */
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); }
+
+    /** \returns number of columns of the factored matrix */
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); }
+
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * It triggers an assertion if \c *this has not been initialized through the respective constructor,
+      * or a call to compute() or analyzePattern().
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized.");
+      return m_info;
+    }
+
+    /** \brief Set the initial shift parameter \f$ \sigma \f$.
+      */
+    void setInitialShift(RealScalar shift) { m_initialShift = shift; }
+
+    /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat
+      */
+    template<typename MatrixType>
+    void analyzePattern(const MatrixType& mat)
+    {
+      OrderingType ord;
+      PermutationType pinv;
+      ord(mat.template selfadjointView<UpLo>(), pinv);
+      if(pinv.size()>0) m_perm = pinv.inverse();
+      else              m_perm.resize(0);
+      m_L.resize(mat.rows(), mat.cols());
+      m_analysisIsOk = true;
+      m_isInitialized = true;
+      m_info = Success;
+    }
+
+    /** \brief Performs the numerical factorization of the input matrix \a mat
+      *
+      * The method analyzePattern() or compute() must have been called beforehand
+      * with a matrix having the same pattern.
+      *
+      * \sa compute(), analyzePattern()
+      */
+    template<typename MatrixType>
+    void factorize(const MatrixType& mat);
+
+    /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat
+      *
+      * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods.
+      *
+      * \sa analyzePattern(), factorize()
+      */
+    template<typename MatrixType>
+    void compute(const MatrixType& mat)
+    {
+      analyzePattern(mat);
+      factorize(mat);
+    }
+
+    // internal
+    template<typename Rhs, typename Dest>
+    void _solve_impl(const Rhs& b, Dest& x) const
+    {
+      eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+      if (m_perm.rows() == b.rows())  x = m_perm * b;
+      else                            x = b;
+      x = m_scale.asDiagonal() * x;
+      x = m_L.template triangularView<Lower>().solve(x);
+      x = m_L.adjoint().template triangularView<Upper>().solve(x);
+      x = m_scale.asDiagonal() * x;
+      if (m_perm.rows() == b.rows())
+        x = m_perm.inverse() * x;
+    }
+
+    /** \returns the sparse lower triangular factor L */
+    const FactorType& matrixL() const { eigen_assert("m_factorizationIsOk"); return m_L; }
+
+    /** \returns a vector representing the scaling factor S */
+    const VectorRx& scalingS() const { eigen_assert("m_factorizationIsOk"); return m_scale; }
+
+    /** \returns the fill-in reducing permutation P (can be empty for a natural ordering) */
+    const PermutationType& permutationP() const { eigen_assert("m_analysisIsOk"); return m_perm; }
+
+  protected:
+    FactorType m_L;              // The lower part stored in CSC
+    VectorRx m_scale;            // The vector for scaling the matrix
+    RealScalar m_initialShift;   // The initial shift parameter
+    bool m_analysisIsOk;
+    bool m_factorizationIsOk;
+    ComputationInfo m_info;
+    PermutationType m_perm;
+
+  private:
+    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol);
+};
+
+// Based on the following paper:
+//   C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+//   Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+//   http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf
+template<typename Scalar, int _UpLo, typename OrderingType>
+template<typename _MatrixType>
+void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
+{
+  using std::sqrt;
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
+
+  // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
+
+  // Apply the fill-reducing permutation computed in analyzePattern()
+  if (m_perm.rows() == mat.rows() ) // To detect the null permutation
+  {
+    // The temporary is needed to make sure that the diagonal entry is properly sorted
+    FactorType tmp(mat.rows(), mat.cols());
+    tmp = mat.template selfadjointView<_UpLo>().twistedBy(m_perm);
+    m_L.template selfadjointView<Lower>() = tmp.template selfadjointView<Lower>();
+  }
+  else
+  {
+    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
+  }
+
+  Index n = m_L.cols();
+  Index nnz = m_L.nonZeros();
+  Map<VectorSx> vals(m_L.valuePtr(), nnz);         //values
+  Map<VectorIx> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
+  Map<VectorIx> colPtr( m_L.outerIndexPtr(), n+1); // Pointer to the beginning of each row
+  VectorIx firstElt(n-1); // for each j, points to the next entry in vals that will be used in the factorization
+  VectorList listCol(n);  // listCol(j) is a linked list of columns to update column j
+  VectorSx col_vals(n);   // Store a  nonzero values in each column
+  VectorIx col_irow(n);   // Row indices of nonzero elements in each column
+  VectorIx col_pattern(n);
+  col_pattern.fill(-1);
+  StorageIndex col_nnz;
+
+
+  // Computes the scaling factors
+  m_scale.resize(n);
+  m_scale.setZero();
+  for (Index j = 0; j < n; j++)
+    for (Index k = colPtr[j]; k < colPtr[j+1]; k++)
+    {
+      m_scale(j) += numext::abs2(vals(k));
+      if(rowIdx[k]!=j)
+        m_scale(rowIdx[k]) += numext::abs2(vals(k));
+    }
+
+  m_scale = m_scale.cwiseSqrt().cwiseSqrt();
+
+  for (Index j = 0; j < n; ++j)
+    if(m_scale(j)>(std::numeric_limits<RealScalar>::min)())
+      m_scale(j) = RealScalar(1)/m_scale(j);
+    else
+      m_scale(j) = 1;
+
+  // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
+
+  // Scale and compute the shift for the matrix
+  RealScalar mindiag = NumTraits<RealScalar>::highest();
+  for (Index j = 0; j < n; j++)
+  {
+    for (Index k = colPtr[j]; k < colPtr[j+1]; k++)
+      vals[k] *= (m_scale(j)*m_scale(rowIdx[k]));
+    eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored");
+    mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag);
+  }
+
+  FactorType L_save = m_L;
+
+  RealScalar shift = 0;
+  if(mindiag <= RealScalar(0.))
+    shift = m_initialShift - mindiag;
+
+  m_info = NumericalIssue;
+
+  // Try to perform the incomplete factorization using the current shift
+  int iter = 0;
+  do
+  {
+    // Apply the shift to the diagonal elements of the matrix
+    for (Index j = 0; j < n; j++)
+      vals[colPtr[j]] += shift;
+
+    // jki version of the Cholesky factorization
+    Index j=0;
+    for (; j < n; ++j)
+    {
+      // Left-looking factorization of the j-th column
+      // First, load the j-th column into col_vals
+      Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
+      col_nnz = 0;
+      for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
+      {
+        StorageIndex l = rowIdx[i];
+        col_vals(col_nnz) = vals[i];
+        col_irow(col_nnz) = l;
+        col_pattern(l) = col_nnz;
+        col_nnz++;
+      }
+      {
+        typename std::list<StorageIndex>::iterator k;
+        // Browse all previous columns that will update column j
+        for(k = listCol[j].begin(); k != listCol[j].end(); k++)
+        {
+          Index jk = firstElt(*k); // First element to use in the column
+          eigen_internal_assert(rowIdx[jk]==j);
+          Scalar v_j_jk = numext::conj(vals[jk]);
+
+          jk += 1;
+          for (Index i = jk; i < colPtr[*k+1]; i++)
+          {
+            StorageIndex l = rowIdx[i];
+            if(col_pattern[l]<0)
+            {
+              col_vals(col_nnz) = vals[i] * v_j_jk;
+              col_irow[col_nnz] = l;
+              col_pattern(l) = col_nnz;
+              col_nnz++;
+            }
+            else
+              col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
+          }
+          updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
+        }
+      }
+
+      // Scale the current column
+      if(numext::real(diag) <= 0)
+      {
+        if(++iter>=10)
+          return;
+
+        // increase shift
+        shift = numext::maxi(m_initialShift,RealScalar(2)*shift);
+        // restore m_L, col_pattern, and listCol
+        vals = Map<const VectorSx>(L_save.valuePtr(), nnz);
+        rowIdx = Map<const VectorIx>(L_save.innerIndexPtr(), nnz);
+        colPtr = Map<const VectorIx>(L_save.outerIndexPtr(), n+1);
+        col_pattern.fill(-1);
+        for(Index i=0; i<n; ++i)
+          listCol[i].clear();
+
+        break;
+      }
+
+      RealScalar rdiag = sqrt(numext::real(diag));
+      vals[colPtr[j]] = rdiag;
+      for (Index k = 0; k<col_nnz; ++k)
+      {
+        Index i = col_irow[k];
+        //Scale
+        col_vals(k) /= rdiag;
+        //Update the remaining diagonals with col_vals
+        vals[colPtr[i]] -= numext::abs2(col_vals(k));
+      }
+      // Select the largest p elements
+      // p is the original number of elements in the column (without the diagonal)
+      Index p = colPtr[j+1] - colPtr[j] - 1 ;
+      Ref<VectorSx> cvals = col_vals.head(col_nnz);
+      Ref<VectorIx> cirow = col_irow.head(col_nnz);
+      internal::QuickSplit(cvals,cirow, p);
+      // Insert the largest p elements in the matrix
+      Index cpt = 0;
+      for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++)
+      {
+        vals[i] = col_vals(cpt);
+        rowIdx[i] = col_irow(cpt);
+        // restore col_pattern:
+        col_pattern(col_irow(cpt)) = -1;
+        cpt++;
+      }
+      // Get the first smallest row index and put it after the diagonal element
+      Index jk = colPtr(j)+1;
+      updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
+    }
+
+    if(j==n)
+    {
+      m_factorizationIsOk = true;
+      m_info = Success;
+    }
+  } while(m_info!=Success);
+}
+
+template<typename Scalar, int _UpLo, typename OrderingType>
+inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol)
+{
+  if (jk < colPtr(col+1) )
+  {
+    Index p = colPtr(col+1) - jk;
+    Index minpos;
+    rowIdx.segment(jk,p).minCoeff(&minpos);
+    minpos += jk;
+    if (rowIdx(minpos) != rowIdx(jk))
+    {
+      //Swap
+      std::swap(rowIdx(jk),rowIdx(minpos));
+      std::swap(vals(jk),vals(minpos));
+    }
+    firstElt(col) = internal::convert_index<StorageIndex,Index>(jk);
+    listCol[rowIdx(jk)].push_back(internal::convert_index<StorageIndex,Index>(col));
+  }
+}
+
+} // end namespace Eigen
+
+#endif

diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index b55afc1..cdcf709 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,20 +12,20 @@
 #define EIGEN_INCOMPLETE_LUT_H
 
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-    
+
 /** \internal
-  * Compute a quick-sort split of a vector 
+  * Compute a quick-sort split of a vector
   * On output, the vector row is permuted such that its elements satisfy
   * abs(row(i)) >= abs(row(ncut)) if i<ncut
-  * abs(row(i)) <= abs(row(ncut)) if i>ncut 
+  * abs(row(i)) <= abs(row(ncut)) if i>ncut
   * \param row The vector of values
   * \param ind The array of index for the elements in @p row
   * \param ncut  The number of largest elements to keep
-  **/ 
-template <typename VectorV, typename VectorI, typename Index>
+  **/
+template <typename VectorV, typename VectorI>
 Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
 {
   typedef typename VectorV::RealScalar RealScalar;
@@ -33,15 +34,15 @@
   Index mid;
   Index n = row.size(); /* length of the vector */
   Index first, last ;
-  
+
   ncut--; /* to fit the zero-based indices */
-  first = 0; 
-  last = n-1; 
+  first = 0;
+  last = n-1;
   if (ncut < first || ncut > last ) return 0;
-  
+
   do {
-    mid = first; 
-    RealScalar abskey = abs(row(mid)); 
+    mid = first;
+    RealScalar abskey = abs(row(mid));
     for (Index j = first + 1; j <= last; j++) {
       if ( abs(row(j)) > abskey) {
         ++mid;
@@ -52,12 +53,12 @@
     /* Interchange for the pivot element */
     swap(row(mid), row(first));
     swap(ind(mid), ind(first));
-    
+
     if (mid > ncut) last = mid - 1;
-    else if (mid < ncut ) first = mid + 1; 
+    else if (mid < ncut ) first = mid + 1;
   } while (mid != ncut );
-  
-  return 0; /* mid is equal to ncut */ 
+
+  return 0; /* mid is equal to ncut */
 }
 
 }// end namespace internal
@@ -66,25 +67,27 @@
   * \class IncompleteLUT
   * \brief Incomplete LU factorization with dual-threshold strategy
   *
+  * \implsparsesolverconcept
+  *
   * During the numerical factorization, two dropping rules are used :
   *  1) any element whose magnitude is less than some tolerance is dropped.
-  *    This tolerance is obtained by multiplying the input tolerance @p droptol 
+  *    This tolerance is obtained by multiplying the input tolerance @p droptol
   *    by the average magnitude of all the original elements in the current row.
-  *  2) After the elimination of the row, only the @p fill largest elements in 
-  *    the L part and the @p fill largest elements in the U part are kept 
-  *    (in addition to the diagonal element ). Note that @p fill is computed from 
-  *    the input parameter @p fillfactor which is used the ratio to control the fill_in 
+  *  2) After the elimination of the row, only the @p fill largest elements in
+  *    the L part and the @p fill largest elements in the U part are kept
+  *    (in addition to the diagonal element ). Note that @p fill is computed from
+  *    the input parameter @p fillfactor which is used the ratio to control the fill_in
   *    relatively to the initial number of nonzero elements.
-  * 
+  *
   * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements)
-  * and when @p fill=n/2 with @p droptol being different to zero. 
-  * 
-  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, 
+  * and when @p fill=n/2 with @p droptol being different to zero.
+  *
+  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization,
   *              Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994.
-  * 
+  *
   * NOTE : The following implementation is derived from the ILUT implementation
-  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota 
-  *  released under the terms of the GNU LGPL: 
+  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota
+  *  released under the terms of the GNU LGPL:
   *    http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README
   * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2.
   * See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012:
@@ -92,40 +95,48 @@
   * alternatively, on GMANE:
   *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
   */
-template <typename _Scalar>
-class IncompleteLUT : internal::noncopyable
+template <typename _Scalar, typename _StorageIndex = int>
+class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageIndex> >
 {
+  protected:
+    typedef SparseSolverBase<IncompleteLUT> Base;
+    using Base::m_isInitialized;
+  public:
     typedef _Scalar Scalar;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef SparseMatrix<Scalar,RowMajor> FactorType;
-    typedef SparseMatrix<Scalar,ColMajor> PermutType;
-    typedef typename FactorType::Index Index;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> FactorType;
+
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
 
   public:
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
-    
+
     IncompleteLUT()
       : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
-        m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false)
+        m_analysisIsOk(false), m_factorizationIsOk(false)
     {}
-    
+
     template<typename MatrixType>
-    IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
+    explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
       : m_droptol(droptol),m_fillfactor(fillfactor),
-        m_analysisIsOk(false),m_factorizationIsOk(false),m_isInitialized(false)
+        m_analysisIsOk(false),m_factorizationIsOk(false)
     {
       eigen_assert(fillfactor != 0);
-      compute(mat); 
+      compute(mat);
     }
-    
-    Index rows() const { return m_lu.rows(); }
-    
-    Index cols() const { return m_lu.cols(); }
+
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -133,46 +144,36 @@
       eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
       return m_info;
     }
-    
+
     template<typename MatrixType>
     void analyzePattern(const MatrixType& amat);
-    
+
     template<typename MatrixType>
     void factorize(const MatrixType& amat);
-    
+
     /**
       * Compute an incomplete LU factorization with dual threshold on the matrix mat
       * No pivoting is done in this version
-      * 
+      *
       **/
     template<typename MatrixType>
-    IncompleteLUT<Scalar>& compute(const MatrixType& amat)
+    IncompleteLUT& compute(const MatrixType& amat)
     {
-      analyzePattern(amat); 
+      analyzePattern(amat);
       factorize(amat);
-      m_isInitialized = m_factorizationIsOk;
       return *this;
     }
 
-    void setDroptol(const RealScalar& droptol); 
-    void setFillfactor(int fillfactor); 
-    
+    void setDroptol(const RealScalar& droptol);
+    void setFillfactor(int fillfactor);
+
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
-      x = m_Pinv * b;  
+      x = m_Pinv * b;
       x = m_lu.template triangularView<UnitLower>().solve(x);
       x = m_lu.template triangularView<Upper>().solve(x);
-      x = m_P * x; 
-    }
-
-    template<typename Rhs> inline const internal::solve_retval<IncompleteLUT, Rhs>
-     solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLUT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteLUT, Rhs>(*this, b.derived());
+      x = m_P * x;
     }
 
 protected:
@@ -192,70 +193,72 @@
     int m_fillfactor;
     bool m_analysisIsOk;
     bool m_factorizationIsOk;
-    bool m_isInitialized;
     ComputationInfo m_info;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // Fill-reducing permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // Inverse permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // Fill-reducing permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // Inverse permutation
 };
 
 /**
  * Set control parameter droptol
- *  \param droptol   Drop any element whose magnitude is less than this tolerance 
- **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
+ *  \param droptol   Drop any element whose magnitude is less than this tolerance
+ **/
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setDroptol(const RealScalar& droptol)
 {
-  this->m_droptol = droptol;   
+  this->m_droptol = droptol;
 }
 
 /**
  * Set control parameter fillfactor
- * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
- **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setFillfactor(int fillfactor)
+ * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row.
+ **/
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setFillfactor(int fillfactor)
 {
-  this->m_fillfactor = fillfactor;   
+  this->m_fillfactor = fillfactor;
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
 {
   // Compute the Fill-reducing permutation
-  SparseMatrix<Scalar,ColMajor, Index> mat1 = amat;
-  SparseMatrix<Scalar,ColMajor, Index> mat2 = amat.transpose();
-  // Symmetrize the pattern
+  // Since ILUT does not perform any numerical pivoting,
+  // it is highly preferable to keep the diagonal through symmetric permutations.
+  // To this end, let's symmetrize the pattern and perform AMD on it.
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
   // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
-  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
-  SparseMatrix<Scalar,ColMajor, Index> AtA = mat2 + mat1;
-  AtA.prune(keep_diag());
-  internal::minimum_degree_ordering<Scalar, Index>(AtA, m_P);  // Then compute the AMD ordering...
-
-  m_Pinv  = m_P.inverse(); // ... and the inverse permutation
-
+  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
+  SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
+  AMDOrdering<StorageIndex> ordering;
+  ordering(AtA,m_P);
+  m_Pinv  = m_P.inverse(); // cache the inverse permutation
   m_analysisIsOk = true;
+  m_factorizationIsOk = false;
+  m_isInitialized = true;
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::factorize(const _MatrixType& amat)
 {
   using std::sqrt;
   using std::swap;
   using std::abs;
+  using internal::convert_index;
 
   eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
   Index n = amat.cols();  // Size of the matrix
   m_lu.resize(n,n);
   // Declare Working vectors and variables
   Vector u(n) ;     // real values of the row -- maximum size is n --
-  VectorXi ju(n);   // column position of the values in u -- maximum size  is n
-  VectorXi jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
+  VectorI ju(n);   // column position of the values in u -- maximum size  is n
+  VectorI jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
   // Apply the fill-reducing permutation
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  SparseMatrix<Scalar,RowMajor, Index> mat;
+  SparseMatrix<Scalar,RowMajor, StorageIndex> mat;
   mat = amat.twistedBy(m_Pinv);
 
   // Initialization
@@ -264,7 +267,7 @@
   u.fill(0);
 
   // number of largest elements to keep in each row:
-  Index fill_in =   static_cast<Index> (amat.nonZeros()*m_fillfactor)/n+1;
+  Index fill_in = (amat.nonZeros()*m_fillfactor)/n + 1;
   if (fill_in > n) fill_in = n;
 
   // number of largest nonzero elements to keep in the L and the U part of the current row:
@@ -279,9 +282,9 @@
 
     Index sizeu = 1; // number of nonzero elements in the upper part of the current row
     Index sizel = 0; // number of nonzero elements in the lower part of the current row
-    ju(ii)    = ii;
+    ju(ii)    = convert_index<StorageIndex>(ii);
     u(ii)     = 0;
-    jr(ii)    = ii;
+    jr(ii)    = convert_index<StorageIndex>(ii);
     RealScalar rownorm = 0;
 
     typename FactorType::InnerIterator j_it(mat, ii); // Iterate through the current row ii
@@ -291,9 +294,9 @@
       if (k < ii)
       {
         // copy the lower part
-        ju(sizel) = k;
+        ju(sizel) = convert_index<StorageIndex>(k);
         u(sizel) = j_it.value();
-        jr(k) = sizel;
+        jr(k) = convert_index<StorageIndex>(sizel);
         ++sizel;
       }
       else if (k == ii)
@@ -304,9 +307,9 @@
       {
         // copy the upper part
         Index jpos = ii + sizeu;
-        ju(jpos) = k;
+        ju(jpos) = convert_index<StorageIndex>(k);
         u(jpos) = j_it.value();
-        jr(k) = jpos;
+        jr(k) = convert_index<StorageIndex>(jpos);
         ++sizeu;
       }
       rownorm += numext::abs2(j_it.value());
@@ -336,7 +339,8 @@
         // swap the two locations
         Index j = ju(jj);
         swap(ju(jj), ju(k));
-        jr(minrow) = jj;   jr(j) = k;
+        jr(minrow) = convert_index<StorageIndex>(jj);
+        jr(j) = convert_index<StorageIndex>(k);
         swap(u(jj), u(k));
       }
       // Reset this location
@@ -360,8 +364,8 @@
       for (; ki_it; ++ki_it)
       {
         Scalar prod = fact * ki_it.value();
-        Index j       = ki_it.index();
-        Index jpos    = jr(j);
+        Index j     = ki_it.index();
+        Index jpos  = jr(j);
         if (jpos == -1) // fill-in element
         {
           Index newpos;
@@ -377,16 +381,16 @@
             sizel++;
             eigen_internal_assert(sizel<=ii);
           }
-          ju(newpos) = j;
+          ju(newpos) = convert_index<StorageIndex>(j);
           u(newpos) = -prod;
-          jr(j) = newpos;
+          jr(j) = convert_index<StorageIndex>(newpos);
         }
         else
           u(jpos) -= prod;
       }
       // store the pivot element
-      u(len) = fact;
-      ju(len) = minrow;
+      u(len)  = fact;
+      ju(len) = convert_index<StorageIndex>(minrow);
       ++len;
 
       jj++;
@@ -401,7 +405,7 @@
     sizel = len;
     len = (std::min)(sizel, nnzL);
     typename Vector::SegmentReturnType ul(u.segment(0, sizel));
-    typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel));
+    typename VectorI::SegmentReturnType jul(ju.segment(0, sizel));
     internal::QuickSplit(ul, jul, len);
 
     // store the largest m_fill elements of the L part
@@ -430,14 +434,13 @@
     sizeu = len + 1; // +1 to take into account the diagonal element
     len = (std::min)(sizeu, nnzU);
     typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1));
-    typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
+    typename VectorI::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
     internal::QuickSplit(uu, juu, len);
 
     // store the largest elements of the U part
     for(Index k = ii + 1; k < ii + len; k++)
       m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
   }
-
   m_lu.finalize();
   m_lu.makeCompressed();
 
@@ -445,23 +448,6 @@
   m_info = Success;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<IncompleteLUT<_MatrixType>, Rhs>
-  : solve_retval_base<IncompleteLUT<_MatrixType>, Rhs>
-{
-  typedef IncompleteLUT<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_INCOMPLETE_LUT_H

diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 2036922..28a0c51 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,7 +10,129 @@
 #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H
 #define EIGEN_ITERATIVE_SOLVER_BASE_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatrixType>
+struct is_ref_compatible_impl
+{
+private:
+  template <typename T0>
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no  {int a[2];};
+
+  template<typename T>
+  static yes test(const Ref<const T>&, int);
+  template<typename T>
+  static no  test(any_conversion<T>, ...);
+
+public:
+  static MatrixType ms_from;
+  enum { value = sizeof(test<MatrixType>(ms_from, 0))==sizeof(yes) };
+};
+
+template<typename MatrixType>
+struct is_ref_compatible
+{
+  enum { value = is_ref_compatible_impl<typename remove_all<MatrixType>::type>::value };
+};
+
+template<typename MatrixType, bool MatrixFree = !internal::is_ref_compatible<MatrixType>::value>
+class generic_matrix_wrapper;
+
+// We have an explicit matrix at hand, compatible with Ref<>
+template<typename MatrixType>
+class generic_matrix_wrapper<MatrixType,false>
+{
+public:
+  typedef Ref<const MatrixType> ActualMatrixType;
+  template<int UpLo> struct ConstSelfAdjointViewReturnType {
+    typedef typename ActualMatrixType::template ConstSelfAdjointViewReturnType<UpLo>::Type Type;
+  };
+
+  enum {
+    MatrixFree = false
+  };
+
+  generic_matrix_wrapper()
+    : m_dummy(0,0), m_matrix(m_dummy)
+  {}
+
+  template<typename InputType>
+  generic_matrix_wrapper(const InputType &mat)
+    : m_matrix(mat)
+  {}
+
+  const ActualMatrixType& matrix() const
+  {
+    return m_matrix;
+  }
+
+  template<typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived> &mat)
+  {
+    m_matrix.~Ref<const MatrixType>();
+    ::new (&m_matrix) Ref<const MatrixType>(mat.derived());
+  }
+
+  void grab(const Ref<const MatrixType> &mat)
+  {
+    if(&(mat.derived()) != &m_matrix)
+    {
+      m_matrix.~Ref<const MatrixType>();
+      ::new (&m_matrix) Ref<const MatrixType>(mat);
+    }
+  }
+
+protected:
+  MatrixType m_dummy; // used to default initialize the Ref<> object
+  ActualMatrixType m_matrix;
+};
+
+// MatrixType is not compatible with Ref<> -> matrix-free wrapper
+template<typename MatrixType>
+class generic_matrix_wrapper<MatrixType,true>
+{
+public:
+  typedef MatrixType ActualMatrixType;
+  template<int UpLo> struct ConstSelfAdjointViewReturnType
+  {
+    typedef ActualMatrixType Type;
+  };
+
+  enum {
+    MatrixFree = true
+  };
+
+  generic_matrix_wrapper()
+    : mp_matrix(0)
+  {}
+
+  generic_matrix_wrapper(const MatrixType &mat)
+    : mp_matrix(&mat)
+  {}
+
+  const ActualMatrixType& matrix() const
+  {
+    return *mp_matrix;
+  }
+
+  void grab(const MatrixType &mat)
+  {
+    mp_matrix = &mat;
+  }
+
+protected:
+  const ActualMatrixType *mp_matrix;
+};
+
+}
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief Base class for linear iterative solvers
@@ -18,108 +140,128 @@
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename Derived>
-class IterativeSolverBase : internal::noncopyable
+class IterativeSolverBase : public SparseSolverBase<Derived>
 {
+protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::m_isInitialized;
+
 public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::RealScalar RealScalar;
 
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
 public:
 
-  Derived& derived() { return *static_cast<Derived*>(this); }
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  using Base::derived;
 
   /** Default constructor. */
   IterativeSolverBase()
-    : mp_matrix(0)
   {
     init();
   }
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
+    *
     * This constructor is a shortcut for the default constructor followed
     * by a call to compute().
-    * 
+    *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  IterativeSolverBase(const MatrixType& A)
+  template<typename MatrixDerived>
+  explicit IterativeSolverBase(const EigenBase<MatrixDerived>& A)
+    : m_matrixWrapper(A.derived())
   {
     init();
-    compute(A);
+    compute(matrix());
   }
 
   ~IterativeSolverBase() {}
-  
-  /** Initializes the iterative solver for the sparcity pattern of the matrix \a A for further solving \c Ax=b problems.
+
+  /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly call analyzePattern on the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
+    * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
+    * we might, for instance, implement column reordering for faster matrix vector products.
     */
-  Derived& analyzePattern(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& analyzePattern(const EigenBase<MatrixDerived>& A)
   {
-    m_preconditioner.analyzePattern(A);
+    grab(A.derived());
+    m_preconditioner.analyzePattern(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
-  
+
   /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly call factorize on the preconditioner.
+    * Currently, this function mostly calls factorize on the preconditioner.
     *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  Derived& factorize(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& factorize(const EigenBase<MatrixDerived>& A)
   {
-    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
-    mp_matrix = &A;
-    m_preconditioner.factorize(A);
+    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+    grab(A.derived());
+    m_preconditioner.factorize(matrix());
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** Initializes the iterative solver with the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly initialized/compute the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
+    * Currently, this function mostly initializes/computes the preconditioner. In the future
+    * we might, for instance, implement column reordering for faster matrix vector products.
     *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  Derived& compute(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& compute(const EigenBase<MatrixDerived>& A)
   {
-    mp_matrix = &A;
-    m_preconditioner.compute(A);
+    grab(A.derived());
+    m_preconditioner.compute(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** \internal */
-  Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
-  /** \internal */
-  Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); }
 
-  /** \returns the tolerance threshold used by the stopping criteria */
+  /** \internal */
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); }
+
+  /** \returns the tolerance threshold used by the stopping criteria.
+    * \sa setTolerance()
+    */
   RealScalar tolerance() const { return m_tolerance; }
-  
-  /** Sets the tolerance threshold used by the stopping criteria */
+
+  /** Sets the tolerance threshold used by the stopping criteria.
+    *
+    * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
+    * The default value is the machine precision given by NumTraits<Scalar>::epsilon()
+    */
   Derived& setTolerance(const RealScalar& tolerance)
   {
     m_tolerance = tolerance;
@@ -128,62 +270,56 @@
 
   /** \returns a read-write reference to the preconditioner for custom configuration. */
   Preconditioner& preconditioner() { return m_preconditioner; }
-  
+
   /** \returns a read-only reference to the preconditioner. */
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
-  /** \returns the max number of iterations */
-  int maxIterations() const
+  /** \returns the max number of iterations.
+    * It is either the value set by setMaxIterations or, by default,
+    * twice the number of columns of the matrix.
+    */
+  Index maxIterations() const
   {
-    return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
+    return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations;
   }
-  
-  /** Sets the max number of iterations */
-  Derived& setMaxIterations(int maxIters)
+
+  /** Sets the max number of iterations.
+    * Default is twice the number of columns of the matrix.
+    */
+  Derived& setMaxIterations(Index maxIters)
   {
     m_maxIterations = maxIters;
     return derived();
   }
 
   /** \returns the number of iterations performed during the last solve */
-  int iterations() const
+  Index iterations() const
   {
     eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
     return m_iterations;
   }
 
-  /** \returns the tolerance error reached during the last solve */
+  /** \returns the tolerance error reached during the last solve.
+    * It is a close approximation of the true relative residual error |Ax-b|/|b|.
+    */
   RealScalar error() const
   {
     eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
     return m_error;
   }
 
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * and \a x0 as an initial solution.
     *
-    * \sa compute()
+    * \sa solve(), compute()
     */
-  template<typename Rhs> inline const internal::solve_retval<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
+  template<typename Rhs,typename Guess>
+  inline const SolveWithGuess<Derived, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
   {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval<Derived, Rhs>(derived(), b.derived());
-  }
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs>
-  inline const internal::sparse_solve_retval<IterativeSolverBase, Rhs>
-  solve(const SparseMatrixBase<Rhs>& b) const
-  {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::sparse_solve_retval<IterativeSolverBase, Rhs>(*this, b.derived());
+    eigen_assert(m_isInitialized && "Solver is not initialized.");
+    eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+    return SolveWithGuess<Derived, Rhs, Guess>(derived(), b.derived(), x0);
   }
 
   /** \returns Success if the iterations converged, and NoConvergence otherwise. */
@@ -192,23 +328,79 @@
     eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
     return m_info;
   }
-  
+
   /** \internal */
-  template<typename Rhs, typename DestScalar, int DestOptions, typename DestIndex>
-  void _solve_sparse(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+  template<typename Rhs, typename DestDerived>
+  void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
   {
     eigen_assert(rows()==b.rows());
-    
-    int rhsCols = b.cols();
-    int size = b.rows();
+
+    Index rhsCols = b.cols();
+    Index size = b.rows();
+    DestDerived& dest(aDest.derived());
+    typedef typename DestDerived::Scalar DestScalar;
     Eigen::Matrix<DestScalar,Dynamic,1> tb(size);
-    Eigen::Matrix<DestScalar,Dynamic,1> tx(size);
-    for(int k=0; k<rhsCols; ++k)
+    Eigen::Matrix<DestScalar,Dynamic,1> tx(cols());
+    // We do not directly fill dest because sparse expressions have to be free of aliasing issue.
+    // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other.
+    typename DestDerived::PlainObject tmp(cols(),rhsCols);
+    ComputationInfo global_info = Success;
+    for(Index k=0; k<rhsCols; ++k)
     {
       tb = b.col(k);
-      tx = derived().solve(tb);
-      dest.col(k) = tx.sparseView(0);
+      tx = dest.col(k);
+      derived()._solve_vector_with_guess_impl(tb,tx);
+      tmp.col(k) = tx.sparseView(0);
+
+      // The call to _solve_vector_with_guess_impl updates m_info, so if it failed for a previous column
+      // we need to restore it to the worst value.
+      if(m_info==NumericalIssue)
+        global_info = NumericalIssue;
+      else if(m_info==NoConvergence)
+        global_info = NoConvergence;
     }
+    m_info = global_info;
+    dest.swap(tmp);
+  }
+
+  template<typename Rhs, typename DestDerived>
+  typename internal::enable_if<Rhs::ColsAtCompileTime!=1 && DestDerived::ColsAtCompileTime!=1>::type
+  _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &aDest) const
+  {
+    eigen_assert(rows()==b.rows());
+
+    Index rhsCols = b.cols();
+    DestDerived& dest(aDest.derived());
+    ComputationInfo global_info = Success;
+    for(Index k=0; k<rhsCols; ++k)
+    {
+      typename DestDerived::ColXpr xk(dest,k);
+      typename Rhs::ConstColXpr bk(b,k);
+      derived()._solve_vector_with_guess_impl(bk,xk);
+
+      // The call to _solve_vector_with_guess updates m_info, so if it failed for a previous column
+      // we need to restore it to the worst value.
+      if(m_info==NumericalIssue)
+        global_info = NumericalIssue;
+      else if(m_info==NoConvergence)
+        global_info = NoConvergence;
+    }
+    m_info = global_info;
+  }
+
+  template<typename Rhs, typename DestDerived>
+  typename internal::enable_if<Rhs::ColsAtCompileTime==1 || DestDerived::ColsAtCompileTime==1>::type
+  _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &dest) const
+  {
+    derived()._solve_vector_with_guess_impl(b,dest.derived());
+  }
+
+  /** \internal default initial guess = 0 */
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const
+  {
+    x.setZero();
+    derived()._solve_with_guess_impl(b,x);
   }
 
 protected:
@@ -220,35 +412,33 @@
     m_maxIterations = -1;
     m_tolerance = NumTraits<Scalar>::epsilon();
   }
-  const MatrixType* mp_matrix;
+
+  typedef internal::generic_matrix_wrapper<MatrixType> MatrixWrapper;
+  typedef typename MatrixWrapper::ActualMatrixType ActualMatrixType;
+
+  const ActualMatrixType& matrix() const
+  {
+    return m_matrixWrapper.matrix();
+  }
+
+  template<typename InputType>
+  void grab(const InputType &A)
+  {
+    m_matrixWrapper.grab(A);
+  }
+
+  MatrixWrapper m_matrixWrapper;
   Preconditioner m_preconditioner;
 
-  int m_maxIterations;
+  Index m_maxIterations;
   RealScalar m_tolerance;
-  
+
   mutable RealScalar m_error;
-  mutable int m_iterations;
+  mutable Index m_iterations;
   mutable ComputationInfo m_info;
-  mutable bool m_isInitialized, m_analysisIsOk, m_factorizationIsOk;
+  mutable bool m_analysisIsOk, m_factorizationIsOk;
 };
 
-namespace internal {
- 
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<IterativeSolverBase<Derived>, Rhs>
-  : sparse_solve_retval_base<IterativeSolverBase<Derived>, Rhs>
-{
-  typedef IterativeSolverBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve_sparse(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_ITERATIVE_SOLVER_BASE_H

diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
new file mode 100644
index 0000000..203fd0e
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h

@@ -0,0 +1,198 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Low-level conjugate gradient algorithm for least-square problems
+  * \param mat The matrix A
+  * \param rhs The right hand side vector b
+  * \param x On input and initial solution, on output the computed solution.
+  * \param precond A preconditioner being able to efficiently solve for an
+  *                approximation of A'Ax=b (regardless of b)
+  * \param iters On input the max number of iteration, on output the number of performed iterations.
+  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+  */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE
+void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                                     const Preconditioner& precond, Index& iters,
+                                     typename Dest::RealScalar& tol_error)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  
+  RealScalar tol = tol_error;
+  Index maxIters = iters;
+  
+  Index m = mat.rows(), n = mat.cols();
+
+  VectorType residual        = rhs - mat * x;
+  VectorType normal_residual = mat.adjoint() * residual;
+
+  RealScalar rhsNorm2 = (mat.adjoint()*rhs).squaredNorm();
+  if(rhsNorm2 == 0) 
+  {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+  RealScalar threshold = tol*tol*rhsNorm2;
+  RealScalar residualNorm2 = normal_residual.squaredNorm();
+  if (residualNorm2 < threshold)
+  {
+    iters = 0;
+    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    return;
+  }
+  
+  VectorType p(n);
+  p = precond.solve(normal_residual);                         // initial search direction
+
+  VectorType z(n), tmp(m);
+  RealScalar absNew = numext::real(normal_residual.dot(p));  // the square of the absolute value of r scaled by invM
+  Index i = 0;
+  while(i < maxIters)
+  {
+    tmp.noalias() = mat * p;
+
+    Scalar alpha = absNew / tmp.squaredNorm();      // the amount we travel on dir
+    x += alpha * p;                                 // update solution
+    residual -= alpha * tmp;                        // update residual
+    normal_residual = mat.adjoint() * residual;     // update residual of the normal equation
+    
+    residualNorm2 = normal_residual.squaredNorm();
+    if(residualNorm2 < threshold)
+      break;
+    
+    z = precond.solve(normal_residual);             // approximately solve for "A'A z = normal_residual"
+
+    RealScalar absOld = absNew;
+    absNew = numext::real(normal_residual.dot(z));  // update the absolute value of r
+    RealScalar beta = absNew / absOld;              // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                               // update search direction
+    i++;
+  }
+  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  iters = i;
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = LeastSquareDiagonalPreconditioner<typename _MatrixType::Scalar> >
+class LeastSquaresConjugateGradient;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A conjugate gradient solver for sparse (or dense) least-square problems
+  *
+  * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty stability.
+  * Otherwise, the SparseLU or SparseQR classes might be preferable.
+  * The matrix A and the vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  * 
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+    \code
+    int m=1000000, n = 10000;
+    VectorXd x(n), b(m);
+    SparseMatrix<double> A(m,n);
+    // fill A and b
+    LeastSquaresConjugateGradient<SparseMatrix<double> > lscg;
+    lscg.compute(A);
+    x = lscg.solve(b);
+    std::cout << "#iterations:     " << lscg.iterations() << std::endl;
+    std::cout << "estimated error: " << lscg.error()      << std::endl;
+    // update b, and solve again
+    x = lscg.solve(b);
+    \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  * 
+  * \sa class ConjugateGradient, SparseLU, SparseQR
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class LeastSquaresConjugateGradient : public IterativeSolverBase<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<LeastSquaresConjugateGradient> Base;
+  using Base::matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  LeastSquaresConjugateGradient() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  template<typename MatrixDerived>
+  explicit LeastSquaresConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+
+  ~LeastSquaresConjugateGradient() {}
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    internal::least_square_conjugate_gradient(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H

diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
new file mode 100644
index 0000000..7b89657
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h

@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVEWITHGUESS_H
+#define EIGEN_SOLVEWITHGUESS_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename GuessType> class SolveWithGuess;
+
+/** \class SolveWithGuess
+  * \ingroup IterativeLinearSolvers_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposion object
+  * \tparam Rhstype the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+
+template<typename Decomposition, typename RhsType, typename GuessType>
+struct traits<SolveWithGuess<Decomposition, RhsType, GuessType> >
+  : traits<Solve<Decomposition,RhsType> >
+{};
+
+}
+
+
+template<typename Decomposition, typename RhsType, typename GuessType>
+class SolveWithGuess : public internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type
+{
+public:
+  typedef typename internal::traits<SolveWithGuess>::Scalar Scalar;
+  typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
+  typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
+  typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
+
+  SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
+    : m_dec(dec), m_rhs(rhs), m_guess(guess)
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec()   const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs()   const { return m_rhs; }
+  EIGEN_DEVICE_FUNC const GuessType&     guess() const { return m_guess; }
+
+protected:
+  const Decomposition &m_dec;
+  const RhsType       &m_rhs;
+  const GuessType     &m_guess;
+
+private:
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+// Evaluator of SolveWithGuess -> eval into a temporary
+template<typename Decomposition, typename RhsType, typename GuessType>
+struct evaluator<SolveWithGuess<Decomposition,RhsType, GuessType> >
+  : public evaluator<typename SolveWithGuess<Decomposition,RhsType,GuessType>::PlainObject>
+{
+  typedef SolveWithGuess<Decomposition,RhsType,GuessType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    m_result = solve.guess();
+    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solveWithGuess(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename GuessType, typename Scalar>
+struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef SolveWithGuess<DecType,RhsType,GuessType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst = src.guess();
+    src.dec()._solve_with_guess_impl(src.rhs(), dst/*, src.guess()*/);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVEWITHGUESS_H

diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 956f72d..76668a5 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h

@@ -11,7 +11,7 @@
 #ifndef EIGEN_JACOBI_H
 #define EIGEN_JACOBI_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Jacobi_Module
   * \jacobi_module
@@ -37,17 +37,20 @@
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
     /** Default constructor without any initialization. */
+    EIGEN_DEVICE_FUNC
     JacobiRotation() {}
 
     /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
+    EIGEN_DEVICE_FUNC
     JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
 
-    Scalar& c() { return m_c; }
-    Scalar c() const { return m_c; }
-    Scalar& s() { return m_s; }
-    Scalar s() const { return m_s; }
+    EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
+    EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
+    EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
+    EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
 
     /** Concatenates two planar rotation */
+    EIGEN_DEVICE_FUNC
     JacobiRotation operator*(const JacobiRotation& other)
     {
       using numext::conj;
@@ -56,20 +59,27 @@
     }
 
     /** Returns the transposed transformation */
+    EIGEN_DEVICE_FUNC
     JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
 
     /** Returns the adjoint transformation */
+    EIGEN_DEVICE_FUNC
     JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
 
     template<typename Derived>
-    bool makeJacobi(const MatrixBase<Derived>&, typename Derived::Index p, typename Derived::Index q);
+    EIGEN_DEVICE_FUNC
+    bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
+    EIGEN_DEVICE_FUNC
     bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
 
-    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
+    EIGEN_DEVICE_FUNC
+    void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0);
 
   protected:
-    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
-    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
+    EIGEN_DEVICE_FUNC
+    void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type);
+    EIGEN_DEVICE_FUNC
+    void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type);
 
     Scalar m_c, m_s;
 };
@@ -80,12 +90,14 @@
   * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
 {
   using std::sqrt;
   using std::abs;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  if(y == Scalar(0))
+
+  RealScalar deno = RealScalar(2)*abs(y);
+  if(deno < (std::numeric_limits<RealScalar>::min)())
   {
     m_c = Scalar(1);
     m_s = Scalar(0);
@@ -93,7 +105,7 @@
   }
   else
   {
-    RealScalar tau = (x-z)/(RealScalar(2)*abs(y));
+    RealScalar tau = (x-z)/deno;
     RealScalar w = sqrt(numext::abs2(tau) + RealScalar(1));
     RealScalar t;
     if(tau>RealScalar(0))
@@ -123,7 +135,8 @@
   */
 template<typename Scalar>
 template<typename Derived>
-inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, typename Derived::Index p, typename Derived::Index q)
+EIGEN_DEVICE_FUNC
+inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
 {
   return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
 }
@@ -132,7 +145,7 @@
   * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields:
   * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$.
   *
-  * The value of \a z is returned if \a z is not null (the default is null).
+  * The value of \a r is returned if \a r is not null (the default is null).
   * Also note that G is built such that the cosine is always real.
   *
   * Example: \include Jacobi_makeGivens.cpp
@@ -145,20 +158,22 @@
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename Scalar>
-void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z)
+EIGEN_DEVICE_FUNC
+void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r)
 {
-  makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
+  makeGivens(p, q, r, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
 }
 
 
 // specialization for complexes
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
 {
   using std::sqrt;
   using std::abs;
   using numext::conj;
-  
+
   if(q==Scalar(0))
   {
     m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1);
@@ -212,6 +227,7 @@
 
 // specialization for reals
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
 {
   using std::sqrt;
@@ -255,15 +271,16 @@
 *   Implementation of MatrixBase methods
 ****************************************************************************************/
 
+namespace internal {
 /** \jacobi_module
-  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y:
+  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
   * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
   *
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
-namespace internal {
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j);
+EIGEN_DEVICE_FUNC
+void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
 }
 
 /** \jacobi_module
@@ -274,6 +291,7 @@
   */
 template<typename Derived>
 template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
 inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
 {
   RowXpr x(this->row(p));
@@ -289,6 +307,7 @@
   */
 template<typename Derived>
 template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
 inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
 {
   ColXpr x(this->col(p));
@@ -297,122 +316,13 @@
 }
 
 namespace internal {
-template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j)
+
+template<typename Scalar, typename OtherScalar,
+         int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
+struct apply_rotation_in_the_plane_selector
 {
-  typedef typename VectorX::Index Index;
-  typedef typename VectorX::Scalar Scalar;
-  enum { PacketSize = packet_traits<Scalar>::size };
-  typedef typename packet_traits<Scalar>::type Packet;
-  eigen_assert(_x.size() == _y.size());
-  Index size = _x.size();
-  Index incrx = _x.innerStride();
-  Index incry = _y.innerStride();
-
-  Scalar* EIGEN_RESTRICT x = &_x.coeffRef(0);
-  Scalar* EIGEN_RESTRICT y = &_y.coeffRef(0);
-  
-  OtherScalar c = j.c();
-  OtherScalar s = j.s();
-  if (c==OtherScalar(1) && s==OtherScalar(0))
-    return;
-
-  /*** dynamic-size vectorized paths ***/
-
-  if(VectorX::SizeAtCompileTime == Dynamic &&
-    (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-    ((incrx==1 && incry==1) || PacketSize == 1))
-  {
-    // both vectors are sequentially stored in memory => vectorization
-    enum { Peeling = 2 };
-
-    Index alignedStart = internal::first_aligned(y, size);
-    Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
-
-    const Packet pc = pset1<Packet>(c);
-    const Packet ps = pset1<Packet>(s);
-    conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
-
-    for(Index i=0; i<alignedStart; ++i)
-    {
-      Scalar xi = x[i];
-      Scalar yi = y[i];
-      x[i] =  c * xi + numext::conj(s) * yi;
-      y[i] = -s * xi + numext::conj(c) * yi;
-    }
-
-    Scalar* EIGEN_RESTRICT px = x + alignedStart;
-    Scalar* EIGEN_RESTRICT py = y + alignedStart;
-
-    if(internal::first_aligned(x, size)==alignedStart)
-    {
-      for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
-      {
-        Packet xi = pload<Packet>(px);
-        Packet yi = pload<Packet>(py);
-        pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-        pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-        px += PacketSize;
-        py += PacketSize;
-      }
-    }
-    else
-    {
-      Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
-      for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
-      {
-        Packet xi   = ploadu<Packet>(px);
-        Packet xi1  = ploadu<Packet>(px+PacketSize);
-        Packet yi   = pload <Packet>(py);
-        Packet yi1  = pload <Packet>(py+PacketSize);
-        pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-        pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1)));
-        pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-        pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1)));
-        px += Peeling*PacketSize;
-        py += Peeling*PacketSize;
-      }
-      if(alignedEnd!=peelingEnd)
-      {
-        Packet xi = ploadu<Packet>(x+peelingEnd);
-        Packet yi = pload <Packet>(y+peelingEnd);
-        pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-        pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-      }
-    }
-
-    for(Index i=alignedEnd; i<size; ++i)
-    {
-      Scalar xi = x[i];
-      Scalar yi = y[i];
-      x[i] =  c * xi + numext::conj(s) * yi;
-      y[i] = -s * xi + numext::conj(c) * yi;
-    }
-  }
-
-  /*** fixed-size vectorized path ***/
-  else if(VectorX::SizeAtCompileTime != Dynamic &&
-          (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-          (VectorX::Flags & VectorY::Flags & AlignedBit))
-  {
-    const Packet pc = pset1<Packet>(c);
-    const Packet ps = pset1<Packet>(s);
-    conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
-    Scalar* EIGEN_RESTRICT px = x;
-    Scalar* EIGEN_RESTRICT py = y;
-    for(Index i=0; i<size; i+=PacketSize)
-    {
-      Packet xi = pload<Packet>(px);
-      Packet yi = pload<Packet>(py);
-      pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-      pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-      px += PacketSize;
-      py += PacketSize;
-    }
-  }
-
-  /*** non-vectorized path ***/
-  else
+  static EIGEN_DEVICE_FUNC
+  inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
   {
     for(Index i=0; i<size; ++i)
     {
@@ -424,6 +334,146 @@
       y += incry;
     }
   }
+};
+
+template<typename Scalar, typename OtherScalar,
+         int SizeAtCompileTime, int MinAlignment>
+struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,true /* vectorizable */>
+{
+  static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+  {
+    enum {
+      PacketSize = packet_traits<Scalar>::size,
+      OtherPacketSize = packet_traits<OtherScalar>::size
+    };
+    typedef typename packet_traits<Scalar>::type Packet;
+    typedef typename packet_traits<OtherScalar>::type OtherPacket;
+
+    /*** dynamic-size vectorized paths ***/
+    if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1))
+    {
+      // both vectors are sequentially stored in memory => vectorization
+      enum { Peeling = 2 };
+
+      Index alignedStart = internal::first_default_aligned(y, size);
+      Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
+
+      const OtherPacket pc = pset1<OtherPacket>(c);
+      const OtherPacket ps = pset1<OtherPacket>(s);
+      conj_helper<OtherPacket,Packet,NumTraits<OtherScalar>::IsComplex,false> pcj;
+      conj_helper<OtherPacket,Packet,false,false> pm;
+
+      for(Index i=0; i<alignedStart; ++i)
+      {
+        Scalar xi = x[i];
+        Scalar yi = y[i];
+        x[i] =  c * xi + numext::conj(s) * yi;
+        y[i] = -s * xi + numext::conj(c) * yi;
+      }
+
+      Scalar* EIGEN_RESTRICT px = x + alignedStart;
+      Scalar* EIGEN_RESTRICT py = y + alignedStart;
+
+      if(internal::first_default_aligned(x, size)==alignedStart)
+      {
+        for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+        {
+          Packet xi = pload<Packet>(px);
+          Packet yi = pload<Packet>(py);
+          pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+          pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+          px += PacketSize;
+          py += PacketSize;
+        }
+      }
+      else
+      {
+        Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
+        for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
+        {
+          Packet xi   = ploadu<Packet>(px);
+          Packet xi1  = ploadu<Packet>(px+PacketSize);
+          Packet yi   = pload <Packet>(py);
+          Packet yi1  = pload <Packet>(py+PacketSize);
+          pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+          pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1)));
+          pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+          pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1)));
+          px += Peeling*PacketSize;
+          py += Peeling*PacketSize;
+        }
+        if(alignedEnd!=peelingEnd)
+        {
+          Packet xi = ploadu<Packet>(x+peelingEnd);
+          Packet yi = pload <Packet>(y+peelingEnd);
+          pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+          pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+        }
+      }
+
+      for(Index i=alignedEnd; i<size; ++i)
+      {
+        Scalar xi = x[i];
+        Scalar yi = y[i];
+        x[i] =  c * xi + numext::conj(s) * yi;
+        y[i] = -s * xi + numext::conj(c) * yi;
+      }
+    }
+
+    /*** fixed-size vectorized path ***/
+    else if(SizeAtCompileTime != Dynamic && MinAlignment>0) // FIXME should be compared to the required alignment
+    {
+      const OtherPacket pc = pset1<OtherPacket>(c);
+      const OtherPacket ps = pset1<OtherPacket>(s);
+      conj_helper<OtherPacket,Packet,NumTraits<OtherPacket>::IsComplex,false> pcj;
+      conj_helper<OtherPacket,Packet,false,false> pm;
+      Scalar* EIGEN_RESTRICT px = x;
+      Scalar* EIGEN_RESTRICT py = y;
+      for(Index i=0; i<size; i+=PacketSize)
+      {
+        Packet xi = pload<Packet>(px);
+        Packet yi = pload<Packet>(py);
+        pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+        pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+        px += PacketSize;
+        py += PacketSize;
+      }
+    }
+
+    /*** non-vectorized path ***/
+    else
+    {
+      apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,false>::run(x,incrx,y,incry,size,c,s);
+    }
+  }
+};
+
+template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
+{
+  typedef typename VectorX::Scalar Scalar;
+  const bool Vectorizable =    (int(VectorX::Flags) & int(VectorY::Flags) & PacketAccessBit)
+                            && (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size));
+
+  eigen_assert(xpr_x.size() == xpr_y.size());
+  Index size = xpr_x.size();
+  Index incrx = xpr_x.derived().innerStride();
+  Index incry = xpr_y.derived().innerStride();
+
+  Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+  Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
+
+  OtherScalar c = j.c();
+  OtherScalar s = j.s();
+  if (c==OtherScalar(1) && s==OtherScalar(0))
+    return;
+
+  apply_rotation_in_the_plane_selector<
+    Scalar,OtherScalar,
+    VectorX::SizeAtCompileTime,
+    EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment),
+    Vectorizable>::run(x,incrx,y,incry,size,c,s);
 }
 
 } // end namespace internal

diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
new file mode 100644
index 0000000..215db35
--- /dev/null
+++ b/Eigen/src/KLUSupport/KLUSupport.h

@@ -0,0 +1,358 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Kyle Macfarlan <kyle.macfarlan@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_H
+#define EIGEN_KLUSUPPORT_H
+
+namespace Eigen {
+
+/* TODO extract L, extract U, compute det, etc... */
+
+/** \ingroup KLUSupport_Module
+  * \brief A sparse LU factorization and solver based on KLU
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a LU factorization
+  * using the KLU library. The sparse matrix A must be squared and full rank.
+  * The vectors or matrices X and B can be either dense or sparse.
+  *
+  * \warning The input matrix A should be in a \b compressed and \b column-major form.
+  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU
+  */
+
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) {
+   return klu_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
+   return klu_z_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) {
+   return klu_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
+   return klu_z_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), 0, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) {
+   return klu_factor(Ap, Ai, Ax, Symbolic, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex<double> Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex<double>) {
+   return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common);
+}
+
+
+template<typename _MatrixType>
+class KLU : public SparseSolverBase<KLU<_MatrixType> >
+{
+  protected:
+    typedef SparseSolverBase<KLU<_MatrixType> > Base;
+    using Base::m_isInitialized;
+  public:
+    using Base::_solve_impl;
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef SparseMatrix<Scalar> LUMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,int> KLUMatrixType;
+    typedef Ref<const KLUMatrixType, StandardCompressedFormat> KLUMatrixRef;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+  public:
+
+    KLU()
+      : m_dummy(0,0), mp_matrix(m_dummy)
+    {
+      init();
+    }
+
+    template<typename InputMatrixType>
+    explicit KLU(const InputMatrixType& matrix)
+      : mp_matrix(matrix)
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~KLU()
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric,&m_common);
+    }
+
+    EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); }
+    EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+#if 0 // not implemented yet
+    inline const LUMatrixType& matrixL() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_l;
+    }
+
+    inline const LUMatrixType& matrixU() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_u;
+    }
+
+    inline const IntColVectorType& permutationP() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_p;
+    }
+
+    inline const IntRowVectorType& permutationQ() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_q;
+    }
+#endif
+    /** Computes the sparse Cholesky decomposition of \a matrix
+     *  Note that the matrix should be column-major, and in compressed format for best performance.
+     *  \sa SparseMatrix::makeCompressed().
+     */
+    template<typename InputMatrixType>
+    void compute(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
+      grab(matrix.derived());
+      analyzePattern_impl();
+      factorize_impl();
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize(), compute()
+      */
+    template<typename InputMatrixType>
+    void analyzePattern(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
+
+      grab(matrix.derived());
+
+      analyzePattern_impl();
+    }
+
+
+    /** Provides access to the control settings array used by KLU.
+      *
+      * See KLU documentation for details.
+      */
+    inline const klu_common& kluCommon() const
+    {
+      return m_common;
+    }
+
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See KLU documentation for details.
+      */
+    inline klu_common& kluCommon()
+    {
+      return m_common;
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+      *
+      * \sa analyzePattern(), compute()
+      */
+    template<typename InputMatrixType>
+    void factorize(const InputMatrixType& matrix)
+    {
+      eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()");
+      if(m_numeric)
+        klu_free_numeric(&m_numeric,&m_common);
+
+      grab(matrix.derived());
+
+      factorize_impl();
+    }
+
+    /** \internal */
+    template<typename BDerived,typename XDerived>
+    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
+
+#if 0 // not implemented yet
+    Scalar determinant() const;
+
+    void extractData() const;
+#endif
+
+  protected:
+
+    void init()
+    {
+      m_info                  = InvalidInput;
+      m_isInitialized         = false;
+      m_numeric               = 0;
+      m_symbolic              = 0;
+      m_extractedDataAreDirty = true;
+
+      klu_defaults(&m_common);
+    }
+
+    void analyzePattern_impl()
+    {
+      m_info = InvalidInput;
+      m_analysisIsOk = false;
+      m_factorizationIsOk = false;
+      m_symbolic = klu_analyze(internal::convert_index<int>(mp_matrix.rows()),
+                                     const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()),
+                                     &m_common);
+      if (m_symbolic) {
+         m_isInitialized = true;
+         m_info = Success;
+         m_analysisIsOk = true;
+         m_extractedDataAreDirty = true;
+      }
+    }
+
+    void factorize_impl()
+    {
+
+      m_numeric = klu_factor(const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()), const_cast<Scalar*>(mp_matrix.valuePtr()),
+                                    m_symbolic, &m_common, Scalar());
+
+
+      m_info = m_numeric ? Success : NumericalIssue;
+      m_factorizationIsOk = m_numeric ? 1 : 0;
+      m_extractedDataAreDirty = true;
+    }
+
+    template<typename MatrixDerived>
+    void grab(const EigenBase<MatrixDerived> &A)
+    {
+      mp_matrix.~KLUMatrixRef();
+      ::new (&mp_matrix) KLUMatrixRef(A.derived());
+    }
+
+    void grab(const KLUMatrixRef &A)
+    {
+      if(&(A.derived()) != &mp_matrix)
+      {
+        mp_matrix.~KLUMatrixRef();
+        ::new (&mp_matrix) KLUMatrixRef(A);
+      }
+    }
+
+    // cached data to reduce reallocation, etc.
+#if 0 // not implemented yet
+    mutable LUMatrixType m_l;
+    mutable LUMatrixType m_u;
+    mutable IntColVectorType m_p;
+    mutable IntRowVectorType m_q;
+#endif
+
+    KLUMatrixType m_dummy;
+    KLUMatrixRef mp_matrix;
+
+    klu_numeric* m_numeric;
+    klu_symbolic* m_symbolic;
+    klu_common m_common;
+    mutable ComputationInfo m_info;
+    int m_factorizationIsOk;
+    int m_analysisIsOk;
+    mutable bool m_extractedDataAreDirty;
+
+  private:
+    KLU(const KLU& ) { }
+};
+
+#if 0 // not implemented yet
+template<typename MatrixType>
+void KLU<MatrixType>::extractData() const
+{
+  if (m_extractedDataAreDirty)
+  {
+     eigen_assert(false && "KLU: extractData Not Yet Implemented");
+
+    // get size of the data
+    int lnz, unz, rows, cols, nz_udiag;
+    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
+
+    // allocate data
+    m_l.resize(rows,(std::min)(rows,cols));
+    m_l.resizeNonZeros(lnz);
+
+    m_u.resize((std::min)(rows,cols),cols);
+    m_u.resizeNonZeros(unz);
+
+    m_p.resize(rows);
+    m_q.resize(cols);
+
+    // extract
+    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
+                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
+                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+
+    m_extractedDataAreDirty = false;
+  }
+}
+
+template<typename MatrixType>
+typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
+{
+  eigen_assert(false && "KLU: extractData Not Yet Implemented");
+  return Scalar();
+}
+#endif
+
+template<typename MatrixType>
+template<typename BDerived,typename XDerived>
+bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+{
+  Index rhsCols = b.cols();
+  EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+
+  x = b;
+  int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast<klu_common*>(&m_common), Scalar());
+
+  m_info = info!=0 ? Success : NumericalIssue;
+  return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_KLUSUPPORT_H

diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index bb8e78a..3a41e6f 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h

@@ -15,6 +15,7 @@
 namespace internal {
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 inline const typename Derived::Scalar bruteforce_det3_helper
 (const MatrixBase<Derived>& matrix, int a, int b, int c)
 {
@@ -22,14 +23,6 @@
          * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b));
 }
 
-template<typename Derived>
-const typename Derived::Scalar bruteforce_det4_helper
-(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
-{
-  return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1))
-       * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3));
-}
-
 template<typename Derived,
          int DeterminantType = Derived::RowsAtCompileTime
 > struct determinant_impl
@@ -44,7 +37,8 @@
 
 template<typename Derived> struct determinant_impl<Derived, 1>
 {
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  static inline EIGEN_DEVICE_FUNC
+  typename traits<Derived>::Scalar run(const Derived& m)
   {
     return m.coeff(0,0);
   }
@@ -52,7 +46,8 @@
 
 template<typename Derived> struct determinant_impl<Derived, 2>
 {
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  static inline EIGEN_DEVICE_FUNC
+  typename traits<Derived>::Scalar run(const Derived& m)
   {
     return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
   }
@@ -60,7 +55,8 @@
 
 template<typename Derived> struct determinant_impl<Derived, 3>
 {
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  static inline EIGEN_DEVICE_FUNC
+  typename traits<Derived>::Scalar run(const Derived& m)
   {
     return bruteforce_det3_helper(m,0,1,2)
           - bruteforce_det3_helper(m,1,0,2)
@@ -70,15 +66,34 @@
 
 template<typename Derived> struct determinant_impl<Derived, 4>
 {
-  static typename traits<Derived>::Scalar run(const Derived& m)
+  typedef typename traits<Derived>::Scalar Scalar;
+  static EIGEN_DEVICE_FUNC
+  Scalar run(const Derived& m)
   {
-    // trick by Martin Costabel to compute 4x4 det with only 30 muls
-    return bruteforce_det4_helper(m,0,1,2,3)
-          - bruteforce_det4_helper(m,0,2,1,3)
-          + bruteforce_det4_helper(m,0,3,1,2)
-          + bruteforce_det4_helper(m,1,2,0,3)
-          - bruteforce_det4_helper(m,1,3,0,2)
-          + bruteforce_det4_helper(m,2,3,0,1);
+    Scalar d2_01 = det2(m, 0, 1);
+    Scalar d2_02 = det2(m, 0, 2);
+    Scalar d2_03 = det2(m, 0, 3);
+    Scalar d2_12 = det2(m, 1, 2);
+    Scalar d2_13 = det2(m, 1, 3);
+    Scalar d2_23 = det2(m, 2, 3);
+    Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12);
+    Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02);
+    Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01);
+    Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01);
+    return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) +
+           internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3);
+  }
+protected:
+  static EIGEN_DEVICE_FUNC
+  Scalar det2(const Derived& m, Index i0, Index i1)
+  {
+    return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1);
+  }
+
+  static EIGEN_DEVICE_FUNC
+  Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2)
+  {
+    return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2));
   }
 };
 
@@ -89,10 +104,11 @@
   * \returns the determinant of this matrix
   */
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
 {
   eigen_assert(rows() == cols());
-  typedef typename internal::nested<Derived,Base::RowsAtCompileTime>::type Nested;
+  typedef typename internal::nested_eval<Derived,Base::RowsAtCompileTime>::type Nested;
   return internal::determinant_impl<typename internal::remove_all<Nested>::type>::run(derived());
 }
 

diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 971b9da..ba1749f 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h

@@ -10,7 +10,19 @@
 #ifndef EIGEN_LU_H
 #define EIGEN_LU_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
 
 /** \ingroup LU_Module
   *
@@ -18,7 +30,7 @@
   *
   * \brief LU decomposition of a matrix with complete pivoting, and related features
   *
-  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition
   *
   * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
   * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
@@ -37,31 +49,32 @@
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
   * permutationP(), permutationQ().
   *
-  * As an exemple, here is how the original matrix can be retrieved:
+  * As an example, here is how the original matrix can be retrieved:
   * \include class_FullPivLU.cpp
   * Output: \verbinclude class_FullPivLU.out
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
   */
 template<typename _MatrixType> class FullPivLU
+  : public SolverBase<FullPivLU<_MatrixType> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<FullPivLU> Base;
+    friend class SolverBase<FullPivLU>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
-    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
-    typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
+    typedef typename internal::plain_row_type<MatrixType, StorageIndex>::type IntRowVectorType;
+    typedef typename internal::plain_col_type<MatrixType, StorageIndex>::type IntColVectorType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationQType;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationPType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /**
       * \brief Default Constructor.
@@ -84,7 +97,17 @@
       * \param matrix the matrix of which to compute the LU decomposition.
       *               It is required to be nonzero.
       */
-    FullPivLU(const MatrixType& matrix);
+    template<typename InputType>
+    explicit FullPivLU(const EigenBase<InputType>& matrix);
+
+    /** \brief Constructs a LU factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa FullPivLU(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivLU(EigenBase<InputType>& matrix);
 
     /** Computes the LU decomposition of the given matrix.
       *
@@ -93,7 +116,12 @@
       *
       * \returns a reference to *this
       */
-    FullPivLU& compute(const MatrixType& matrix);
+    template<typename InputType>
+    FullPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      computeInPlace();
+      return *this;
+    }
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -129,7 +157,7 @@
       *
       * \sa permutationQ()
       */
-    inline const PermutationPType& permutationP() const
+    EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       return m_p;
@@ -166,7 +194,7 @@
     }
 
     /** \returns the image of the matrix, also called its column-space. The columns of the returned matrix
-      * will form a basis of the kernel.
+      * will form a basis of the image (column-space).
       *
       * \param originalMatrix the original matrix, of which *this is the LU decomposition.
       *                       The reason why it is needed to pass it here, is that this allows
@@ -191,6 +219,7 @@
       return internal::image_retval<FullPivLU>(*this, originalMatrix);
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \return a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -211,11 +240,17 @@
       * \sa TriangularView::solve(), kernel(), inverse()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<FullPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
+    inline const Solve<FullPivLU, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+        the LU decomposition.
+      */
+    inline RealScalar rcond() const
     {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return internal::solve_retval<FullPivLU, Rhs>(*this, b.derived());
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
     }
 
     /** \returns the determinant of the matrix of which
@@ -283,7 +318,7 @@
       return m_usePrescribedThreshold ? m_prescribedThreshold
       // this formula comes from experimenting (see "LU precision tuning" thread on the list)
       // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
+          : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
     }
 
     /** \returns the rank of the matrix of which *this is the LU decomposition.
@@ -360,27 +395,46 @@
       *
       * \sa MatrixBase::inverse()
       */
-    inline const internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    inline const Inverse<FullPivLU> inverse() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
-      return internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+      return Inverse<FullPivLU>(*this);
     }
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
+    void computeInPlace();
+
     MatrixType m_lu;
     PermutationPType m_p;
     PermutationQType m_q;
     IntColVectorType m_rowsTranspositions;
     IntRowVectorType m_colsTranspositions;
-    Index m_det_pq, m_nonzero_pivots;
+    Index m_nonzero_pivots;
+    RealScalar m_l1_norm;
     RealScalar m_maxpivot, m_prescribedThreshold;
+    signed char m_det_pq;
     bool m_isInitialized, m_usePrescribedThreshold;
 };
 
@@ -403,7 +457,8 @@
 }
 
 template<typename MatrixType>
-FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
+template<typename InputType>
+FullPivLU<MatrixType>::FullPivLU(const EigenBase<InputType>& matrix)
   : m_lu(matrix.rows(), matrix.cols()),
     m_p(matrix.rows()),
     m_q(matrix.cols()),
@@ -412,26 +467,41 @@
     m_isInitialized(false),
     m_usePrescribedThreshold(false)
 {
-  compute(matrix);
+  compute(matrix.derived());
 }
 
 template<typename MatrixType>
-FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+FullPivLU<MatrixType>::FullPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_q(matrix.cols()),
+    m_rowsTranspositions(matrix.rows()),
+    m_colsTranspositions(matrix.cols()),
+    m_isInitialized(false),
+    m_usePrescribedThreshold(false)
 {
-  // the permutations are stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
-  
-  m_isInitialized = true;
-  m_lu = matrix;
+  computeInPlace();
+}
 
-  const Index size = matrix.diagonalSize();
-  const Index rows = matrix.rows();
-  const Index cols = matrix.cols();
+template<typename MatrixType>
+void FullPivLU<MatrixType>::computeInPlace()
+{
+  check_template_parameters();
+
+  // the permutations are stored as int indices, so just to be sure:
+  eigen_assert(m_lu.rows()<=NumTraits<int>::highest() && m_lu.cols()<=NumTraits<int>::highest());
+
+  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+
+  const Index size = m_lu.diagonalSize();
+  const Index rows = m_lu.rows();
+  const Index cols = m_lu.cols();
 
   // will store the transpositions, before we accumulate them at the end.
   // can't accumulate on-the-fly because that will be done in reverse order for the rows.
-  m_rowsTranspositions.resize(matrix.rows());
-  m_colsTranspositions.resize(matrix.cols());
+  m_rowsTranspositions.resize(m_lu.rows());
+  m_colsTranspositions.resize(m_lu.cols());
   Index number_of_transpositions = 0; // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
 
   m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
@@ -443,33 +513,36 @@
 
     // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
+    Score biggest_in_corner;
     biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
-                        .cwiseAbs()
+                        .unaryExpr(Scoring())
                         .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
     col_of_biggest_in_corner += k; // need to add k to them.
 
-    if(biggest_in_corner==RealScalar(0))
+    if(biggest_in_corner==Score(0))
     {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
       m_nonzero_pivots = k;
       for(Index i = k; i < size; ++i)
       {
-        m_rowsTranspositions.coeffRef(i) = i;
-        m_colsTranspositions.coeffRef(i) = i;
+        m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
       }
       break;
     }
 
-    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+    RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
+    if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
 
-    m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
+    m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
       ++number_of_transpositions;
@@ -500,7 +573,8 @@
     m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
-  return *this;
+
+  m_isInitialized = true;
 }
 
 template<typename MatrixType>
@@ -663,64 +737,124 @@
 
 /***** Implementation of solve() *****************************************************/
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivLU<_MatrixType>, Rhs>
+} // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivLU<_MatrixType>,Rhs)
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
+  * So we proceed as follows:
+  * Step 1: compute c = P * rhs.
+  * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
+  * Step 3: replace c by the solution x to Ux = c. May or may not exist.
+  * Step 4: result = Q * c;
+  */
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  const Index rows = this->rows(),
+              cols = this->cols(),
+              nonzero_pivots = this->rank();
+  const Index smalldim = (std::min)(rows, cols);
+
+  if(nonzero_pivots == 0)
   {
-    /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
-     * So we proceed as follows:
-     * Step 1: compute c = P * rhs.
-     * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
-     * Step 3: replace c by the solution x to Ux = c. May or may not exist.
-     * Step 4: result = Q * c;
-     */
+    dst.setZero();
+    return;
+  }
 
-    const Index rows = dec().rows(), cols = dec().cols(),
-              nonzero_pivots = dec().nonzeroPivots();
-    eigen_assert(rhs().rows() == rows);
-    const Index smalldim = (std::min)(rows, cols);
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
 
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+  // Step 1
+  c = permutationP() * rhs;
 
-    typename Rhs::PlainObject c(rhs().rows(), rhs().cols());
+  // Step 2
+  m_lu.topLeftCorner(smalldim,smalldim)
+      .template triangularView<UnitLower>()
+      .solveInPlace(c.topRows(smalldim));
+  if(rows>cols)
+    c.bottomRows(rows-cols) -= m_lu.bottomRows(rows-cols) * c.topRows(cols);
 
-    // Step 1
-    c = dec().permutationP() * rhs();
+  // Step 3
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
 
-    // Step 2
-    dec().matrixLU()
-        .topLeftCorner(smalldim,smalldim)
-        .template triangularView<UnitLower>()
-        .solveInPlace(c.topRows(smalldim));
-    if(rows>cols)
-    {
-      c.bottomRows(rows-cols)
-        -= dec().matrixLU().bottomRows(rows-cols)
-         * c.topRows(cols);
-    }
+  // Step 4
+  for(Index i = 0; i < nonzero_pivots; ++i)
+    dst.row(permutationQ().indices().coeff(i)) = c.row(i);
+  for(Index i = nonzero_pivots; i < m_lu.cols(); ++i)
+    dst.row(permutationQ().indices().coeff(i)).setZero();
+}
 
-    // Step 3
-    dec().matrixLU()
-        .topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .solveInPlace(c.topRows(nonzero_pivots));
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1},
+   * and since permutations are real and unitary, we can write this
+   * as   A^T = Q U^T L^T P,
+   * So we proceed as follows:
+   * Step 1: compute c = Q^T rhs.
+   * Step 2: replace c by the solution x to U^T x = c. May or may not exist.
+   * Step 3: replace c by the solution x to L^T x = c.
+   * Step 4: result = P^T c.
+   * If Conjugate is true, replace "^T" by "^*" above.
+   */
 
-    // Step 4
-    for(Index i = 0; i < nonzero_pivots; ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < dec().matrixLU().cols(); ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+  const Index rows = this->rows(), cols = this->cols(),
+    nonzero_pivots = this->rank();
+  const Index smalldim = (std::min)(rows, cols);
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
+
+  // Step 1
+  c = permutationQ().inverse() * rhs;
+
+  // Step 2
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  // Step 3
+  m_lu.topLeftCorner(smalldim, smalldim)
+      .template triangularView<UnitLower>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(smalldim));
+
+  // Step 4
+  PermutationPType invp = permutationP().inverse().eval();
+  for(Index i = 0; i < smalldim; ++i)
+    dst.row(invp.indices().coeff(i)) = c.row(i);
+  for(Index i = smalldim; i < rows; ++i)
+    dst.row(invp.indices().coeff(i)).setZero();
+}
+
+#endif
+
+namespace internal {
+
+
+/***** Implementation of inverse() *****************************************************/
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivLU<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef FullPivLU<MatrixType> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
+  {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
 } // end namespace internal
 
 /******* MatrixBase methods *****************************************************************/
@@ -731,14 +865,12 @@
   *
   * \sa class FullPivLU
   */
-#ifndef __CUDACC__
 template<typename Derived>
 inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::fullPivLu() const
 {
   return FullPivLU<PlainObject>(eval());
 }
-#endif
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
new file mode 100644
index 0000000..a40cefa
--- /dev/null
+++ b/Eigen/src/LU/InverseImpl.h

@@ -0,0 +1,432 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_IMPL_H
+#define EIGEN_INVERSE_IMPL_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************
+*** General case implementation ***
+**********************************/
+
+template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
+struct compute_inverse
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    result = matrix.partialPivLu().inverse();
+  }
+};
+
+template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
+struct compute_inverse_and_det_with_check { /* nothing! general case not supported. */ };
+
+/****************************
+*** Size 1 implementation ***
+****************************/
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 1>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    internal::evaluator<MatrixType> matrixEval(matrix);
+    result.coeffRef(0,0) = Scalar(1) / matrixEval.coeff(0,0);
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& result,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    determinant = matrix.coeff(0,0);
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(invertible) result.coeffRef(0,0) = typename ResultType::Scalar(1) / determinant;
+  }
+};
+
+/****************************
+*** Size 2 implementation ***
+****************************/
+
+template<typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC 
+inline void compute_inverse_size2_helper(
+    const MatrixType& matrix, const typename ResultType::Scalar& invdet,
+    ResultType& result)
+{
+  typename ResultType::Scalar temp = matrix.coeff(0,0);
+  result.coeffRef(0,0) =  matrix.coeff(1,1) * invdet;
+  result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
+  result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
+  result.coeffRef(1,1) =  temp * invdet;
+}
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 2>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    typedef typename ResultType::Scalar Scalar;
+    const Scalar invdet = typename MatrixType::Scalar(1) / matrix.determinant();
+    compute_inverse_size2_helper(matrix, invdet, result);
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    typedef typename ResultType::Scalar Scalar;
+    determinant = matrix.determinant();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(!invertible) return;
+    const Scalar invdet = Scalar(1) / determinant;
+    compute_inverse_size2_helper(matrix, invdet, inverse);
+  }
+};
+
+/****************************
+*** Size 3 implementation ***
+****************************/
+
+template<typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC 
+inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
+{
+  enum {
+    i1 = (i+1) % 3,
+    i2 = (i+2) % 3,
+    j1 = (j+1) % 3,
+    j2 = (j+2) % 3
+  };
+  return m.coeff(i1, j1) * m.coeff(i2, j2)
+       - m.coeff(i1, j2) * m.coeff(i2, j1);
+}
+
+template<typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC
+inline void compute_inverse_size3_helper(
+    const MatrixType& matrix,
+    const typename ResultType::Scalar& invdet,
+    const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0,
+    ResultType& result)
+{
+  // Compute cofactors in a way that avoids aliasing issues.
+  typedef typename ResultType::Scalar Scalar;
+  const Scalar c01 = cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
+  const Scalar c11 = cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
+  const Scalar c02 = cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
+  result.coeffRef(1,2) =  cofactor_3x3<MatrixType,2,1>(matrix) * invdet;
+  result.coeffRef(2,1) =  cofactor_3x3<MatrixType,1,2>(matrix) * invdet;
+  result.coeffRef(2,2) =  cofactor_3x3<MatrixType,2,2>(matrix) * invdet;
+  result.coeffRef(1,0) =  c01;
+  result.coeffRef(1,1) =  c11;
+  result.coeffRef(2,0) =  c02;  
+  result.row(0) = cofactors_col0 * invdet;
+}
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 3>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(const MatrixType& matrix, ResultType& result)
+  {
+    typedef typename ResultType::Scalar Scalar;
+    Matrix<typename MatrixType::Scalar,3,1> cofactors_col0;
+    cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
+    cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
+    cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
+    const Scalar det = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
+    const Scalar invdet = Scalar(1) / det;
+    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result);
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    typedef typename ResultType::Scalar Scalar;
+    Matrix<Scalar,3,1> cofactors_col0;
+    cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
+    cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
+    cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
+    determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
+    invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold;
+    if(!invertible) return;
+    const Scalar invdet = Scalar(1) / determinant;
+    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse);
+  }
+};
+
+/****************************
+*** Size 4 implementation ***
+****************************/
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC 
+inline const typename Derived::Scalar general_det3_helper
+(const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
+{
+  return matrix.coeff(i1,j1)
+         * (matrix.coeff(i2,j2) * matrix.coeff(i3,j3) - matrix.coeff(i2,j3) * matrix.coeff(i3,j2));
+}
+
+template<typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC 
+inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
+{
+  enum {
+    i1 = (i+1) % 4,
+    i2 = (i+2) % 4,
+    i3 = (i+3) % 4,
+    j1 = (j+1) % 4,
+    j2 = (j+2) % 4,
+    j3 = (j+3) % 4
+  };
+  return general_det3_helper(matrix, i1, i2, i3, j1, j2, j3)
+       + general_det3_helper(matrix, i2, i3, i1, j1, j2, j3)
+       + general_det3_helper(matrix, i3, i1, i2, j1, j2, j3);
+}
+
+template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
+struct compute_inverse_size4
+{
+  EIGEN_DEVICE_FUNC
+  static void run(const MatrixType& matrix, ResultType& result)
+  {
+    result.coeffRef(0,0) =  cofactor_4x4<MatrixType,0,0>(matrix);
+    result.coeffRef(1,0) = -cofactor_4x4<MatrixType,0,1>(matrix);
+    result.coeffRef(2,0) =  cofactor_4x4<MatrixType,0,2>(matrix);
+    result.coeffRef(3,0) = -cofactor_4x4<MatrixType,0,3>(matrix);
+    result.coeffRef(0,2) =  cofactor_4x4<MatrixType,2,0>(matrix);
+    result.coeffRef(1,2) = -cofactor_4x4<MatrixType,2,1>(matrix);
+    result.coeffRef(2,2) =  cofactor_4x4<MatrixType,2,2>(matrix);
+    result.coeffRef(3,2) = -cofactor_4x4<MatrixType,2,3>(matrix);
+    result.coeffRef(0,1) = -cofactor_4x4<MatrixType,1,0>(matrix);
+    result.coeffRef(1,1) =  cofactor_4x4<MatrixType,1,1>(matrix);
+    result.coeffRef(2,1) = -cofactor_4x4<MatrixType,1,2>(matrix);
+    result.coeffRef(3,1) =  cofactor_4x4<MatrixType,1,3>(matrix);
+    result.coeffRef(0,3) = -cofactor_4x4<MatrixType,3,0>(matrix);
+    result.coeffRef(1,3) =  cofactor_4x4<MatrixType,3,1>(matrix);
+    result.coeffRef(2,3) = -cofactor_4x4<MatrixType,3,2>(matrix);
+    result.coeffRef(3,3) =  cofactor_4x4<MatrixType,3,3>(matrix);
+    result /= (matrix.col(0).cwiseProduct(result.row(0).transpose())).sum();
+  }
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 4>
+ : compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar,
+                            MatrixType, ResultType>
+{
+};
+
+template<typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(
+    const MatrixType& matrix,
+    const typename MatrixType::RealScalar& absDeterminantThreshold,
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible
+  )
+  {
+    using std::abs;
+    determinant = matrix.determinant();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if(invertible && extract_data(matrix) != extract_data(inverse)) {
+      compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+    }
+    else if(invertible) {
+      MatrixType matrix_t = matrix;
+      compute_inverse<MatrixType, ResultType>::run(matrix_t, inverse);
+    }
+  }
+};
+
+/*************************
+*** MatrixBase methods ***
+*************************/
+
+} // end namespace internal
+
+namespace internal {
+
+// Specialization for "dense = dense_xpr.inverse()"
+template<typename DstXprType, typename XprType>
+struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
+{
+  typedef Inverse<XprType> SrcXprType;
+  EIGEN_DEVICE_FUNC
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime);
+    EIGEN_ONLY_USED_FOR_DEBUG(Size);
+    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(src.nestedExpression())!=extract_data(dst)))
+              && "Aliasing problem detected in inverse(), you need to do inverse().eval() here.");
+
+    typedef typename internal::nested_eval<XprType,XprType::ColsAtCompileTime>::type  ActualXprType;
+    typedef typename internal::remove_all<ActualXprType>::type                        ActualXprTypeCleanded;
+    
+    ActualXprType actual_xpr(src.nestedExpression());
+    
+    compute_inverse<ActualXprTypeCleanded, DstXprType>::run(actual_xpr, dst);
+  }
+};
+
+  
+} // end namespace internal
+
+/** \lu_module
+  *
+  * \returns the matrix inverse of this matrix.
+  *
+  * For small fixed sizes up to 4x4, this method uses cofactors.
+  * In the general case, this method uses class PartialPivLU.
+  *
+  * \note This matrix must be invertible, otherwise the result is undefined. If you need an
+  * invertibility check, do the following:
+  * \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
+  * \li for the general case, use class FullPivLU.
+  *
+  * Example: \include MatrixBase_inverse.cpp
+  * Output: \verbinclude MatrixBase_inverse.out
+  *
+  * \sa computeInverseAndDetWithCheck()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC
+inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
+{
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+  eigen_assert(rows() == cols());
+  return Inverse<Derived>(derived());
+}
+
+/** \lu_module
+  *
+  * Computation of matrix inverse and determinant, with invertibility check.
+  *
+  * This is only for fixed-size square matrices of size up to 4x4.
+  *
+  * Notice that it will trigger a copy of input matrix when trying to do the inverse in place.
+  *
+  * \param inverse Reference to the matrix in which to store the inverse.
+  * \param determinant Reference to the variable in which to store the determinant.
+  * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
+  * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
+  *                                The matrix will be declared invertible if the absolute value of its
+  *                                determinant is greater than this threshold.
+  *
+  * Example: \include MatrixBase_computeInverseAndDetWithCheck.cpp
+  * Output: \verbinclude MatrixBase_computeInverseAndDetWithCheck.out
+  *
+  * \sa inverse(), computeInverseWithCheck()
+  */
+template<typename Derived>
+template<typename ResultType>
+inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
+    ResultType& inverse,
+    typename ResultType::Scalar& determinant,
+    bool& invertible,
+    const RealScalar& absDeterminantThreshold
+  ) const
+{
+  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  eigen_assert(rows() == cols());
+  // for 2x2, it's worth giving a chance to avoid evaluating.
+  // for larger sizes, evaluating has negligible cost and limits code size.
+  typedef typename internal::conditional<
+    RowsAtCompileTime == 2,
+    typename internal::remove_all<typename internal::nested_eval<Derived, 2>::type>::type,
+    PlainObject
+  >::type MatrixType;
+  internal::compute_inverse_and_det_with_check<MatrixType, ResultType>::run
+    (derived(), absDeterminantThreshold, inverse, determinant, invertible);
+}
+
+/** \lu_module
+  *
+  * Computation of matrix inverse, with invertibility check.
+  *
+  * This is only for fixed-size square matrices of size up to 4x4.
+  *
+  * Notice that it will trigger a copy of input matrix when trying to do the inverse in place.
+  *
+  * \param inverse Reference to the matrix in which to store the inverse.
+  * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
+  * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
+  *                                The matrix will be declared invertible if the absolute value of its
+  *                                determinant is greater than this threshold.
+  *
+  * Example: \include MatrixBase_computeInverseWithCheck.cpp
+  * Output: \verbinclude MatrixBase_computeInverseWithCheck.out
+  *
+  * \sa inverse(), computeInverseAndDetWithCheck()
+  */
+template<typename Derived>
+template<typename ResultType>
+inline void MatrixBase<Derived>::computeInverseWithCheck(
+    ResultType& inverse,
+    bool& invertible,
+    const RealScalar& absDeterminantThreshold
+  ) const
+{
+  Scalar determinant;
+  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  eigen_assert(rows() == cols());
+  computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_IMPL_H

diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 5d21af5..34aed72 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h

@@ -11,7 +11,34 @@
 #ifndef EIGEN_PARTIALLU_H
 #define EIGEN_PARTIALLU_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  typedef traits<_MatrixType> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = Dynamic
+  };
+};
+
+template<typename T,typename Derived>
+struct enable_if_ref;
+// {
+//   typedef Derived type;
+// };
+
+template<typename T,typename Derived>
+struct enable_if_ref<Ref<T>,Derived> {
+  typedef Derived type;
+};
+
+} // end namespace internal
 
 /** \ingroup LU_Module
   *
@@ -19,7 +46,7 @@
   *
   * \brief LU decomposition of a matrix with partial pivoting, and related features
   *
-  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition
   *
   * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
   * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P
@@ -42,34 +69,34 @@
   *
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
   */
 template<typename _MatrixType> class PartialPivLU
+  : public SolverBase<PartialPivLU<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<PartialPivLU> Base;
+    friend class SolverBase<PartialPivLU>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
-
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /**
-    * \brief Default Constructor.
-    *
-    * The default constructor is useful in cases in which the user intends to
-    * perform decompositions via PartialPivLU::compute(const MatrixType&).
-    */
+      * \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via PartialPivLU::compute(const MatrixType&).
+      */
     PartialPivLU();
 
     /** \brief Default Constructor with memory preallocation
@@ -78,7 +105,7 @@
       * according to the specified problem \a size.
       * \sa PartialPivLU()
       */
-    PartialPivLU(Index size);
+    explicit PartialPivLU(Index size);
 
     /** Constructor.
       *
@@ -87,9 +114,25 @@
       * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
       * If you need to deal with non-full rank, use class FullPivLU instead.
       */
-    PartialPivLU(const MatrixType& matrix);
+    template<typename InputType>
+    explicit PartialPivLU(const EigenBase<InputType>& matrix);
 
-    PartialPivLU& compute(const MatrixType& matrix);
+    /** Constructor for \link InplaceDecomposition inplace decomposition \endlink
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *
+      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+      * If you need to deal with non-full rank, use class FullPivLU instead.
+      */
+    template<typename InputType>
+    explicit PartialPivLU(EigenBase<InputType>& matrix);
+
+    template<typename InputType>
+    PartialPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      compute();
+      return *this;
+    }
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -111,6 +154,7 @@
       return m_p;
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -129,11 +173,17 @@
       * \sa TriangularView::solve(), inverse(), computeInverse()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<PartialPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
+    inline const Solve<PartialPivLU, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
+
+    /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+        the LU decomposition.
+      */
+    inline RealScalar rcond() const
     {
       eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU, Rhs>(*this, b.derived());
+      return internal::rcond_estimate_helper(m_l1_norm, *this);
     }
 
     /** \returns the inverse of the matrix of which *this is the LU decomposition.
@@ -143,11 +193,10 @@
       *
       * \sa MatrixBase::inverse(), LU::inverse()
       */
-    inline const internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    inline const Inverse<PartialPivLU> inverse() const
     {
       eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+      return Inverse<PartialPivLU>(*this);
     }
 
     /** \returns the determinant of the matrix of which
@@ -163,18 +212,71 @@
       *
       * \sa MatrixBase::determinant()
       */
-    typename internal::traits<MatrixType>::Scalar determinant() const;
+    Scalar determinant() const;
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+    EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const {
+     /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+      * So we proceed as follows:
+      * Step 1: compute c = Pb.
+      * Step 2: replace c by the solution x to Lx = c.
+      * Step 3: replace c by the solution x to Ux = c.
+      */
+
+      // Step 1
+      dst = permutationP() * rhs;
+
+      // Step 2
+      m_lu.template triangularView<UnitLower>().solveInPlace(dst);
+
+      // Step 3
+      m_lu.template triangularView<Upper>().solveInPlace(dst);
+    }
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
+     /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
+      * So we proceed as follows:
+      * Step 1: compute c as the solution to L^T c = b
+      * Step 2: replace c by the solution x to U^T x = c.
+      * Step 3: update  c = P^-1 c.
+      */
+
+      eigen_assert(rhs.rows() == m_lu.cols());
+
+      // Step 1
+      dst = m_lu.template triangularView<Upper>().transpose()
+                .template conjugateIf<Conjugate>().solve(rhs);
+      // Step 2
+      m_lu.template triangularView<UnitLower>().transpose()
+          .template conjugateIf<Conjugate>().solveInPlace(dst);
+      // Step 3
+      dst = permutationP().transpose() * dst;
+    }
+    #endif
 
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
+    void compute();
+
     MatrixType m_lu;
     PermutationType m_p;
     TranspositionType m_rowsTranspositions;
-    Index m_det_p;
+    RealScalar m_l1_norm;
+    signed char m_det_p;
     bool m_isInitialized;
 };
 
@@ -183,6 +285,7 @@
   : m_lu(),
     m_p(),
     m_rowsTranspositions(),
+    m_l1_norm(0),
     m_det_p(0),
     m_isInitialized(false)
 {
@@ -193,38 +296,54 @@
   : m_lu(size, size),
     m_p(size),
     m_rowsTranspositions(size),
+    m_l1_norm(0),
     m_det_p(0),
     m_isInitialized(false)
 {
 }
 
 template<typename MatrixType>
-PartialPivLU<MatrixType>::PartialPivLU(const MatrixType& matrix)
-    : m_lu(matrix.rows(), matrix.rows()),
-      m_p(static_cast<typename PermutationType::Index>(matrix.rows())),
-      m_rowsTranspositions(
-          static_cast<typename TranspositionType::Index>(matrix.rows())),
-      m_det_p(0),
-      m_isInitialized(false) {
-  compute(matrix);
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
+  : m_lu(matrix.rows(),matrix.cols()),
+    m_p(matrix.rows()),
+    m_rowsTranspositions(matrix.rows()),
+    m_l1_norm(0),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+  compute(matrix.derived());
+}
+
+template<typename MatrixType>
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_rowsTranspositions(matrix.rows()),
+    m_l1_norm(0),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+  compute();
 }
 
 namespace internal {
 
 /** \internal This is the blocked version of fullpivlu_unblocked() */
-template<typename Scalar, int StorageOrder, typename PivIndex>
+template<typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime=Dynamic>
 struct partial_lu_impl
 {
-  // FIXME add a stride to Map, so that the following mapping becomes easier,
-  // another option would be to create an expression being able to automatically
-  // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly
-  // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix,
-  // and Block.
-  typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
-  typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
-  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+  static const int UnBlockedBound = 16;
+  static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound;
+  static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic;
+  // Remaining rows and columns at compile-time:
+  static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic;
+  static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic;
+  typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType;
+  typedef Ref<MatrixType> MatrixTypeRef;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
 
   /** \internal performs the LU decomposition in-place of the matrix \a lu
     * using an unblocked algorithm.
@@ -236,26 +355,31 @@
     *
     * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
     */
-  static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
+  static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
+    typedef scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
+    // For small compile-time matrices it is worth processing the last row separately:
+    //  speedup: +100% for 2x2, +10% for others.
+    const Index endk = UnBlockedAtCompileTime ? size-1 : size;
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
-    for(Index k = 0; k < size; ++k)
+    for(Index k = 0; k < endk; ++k)
     {
-      Index rrows = rows-k-1;
-      Index rcols = cols-k-1;
-        
+      int rrows = internal::convert_index<int>(rows-k-1);
+      int rcols = internal::convert_index<int>(cols-k-1);
+
       Index row_of_biggest_in_col;
-      RealScalar biggest_in_corner
-        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      Score biggest_in_corner
+        = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
       row_of_biggest_in_col += k;
 
       row_transpositions[k] = PivIndex(row_of_biggest_in_col);
 
-      if(biggest_in_corner != RealScalar(0))
+      if(biggest_in_corner != Score(0))
       {
         if(k != row_of_biggest_in_col)
         {
@@ -263,9 +387,7 @@
           ++nb_transpositions;
         }
 
-        // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k)
-        // overflow but not the actual quotient?
-        lu.col(k).tail(rrows) /= lu.coeff(k,k);
+        lu.col(k).tail(fix<RRows>(rrows)) /= lu.coeff(k,k);
       }
       else if(first_zero_pivot==-1)
       {
@@ -275,8 +397,18 @@
       }
 
       if(k<rows-1)
-        lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols);
+        lu.bottomRightCorner(fix<RRows>(rrows),fix<RCols>(rcols)).noalias() -= lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
     }
+
+    // special handling of the last entry
+    if(UnBlockedAtCompileTime)
+    {
+      Index k = endk;
+      row_transpositions[k] = PivIndex(k);
+      if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1)
+        first_zero_pivot = k;
+    }
+
     return first_zero_pivot;
   }
 
@@ -292,18 +424,17 @@
     * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
     *
     * \note This very low level interface using pointers, etc. is to:
-    *   1 - reduce the number of instanciations to the strict minimum
-    *   2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+    *   1 - reduce the number of instantiations to the strict minimum
+    *   2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
     */
   static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
   {
-    MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
-    MatrixType lu(lu1,0,0,rows,cols);
+    MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride));
 
     const Index size = (std::min)(rows,cols);
 
     // if the matrix is too small, no blocking:
-    if(size<=16)
+    if(UnBlockedAtCompileTime || size<=UnBlockedBound)
     {
       return unblocked_lu(lu, row_transpositions, nb_transpositions);
     }
@@ -329,12 +460,12 @@
       //                          A00 | A01 | A02
       // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
       //                          A20 | A21 | A22
-      BlockType A_0(lu,0,0,rows,k);
-      BlockType A_2(lu,0,k+bs,rows,tsize);
-      BlockType A11(lu,k,k,bs,bs);
-      BlockType A12(lu,k,k+bs,bs,tsize);
-      BlockType A21(lu,k+bs,k,trows,bs);
-      BlockType A22(lu,k+bs,k+bs,trows,tsize);
+      BlockType A_0 = lu.block(0,0,rows,k);
+      BlockType A_2 = lu.block(0,k+bs,rows,tsize);
+      BlockType A11 = lu.block(k,k,bs,bs);
+      BlockType A12 = lu.block(k,k+bs,bs,tsize);
+      BlockType A21 = lu.block(k+bs,k,trows,bs);
+      BlockType A22 = lu.block(k+bs,k+bs,trows,tsize);
 
       PivIndex nb_transpositions_in_panel;
       // recursively call the blocked LU algorithm on [A11^T A21^T]^T
@@ -348,8 +479,7 @@
       // update permutations and apply them to A_0
       for(Index i=k; i<k+bs; ++i)
       {
-        row_transpositions[i] += static_cast<PivIndex>(k);
-        Index piv = static_cast<Index>(row_transpositions[i]);
+        Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
         A_0.row(i).swap(A_0.row(piv));
       }
 
@@ -372,44 +502,54 @@
 /** \internal performs the LU decomposition with partial pivoting in-place.
   */
 template<typename MatrixType, typename TranspositionType>
-void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::Index& nb_transpositions)
+void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions)
 {
+  // Special-case of zero matrix.
+  if (lu.rows() == 0 || lu.cols() == 0) {
+    nb_transpositions = 0;
+    return;
+  }
   eigen_assert(lu.cols() == row_transpositions.size());
-  eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
+  eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
 
   partial_lu_impl
-    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::Index>
+    < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor,
+      typename TranspositionType::StorageIndex,
+      EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)>
     ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
 }
 
 } // end namespace internal
 
 template<typename MatrixType>
-PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const MatrixType& matrix)
+void PartialPivLU<MatrixType>::compute()
 {
+  check_template_parameters();
+
   // the row permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<NumTraits<int>::highest());
-  
-  m_lu = matrix;
+  eigen_assert(m_lu.rows()<NumTraits<int>::highest());
 
-  eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
-  const Index size = matrix.rows();
+  if(m_lu.cols()>0)
+    m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+  else
+    m_l1_norm = RealScalar(0);
 
-  m_rowsTranspositions.resize(
-      static_cast<const typename TranspositionType::Index>(size));
+  eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
+  const Index size = m_lu.rows();
 
-  typename TranspositionType::Index nb_transpositions;
+  m_rowsTranspositions.resize(size);
+
+  typename TranspositionType::StorageIndex nb_transpositions;
   internal::partial_lu_inplace(m_lu, m_rowsTranspositions, nb_transpositions);
   m_det_p = (nb_transpositions%2) ? -1 : 1;
 
   m_p = m_rowsTranspositions;
 
   m_isInitialized = true;
-  return *this;
 }
 
 template<typename MatrixType>
-typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
+typename PartialPivLU<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
 {
   eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
   return Scalar(m_det_p) * m_lu.diagonal().prod();
@@ -432,38 +572,21 @@
   return res;
 }
 
-/***** Implementation of solve() *****************************************************/
+/***** Implementation details *****************************************************/
 
 namespace internal {
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PartialPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<PartialPivLU<_MatrixType>, Rhs>
+/***** Implementation of inverse() *****************************************************/
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename PartialPivLU<MatrixType>::Scalar>, Dense2Dense>
 {
-  EIGEN_MAKE_SOLVE_HELPERS(PartialPivLU<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
+  typedef PartialPivLU<MatrixType> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
   {
-    /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
-    * So we proceed as follows:
-    * Step 1: compute c = Pb.
-    * Step 2: replace c by the solution x to Lx = c.
-    * Step 3: replace c by the solution x to Ux = c.
-    */
-
-    eigen_assert(rhs().rows() == dec().matrixLU().rows());
-
-    // Step 1
-    dst = dec().permutationP() * rhs();
-
-    // Step 2
-    dec().matrixLU().template triangularView<UnitLower>().solveInPlace(dst);
-
-    // Step 3
-    dec().matrixLU().template triangularView<Upper>().solveInPlace(dst);
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
 } // end namespace internal
 
 /******** MatrixBase methods *******/
@@ -474,16 +597,13 @@
   *
   * \sa class PartialPivLU
   */
-#ifndef __CUDACC__
 template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::partialPivLu() const
 {
   return PartialPivLU<PlainObject>(eval());
 }
-#endif
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
 /** \lu_module
   *
   * Synonym of partialPivLu().
@@ -492,16 +612,12 @@
   *
   * \sa class PartialPivLU
   */
-#ifndef __CUDACC__
 template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::lu() const
 {
   return PartialPivLU<PlainObject>(eval());
 }
-#endif
-
-#endif
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/LU/PartialPivLU_LAPACKE.h b/Eigen/src/LU/PartialPivLU_LAPACKE.h
new file mode 100644
index 0000000..755168a
--- /dev/null
+++ b/Eigen/src/LU/PartialPivLU_LAPACKE.h

@@ -0,0 +1,83 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_PARTIALLU_LAPACK_H
+#define EIGEN_PARTIALLU_LAPACK_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_LU_PARTPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
+template<int StorageOrder> \
+struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
+{ \
+  /* \internal performs the LU decomposition in-place of the matrix represented */ \
+  static lapack_int blocked_lu(Index rows, Index cols, EIGTYPE* lu_data, Index luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
+  { \
+    EIGEN_UNUSED_VARIABLE(maxBlockSize);\
+    lapack_int matrix_order, first_zero_pivot; \
+    lapack_int m, n, lda, *ipiv, info; \
+    EIGTYPE* a; \
+/* Set up parameters for ?getrf */ \
+    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+    lda = convert_index<lapack_int>(luStride); \
+    a = lu_data; \
+    ipiv = row_transpositions; \
+    m = convert_index<lapack_int>(rows); \
+    n = convert_index<lapack_int>(cols); \
+    nb_transpositions = 0; \
+\
+    info = LAPACKE_##LAPACKE_PREFIX##getrf( matrix_order, m, n, (LAPACKE_TYPE*)a, lda, ipiv ); \
+\
+    for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \
+\
+    eigen_assert(info >= 0); \
+/* something should be done with nb_transpositions */ \
+\
+    first_zero_pivot = info; \
+    return first_zero_pivot; \
+  } \
+};
+
+EIGEN_LAPACKE_LU_PARTPIV(double, double, d)
+EIGEN_LAPACKE_LU_PARTPIV(float, float, s)
+EIGEN_LAPACKE_LU_PARTPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LU_PARTPIV(scomplex, lapack_complex_float,  c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIALLU_LAPACK_H

diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
new file mode 100644
index 0000000..a232ffc
--- /dev/null
+++ b/Eigen/src/LU/arch/InverseSize4.h

@@ -0,0 +1,351 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
+// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
+// adjugate of M and determinant of M respectively. M# is computed block-wise
+// using specific formulae. For proof, see:
+// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
+// Variable names are adopted from \src\LU\Inverse_SSE.h.
+//
+// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+//   Copyright (c) 2001 Intel Corporation.
+//
+// Permition is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+//
+// TODO: Unify implementations of different data types (i.e. float and double).
+#ifndef EIGEN_INVERSE_SIZE_4_H
+#define EIGEN_INVERSE_SIZE_4_H
+
+namespace Eigen
+{
+namespace internal
+{
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    const float* data = matrix.data();
+    const Index stride = matrix.innerStride();
+    Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
+    Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
+    Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
+    Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
+
+    // Four 2x2 sub-matrices of the input matrix
+    // input = [[A, B],
+    //          [C, D]]
+    Packet4f A, B, C, D;
+
+    if (!StorageOrdersMatch)
+    {
+      A = vec4f_unpacklo(_L1, _L2);
+      B = vec4f_unpacklo(_L3, _L4);
+      C = vec4f_unpackhi(_L1, _L2);
+      D = vec4f_unpackhi(_L3, _L4);
+    }
+    else
+    {
+      A = vec4f_movelh(_L1, _L2);
+      B = vec4f_movehl(_L2, _L1);
+      C = vec4f_movelh(_L3, _L4);
+      D = vec4f_movehl(_L4, _L3);
+    }
+
+    Packet4f AB, DC;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
+    AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
+
+    // DC = D#*C
+    DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
+    DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
+
+    // determinants of the sub-matrices
+    Packet4f dA, dB, dC, dD;
+
+    dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
+    dA = psub(dA, vec4f_movehl(dA, dA));
+
+    dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
+    dB = psub(dB, vec4f_movehl(dB, dB));
+
+    dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
+    dC = psub(dC, vec4f_movehl(dC, dC));
+
+    dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
+    dD = psub(dD, vec4f_movehl(dD, dD));
+
+    Packet4f d, d1, d2;
+
+    d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
+    d = padd(d, vec4f_movehl(d, d));
+    d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
+
+    // Four sub-matrices of the inverse
+    Packet4f iA, iB, iC, iD;
+
+    // iD = D*|A| - C*A#*B
+    iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
+    iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
+    iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
+
+    // iA = A*|D| - B*D#*C
+    iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
+    iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
+    iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
+    iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
+    iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
+    iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
+    iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
+
+    const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
+    const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
+    rd = pxor(rd, p4f_sign_PNNP);
+    iA = pmul(iA, rd);
+    iB = pmul(iB, rd);
+    iC = pmul(iC, rd);
+    iD = pmul(iD, rd);
+
+    Index res_stride = result.outerStride();
+    float *res = result.data();
+
+    pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
+    pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
+  }
+};
+
+#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
+// same algorithm as above, except that each operand is split into
+// halves for two registers to hold.
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
+                               MatrixType const &,
+                               typename MatrixType::PlainObject>::type
+      ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
+    // row e.g. A1, upper row of A, A2, lower row of A
+    // input = [[A, B],  =  [[[A1, [B1,
+    //          [C, D]]        A2], B2]],
+    //                       [[C1, [D1,
+    //                         C2], D2]]]
+
+    Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
+
+    const double* data = matrix.data();
+    const Index stride = matrix.innerStride();
+    if (StorageOrdersMatch)
+    {
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
+    }
+    else
+    {
+      Packet2d temp;
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+      temp = A1;
+      A1 = vec2d_unpacklo(A1, A2);
+      A2 = vec2d_unpackhi(temp, A2);
+
+      temp = C1;
+      C1 = vec2d_unpacklo(C1, C2);
+      C2 = vec2d_unpackhi(temp, C2);
+
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
+
+      temp = B1;
+      B1 = vec2d_unpacklo(B1, B2);
+      B2 = vec2d_unpackhi(temp, B2);
+
+      temp = D1;
+      D1 = vec2d_unpacklo(D1, D2);
+      D2 = vec2d_unpackhi(temp, D2);
+    }
+
+    // determinants of the sub-matrices
+    Packet2d dA, dB, dC, dD;
+
+    dA = vec2d_swizzle2(A2, A2, 1);
+    dA = pmul(A1, dA);
+    dA = psub(dA, vec2d_duplane(dA, 1));
+
+    dB = vec2d_swizzle2(B2, B2, 1);
+    dB = pmul(B1, dB);
+    dB = psub(dB, vec2d_duplane(dB, 1));
+
+    dC = vec2d_swizzle2(C2, C2, 1);
+    dC = pmul(C1, dC);
+    dC = psub(dC, vec2d_duplane(dC, 1));
+
+    dD = vec2d_swizzle2(D2, D2, 1);
+    dD = pmul(D1, dD);
+    dD = psub(dD, vec2d_duplane(dD, 1));
+
+    Packet2d DC1, DC2, AB1, AB2;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB1 = pmul(B1, vec2d_duplane(A2, 1));
+    AB2 = pmul(B2, vec2d_duplane(A1, 0));
+    AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
+    AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
+
+    // DC = D#*C
+    DC1 = pmul(C1, vec2d_duplane(D2, 1));
+    DC2 = pmul(C2, vec2d_duplane(D1, 0));
+    DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
+    DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
+
+    Packet2d d1, d2;
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet2d det;
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet2d rd;
+
+    d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
+    d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
+    rd = padd(d1, d2);
+    rd = padd(rd, vec2d_duplane(rd, 1));
+
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    det = padd(d1, d2);
+    det = psub(det, rd);
+    det = vec2d_duplane(det, 0);
+    rd = pdiv(pset1<Packet2d>(1.0), det);
+
+    // rows of four sub-matrices of the inverse
+    Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
+
+    // iD = D*|A| - C*A#*B
+    iD1 = pmul(AB1, vec2d_duplane(C1, 0));
+    iD2 = pmul(AB1, vec2d_duplane(C2, 0));
+    iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
+    iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
+    dA = vec2d_duplane(dA, 0);
+    iD1 = psub(pmul(D1, dA), iD1);
+    iD2 = psub(pmul(D2, dA), iD2);
+
+    // iA = A*|D| - B*D#*C
+    iA1 = pmul(DC1, vec2d_duplane(B1, 0));
+    iA2 = pmul(DC1, vec2d_duplane(B2, 0));
+    iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
+    iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
+    dD = vec2d_duplane(dD, 0);
+    iA1 = psub(pmul(A1, dD), iA1);
+    iA2 = psub(pmul(A2, dD), iA2);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
+    iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
+    iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    dB = vec2d_duplane(dB, 0);
+    iB1 = psub(pmul(C1, dB), iB1);
+    iB2 = psub(pmul(C2, dB), iB2);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
+    iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
+    iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    dC = vec2d_duplane(dC, 0);
+    iC1 = psub(pmul(B1, dC), iC1);
+    iC2 = psub(pmul(B2, dC), iC2);
+
+    const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
+    const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
+    const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
+    const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
+    d1 = pxor(rd, sign_PN);
+    d2 = pxor(rd, sign_NP);
+
+    Index res_stride = result.outerStride();
+    double *res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
+  }
+};
+#endif
+} // namespace internal
+} // namespace Eigen
+#endif

diff --git a/Eigen/src/MetisSupport/MetisSupport.h b/Eigen/src/MetisSupport/MetisSupport.h
index f2bbef2..4c15304 100644
--- a/Eigen/src/MetisSupport/MetisSupport.h
+++ b/Eigen/src/MetisSupport/MetisSupport.h

@@ -18,12 +18,12 @@
  * Row (column) i of A is the matperm(i) row (column) of Ap. 
  * WARNING: As computed by METIS, this corresponds to the vector iperm (instead of perm)
  */
-template <typename Index>
+template <typename StorageIndex>
 class MetisOrdering
 {
 public:
-  typedef PermutationMatrix<Dynamic,Dynamic,Index> PermutationType;
-  typedef Matrix<Index,Dynamic,1> IndexVector; 
+  typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> PermutationType;
+  typedef Matrix<StorageIndex,Dynamic,1> IndexVector; 
   
   template <typename MatrixType>
   void get_symmetrized_graph(const MatrixType& A)
@@ -36,7 +36,7 @@
     Index TotNz = 0; 
     IndexVector visited(m); 
     visited.setConstant(-1); 
-    for (int j = 0; j < m; j++)
+    for (StorageIndex j = 0; j < m; j++)
     {
       // Compute the union structure of of A(j,:) and At(j,:)
       visited(j) = j; // Do not include the diagonal element
@@ -67,8 +67,8 @@
 
     // Now compute the real adjacency list of each column/row 
     visited.setConstant(-1); 
-    Index CurNz = 0; 
-    for (int j = 0; j < m; j++)
+    StorageIndex CurNz = 0; 
+    for (StorageIndex j = 0; j < m; j++)
     {
       m_indexPtr(j) = CurNz; 
       
@@ -76,7 +76,7 @@
       // Add the pattern of row/column j of A to A+At
       for (typename MatrixType::InnerIterator it(A,j); it; ++it)
       {
-        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
+        StorageIndex idx = it.index(); // Get the row index (for column major) or column index (for row major)
         if (visited(idx) != j ) 
         {
           visited(idx) = j; 
@@ -87,7 +87,7 @@
       //Add the pattern of row/column j of At to A+At
       for (typename MatrixType::InnerIterator it(At, j); it; ++it)
       {
-        Index idx = it.index(); 
+        StorageIndex idx = it.index(); 
         if(visited(idx) != j)
         {
           visited(idx) = j; 
@@ -102,7 +102,7 @@
   template <typename MatrixType>
   void operator() (const MatrixType& A, PermutationType& matperm)
   {
-     Index m = A.cols();
+     StorageIndex m = internal::convert_index<StorageIndex>(A.cols()); // must be StorageIndex, because it is passed by address to METIS
      IndexVector perm(m),iperm(m); 
     // First, symmetrize the matrix graph. 
      get_symmetrized_graph(A); 

diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h
new file mode 100644
index 0000000..7ca3f33
--- /dev/null
+++ b/Eigen/src/OrderingMethods/Amd.h

@@ -0,0 +1,435 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*
+NOTE: this routine has been adapted from the CSparse library:
+
+Copyright (c) 2006, Timothy A. Davis.
+http://www.suitesparse.com
+
+The author of CSparse, Timothy A. Davis., has executed a license with Google LLC
+to permit distribution of this code and derivative works as part of Eigen under
+the Mozilla Public License v. 2.0, as stated at the top of this file.
+*/
+
+#ifndef EIGEN_SPARSE_AMD_H
+#define EIGEN_SPARSE_AMD_H
+
+namespace Eigen { 
+
+namespace internal {
+  
+template<typename T> inline T amd_flip(const T& i) { return -i-2; }
+template<typename T> inline T amd_unflip(const T& i) { return i<0 ? amd_flip(i) : i; }
+template<typename T0, typename T1> inline bool amd_marked(const T0* w, const T1& j) { return w[j]<0; }
+template<typename T0, typename T1> inline void amd_mark(const T0* w, const T1& j) { return w[j] = amd_flip(w[j]); }
+
+/* clear w */
+template<typename StorageIndex>
+static StorageIndex cs_wclear (StorageIndex mark, StorageIndex lemax, StorageIndex *w, StorageIndex n)
+{
+  StorageIndex k;
+  if(mark < 2 || (mark + lemax < 0))
+  {
+    for(k = 0; k < n; k++)
+      if(w[k] != 0)
+        w[k] = 1;
+    mark = 2;
+  }
+  return (mark);     /* at this point, w[0..n-1] < mark holds */
+}
+
+/* depth-first search and postorder of a tree rooted at node j */
+template<typename StorageIndex>
+StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const StorageIndex *next, StorageIndex *post, StorageIndex *stack)
+{
+  StorageIndex i, p, top = 0;
+  if(!head || !next || !post || !stack) return (-1);    /* check inputs */
+  stack[0] = j;                 /* place j on the stack */
+  while (top >= 0)                /* while (stack is not empty) */
+  {
+    p = stack[top];           /* p = top of stack */
+    i = head[p];              /* i = youngest child of p */
+    if(i == -1)
+    {
+      top--;                 /* p has no unordered children left */
+      post[k++] = p;        /* node p is the kth postordered node */
+    }
+    else
+    {
+      head[p] = next[i];   /* remove i from children of p */
+      stack[++top] = i;     /* start dfs on child node i */
+    }
+  }
+  return k;
+}
+
+
+/** \internal
+  * \ingroup OrderingMethods_Module 
+  * Approximate minimum degree ordering algorithm.
+  *
+  * \param[in] C the input selfadjoint matrix stored in compressed column major format.
+  * \param[out] perm the permutation P reducing the fill-in of the input matrix \a C
+  *
+  * Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well as the diagonal entries.
+  * On exit the values of C are destroyed */
+template<typename Scalar, typename StorageIndex>
+void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,StorageIndex>& C, PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm)
+{
+  using std::sqrt;
+  
+  StorageIndex d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
+                k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
+                ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t, h;
+  
+  StorageIndex n = StorageIndex(C.cols());
+  dense = std::max<StorageIndex> (16, StorageIndex(10 * sqrt(double(n))));   /* find dense threshold */
+  dense = (std::min)(n-2, dense);
+  
+  StorageIndex cnz = StorageIndex(C.nonZeros());
+  perm.resize(n+1);
+  t = cnz + cnz/5 + 2*n;                 /* add elbow room to C */
+  C.resizeNonZeros(t);
+  
+  // get workspace
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,W,8*(n+1),0);
+  StorageIndex* len     = W;
+  StorageIndex* nv      = W +   (n+1);
+  StorageIndex* next    = W + 2*(n+1);
+  StorageIndex* head    = W + 3*(n+1);
+  StorageIndex* elen    = W + 4*(n+1);
+  StorageIndex* degree  = W + 5*(n+1);
+  StorageIndex* w       = W + 6*(n+1);
+  StorageIndex* hhead   = W + 7*(n+1);
+  StorageIndex* last    = perm.indices().data();                              /* use P as workspace for last */
+  
+  /* --- Initialize quotient graph ---------------------------------------- */
+  StorageIndex* Cp = C.outerIndexPtr();
+  StorageIndex* Ci = C.innerIndexPtr();
+  for(k = 0; k < n; k++)
+    len[k] = Cp[k+1] - Cp[k];
+  len[n] = 0;
+  nzmax = t;
+  
+  for(i = 0; i <= n; i++)
+  {
+    head[i]   = -1;                     // degree list i is empty
+    last[i]   = -1;
+    next[i]   = -1;
+    hhead[i]  = -1;                     // hash list i is empty 
+    nv[i]     = 1;                      // node i is just one node
+    w[i]      = 1;                      // node i is alive
+    elen[i]   = 0;                      // Ek of node i is empty
+    degree[i] = len[i];                 // degree of node i
+  }
+  mark = internal::cs_wclear<StorageIndex>(0, 0, w, n);         /* clear w */
+  
+  /* --- Initialize degree lists ------------------------------------------ */
+  for(i = 0; i < n; i++)
+  {
+    bool has_diag = false;
+    for(p = Cp[i]; p<Cp[i+1]; ++p)
+      if(Ci[p]==i)
+      {
+        has_diag = true;
+        break;
+      }
+   
+    d = degree[i];
+    if(d == 1 && has_diag)           /* node i is empty */
+    {
+      elen[i] = -2;                 /* element i is dead */
+      nel++;
+      Cp[i] = -1;                   /* i is a root of assembly tree */
+      w[i] = 0;
+    }
+    else if(d > dense || !has_diag)  /* node i is dense or has no structural diagonal element */
+    {
+      nv[i] = 0;                    /* absorb i into element n */
+      elen[i] = -1;                 /* node i is dead */
+      nel++;
+      Cp[i] = amd_flip (n);
+      nv[n]++;
+    }
+    else
+    {
+      if(head[d] != -1) last[head[d]] = i;
+      next[i] = head[d];           /* put node i in degree list d */
+      head[d] = i;
+    }
+  }
+  
+  elen[n] = -2;                         /* n is a dead element */
+  Cp[n] = -1;                           /* n is a root of assembly tree */
+  w[n] = 0;                             /* n is a dead element */
+  
+  while (nel < n)                         /* while (selecting pivots) do */
+  {
+    /* --- Select node of minimum approximate degree -------------------- */
+    for(k = -1; mindeg < n && (k = head[mindeg]) == -1; mindeg++) {}
+    if(next[k] != -1) last[next[k]] = -1;
+    head[mindeg] = next[k];          /* remove k from degree list */
+    elenk = elen[k];                  /* elenk = |Ek| */
+    nvk = nv[k];                      /* # of nodes k represents */
+    nel += nvk;                        /* nv[k] nodes of A eliminated */
+    
+    /* --- Garbage collection ------------------------------------------- */
+    if(elenk > 0 && cnz + mindeg >= nzmax)
+    {
+      for(j = 0; j < n; j++)
+      {
+        if((p = Cp[j]) >= 0)      /* j is a live node or element */
+        {
+          Cp[j] = Ci[p];          /* save first entry of object */
+          Ci[p] = amd_flip (j);    /* first entry is now amd_flip(j) */
+        }
+      }
+      for(q = 0, p = 0; p < cnz; ) /* scan all of memory */
+      {
+        if((j = amd_flip (Ci[p++])) >= 0)  /* found object j */
+        {
+          Ci[q] = Cp[j];       /* restore first entry of object */
+          Cp[j] = q++;          /* new pointer to object j */
+          for(k3 = 0; k3 < len[j]-1; k3++) Ci[q++] = Ci[p++];
+        }
+      }
+      cnz = q;                       /* Ci[cnz...nzmax-1] now free */
+    }
+    
+    /* --- Construct new element ---------------------------------------- */
+    dk = 0;
+    nv[k] = -nvk;                     /* flag k as in Lk */
+    p = Cp[k];
+    pk1 = (elenk == 0) ? p : cnz;      /* do in place if elen[k] == 0 */
+    pk2 = pk1;
+    for(k1 = 1; k1 <= elenk + 1; k1++)
+    {
+      if(k1 > elenk)
+      {
+        e = k;                     /* search the nodes in k */
+        pj = p;                    /* list of nodes starts at Ci[pj]*/
+        ln = len[k] - elenk;      /* length of list of nodes in k */
+      }
+      else
+      {
+        e = Ci[p++];              /* search the nodes in e */
+        pj = Cp[e];
+        ln = len[e];              /* length of list of nodes in e */
+      }
+      for(k2 = 1; k2 <= ln; k2++)
+      {
+        i = Ci[pj++];
+        if((nvi = nv[i]) <= 0) continue; /* node i dead, or seen */
+        dk += nvi;                 /* degree[Lk] += size of node i */
+        nv[i] = -nvi;             /* negate nv[i] to denote i in Lk*/
+        Ci[pk2++] = i;            /* place i in Lk */
+        if(next[i] != -1) last[next[i]] = last[i];
+        if(last[i] != -1)         /* remove i from degree list */
+        {
+          next[last[i]] = next[i];
+        }
+        else
+        {
+          head[degree[i]] = next[i];
+        }
+      }
+      if(e != k)
+      {
+        Cp[e] = amd_flip (k);      /* absorb e into k */
+        w[e] = 0;                 /* e is now a dead element */
+      }
+    }
+    if(elenk != 0) cnz = pk2;         /* Ci[cnz...nzmax] is free */
+    degree[k] = dk;                   /* external degree of k - |Lk\i| */
+    Cp[k] = pk1;                      /* element k is in Ci[pk1..pk2-1] */
+    len[k] = pk2 - pk1;
+    elen[k] = -2;                     /* k is now an element */
+    
+    /* --- Find set differences ----------------------------------------- */
+    mark = internal::cs_wclear<StorageIndex>(mark, lemax, w, n);  /* clear w if necessary */
+    for(pk = pk1; pk < pk2; pk++)    /* scan 1: find |Le\Lk| */
+    {
+      i = Ci[pk];
+      if((eln = elen[i]) <= 0) continue;/* skip if elen[i] empty */
+      nvi = -nv[i];                      /* nv[i] was negated */
+      wnvi = mark - nvi;
+      for(p = Cp[i]; p <= Cp[i] + eln - 1; p++)  /* scan Ei */
+      {
+        e = Ci[p];
+        if(w[e] >= mark)
+        {
+          w[e] -= nvi;          /* decrement |Le\Lk| */
+        }
+        else if(w[e] != 0)        /* ensure e is a live element */
+        {
+          w[e] = degree[e] + wnvi; /* 1st time e seen in scan 1 */
+        }
+      }
+    }
+    
+    /* --- Degree update ------------------------------------------------ */
+    for(pk = pk1; pk < pk2; pk++)    /* scan2: degree update */
+    {
+      i = Ci[pk];                   /* consider node i in Lk */
+      p1 = Cp[i];
+      p2 = p1 + elen[i] - 1;
+      pn = p1;
+      for(h = 0, d = 0, p = p1; p <= p2; p++)    /* scan Ei */
+      {
+        e = Ci[p];
+        if(w[e] != 0)             /* e is an unabsorbed element */
+        {
+          dext = w[e] - mark;   /* dext = |Le\Lk| */
+          if(dext > 0)
+          {
+            d += dext;         /* sum up the set differences */
+            Ci[pn++] = e;     /* keep e in Ei */
+            h += e;            /* compute the hash of node i */
+          }
+          else
+          {
+            Cp[e] = amd_flip (k);  /* aggressive absorb. e->k */
+            w[e] = 0;             /* e is a dead element */
+          }
+        }
+      }
+      elen[i] = pn - p1 + 1;        /* elen[i] = |Ei| */
+      p3 = pn;
+      p4 = p1 + len[i];
+      for(p = p2 + 1; p < p4; p++) /* prune edges in Ai */
+      {
+        j = Ci[p];
+        if((nvj = nv[j]) <= 0) continue; /* node j dead or in Lk */
+        d += nvj;                  /* degree(i) += |j| */
+        Ci[pn++] = j;             /* place j in node list of i */
+        h += j;                    /* compute hash for node i */
+      }
+      if(d == 0)                     /* check for mass elimination */
+      {
+        Cp[i] = amd_flip (k);      /* absorb i into k */
+        nvi = -nv[i];
+        dk -= nvi;                 /* |Lk| -= |i| */
+        nvk += nvi;                /* |k| += nv[i] */
+        nel += nvi;
+        nv[i] = 0;
+        elen[i] = -1;             /* node i is dead */
+      }
+      else
+      {
+        degree[i] = std::min<StorageIndex> (degree[i], d);   /* update degree(i) */
+        Ci[pn] = Ci[p3];         /* move first node to end */
+        Ci[p3] = Ci[p1];         /* move 1st el. to end of Ei */
+        Ci[p1] = k;               /* add k as 1st element in of Ei */
+        len[i] = pn - p1 + 1;     /* new len of adj. list of node i */
+        h %= n;                    /* finalize hash of i */
+        next[i] = hhead[h];      /* place i in hash bucket */
+        hhead[h] = i;
+        last[i] = h;      /* save hash of i in last[i] */
+      }
+    }                                   /* scan2 is done */
+    degree[k] = dk;                   /* finalize |Lk| */
+    lemax = std::max<StorageIndex>(lemax, dk);
+    mark = internal::cs_wclear<StorageIndex>(mark+lemax, lemax, w, n);    /* clear w */
+    
+    /* --- Supernode detection ------------------------------------------ */
+    for(pk = pk1; pk < pk2; pk++)
+    {
+      i = Ci[pk];
+      if(nv[i] >= 0) continue;         /* skip if i is dead */
+      h = last[i];                      /* scan hash bucket of node i */
+      i = hhead[h];
+      hhead[h] = -1;                    /* hash bucket will be empty */
+      for(; i != -1 && next[i] != -1; i = next[i], mark++)
+      {
+        ln = len[i];
+        eln = elen[i];
+        for(p = Cp[i]+1; p <= Cp[i] + ln-1; p++) w[Ci[p]] = mark;
+        jlast = i;
+        for(j = next[i]; j != -1; ) /* compare i with all j */
+        {
+          ok = (len[j] == ln) && (elen[j] == eln);
+          for(p = Cp[j] + 1; ok && p <= Cp[j] + ln - 1; p++)
+          {
+            if(w[Ci[p]] != mark) ok = 0;    /* compare i and j*/
+          }
+          if(ok)                     /* i and j are identical */
+          {
+            Cp[j] = amd_flip (i);  /* absorb j into i */
+            nv[i] += nv[j];
+            nv[j] = 0;
+            elen[j] = -1;         /* node j is dead */
+            j = next[j];          /* delete j from hash bucket */
+            next[jlast] = j;
+          }
+          else
+          {
+            jlast = j;             /* j and i are different */
+            j = next[j];
+          }
+        }
+      }
+    }
+    
+    /* --- Finalize new element------------------------------------------ */
+    for(p = pk1, pk = pk1; pk < pk2; pk++)   /* finalize Lk */
+    {
+      i = Ci[pk];
+      if((nvi = -nv[i]) <= 0) continue;/* skip if i is dead */
+      nv[i] = nvi;                      /* restore nv[i] */
+      d = degree[i] + dk - nvi;         /* compute external degree(i) */
+      d = std::min<StorageIndex> (d, n - nel - nvi);
+      if(head[d] != -1) last[head[d]] = i;
+      next[i] = head[d];               /* put i back in degree list */
+      last[i] = -1;
+      head[d] = i;
+      mindeg = std::min<StorageIndex> (mindeg, d);       /* find new minimum degree */
+      degree[i] = d;
+      Ci[p++] = i;                      /* place i in Lk */
+    }
+    nv[k] = nvk;                      /* # nodes absorbed into k */
+    if((len[k] = p-pk1) == 0)         /* length of adj list of element k*/
+    {
+      Cp[k] = -1;                   /* k is a root of the tree */
+      w[k] = 0;                     /* k is now a dead element */
+    }
+    if(elenk != 0) cnz = p;           /* free unused space in Lk */
+  }
+  
+  /* --- Postordering ----------------------------------------------------- */
+  for(i = 0; i < n; i++) Cp[i] = amd_flip (Cp[i]);/* fix assembly tree */
+  for(j = 0; j <= n; j++) head[j] = -1;
+  for(j = n; j >= 0; j--)              /* place unordered nodes in lists */
+  {
+    if(nv[j] > 0) continue;          /* skip if j is an element */
+    next[j] = head[Cp[j]];          /* place j in list of its parent */
+    head[Cp[j]] = j;
+  }
+  for(e = n; e >= 0; e--)              /* place elements in lists */
+  {
+    if(nv[e] <= 0) continue;         /* skip unless e is an element */
+    if(Cp[e] != -1)
+    {
+      next[e] = head[Cp[e]];      /* place e in list of its parent */
+      head[Cp[e]] = e;
+    }
+  }
+  for(k = 0, i = 0; i <= n; i++)       /* postorder the assembly tree */
+  {
+    if(Cp[i] == -1) k = internal::cs_tdfs<StorageIndex>(i, k, head, next, perm.indices().data(), w);
+  }
+  
+  perm.indices().conservativeResize(n);
+}
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_AMD_H

diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 44548f6..8e339a7 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h

@@ -13,184 +13,197 @@
 //   Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
 //   developed in collaboration with John Gilbert, Xerox PARC, and Esmond
 //   Ng, Oak Ridge National Laboratory.
-// 
+//
 //     Date:
-// 
+//
 //   September 8, 2003.  Version 2.3.
-// 
+//
 //     Acknowledgements:
-// 
+//
 //   This work was supported by the National Science Foundation, under
 //   grants DMS-9504974 and DMS-9803599.
-// 
+//
 //     Notice:
-// 
+//
 //   Copyright (c) 1998-2003 by the University of Florida.
 //   All Rights Reserved.
-// 
+//
 //   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
 //   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-// 
+//
 //   Permission is hereby granted to use, copy, modify, and/or distribute
 //   this program, provided that the Copyright, this License, and the
 //   Availability of the original version is retained on all copies and made
 //   accessible to the end-user of any code or package that includes COLAMD
-//   or any modified version of COLAMD. 
-// 
+//   or any modified version of COLAMD.
+//
 //     Availability:
-// 
+//
 //   The colamd/symamd library is available at
-// 
-//       http://www.cise.ufl.edu/research/sparse/colamd/
+//
+//       http://www.suitesparse.com
 
-//   This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
-//   file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
-//   files, and by any C code that calls the routines whose prototypes are
-//   listed below, or that uses the colamd/symamd definitions listed below.
-  
+
 #ifndef EIGEN_COLAMD_H
 #define EIGEN_COLAMD_H
 
 namespace internal {
+
+namespace Colamd {
+
 /* Ensure that debugging is turned off: */
 #ifndef COLAMD_NDEBUG
 #define COLAMD_NDEBUG
 #endif /* NDEBUG */
+
+
 /* ========================================================================== */
 /* === Knob and statistics definitions ====================================== */
 /* ========================================================================== */
 
 /* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-#define COLAMD_KNOBS 20
+const int NKnobs = 20;
 
 /* number of output statistics.  Only stats [0..6] are currently used. */
-#define COLAMD_STATS 20 
+const int NStats = 20;
 
-/* knobs [0] and stats [0]: dense row knob and output statistic. */
-#define COLAMD_DENSE_ROW 0
+/* Indices into knobs and stats array. */
+enum KnobsStatsIndex {
+  /* knobs [0] and stats [0]: dense row knob and output statistic. */
+  DenseRow = 0,
 
-/* knobs [1] and stats [1]: dense column knob and output statistic. */
-#define COLAMD_DENSE_COL 1
+  /* knobs [1] and stats [1]: dense column knob and output statistic. */
+  DenseCol = 1,
 
-/* stats [2]: memory defragmentation count output statistic */
-#define COLAMD_DEFRAG_COUNT 2
+  /* stats [2]: memory defragmentation count output statistic */
+  DefragCount = 2,
 
-/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
-#define COLAMD_STATUS 3
+  /* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+  Status = 3,
 
-/* stats [4..6]: error info, or info on jumbled columns */ 
-#define COLAMD_INFO1 4
-#define COLAMD_INFO2 5
-#define COLAMD_INFO3 6
+  /* stats [4..6]: error info, or info on jumbled columns */
+  Info1 = 4,
+  Info2 = 5,
+  Info3 = 6
+};
 
 /* error codes returned in stats [3]: */
-#define COLAMD_OK       (0)
-#define COLAMD_OK_BUT_JUMBLED     (1)
-#define COLAMD_ERROR_A_not_present    (-1)
-#define COLAMD_ERROR_p_not_present    (-2)
-#define COLAMD_ERROR_nrow_negative    (-3)
-#define COLAMD_ERROR_ncol_negative    (-4)
-#define COLAMD_ERROR_nnz_negative   (-5)
-#define COLAMD_ERROR_p0_nonzero     (-6)
-#define COLAMD_ERROR_A_too_small    (-7)
-#define COLAMD_ERROR_col_length_negative  (-8)
-#define COLAMD_ERROR_row_index_out_of_bounds  (-9)
-#define COLAMD_ERROR_out_of_memory    (-10)
-#define COLAMD_ERROR_internal_error   (-999)
-
+enum Status {
+  Ok = 0,
+  OkButJumbled = 1,
+  ErrorANotPresent = -1,
+  ErrorPNotPresent = -2,
+  ErrorNrowNegative = -3,
+  ErrorNcolNegative = -4,
+  ErrorNnzNegative = -5,
+  ErrorP0Nonzero = -6,
+  ErrorATooSmall = -7,
+  ErrorColLengthNegative = -8,
+  ErrorRowIndexOutOfBounds = -9,
+  ErrorOutOfMemory = -10,
+  ErrorInternalError = -999
+};
 /* ========================================================================== */
 /* === Definitions ========================================================== */
 /* ========================================================================== */
 
-#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b))
-
-#define ONES_COMPLEMENT(r) (-(r)-1)
+template <typename IndexType>
+IndexType ones_complement(const IndexType r) {
+  return (-(r)-1);
+}
 
 /* -------------------------------------------------------------------------- */
-
-#define COLAMD_EMPTY (-1)
+const int Empty = -1;
 
 /* Row and column status */
-#define ALIVE (0)
-#define DEAD  (-1)
+enum RowColumnStatus {
+  Alive = 0,
+  Dead = -1
+};
 
 /* Column status */
-#define DEAD_PRINCIPAL    (-1)
-#define DEAD_NON_PRINCIPAL  (-2)
-
-/* Macros for row and column status update and checking. */
-#define ROW_IS_DEAD(r)      ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
-#define ROW_IS_MARKED_DEAD(row_mark)  (row_mark < ALIVE)
-#define ROW_IS_ALIVE(r)     (Row [r].shared2.mark >= ALIVE)
-#define COL_IS_DEAD(c)      (Col [c].start < ALIVE)
-#define COL_IS_ALIVE(c)     (Col [c].start >= ALIVE)
-#define COL_IS_DEAD_PRINCIPAL(c)  (Col [c].start == DEAD_PRINCIPAL)
-#define KILL_ROW(r)     { Row [r].shared2.mark = DEAD ; }
-#define KILL_PRINCIPAL_COL(c)   { Col [c].start = DEAD_PRINCIPAL ; }
-#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; }
+enum ColumnStatus {
+  DeadPrincipal = -1,
+  DeadNonPrincipal = -2
+};
 
 /* ========================================================================== */
 /* === Colamd reporting mechanism =========================================== */
 /* ========================================================================== */
 
 // == Row and Column structures ==
-template <typename Index>
-struct colamd_col
+template <typename IndexType>
+struct ColStructure
 {
-  Index start ;   /* index for A of first row in this column, or DEAD */
+  IndexType start ;   /* index for A of first row in this column, or Dead */
   /* if column is dead */
-  Index length ;  /* number of rows in this column */
+  IndexType length ;  /* number of rows in this column */
   union
   {
-    Index thickness ; /* number of original columns represented by this */
+    IndexType thickness ; /* number of original columns represented by this */
     /* col, if the column is alive */
-    Index parent ;  /* parent in parent tree super-column structure, if */
+    IndexType parent ;  /* parent in parent tree super-column structure, if */
     /* the column is dead */
   } shared1 ;
   union
   {
-    Index score ; /* the score used to maintain heap, if col is alive */
-    Index order ; /* pivot ordering of this column, if col is dead */
+    IndexType score ; /* the score used to maintain heap, if col is alive */
+    IndexType order ; /* pivot ordering of this column, if col is dead */
   } shared2 ;
   union
   {
-    Index headhash ;  /* head of a hash bucket, if col is at the head of */
+    IndexType headhash ;  /* head of a hash bucket, if col is at the head of */
     /* a degree list */
-    Index hash ;  /* hash value, if col is not in a degree list */
-    Index prev ;  /* previous column in degree list, if col is in a */
+    IndexType hash ;  /* hash value, if col is not in a degree list */
+    IndexType prev ;  /* previous column in degree list, if col is in a */
     /* degree list (but not at the head of a degree list) */
   } shared3 ;
   union
   {
-    Index degree_next ; /* next column, if col is in a degree list */
-    Index hash_next ;   /* next column, if col is in a hash list */
+    IndexType degree_next ; /* next column, if col is in a degree list */
+    IndexType hash_next ;   /* next column, if col is in a hash list */
   } shared4 ;
-  
+
+  inline bool is_dead() const { return start < Alive; }
+
+  inline bool is_alive() const { return start >= Alive; }
+
+  inline bool is_dead_principal() const { return start == DeadPrincipal; }
+
+  inline void kill_principal() { start = DeadPrincipal; }
+
+  inline void kill_non_principal() { start = DeadNonPrincipal; }
+
 };
- 
-template <typename Index>
-struct Colamd_Row
+
+template <typename IndexType>
+struct RowStructure
 {
-  Index start ;   /* index for A of first col in this row */
-  Index length ;  /* number of principal columns in this row */
+  IndexType start ;   /* index for A of first col in this row */
+  IndexType length ;  /* number of principal columns in this row */
   union
   {
-    Index degree ;  /* number of principal & non-principal columns in row */
-    Index p ;   /* used as a row pointer in init_rows_cols () */
+    IndexType degree ;  /* number of principal & non-principal columns in row */
+    IndexType p ;   /* used as a row pointer in init_rows_cols () */
   } shared1 ;
   union
   {
-    Index mark ;  /* for computing set differences and marking dead rows*/
-    Index first_column ;/* first column in row (used in garbage collection) */
+    IndexType mark ;  /* for computing set differences and marking dead rows*/
+    IndexType first_column ;/* first column in row (used in garbage collection) */
   } shared2 ;
-  
+
+  inline bool is_dead() const { return shared2.mark < Alive; }
+
+  inline bool is_alive() const { return shared2.mark >= Alive; }
+
+  inline void kill() { shared2.mark = Dead; }
+
 };
- 
+
 /* ========================================================================== */
 /* === Colamd recommended memory size ======================================= */
 /* ========================================================================== */
- 
+
 /*
   The recommended length Alen of the array A passed to colamd is given by
   the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
@@ -199,41 +212,41 @@
   required for the Col and Row arrays, respectively, which are internal to
   colamd.  An additional n_col space is the minimal amount of "elbow room",
   and nnz/5 more space is recommended for run time efficiency.
-  
+
   This macro is not needed when using symamd.
-  
-  Explicit typecast to Index added Sept. 23, 2002, COLAMD version 2.2, to avoid
+
+  Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid
   gcc -pedantic warning messages.
 */
-template <typename Index>
-inline Index colamd_c(Index n_col) 
-{ return Index( ((n_col) + 1) * sizeof (colamd_col<Index>) / sizeof (Index) ) ; }
+template <typename IndexType>
+inline IndexType colamd_c(IndexType n_col)
+{ return IndexType( ((n_col) + 1) * sizeof (ColStructure<IndexType>) / sizeof (IndexType) ) ; }
 
-template <typename Index>
-inline Index  colamd_r(Index n_row)
-{ return Index(((n_row) + 1) * sizeof (Colamd_Row<Index>) / sizeof (Index)); }
+template <typename IndexType>
+inline IndexType  colamd_r(IndexType n_row)
+{ return IndexType(((n_row) + 1) * sizeof (RowStructure<IndexType>) / sizeof (IndexType)); }
 
 // Prototypes of non-user callable routines
-template <typename Index>
-static Index init_rows_cols (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> col [], Index A [], Index p [], Index stats[COLAMD_STATS] ); 
+template <typename IndexType>
+static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[NStats] );
 
-template <typename Index>
-static void init_scoring (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], double knobs[COLAMD_KNOBS], Index *p_n_row2, Index *p_n_col2, Index *p_max_deg);
+template <typename IndexType>
+static void init_scoring (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
 
-template <typename Index>
-static Index find_ordering (Index n_row, Index n_col, Index Alen, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], Index n_col2, Index max_deg, Index pfree);
+template <typename IndexType>
+static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
 
-template <typename Index>
-static void order_children (Index n_col, colamd_col<Index> Col [], Index p []);
+template <typename IndexType>
+static void order_children (IndexType n_col, ColStructure<IndexType> Col [], IndexType p []);
 
-template <typename Index>
-static void detect_super_cols (colamd_col<Index> Col [], Index A [], Index head [], Index row_start, Index row_length ) ;
+template <typename IndexType>
+static void detect_super_cols (ColStructure<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
 
-template <typename Index>
-static Index garbage_collection (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index *pfree) ;
+template <typename IndexType>
+static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType *pfree) ;
 
-template <typename Index>
-static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
+template <typename IndexType>
+static inline  IndexType clear_mark (IndexType n_row, RowStructure<IndexType> Row [] ) ;
 
 /* === No debugging ========================================================= */
 
@@ -247,37 +260,37 @@
 
 
 /**
- * \brief Returns the recommended value of Alen 
- * 
- * Returns recommended value of Alen for use by colamd.  
- * Returns -1 if any input argument is negative.  
- * The use of this routine or macro is optional.  
- * Note that the macro uses its arguments   more than once, 
- * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.  
- * 
+ * \brief Returns the recommended value of Alen
+ *
+ * Returns recommended value of Alen for use by colamd.
+ * Returns -1 if any input argument is negative.
+ * The use of this routine or macro is optional.
+ * Note that the macro uses its arguments   more than once,
+ * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.
+ *
  * \param nnz nonzeros in A
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \return recommended value of Alen for use by colamd
  */
-template <typename Index>
-inline Index colamd_recommended ( Index nnz, Index n_row, Index n_col)
+template <typename IndexType>
+inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
 {
   if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
     return (-1);
   else
-    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); 
+    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5));
 }
 
 /**
  * \brief set default parameters  The use of this routine is optional.
- * 
- * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
- * entries are removed prior to ordering.  Columns with more than
- * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
- * ordering, and placed last in the output column ordering. 
  *
- * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+ * Colamd: rows with more than (knobs [DenseRow] * n_col)
+ * entries are removed prior to ordering.  Columns with more than
+ * (knobs [DenseCol] * n_row) entries are removed prior to
+ * ordering, and placed last in the output column ordering.
+ *
+ * DenseRow and DenseCol are defined as 0 and 1,
  * respectively, in colamd.h.  Default values of these two knobs
  * are both 0.5.  Currently, only knobs [0] and knobs [1] are
  * used, but future versions may use more knobs.  If so, they will
@@ -286,37 +299,37 @@
  * not need to change, assuming that you either use
  * colamd_set_defaults, or pass a (double *) NULL pointer as the
  * knobs array to colamd or symamd.
- * 
+ *
  * \param knobs parameter settings for colamd
  */
 
-static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
+static inline void set_defaults(double knobs[NKnobs])
 {
   /* === Local variables ================================================== */
-  
+
   int i ;
 
   if (!knobs)
   {
     return ;      /* no knobs to initialize */
   }
-  for (i = 0 ; i < COLAMD_KNOBS ; i++)
+  for (i = 0 ; i < NKnobs ; i++)
   {
     knobs [i] = 0 ;
   }
-  knobs [COLAMD_DENSE_ROW] = 0.5 ;  /* ignore rows over 50% dense */
-  knobs [COLAMD_DENSE_COL] = 0.5 ;  /* ignore columns over 50% dense */
+  knobs [Colamd::DenseRow] = 0.5 ;  /* ignore rows over 50% dense */
+  knobs [Colamd::DenseCol] = 0.5 ;  /* ignore columns over 50% dense */
 }
 
-/** 
+/**
  * \brief  Computes a column ordering using the column approximate minimum degree ordering
- * 
+ *
  * Computes a column ordering (Q) of A such that P(AQ)=LU or
  * (AQ)'AQ=LL' have less fill-in and require fewer floating point
  * operations than factorizing the unpermuted matrix A or A'A,
  * respectively.
- * 
- * 
+ *
+ *
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \param Alen, size of the array A
@@ -325,144 +338,144 @@
  * \param knobs parameter settings for colamd
  * \param stats colamd output statistics and error codes
  */
-template <typename Index>
-static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, double knobs[COLAMD_KNOBS], Index stats[COLAMD_STATS])
+template <typename IndexType>
+static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats])
 {
   /* === Local variables ================================================== */
-  
-  Index i ;     /* loop index */
-  Index nnz ;     /* nonzeros in A */
-  Index Row_size ;    /* size of Row [], in integers */
-  Index Col_size ;    /* size of Col [], in integers */
-  Index need ;      /* minimum required length of A */
-  Colamd_Row<Index> *Row ;   /* pointer into A of Row [0..n_row] array */
-  colamd_col<Index> *Col ;   /* pointer into A of Col [0..n_col] array */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index ngarbage ;    /* number of garbage collections performed */
-  Index max_deg ;   /* maximum row degree */
-  double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
-  
-  
+
+  IndexType i ;     /* loop index */
+  IndexType nnz ;     /* nonzeros in A */
+  IndexType Row_size ;    /* size of Row [], in integers */
+  IndexType Col_size ;    /* size of Col [], in integers */
+  IndexType need ;      /* minimum required length of A */
+  Colamd::RowStructure<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
+  Colamd::ColStructure<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
+  IndexType n_col2 ;    /* number of non-dense, non-empty columns */
+  IndexType n_row2 ;    /* number of non-dense, non-empty rows */
+  IndexType ngarbage ;    /* number of garbage collections performed */
+  IndexType max_deg ;   /* maximum row degree */
+  double default_knobs [NKnobs] ; /* default knobs array */
+
+
   /* === Check the input arguments ======================================== */
-  
+
   if (!stats)
   {
     COLAMD_DEBUG0 (("colamd: stats not present\n")) ;
     return (false) ;
   }
-  for (i = 0 ; i < COLAMD_STATS ; i++)
+  for (i = 0 ; i < NStats ; i++)
   {
     stats [i] = 0 ;
   }
-  stats [COLAMD_STATUS] = COLAMD_OK ;
-  stats [COLAMD_INFO1] = -1 ;
-  stats [COLAMD_INFO2] = -1 ;
-  
+  stats [Colamd::Status] = Colamd::Ok ;
+  stats [Colamd::Info1] = -1 ;
+  stats [Colamd::Info2] = -1 ;
+
   if (!A)   /* A is not present */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+    stats [Colamd::Status] = Colamd::ErrorANotPresent ;
     COLAMD_DEBUG0 (("colamd: A not present\n")) ;
     return (false) ;
   }
-  
+
   if (!p)   /* p is not present */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+    stats [Colamd::Status] = Colamd::ErrorPNotPresent ;
     COLAMD_DEBUG0 (("colamd: p not present\n")) ;
     return (false) ;
   }
-  
+
   if (n_row < 0)  /* n_row must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
-    stats [COLAMD_INFO1] = n_row ;
+    stats [Colamd::Status] = Colamd::ErrorNrowNegative ;
+    stats [Colamd::Info1] = n_row ;
     COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
     return (false) ;
   }
-  
+
   if (n_col < 0)  /* n_col must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
-    stats [COLAMD_INFO1] = n_col ;
+    stats [Colamd::Status] = Colamd::ErrorNcolNegative ;
+    stats [Colamd::Info1] = n_col ;
     COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
     return (false) ;
   }
-  
+
   nnz = p [n_col] ;
   if (nnz < 0)  /* nnz must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
-    stats [COLAMD_INFO1] = nnz ;
+    stats [Colamd::Status] = Colamd::ErrorNnzNegative ;
+    stats [Colamd::Info1] = nnz ;
     COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
     return (false) ;
   }
-  
+
   if (p [0] != 0)
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
-    stats [COLAMD_INFO1] = p [0] ;
+    stats [Colamd::Status] = Colamd::ErrorP0Nonzero ;
+    stats [Colamd::Info1] = p [0] ;
     COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
     return (false) ;
   }
-  
+
   /* === If no knobs, set default knobs =================================== */
-  
+
   if (!knobs)
   {
-    colamd_set_defaults (default_knobs) ;
+    set_defaults (default_knobs) ;
     knobs = default_knobs ;
   }
-  
+
   /* === Allocate the Row and Col arrays from array A ===================== */
-  
+
   Col_size = colamd_c (n_col) ;
   Row_size = colamd_r (n_row) ;
   need = 2*nnz + n_col + Col_size + Row_size ;
-  
+
   if (need > Alen)
   {
     /* not enough space in array A to perform the ordering */
-    stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
-    stats [COLAMD_INFO1] = need ;
-    stats [COLAMD_INFO2] = Alen ;
+    stats [Colamd::Status] = Colamd::ErrorATooSmall ;
+    stats [Colamd::Info1] = need ;
+    stats [Colamd::Info2] = Alen ;
     COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
     return (false) ;
   }
-  
+
   Alen -= Col_size + Row_size ;
-  Col = (colamd_col<Index> *) &A [Alen] ;
-  Row = (Colamd_Row<Index> *) &A [Alen + Col_size] ;
+  Col = (ColStructure<IndexType> *) &A [Alen] ;
+  Row = (RowStructure<IndexType> *) &A [Alen + Col_size] ;
 
   /* === Construct the row and column data structures ===================== */
-  
-  if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+
+  if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
   {
     /* input matrix is invalid */
     COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ;
     return (false) ;
   }
-  
+
   /* === Initialize scores, kill dense rows/columns ======================= */
 
-  Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+  Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
 		&n_row2, &n_col2, &max_deg) ;
-  
+
   /* === Order the supercolumns =========================================== */
-  
-  ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+
+  ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
 			    n_col2, max_deg, 2*nnz) ;
-  
+
   /* === Order the non-principal columns ================================== */
-  
-  Eigen::internal::order_children (n_col, Col, p) ;
-  
+
+  Colamd::order_children (n_col, Col, p) ;
+
   /* === Return statistics in stats ======================================= */
-  
-  stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
-  stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
-  stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
-  COLAMD_DEBUG0 (("colamd: done.\n")) ; 
+
+  stats [Colamd::DenseRow] = n_row - n_row2 ;
+  stats [Colamd::DenseCol] = n_col - n_col2 ;
+  stats [Colamd::DefragCount] = ngarbage ;
+  COLAMD_DEBUG0 (("colamd: done.\n")) ;
   return (true) ;
 }
 
@@ -472,7 +485,6 @@
 
 /* There are no user-callable routines beyond this point in the file */
 
-
 /* ========================================================================== */
 /* === init_rows_cols ======================================================= */
 /* ========================================================================== */
@@ -485,29 +497,29 @@
   column form of the matrix.  Returns false if the matrix is invalid,
   true otherwise.  Not user-callable.
 */
-template <typename Index>
-static Index init_rows_cols  /* returns true if OK, or false otherwise */
+template <typename IndexType>
+static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* row indices of A, of size Alen */
-    Index p [],     /* pointers to columns in A, of size n_col+1 */
-    Index stats [COLAMD_STATS]  /* colamd statistics */ 
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    RowStructure<IndexType> Row [],    /* of size n_row+1 */
+    ColStructure<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* row indices of A, of size Alen */
+    IndexType p [],     /* pointers to columns in A, of size n_col+1 */
+    IndexType stats [NStats]  /* colamd statistics */
     )
 {
   /* === Local variables ================================================== */
 
-  Index col ;     /* a column index */
-  Index row ;     /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *rp ;     /* a row pointer */
-  Index *rp_end ;   /* a pointer to the end of a row */
-  Index last_row ;    /* previous row */
+  IndexType col ;     /* a column index */
+  IndexType row ;     /* a row index */
+  IndexType *cp ;     /* a column pointer */
+  IndexType *cp_end ;   /* a pointer to the end of a column */
+  IndexType *rp ;     /* a row pointer */
+  IndexType *rp_end ;   /* a pointer to the end of a row */
+  IndexType last_row ;    /* previous row */
 
   /* === Initialize columns, and check column pointers ==================== */
 
@@ -516,27 +528,27 @@
     Col [col].start = p [col] ;
     Col [col].length = p [col+1] - p [col] ;
 
-    if (Col [col].length < 0)
+    if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200
     {
       /* column pointers must be non-decreasing */
-      stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
-      stats [COLAMD_INFO1] = col ;
-      stats [COLAMD_INFO2] = Col [col].length ;
+      stats [Colamd::Status] = Colamd::ErrorColLengthNegative ;
+      stats [Colamd::Info1] = col ;
+      stats [Colamd::Info2] = Col [col].length ;
       COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
       return (false) ;
     }
 
     Col [col].shared1.thickness = 1 ;
     Col [col].shared2.score = 0 ;
-    Col [col].shared3.prev = COLAMD_EMPTY ;
-    Col [col].shared4.degree_next = COLAMD_EMPTY ;
+    Col [col].shared3.prev = Empty ;
+    Col [col].shared4.degree_next = Empty ;
   }
 
   /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
 
   /* === Scan columns, compute row degrees, and check row indices ========= */
 
-  stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
+  stats [Info3] = 0 ;  /* number of duplicate or unsorted row indices*/
 
   for (row = 0 ; row < n_row ; row++)
   {
@@ -558,10 +570,10 @@
       /* make sure row indices within range */
       if (row < 0 || row >= n_row)
       {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
-	stats [COLAMD_INFO1] = col ;
-	stats [COLAMD_INFO2] = row ;
-	stats [COLAMD_INFO3] = n_row ;
+	stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ;
+	stats [Colamd::Info1] = col ;
+	stats [Colamd::Info2] = row ;
+	stats [Colamd::Info3] = n_row ;
 	COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
 	return (false) ;
       }
@@ -570,10 +582,10 @@
       {
 	/* row index are unsorted or repeated (or both), thus col */
 	/* is jumbled.  This is a notice, not an error condition. */
-	stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
-	stats [COLAMD_INFO1] = col ;
-	stats [COLAMD_INFO2] = row ;
-	(stats [COLAMD_INFO3]) ++ ;
+	stats [Colamd::Status] = Colamd::OkButJumbled ;
+	stats [Colamd::Info1] = col ;
+	stats [Colamd::Info2] = row ;
+	(stats [Colamd::Info3]) ++ ;
 	COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
       }
 
@@ -611,7 +623,7 @@
 
   /* === Create row form ================================================== */
 
-  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+  if (stats [Status] == OkButJumbled)
   {
     /* if cols jumbled, watch for repeated row indices */
     for (col = 0 ; col < n_col ; col++)
@@ -653,7 +665,7 @@
 
   /* === See if we need to re-create columns ============================== */
 
-  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+  if (stats [Status] == OkButJumbled)
   {
     COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
 
@@ -701,46 +713,46 @@
   Kills dense or empty columns and rows, calculates an initial score for
   each column, and places all columns in the degree lists.  Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static void init_scoring
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameters */
-    Index *p_n_row2,    /* number of non-dense, non-empty rows */
-    Index *p_n_col2,    /* number of non-dense, non-empty columns */
-    Index *p_max_deg    /* maximum row degree */
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    RowStructure<IndexType> Row [],    /* of size n_row+1 */
+    ColStructure<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* column form and row form of A */
+    IndexType head [],    /* of size n_col+1 */
+    double knobs [NKnobs],/* parameters */
+    IndexType *p_n_row2,    /* number of non-dense, non-empty rows */
+    IndexType *p_n_col2,    /* number of non-dense, non-empty columns */
+    IndexType *p_max_deg    /* maximum row degree */
     )
 {
   /* === Local variables ================================================== */
 
-  Index c ;     /* a column index */
-  Index r, row ;    /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index deg ;     /* degree of a row or column */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *new_cp ;   /* new column pointer */
-  Index col_length ;    /* length of pruned column */
-  Index score ;     /* current column score */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index dense_row_count ; /* remove rows with more entries than this */
-  Index dense_col_count ; /* remove cols with more entries than this */
-  Index min_score ;   /* smallest column score */
-  Index max_deg ;   /* maximum row degree */
-  Index next_col ;    /* Used to add to degree list.*/
+  IndexType c ;     /* a column index */
+  IndexType r, row ;    /* a row index */
+  IndexType *cp ;     /* a column pointer */
+  IndexType deg ;     /* degree of a row or column */
+  IndexType *cp_end ;   /* a pointer to the end of a column */
+  IndexType *new_cp ;   /* new column pointer */
+  IndexType col_length ;    /* length of pruned column */
+  IndexType score ;     /* current column score */
+  IndexType n_col2 ;    /* number of non-dense, non-empty columns */
+  IndexType n_row2 ;    /* number of non-dense, non-empty rows */
+  IndexType dense_row_count ; /* remove rows with more entries than this */
+  IndexType dense_col_count ; /* remove cols with more entries than this */
+  IndexType min_score ;   /* smallest column score */
+  IndexType max_deg ;   /* maximum row degree */
+  IndexType next_col ;    /* Used to add to degree list.*/
 
 
   /* === Extract knobs ==================================================== */
 
-  dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
-  dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ;
+  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ;
   COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
   max_deg = 0 ;
   n_col2 = n_col ;
@@ -757,7 +769,7 @@
     {
       /* this is a empty column, kill and order it last */
       Col [c].shared2.order = --n_col2 ;
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal() ;
     }
   }
   COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
@@ -768,7 +780,7 @@
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* skip any dead columns */
-    if (COL_IS_DEAD (c))
+    if (Col[c].is_dead())
     {
       continue ;
     }
@@ -784,7 +796,7 @@
       {
 	Row [*cp++].shared1.degree-- ;
       }
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal() ;
     }
   }
   COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
@@ -798,13 +810,13 @@
     if (deg > dense_row_count || deg == 0)
     {
       /* kill a dense or empty row */
-      KILL_ROW (r) ;
+      Row[r].kill() ;
       --n_row2 ;
     }
     else
     {
       /* keep track of max degree of remaining rows */
-      max_deg = COLAMD_MAX (max_deg, deg) ;
+      max_deg = numext::maxi(max_deg, deg) ;
     }
   }
   COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
@@ -820,7 +832,7 @@
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* skip dead column */
-    if (COL_IS_DEAD (c))
+    if (Col[c].is_dead())
     {
       continue ;
     }
@@ -833,7 +845,7 @@
       /* get a row */
       row = *cp++ ;
       /* skip if dead */
-      if (ROW_IS_DEAD (row))
+      if (Row[row].is_dead())
       {
 	continue ;
       }
@@ -842,17 +854,17 @@
       /* add row's external degree */
       score += Row [row].shared1.degree - 1 ;
       /* guard against integer overflow */
-      score = COLAMD_MIN (score, n_col) ;
+      score = numext::mini(score, n_col) ;
     }
     /* determine pruned column length */
-    col_length = (Index) (new_cp - &A [Col [c].start]) ;
+    col_length = (IndexType) (new_cp - &A [Col [c].start]) ;
     if (col_length == 0)
     {
       /* a newly-made null column (all rows in this col are "dense" */
       /* and have already been killed) */
       COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ;
       Col [c].shared2.order = --n_col2 ;
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal() ;
     }
     else
     {
@@ -877,7 +889,7 @@
   /* clear the hash buckets */
   for (c = 0 ; c <= n_col ; c++)
   {
-    head [c] = COLAMD_EMPTY ;
+    head [c] = Empty ;
   }
   min_score = n_col ;
   /* place in reverse order, so low column indices are at the front */
@@ -885,7 +897,7 @@
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* only add principal columns to degree lists */
-    if (COL_IS_ALIVE (c))
+    if (Col[c].is_alive())
     {
       COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n",
 		      c, Col [c].shared2.score, min_score, n_col)) ;
@@ -898,23 +910,23 @@
       COLAMD_ASSERT (min_score <= n_col) ;
       COLAMD_ASSERT (score >= 0) ;
       COLAMD_ASSERT (score <= n_col) ;
-      COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ;
+      COLAMD_ASSERT (head [score] >= Empty) ;
 
       /* now add this column to dList at proper score location */
       next_col = head [score] ;
-      Col [c].shared3.prev = COLAMD_EMPTY ;
+      Col [c].shared3.prev = Empty ;
       Col [c].shared4.degree_next = next_col ;
 
       /* if there already was a column with the same score, set its */
       /* previous pointer to this new column */
-      if (next_col != COLAMD_EMPTY)
+      if (next_col != Empty)
       {
 	Col [next_col].shared3.prev = c ;
       }
       head [score] = c ;
 
       /* see if this score is less than current min */
-      min_score = COLAMD_MIN (min_score, score) ;
+      min_score = numext::mini(min_score, score) ;
 
 
     }
@@ -938,62 +950,62 @@
   (no supercolumns on input).  Uses a minimum approximate column minimum
   degree ordering method.  Not user-callable.
 */
-template <typename Index>
-static Index find_ordering /* return the number of garbage collections */
+template <typename IndexType>
+static IndexType find_ordering /* return the number of garbage collections */
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Index Alen,     /* size of A, 2*nnz + n_col or larger */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
-    Index n_col2,     /* Remaining columns to order */
-    Index max_deg,    /* Maximum row degree */
-    Index pfree     /* index of first free slot (2*nnz on entry) */
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    IndexType Alen,     /* size of A, 2*nnz + n_col or larger */
+    RowStructure<IndexType> Row [],    /* of size n_row+1 */
+    ColStructure<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* column form and row form of A */
+    IndexType head [],    /* of size n_col+1 */
+    IndexType n_col2,     /* Remaining columns to order */
+    IndexType max_deg,    /* Maximum row degree */
+    IndexType pfree     /* index of first free slot (2*nnz on entry) */
     )
 {
   /* === Local variables ================================================== */
 
-  Index k ;     /* current pivot ordering step */
-  Index pivot_col ;   /* current pivot column */
-  Index *cp ;     /* a column pointer */
-  Index *rp ;     /* a row pointer */
-  Index pivot_row ;   /* current pivot row */
-  Index *new_cp ;   /* modified column pointer */
-  Index *new_rp ;   /* modified row pointer */
-  Index pivot_row_start ; /* pointer to start of pivot row */
-  Index pivot_row_degree ;  /* number of columns in pivot row */
-  Index pivot_row_length ;  /* number of supercolumns in pivot row */
-  Index pivot_col_score ; /* score of pivot column */
-  Index needed_memory ;   /* free space needed for pivot row */
-  Index *cp_end ;   /* pointer to the end of a column */
-  Index *rp_end ;   /* pointer to the end of a row */
-  Index row ;     /* a row index */
-  Index col ;     /* a column index */
-  Index max_score ;   /* maximum possible score */
-  Index cur_score ;   /* score of current column */
+  IndexType k ;     /* current pivot ordering step */
+  IndexType pivot_col ;   /* current pivot column */
+  IndexType *cp ;     /* a column pointer */
+  IndexType *rp ;     /* a row pointer */
+  IndexType pivot_row ;   /* current pivot row */
+  IndexType *new_cp ;   /* modified column pointer */
+  IndexType *new_rp ;   /* modified row pointer */
+  IndexType pivot_row_start ; /* pointer to start of pivot row */
+  IndexType pivot_row_degree ;  /* number of columns in pivot row */
+  IndexType pivot_row_length ;  /* number of supercolumns in pivot row */
+  IndexType pivot_col_score ; /* score of pivot column */
+  IndexType needed_memory ;   /* free space needed for pivot row */
+  IndexType *cp_end ;   /* pointer to the end of a column */
+  IndexType *rp_end ;   /* pointer to the end of a row */
+  IndexType row ;     /* a row index */
+  IndexType col ;     /* a column index */
+  IndexType max_score ;   /* maximum possible score */
+  IndexType cur_score ;   /* score of current column */
   unsigned int hash ;   /* hash value for supernode detection */
-  Index head_column ;   /* head of hash bucket */
-  Index first_col ;   /* first column in hash bucket */
-  Index tag_mark ;    /* marker value for mark array */
-  Index row_mark ;    /* Row [row].shared2.mark */
-  Index set_difference ;  /* set difference size of row with pivot row */
-  Index min_score ;   /* smallest column score */
-  Index col_thickness ;   /* "thickness" (no. of columns in a supercol) */
-  Index max_mark ;    /* maximum value of tag_mark */
-  Index pivot_col_thickness ; /* number of columns represented by pivot col */
-  Index prev_col ;    /* Used by Dlist operations. */
-  Index next_col ;    /* Used by Dlist operations. */
-  Index ngarbage ;    /* number of garbage collections performed */
+  IndexType head_column ;   /* head of hash bucket */
+  IndexType first_col ;   /* first column in hash bucket */
+  IndexType tag_mark ;    /* marker value for mark array */
+  IndexType row_mark ;    /* Row [row].shared2.mark */
+  IndexType set_difference ;  /* set difference size of row with pivot row */
+  IndexType min_score ;   /* smallest column score */
+  IndexType col_thickness ;   /* "thickness" (no. of columns in a supercol) */
+  IndexType max_mark ;    /* maximum value of tag_mark */
+  IndexType pivot_col_thickness ; /* number of columns represented by pivot col */
+  IndexType prev_col ;    /* Used by Dlist operations. */
+  IndexType next_col ;    /* Used by Dlist operations. */
+  IndexType ngarbage ;    /* number of garbage collections performed */
 
 
   /* === Initialization and clear mark ==================================== */
 
   max_mark = INT_MAX - n_col ;  /* INT_MAX defined in <limits.h> */
-  tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+  tag_mark = Colamd::clear_mark (n_row, Row) ;
   min_score = 0 ;
   ngarbage = 0 ;
   COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
@@ -1008,10 +1020,10 @@
     /* make sure degree list isn't empty */
     COLAMD_ASSERT (min_score >= 0) ;
     COLAMD_ASSERT (min_score <= n_col) ;
-    COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
+    COLAMD_ASSERT (head [min_score] >= Empty) ;
 
     /* get pivot column from head of minimum degree list */
-    while (head [min_score] == COLAMD_EMPTY && min_score < n_col)
+    while (min_score < n_col && head [min_score] == Empty)
     {
       min_score++ ;
     }
@@ -1019,12 +1031,12 @@
     COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
     next_col = Col [pivot_col].shared4.degree_next ;
     head [min_score] = next_col ;
-    if (next_col != COLAMD_EMPTY)
+    if (next_col != Empty)
     {
-      Col [next_col].shared3.prev = COLAMD_EMPTY ;
+      Col [next_col].shared3.prev = Empty ;
     }
 
-    COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ;
+    COLAMD_ASSERT (Col[pivot_col].is_alive()) ;
     COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
 
     /* remember score for defrag check */
@@ -1040,15 +1052,15 @@
 
     /* === Garbage_collection, if necessary ============================= */
 
-    needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ;
+    needed_memory = numext::mini(pivot_col_score, n_col - k) ;
     if (pfree + needed_memory >= Alen)
     {
-      pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+      pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
       ngarbage++ ;
       /* after garbage collection we will have enough */
       COLAMD_ASSERT (pfree + needed_memory < Alen) ;
       /* garbage collection has wiped out the Row[].shared2.mark array */
-      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+      tag_mark = Colamd::clear_mark (n_row, Row) ;
 
     }
 
@@ -1071,9 +1083,9 @@
     {
       /* get a row */
       row = *cp++ ;
-      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ;
       /* skip if row is dead */
-      if (ROW_IS_DEAD (row))
+      if (Row[row].is_dead())
       {
 	continue ;
       }
@@ -1085,7 +1097,7 @@
 	col = *rp++ ;
 	/* add the column, if alive and untagged */
 	col_thickness = Col [col].shared1.thickness ;
-	if (col_thickness > 0 && COL_IS_ALIVE (col))
+	if (col_thickness > 0 && Col[col].is_alive())
 	{
 	  /* tag column in pivot row */
 	  Col [col].shared1.thickness = -col_thickness ;
@@ -1099,7 +1111,7 @@
 
     /* clear tag on pivot column */
     Col [pivot_col].shared1.thickness = pivot_col_thickness ;
-    max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ;
+    max_deg = numext::maxi(max_deg, pivot_row_degree) ;
 
 
     /* === Kill all rows used to construct pivot row ==================== */
@@ -1112,7 +1124,7 @@
       /* may be killing an already dead row */
       row = *cp++ ;
       COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
-      KILL_ROW (row) ;
+      Row[row].kill() ;
     }
 
     /* === Select a row index to use as the new pivot row =============== */
@@ -1127,7 +1139,7 @@
     else
     {
       /* there is no pivot row, since it is of zero length */
-      pivot_row = COLAMD_EMPTY ;
+      pivot_row = Empty ;
       COLAMD_ASSERT (pivot_row_length == 0) ;
     }
     COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
@@ -1164,7 +1176,7 @@
     while (rp < rp_end)
     {
       col = *rp++ ;
-      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+      COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ;
       COLAMD_DEBUG3 (("Col: %d\n", col)) ;
 
       /* clear tags used to construct pivot row pattern */
@@ -1179,8 +1191,8 @@
       next_col = Col [col].shared4.degree_next ;
       COLAMD_ASSERT (cur_score >= 0) ;
       COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ;
-      if (prev_col == COLAMD_EMPTY)
+      COLAMD_ASSERT (cur_score >= Empty) ;
+      if (prev_col == Empty)
       {
 	head [cur_score] = next_col ;
       }
@@ -1188,7 +1200,7 @@
       {
 	Col [prev_col].shared4.degree_next = next_col ;
       }
-      if (next_col != COLAMD_EMPTY)
+      if (next_col != Empty)
       {
 	Col [next_col].shared3.prev = prev_col ;
       }
@@ -1201,12 +1213,12 @@
       {
 	/* get a row */
 	row = *cp++ ;
-	row_mark = Row [row].shared2.mark ;
 	/* skip if dead */
-	if (ROW_IS_MARKED_DEAD (row_mark))
+	if (Row[row].is_dead())
 	{
 	  continue ;
 	}
+  row_mark = Row [row].shared2.mark ;
 	COLAMD_ASSERT (row != pivot_row) ;
 	set_difference = row_mark - tag_mark ;
 	/* check if the row has been seen yet */
@@ -1222,7 +1234,7 @@
 	if (set_difference == 0)
 	{
 	  COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
-	  KILL_ROW (row) ;
+	  Row[row].kill() ;
 	}
 	else
 	{
@@ -1244,7 +1256,7 @@
     {
       /* get a column */
       col = *rp++ ;
-      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+      COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ;
       hash = 0 ;
       cur_score = 0 ;
       cp = &A [Col [col].start] ;
@@ -1259,12 +1271,12 @@
 	/* get a row */
 	row = *cp++ ;
 	COLAMD_ASSERT(row >= 0 && row < n_row) ;
-	row_mark = Row [row].shared2.mark ;
 	/* skip if dead */
-	if (ROW_IS_MARKED_DEAD (row_mark))
+	if (Row [row].is_dead())
 	{
 	  continue ;
 	}
+  row_mark = Row [row].shared2.mark ;
 	COLAMD_ASSERT (row_mark > tag_mark) ;
 	/* compact the column */
 	*new_cp++ = row ;
@@ -1273,11 +1285,11 @@
 	/* add set difference */
 	cur_score += row_mark - tag_mark ;
 	/* integer overflow... */
-	cur_score = COLAMD_MIN (cur_score, n_col) ;
+	cur_score = numext::mini(cur_score, n_col) ;
       }
 
       /* recompute the column's length */
-      Col [col].length = (Index) (new_cp - &A [Col [col].start]) ;
+      Col [col].length = (IndexType) (new_cp - &A [Col [col].start]) ;
 
       /* === Further mass elimination ================================= */
 
@@ -1285,7 +1297,7 @@
       {
 	COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
 	/* nothing left but the pivot row in this column */
-	KILL_PRINCIPAL_COL (col) ;
+	Col[col].kill_principal() ;
 	pivot_row_degree -= Col [col].shared1.thickness ;
 	COLAMD_ASSERT (pivot_row_degree >= 0) ;
 	/* order it */
@@ -1309,7 +1321,7 @@
 	COLAMD_ASSERT (hash <= n_col) ;
 
 	head_column = head [hash] ;
-	if (head_column > COLAMD_EMPTY)
+	if (head_column > Empty)
 	{
 	  /* degree list "hash" is non-empty, use prev (shared3) of */
 	  /* first column in degree list as head of hash bucket */
@@ -1325,8 +1337,8 @@
 	Col [col].shared4.hash_next = first_col ;
 
 	/* save hash function in Col [col].shared3.hash */
-	Col [col].shared3.hash = (Index) hash ;
-	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
+	Col [col].shared3.hash = (IndexType) hash ;
+	COLAMD_ASSERT (Col[col].is_alive()) ;
       }
     }
 
@@ -1336,11 +1348,11 @@
 
     COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ;
 
-    Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
+    Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
 
     /* === Kill the pivotal column ====================================== */
 
-    KILL_PRINCIPAL_COL (pivot_col) ;
+    Col[pivot_col].kill_principal() ;
 
     /* === Clear mark =================================================== */
 
@@ -1348,7 +1360,7 @@
     if (tag_mark >= max_mark)
     {
       COLAMD_DEBUG2 (("clearing tag_mark\n")) ;
-      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+      tag_mark = Colamd::clear_mark (n_row, Row) ;
     }
 
     /* === Finalize the new pivot row, and column scores ================ */
@@ -1364,7 +1376,7 @@
     {
       col = *rp++ ;
       /* skip dead columns */
-      if (COL_IS_DEAD (col))
+      if (Col[col].is_dead())
       {
 	continue ;
       }
@@ -1386,7 +1398,7 @@
       cur_score -= Col [col].shared1.thickness ;
 
       /* make sure score is less or equal than the max score */
-      cur_score = COLAMD_MIN (cur_score, max_score) ;
+      cur_score = numext::mini(cur_score, max_score) ;
       COLAMD_ASSERT (cur_score >= 0) ;
 
       /* store updated score */
@@ -1398,18 +1410,18 @@
       COLAMD_ASSERT (min_score <= n_col) ;
       COLAMD_ASSERT (cur_score >= 0) ;
       COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ;
+      COLAMD_ASSERT (head [cur_score] >= Empty) ;
       next_col = head [cur_score] ;
       Col [col].shared4.degree_next = next_col ;
-      Col [col].shared3.prev = COLAMD_EMPTY ;
-      if (next_col != COLAMD_EMPTY)
+      Col [col].shared3.prev = Empty ;
+      if (next_col != Empty)
       {
 	Col [next_col].shared3.prev = col ;
       }
       head [cur_score] = col ;
 
       /* see if this score is less than current min */
-      min_score = COLAMD_MIN (min_score, cur_score) ;
+      min_score = numext::mini(min_score, cur_score) ;
 
     }
 
@@ -1420,7 +1432,7 @@
       /* update pivot row length to reflect any cols that were killed */
       /* during super-col detection and mass elimination */
       Row [pivot_row].start  = pivot_row_start ;
-      Row [pivot_row].length = (Index) (new_rp - &A[pivot_row_start]) ;
+      Row [pivot_row].length = (IndexType) (new_rp - &A[pivot_row_start]) ;
       Row [pivot_row].shared1.degree = pivot_row_degree ;
       Row [pivot_row].shared2.mark = 0 ;
       /* pivot row is no longer dead */
@@ -1449,37 +1461,37 @@
   taken by this routine is O (n_col), that is, linear in the number of
   columns.  Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static inline  void order_children
 (
   /* === Parameters ======================================================= */
 
-  Index n_col,      /* number of columns of A */
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index p []      /* p [0 ... n_col-1] is the column permutation*/
+  IndexType n_col,      /* number of columns of A */
+  ColStructure<IndexType> Col [],    /* of size n_col+1 */
+  IndexType p []      /* p [0 ... n_col-1] is the column permutation*/
   )
 {
   /* === Local variables ================================================== */
 
-  Index i ;     /* loop counter for all columns */
-  Index c ;     /* column index */
-  Index parent ;    /* index of column's parent */
-  Index order ;     /* column's order */
+  IndexType i ;     /* loop counter for all columns */
+  IndexType c ;     /* column index */
+  IndexType parent ;    /* index of column's parent */
+  IndexType order ;     /* column's order */
 
   /* === Order each non-principal column ================================== */
 
   for (i = 0 ; i < n_col ; i++)
   {
     /* find an un-ordered non-principal column */
-    COLAMD_ASSERT (COL_IS_DEAD (i)) ;
-    if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY)
+    COLAMD_ASSERT (col_is_dead(Col, i)) ;
+    if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty)
     {
       parent = i ;
       /* once found, find its principal parent */
       do
       {
 	parent = Col [parent].shared1.parent ;
-      } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+      } while (!Col[parent].is_dead_principal()) ;
 
       /* now, order all un-ordered non-principal columns along path */
       /* to this parent.  collapse tree at the same time */
@@ -1489,7 +1501,7 @@
 
       do
       {
-	COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ;
+	COLAMD_ASSERT (Col [c].shared2.order == Empty) ;
 
 	/* order this column */
 	Col [c].shared2.order = order++ ;
@@ -1500,9 +1512,9 @@
 	c = Col [c].shared1.parent ;
 
 	/* continue until we hit an ordered column.  There are */
-	/* guarranteed not to be anymore unordered columns */
+	/* guaranteed not to be anymore unordered columns */
 	/* above an ordered column */
-      } while (Col [c].shared2.order == COLAMD_EMPTY) ;
+      } while (Col [c].shared2.order == Empty) ;
 
       /* re-order the super_col parent to largest order for this group */
       Col [parent].shared2.order = order ;
@@ -1550,33 +1562,33 @@
   just been computed in the approximate degree computation.
   Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static void detect_super_cols
 (
   /* === Parameters ======================================================= */
-  
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index A [],     /* row indices of A */
-  Index head [],    /* head of degree lists and hash buckets */
-  Index row_start,    /* pointer to set of columns to check */
-  Index row_length    /* number of columns to check */
+
+  ColStructure<IndexType> Col [],    /* of size n_col+1 */
+  IndexType A [],     /* row indices of A */
+  IndexType head [],    /* head of degree lists and hash buckets */
+  IndexType row_start,    /* pointer to set of columns to check */
+  IndexType row_length    /* number of columns to check */
 )
 {
   /* === Local variables ================================================== */
 
-  Index hash ;      /* hash value for a column */
-  Index *rp ;     /* pointer to a row */
-  Index c ;     /* a column index */
-  Index super_c ;   /* column index of the column to absorb into */
-  Index *cp1 ;      /* column pointer for column super_c */
-  Index *cp2 ;      /* column pointer for column c */
-  Index length ;    /* length of column super_c */
-  Index prev_c ;    /* column preceding c in hash bucket */
-  Index i ;     /* loop counter */
-  Index *rp_end ;   /* pointer to the end of the row */
-  Index col ;     /* a column index in the row to check */
-  Index head_column ;   /* first column in hash bucket or degree list */
-  Index first_col ;   /* first column in hash bucket */
+  IndexType hash ;      /* hash value for a column */
+  IndexType *rp ;     /* pointer to a row */
+  IndexType c ;     /* a column index */
+  IndexType super_c ;   /* column index of the column to absorb into */
+  IndexType *cp1 ;      /* column pointer for column super_c */
+  IndexType *cp2 ;      /* column pointer for column c */
+  IndexType length ;    /* length of column super_c */
+  IndexType prev_c ;    /* column preceding c in hash bucket */
+  IndexType i ;     /* loop counter */
+  IndexType *rp_end ;   /* pointer to the end of the row */
+  IndexType col ;     /* a column index in the row to check */
+  IndexType head_column ;   /* first column in hash bucket or degree list */
+  IndexType first_col ;   /* first column in hash bucket */
 
   /* === Consider each column in the row ================================== */
 
@@ -1585,7 +1597,7 @@
   while (rp < rp_end)
   {
     col = *rp++ ;
-    if (COL_IS_DEAD (col))
+    if (Col[col].is_dead())
     {
       continue ;
     }
@@ -1597,7 +1609,7 @@
     /* === Get the first column in this hash bucket ===================== */
 
     head_column = head [hash] ;
-    if (head_column > COLAMD_EMPTY)
+    if (head_column > Empty)
     {
       first_col = Col [head_column].shared3.headhash ;
     }
@@ -1608,10 +1620,10 @@
 
     /* === Consider each column in the hash bucket ====================== */
 
-    for (super_c = first_col ; super_c != COLAMD_EMPTY ;
+    for (super_c = first_col ; super_c != Empty ;
 	 super_c = Col [super_c].shared4.hash_next)
     {
-      COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ;
+      COLAMD_ASSERT (Col [super_c].is_alive()) ;
       COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ;
       length = Col [super_c].length ;
 
@@ -1621,10 +1633,10 @@
       /* === Compare super_c with all columns after it ================ */
 
       for (c = Col [super_c].shared4.hash_next ;
-	   c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next)
+	   c != Empty ; c = Col [c].shared4.hash_next)
       {
 	COLAMD_ASSERT (c != super_c) ;
-	COLAMD_ASSERT (COL_IS_ALIVE (c)) ;
+	COLAMD_ASSERT (Col[c].is_alive()) ;
 	COLAMD_ASSERT (Col [c].shared3.hash == hash) ;
 
 	/* not identical if lengths or scores are different */
@@ -1642,10 +1654,10 @@
 	for (i = 0 ; i < length ; i++)
 	{
 	  /* the columns are "clean" (no dead rows) */
-	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp1))  ;
-	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp2))  ;
+	  COLAMD_ASSERT ( cp1->is_alive() );
+	  COLAMD_ASSERT ( cp2->is_alive() );
 	  /* row indices will same order for both supercols, */
-	  /* no gather scatter nessasary */
+	  /* no gather scatter necessary */
 	  if (*cp1++ != *cp2++)
 	  {
 	    break ;
@@ -1665,9 +1677,9 @@
 
 	Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
 	Col [c].shared1.parent = super_c ;
-	KILL_NON_PRINCIPAL_COL (c) ;
+	Col[c].kill_non_principal() ;
 	/* order c later, in order_children() */
-	Col [c].shared2.order = COLAMD_EMPTY ;
+	Col [c].shared2.order = Empty ;
 	/* remove c from hash bucket */
 	Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
       }
@@ -1675,15 +1687,15 @@
 
     /* === Empty this hash bucket ======================================= */
 
-    if (head_column > COLAMD_EMPTY)
+    if (head_column > Empty)
     {
       /* corresponding degree list "hash" is not empty */
-      Col [head_column].shared3.headhash = COLAMD_EMPTY ;
+      Col [head_column].shared3.headhash = Empty ;
     }
     else
     {
       /* corresponding degree list "hash" is empty */
-      head [hash] = COLAMD_EMPTY ;
+      head [hash] = Empty ;
     }
   }
 }
@@ -1695,56 +1707,56 @@
 
 /*
   Defragments and compacts columns and rows in the workspace A.  Used when
-  all avaliable memory has been used while performing row merging.  Returns
+  all available memory has been used while performing row merging.  Returns
   the index of the first free position in A, after garbage collection.  The
   time taken by this routine is linear is the size of the array A, which is
   itself linear in the number of nonzeros in the input matrix.
   Not user-callable.
 */
-template <typename Index>
-static Index garbage_collection  /* returns the new value of pfree */
+template <typename IndexType>
+static IndexType garbage_collection  /* returns the new value of pfree */
   (
     /* === Parameters ======================================================= */
-    
-    Index n_row,      /* number of rows */
-    Index n_col,      /* number of columns */
-    Colamd_Row<Index> Row [],    /* row info */
-    colamd_col<Index> Col [],    /* column info */
-    Index A [],     /* A [0 ... Alen-1] holds the matrix */
-    Index *pfree      /* &A [0] ... pfree is in use */
+
+    IndexType n_row,      /* number of rows */
+    IndexType n_col,      /* number of columns */
+    RowStructure<IndexType> Row [],    /* row info */
+    ColStructure<IndexType> Col [],    /* column info */
+    IndexType A [],     /* A [0 ... Alen-1] holds the matrix */
+    IndexType *pfree      /* &A [0] ... pfree is in use */
     )
 {
   /* === Local variables ================================================== */
 
-  Index *psrc ;     /* source pointer */
-  Index *pdest ;    /* destination pointer */
-  Index j ;     /* counter */
-  Index r ;     /* a row index */
-  Index c ;     /* a column index */
-  Index length ;    /* length of a row or column */
+  IndexType *psrc ;     /* source pointer */
+  IndexType *pdest ;    /* destination pointer */
+  IndexType j ;     /* counter */
+  IndexType r ;     /* a row index */
+  IndexType c ;     /* a column index */
+  IndexType length ;    /* length of a row or column */
 
   /* === Defragment the columns =========================================== */
 
   pdest = &A[0] ;
   for (c = 0 ; c < n_col ; c++)
   {
-    if (COL_IS_ALIVE (c))
+    if (Col[c].is_alive())
     {
       psrc = &A [Col [c].start] ;
 
       /* move and compact the column */
       COLAMD_ASSERT (pdest <= psrc) ;
-      Col [c].start = (Index) (pdest - &A [0]) ;
+      Col [c].start = (IndexType) (pdest - &A [0]) ;
       length = Col [c].length ;
       for (j = 0 ; j < length ; j++)
       {
 	r = *psrc++ ;
-	if (ROW_IS_ALIVE (r))
+	if (Row[r].is_alive())
 	{
 	  *pdest++ = r ;
 	}
       }
-      Col [c].length = (Index) (pdest - &A [Col [c].start]) ;
+      Col [c].length = (IndexType) (pdest - &A [Col [c].start]) ;
     }
   }
 
@@ -1752,22 +1764,22 @@
 
   for (r = 0 ; r < n_row ; r++)
   {
-    if (ROW_IS_ALIVE (r))
+    if (Row[r].is_alive())
     {
       if (Row [r].length == 0)
       {
-	/* this row is of zero length.  cannot compact it, so kill it */
-	COLAMD_DEBUG3 (("Defrag row kill\n")) ;
-	KILL_ROW (r) ;
+        /* this row is of zero length.  cannot compact it, so kill it */
+        COLAMD_DEBUG3 (("Defrag row kill\n")) ;
+        Row[r].kill() ;
       }
       else
       {
-	/* save first column index in Row [r].shared2.first_column */
-	psrc = &A [Row [r].start] ;
-	Row [r].shared2.first_column = *psrc ;
-	COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
-	/* flag the start of the row with the one's complement of row */
-	*psrc = ONES_COMPLEMENT (r) ;
+        /* save first column index in Row [r].shared2.first_column */
+        psrc = &A [Row [r].start] ;
+        Row [r].shared2.first_column = *psrc ;
+        COLAMD_ASSERT (Row[r].is_alive()) ;
+        /* flag the start of the row with the one's complement of row */
+        *psrc = ones_complement(r) ;
 
       }
     }
@@ -1783,25 +1795,25 @@
     {
       psrc-- ;
       /* get the row index */
-      r = ONES_COMPLEMENT (*psrc) ;
+      r = ones_complement(*psrc) ;
       COLAMD_ASSERT (r >= 0 && r < n_row) ;
       /* restore first column index */
       *psrc = Row [r].shared2.first_column ;
-      COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
+      COLAMD_ASSERT (Row[r].is_alive()) ;
 
       /* move and compact the row */
       COLAMD_ASSERT (pdest <= psrc) ;
-      Row [r].start = (Index) (pdest - &A [0]) ;
+      Row [r].start = (IndexType) (pdest - &A [0]) ;
       length = Row [r].length ;
       for (j = 0 ; j < length ; j++)
       {
 	c = *psrc++ ;
-	if (COL_IS_ALIVE (c))
+	if (Col[c].is_alive())
 	{
 	  *pdest++ = c ;
 	}
       }
-      Row [r].length = (Index) (pdest - &A [Row [r].start]) ;
+      Row [r].length = (IndexType) (pdest - &A [Row [r].start]) ;
 
     }
   }
@@ -1810,7 +1822,7 @@
 
   /* === Return the new value of pfree ==================================== */
 
-  return ((Index) (pdest - &A [0])) ;
+  return ((IndexType) (pdest - &A [0])) ;
 }
 
 
@@ -1822,22 +1834,22 @@
   Clears the Row [].shared2.mark array, and returns the new tag_mark.
   Return value is the new tag_mark.  Not user-callable.
 */
-template <typename Index>
-static inline  Index clear_mark  /* return the new value for tag_mark */
+template <typename IndexType>
+static inline  IndexType clear_mark  /* return the new value for tag_mark */
   (
       /* === Parameters ======================================================= */
 
-    Index n_row,    /* number of rows in A */
-    Colamd_Row<Index> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    IndexType n_row,    /* number of rows in A */
+    RowStructure<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
     )
 {
   /* === Local variables ================================================== */
 
-  Index r ;
+  IndexType r ;
 
   for (r = 0 ; r < n_row ; r++)
   {
-    if (ROW_IS_ALIVE (r))
+    if (Row[r].is_alive())
     {
       Row [r].shared2.mark = 0 ;
     }
@@ -1845,6 +1857,7 @@
   return (1) ;
 }
 
+} // namespace Colamd
 
-} // namespace internal 
+} // namespace internal
 #endif

diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h
index 4e06097..c578970 100644
--- a/Eigen/src/OrderingMethods/Ordering.h
+++ b/Eigen/src/OrderingMethods/Ordering.h

@@ -19,39 +19,38 @@
     
 /** \internal
   * \ingroup OrderingMethods_Module
-  * \returns the symmetric pattern A^T+A from the input matrix A. 
+  * \param[in] A the input non-symmetric matrix
+  * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A.
   * FIXME: The values should not be considered here
   */
 template<typename MatrixType> 
-void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
+void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat)
 {
   MatrixType C;
-  C = mat.transpose(); // NOTE: Could be  costly
+  C = A.transpose(); // NOTE: Could be  costly
   for (int i = 0; i < C.rows(); i++) 
   {
       for (typename MatrixType::InnerIterator it(C, i); it; ++it)
-        it.valueRef() = 0.0;
+        it.valueRef() = typename MatrixType::Scalar(0);
   }
-  symmat = C + mat; 
+  symmat = C + A;
 }
     
 }
 
-#ifndef EIGEN_MPL2_ONLY
-
 /** \ingroup OrderingMethods_Module
   * \class AMDOrdering
   *
   * Functor computing the \em approximate \em minimum \em degree ordering
   * If the matrix is not structurally symmetric, an ordering of A^T+A is computed
-  * \tparam  Index The type of indices of the matrix 
+  * \tparam  StorageIndex The type of indices of the matrix 
   * \sa COLAMDOrdering
   */
-template <typename Index>
+template <typename StorageIndex>
 class AMDOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a sparse matrix
      * This routine is much faster if the input matrix is column-major     
@@ -60,7 +59,7 @@
     void operator()(const MatrixType& mat, PermutationType& perm)
     {
       // Compute the symmetric pattern
-      SparseMatrix<typename MatrixType::Scalar, ColMajor, Index> symm;
+      SparseMatrix<typename MatrixType::Scalar, ColMajor, StorageIndex> symm;
       internal::ordering_helper_at_plus_a(mat,symm); 
     
       // Call the AMD routine 
@@ -72,7 +71,7 @@
     template <typename SrcType, unsigned int SrcUpLo> 
     void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm)
     { 
-      SparseMatrix<typename SrcType::Scalar, ColMajor, Index> C; C = mat;
+      SparseMatrix<typename SrcType::Scalar, ColMajor, StorageIndex> C; C = mat;
       
       // Call the AMD routine 
       // m_mat.prune(keep_diag()); //Remove the diagonal elements 
@@ -80,21 +79,19 @@
     }
 };
 
-#endif // EIGEN_MPL2_ONLY
-
 /** \ingroup OrderingMethods_Module
   * \class NaturalOrdering
   *
   * Functor computing the natural ordering (identity)
   * 
   * \note Returns an empty permutation matrix
-  * \tparam  Index The type of indices of the matrix 
+  * \tparam  StorageIndex The type of indices of the matrix 
   */
-template <typename Index>
+template <typename StorageIndex>
 class NaturalOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a column-major sparse matrix */
     template <typename MatrixType>
@@ -108,15 +105,17 @@
 /** \ingroup OrderingMethods_Module
   * \class COLAMDOrdering
   *
+  * \tparam  StorageIndex The type of indices of the matrix 
+  * 
   * Functor computing the \em column \em approximate \em minimum \em degree ordering 
   * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
   */
-template<typename Index>
+template<typename StorageIndex>
 class COLAMDOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; 
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType; 
+    typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
     
     /** Compute the permutation vector \a perm form the sparse matrix \a mat
       * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
@@ -126,26 +125,26 @@
     {
       eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
       
-      Index m = mat.rows();
-      Index n = mat.cols();
-      Index nnz = mat.nonZeros();
+      StorageIndex m = StorageIndex(mat.rows());
+      StorageIndex n = StorageIndex(mat.cols());
+      StorageIndex nnz = StorageIndex(mat.nonZeros());
       // Get the recommended value of Alen to be used by colamd
-      Index Alen = internal::colamd_recommended(nnz, m, n); 
+      StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); 
       // Set the default parameters
-      double knobs [COLAMD_KNOBS]; 
-      Index stats [COLAMD_STATS];
-      internal::colamd_set_defaults(knobs);
+      double knobs [internal::Colamd::NKnobs]; 
+      StorageIndex stats [internal::Colamd::NStats];
+      internal::Colamd::set_defaults(knobs);
       
-      Index info;
       IndexVector p(n+1), A(Alen); 
-      for(Index i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
-      for(Index i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
+      for(StorageIndex i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
+      for(StorageIndex i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
       // Call Colamd routine to compute the ordering 
-      info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      EIGEN_UNUSED_VARIABLE(info);
       eigen_assert( info && "COLAMD failed " );
       
       perm.resize(n);
-      for (Index i = 0; i < n; i++) perm.indices()(p(i)) = i;
+      for (StorageIndex i = 0; i < n; i++) perm.indices()(p(i)) = i;
     }
 };
 

diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h
index 8a546dc..3742687 100644
--- a/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h

@@ -43,7 +43,7 @@
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -52,7 +52,7 @@
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -61,31 +61,31 @@
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
   
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
+  inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
     s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); 
   }
   
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm)
+  inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
     d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); 
   }
   
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<float> *vals, int *perm, int * invp, std::complex<float> *x, int nbrhs, int *iparm, double *dparm)
+  inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<float> *vals, int *perm, int * invp, std::complex<float> *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
     c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_COMPLEX*>(vals), perm, invp, reinterpret_cast<PASTIX_COMPLEX*>(x), nbrhs, iparm, dparm); 
   }
   
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
+  inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
@@ -125,20 +125,30 @@
 // This is the base class to interface with PaStiX functions. 
 // Users should not used this class directly. 
 template <class Derived>
-class PastixBase : internal::noncopyable
+class PastixBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
+    
     typedef typename internal::pastix_traits<Derived>::MatrixType _MatrixType;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
     
   public:
     
-    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false), m_pastixdata(0), m_size(0)
+    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_pastixdata(0), m_size(0)
     {
       init();
     }
@@ -147,39 +157,16 @@
     {
       clean();
     }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PastixBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
     
     template<typename Rhs,typename Dest>
-    bool _solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
+    bool _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
     
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
-
     /** Returns a reference to the integer vector IPARM of PaStiX parameters
       * to modify the default parameters. 
       * The statistics related to the different phases of factorization and solve are saved here as well
       * \sa analyzePattern() factorize()
       */
-    Array<Index,IPARM_SIZE,1>& iparm()
+    Array<StorageIndex,IPARM_SIZE,1>& iparm()
     {
       return m_iparm; 
     }
@@ -197,7 +184,7 @@
       * The statistics related to the different phases of factorization and solve are saved here as well
       * \sa analyzePattern() factorize()
       */
-    Array<RealScalar,IPARM_SIZE,1>& dparm()
+    Array<double,DPARM_SIZE,1>& dparm()
     {
       return m_dparm; 
     }
@@ -216,7 +203,7 @@
     
      /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the PaStiX reports a problem
       *          \c InvalidInput if the input matrix is invalid
       *
@@ -228,20 +215,6 @@
       return m_info;
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PastixBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix LU, LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
-    
   protected:
 
     // Initialize the Pastix data structure, check the matrix
@@ -268,14 +241,13 @@
     int m_initisOk; 
     int m_analysisIsOk;
     int m_factorizationIsOk;
-    bool m_isInitialized;
     mutable ComputationInfo m_info; 
     mutable pastix_data_t *m_pastixdata; // Data structure for pastix
     mutable int m_comm; // The MPI communicator identifier
-    mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
-    mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
-    mutable Matrix<Index,Dynamic,1> m_perm;  // Permutation vector
-    mutable Matrix<Index,Dynamic,1> m_invp;  // Inverse permutation vector
+    mutable Array<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
+    mutable Array<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
+    mutable Matrix<StorageIndex,Dynamic,1> m_perm;  // Permutation vector
+    mutable Matrix<StorageIndex,Dynamic,1> m_invp;  // Inverse permutation vector
     mutable int m_size; // Size of the matrix 
 }; 
 
@@ -296,7 +268,7 @@
          0, 0, 0, 1, m_iparm.data(), m_dparm.data());
   
   m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO;
-  m_iparm[IPARM_VERBOSE]             = 2;
+  m_iparm[IPARM_VERBOSE]             = API_VERBOSE_NOT;
   m_iparm[IPARM_ORDERING]            = API_ORDER_SCOTCH;
   m_iparm[IPARM_INCOMPLETE]          = API_NO;
   m_iparm[IPARM_OOC_LIMIT]           = 2000;
@@ -328,7 +300,6 @@
   factorize(mat);
   
   m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
-  m_isInitialized = m_factorizationIsOk;
 }
 
 
@@ -341,7 +312,7 @@
   if(m_size>0)
     clean();
   
-  m_size = mat.rows();
+  m_size = internal::convert_index<int>(mat.rows());
   m_perm.resize(m_size);
   m_invp.resize(m_size);
   
@@ -370,7 +341,7 @@
   eigen_assert(m_analysisIsOk && "The analysis phase should be called before the factorization phase");
   m_iparm(IPARM_START_TASK) = API_TASK_NUMFACT;
   m_iparm(IPARM_END_TASK) = API_TASK_NUMFACT;
-  m_size = mat.rows();
+  m_size = internal::convert_index<int>(mat.rows());
   
   internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
                mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
@@ -393,7 +364,7 @@
 /* Solve the system */
 template<typename Base>
 template<typename Rhs,typename Dest>
-bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
+bool PastixBase<Base>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
 {
   eigen_assert(m_isInitialized && "The matrix should be factorized first");
   EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
@@ -406,7 +377,7 @@
     m_iparm[IPARM_START_TASK]          = API_TASK_SOLVE;
     m_iparm[IPARM_END_TASK]            = API_TASK_REFINE;
   
-    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, x.rows(), 0, 0, 0,
+    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, internal::convert_index<int>(x.rows()), 0, 0, 0,
                            m_perm.data(), m_invp.data(), &x(0, i), rhs, m_iparm.data(), m_dparm.data());
   }
   
@@ -431,8 +402,10 @@
   * NOTE : Note that if the analysis and factorization phase are called separately, 
   * the input matrix will be symmetrized at each call, hence it is advised to 
   * symmetrize the matrix in a end-user program and set \p IsStrSym to true
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   * 
   */
 template<typename _MatrixType, bool IsStrSym>
@@ -442,7 +415,7 @@
     typedef _MatrixType MatrixType;
     typedef PastixBase<PastixLU<MatrixType> > Base;
     typedef typename Base::ColSpMatrix ColSpMatrix;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     
   public:
     PastixLU() : Base()
@@ -450,7 +423,7 @@
       init();
     }
     
-    PastixLU(const MatrixType& matrix):Base()
+    explicit PastixLU(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -542,8 +515,10 @@
   * 
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT
   */
 template<typename _MatrixType, int _UpLo>
 class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
@@ -560,7 +535,7 @@
       init();
     }
     
-    PastixLLT(const MatrixType& matrix):Base()
+    explicit PastixLLT(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -606,6 +581,7 @@
     
     void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
     {
+      out.resize(matrix.rows(), matrix.cols());
       // Pastix supports only lower, column-major matrices 
       out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
       internal::c_to_fortran_numbering(out);
@@ -623,8 +599,10 @@
   * 
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT
   */
 template<typename _MatrixType, int _UpLo>
 class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
@@ -641,7 +619,7 @@
       init();
     }
     
-    PastixLDLT(const MatrixType& matrix):Base()
+    explicit PastixLDLT(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -689,41 +667,12 @@
     void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
     {
       // Pastix supports only lower, column-major matrices 
+      out.resize(matrix.rows(), matrix.cols());
       out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
       internal::c_to_fortran_numbering(out);
     }
 };
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PastixBase<_MatrixType>, Rhs>
-  : solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<PastixBase<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif

diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index b657106..f89b79b 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h

@@ -40,13 +40,13 @@
 
 namespace internal
 {
-  template<typename Index>
+  template<typename IndexType>
   struct pardiso_run_selector
   {
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
+                      IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
     {
-      Index error = 0;
+      IndexType error = 0;
       ::pardiso(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
       return error;
     }
@@ -54,11 +54,11 @@
   template<>
   struct pardiso_run_selector<long long int>
   {
-    typedef long long int Index;
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    typedef long long int IndexType;
+    static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
+                      IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
     {
-      Index error = 0;
+      IndexType error = 0;
       ::pardiso_64(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
       return error;
     }
@@ -72,7 +72,7 @@
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -81,7 +81,7 @@
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -90,35 +90,45 @@
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;    
+    typedef typename _MatrixType::StorageIndex StorageIndex;    
   };
 
-}
+} // end namespace internal
 
 template<class Derived>
-class PardisoImpl : internal::noncopyable
+class PardisoImpl : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
+    
     typedef internal::pardiso_traits<Derived> Traits;
   public:
+    using Base::_solve_impl;
+    
     typedef typename Traits::MatrixType MatrixType;
     typedef typename Traits::Scalar Scalar;
     typedef typename Traits::RealScalar RealScalar;
-    typedef typename Traits::Index Index;
-    typedef SparseMatrix<Scalar,RowMajor,Index> SparseMatrixType;
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> SparseMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef Matrix<Index, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<Index, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
-    typedef Array<Index,64,1,DontAlign> ParameterType;
+    typedef Matrix<StorageIndex, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<StorageIndex, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef Array<StorageIndex,64,1,DontAlign> ParameterType;
     enum {
-      ScalarIsComplex = NumTraits<Scalar>::IsComplex
+      ScalarIsComplex = NumTraits<Scalar>::IsComplex,
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
     };
 
     PardisoImpl()
+      : m_analysisIsOk(false), m_factorizationIsOk(false)
     {
-      eigen_assert((sizeof(Index) >= sizeof(_INTEGER_t) && sizeof(Index) <= 8) && "Non-supported index type");
+      eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type");
       m_iparm.setZero();
       m_msglvl = 0; // No output
-      m_initialized = false;
+      m_isInitialized = false;
     }
 
     ~PardisoImpl()
@@ -131,12 +141,12 @@
   
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix appears to be negative.
       */
     ComputationInfo info() const
     {
-      eigen_assert(m_initialized && "Decomposition is not initialized.");
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
       return m_info;
     }
 
@@ -165,54 +175,18 @@
     Derived& factorize(const MatrixType& matrix);
 
     Derived& compute(const MatrixType& matrix);
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PardisoImpl, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PardisoImpl, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
-
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
-
-    template<typename BDerived, typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const;
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
 
   protected:
     void pardisoRelease()
     {
-      if(m_initialized) // Factorization ran at least once
+      if(m_isInitialized) // Factorization ran at least once
       {
-        internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0,
-                                                   m_iparm.data(), m_msglvl, 0, 0);
+        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, internal::convert_index<StorageIndex>(m_size),0, 0, 0, m_perm.data(), 0,
+                                                          m_iparm.data(), m_msglvl, NULL, NULL);
+        m_isInitialized = false;
       }
     }
 
@@ -221,11 +195,11 @@
       m_type = type;
       bool symmetric = std::abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
-      m_iparm[1] = 3;   // use Metis for the ordering
-      m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
+      m_iparm[1] = 2;   // use Metis for the ordering
+      m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
       m_iparm[3] = 0;   // No iterative-direct algorithm
       m_iparm[4] = 0;   // No user fill-in reducing permutation
-      m_iparm[5] = 0;   // Write solution into x
+      m_iparm[5] = 0;   // Write solution into x, b is left unchanged
       m_iparm[6] = 0;   // Not in use
       m_iparm[7] = 2;   // Max numbers of iterative refinement steps
       m_iparm[8] = 0;   // Not in use
@@ -246,13 +220,16 @@
       m_iparm[26] = 0;  // No matrix checker
       m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
       m_iparm[34] = 1;  // C indexing
-      m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+      m_iparm[36] = 0;  // CSR
+      m_iparm[59] = 0;  // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core
+      
+      memset(m_pt, 0, sizeof(m_pt));
     }
 
   protected:
     // cached data to reduce reallocation, etc.
     
-    void manageErrorCode(Index error)
+    void manageErrorCode(Index error) const
     {
       switch(error)
       {
@@ -269,9 +246,9 @@
     }
 
     mutable SparseMatrixType m_matrix;
-    ComputationInfo m_info;
-    bool m_initialized, m_analysisIsOk, m_factorizationIsOk;
-    Index m_type, m_msglvl;
+    mutable ComputationInfo m_info;
+    bool m_analysisIsOk, m_factorizationIsOk;
+    StorageIndex m_type, m_msglvl;
     mutable void *m_pt[64];
     mutable ParameterType m_iparm;
     mutable IntColVectorType m_perm;
@@ -286,19 +263,17 @@
   eigen_assert(a.rows() == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 12, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
-
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = true;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
@@ -309,19 +284,18 @@
   eigen_assert(m_size == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 11, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
@@ -333,22 +307,25 @@
   
   derived().getMatrix(a);
 
-  Index error;  
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 22, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  Index error;
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
   m_factorizationIsOk = true;
   return derived();
 }
 
-template<class Base>
+template<class Derived>
 template<typename BDerived,typename XDerived>
-bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
+void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
 {
   if(m_iparm[0] == 0) // Factorization was not computed
-    return false;
+  {
+    m_info = InvalidInput;
+    return;
+  }
 
   //Index n = m_matrix.rows();
   Index nrhs = Index(b.cols());
@@ -378,12 +355,12 @@
   }
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 33, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
-                                                     rhs_ptr, x.derived().data());
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, internal::convert_index<StorageIndex>(m_size),
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), internal::convert_index<StorageIndex>(nrhs), m_iparm.data(), m_msglvl,
+                                                            rhs_ptr, x.derived().data());
 
-  return error==0;
+  manageErrorCode(error);
 }
 
 
@@ -395,23 +372,29 @@
   * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   */
 template<typename MatrixType>
 class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
 {
   protected:
-    typedef PardisoImpl< PardisoLU<MatrixType> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
+    typedef PardisoImpl<PardisoLU> Base;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLU<MatrixType> >;
 
   public:
 
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+
     using Base::compute;
     using Base::solve;
 
@@ -421,7 +404,7 @@
       pardisoInit(Base::ScalarIsComplex ? 13 : 11);
     }
 
-    PardisoLU(const MatrixType& matrix)
+    explicit PardisoLU(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? 13 : 11);
@@ -431,6 +414,7 @@
     void getMatrix(const MatrixType& matrix)
     {
       m_matrix = matrix;
+      m_matrix.makeCompressed();
     }
 };
 
@@ -442,29 +426,33 @@
   * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT
   */
 template<typename MatrixType, int _UpLo>
 class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
 {
   protected:
     typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
-    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLLT<MatrixType,_UpLo> >;
 
   public:
 
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+    typedef typename Base::StorageIndex StorageIndex;
     enum { UpLo = _UpLo };
     using Base::compute;
-    using Base::solve;
 
     PardisoLLT()
       : Base()
@@ -472,7 +460,7 @@
       pardisoInit(Base::ScalarIsComplex ? 4 : 2);
     }
 
-    PardisoLLT(const MatrixType& matrix)
+    explicit PardisoLLT(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? 4 : 2);
@@ -484,9 +472,10 @@
     void getMatrix(const MatrixType& matrix)
     {
       // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
       m_matrix.resize(matrix.rows(), matrix.cols());
       m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+      m_matrix.makeCompressed();
     }
 };
 
@@ -499,29 +488,33 @@
   * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
   *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT
   */
 template<typename MatrixType, int Options>
 class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
 {
   protected:
     typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
-    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLDLT<MatrixType,Options> >;
 
   public:
 
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+    typedef typename Base::StorageIndex StorageIndex;
     using Base::compute;
-    using Base::solve;
     enum { UpLo = Options&(Upper|Lower) };
 
     PardisoLDLT()
@@ -530,7 +523,7 @@
       pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
     }
 
-    PardisoLDLT(const MatrixType& matrix)
+    explicit PardisoLDLT(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
@@ -540,42 +533,13 @@
     void getMatrix(const MatrixType& matrix)
     {
       // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
       m_matrix.resize(matrix.rows(), matrix.cols());
       m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+      m_matrix.makeCompressed();
     }
 };
 
-namespace internal {
-  
-template<typename _Derived, typename Rhs>
-struct solve_retval<PardisoImpl<_Derived>, Rhs>
-  : solve_retval_base<PardisoImpl<_Derived>, Rhs>
-{
-  typedef PardisoImpl<_Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<PardisoImpl<Derived>, Rhs>
-  : sparse_solve_retval_base<PardisoImpl<Derived>, Rhs>
-{
-  typedef PardisoImpl<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_PARDISOSUPPORT_H

diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 3f4155b..9b677e9 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h

@@ -13,58 +13,66 @@
 
 namespace Eigen {
 
+namespace internal {
+template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
+
 /** \ingroup QR_Module
   *
   * \class ColPivHouseholderQR
   *
-  * \brief Householder rank-revealing QR decomposition of a matrix with
- * column-pivoting
+  * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting
   *
-  * \param MatrixType the type of the matrix of which we are computing the QR
- * decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
   *
-  * This class performs a rank-revealing QR decomposition of a matrix \b A into
- * matrices \b P, \b Q and \b R
+  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
   * such that
   * \f[
   *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
   * \f]
-  * by using Householder transformations. Here, \b P is a permutation matrix, \b
- * Q a unitary matrix and \b R an
+  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an
   * upper triangular matrix.
   *
-  * This decomposition performs column pivoting in order to be rank-revealing
- * and improve
-  * numerical stability. It is slower than HouseholderQR, and faster than
- * FullPivHouseholderQR.
+  * This decomposition performs column pivoting in order to be rank-revealing and improve
+  * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::colPivHouseholderQr()
   */
 template<typename _MatrixType> class ColPivHouseholderQR
+        : public SolverBase<ColPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<ColPivHouseholderQR> Base;
+    friend class SolverBase<ColPivHouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
     typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
   private:
-    typedef typename PermutationType::Index PermIndexType;
+
+    typedef typename PermutationType::StorageIndex PermIndexType;
 
   public:
 
@@ -74,60 +82,83 @@
     * The default constructor is useful in cases in which the user intends to
     * perform decompositions via ColPivHouseholderQR::compute(const MatrixType&).
     */
-   ColPivHouseholderQR()
-       : m_qr(),
-         m_hCoeffs(),
-         m_colsPermutation(),
-         m_colsTranspositions(),
-         m_temp(),
-         m_colNormsDirect(),
-         m_colNormsUpdated(),
-         m_isInitialized(false),
-         m_usePrescribedThreshold(false) {}
+    ColPivHouseholderQR()
+      : m_qr(),
+        m_hCoeffs(),
+        m_colsPermutation(),
+        m_colsTranspositions(),
+        m_temp(),
+        m_colNormsUpdated(),
+        m_colNormsDirect(),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
 
-   /** \brief Default Constructor with memory preallocation
-     *
-     * Like the default constructor but with preallocation of the internal data
-     * according to the specified problem \a size.
-     * \sa ColPivHouseholderQR()
-     */
-   ColPivHouseholderQR(Index rows, Index cols)
-       : m_qr(rows, cols),
-         m_hCoeffs((std::min)(rows, cols)),
-         m_colsPermutation(PermIndexType(cols)),
-         m_colsTranspositions(cols),
-         m_temp(cols),
-         m_colNormsDirect(cols),
-         m_colNormsUpdated(cols),
-         m_isInitialized(false),
-         m_usePrescribedThreshold(false) {}
+    /** \brief Default Constructor with memory preallocation
+      *
+      * Like the default constructor but with preallocation of the internal data
+      * according to the specified problem \a size.
+      * \sa ColPivHouseholderQR()
+      */
+    ColPivHouseholderQR(Index rows, Index cols)
+      : m_qr(rows, cols),
+        m_hCoeffs((std::min)(rows,cols)),
+        m_colsPermutation(PermIndexType(cols)),
+        m_colsTranspositions(cols),
+        m_temp(cols),
+        m_colNormsUpdated(cols),
+        m_colNormsDirect(cols),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
 
-   /** \brief Constructs a QR factorization from a given matrix
-     *
-     * This constructor computes the QR factorization of the matrix \a matrix by
-    * calling
-     * the method compute(). It is a short cut for:
-     *
-     * \code
-     * ColPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
-     * qr.compute(matrix);
-     * \endcode
-     *
-     * \sa compute()
-     */
-   ColPivHouseholderQR(const MatrixType& matrix)
-       : m_qr(matrix.rows(), matrix.cols()),
-         m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
-         m_colsPermutation(PermIndexType(matrix.cols())),
-         m_colsTranspositions(matrix.cols()),
-         m_temp(matrix.cols()),
-         m_colNormsDirect(matrix.cols()),
-         m_colNormsUpdated(matrix.cols()),
-         m_isInitialized(false),
-         m_usePrescribedThreshold(false) {
-     compute(matrix);
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This constructor computes the QR factorization of the matrix \a matrix by calling
+      * the method compute(). It is a short cut for:
+      *
+      * \code
+      * ColPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+      * qr.compute(matrix);
+      * \endcode
+      *
+      * \sa compute()
+      */
+    template<typename InputType>
+    explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix)
+      : m_qr(matrix.rows(), matrix.cols()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_colsPermutation(PermIndexType(matrix.cols())),
+        m_colsTranspositions(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_colNormsUpdated(matrix.cols()),
+        m_colNormsDirect(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      compute(matrix.derived());
     }
 
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa ColPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit ColPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_colsPermutation(PermIndexType(matrix.cols())),
+        m_colsTranspositions(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_colNormsUpdated(matrix.cols()),
+        m_colNormsDirect(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
+    }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -135,9 +166,6 @@
       *
       * \returns a solution.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -146,15 +174,15 @@
       * Output: \verbinclude ColPivHouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<ColPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+    inline const Solve<ColPivHouseholderQR, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
-    HouseholderSequenceType householderQ(void) const;
-    HouseholderSequenceType matrixQ(void) const { return householderQ(); }
+    HouseholderSequenceType householderQ() const;
+    HouseholderSequenceType matrixQ() const
+    {
+      return householderQ();
+    }
 
     /** \returns a reference to the matrix where the Householder QR decomposition is stored
       */
@@ -164,8 +192,7 @@
       return m_qr;
     }
 
-    /** \returns a reference to the matrix where the result Householder QR is
-     * stored
+    /** \returns a reference to the matrix where the result Householder QR is stored
      * \warning The strict lower part of this matrix contains internal values.
      * Only the upper triangular part should be referenced. To get it, use
      * \code matrixR().template triangularView<Upper>() \endcode
@@ -180,7 +207,8 @@
       return m_qr;
     }
 
-    ColPivHouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    ColPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns a const reference to the column permutation matrix */
     const PermutationType& colsPermutation() const
@@ -290,20 +318,16 @@
       * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
       *       Use isInvertible() to first determine whether this matrix is invertible.
       */
-    inline const
-    internal::solve_retval<ColPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
+    inline const Inverse<ColPivHouseholderQR> inverse() const
     {
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+      return Inverse<ColPivHouseholderQR>(*this);
     }
 
     inline Index rows() const { return m_qr.rows(); }
     inline Index cols() const { return m_qr.cols(); }
 
-    /** \returns a const reference to the vector of Householder coefficients
-     * used to represent the factor \c Q.
+    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
       *
       * For advanced uses only.
       */
@@ -378,10 +402,9 @@
       */
     RealScalar maxPivot() const { return m_maxpivot; }
 
-    /** \brief Reports whether the QR factorization was succesful.
+    /** \brief Reports whether the QR factorization was successful.
       *
-      * \note This function always returns \c Success. It is provided for
-     * compatibility
+      * \note This function always returns \c Success. It is provided for compatibility
       * with other factorization routines.
       * \returns \c Success
       */
@@ -391,17 +414,32 @@
       return Success;
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
 
     friend class CompleteOrthogonalDecomposition<MatrixType>;
 
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
+    void computeInPlace();
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     PermutationType m_colsPermutation;
     IntRowVectorType m_colsTranspositions;
     RowVectorType m_temp;
-    RealRowVectorType m_colNormsDirect;
     RealRowVectorType m_colNormsUpdated;
+    RealRowVectorType m_colNormsDirect;
     bool m_isInitialized, m_usePrescribedThreshold;
     RealScalar m_prescribedThreshold, m_maxpivot;
     Index m_nonzero_pivots;
@@ -432,36 +470,47 @@
   * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
-  using std::abs;
+  m_qr = matrix.derived();
+  computeInPlace();
+  return *this;
+}
 
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  Index size = matrix.diagonalSize();
+template<typename MatrixType>
+void ColPivHouseholderQR<MatrixType>::computeInPlace()
+{
+  check_template_parameters();
 
   // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(cols<=NumTraits<int>::highest());
+  eigen_assert(m_qr.cols()<=NumTraits<int>::highest());
 
-  m_qr = matrix;
+  using std::abs;
+
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
+  Index size = m_qr.diagonalSize();
+
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
 
-  m_colsTranspositions.resize(matrix.cols());
+  m_colsTranspositions.resize(m_qr.cols());
   Index number_of_transpositions = 0;
 
+  m_colNormsUpdated.resize(cols);
   m_colNormsDirect.resize(cols);
   for (Index k = 0; k < cols; ++k) {
     // colNormsDirect(k) caches the most recent directly computed norm of
     // column k.
     m_colNormsDirect.coeffRef(k) = m_qr.col(k).norm();
+    m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
   }
-  m_colNormsUpdated = m_colNormsDirect;
 
-  RealScalar threshold_helper =  numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) /
-                                RealScalar(rows);
-  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<Scalar>::epsilon());
+  RealScalar threshold_helper =  numext::abs2<RealScalar>(m_colNormsUpdated.maxCoeff() * NumTraits<RealScalar>::epsilon()) / RealScalar(rows);
+  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<RealScalar>::epsilon());
+
   m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
   m_maxpivot = RealScalar(0);
 
@@ -469,32 +518,20 @@
   {
     // first, we look up in our table m_colNormsUpdated which column has the biggest norm
     Index biggest_col_index;
-    RealScalar biggest_col_sq_norm =
-        numext::abs2(m_colNormsUpdated.tail(cols - k).maxCoeff(&biggest_col_index));
+    RealScalar biggest_col_sq_norm = numext::abs2(m_colNormsUpdated.tail(cols-k).maxCoeff(&biggest_col_index));
     biggest_col_index += k;
 
-    // if the current biggest column is smaller than epsilon times the initial biggest column,
-    // terminate to avoid generating nan/inf values.
-    // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so)
-    // repetitions of the unit test, with the result of solve() filled with large values of the order
-    // of 1/(size*epsilon).
-    if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
-    {
+    // Track the number of meaningful pivots but do not stop the decomposition to make
+    // sure that the initial matrix is properly reproduced. See bug 941.
+    if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
       m_nonzero_pivots = k;
-      m_hCoeffs.tail(size-k).setZero();
-      m_qr.bottomRightCorner(rows-k,cols-k)
-          .template triangularView<StrictlyLower>()
-          .setZero();
-      break;
-    }
 
     // apply the transposition to the columns
     m_colsTranspositions.coeffRef(k) = biggest_col_index;
     if(k != biggest_col_index) {
       m_qr.col(k).swap(m_qr.col(biggest_col_index));
       std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index));
-      std::swap(m_colNormsDirect.coeffRef(k),
-                m_colNormsDirect.coeffRef(biggest_col_index));
+      std::swap(m_colNormsDirect.coeffRef(k), m_colNormsDirect.coeffRef(biggest_col_index));
       ++number_of_transpositions;
     }
 
@@ -516,18 +553,16 @@
     for (Index j = k + 1; j < cols; ++j) {
       // The following implements the stable norm downgrade step discussed in
       // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
-      // and used in LAPACK routines xGEQPF and xGEQP3. See lines 278-297 in
-      // http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
-      if (m_colNormsUpdated.coeffRef(j) != 0) {
-        RealScalar temp =
-            abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
+      // and used in LAPACK routines xGEQPF and xGEQP3.
+      // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
+      if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) {
+        RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
         temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
-        temp = temp < 0 ? 0 : temp;
-        RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) /
-                                               m_colNormsDirect.coeffRef(j));
+        temp = temp <  RealScalar(0) ? RealScalar(0) : temp;
+        RealScalar temp2 = temp * numext::abs2<RealScalar>(m_colNormsUpdated.coeffRef(j) /
+                                                           m_colNormsDirect.coeffRef(j));
         if (temp2 <= norm_downdate_threshold) {
-          // The updated norm has become too inaccurate, so re-compute the
-          // column
+          // The updated norm has become too inaccurate so re-compute the column
           // norm directly.
           m_colNormsDirect.coeffRef(j) = m_qr.col(j).tail(rows - k - 1).norm();
           m_colNormsUpdated.coeffRef(j) = m_colNormsDirect.coeffRef(j);
@@ -539,65 +574,90 @@
   }
 
   m_colsPermutation.setIdentity(PermIndexType(cols));
-  for(PermIndexType k = 0; k < m_nonzero_pivots; ++k)
+  for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
     m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  const Index nonzero_pivots = nonzeroPivots();
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs);
+
+  c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() );
+
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
+  for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
+}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index nonzero_pivots = nonzeroPivots();
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs);
+
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .transpose().template conjugateIf<Conjugate>()
+        .solveInPlace(c.topRows(nonzero_pivots));
+
+  dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots);
+  dst.bottomRows(rows()-nonzero_pivots).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>() );
+}
+#endif
+
 namespace internal {
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<ColPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<ColPivHouseholderQR<_MatrixType>, Rhs>
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename ColPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
 {
-  EIGEN_MAKE_SOLVE_HELPERS(ColPivHouseholderQR<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
+  typedef ColPivHouseholderQR<MatrixType> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
   {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    const Index cols = dec().cols(), nonzero_pivots = dec().nonzeroPivots();
-
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
-
-    typename Rhs::PlainObject c(rhs());
-
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(dec().matrixQR(), dec().hCoeffs())
-                     .setLength(dec().nonzeroPivots())
-                     .transpose()
-      );
-
-    dec().matrixR()
-       .topLeftCorner(nonzero_pivots, nonzero_pivots)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(nonzero_pivots));
-
-    for(Index i = 0; i < nonzero_pivots; ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
 } // end namespace internal
 
-/** \returns the matrix Q as a sequence of householder transformations */
+/** \returns the matrix Q as a sequence of householder transformations.
+  * You can extract the meaningful part only by using:
+  * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode*/
 template<typename MatrixType>
 typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
   ::householderQ() const
 {
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-  return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots);
+  return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
 }
 
-#ifndef __CUDACC__
 /** \return the column-pivoting Householder QR decomposition of \c *this.
   *
   * \sa class ColPivHouseholderQR
@@ -608,7 +668,6 @@
 {
   return ColPivHouseholderQR<PlainObject>(eval());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
new file mode 100644
index 0000000..4e9651f
--- /dev/null
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h

@@ -0,0 +1,97 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Householder QR decomposition of a matrix with column pivoting based on
+ *    LAPACKE_?geqp3 function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+
+namespace Eigen { 
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_QR_COLPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
+template<> template<typename InputType> inline \
+ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
+ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
+              const EigenBase<InputType>& matrix) \
+\
+{ \
+  using std::abs; \
+  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
+  typedef MatrixType::RealScalar RealScalar; \
+  Index rows = matrix.rows();\
+  Index cols = matrix.cols();\
+\
+  m_qr = matrix;\
+  Index size = m_qr.diagonalSize();\
+  m_hCoeffs.resize(size);\
+\
+  m_colsTranspositions.resize(cols);\
+  /*Index number_of_transpositions = 0;*/ \
+\
+  m_nonzero_pivots = 0; \
+  m_maxpivot = RealScalar(0);\
+  m_colsPermutation.resize(cols); \
+  m_colsPermutation.indices().setZero(); \
+\
+  lapack_int lda = internal::convert_index<lapack_int,Index>(m_qr.outerStride()); \
+  lapack_int matrix_order = LAPACKE_COLROW; \
+  LAPACKE_##LAPACKE_PREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
+                              (LAPACKE_TYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (LAPACKE_TYPE*)m_hCoeffs.data()); \
+  m_isInitialized = true; \
+  m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
+  m_hCoeffs.adjointInPlace(); \
+  RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
+  lapack_int *perm = m_colsPermutation.indices().data(); \
+  for(Index i=0;i<size;i++) { \
+    m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
+  } \
+  for(Index i=0;i<cols;i++) perm[i]--;\
+\
+  /*m_det_pq = (number_of_transpositions%2) ? -1 : 1;  // TODO: It's not needed now; fix upon availability in Eigen */ \
+\
+  return *this; \
+}
+
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, ColMajor, LAPACK_COL_MAJOR)
+
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, RowMajor, LAPACK_ROW_MAJOR)
+
+} // end namespace Eigen
+
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H

diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index d3f7b2f..486d337 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h

@@ -12,6 +12,18 @@
 
 namespace Eigen {
 
+namespace internal {
+template <typename _MatrixType>
+struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
+    : traits<_MatrixType> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
+
 /** \ingroup QR_Module
   *
   * \class CompleteOrthogonalDecomposition
@@ -20,35 +32,36 @@
   *
   * \param MatrixType the type of the matrix of which we are computing the COD.
   *
-  * This class performs a rank-revealing complete ortogonal decomposition of a
+  * This class performs a rank-revealing complete orthogonal decomposition of a
   * matrix  \b A into matrices \b P, \b Q, \b T, and \b Z such that
   * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \begin{matrix} \mathbf{T} &
-  *  \mathbf{0} \\ \mathbf{0} & \mathbf{0} \end{matrix} \, \mathbf{Z}
+  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \,
+  *                     \begin{bmatrix} \mathbf{T} &  \mathbf{0} \\
+  *                                     \mathbf{0} & \mathbf{0} \end{bmatrix} \, \mathbf{Z}
   * \f]
   * by using Householder transformations. Here, \b P is a permutation matrix,
   * \b Q and \b Z are unitary matrices and \b T an upper triangular matrix of
   * size rank-by-rank. \b A may be rank deficient.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::completeOrthogonalDecomposition()
   */
-template <typename _MatrixType>
-class CompleteOrthogonalDecomposition {
+template <typename _MatrixType> class CompleteOrthogonalDecomposition
+          : public SolverBase<CompleteOrthogonalDecomposition<_MatrixType> >
+{
  public:
   typedef _MatrixType MatrixType;
+  typedef SolverBase<CompleteOrthogonalDecomposition> Base;
+
+  template<typename Derived>
+  friend struct internal::solve_assertion;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    Options = MatrixType::Options,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options,
-                 MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-      MatrixQType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
       PermutationType;
@@ -61,6 +74,7 @@
       MatrixType, typename internal::remove_all<
                       typename HCoeffsType::ConjugateReturnType>::type>
       HouseholderSequenceType;
+  typedef typename MatrixType::PlainObject PlainObject;
 
  private:
   typedef typename PermutationType::Index PermIndexType;
@@ -100,45 +114,54 @@
    *
    * \sa compute()
    */
-  CompleteOrthogonalDecomposition(const MatrixType& matrix)
+  template <typename InputType>
+  explicit CompleteOrthogonalDecomposition(const EigenBase<InputType>& matrix)
       : m_cpqr(matrix.rows(), matrix.cols()),
         m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
-        m_temp(matrix.cols()) {
-    compute(matrix);
+        m_temp(matrix.cols())
+  {
+    compute(matrix.derived());
   }
 
+  /** \brief Constructs a complete orthogonal decomposition from a given matrix
+    *
+    * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+    *
+    * \sa CompleteOrthogonalDecomposition(const EigenBase&)
+    */
+  template<typename InputType>
+  explicit CompleteOrthogonalDecomposition(EigenBase<InputType>& matrix)
+    : m_cpqr(matrix.derived()),
+      m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
+      m_temp(matrix.cols())
+  {
+    computeInPlace();
+  } 
+
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** This method computes the minimum-norm solution X to a least squares
-   * problem \f[\mathrm{minimize} ||A X - B|| \f], where \b A is the matrix of
+   * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
    * which \c *this is the complete orthogonal decomposition.
    *
-   * \param B the right-hand sides of the problem to solve.
+   * \param b the right-hand sides of the problem to solve.
    *
    * \returns a solution.
    *
    */
   template <typename Rhs>
-  inline const internal::solve_retval<CompleteOrthogonalDecomposition, Rhs>
-  solve(const MatrixBase<Rhs>& b) const {
-    eigen_assert(m_cpqr.m_isInitialized &&
-                 "CompleteOrthogonalDecomposition is not initialized.");
-    return internal::solve_retval<CompleteOrthogonalDecomposition, Rhs>(
-        *this, b.derived());
-  }
+  inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(
+      const MatrixBase<Rhs>& b) const;
+  #endif
 
   HouseholderSequenceType householderQ(void) const;
   HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
 
-  /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
-   */
-  template <typename Rhs>
-  void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const;
-
   /** \returns the matrix \b Z.
    */
   MatrixType matrixZ() const {
     MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
-    applyZAdjointOnTheLeftInPlace(Z);
-    return Z.adjoint();
+    applyZOnTheLeftInPlace<false>(Z);
+    return Z;
   }
 
   /** \returns a reference to the matrix where the complete orthogonal
@@ -159,7 +182,13 @@
    */
   const MatrixType& matrixT() const { return m_cpqr.matrixQR(); }
 
-  CompleteOrthogonalDecomposition& compute(const MatrixType& matrix);
+  template <typename InputType>
+  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix) {
+    // Compute the column pivoted QR factorization A P = Q R.
+    m_cpqr.compute(matrix);
+    computeInPlace();
+    return *this;
+  }
 
   /** \returns a const reference to the column permutation matrix */
   const PermutationType& colsPermutation() const {
@@ -243,16 +272,13 @@
 
   /** \returns the pseudo-inverse of the matrix of which *this is the complete
    * orthogonal decomposition.
-   *
-   * \warning: Do not compute \c this->pseudoInverse()*rhs to solve linear systems.
+   * \warning: Do not compute \c this->pseudoInverse()*rhs to solve a linear systems.
    * It is more efficient and numerically stable to call \c this->solve(rhs).
    */
-  inline const internal::solve_retval<CompleteOrthogonalDecomposition,
-                                      typename MatrixType::IdentityReturnType>
-  pseudoInverse() const {
-    return internal::solve_retval<CompleteOrthogonalDecomposition,
-                                  typename MatrixType::IdentityReturnType>(
-        *this, MatrixType::Identity(m_cpqr.rows(), m_cpqr.rows()));
+  inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const
+  {
+    eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+    return Inverse<CompleteOrthogonalDecomposition>(*this);
   }
 
   inline Index rows() const { return m_cpqr.rows(); }
@@ -316,10 +342,9 @@
   RealScalar threshold() const { return m_cpqr.threshold(); }
 
   /** \returns the number of nonzero pivots in the complete orthogonal
-   * decomposition.
-   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-   * So that notion isn't really intrinsically interesting, but it is
-   * still useful when implementing algorithms.
+   * decomposition. Here nonzero is meant in the exact sense, not in a
+   * fuzzy sense. So that notion isn't really intrinsically interesting,
+   * but it is still useful when implementing algorithms.
    *
    * \sa rank()
    */
@@ -331,7 +356,7 @@
   inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
 
   /** \brief Reports whether the complete orthogonal decomposition was
-   * succesful.
+   * successful.
    *
    * \note This function always returns \c Success. It is provided for
    * compatibility
@@ -343,7 +368,40 @@
     return Success;
   }
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template<bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+#endif
+
  protected:
+  static void check_template_parameters() {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+  }
+
+  template<bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+      EIGEN_ONLY_USED_FOR_DEBUG(b);
+      eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+      eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
+  void computeInPlace();
+
+  /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or
+   *  \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate 
+   *  is set to \c true.
+   */
+  template <bool Conjugate, typename Rhs>
+  void applyZOnTheLeftInPlace(Rhs& rhs) const;
+
+  /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
+   */
+  template <typename Rhs>
+  void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const;
+
   ColPivHouseholderQR<MatrixType> m_cpqr;
   HCoeffsType m_zCoeffs;
   RowVectorType m_temp;
@@ -369,13 +427,19 @@
  * CompleteOrthogonalDecomposition(const MatrixType&)
  */
 template <typename MatrixType>
-CompleteOrthogonalDecomposition<MatrixType>&
-CompleteOrthogonalDecomposition<MatrixType>::compute(const MatrixType& matrix) {
-  // Compute the column pivoted QR factorization A P = Q R.
-  m_cpqr.compute(matrix);
+void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
+{
+  check_template_parameters();
+
+  // the column permutation is stored as int indices, so just to be sure:
+  eigen_assert(m_cpqr.cols() <= NumTraits<int>::highest());
 
   const Index rank = m_cpqr.rank();
-  const Index cols = matrix.cols();
+  const Index cols = m_cpqr.cols();
+  const Index rows = m_cpqr.rows();
+  m_zCoeffs.resize((std::min)(rows, cols));
+  m_temp.resize(cols);
+
   if (rank < cols) {
     // We have reduced the (permuted) matrix to the form
     //   [R11 R12]
@@ -408,7 +472,7 @@
         // Apply Z(k) to the first k rows of X_k
         m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
             .applyHouseholderOnTheRight(
-                m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k),
+                m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k),
                 &m_temp(0));
       }
       if (k != rank - 1) {
@@ -418,7 +482,28 @@
       }
     }
   }
-  return *this;
+}
+
+template <typename MatrixType>
+template <bool Conjugate, typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
+    Rhs& rhs) const {
+  const Index cols = this->cols();
+  const Index nrhs = rhs.cols();
+  const Index rank = this->rank();
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  for (Index k = rank-1; k >= 0; --k) {
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+    rhs.middleRows(rank - 1, cols - rank + 1)
+        .applyHouseholderOnTheLeft(
+            matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(), zCoeffs().template conjugateIf<Conjugate>()(k),
+            &temp(0));
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+  }
 }
 
 template <typename MatrixType>
@@ -428,7 +513,7 @@
   const Index cols = this->cols();
   const Index nrhs = rhs.cols();
   const Index rank = this->rank();
-  Matrix<typename MatrixType::Scalar, Dynamic, 1> temp(std::max(cols, nrhs));
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
   for (Index k = 0; k < rank; ++k) {
     if (k != rank - 1) {
       rhs.row(k).swap(rhs.row(rank - 1));
@@ -443,53 +528,90 @@
   }
 }
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename _MatrixType>
+template <typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
+    const RhsType& rhs, DstType& dst) const {
+  const Index rank = this->rank();
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  // Compute c = Q^* * rhs
+  typename RhsType::PlainObject c(rhs);
+  c.applyOnTheLeft(matrixQ().setLength(rank).adjoint());
+
+  // Solve T z = c(1:rank, :)
+  dst.topRows(rank) = matrixT()
+                          .topLeftCorner(rank, rank)
+                          .template triangularView<Upper>()
+                          .solve(c.topRows(rank));
+
+  const Index cols = this->cols();
+  if (rank < cols) {
+    // Compute y = Z^* * [ z ]
+    //                   [ 0 ]
+    dst.bottomRows(cols - rank).setZero();
+    applyZAdjointOnTheLeftInPlace(dst);
+  }
+
+  // Undo permutation to get x = P^{-1} * y.
+  dst = colsPermutation() * dst;
+}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = this->rank();
+
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(colsPermutation().transpose()*rhs);
+
+  if (rank < cols()) {
+    applyZOnTheLeftInPlace<!Conjugate>(c);
+  }
+
+  matrixT().topLeftCorner(rank, rank)
+           .template triangularView<Upper>()
+           .transpose().template conjugateIf<Conjugate>()
+           .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows()-rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
+#endif
+
 namespace internal {
 
-template <typename _MatrixType, typename Rhs>
-struct solve_retval<CompleteOrthogonalDecomposition<_MatrixType>, Rhs>
-    : solve_retval_base<CompleteOrthogonalDecomposition<_MatrixType>, Rhs> {
-  EIGEN_MAKE_SOLVE_HELPERS(CompleteOrthogonalDecomposition<_MatrixType>, Rhs)
-  typedef typename internal::plain_row_type<_MatrixType>::type RowVectorType;
+template<typename MatrixType>
+struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType> > >
+  : traits<typename Transpose<typename MatrixType::PlainObject>::PlainObject>
+{
+  enum { Flags = 0 };
+};
 
-  template <typename Dest>
-  void evalTo(Dest& dst) const {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    const Index rank = dec().rank();
-    if (rank == 0) {
-      dst.setZero();
-      return;
-    }
-
-    // Compute c = Q^* * rhs
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is
-    // Q^* = (H_0 H_1 ...)^T
-    typename Rhs::PlainObject c(rhs());
-    c.applyOnTheLeft(householderSequence(dec().matrixQTZ(), dec().hCoeffs())
-                         .setLength(rank)
-                         .transpose());
-
-    // Solve T z = c(1:rank, :)
-    dst.topRows(rank) = dec()
-                            .matrixT()
-                            .topLeftCorner(rank, rank)
-                            .template triangularView<Upper>()
-                            .solve(c.topRows(rank));
-
-    const Index cols = dec().cols();
-    if (rank < cols) {
-      // Compute y = Z^* * [ z ]
-      //                   [ 0 ]
-      dst.bottomRows(cols - rank).setZero();
-      dec().applyZAdjointOnTheLeftInPlace(dst);
-    }
-
-    // Undo permutation to get x = P^{-1} * y.
-    dst = dec().colsPermutation() * dst;
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
+  typedef Inverse<CodType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
+  {
+    typedef Matrix<typename CodType::Scalar, CodType::RowsAtCompileTime, CodType::RowsAtCompileTime, 0, CodType::MaxRowsAtCompileTime, CodType::MaxRowsAtCompileTime> IdentityMatrixType;
+    dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols()));
   }
 };
 
-}  // end namespace internal
+} // end namespace internal
 
 /** \returns the matrix Q as a sequence of householder transformations */
 template <typename MatrixType>
@@ -498,7 +620,6 @@
   return m_cpqr.householderQ();
 }
 
-#ifndef __CUDACC__
 /** \return the complete orthogonal decomposition of \c *this.
   *
   * \sa class CompleteOrthogonalDecomposition
@@ -508,7 +629,6 @@
 MatrixBase<Derived>::completeOrthogonalDecomposition() const {
   return CompleteOrthogonalDecomposition<PlainObject>(eval());
 }
-#endif  // __CUDACC__
 
 }  // end namespace Eigen
 

diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index a7b0fc1..d0664a1 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h

@@ -15,6 +15,15 @@
 
 namespace internal {
 
+template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
 
 template<typename MatrixType>
@@ -23,7 +32,7 @@
   typedef typename MatrixType::PlainObject ReturnType;
 };
 
-}
+} // end namespace internal
 
 /** \ingroup QR_Module
   *
@@ -31,7 +40,7 @@
   *
   * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting
   *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
   *
   * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R
   * such that 
@@ -44,31 +53,33 @@
   * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
   * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::fullPivHouseholderQr()
   */
 template<typename _MatrixType> class FullPivHouseholderQR
+        : public SolverBase<FullPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<FullPivHouseholderQR> Base;
+    friend class SolverBase<FullPivHouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef Matrix<Index, 1,
+    typedef Matrix<StorageIndex, 1,
                    EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
                    EIGEN_SIZE_MIN_PREFER_FIXED(MaxColsAtCompileTime,MaxRowsAtCompileTime)> IntDiagSizeVectorType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /** \brief Default Constructor.
       *
@@ -113,7 +124,8 @@
       * 
       * \sa compute()
       */
-    FullPivHouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit FullPivHouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
         m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
@@ -123,9 +135,30 @@
         m_isInitialized(false),
         m_usePrescribedThreshold(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa FullPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_permutation(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
+    }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * \c *this is the QR decomposition.
       *
@@ -134,9 +167,6 @@
       * \returns the exact or least-square solution if the rank is greater or equal to the number of columns of A,
       * and an arbitrary solution otherwise.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -145,12 +175,9 @@
       * Output: \verbinclude FullPivHouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<FullPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+    inline const Solve<FullPivHouseholderQR, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns Expression object representing the matrix Q
       */
@@ -164,7 +191,8 @@
       return m_qr;
     }
 
-    FullPivHouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    FullPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns a const reference to the column permutation matrix */
     const PermutationType& colsPermutation() const
@@ -280,13 +308,11 @@
       *
       * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
       *       Use isInvertible() to first determine whether this matrix is invertible.
-      */    inline const
-    internal::solve_retval<FullPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
+      */
+    inline const Inverse<FullPivHouseholderQR> inverse() const
     {
       eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+      return Inverse<FullPivHouseholderQR>(*this);
     }
 
     inline Index rows() const { return m_qr.rows(); }
@@ -367,7 +393,23 @@
       */
     RealScalar maxPivot() const { return m_maxpivot; }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
+    void computeInPlace();
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     IntDiagSizeVectorType m_rows_transpositions;
@@ -405,14 +447,25 @@
   * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
+  m_qr = matrix.derived();
+  computeInPlace();
+  return *this;
+}
+
+template<typename MatrixType>
+void FullPivHouseholderQR<MatrixType>::computeInPlace()
+{
+  check_template_parameters();
+
   using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
   Index size = (std::min)(rows,cols);
 
-  m_qr = matrix;
+  
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
@@ -431,13 +484,15 @@
   for (Index k = 0; k < size; ++k)
   {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
 
-    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
-                            .cwiseAbs()
-                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    Score score = m_qr.bottomRightCorner(rows-k, cols-k)
+                      .unaryExpr(Scoring())
+                      .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k;
     col_of_biggest_in_corner += k;
+    RealScalar biggest_in_corner = internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
     if(k==0) biggest = biggest_in_corner;
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
@@ -446,15 +501,15 @@
       m_nonzero_pivots = k;
       for(Index i = k; i < size; i++)
       {
-        m_rows_transpositions.coeffRef(i) = i;
-        m_cols_transpositions.coeffRef(i) = i;
+        m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
         m_hCoeffs.coeffRef(i) = Scalar(0);
       }
       break;
     }
 
-    m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
+    m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
       ++number_of_transpositions;
@@ -481,50 +536,90 @@
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivHouseholderQR<_MatrixType>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivHouseholderQR<_MatrixType>,Rhs)
+  const Index l_rank = rank();
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  // FIXME introduce nonzeroPivots() and use it here. and more generally,
+  // make the same improvements in this dec as in FullPivLU.
+  if(l_rank==0)
   {
-    const Index rows = dec().rows(), cols = dec().cols();
-    eigen_assert(rhs().rows() == rows);
+    dst.setZero();
+    return;
+  }
 
-    // FIXME introduce nonzeroPivots() and use it here. and more generally,
-    // make the same improvements in this dec as in FullPivLU.
-    if(dec().rank()==0)
-    {
-      dst.setZero();
-      return;
-    }
+  typename RhsType::PlainObject c(rhs);
 
-    typename Rhs::PlainObject c(rhs());
+  Matrix<typename RhsType::Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+  for (Index k = 0; k < l_rank; ++k)
+  {
+    Index remainingSize = rows()-k;
+    c.row(k).swap(c.row(m_rows_transpositions.coeff(k)));
+    c.bottomRightCorner(remainingSize, rhs.cols())
+      .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1),
+                               m_hCoeffs.coeff(k), &temp.coeffRef(0));
+  }
 
-    Matrix<Scalar,1,Rhs::ColsAtCompileTime> temp(rhs().cols());
-    for (Index k = 0; k < dec().rank(); ++k)
-    {
-      Index remainingSize = rows-k;
-      c.row(k).swap(c.row(dec().rowsTranspositions().coeff(k)));
-      c.bottomRightCorner(remainingSize, rhs().cols())
-       .applyHouseholderOnTheLeft(dec().matrixQR().col(k).tail(remainingSize-1),
-                                  dec().hCoeffs().coeff(k), &temp.coeffRef(0));
-    }
+  m_qr.topLeftCorner(l_rank, l_rank)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(l_rank));
 
-    dec().matrixQR()
-       .topLeftCorner(dec().rank(), dec().rank())
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(dec().rank()));
+  for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
+  for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
+}
 
-    for(Index i = 0; i < dec().rank(); ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = dec().rank(); i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index l_rank = rank();
+
+  if(l_rank == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs);
+
+  m_qr.topLeftCorner(l_rank, l_rank)
+         .template triangularView<Upper>()
+         .transpose().template conjugateIf<Conjugate>()
+         .solveInPlace(c.topRows(l_rank));
+
+  dst.topRows(l_rank) = c.topRows(l_rank);
+  dst.bottomRows(rows()-l_rank).setZero();
+
+  Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols());
+  const Index size = (std::min)(rows(), cols());
+  for (Index k = size-1; k >= 0; --k)
+  {
+    Index remainingSize = rows()-k;
+
+    dst.bottomRightCorner(remainingSize, dst.cols())
+       .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf<!Conjugate>(),
+                                  m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0));
+
+    dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k)));
+  }
+}
+#endif
+
+namespace internal {
+  
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivHouseholderQR<MatrixType>::Scalar>, Dense2Dense>
+{
+  typedef FullPivHouseholderQR<MatrixType> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename QrType::Scalar> &)
+  {    
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
@@ -538,7 +633,6 @@
   : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
@@ -550,7 +644,7 @@
     : m_qr(qr),
       m_hCoeffs(hCoeffs),
       m_rowsTranspositions(rowsTranspositions)
-      {}
+  {}
 
   template <typename ResultType>
   void evalTo(ResultType& result) const
@@ -580,8 +674,8 @@
     }
   }
 
-    Index rows() const { return m_qr.rows(); }
-    Index cols() const { return m_qr.rows(); }
+  Index rows() const { return m_qr.rows(); }
+  Index cols() const { return m_qr.rows(); }
 
 protected:
   typename MatrixType::Nested m_qr;
@@ -589,6 +683,11 @@
   typename IntDiagSizeVectorType::Nested m_rowsTranspositions;
 };
 
+// template<typename MatrixType>
+// struct evaluator<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+//  : public evaluator<ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> > >
+// {};
+
 } // end namespace internal
 
 template<typename MatrixType>
@@ -598,7 +697,6 @@
   return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
 }
 
-#ifndef __CUDACC__
 /** \return the full-pivoting Householder QR decomposition of \c *this.
   *
   * \sa class FullPivHouseholderQR
@@ -609,7 +707,6 @@
 {
   return FullPivHouseholderQR<PlainObject>(eval());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 03c1f1b..801739f 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h

@@ -12,7 +12,19 @@
 #ifndef EIGEN_QR_H
 #define EIGEN_QR_H
 
-namespace Eigen {
+namespace Eigen { 
+
+namespace internal {
+template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
 
 /** \ingroup QR_Module
   *
@@ -21,10 +33,10 @@
   *
   * \brief Householder QR decomposition of a matrix
   *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
+  * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
   *
   * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
-  * such that
+  * such that 
   * \f[
   *  \mathbf{A} = \mathbf{Q} \, \mathbf{R}
   * \f]
@@ -37,22 +49,24 @@
   * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
   * FullPivHouseholderQR or ColPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::householderQr()
   */
 template<typename _MatrixType> class HouseholderQR
+        : public SolverBase<HouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<HouseholderQR> Base;
+    friend class SolverBase<HouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -82,29 +96,31 @@
       *
       * This constructor computes the QR factorization of the matrix \a matrix by calling
       * the method compute(). It is a short cut for:
-      *
+      * 
       * \code
       * HouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
       * qr.compute(matrix);
       * \endcode
-      *
+      * 
       * \sa compute()
       */
-    HouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit HouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
         m_temp(matrix.cols()),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
+
     /** \brief Constructs a QR factorization from a given matrix
       *
-      * This overloaded constructor is provided for inplace solving when
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
       * \c MatrixType is a Eigen::Ref.
       *
-      * \sa HouseholderQR(const MatrixType&)
+      * \sa HouseholderQR(const EigenBase&)
       */
     template<typename InputType>
     explicit HouseholderQR(EigenBase<InputType>& matrix)
@@ -116,6 +132,7 @@
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -123,9 +140,6 @@
       *
       * \returns a solution.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -134,12 +148,9 @@
       * Output: \verbinclude HouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<HouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return internal::solve_retval<HouseholderQR, Rhs>(*this, b.derived());
-    }
+    inline const Solve<HouseholderQR, Rhs>
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
       *
@@ -164,8 +175,9 @@
         return m_qr;
     }
 
-    HouseholderQR& compute(const MatrixType& matrix) {
-      m_qr = matrix;
+    template<typename InputType>
+    HouseholderQR& compute(const EigenBase<InputType>& matrix) {
+      m_qr = matrix.derived();
       computeInPlace();
       return *this;
     }
@@ -203,12 +215,26 @@
     inline Index cols() const { return m_qr.cols(); }
 
     /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      *
+      * 
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
     void computeInPlace();
 
     MatrixType m_qr;
@@ -240,7 +266,6 @@
 template<typename MatrixQR, typename HCoeffs>
 void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename MatrixQR::Scalar* tempData = 0)
 {
-  typedef typename MatrixQR::Index Index;
   typedef typename MatrixQR::Scalar Scalar;
   typedef typename MatrixQR::RealScalar RealScalar;
   Index rows = mat.rows();
@@ -278,12 +303,10 @@
   bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
 struct householder_qr_inplace_blocked
 {
-  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
-  static void run(MatrixQR& mat, HCoeffs& hCoeffs,
-      typename MatrixQR::Index maxBlockSize=32,
+  // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
       typename MatrixQR::Scalar* tempData = 0)
   {
-    typedef typename MatrixQR::Index Index;
     typedef typename MatrixQR::Scalar Scalar;
     typedef Block<MatrixQR,Dynamic,Dynamic> BlockType;
 
@@ -324,52 +347,64 @@
       if(tcols)
       {
         BlockType A21_22 = mat.block(k,k+bs,brows,tcols);
-        apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment, false);  // false == backward
+        apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment, false); // false == backward
       }
     }
   }
 };
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<HouseholderQR<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(HouseholderQR<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    const Index rows = dec().rows(), cols = dec().cols();
-    const Index rank = (std::min)(rows, cols);
-    eigen_assert(rhs().rows() == rows);
-
-    typename Rhs::PlainObject c(rhs());
-
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(
-      dec().matrixQR().leftCols(rank),
-      dec().hCoeffs().head(rank)).transpose()
-    );
-
-    dec().matrixQR()
-       .topLeftCorner(rank, rank)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(rank));
-
-    dst.topRows(rank) = c.topRows(rank);
-    dst.bottomRows(cols-rank).setZero();
-  }
-};
-
 } // end namespace internal
 
-/** Performs the QR factorization of m_qr and stores the factorization into
-  * \c *this.
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = (std::min)(rows(), cols());
+
+  typename RhsType::PlainObject c(rhs);
+
+  c.applyOnTheLeft(householderQ().setLength(rank).adjoint() );
+
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(cols()-rank).setZero();
+}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = (std::min)(rows(), cols());
+
+  typename RhsType::PlainObject c(rhs);
+
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .transpose().template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows()-rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
+#endif
+
+/** Performs the QR factorization of the given matrix \a matrix. The result of
+  * the factorization is stored into \c *this, and a reference to \c *this
+  * is returned.
   *
   * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
 void HouseholderQR<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+  
   Index rows = m_qr.rows();
   Index cols = m_qr.cols();
   Index size = (std::min)(rows,cols);
@@ -383,7 +418,6 @@
   m_isInitialized = true;
 }
 
-#ifndef __CUDACC__
 /** \return the Householder QR decomposition of \c *this.
   *
   * \sa class HouseholderQR
@@ -394,7 +428,6 @@
 {
   return HouseholderQR<PlainObject>(eval());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/QR/HouseholderQR_LAPACKE.h b/Eigen/src/QR/HouseholderQR_LAPACKE.h
new file mode 100644
index 0000000..1dc7d53
--- /dev/null
+++ b/Eigen/src/QR/HouseholderQR_LAPACKE.h

@@ -0,0 +1,68 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Householder QR decomposition of a matrix w/o pivoting based on
+ *    LAPACKE_?geqrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_QR_LAPACKE_H
+#define EIGEN_QR_LAPACKE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_QR_NOPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
+template<typename MatrixQR, typename HCoeffs> \
+struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
+{ \
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index = 32, \
+      typename MatrixQR::Scalar* = 0) \
+  { \
+    lapack_int m = (lapack_int) mat.rows(); \
+    lapack_int n = (lapack_int) mat.cols(); \
+    lapack_int lda = (lapack_int) mat.outerStride(); \
+    lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+    LAPACKE_##LAPACKE_PREFIX##geqrf( matrix_order, m, n, (LAPACKE_TYPE*)mat.data(), lda, (LAPACKE_TYPE*)hCoeffs.data()); \
+    hCoeffs.adjointInPlace(); \
+  } \
+};
+
+EIGEN_LAPACKE_QR_NOPIV(double, double, d)
+EIGEN_LAPACKE_QR_NOPIV(float, float, s)
+EIGEN_LAPACKE_QR_NOPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_QR_NOPIV(scomplex, lapack_complex_float, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_QR_LAPACKE_H

diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index a2cc2a9..013c7ae 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -32,51 +33,76 @@
   } // End namespace internal
   
 /**
- * \ingroup SPQRSupport_Module
- * \class SPQR
- * \brief Sparse QR factorization based on SuiteSparseQR library
- * 
- * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition 
- * of sparse matrices. The result is then used to solve linear leasts_square systems.
- * Clearly, a QR factorization is returned such that A*P = Q*R where :
- * 
- * P is the column permutation. Use colsPermutation() to get it.
- * 
- * Q is the orthogonal matrix represented as Householder reflectors. 
- * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
- * You can then apply it to a vector.
- * 
- * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
- * NOTE : The Index type of R is always UF_long. You can get it with SPQR::Index
- * 
- * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
- * NOTE 
- * 
- */
+  * \ingroup SPQRSupport_Module
+  * \class SPQR
+  * \brief Sparse QR factorization based on SuiteSparseQR library
+  *
+  * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition
+  * of sparse matrices. The result is then used to solve linear leasts_square systems.
+  * Clearly, a QR factorization is returned such that A*P = Q*R where :
+  *
+  * P is the column permutation. Use colsPermutation() to get it.
+  *
+  * Q is the orthogonal matrix represented as Householder reflectors.
+  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+  * You can then apply it to a vector.
+  *
+  * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
+  * NOTE : The Index type of R is always SuiteSparse_long. You can get it with SPQR::Index
+  *
+  * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
+  *
+  * \implsparsesolverconcept
+  *
+  *
+  */
 template<typename _MatrixType>
-class SPQR
+class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
 {
+  protected:
+    typedef SparseSolverBase<SPQR<_MatrixType> > Base;
+    using Base::m_isInitialized;
   public:
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef UF_long Index ; 
-    typedef SparseMatrix<Scalar, ColMajor, Index> MatrixType;
-    typedef PermutationMatrix<Dynamic, Dynamic> PermutationType;
+    typedef SuiteSparse_long StorageIndex ;
+    typedef SparseMatrix<Scalar, ColMajor, StorageIndex> MatrixType;
+    typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType;
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
   public:
     SPQR() 
-      : m_isInitialized(false),
-      m_ordering(SPQR_ORDERING_DEFAULT),
-      m_allow_tol(SPQR_DEFAULT_TOL),
-      m_tolerance (NumTraits<Scalar>::epsilon())
+      : m_analysisIsOk(false),
+        m_factorizationIsOk(false),
+        m_isRUpToDate(false),
+        m_ordering(SPQR_ORDERING_DEFAULT),
+        m_allow_tol(SPQR_DEFAULT_TOL),
+        m_tolerance (NumTraits<Scalar>::epsilon()),
+        m_cR(0),
+        m_E(0),
+        m_H(0),
+        m_HPinv(0),
+        m_HTau(0),
+        m_useDefaultThreshold(true)
     { 
       cholmod_l_start(&m_cc);
     }
     
-    SPQR(const _MatrixType& matrix) 
-    : m_isInitialized(false),
-      m_ordering(SPQR_ORDERING_DEFAULT),
-      m_allow_tol(SPQR_DEFAULT_TOL),
-      m_tolerance (NumTraits<Scalar>::epsilon())
+    explicit SPQR(const _MatrixType& matrix)
+      : m_analysisIsOk(false),
+        m_factorizationIsOk(false),
+        m_isRUpToDate(false),
+        m_ordering(SPQR_ORDERING_DEFAULT),
+        m_allow_tol(SPQR_DEFAULT_TOL),
+        m_tolerance (NumTraits<Scalar>::epsilon()),
+        m_cR(0),
+        m_E(0),
+        m_H(0),
+        m_HPinv(0),
+        m_HTau(0),
+        m_useDefaultThreshold(true)
     {
       cholmod_l_start(&m_cc);
       compute(matrix);
@@ -101,15 +127,30 @@
       if(m_isInitialized) SPQR_free();
 
       MatrixType mat(matrix);
+      
+      /* Compute the default threshold as in MatLab, see:
+       * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+       * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
+       */
+      RealScalar pivotThreshold = m_tolerance;
+      if(m_useDefaultThreshold) 
+      {
+        RealScalar max2Norm = 0.0;
+        for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm());
+        if(max2Norm==RealScalar(0))
+          max2Norm = RealScalar(1);
+        pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
+      }
       cholmod_sparse A; 
       A = viewAsCholmod(mat);
+      m_rows = matrix.rows();
       Index col = matrix.cols();
-      m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A, 
+      m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, 
                              &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
 
       if (!m_cR)
       {
-        m_info = NumericalIssue; 
+        m_info = NumericalIssue;
         m_isInitialized = false;
         return;
       }
@@ -120,41 +161,37 @@
     /** 
      * Get the number of rows of the input matrix and the Q matrix
      */
-    inline Index rows() const {return m_H->nrow; }
+    inline Index rows() const {return m_rows; }
     
     /** 
      * Get the number of columns of the input matrix. 
      */
     inline Index cols() const { return m_cR->ncol; }
-   
-      /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SPQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
-      eigen_assert(this->rows()==B.rows()
-                    && "SPQR::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SPQR, Rhs>(*this, B.derived());
-    }
     
     template<typename Rhs, typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
       eigen_assert(b.cols()==1 && "This method is for vectors only");
-      
+
       //Compute Q^T * b
-      typename Dest::PlainObject y;
+      typename Dest::PlainObject y, y2;
       y = matrixQ().transpose() * b;
-        // Solves with the triangular matrix R
+      
+      // Solves with the triangular matrix R
       Index rk = this->rank();
-      y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk));
-      y.bottomRows(cols()-rk).setZero();
+      y2 = y;
+      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
+      y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk));
+
       // Apply the column permutation 
-      dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
+      // colsPermutation() performs a copy of the permutation,
+      // so let's apply it manually:
+      for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i);
+      for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero();
+      
+//       y.bottomRows(y.rows()-rk).setZero();
+//       dest = colsPermutation() * y.topRows(cols());
       
       m_info = Success;
     }
@@ -165,7 +202,7 @@
     {
       eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
       if(!m_isRUpToDate) {
-        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::Index>(*m_cR);
+        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::StorageIndex>(*m_cR);
         m_isRUpToDate = true;
       }
       return m_R;
@@ -179,11 +216,7 @@
     PermutationType colsPermutation() const
     { 
       eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      Index n = m_cR->ncol;
-      PermutationType colsPerm(n);
-      for(Index j = 0; j <n; j++) colsPerm.indices()(j) = m_E[j];
-      return colsPerm; 
-      
+      return PermutationType(m_E, m_cR->ncol);
     }
     /**
      * Gets the rank of the matrix. 
@@ -197,7 +230,11 @@
     /// Set the fill-reducing ordering method to be used
     void setSPQROrdering(int ord) { m_ordering = ord;}
     /// Set the tolerance tol to treat columns with 2-norm < =tol as zero
-    void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; }
+    void setPivotThreshold(const RealScalar& tol)
+    {
+      m_useDefaultThreshold = false;
+      m_tolerance = tol;
+    }
     
     /** \returns a pointer to the SPQR workspace */
     cholmod_common *cholmodCommon() const { return &m_cc; }
@@ -205,7 +242,7 @@
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the sparse QR can not be computed
       */
     ComputationInfo info() const
@@ -214,7 +251,6 @@
       return m_info;
     }
   protected:
-    bool m_isInitialized;
     bool m_analysisIsOk;
     bool m_factorizationIsOk;
     mutable bool m_isRUpToDate;
@@ -224,12 +260,14 @@
     RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero
     mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format
     mutable MatrixType m_R; // The sparse matrix R in Eigen format
-    mutable Index *m_E; // The permutation applied to columns
+    mutable StorageIndex *m_E; // The permutation applied to columns
     mutable cholmod_sparse *m_H;  //The householder vectors
-    mutable Index *m_HPinv; // The row permutation of H
+    mutable StorageIndex *m_HPinv; // The row permutation of H
     mutable cholmod_dense *m_HTau; // The Householder coefficients
     mutable Index m_rank; // The rank of the matrix
     mutable cholmod_common m_cc; // Workspace and parameters
+    bool m_useDefaultThreshold;     // Use default threshold
+    Index m_rows;
     template<typename ,typename > friend struct SPQR_QProduct;
 };
 
@@ -237,7 +275,7 @@
 struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType,Derived> >
 {
   typedef typename SPQRType::Scalar Scalar;
-  typedef typename SPQRType::Index Index;
+  typedef typename SPQRType::StorageIndex StorageIndex;
   //Define the constructor to get reference to argument types
   SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose) : m_spqr(spqr),m_other(other),m_transpose(transpose) {}
   
@@ -293,22 +331,5 @@
   const SPQRType& m_spqr;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<SPQR<_MatrixType>, Rhs>
-  : solve_retval_base<SPQR<_MatrixType>, Rhs>
-{
-  typedef SPQR<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 }// End namespace Eigen
 #endif

diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index debbf70..a76a8dd 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h

@@ -1,9 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
-//
+// 
 // We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
 // research report written by Ming Gu and Stanley C.Eisenstat
-// The code variable names correspond to the names they used in their
+// The code variable names correspond to the names they used in their 
 // report
 //
 // Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
@@ -11,7 +11,7 @@
 // Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
 // Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
 // Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,25 +22,35 @@
 // #define EIGEN_BDCSVD_DEBUG_VERBOSE
 // #define EIGEN_BDCSVD_SANITY_CHECKS
 
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+#undef eigen_internal_assert
+#define eigen_internal_assert(X) assert(X);
+#endif
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+#include <iostream>
+#endif
+
 namespace Eigen {
 
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
 IOFormat bdcsvdfmt(8, 0, ", ", "\n", "  [", "]");
 #endif
-
+  
 template<typename _MatrixType> class BDCSVD;
 
 namespace internal {
 
-template<typename _MatrixType>
+template<typename _MatrixType> 
 struct traits<BDCSVD<_MatrixType> >
+        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
-};
+};  
 
 } // end namespace internal
-
-
+  
+  
 /** \ingroup SVD_Module
  *
  *
@@ -57,40 +67,41 @@
  * recommended and can several order of magnitude faster.
  *
  * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations.
- * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
  * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
  * significantly degrade the accuracy.
  *
  * \sa class JacobiSVD
  */
-template<typename _MatrixType>
+template<typename _MatrixType> 
 class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >
 {
   typedef SVDBase<BDCSVD> Base;
-
+    
 public:
   using Base::rows;
   using Base::cols;
   using Base::computeU;
   using Base::computeV;
-
+  
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  typedef typename NumTraits<RealScalar>::Literal Literal;
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime),
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime),
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime, 
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime, 
+    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), 
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, 
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, 
+    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), 
     MatrixOptions = MatrixType::Options
   };
 
   typedef typename Base::MatrixUType MatrixUType;
   typedef typename Base::MatrixVType MatrixVType;
   typedef typename Base::SingularValuesType SingularValuesType;
-
+  
   typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> MatrixX;
   typedef Matrix<RealScalar, Dynamic, Dynamic, ColMajor> MatrixXr;
   typedef Matrix<RealScalar, Dynamic, 1> VectorType;
@@ -104,7 +115,7 @@
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via BDCSVD::compute(const MatrixType&).
    */
-  BDCSVD() : m_algoswap(16), m_numIters(0)
+  BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0)
   {}
 
 
@@ -124,7 +135,7 @@
    *
    * \param matrix the matrix to decompose
    * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU,
+   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
    *                           #ComputeFullV, #ComputeThinV.
    *
    * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
@@ -136,15 +147,15 @@
     compute(matrix, computationOptions);
   }
 
-  ~BDCSVD()
+  ~BDCSVD() 
   {
   }
-
+  
   /** \brief Method performing the decomposition of given matrix using custom options.
    *
    * \param matrix the matrix to decompose
    * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU,
+   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
    *                           #ComputeFullV, #ComputeThinV.
    *
    * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
@@ -163,30 +174,12 @@
     return compute(matrix, this->m_computationOptions);
   }
 
-  void setSwitchSize(int s)
+  void setSwitchSize(int s) 
   {
-    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3");
+    eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3.");
     m_algoswap = s;
   }
-
-  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-   *
-   * \param b the right-hand-side of the equation to solve.
-   *
-   * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-   *
-   * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-   * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-   */
-  template<typename Rhs>
-  inline const internal::solve_retval<BDCSVD, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(this->m_isInitialized && "BDCSVD is not initialized.");
-    eigen_assert(Base::computeU() && Base::computeV() && "BDCSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-    return internal::solve_retval<BDCSVD<_MatrixType>, Rhs>(*this, b.derived());
-  }
-
+ 
 private:
   void allocate(Index rows, Index cols, unsigned int computationOptions);
   void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
@@ -210,7 +203,7 @@
   ArrayXi m_workspaceI;
   int m_algoswap;
   bool m_isTranspose, m_compU, m_compV;
-
+  
   using Base::m_singularValues;
   using Base::m_diagSize;
   using Base::m_computeFullU;
@@ -219,40 +212,41 @@
   using Base::m_computeThinV;
   using Base::m_matrixU;
   using Base::m_matrixV;
+  using Base::m_info;
   using Base::m_isInitialized;
   using Base::m_nonzeroSingularValues;
 
-public:
+public:  
   int m_numIters;
 }; //end class BDCSVD
 
 
 // Method to allocate and initialize matrix and attributes
 template<typename MatrixType>
-void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
+void BDCSVD<MatrixType>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
 {
   m_isTranspose = (cols > rows);
 
   if (Base::allocate(rows, cols, computationOptions))
     return;
-
+  
   m_computed = MatrixXr::Zero(m_diagSize + 1, m_diagSize );
   m_compU = computeV();
   m_compV = computeU();
   if (m_isTranspose)
     std::swap(m_compU, m_compV);
-
+  
   if (m_compU) m_naiveU = MatrixXr::Zero(m_diagSize + 1, m_diagSize + 1 );
   else         m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 );
-
+  
   if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize);
-
+  
   m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3);
   m_workspaceI.resize(3*m_diagSize);
 }// end allocate
 
 template<typename MatrixType>
-BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions)
+BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions) 
 {
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "\n\n\n======================================================================================================================\n\n\n";
@@ -261,27 +255,36 @@
   using std::abs;
 
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
-
+  
   //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
   if(matrix.cols() < m_algoswap)
   {
     // FIXME this line involves temporaries
     JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
-    if(computeU()) m_matrixU = jsvd.matrixU();
-    if(computeV()) m_matrixV = jsvd.matrixV();
-    m_singularValues = jsvd.singularValues();
-    m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
     m_isInitialized = true;
+    m_info = jsvd.info();
+    if (m_info == Success || m_info == NoConvergence) {
+      if(computeU()) m_matrixU = jsvd.matrixU();
+      if(computeV()) m_matrixV = jsvd.matrixV();
+      m_singularValues = jsvd.singularValues();
+      m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
+    }
+    return *this;
+  }
+  
+  //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
     return *this;
   }
 
-  //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
-  RealScalar scale = matrix.cwiseAbs().maxCoeff();
-  if(scale==RealScalar(0)) scale = RealScalar(1);
+  if(scale==Literal(0)) scale = Literal(1);
   MatrixX copy;
   if (m_isTranspose) copy = matrix.adjoint()/scale;
   else               copy = matrix/scale;
-
+  
   //**** step 1 - Bidiagonalization
   // FIXME this line involves temporaries
   internal::UpperBidiagonalization<MatrixX> bid(copy);
@@ -293,7 +296,11 @@
   m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
   m_computed.template bottomRows<1>().setZero();
   divide(0, m_diagSize - 1, 0, 0, 0);
-
+  if (m_info != Success && m_info != NoConvergence) {
+    m_isInitialized = true;
+    return *this;
+  }
+    
   //**** step 3 - Copy singular values and vectors
   for (int i=0; i<m_diagSize; i++)
   {
@@ -369,20 +376,20 @@
     Index k1=0, k2=0;
     for(Index j=0; j<n; ++j)
     {
-      if( (A.col(j).head(n1).array()!=0).any() )
+      if( (A.col(j).head(n1).array()!=Literal(0)).any() )
       {
         A1.col(k1) = A.col(j).head(n1);
         B1.row(k1) = B.row(j);
         ++k1;
       }
-      if( (A.col(j).tail(n2).array()!=0).any() )
+      if( (A.col(j).tail(n2).array()!=Literal(0)).any() )
       {
         A2.col(k2) = A.col(j).tail(n2);
         B2.row(k2) = B.row(j);
         ++k2;
       }
     }
-
+  
     A.topRows(n1).noalias()    = A1.leftCols(k1) * B1.topRows(k1);
     A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
   }
@@ -394,18 +401,18 @@
   }
 }
 
-// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the
+// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the 
 // place of the submatrix we are currently working on.
 
 //@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
-//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
+//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
 // lastCol + 1 - firstCol is the size of the submatrix.
 //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
-//@param firstRowW : Same as firstRowW with the column.
-//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix
+//@param firstColW : Same as firstRowW with the column.
+//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
 // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
 template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
+void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
 {
   // requires rows = cols + 1;
   using std::pow;
@@ -415,19 +422,21 @@
   const Index k = n/2;
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   RealScalar alphaK;
-  RealScalar betaK;
-  RealScalar r0;
+  RealScalar betaK; 
+  RealScalar r0; 
   RealScalar lambda, phi, c0, s0;
   VectorType l, f;
-  // We use the other algorithm which is more efficient for small
+  // We use the other algorithm which is more efficient for small 
   // matrices.
   if (n < m_algoswap)
   {
     // FIXME this line involves temporaries
     JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
+    m_info = b.info();
+    if (m_info != Success && m_info != NoConvergence) return;
     if (m_compU)
       m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
-    else
+    else 
     {
       m_naiveU.row(0).segment(firstCol, n + 1).real() = b.matrixU().row(0);
       m_naiveU.row(1).segment(firstCol, n + 1).real() = b.matrixU().row(n);
@@ -441,17 +450,19 @@
   alphaK =  m_computed(firstCol + k, firstCol + k);
   betaK = m_computed(firstCol + k + 1, firstCol + k);
   // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
-  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the
-  // right submatrix before the left one.
+  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
+  // right submatrix before the left one. 
   divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  if (m_info != Success && m_info != NoConvergence) return;
   divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+  if (m_info != Success && m_info != NoConvergence) return;
 
   if (m_compU)
   {
     lambda = m_naiveU(firstCol + k, firstCol + k);
     phi = m_naiveU(firstCol + k + 1, lastCol + 1);
-  }
-  else
+  } 
+  else 
   {
     lambda = m_naiveU(1, firstCol + k);
     phi = m_naiveU(0, lastCol + 1);
@@ -461,69 +472,69 @@
   {
     l = m_naiveU.row(firstCol + k).segment(firstCol, k);
     f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
-  }
-  else
+  } 
+  else 
   {
     l = m_naiveU.row(1).segment(firstCol, k);
     f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
   }
-  if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
+  if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1);
   if (r0<considerZero)
   {
-    c0 = 1;
-    s0 = 0;
+    c0 = Literal(1);
+    s0 = Literal(0);
   }
   else
   {
     c0 = alphaK * lambda / r0;
     s0 = betaK * phi / r0;
   }
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
-
+  
   if (m_compU)
   {
-    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));
+    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));     
     // we shiftW Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--)
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
       m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
     // we shift q1 at the left with a factor c0
     m_naiveU.col(firstCol).segment( firstCol, k + 1) = (q1 * c0);
     // last column = q1 * - s0
     m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * ( - s0));
     // first column = q2 * s0
-    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0;
+    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0; 
     // q2 *= c0
     m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
-  }
-  else
+  } 
+  else 
   {
     RealScalar q1 = m_naiveU(0, firstCol + k);
     // we shift Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--)
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
       m_naiveU(0, i + 1) = m_naiveU(0, i);
     // we shift q1 at the left with a factor c0
     m_naiveU(0, firstCol) = (q1 * c0);
     // last column = q1 * - s0
     m_naiveU(0, lastCol + 1) = (q1 * ( - s0));
     // first column = q2 * s0
-    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0;
+    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; 
     // q2 *= c0
     m_naiveU(1, lastCol + 1) *= c0;
     m_naiveU.row(1).segment(firstCol + 1, k).setZero();
     m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
   }
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
-
+  
   m_computed(firstCol + shift, firstCol + shift) = r0;
   m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real();
   m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real();
@@ -544,17 +555,17 @@
 //   assert(count<681);
 //   assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all());
 #endif
-
+  
   // Third part: compute SVD of combined matrix
   MatrixXr UofSVD, VofSVD;
   VectorType singVals;
   computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(UofSVD.allFinite());
   assert(VofSVD.allFinite());
 #endif
-
+  
   if (m_compU)
     structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
   else
@@ -563,15 +574,15 @@
     tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD;
     m_naiveU.middleCols(firstCol, n + 1) = tmp;
   }
-
+  
   if (m_compV)  structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2);
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
-
+  
   m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
   m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
 }// end divide
@@ -585,14 +596,14 @@
 // handling of round-off errors, be consistent in ordering
 // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
 template <typename MatrixType>
-void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
+void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
 {
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   using std::abs;
   ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
   m_workspace.head(n) =  m_computed.block(firstCol, firstCol, n, n).diagonal();
   ArrayRef diag = m_workspace.head(n);
-  diag(0) = 0;
+  diag(0) = Literal(0);
 
   // Allocate space for singular values and vectors
   singVals.resize(n);
@@ -603,18 +614,18 @@
   if (col0.hasNaN() || diag.hasNaN())
     std::cout << "\n\nHAS NAN\n\n";
 #endif
-
+  
   // Many singular values might have been deflated, the zero ones have been moved to the end,
   // but others are interleaved and we must ignore them at this stage.
   // To this end, let's compute a permutation skipping them:
   Index actual_n = n;
-  while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
+  while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); }
   Index m = 0; // size of the deflated problem
   for(Index k=0;k<actual_n;++k)
     if(abs(col0(k))>considerZero)
       m_workspaceI(m++) = k;
   Map<ArrayXi> perm(m_workspaceI.data(),m);
-
+  
   Map<ArrayXr> shifts(m_workspace.data()+1*n, n);
   Map<ArrayXr> mus(m_workspace.data()+2*n, n);
   Map<ArrayXr> zhat(m_workspace.data()+3*n, n);
@@ -624,60 +635,58 @@
   std::cout << "  z: " << col0.transpose() << "\n";
   std::cout << "  d: " << diag.transpose() << "\n";
 #endif
-
+  
   // Compute singVals, shifts, and mus
   computeSingVals(col0, diag, perm, singVals, shifts, mus);
-
+  
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "  j:        " << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse() << "\n\n";
   std::cout << "  sing-val: " << singVals.transpose() << "\n";
   std::cout << "  mu:       " << mus.transpose() << "\n";
   std::cout << "  shift:    " << shifts.transpose() << "\n";
-
+  
   {
-    Index actual_n = n;
-    while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
     std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
     std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all());
     std::cout << "    check2 (>0)      : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
-    std::cout << "    check3 (>0)      : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
-    std::cout << "    check4 (>0)      : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
+    assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all());
   }
 #endif
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(singVals.allFinite());
   assert(mus.allFinite());
   assert(shifts.allFinite());
 #endif
-
+  
   // Compute zhat
   perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "  zhat: " << zhat.transpose() << "\n";
 #endif
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(zhat.allFinite());
 #endif
-
+  
   computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
-
+  
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() << "\n";
   std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() << "\n";
 #endif
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  assert(U.allFinite());
-  assert(V.allFinite());
-  assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
-  assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
+  assert(U.allFinite());
+  assert(V.allFinite());
+//   assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
+//   assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
 #endif
-
+  
   // Because of deflation, the singular values might not be completely sorted.
   // Fortunately, reordering them is a O(n) problem
   for(Index i=0; i<actual_n-1; ++i)
@@ -691,12 +700,21 @@
     }
   }
 
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  {
+    bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all();
+    if(!singular_values_sorted)
+      std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n";
+    assert(singular_values_sorted);
+  }
+#endif
+  
   // Reverse order so that singular values in increased order
   // Because of deflation, the zeros singular-values are already at the end
   singVals.head(actual_n).reverseInPlace();
   U.leftCols(actual_n).rowwise().reverseInPlace();
   if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
-
+  
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) );
   std::cout << "  * j:        " << jsvd.singularValues().transpose() << "\n\n";
@@ -709,11 +727,13 @@
 typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
 {
   Index m = perm.size();
-  RealScalar res = 1;
+  RealScalar res = Literal(1);
   for(Index i=0; i<m; ++i)
   {
     Index j = perm(i);
-    res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
+    // The following expression could be rewritten to involve only a single division,
+    // but this would make the expression more sensitive to overflow.
+    res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
   }
   return res;
 
@@ -725,22 +745,25 @@
 {
   using std::abs;
   using std::swap;
+  using std::sqrt;
 
   Index n = col0.size();
   Index actual_n = n;
-  while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+  // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
+  // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
+  while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n;
 
   for (Index k = 0; k < n; ++k)
   {
-    if (col0(k) == 0 || actual_n==1)
+    if (col0(k) == Literal(0) || actual_n==1)
     {
       // if col0(k) == 0, then entry is deflated, so singular value is on diagonal
       // if actual_n==1, then the deflated problem is already diagonalized
       singVals(k) = k==0 ? col0(0) : diag(k);
-      mus(k) = 0;
+      mus(k) = Literal(0);
       shifts(k) = k==0 ? col0(0) : diag(k);
       continue;
-    }
+    } 
 
     // otherwise, use secular equation to find singular value
     RealScalar left = diag(k);
@@ -749,36 +772,56 @@
       right = (diag(actual_n-1) + col0.matrix().norm());
     else
     {
-      // Skip deflated singular values
+      // Skip deflated singular values,
+      // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
+      // This should be equivalent to using perm[]
       Index l = k+1;
-      while(col0(l)==0) { ++l; eigen_internal_assert(l<actual_n); }
+      while(col0(l)==Literal(0)) { ++l; eigen_internal_assert(l<actual_n); }
       right = diag(l);
     }
 
     // first decide whether it's closer to the left end or the right end
-    RealScalar mid = left + (right-left) / 2;
-    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, 0);
+    RealScalar mid = left + (right-left) / Literal(2);
+    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-    std::cout << right-left << "\n";
-    std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right)   << "\n";
-    std::cout << "     = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
+    std::cout << "right-left = " << right-left << "\n";
+//     std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
+//                            << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right)   << "\n";
+    std::cout << "     = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.1)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.2)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.3)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.4)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.49)    *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.5)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.51)    *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.6)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.7)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.8)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.9)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n";
 #endif
-    RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right;
-
+    RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right;
+    
     // measure everything relative to shift
     Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
     diagShifted = diag - shift;
 
+    if(k!=actual_n-1)
+    {
+      // check that after the shift, f(mid) is still negative:
+      RealScalar midShifted = (right - left) / RealScalar(2);
+      if(shift==right)
+        midShifted = -midShifted;
+      RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+      if(fMidShifted>0)
+      {
+        // fMid was erroneous, fix it:
+        shift =  fMidShifted > Literal(0) ? left : right;
+        diagShifted = diag - shift;
+      }
+    }
+    
     // initial guess
     RealScalar muPrev, muCur;
     if (shift == left)
@@ -803,26 +846,29 @@
 
     // rational interpolation: fit a function of the form a / mu + b through the two previous
     // iterates and use its zero to compute the next iterate
-    bool useBisection = fPrev*fCur>0;
-    while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
+    bool useBisection = fPrev*fCur>Literal(0);
+    while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
     {
       ++m_numIters;
 
       // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
-      RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev);
+      RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev);
       RealScalar b = fCur - a / muCur;
       // And find mu such that f(mu)==0:
       RealScalar muZero = -a/b;
       RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
 
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      assert((numext::isfinite)(fZero));
+#endif
+      
       muPrev = muCur;
       fPrev = fCur;
       muCur = muZero;
       fCur = fZero;
-
-
-      if (shift == left  && (muCur < 0 || muCur > right - left)) useBisection = true;
-      if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true;
+      
+      if (shift == left  && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
+      if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
       if (abs(fCur)>abs(fPrev)) useBisection = true;
     }
 
@@ -835,54 +881,100 @@
       RealScalar leftShifted, rightShifted;
       if (shift == left)
       {
-        leftShifted = (std::numeric_limits<RealScalar>::min)();
+        // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
+        // the factor 2 is to be more conservative
+        leftShifted = numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+
+        // check that we did it right:
+        eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) );
         // I don't understand why the case k==0 would be special there:
         // if (k == 0) rightShifted = right - left; else
-        rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
+        rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe
       }
       else
       {
-        leftShifted = -(right - left) * RealScalar(0.6);
-        rightShifted = -(std::numeric_limits<RealScalar>::min)();
+        leftShifted = -(right - left) * RealScalar(0.51);
+        if(k+1<n)
+          rightShifted = -numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+        else
+          rightShifted = -(std::numeric_limits<RealScalar>::min)();
       }
 
       RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
+      eigen_internal_assert(fLeft<Literal(0));
 
-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
+#if defined EIGEN_BDCSVD_DEBUG_VERBOSE || defined EIGEN_BDCSVD_SANITY_CHECKS || defined EIGEN_INTERNAL_DEBUGGING
       RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
 #endif
 
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      if(!(numext::isfinite)(fLeft))
+        std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
+      assert((numext::isfinite)(fLeft));
+
+      if(!(numext::isfinite)(fRight))
+        std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
+      // assert((numext::isfinite)(fRight));
+#endif
+    
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
       if(!(fLeft * fRight<0))
       {
-        std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose()  << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
-        std::cout << k << " : " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  " << left << " - " << right << " -> " <<  leftShifted << " " << rightShifted << "   shift=" << shift << "\n";
+        std::cout << "f(leftShifted) using  leftShifted=" << leftShifted << " ;  diagShifted(1:10):" << diagShifted.head(10).transpose()  << "\n ; "
+                  << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n";
+        std::cout << "k=" << k << ", " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  "
+                  << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift
+                  << " ,  f(right)=" << secularEq(0,     col0, diag, perm, diagShifted, shift)
+                           << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n";
       }
 #endif
-      eigen_internal_assert(fLeft * fRight < 0);
+      eigen_internal_assert(fLeft * fRight < Literal(0));
 
-      while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
+      if(fLeft<Literal(0))
       {
-        RealScalar midShifted = (leftShifted + rightShifted) / 2;
-        fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
-        if (fLeft * fMid < 0)
+        while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
         {
-          rightShifted = midShifted;
+          RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
+          fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+          eigen_internal_assert((numext::isfinite)(fMid));
+
+          if (fLeft * fMid < Literal(0))
+          {
+            rightShifted = midShifted;
+          }
+          else
+          {
+            leftShifted = midShifted;
+            fLeft = fMid;
+          }
         }
-        else
-        {
-          leftShifted = midShifted;
-          fLeft = fMid;
-        }
+        muCur = (leftShifted + rightShifted) / Literal(2);
       }
-
-      muCur = (leftShifted + rightShifted) / 2;
+      else 
+      {
+        // We have a problem as shifting on the left or right give either a positive or negative value
+        // at the middle of [left,right]...
+        // Instead fo abbording or entering an infinite loop,
+        // let's just use the middle as the estimated zero-crossing:
+        muCur = (right - left) * RealScalar(0.5);
+        if(shift == right)
+          muCur = -muCur;
+      }
     }
-
+      
     singVals[k] = shift + muCur;
     shifts[k] = shift;
     mus[k] = muCur;
 
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+    if(k+1<n)
+      std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. "  << diag(k+1) << "\n";
+#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+    assert(k==0 || singVals[k]>=singVals[k-1]);
+    assert(singVals[k]>=diag(k));
+#endif
+
     // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
     // (deflation is supposed to avoid this from happening)
     // - this does no seem to be necessary anymore -
@@ -906,37 +998,68 @@
     zhat.setZero();
     return;
   }
-  Index last = perm(m-1);
+  Index lastIdx = perm(m-1);
   // The offset permits to skip deflated entries while computing zhat
   for (Index k = 0; k < n; ++k)
   {
-    if (col0(k) == 0) // deflated
-      zhat(k) = 0;
+    if (col0(k) == Literal(0)) // deflated
+      zhat(k) = Literal(0);
     else
     {
       // see equation (3.6)
       RealScalar dk = diag(k);
-      RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
+      RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      if(prod<0) {
+        std::cout << "k = " << k << " ;  z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
+        std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n";
+        std::cout << "     = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) <<  "\n";
+      }
+      assert(prod>=0);
+#endif
 
       for(Index l = 0; l<m; ++l)
       {
         Index i = perm(l);
         if(i!=k)
         {
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          if(i>=k && (l==0 || l-1>=m))
+          {
+            std::cout << "Error in perturbCol0\n";
+            std::cout << "  " << k << "/" << n << " "  << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " "  <<  "\n";
+            std::cout << "  " <<diag(i) << "\n";
+            Index j = (i<k /*|| l==0*/) ? i : perm(l-1);
+            std::cout << "  " << "j=" << j << "\n";
+          }
+#endif
           Index j = i<k ? i : perm(l-1);
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
+          {
+            std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
+          }
+          assert(dk!=Literal(0) || diag(i)!=Literal(0));
+#endif
           prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          assert(prod>=0);
+#endif
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-          if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
+          if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
             std::cout << "     " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
                        << ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n";
 #endif
         }
       }
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
+      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n";
 #endif
       RealScalar tmp = sqrt(prod);
-      zhat(k) = col0(k) > 0 ? tmp : -tmp;
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      assert((numext::isfinite)(tmp));
+#endif
+      zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
     }
   }
 }
@@ -949,10 +1072,10 @@
 {
   Index n = zhat.size();
   Index m = perm.size();
-
+  
   for (Index k = 0; k < n; ++k)
   {
-    if (zhat(k) == 0)
+    if (zhat(k) == Literal(0))
     {
       U.col(k) = VectorType::Unit(n+1, k);
       if (m_compV) V.col(k) = VectorType::Unit(n, k);
@@ -965,9 +1088,9 @@
         Index i = perm(l);
         U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
       }
-      U(n,k) = 0;
+      U(n,k) = Literal(0);
       U.col(k).normalize();
-
+    
       if (m_compV)
       {
         V.col(k).setZero();
@@ -976,7 +1099,7 @@
           Index i = perm(l);
           V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
         }
-        V(0,k) = -1;
+        V(0,k) = Literal(-1);
         V.col(k).normalize();
       }
     }
@@ -989,7 +1112,7 @@
 // i >= 1, di almost null and zi non null.
 // We use a rotation to zero out zi applied to the left of M
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size)
+void BDCSVD<MatrixType>::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size)
 {
   using std::abs;
   using std::sqrt;
@@ -997,16 +1120,16 @@
   Index start = firstCol + shift;
   RealScalar c = m_computed(start, start);
   RealScalar s = m_computed(start+i, start);
-  RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
-  if (r == 0)
+  RealScalar r = numext::hypot(c,s);
+  if (r == Literal(0))
   {
-    m_computed(start+i, start+i) = 0;
+    m_computed(start+i, start+i) = Literal(0);
     return;
   }
-  m_computed(start,start) = r;
-  m_computed(start+i, start) = 0;
-  m_computed(start+i, start+i) = 0;
-
+  m_computed(start,start) = r;  
+  m_computed(start+i, start) = Literal(0);
+  m_computed(start+i, start+i) = Literal(0);
+  
   JacobiRotation<RealScalar> J(c/r,-s/r);
   if (m_compU)  m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J);
   else          m_naiveU.applyOnTheRight(firstCol, firstCol+i, J);
@@ -1018,7 +1141,7 @@
 // We apply two rotations to have zj = 0;
 // TODO deflation44 is still broken and not properly tested
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size)
+void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size)
 {
   using std::abs;
   using std::sqrt;
@@ -1038,7 +1161,7 @@
     << m_computed(firstColm + i+1, firstColm+i+1) << " "
     << m_computed(firstColm + i+2, firstColm+i+2) << "\n";
 #endif
-  if (r==0)
+  if (r==Literal(0))
   {
     m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
     return;
@@ -1047,7 +1170,7 @@
   s/=r;
   m_computed(firstColm + i, firstColm) = r;
   m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
-  m_computed(firstColm + j, firstColm) = 0;
+  m_computed(firstColm + j, firstColm) = Literal(0);
 
   JacobiRotation<RealScalar> J(c,-s);
   if (m_compU)  m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J);
@@ -1058,34 +1181,34 @@
 
 // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift)
+void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
 {
   using std::sqrt;
   using std::abs;
   const Index length = lastCol + 1 - firstCol;
-
+  
   Block<MatrixXr,Dynamic,1> col0(m_computed, firstCol+shift, firstCol+shift, length, 1);
   Diagonal<MatrixXr> fulldiag(m_computed);
   VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length);
-
+  
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
   RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
-  RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
-
+  RealScalar epsilon_coarse = Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
 #endif
 
-#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE  
   std::cout << "\ndeflate:" << diag.head(k+1).transpose() << "  |  " << diag.segment(k+1,length-k-1).transpose() << "\n";
 #endif
-
+  
   //condition 4.1
   if (diag(0) < epsilon_coarse)
-  {
+  { 
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
     std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n";
 #endif
@@ -1099,7 +1222,7 @@
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
       std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << "  (diag(" << i << ")=" << diag(i) << ")\n";
 #endif
-      col0(i) = 0;
+      col0(i) = Literal(0);
     }
 
   //condition 4.3
@@ -1119,24 +1242,25 @@
 #endif
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+  std::cout << "            : " << col0.transpose() << "\n\n";
 #endif
   {
     // Check for total deflation
-    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
-    bool total_deflation = (col0.tail(length-1).array()<considerZero).all();
-
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
+    const bool total_deflation = (col0.tail(length-1).array().abs()<considerZero).all();
+    
     // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
     // First, compute the respective permutation.
     Index *permutation = m_workspaceI.data();
     {
       permutation[0] = 0;
       Index p = 1;
-
+      
       // Move deflated diagonal entries at the end.
       for(Index i=1; i<length; ++i)
         if(abs(diag(i))<considerZero)
           permutation[p++] = i;
-
+        
       Index i=1, j=k+1;
       for( ; p < length; ++p)
       {
@@ -1146,7 +1270,7 @@
         else                        permutation[p] = i++;
       }
     }
-
+    
     // If we have a total deflation, then we have to insert diag(0) at the right place
     if(total_deflation)
     {
@@ -1162,22 +1286,22 @@
         }
       }
     }
-
+    
     // Current index of each col, and current column of each index
     Index *realInd = m_workspaceI.data()+length;
     Index *realCol = m_workspaceI.data()+2*length;
-
+    
     for(int pos = 0; pos< length; pos++)
     {
       realCol[pos] = pos;
       realInd[pos] = pos;
     }
-
+    
     for(Index i = total_deflation?0:1; i < length; i++)
     {
       const Index pi = permutation[length - (total_deflation ? i+1 : i)];
       const Index J = realCol[pi];
-
+      
       using std::swap;
       // swap diagonal and first column entries:
       swap(diag(i), diag(J));
@@ -1200,7 +1324,7 @@
   std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";
   std::cout << "      : " << col0.transpose() << "\n\n";
 #endif
-
+    
   //condition 4.4
   {
     Index i = length-1;
@@ -1209,18 +1333,18 @@
        if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
       {
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-        std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*diag(i) << "\n";
+        std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*/*diag(i)*/maxDiag << "\n";
 #endif
         eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
         deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
       }
   }
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   for(Index j=2;j<length;++j)
     assert(diag(j-1)<=diag(j) || abs(diag(j))<considerZero);
 #endif
-
+  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
@@ -1228,24 +1352,6 @@
 #endif
 }//end deflation
 
-
-namespace internal {
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<BDCSVD<_MatrixType>, Rhs>
-  : solve_retval_base<BDCSVD<_MatrixType>, Rhs>
-{
-  typedef BDCSVD<_MatrixType> BDCSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(BDCSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve_impl(rhs(), dst);
-  }
-};
-} // end namespace internal
-
-
-#ifndef __CUDACC__
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
@@ -1258,7 +1364,6 @@
 {
   return BDCSVD<PlainObject>(*this, computationOptions);
 }
-#endif
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 7e91250..9d95acd 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h

@@ -11,7 +11,7 @@
 #ifndef EIGEN_JACOBISVD_H
 #define EIGEN_JACOBISVD_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
 // forward declaration (needed by ICC)
@@ -114,8 +114,10 @@
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     Options = MatrixType::Options
   };
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+
+  typedef typename internal::make_proper_matrix_type<
+    Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime
+  >::type TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
   {
@@ -203,8 +205,9 @@
     Options = MatrixType::Options
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+  typedef typename internal::make_proper_matrix_type<
+    Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime
+  >::type TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
   {
@@ -299,8 +302,9 @@
     Options = MatrixType::Options
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+  typedef typename internal::make_proper_matrix_type<
+    Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime
+  >::type TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
   {
@@ -350,8 +354,8 @@
 struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
 {
   typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
+  typedef typename MatrixType::RealScalar RealScalar;
+  static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; }
 };
 
 template<typename MatrixType, int QRPreconditioner>
@@ -360,24 +364,36 @@
   typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
+  static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry)
   {
     using std::sqrt;
+    using std::abs;
     Scalar z;
     JacobiRotation<Scalar> rot;
     RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
+
+    const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+    const RealScalar precision = NumTraits<Scalar>::epsilon();
+
     if(n==0)
     {
-      z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-      work_matrix.row(p) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
-      if(work_matrix.coeff(q,q)!=Scalar(0))
+      // make sure first column is zero
+      work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0);
+
+      if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
+      {
+        // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n
+        z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+        work_matrix.row(p) *= z;
+        if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
+      }
+      if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
+      {
         z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
-      else
-        z = Scalar(0);
-      work_matrix.row(q) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+        work_matrix.row(q) *= z;
+        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+      }
+      // otherwise the second row is already zero, so we have nothing to do.
     }
     else
     {
@@ -385,59 +401,35 @@
       rot.s() = work_matrix.coeff(q,p) / n;
       work_matrix.applyOnTheLeft(p,q,rot);
       if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
-      if(work_matrix.coeff(p,q) != Scalar(0))
+      if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
       {
-        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+        z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
         work_matrix.col(q) *= z;
         if(svd.computeV()) svd.m_matrixV.col(q) *= z;
       }
-      if(work_matrix.coeff(q,q) != Scalar(0))
+      if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
       {
         z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
         work_matrix.row(q) *= z;
         if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
       }
     }
+
+    // update largest diagonal entry
+    maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi<RealScalar>(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q))));
+    // and check whether the 2x2 block is already diagonal
+    RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+    return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold;
   }
 };
 
-template<typename _MatrixType, int QRPreconditioner>
+template<typename _MatrixType, int QRPreconditioner> 
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
+        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };
 
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                         JacobiRotation<RealScalar> *j_left,
-                         JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  using std::abs;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(t == RealScalar(0))
-  {
-    rot1.c() = RealScalar(0);
-    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
-  }
-  else
-  {
-    RealScalar t2d2 = numext::hypot(t,d);
-    rot1.c() = abs(t)/t2d2;
-    rot1.s() = d/t2d2;
-    if(t<RealScalar(0))
-      rot1.s() = -rot1.s();
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
-}
-
 } // end namespace internal
 
 /** \ingroup SVD_Module
@@ -515,7 +507,7 @@
     typedef typename Base::MatrixUType MatrixUType;
     typedef typename Base::MatrixVType MatrixVType;
     typedef typename Base::SingularValuesType SingularValuesType;
-
+    
     typedef typename internal::plain_row_type<MatrixType>::type RowType;
     typedef typename internal::plain_col_type<MatrixType>::type ColType;
     typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
@@ -586,25 +578,6 @@
     using Base::cols;
     using Base::rank;
 
-   /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-      *
-      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<JacobiSVD, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(this->m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(Base::computeU() && Base::computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-      return internal::solve_retval<JacobiSVD<_MatrixType,QRPreconditioner>, Rhs>(*this, b.derived());
-    }
-
-
   private:
     void allocate(Index rows, Index cols, unsigned int computationOptions);
 
@@ -612,6 +585,7 @@
     using Base::m_matrixU;
     using Base::m_matrixV;
     using Base::m_singularValues;
+    using Base::m_info;
     using Base::m_isInitialized;
     using Base::m_isAllocated;
     using Base::m_usePrescribedThreshold;
@@ -634,10 +608,11 @@
 
     internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
     internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
+    MatrixType m_scaledMatrix;
 };
 
 template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
+void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
 {
   eigen_assert(rows >= 0 && cols >= 0);
 
@@ -651,6 +626,7 @@
 
   m_rows = rows;
   m_cols = cols;
+  m_info = Success;
   m_isInitialized = false;
   m_isAllocated = true;
   m_computationOptions = computationOptions;
@@ -679,9 +655,10 @@
                             : m_computeThinV ? m_diagSize
                             : 0);
   m_workMatrix.resize(m_diagSize, m_diagSize);
-
-  if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this);
-  if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this);
+  
+  if(m_cols>m_rows)   m_qr_precond_morecols.allocate(*this);
+  if(m_rows>m_cols)   m_qr_precond_morerows.allocate(*this);
+  if(m_rows!=m_cols)  m_scaledMatrix.resize(rows,cols);
 }
 
 template<typename MatrixType, int QRPreconditioner>
@@ -695,26 +672,37 @@
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
+  // Scaling factor to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
+  }
+  if(scale==RealScalar(0)) scale = RealScalar(1);
+  
   /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
 
-  if(!m_qr_precond_morecols.run(*this, matrix) && !m_qr_precond_morerows.run(*this, matrix))
+  if(m_rows!=m_cols)
   {
-    m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize);
+    m_scaledMatrix = matrix / scale;
+    m_qr_precond_morecols.run(*this, m_scaledMatrix);
+    m_qr_precond_morerows.run(*this, m_scaledMatrix);
+  }
+  else
+  {
+    m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale;
     if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows);
     if(m_computeThinU) m_matrixU.setIdentity(m_rows,m_diagSize);
     if(m_computeFullV) m_matrixV.setIdentity(m_cols,m_cols);
     if(m_computeThinV) m_matrixV.setIdentity(m_cols, m_diagSize);
   }
 
-  // Scaling factor to reducover/under-flows
-  RealScalar scale = m_workMatrix.cwiseAbs().maxCoeff();
-  if(scale==RealScalar(0)) scale = RealScalar(1);
-  m_workMatrix /= scale;
-
   /*** step 2. The main Jacobi SVD iteration. ***/
+  RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff();
 
   bool finished = false;
   while(!finished)
@@ -730,23 +718,27 @@
         // if this 2x2 sub-matrix is not diagonal already...
         // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
         // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        RealScalar threshold = numext::maxi(considerAsZero, precision * numext::maxi(abs(m_workMatrix.coeff(p,p)),
-                                                                                     abs(m_workMatrix.coeff(q,q))));
-        if(numext::maxi(abs(m_workMatrix.coeff(p,q)),abs(m_workMatrix.coeff(q,p))) > threshold)
+        RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
         {
           finished = false;
-
           // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
-          JacobiRotation<RealScalar> j_left, j_right;
-          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
+          if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry))
+          {
+            JacobiRotation<RealScalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
 
-          // accumulate resulting Jacobi rotations
-          m_workMatrix.applyOnTheLeft(p,q,j_left);
-          if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
+            // accumulate resulting Jacobi rotations
+            m_workMatrix.applyOnTheLeft(p,q,j_left);
+            if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
 
-          m_workMatrix.applyOnTheRight(p,q,j_right);
-          if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+            m_workMatrix.applyOnTheRight(p,q,j_right);
+            if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+
+            // keep track of the largest diagonal coefficient
+            maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
+          }
         }
       }
     }
@@ -756,11 +748,24 @@
 
   for(Index i = 0; i < m_diagSize; ++i)
   {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    m_singularValues.coeffRef(i) = a;
-    if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    // For a complex matrix, some diagonal coefficients might note have been
+    // treated by svd_precondition_2x2_block_to_be_real, and the imaginary part
+    // of some diagonal entry might not be null.
+    if(NumTraits<Scalar>::IsComplex && abs(numext::imag(m_workMatrix.coeff(i,i)))>considerAsZero)
+    {
+      RealScalar a = abs(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU()) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    }
+    else
+    {
+      // m_workMatrix.coeff(i,i) is already real, no difficulty:
+      RealScalar a = numext::real(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU() && (a<RealScalar(0))) m_matrixU.col(i) = -m_matrixU.col(i);
+    }
   }
-
+  
   m_singularValues *= scale;
 
   /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
@@ -788,23 +793,6 @@
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int QRPreconditioner, typename Rhs>
-struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-{
-  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve_impl(rhs(), dst);
-  }
-};
-} // end namespace internal
-
-
-#ifndef __CUDACC__
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by two-sided
@@ -818,7 +806,6 @@
 {
   return JacobiSVD<PlainObject>(*this, computationOptions);
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
index 5027215..ff0516f 100644
--- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h

@@ -61,9 +61,10 @@
     u    = (LAPACKE_TYPE*)m_matrixU.data(); \
   } else { ldu=1; u=&dummy; }\
   MatrixType localV; \
-  ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
+  lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
   if (computeV()) { \
-    localV.resize(ldvt, m_cols); \
+    localV.resize(vt_rows, m_cols); \
+    ldvt  = internal::convert_index<lapack_int>(localV.outerStride()); \
     vt   = (LAPACKE_TYPE*)localV.data(); \
   } else { ldvt=1; vt=&dummy; }\
   Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \

diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index f25e301..bc7ab88 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h

@@ -17,6 +17,18 @@
 #define EIGEN_SVDBASE_H
 
 namespace Eigen {
+
+namespace internal {
+template<typename Derived> struct traits<SVDBase<Derived> >
+ : traits<Derived>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+}
+
 /** \ingroup SVD_Module
  *
  *
@@ -34,25 +46,31 @@
  *
  * Singular values are always sorted in decreasing order.
  *
- *
+ * 
  * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
- *
- * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
+ * 
+ * The status of the computation can be retrived using the \a info() method. Unless \a info() returns \a Success, the results should be not
+ * considered well defined.
+ *  
+ * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to
  * terminate in finite (and reasonable) time.
  * \sa class BDCSVD, class JacobiSVD
  */
-template<typename Derived>
-class SVDBase
+template<typename Derived> class SVDBase
+ : public SolverBase<SVDBase<Derived> >
 {
+public: 
+   
+  template<typename Derived_>
+  friend struct internal::solve_assertion;
 
-public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index StorageIndex;
+  typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex;
   typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -67,7 +85,7 @@
   typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixUType;
   typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime, MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime> MatrixVType;
   typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-
+  
   Derived& derived() { return *static_cast<Derived*>(this); }
   const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
@@ -82,7 +100,7 @@
    */
   const MatrixUType& matrixU() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
     return m_matrixU;
   }
@@ -98,7 +116,7 @@
    */
   const MatrixVType& matrixV() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
     return m_matrixV;
   }
@@ -110,17 +128,17 @@
    */
   const SingularValuesType& singularValues() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     return m_singularValues;
   }
 
   /** \returns the number of singular values that are not exactly 0 */
   Index nonzeroSingularValues() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     return m_nonzeroSingularValues;
   }
-
+  
   /** \returns the rank of the matrix of which \c *this is the SVD.
     *
     * \note This method has to determine which singular values should be considered nonzero.
@@ -130,14 +148,14 @@
   inline Index rank() const
   {
     using std::abs;
-    eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+    _check_compute_assertions();
     if(m_singularValues.size()==0) return 0;
     RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
     Index i = m_nonzeroSingularValues-1;
     while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
     return i+1;
   }
-
+  
   /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
     * which need to determine when singular values are to be considered nonzero.
     * This is not used for the SVD decomposition itself.
@@ -180,10 +198,10 @@
   RealScalar threshold() const
   {
     eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-    return m_usePrescribedThreshold
-               ? m_prescribedThreshold
-               : static_cast<RealScalar>(numext::maxi<Index>(1, m_diagSize)) *
-                     NumTraits<Scalar>::epsilon();
+    // this temporary is needed to workaround a MSVC issue
+    Index diagSize = (std::max<Index>)(1,m_diagSize);
+    return m_usePrescribedThreshold ? m_prescribedThreshold
+                                    : RealScalar(diagSize)*NumTraits<Scalar>::epsilon();
   }
 
   /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
@@ -193,7 +211,8 @@
 
   inline Index rows() const { return m_rows; }
   inline Index cols() const { return m_cols; }
-
+  
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
     *
     * \param b the right-hand-side of the equation to solve.
@@ -204,18 +223,28 @@
     * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
     */
   template<typename Rhs>
-  inline const internal::solve_retval<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
+  inline const Solve<Derived, Rhs>
+  solve(const MatrixBase<Rhs>& b) const;
+  #endif
+
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful.
+   */
+  EIGEN_DEVICE_FUNC
+  ComputationInfo info() const
   {
     eigen_assert(m_isInitialized && "SVD is not initialized.");
-    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-    return internal::solve_retval<Derived, Rhs>(derived(), b.derived());
+    return m_info;
   }
 
   #ifndef EIGEN_PARSED_BY_DOXYGEN
   template<typename RhsType, typename DstType>
-  EIGEN_DEVICE_FUNC
   void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+  template<bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
   #endif
 
 protected:
@@ -225,12 +254,25 @@
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
 
+  void _check_compute_assertions() const {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+  }
+
+  template<bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+      EIGEN_ONLY_USED_FOR_DEBUG(b);
+      _check_compute_assertions();
+      eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
+      eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
 
   MatrixUType m_matrixU;
   MatrixVType m_matrixV;
   SingularValuesType m_singularValues;
+  ComputationInfo m_info;
   bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
   bool m_computeFullU, m_computeThinU;
   bool m_computeFullV, m_computeThinV;
@@ -243,9 +285,14 @@
    * Default constructor of SVDBase
    */
   SVDBase()
-    : m_isInitialized(false),
+    : m_info(Success),
+      m_isInitialized(false),
       m_isAllocated(false),
       m_usePrescribedThreshold(false),
+      m_computeFullU(false),
+      m_computeThinU(false),
+      m_computeFullV(false),
+      m_computeThinV(false),
       m_computationOptions(0),
       m_rows(-1), m_cols(-1), m_diagSize(0)
   {
@@ -258,20 +305,32 @@
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename Derived>
 template<typename RhsType, typename DstType>
-EIGEN_DEVICE_FUNC
 void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
-
   // A = U S V^*
   // So A^{-1} = V S^{-1} U^*
 
-  Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
   Index l_rank = rank();
   tmp.noalias() =  m_matrixU.leftCols(l_rank).adjoint() * rhs;
   tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
   dst = m_matrixV.leftCols(l_rank) * tmp;
 }
+
+template<typename Derived>
+template<bool Conjugate, typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  // A = U S V^*
+  // So  A^{-*} = U S^{-1} V^*
+  // And A^{-T} = U_conj S^{-1} V^T
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Index l_rank = rank();
+
+  tmp.noalias() =  m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp;
+}
 #endif
 
 template<typename MatrixType>
@@ -289,6 +348,7 @@
 
   m_rows = rows;
   m_cols = cols;
+  m_info = Success;
   m_isInitialized = false;
   m_isAllocated = true;
   m_computationOptions = computationOptions;

diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 0b14608..997defc 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h

@@ -127,7 +127,7 @@
        .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
     // apply householder transform to remaining part of mat on the left
     mat.bottomRightCorner(remainingRows-1, remainingCols)
-       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
+       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData);
   }
 }
 
@@ -159,6 +159,8 @@
                                                       traits<MatrixType>::Flags & RowMajorBit> > Y)
 {
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename NumTraits<RealScalar>::Literal Literal;
   enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
   typedef InnerStride<int(StorageOrder) == int(ColMajor) ? 1 : Dynamic> ColInnerStride;
   typedef InnerStride<int(StorageOrder) == int(ColMajor) ? Dynamic : 1> RowInnerStride;
@@ -200,7 +202,7 @@
       {
         SubColumnType y_k( Y.col(k).tail(remainingCols) );
         
-        // let's use the begining of column k of Y as a temporary vector
+        // let's use the beginning of column k of Y as a temporary vector
         SubColumnType tmp( Y.col(k).head(k) );
         y_k.noalias()  = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
         tmp.noalias()  = V_k1.adjoint()  * v_k;
@@ -229,7 +231,7 @@
       {
         SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
         
-        // let's use the begining of column k of X as a temporary vectors
+        // let's use the beginning of column k of X as a temporary vectors
         // note that tmp0 and tmp1 overlaps
         SubColumnType tmp0 ( X.col(k).head(k) ),
                       tmp1 ( X.col(k).head(k+1) );
@@ -263,7 +265,7 @@
     SubMatType A10( A.block(bs,0, brows-bs,bs) );
     SubMatType A01( A.block(0,bs, bs,bcols-bs) );
     Scalar tmp = A01(bs-1,0);
-    A01(bs-1,0) = 1;
+    A01(bs-1,0) = Literal(1);
     A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint();
     A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01;
     A01(bs-1,0) = tmp;

diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
new file mode 100644
index 0000000..9f93e32
--- /dev/null
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h

@@ -0,0 +1,697 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SIMPLICIAL_CHOLESKY_H
+#define EIGEN_SIMPLICIAL_CHOLESKY_H
+
+namespace Eigen { 
+
+enum SimplicialCholeskyMode {
+  SimplicialCholeskyLLT,
+  SimplicialCholeskyLDLT
+};
+
+namespace internal {
+  template<typename CholMatrixType, typename InputMatrixType>
+  struct simplicial_cholesky_grab_input {
+    typedef CholMatrixType const * ConstCholMatrixPtr;
+    static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp)
+    {
+      tmp = input;
+      pmat = &tmp;
+    }
+  };
+  
+  template<typename MatrixType>
+  struct simplicial_cholesky_grab_input<MatrixType,MatrixType> {
+    typedef MatrixType const * ConstMatrixPtr;
+    static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/)
+    {
+      pmat = &input;
+    }
+  };
+} // end namespace internal
+
+/** \ingroup SparseCholesky_Module
+  * \brief A base class for direct sparse Cholesky factorizations
+  *
+  * This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are
+  * selfadjoint and positive definite. These factorizations allow for solving A.X = B where
+  * X and B can be either dense or sparse.
+  * 
+  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+  * such that the factorized matrix is P A P^-1.
+  *
+  * \tparam Derived the type of the derived class, that is the actual factorization type.
+  *
+  */
+template<typename Derived>
+class SimplicialCholeskyBase : public SparseSolverBase<Derived>
+{
+    typedef SparseSolverBase<Derived> Base;
+    using Base::m_isInitialized;
+    
+  public:
+    typedef typename internal::traits<Derived>::MatrixType MatrixType;
+    typedef typename internal::traits<Derived>::OrderingType OrderingType;
+    enum { UpLo = internal::traits<Derived>::UpLo };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
+    typedef CholMatrixType const * ConstCholMatrixPtr;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+  public:
+    
+    using Base::derived;
+
+    /** Default constructor */
+    SimplicialCholeskyBase()
+      : m_info(Success),
+        m_factorizationIsOk(false),
+        m_analysisIsOk(false),
+        m_shiftOffset(0),
+        m_shiftScale(1)
+    {}
+
+    explicit SimplicialCholeskyBase(const MatrixType& matrix)
+      : m_info(Success),
+        m_factorizationIsOk(false),
+        m_analysisIsOk(false),
+        m_shiftOffset(0),
+        m_shiftScale(1)
+    {
+      derived().compute(matrix);
+    }
+
+    ~SimplicialCholeskyBase()
+    {
+    }
+
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    inline Index cols() const { return m_matrix.cols(); }
+    inline Index rows() const { return m_matrix.rows(); }
+    
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+    
+    /** \returns the permutation P
+      * \sa permutationPinv() */
+    const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& permutationP() const
+    { return m_P; }
+    
+    /** \returns the inverse P^-1 of the permutation P
+      * \sa permutationP() */
+    const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& permutationPinv() const
+    { return m_Pinv; }
+
+    /** Sets the shift parameters that will be used to adjust the diagonal coefficients during the numerical factorization.
+      *
+      * During the numerical factorization, the diagonal coefficients are transformed by the following linear model:\n
+      * \c d_ii = \a offset + \a scale * \c d_ii
+      *
+      * The default is the identity transformation with \a offset=0, and \a scale=1.
+      *
+      * \returns a reference to \c *this.
+      */
+    Derived& setShift(const RealScalar& offset, const RealScalar& scale = 1)
+    {
+      m_shiftOffset = offset;
+      m_shiftScale = scale;
+      return derived();
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal */
+    template<typename Stream>
+    void dumpMemory(Stream& s)
+    {
+      int total = 0;
+      s << "  L:        " << ((total+=(m_matrix.cols()+1) * sizeof(int) + m_matrix.nonZeros()*(sizeof(int)+sizeof(Scalar))) >> 20) << "Mb" << "\n";
+      s << "  diag:     " << ((total+=m_diag.size() * sizeof(Scalar)) >> 20) << "Mb" << "\n";
+      s << "  tree:     " << ((total+=m_parent.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  nonzeros: " << ((total+=m_nonZerosPerCol.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  perm:     " << ((total+=m_P.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  perm^-1:  " << ((total+=m_Pinv.size() * sizeof(int)) >> 20) << "Mb" << "\n";
+      s << "  TOTAL:    " << (total>> 20) << "Mb" << "\n";
+    }
+
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+      eigen_assert(m_matrix.rows()==b.rows());
+
+      if(m_info!=Success)
+        return;
+
+      if(m_P.size()>0)
+        dest = m_P * b;
+      else
+        dest = b;
+
+      if(m_matrix.nonZeros()>0) // otherwise L==I
+        derived().matrixL().solveInPlace(dest);
+
+      if(m_diag.size()>0)
+        dest = m_diag.asDiagonal().inverse() * dest;
+
+      if (m_matrix.nonZeros()>0) // otherwise U==I
+        derived().matrixU().solveInPlace(dest);
+
+      if(m_P.size()>0)
+        dest = m_Pinv * dest;
+    }
+    
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(derived(), b, dest);
+    }
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+  protected:
+    
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    template<bool DoLDLT>
+    void compute(const MatrixType& matrix)
+    {
+      eigen_assert(matrix.rows()==matrix.cols());
+      Index size = matrix.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      ordering(matrix, pmat, tmp);
+      analyzePattern_preordered(*pmat, DoLDLT);
+      factorize_preordered<DoLDLT>(*pmat);
+    }
+    
+    template<bool DoLDLT>
+    void factorize(const MatrixType& a)
+    {
+      eigen_assert(a.rows()==a.cols());
+      Index size = a.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      
+      if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper)
+      {
+        // If there is no ordering, try to directly use the input matrix without any copy
+        internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp);
+      }
+      else
+      {
+        tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+        pmat = &tmp;
+      }
+      
+      factorize_preordered<DoLDLT>(*pmat);
+    }
+
+    template<bool DoLDLT>
+    void factorize_preordered(const CholMatrixType& a);
+
+    void analyzePattern(const MatrixType& a, bool doLDLT)
+    {
+      eigen_assert(a.rows()==a.cols());
+      Index size = a.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      ordering(a, pmat, tmp);
+      analyzePattern_preordered(*pmat,doLDLT);
+    }
+    void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
+    
+    void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap);
+
+    /** keeps off-diagonal entries; drops diagonal entries */
+    struct keep_diag {
+      inline bool operator() (const Index& row, const Index& col, const Scalar&) const
+      {
+        return row!=col;
+      }
+    };
+
+    mutable ComputationInfo m_info;
+    bool m_factorizationIsOk;
+    bool m_analysisIsOk;
+    
+    CholMatrixType m_matrix;
+    VectorType m_diag;                                // the diagonal coefficients (LDLT mode)
+    VectorI m_parent;                                 // elimination tree
+    VectorI m_nonZerosPerCol;
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // the permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // the inverse permutation
+
+    RealScalar m_shiftOffset;
+    RealScalar m_shiftScale;
+};
+
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialLLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialLDLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialCholesky;
+
+namespace internal {
+
+template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
+  enum { UpLo = _UpLo };
+  typedef typename MatrixType::Scalar                         Scalar;
+  typedef typename MatrixType::StorageIndex                   StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex>        CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::Lower>  MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); }
+};
+
+template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
+  enum { UpLo = _UpLo };
+  typedef typename MatrixType::Scalar                             Scalar;
+  typedef typename MatrixType::StorageIndex                       StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex>            CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::UnitLower>  MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); }
+};
+
+template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
+  enum { UpLo = _UpLo };
+};
+
+}
+
+/** \ingroup SparseCholesky_Module
+  * \class SimplicialLLT
+  * \brief A direct sparse LLT Cholesky factorizations
+  *
+  * This class provides a LL^T Cholesky factorizations of sparse matrices that are
+  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+  * X and B can be either dense or sparse.
+  * 
+  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+  * such that the factorized matrix is P A P^-1.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering
+  */
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
+{
+public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef SimplicialCholeskyBase<SimplicialLLT> Base;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef internal::traits<SimplicialLLT> Traits;
+    typedef typename Traits::MatrixL  MatrixL;
+    typedef typename Traits::MatrixU  MatrixU;
+public:
+    /** Default constructor */
+    SimplicialLLT() : Base() {}
+    /** Constructs and performs the LLT factorization of \a matrix */
+    explicit SimplicialLLT(const MatrixType& matrix)
+        : Base(matrix) {}
+
+    /** \returns an expression of the factor L */
+    inline const MatrixL matrixL() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+        return Traits::getL(Base::m_matrix);
+    }
+
+    /** \returns an expression of the factor U (= L^*) */
+    inline const MatrixU matrixU() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+        return Traits::getU(Base::m_matrix);
+    }
+    
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    SimplicialLLT& compute(const MatrixType& matrix)
+    {
+      Base::template compute<false>(matrix);
+      return *this;
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& a)
+    {
+      Base::analyzePattern(a, false);
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& a)
+    {
+      Base::template factorize<false>(a);
+    }
+
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      Scalar detL = Base::m_matrix.diagonal().prod();
+      return numext::abs2(detL);
+    }
+};
+
+/** \ingroup SparseCholesky_Module
+  * \class SimplicialLDLT
+  * \brief A direct sparse LDLT Cholesky factorizations without square root.
+  *
+  * This class provides a LDL^T Cholesky factorizations without square root of sparse matrices that are
+  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+  * X and B can be either dense or sparse.
+  * 
+  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+  * such that the factorized matrix is P A P^-1.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
+  *               or Upper. Default is Lower.
+  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering
+  */
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
+{
+public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef SimplicialCholeskyBase<SimplicialLDLT> Base;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef internal::traits<SimplicialLDLT> Traits;
+    typedef typename Traits::MatrixL  MatrixL;
+    typedef typename Traits::MatrixU  MatrixU;
+public:
+    /** Default constructor */
+    SimplicialLDLT() : Base() {}
+
+    /** Constructs and performs the LLT factorization of \a matrix */
+    explicit SimplicialLDLT(const MatrixType& matrix)
+        : Base(matrix) {}
+
+    /** \returns a vector expression of the diagonal D */
+    inline const VectorType vectorD() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+        return Base::m_diag;
+    }
+    /** \returns an expression of the factor L */
+    inline const MatrixL matrixL() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+        return Traits::getL(Base::m_matrix);
+    }
+
+    /** \returns an expression of the factor U (= L^*) */
+    inline const MatrixU matrixU() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+        return Traits::getU(Base::m_matrix);
+    }
+
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    SimplicialLDLT& compute(const MatrixType& matrix)
+    {
+      Base::template compute<true>(matrix);
+      return *this;
+    }
+    
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& a)
+    {
+      Base::analyzePattern(a, true);
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& a)
+    {
+      Base::template factorize<true>(a);
+    }
+
+    /** \returns the determinant of the underlying matrix from the current factorization */
+    Scalar determinant() const
+    {
+      return Base::m_diag.prod();
+    }
+};
+
+/** \deprecated use SimplicialLDLT or class SimplicialLLT
+  * \ingroup SparseCholesky_Module
+  * \class SimplicialCholesky
+  *
+  * \sa class SimplicialLDLT, class SimplicialLLT
+  */
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
+{
+public:
+    typedef _MatrixType MatrixType;
+    enum { UpLo = _UpLo };
+    typedef SimplicialCholeskyBase<SimplicialCholesky> Base;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
+    typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef internal::traits<SimplicialCholesky> Traits;
+    typedef internal::traits<SimplicialLDLT<MatrixType,UpLo> > LDLTTraits;
+    typedef internal::traits<SimplicialLLT<MatrixType,UpLo>  > LLTTraits;
+  public:
+    SimplicialCholesky() : Base(), m_LDLT(true) {}
+
+    explicit SimplicialCholesky(const MatrixType& matrix)
+      : Base(), m_LDLT(true)
+    {
+      compute(matrix);
+    }
+
+    SimplicialCholesky& setMode(SimplicialCholeskyMode mode)
+    {
+      switch(mode)
+      {
+      case SimplicialCholeskyLLT:
+        m_LDLT = false;
+        break;
+      case SimplicialCholeskyLDLT:
+        m_LDLT = true;
+        break;
+      default:
+        break;
+      }
+
+      return *this;
+    }
+
+    inline const VectorType vectorD() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
+        return Base::m_diag;
+    }
+    inline const CholMatrixType rawMatrix() const {
+        eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
+        return Base::m_matrix;
+    }
+    
+    /** Computes the sparse Cholesky decomposition of \a matrix */
+    SimplicialCholesky& compute(const MatrixType& matrix)
+    {
+      if(m_LDLT)
+        Base::template compute<true>(matrix);
+      else
+        Base::template compute<false>(matrix);
+      return *this;
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize()
+      */
+    void analyzePattern(const MatrixType& a)
+    {
+      Base::analyzePattern(a, m_LDLT);
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      *
+      * \sa analyzePattern()
+      */
+    void factorize(const MatrixType& a)
+    {
+      if(m_LDLT)
+        Base::template factorize<true>(a);
+      else
+        Base::template factorize<false>(a);
+    }
+
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    {
+      eigen_assert(Base::m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
+      eigen_assert(Base::m_matrix.rows()==b.rows());
+
+      if(Base::m_info!=Success)
+        return;
+
+      if(Base::m_P.size()>0)
+        dest = Base::m_P * b;
+      else
+        dest = b;
+
+      if(Base::m_matrix.nonZeros()>0) // otherwise L==I
+      {
+        if(m_LDLT)
+          LDLTTraits::getL(Base::m_matrix).solveInPlace(dest);
+        else
+          LLTTraits::getL(Base::m_matrix).solveInPlace(dest);
+      }
+
+      if(Base::m_diag.size()>0)
+        dest = Base::m_diag.real().asDiagonal().inverse() * dest;
+
+      if (Base::m_matrix.nonZeros()>0) // otherwise I==I
+      {
+        if(m_LDLT)
+          LDLTTraits::getU(Base::m_matrix).solveInPlace(dest);
+        else
+          LLTTraits::getU(Base::m_matrix).solveInPlace(dest);
+      }
+
+      if(Base::m_P.size()>0)
+        dest = Base::m_Pinv * dest;
+    }
+    
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(*this, b, dest);
+    }
+    
+    Scalar determinant() const
+    {
+      if(m_LDLT)
+      {
+        return Base::m_diag.prod();
+      }
+      else
+      {
+        Scalar detL = Diagonal<const CholMatrixType>(Base::m_matrix).prod();
+        return numext::abs2(detL);
+      }
+    }
+    
+  protected:
+    bool m_LDLT;
+};
+
+template<typename Derived>
+void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap)
+{
+  eigen_assert(a.rows()==a.cols());
+  const Index size = a.rows();
+  pmat = &ap;
+  // Note that ordering methods compute the inverse permutation
+  if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value)
+  {
+    {
+      CholMatrixType C;
+      C = a.template selfadjointView<UpLo>();
+      
+      OrderingType ordering;
+      ordering(C,m_Pinv);
+    }
+
+    if(m_Pinv.size()>0) m_P = m_Pinv.inverse();
+    else                m_P.resize(0);
+    
+    ap.resize(size,size);
+    ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+  }
+  else
+  {
+    m_Pinv.resize(0);
+    m_P.resize(0);
+    if(int(UpLo)==int(Lower) || MatrixType::IsRowMajor)
+    {
+      // we have to transpose the lower part to to the upper one
+      ap.resize(size,size);
+      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>();
+    }
+    else
+      internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap);
+  }  
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SIMPLICIAL_CHOLESKY_H

diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
new file mode 100644
index 0000000..72e1740
--- /dev/null
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h

@@ -0,0 +1,174 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*
+NOTE: these functions have been adapted from the LDL library:
+
+LDL Copyright (c) 2005 by Timothy A. Davis.  All Rights Reserved.
+
+The author of LDL, Timothy A. Davis., has executed a license with Google LLC
+to permit distribution of this code and derivative works as part of Eigen under
+the Mozilla Public License v. 2.0, as stated at the top of this file.
+ */
+
+#ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
+#define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
+
+namespace Eigen {
+
+template<typename Derived>
+void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT)
+{
+  const StorageIndex size = StorageIndex(ap.rows());
+  m_matrix.resize(size, size);
+  m_parent.resize(size);
+  m_nonZerosPerCol.resize(size);
+
+  ei_declare_aligned_stack_constructed_variable(StorageIndex, tags, size, 0);
+
+  for(StorageIndex k = 0; k < size; ++k)
+  {
+    /* L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k) */
+    m_parent[k] = -1;             /* parent of k is not yet known */
+    tags[k] = k;                  /* mark node k as visited */
+    m_nonZerosPerCol[k] = 0;      /* count of nonzeros in column k of L */
+    for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
+    {
+      StorageIndex i = it.index();
+      if(i < k)
+      {
+        /* follow path from i to root of etree, stop at flagged node */
+        for(; tags[i] != k; i = m_parent[i])
+        {
+          /* find parent of i if not yet determined */
+          if (m_parent[i] == -1)
+            m_parent[i] = k;
+          m_nonZerosPerCol[i]++;        /* L (k,i) is nonzero */
+          tags[i] = k;                  /* mark i as visited */
+        }
+      }
+    }
+  }
+
+  /* construct Lp index array from m_nonZerosPerCol column counts */
+  StorageIndex* Lp = m_matrix.outerIndexPtr();
+  Lp[0] = 0;
+  for(StorageIndex k = 0; k < size; ++k)
+    Lp[k+1] = Lp[k] + m_nonZerosPerCol[k] + (doLDLT ? 0 : 1);
+
+  m_matrix.resizeNonZeros(Lp[size]);
+
+  m_isInitialized     = true;
+  m_info              = Success;
+  m_analysisIsOk      = true;
+  m_factorizationIsOk = false;
+}
+
+
+template<typename Derived>
+template<bool DoLDLT>
+void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& ap)
+{
+  using std::sqrt;
+
+  eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  eigen_assert(ap.rows()==ap.cols());
+  eigen_assert(m_parent.size()==ap.rows());
+  eigen_assert(m_nonZerosPerCol.size()==ap.rows());
+
+  const StorageIndex size = StorageIndex(ap.rows());
+  const StorageIndex* Lp = m_matrix.outerIndexPtr();
+  StorageIndex* Li = m_matrix.innerIndexPtr();
+  Scalar* Lx = m_matrix.valuePtr();
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, y, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,  pattern, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,  tags, size, 0);
+
+  bool ok = true;
+  m_diag.resize(DoLDLT ? size : 0);
+
+  for(StorageIndex k = 0; k < size; ++k)
+  {
+    // compute nonzero pattern of kth row of L, in topological order
+    y[k] = Scalar(0);                     // Y(0:k) is now all zero
+    StorageIndex top = size;               // stack for pattern is empty
+    tags[k] = k;                    // mark node k as visited
+    m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
+    for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
+    {
+      StorageIndex i = it.index();
+      if(i <= k)
+      {
+        y[i] += numext::conj(it.value());            /* scatter A(i,k) into Y (sum duplicates) */
+        Index len;
+        for(len = 0; tags[i] != k; i = m_parent[i])
+        {
+          pattern[len++] = i;     /* L(k,i) is nonzero */
+          tags[i] = k;            /* mark i as visited */
+        }
+        while(len > 0)
+          pattern[--top] = pattern[--len];
+      }
+    }
+
+    /* compute numerical values kth row of L (a sparse triangular solve) */
+
+    RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset;    // get D(k,k), apply the shift function, and clear Y(k)
+    y[k] = Scalar(0);
+    for(; top < size; ++top)
+    {
+      Index i = pattern[top];       /* pattern[top:n-1] is pattern of L(:,k) */
+      Scalar yi = y[i];             /* get and clear Y(i) */
+      y[i] = Scalar(0);
+
+      /* the nonzero entry L(k,i) */
+      Scalar l_ki;
+      if(DoLDLT)
+        l_ki = yi / numext::real(m_diag[i]);
+      else
+        yi = l_ki = yi / Lx[Lp[i]];
+
+      Index p2 = Lp[i] + m_nonZerosPerCol[i];
+      Index p;
+      for(p = Lp[i] + (DoLDLT ? 0 : 1); p < p2; ++p)
+        y[Li[p]] -= numext::conj(Lx[p]) * yi;
+      d -= numext::real(l_ki * numext::conj(yi));
+      Li[p] = k;                          /* store L(k,i) in column form of L */
+      Lx[p] = l_ki;
+      ++m_nonZerosPerCol[i];              /* increment count of nonzeros in col i */
+    }
+    if(DoLDLT)
+    {
+      m_diag[k] = d;
+      if(d == RealScalar(0))
+      {
+        ok = false;                         /* failure, D(k,k) is zero */
+        break;
+      }
+    }
+    else
+    {
+      Index p = Lp[k] + m_nonZerosPerCol[k]++;
+      Li[p] = k ;                /* store L(k,k) = sqrt (d) in column k */
+      if(d <= RealScalar(0)) {
+        ok = false;              /* failure, matrix is not positive definite */
+        break;
+      }
+      Lx[p] = sqrt(d) ;
+    }
+  }
+
+  m_info = ok ? Success : NumericalIssue;
+  m_factorizationIsOk = true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H

diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h
index 17fff96..2cb7747 100644
--- a/Eigen/src/SparseCore/AmbiVector.h
+++ b/Eigen/src/SparseCore/AmbiVector.h

@@ -19,16 +19,16 @@
   *
   * See BasicSparseLLT and SparseProduct for usage examples.
   */
-template<typename _Scalar, typename _Index>
+template<typename _Scalar, typename _StorageIndex>
 class AmbiVector
 {
   public:
     typedef _Scalar Scalar;
-    typedef _Index Index;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    AmbiVector(Index size)
-      : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
+    explicit AmbiVector(Index size)
+      : m_buffer(0), m_zero(0), m_size(0), m_end(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
     {
       resize(size);
     }
@@ -39,7 +39,7 @@
     Index nonZeros() const;
 
     /** Specifies a sub-vector to work on */
-    void setBounds(Index start, Index end) { m_start = start; m_end = end; }
+    void setBounds(Index start, Index end) { m_start = convert_index(start); m_end = convert_index(end); }
 
     void setZero();
 
@@ -55,12 +55,16 @@
     {
       if (m_allocatedSize < size)
         reallocate(size);
-      m_size = size;
+      m_size = convert_index(size);
     }
 
-    Index size() const { return m_size; }
+    StorageIndex size() const { return m_size; }
 
   protected:
+    StorageIndex convert_index(Index idx)
+    {
+      return internal::convert_index<StorageIndex>(idx);
+    }
 
     void reallocate(Index size)
     {
@@ -69,16 +73,16 @@
       delete[] m_buffer;
       if (size<1000)
       {
-        Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar);
-        m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl);
+        Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
+        m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl));
         m_buffer = new Scalar[allocSize];
       }
       else
       {
-        m_allocatedElements = (size*sizeof(Scalar))/sizeof(ListEl);
+        m_allocatedElements = convert_index((size*sizeof(Scalar))/sizeof(ListEl));
         m_buffer = new Scalar[size];
       }
-      m_size = size;
+      m_size = convert_index(size);
       m_start = 0;
       m_end = m_size;
     }
@@ -86,11 +90,11 @@
     void reallocateSparse()
     {
       Index copyElements = m_allocatedElements;
-      m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size);
+      m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size);
       Index allocSize = m_allocatedElements * sizeof(ListEl);
-      allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0);
+      allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
       Scalar* newBuffer = new Scalar[allocSize];
-      memcpy(newBuffer,  m_buffer,  copyElements * sizeof(ListEl));
+      std::memcpy(newBuffer,  m_buffer,  copyElements * sizeof(ListEl));
       delete[] m_buffer;
       m_buffer = newBuffer;
     }
@@ -99,30 +103,30 @@
     // element type of the linked list
     struct ListEl
     {
-      Index next;
-      Index index;
+      StorageIndex next;
+      StorageIndex index;
       Scalar value;
     };
 
     // used to store data in both mode
     Scalar* m_buffer;
     Scalar m_zero;
-    Index m_size;
-    Index m_start;
-    Index m_end;
-    Index m_allocatedSize;
-    Index m_allocatedElements;
-    Index m_mode;
+    StorageIndex m_size;
+    StorageIndex m_start;
+    StorageIndex m_end;
+    StorageIndex m_allocatedSize;
+    StorageIndex m_allocatedElements;
+    StorageIndex m_mode;
 
     // linked list mode
-    Index m_llStart;
-    Index m_llCurrent;
-    Index m_llSize;
+    StorageIndex m_llStart;
+    StorageIndex m_llCurrent;
+    StorageIndex m_llSize;
 };
 
 /** \returns the number of non zeros in the current sub vector */
-template<typename _Scalar,typename _Index>
-_Index AmbiVector<_Scalar,_Index>::nonZeros() const
+template<typename _Scalar,typename _StorageIndex>
+Index AmbiVector<_Scalar,_StorageIndex>::nonZeros() const
 {
   if (m_mode==IsSparse)
     return m_llSize;
@@ -130,8 +134,8 @@
     return m_end - m_start;
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::init(double estimatedDensity)
 {
   if (estimatedDensity>0.1)
     init(IsDense);
@@ -139,11 +143,12 @@
     init(IsSparse);
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(int mode)
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::init(int mode)
 {
   m_mode = mode;
-  if (m_mode==IsSparse)
+  // This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings
+  // if (m_mode==IsSparse)
   {
     m_llSize = 0;
     m_llStart = -1;
@@ -155,15 +160,15 @@
   *
   * Don't worry, this function is extremely cheap.
   */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::restart()
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::restart()
 {
   m_llCurrent = m_llStart;
 }
 
 /** Set all coefficients of current subvector to zero */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::setZero()
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::setZero()
 {
   if (m_mode==IsDense)
   {
@@ -178,8 +183,8 @@
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
+template<typename _Scalar,typename _StorageIndex>
+_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeffRef(Index i)
 {
   if (m_mode==IsDense)
     return m_buffer[i];
@@ -195,7 +200,7 @@
       m_llCurrent = 0;
       ++m_llSize;
       llElements[0].value = Scalar(0);
-      llElements[0].index = i;
+      llElements[0].index = convert_index(i);
       llElements[0].next = -1;
       return llElements[0].value;
     }
@@ -204,7 +209,7 @@
       // this is going to be the new first element of the list
       ListEl& el = llElements[m_llSize];
       el.value = Scalar(0);
-      el.index = i;
+      el.index = convert_index(i);
       el.next = m_llStart;
       m_llStart = m_llSize;
       ++m_llSize;
@@ -213,7 +218,7 @@
     }
     else
     {
-      Index nextel = llElements[m_llCurrent].next;
+      StorageIndex nextel = llElements[m_llCurrent].next;
       eigen_assert(i>=llElements[m_llCurrent].index && "you must call restart() before inserting an element with lower or equal index");
       while (nextel >= 0 && llElements[nextel].index<=i)
       {
@@ -237,7 +242,7 @@
         // let's insert a new coefficient
         ListEl& el = llElements[m_llSize];
         el.value = Scalar(0);
-        el.index = i;
+        el.index = convert_index(i);
         el.next = llElements[m_llCurrent].next;
         llElements[m_llCurrent].next = m_llSize;
         ++m_llSize;
@@ -247,8 +252,8 @@
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
+template<typename _Scalar,typename _StorageIndex>
+_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeff(Index i)
 {
   if (m_mode==IsDense)
     return m_buffer[i];
@@ -275,8 +280,8 @@
 }
 
 /** Iterator over the nonzero coefficients */
-template<typename _Scalar,typename _Index>
-class AmbiVector<_Scalar,_Index>::Iterator
+template<typename _Scalar,typename _StorageIndex>
+class AmbiVector<_Scalar,_StorageIndex>::Iterator
 {
   public:
     typedef _Scalar Scalar;
@@ -288,7 +293,7 @@
       * In practice, all coefficients having a magnitude smaller than \a epsilon
       * are skipped.
       */
-    Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
+    explicit Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
       : m_vector(vec)
     {
       using std::abs;
@@ -320,7 +325,7 @@
       }
     }
 
-    Index index() const { return m_cachedIndex; }
+    StorageIndex index() const { return m_cachedIndex; }
     Scalar value() const { return m_cachedValue; }
 
     operator bool() const { return m_cachedIndex>=0; }
@@ -332,7 +337,7 @@
       {
         do {
           ++m_cachedIndex;
-        } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<m_epsilon);
+        } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<=m_epsilon);
         if (m_cachedIndex<m_vector.m_end)
           m_cachedValue = m_vector.m_buffer[m_cachedIndex];
         else
@@ -343,7 +348,7 @@
         ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
         do {
           m_currentEl = llElements[m_currentEl].next;
-        } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<m_epsilon);
+        } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<=m_epsilon);
         if (m_currentEl<0)
         {
           m_cachedIndex = -1;
@@ -359,9 +364,9 @@
 
   protected:
     const AmbiVector& m_vector; // the target vector
-    Index m_currentEl;            // the current element in sparse/linked-list mode
+    StorageIndex m_currentEl;   // the current element in sparse/linked-list mode
     RealScalar m_epsilon;       // epsilon used to prune zero coefficients
-    Index m_cachedIndex;          // current coordinate
+    StorageIndex m_cachedIndex; // current coordinate
     Scalar m_cachedValue;       // current value
     bool m_isDense;             // mode of the vector
 };

diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 1a851d0..acd986f 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,7 +10,7 @@
 #ifndef EIGEN_COMPRESSED_STORAGE_H
 #define EIGEN_COMPRESSED_STORAGE_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
 
@@ -18,13 +18,13 @@
   * Stores a sparse set of values as a list of values and a list of indices.
   *
   */
-template<typename _Scalar,typename _Index>
+template<typename _Scalar,typename _StorageIndex>
 class CompressedStorage
 {
   public:
 
     typedef _Scalar Scalar;
-    typedef _Index Index;
+    typedef _StorageIndex StorageIndex;
 
   protected:
 
@@ -36,7 +36,7 @@
       : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
     {}
 
-    CompressedStorage(size_t size)
+    explicit CompressedStorage(Index size)
       : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
     {
       resize(size);
@@ -51,8 +51,11 @@
     CompressedStorage& operator=(const CompressedStorage& other)
     {
       resize(other.size());
-      internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
-      internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+      if(other.size()>0)
+      {
+        internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
+        internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+      }
       return *this;
     }
 
@@ -70,9 +73,9 @@
       delete[] m_indices;
     }
 
-    void reserve(size_t size)
+    void reserve(Index size)
     {
-      size_t newAllocatedSize = m_size + size;
+      Index newAllocatedSize = m_size + size;
       if (newAllocatedSize > m_allocatedSize)
         reallocate(newAllocatedSize);
     }
@@ -83,40 +86,40 @@
         reallocate(m_size);
     }
 
-    void resize(size_t size, float reserveSizeFactor = 0)
+    void resize(Index size, double reserveSizeFactor = 0)
     {
       if (m_allocatedSize<size)
-        reallocate(size + static_cast<size_t>(reserveSizeFactor *
-                                            static_cast<float>(size)));
+      {
+        Index realloc_size = (std::min<Index>)(NumTraits<StorageIndex>::highest(),  size + Index(reserveSizeFactor*double(size)));
+        if(realloc_size<size)
+          internal::throw_std_bad_alloc();
+        reallocate(realloc_size);
+      }
       m_size = size;
     }
 
     void append(const Scalar& v, Index i)
     {
-      Index id = static_cast<Index>(m_size);
+      Index id = m_size;
       resize(m_size+1, 1);
       m_values[id] = v;
-      m_indices[id] = i;
+      m_indices[id] = internal::convert_index<StorageIndex>(i);
     }
 
-    inline size_t size() const { return m_size; }
-    inline size_t allocatedSize() const { return m_allocatedSize; }
+    inline Index size() const { return m_size; }
+    inline Index allocatedSize() const { return m_allocatedSize; }
     inline void clear() { m_size = 0; }
 
-    inline Scalar& value(size_t i) { return m_values[i]; }
-    inline const Scalar& value(size_t i) const { return m_values[i]; }
+    const Scalar* valuePtr() const { return m_values; }
+    Scalar* valuePtr() { return m_values; }
+    const StorageIndex* indexPtr() const { return m_indices; }
+    StorageIndex* indexPtr() { return m_indices; }
 
-    inline Index& index(size_t i) { return m_indices[i]; }
-    inline const Index& index(size_t i) const { return m_indices[i]; }
+    inline Scalar& value(Index i) { eigen_internal_assert(m_values!=0); return m_values[i]; }
+    inline const Scalar& value(Index i) const { eigen_internal_assert(m_values!=0); return m_values[i]; }
 
-    static CompressedStorage Map(Index* indices, Scalar* values, size_t size)
-    {
-      CompressedStorage res;
-      res.m_indices = indices;
-      res.m_values = values;
-      res.m_allocatedSize = res.m_size = size;
-      return res;
-    }
+    inline StorageIndex& index(Index i) { eigen_internal_assert(m_indices!=0); return m_indices[i]; }
+    inline const StorageIndex& index(Index i) const { eigen_internal_assert(m_indices!=0); return m_indices[i]; }
 
     /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */
     inline Index searchLowerIndex(Index key) const
@@ -125,17 +128,17 @@
     }
 
     /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
-    inline Index searchLowerIndex(size_t start, size_t end, Index key) const
+    inline Index searchLowerIndex(Index start, Index end, Index key) const
     {
       while(end>start)
       {
-        size_t mid = (end+start)>>1;
+        Index mid = (end+start)>>1;
         if (m_indices[mid]<key)
           start = mid+1;
         else
           end = mid;
       }
-      return static_cast<Index>(start);
+      return start;
     }
 
     /** \returns the stored value at index \a key
@@ -148,20 +151,20 @@
         return m_values[m_size-1];
       // ^^  optimization: let's first check if it is the last coefficient
       // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(0,m_size-1,key);
+      const Index id = searchLowerIndex(0,m_size-1,key);
       return ((id<m_size) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
     }
 
     /** Like at(), but the search is performed in the range [start,end) */
-    inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
+    inline Scalar atInRange(Index start, Index end, Index key, const Scalar &defaultValue = Scalar(0)) const
     {
       if (start>=end)
-        return Scalar(0);
+        return defaultValue;
       else if (end>start && key==m_indices[end-1])
         return m_values[end-1];
       // ^^  optimization: let's first check if it is the last coefficient
       // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(start,end-1,key);
+      const Index id = searchLowerIndex(start,end-1,key);
       return ((id<end) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
     }
 
@@ -170,26 +173,61 @@
       * such that the keys are sorted. */
     inline Scalar& atWithInsertion(Index key, const Scalar& defaultValue = Scalar(0))
     {
-      size_t id = searchLowerIndex(0,m_size,key);
+      Index id = searchLowerIndex(0,m_size,key);
       if (id>=m_size || m_indices[id]!=key)
       {
-        resize(m_size+1,1);
-        for (size_t j=m_size-1; j>id; --j)
+        if (m_allocatedSize<m_size+1)
         {
-          m_indices[j] = m_indices[j-1];
-          m_values[j] = m_values[j-1];
+          m_allocatedSize = 2*(m_size+1);
+          internal::scoped_array<Scalar> newValues(m_allocatedSize);
+          internal::scoped_array<StorageIndex> newIndices(m_allocatedSize);
+
+          // copy first chunk
+          internal::smart_copy(m_values,  m_values +id, newValues.ptr());
+          internal::smart_copy(m_indices, m_indices+id, newIndices.ptr());
+
+          // copy the rest
+          if(m_size>id)
+          {
+            internal::smart_copy(m_values +id,  m_values +m_size, newValues.ptr() +id+1);
+            internal::smart_copy(m_indices+id,  m_indices+m_size, newIndices.ptr()+id+1);
+          }
+          std::swap(m_values,newValues.ptr());
+          std::swap(m_indices,newIndices.ptr());
         }
-        m_indices[id] = key;
+        else if(m_size>id)
+        {
+          internal::smart_memmove(m_values +id, m_values +m_size, m_values +id+1);
+          internal::smart_memmove(m_indices+id, m_indices+m_size, m_indices+id+1);
+        }
+        m_size++;
+        m_indices[id] = internal::convert_index<StorageIndex>(key);
         m_values[id] = defaultValue;
       }
       return m_values[id];
     }
 
+    void moveChunk(Index from, Index to, Index chunkSize)
+    {
+      eigen_internal_assert(to+chunkSize <= m_size);
+      if(to>from && from+chunkSize>to)
+      {
+        // move backward
+        internal::smart_memmove(m_values+from,  m_values+from+chunkSize,  m_values+to);
+        internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      }
+      else
+      {
+        internal::smart_copy(m_values+from,  m_values+from+chunkSize,  m_values+to);
+        internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      }
+    }
+
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
-      size_t k = 0;
-      size_t n = size();
-      for (size_t i=0; i<n; ++i)
+      Index k = 0;
+      Index n = size();
+      for (Index i=0; i<n; ++i)
       {
         if (!internal::isMuchSmallerThan(value(i), reference, epsilon))
         {
@@ -203,29 +241,29 @@
 
   protected:
 
-    inline void reallocate(size_t size)
+    inline void reallocate(Index size)
     {
-      Scalar* newValues  = new Scalar[size];
-      Index* newIndices = new Index[size];
-      size_t copySize = (std::min)(size, m_size);
-      // copy
+      #ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+        EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+      #endif
+      eigen_internal_assert(size!=m_allocatedSize);
+      internal::scoped_array<Scalar> newValues(size);
+      internal::scoped_array<StorageIndex> newIndices(size);
+      Index copySize = (std::min)(size, m_size);
       if (copySize>0) {
-        internal::smart_copy(m_values, m_values+copySize, newValues);
-        internal::smart_copy(m_indices, m_indices+copySize, newIndices);
+        internal::smart_copy(m_values, m_values+copySize, newValues.ptr());
+        internal::smart_copy(m_indices, m_indices+copySize, newIndices.ptr());
       }
-      // delete old stuff
-      delete[] m_values;
-      delete[] m_indices;
-      m_values = newValues;
-      m_indices = newIndices;
+      std::swap(m_values,newValues.ptr());
+      std::swap(m_indices,newIndices.ptr());
       m_allocatedSize = size;
     }
 
   protected:
     Scalar* m_values;
-    Index* m_indices;
-    size_t m_size;
-    size_t m_allocatedSize;
+    StorageIndex* m_indices;
+    Index m_size;
+    Index m_allocatedSize;
 
 };
 

diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index bab6391..9486502 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,24 +10,30 @@
 #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
 template<typename Lhs, typename Rhs, typename ResultType>
-static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
 {
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
+  typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  typedef typename remove_all<ResultType>::type::Scalar ResScalar;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
-  std::vector<bool> mask(rows,false);
-  Matrix<Scalar,Dynamic,1> values(rows);
-  Matrix<Index,Dynamic,1>  indices(rows);
+  ei_declare_aligned_stack_constructed_variable(bool,   mask,     rows, 0);
+  ei_declare_aligned_stack_constructed_variable(ResScalar, values,   rows, 0);
+  ei_declare_aligned_stack_constructed_variable(Index,  indices,  rows, 0);
+
+  std::memset(mask,0,sizeof(bool)*rows);
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
 
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
@@ -35,12 +41,7 @@
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  // Also, nnz can not be more than the total number of elements, i.e. rows(lhs) * cols(rhs).
-  // For example, in the case of a row vector multiplied by column vector, this limit is 1,
-  // but nnz(lhs) + nnz(rhs) can be more than 1.
-  // We use double to avoid integer overflows.
-  double estimated_nnz_prod = std::min(double(lhs.nonZeros()) + double(rhs.nonZeros()),
-                                       double(lhs.rows()) * double(rhs.cols()) + 1);
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.setZero();
   res.reserve(Index(estimated_nnz_prod));
@@ -50,14 +51,14 @@
 
     res.startVec(j);
     Index nnz = 0;
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
-      Scalar y = rhsIt.value();
+      RhsScalar y = rhsIt.value();
       Index k = rhsIt.index();
-      for (typename Lhs::InnerIterator lhsIt(lhs, k); lhsIt; ++lhsIt)
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
       {
         Index i = lhsIt.index();
-        Scalar x = lhsIt.value();
+        LhsScalar x = lhsIt.value();
         if(!mask[i])
         {
           mask[i] = true;
@@ -69,53 +70,51 @@
           values[i] += x * y;
       }
     }
-
-    // unordered insertion
-    for(Index k=0; k<nnz; ++k)
+    if(!sortedInsertion)
     {
-      Index i = indices[k];
-      res.insertBackByOuterInnerUnordered(j,i) = values[i];
-      mask[i] = false;
-    }
-
-#if 0
-    // alternative ordered insertion code:
-
-    Index t200 = rows/(log2(200)*1.39);
-    Index t = (rows*100)/139;
-
-    // FIXME reserve nnz non zeros
-    // FIXME implement fast sort algorithms for very small nnz
-    // if the result is sparse enough => use a quick sort
-    // otherwise => loop through the entire vector
-    // In order to avoid to perform an expensive log2 when the
-    // result is clearly very sparse we use a linear bound up to 200.
-    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
-    //res.startVec(j);
-    if(true)
-    {
-      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
+      // unordered insertion
       for(Index k=0; k<nnz; ++k)
       {
         Index i = indices[k];
-        res.insertBackByOuterInner(j,i) = values[i];
+        res.insertBackByOuterInnerUnordered(j,i) = values[i];
         mask[i] = false;
       }
     }
     else
     {
-      // dense path
-      for(Index i=0; i<rows; ++i)
+      // alternative ordered insertion code:
+      const Index t200 = rows/11; // 11 == (log2(200)*1.39)
+      const Index t = (rows*100)/139;
+
+      // FIXME reserve nnz non zeros
+      // FIXME implement faster sorting algorithms for very small nnz
+      // if the result is sparse enough => use a quick sort
+      // otherwise => loop through the entire vector
+      // In order to avoid to perform an expensive log2 when the
+      // result is clearly very sparse we use a linear bound up to 200.
+      if((nnz<200 && nnz<t200) || nnz * numext::log2(int(nnz)) < t)
       {
-        if(mask[i])
+        if(nnz>1) std::sort(indices,indices+nnz);
+        for(Index k=0; k<nnz; ++k)
         {
-          mask[i] = false;
+          Index i = indices[k];
           res.insertBackByOuterInner(j,i) = values[i];
+          mask[i] = false;
+        }
+      }
+      else
+      {
+        // dense path
+        for(Index i=0; i<rows; ++i)
+        {
+          if(mask[i])
+          {
+            mask[i] = false;
+            res.insertBackByOuterInner(j,i) = values[i];
+          }
         }
       }
     }
-#endif
-
   }
   res.finalize();
 }
@@ -139,13 +138,28 @@
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix resCol(lhs.rows(),rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
-    // sort the non zeros:
-    RowMajorMatrix resRow(resCol);
-    res = resRow;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrixAux;
+    typedef typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type ColMajorMatrix;
+
+    // If the result is tall and thin (in the extreme case a column vector)
+    // then it is faster to sort the coefficients inplace instead of transposing twice.
+    // FIXME, the following heuristic is probably not very good.
+    if(lhs.rows()>rhs.cols())
+    {
+      ColMajorMatrix resCol(lhs.rows(),rhs.cols());
+      // perform sorted insertion
+      internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol, true);
+      res = resCol.markAsRValue();
+    }
+    else
+    {
+      ColMajorMatrixAux resCol(lhs.rows(),rhs.cols());
+      // resort to transpose to sort the entries
+      internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrixAux>(lhs, rhs, resCol, false);
+      RowMajorMatrix resRow(resCol);
+      res = resRow.markAsRValue();
+    }
   }
 };
 
@@ -154,11 +168,12 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-     RowMajorMatrix rhsRow = rhs;
-     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-     internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
-     res = resRow;
+    typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRhs;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+    RowMajorRhs rhsRow = rhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<RowMajorRhs,Lhs,RowMajorRes>(rhsRow, lhs, resRow);
+    res = resRow;
   }
 };
 
@@ -167,10 +182,11 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    RowMajorMatrix lhsRow = lhs;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
+    typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+    RowMajorLhs lhsRow = lhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorLhs,RowMajorRes>(rhs, lhsRow, resRow);
     res = resRow;
   }
 };
@@ -180,7 +196,7 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
     res = resRow;
@@ -195,7 +211,7 @@
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
     res = resCol;
@@ -207,10 +223,11 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix lhsCol = lhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+    ColMajorLhs lhsCol = lhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<ColMajorLhs,Rhs,ColMajorRes>(lhsCol, rhs, resCol);
     res = resCol;
   }
 };
@@ -220,10 +237,11 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix rhsCol = rhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+    ColMajorRhs rhsCol = rhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorRhs,ColMajorRes>(lhs, rhsCol, resCol);
     res = resCol;
   }
 };
@@ -233,8 +251,8 @@
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     RowMajorMatrix resRow(lhs.rows(),rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
     // sort the non zeros:
@@ -245,6 +263,90 @@
 
 } // end namespace internal
 
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType>
+static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+{
+  typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  Index cols = rhs.outerSize();
+  eigen_assert(lhs.outerSize() == rhs.innerSize());
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+
+  for (Index j=0; j<cols; ++j)
+  {
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
+    {
+      RhsScalar y = rhsIt.value();
+      Index k = rhsIt.index();
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
+      {
+        Index i = lhsIt.index();
+        LhsScalar x = lhsIt.value();
+        res.coeffRef(i,j) += x * y;
+      }
+    }
+  }
+}
+
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType,
+  int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+  int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor>
+struct sparse_sparse_to_dense_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    internal::sparse_sparse_to_dense_product_impl<Lhs,Rhs,ResultType>(lhs, rhs, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+    ColMajorLhs lhsCol(lhs);
+    internal::sparse_sparse_to_dense_product_impl<ColMajorLhs,Rhs,ResultType>(lhsCol, rhs, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+    ColMajorRhs rhsCol(rhs);
+    internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorRhs,ResultType>(lhs, rhsCol, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    Transpose<ResultType> trRes(res);
+    internal::sparse_sparse_to_dense_product_impl<Rhs,Lhs,Transpose<ResultType> >(rhs, lhs, trRes);
+  }
+};
+
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H

diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h
index ab1a266..67718c8 100644
--- a/Eigen/src/SparseCore/MappedSparseMatrix.h
+++ b/Eigen/src/SparseCore/MappedSparseMatrix.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,9 +10,10 @@
 #ifndef EIGEN_MAPPED_SPARSEMATRIX_H
 #define EIGEN_MAPPED_SPARSEMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
-/** \class MappedSparseMatrix
+/** \deprecated Use Map<SparseMatrix<> >
+  * \class MappedSparseMatrix
   *
   * \brief Sparse matrix
   *
@@ -22,159 +23,44 @@
   *
   */
 namespace internal {
-template<typename _Scalar, int _Flags, typename _Index>
-struct traits<MappedSparseMatrix<_Scalar, _Flags, _Index> > : traits<SparseMatrix<_Scalar, _Flags, _Index> >
+template<typename _Scalar, int _Flags, typename _StorageIndex>
+struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
 {};
-}
+} // end namespace internal
 
-template<typename _Scalar, int _Flags, typename _Index>
+template<typename _Scalar, int _Flags, typename _StorageIndex>
 class MappedSparseMatrix
-  : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _Index> >
+  : public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
 {
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
-    enum { IsRowMajor = Base::IsRowMajor };
-
-  protected:
-
-    Index   m_outerSize;
-    Index   m_innerSize;
-    Index   m_nnz;
-    Index*  m_outerIndex;
-    Index*  m_innerIndices;
-    Scalar* m_values;
+    typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base;
 
   public:
-
-    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
-    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
-    inline Index innerSize() const { return m_innerSize; }
-    inline Index outerSize() const { return m_outerSize; }
     
-    bool isCompressed() const { return true; }
+    typedef typename Base::StorageIndex StorageIndex;
+    typedef typename Base::Scalar Scalar;
 
-    //----------------------------------------
-    // direct access interface
-    inline const Scalar* valuePtr() const { return m_values; }
-    inline Scalar* valuePtr() { return m_values; }
-
-    inline const Index* innerIndexPtr() const { return m_innerIndices; }
-    inline Index* innerIndexPtr() { return m_innerIndices; }
-
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
-    inline Index* outerIndexPtr() { return m_outerIndex; }
-    //----------------------------------------
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      if (start==end)
-        return Scalar(0);
-      else if (end>0 && inner==m_innerIndices[end-1])
-        return m_values[end-1];
-      // ^^  optimization: let's first check if it is the last coefficient
-      // (very common in high level algorithms)
-
-      const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
-      const Index id = r-&m_innerIndices[0];
-      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
-      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
-      Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
-      const Index id = r-&m_innerIndices[0];
-      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
-      return m_values[id];
-    }
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return m_nnz; }
-
-    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr)
-      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
-        m_innerIndices(innerIndexPtr), m_values(valuePtr)
+    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr)
     {}
 
     /** Empty destructor */
     inline ~MappedSparseMatrix() {}
 };
 
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::InnerIterator
+namespace internal {
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > >
 {
-  public:
-    InnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer]),
-        m_start(m_id),
-        m_end(mat.outerIndexPtr()[outer+1])
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
+  typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
 };
 
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer+1]),
-        m_start(mat.outerIndexPtr()[outer]),
-        m_end(m_id)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
+}
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
new file mode 100644
index 0000000..905485c
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseAssign.h

@@ -0,0 +1,270 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEASSIGN_H
+#define EIGEN_SPARSEASSIGN_H
+
+namespace Eigen { 
+
+template<typename Derived>    
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
+{
+  internal::call_assignment_no_alias(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{
+  // TODO use the evaluator mechanism
+  other.evalTo(derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived& SparseMatrixBase<Derived>::operator=(const SparseMatrixBase<OtherDerived>& other)
+{
+  // by default sparse evaluation do not alias, so we can safely bypass the generic call_assignment routine
+  internal::Assignment<Derived,OtherDerived,internal::assign_op<Scalar,typename OtherDerived::Scalar> >
+          ::run(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+inline Derived& SparseMatrixBase<Derived>::operator=(const Derived& other)
+{
+  internal::call_assignment_no_alias(derived(), other.derived());
+  return derived();
+}
+
+namespace internal {
+
+template<>
+struct storage_kind_to_evaluator_kind<Sparse> {
+  typedef IteratorBased Kind;
+};
+
+template<>
+struct storage_kind_to_shape<Sparse> {
+  typedef SparseShape Shape;
+};
+
+struct Sparse2Sparse {};
+struct Sparse2Dense  {};
+
+template<> struct AssignmentKind<SparseShape, SparseShape>           { typedef Sparse2Sparse Kind; };
+template<> struct AssignmentKind<SparseShape, SparseTriangularShape> { typedef Sparse2Sparse Kind; };
+template<> struct AssignmentKind<DenseShape,  SparseShape>           { typedef Sparse2Dense  Kind; };
+template<> struct AssignmentKind<DenseShape,  SparseTriangularShape> { typedef Sparse2Dense  Kind; };
+
+
+template<typename DstXprType, typename SrcXprType>
+void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  typedef internal::evaluator<DstXprType> DstEvaluatorType;
+  typedef internal::evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  const bool transpose = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit);
+  const Index outerEvaluationSize = (SrcEvaluatorType::Flags&RowMajorBit) ? src.rows() : src.cols();
+  if ((!transpose) && src.isRValue())
+  {
+    // eval without temporary
+    dst.resize(src.rows(), src.cols());
+    dst.setZero();
+    dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
+    for (Index j=0; j<outerEvaluationSize; ++j)
+    {
+      dst.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it)
+      {
+        Scalar v = it.value();
+        dst.insertBackByOuterInner(j,it.index()) = v;
+      }
+    }
+    dst.finalize();
+  }
+  else
+  {
+    // eval through a temporary
+    eigen_assert(( ((internal::traits<DstXprType>::SupportedAccessPatterns & OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
+              (!((DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit)))) &&
+              "the transpose operation is supposed to be handled in SparseMatrix::operator=");
+
+    enum { Flip = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit) };
+
+    
+    DstXprType temp(src.rows(), src.cols());
+
+    temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
+    for (Index j=0; j<outerEvaluationSize; ++j)
+    {
+      temp.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it)
+      {
+        Scalar v = it.value();
+        temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v;
+      }
+    }
+    temp.finalize();
+
+    dst = temp.markAsRValue();
+  }
+}
+
+// Generic Sparse to Sparse assignment
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    assign_sparse_to_sparse(dst.derived(), src.derived());
+  }
+};
+
+// Generic Sparse to Dense assignment
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Weak>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    if(internal::is_same<Functor,internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> >::value)
+      dst.setZero();
+    
+    internal::evaluator<SrcXprType> srcEval(src);
+    resize_if_allowed(dst, src, func);
+    internal::evaluator<DstXprType> dstEval(dst);
+    
+    const Index outerEvaluationSize = (internal::evaluator<SrcXprType>::Flags&RowMajorBit) ? src.rows() : src.cols();
+    for (Index j=0; j<outerEvaluationSize; ++j)
+      for (typename internal::evaluator<SrcXprType>::InnerIterator i(srcEval,j); i; ++i)
+        func.assignCoeff(dstEval.coeffRef(i.row(),i.col()), i.value());
+  }
+};
+
+// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense
+template<typename DstXprType, typename Func1, typename Func2>
+struct assignment_from_dense_op_sparse
+{
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
+  {
+    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
+    #endif
+
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+
+  // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse;
+  template<typename Lhs, typename Rhs, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type
+  run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_sum_op<Scalar,Scalar>, const Lhs, const Rhs> &src,
+      const internal::assign_op<typename DstXprType::Scalar,Scalar>& /*func*/)
+  {
+    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
+    #endif
+
+    // Apply the dense matrix first, then the sparse one.
+    call_assignment_no_alias(dst, src.rhs(), Func1());
+    call_assignment_no_alias(dst, src.lhs(), Func2());
+  }
+
+  // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse;
+  template<typename Lhs, typename Rhs, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type
+  run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_difference_op<Scalar,Scalar>, const Lhs, const Rhs> &src,
+      const internal::assign_op<typename DstXprType::Scalar,Scalar>& /*func*/)
+  {
+    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
+    #endif
+
+    // Apply the dense matrix first, then the sparse one.
+    call_assignment_no_alias(dst, -src.rhs(), Func1());
+    call_assignment_no_alias(dst,  src.lhs(), add_assign_op<typename DstXprType::Scalar,typename Lhs::Scalar>());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<Scalar,Scalar>, const Lhs, const Rhs>, internal::ASSIGN_OP<typename DstXprType::Scalar,Scalar>, \
+                    Sparse2Dense, \
+                    typename internal::enable_if<   internal::is_same<typename internal::evaluator_traits<Lhs>::Shape,DenseShape>::value \
+                                                 || internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type> \
+    : assignment_from_dense_op_sparse<DstXprType, internal::ASSIGN_OP<typename DstXprType::Scalar,typename Lhs::Scalar>, internal::ASSIGN_OP2<typename DstXprType::Scalar,typename Rhs::Scalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_difference_op,add_assign_op);
+
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Sparse2Sparse>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+struct Diagonal2Sparse {};
+
+template<> struct AssignmentKind<SparseShape,DiagonalShape> { typedef Diagonal2Sparse Kind; };
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
+{
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef typename DstXprType::Scalar Scalar;
+
+  template<int Options, typename AssignFunc>
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func)
+  { dst.assignDiagonal(src.diagonal(), func); }
+  
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() = src.diagonal(); }
+  
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() += src.diagonal(); }
+  
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() -= src.diagonal(); }
+};
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEASSIGN_H

diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 3a6d8a2..5b4f6cc 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,6 +12,7 @@
 
 namespace Eigen {
 
+// Subset of columns or rows
 template<typename XprType, int BlockRows, int BlockCols>
 class BlockImpl<XprType,BlockRows,BlockCols,true,Sparse>
   : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,true> >
@@ -22,40 +23,17 @@
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
 protected:
     enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+    typedef SparseMatrixBase<BlockType> Base;
+    using Base::convert_index;
 public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 
-    class InnerIterator: public XprType::InnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : XprType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public XprType::ReverseInnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : XprType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline BlockImpl(const XprType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    inline BlockImpl(XprType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
     {}
 
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
     {}
 
     EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
@@ -63,22 +41,48 @@
 
     Index nonZeros() const
     {
+      typedef internal::evaluator<XprType> EvaluatorType;
+      EvaluatorType matEval(m_matrix);
       Index nnz = 0;
       Index end = m_outerStart + m_outerSize.value();
-      for(int j=m_outerStart; j<end; ++j)
-        for(typename XprType::InnerIterator it(m_matrix, j); it; ++it)
+      for(Index j=m_outerStart; j<end; ++j)
+        for(typename EvaluatorType::InnerIterator it(matEval, j); it; ++it)
           ++nnz;
       return nnz;
     }
 
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+
+    inline const Scalar coeff(Index index) const
+    {
+      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
+    }
+
+    inline const XprType& nestedExpression() const { return m_matrix; }
+    inline XprType& nestedExpression() { return m_matrix; }
+    Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+    Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+    Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
   protected:
 
-    typename XprType::Nested m_matrix;
+    typename internal::ref_selector<XprType>::non_const_type m_matrix;
     Index m_outerStart;
     const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
 
-  public:
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+  protected:
+    // Disable assignment with clear error message.
+    // Note that simply removing operator= yields compilation errors with ICC+MSVC
+    template<typename T>
+    BlockImpl& operator=(const T&)
+    {
+      EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+      return *this;
+    }
 };
 
 
@@ -90,62 +94,43 @@
 
 template<typename SparseMatrixType, int BlockRows, int BlockCols>
 class sparse_matrix_block_impl
-  : public SparseMatrixBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
+  : public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
 {
     typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
     typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
+    typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
+    using Base::convert_index;
 public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 protected:
+    typedef typename Base::IndexVector IndexVector;
     enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
 public:
 
-    class InnerIterator: public SparseMatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
-    {
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline sparse_matrix_block_impl(const SparseMatrixType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
     {}
 
-    inline sparse_matrix_block_impl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
     {}
 
     template<typename OtherDerived>
     inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other)
     {
       typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
-      _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
+      _NestedMatrixType& matrix = m_matrix;
       // This assignment is slow if this vector set is not empty
       // and/or it is not at the end of the nonzeros of the underlying matrix.
 
       // 1 - eval to a temporary to avoid transposition and/or aliasing issues
-      SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, Index> tmp(other);
+      Ref<const SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, StorageIndex> > tmp(other.derived());
+      eigen_internal_assert(tmp.outerSize()==m_outerSize.value());
 
       // 2 - let's check whether there is enough allocated memory
       Index nnz           = tmp.nonZeros();
-      Index start         = m_outerStart==0 ? 0 : matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
+      Index start         = m_outerStart==0 ? 0 : m_matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
       Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending position of the current block
       Index block_size    = end - start;                                                // available room in the current block
       Index tail_size     = m_matrix.outerIndexPtr()[m_matrix.outerSize()] - end;
@@ -154,52 +139,73 @@
                           ? Index(matrix.data().allocatedSize()) + block_size
                           : block_size;
 
+      Index tmp_start = tmp.outerIndexPtr()[0];
+
+      bool update_trailing_pointers = false;
       if(nnz>free_size)
       {
         // realloc manually to reduce copies
         typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz);
 
-        internal::smart_copy(&m_matrix.data().value(0),  &m_matrix.data().value(0) + start, &newdata.value(0));
-        internal::smart_copy(&m_matrix.data().index(0),  &m_matrix.data().index(0) + start, &newdata.index(0));
+        internal::smart_copy(m_matrix.valuePtr(),       m_matrix.valuePtr() + start,      newdata.valuePtr());
+        internal::smart_copy(m_matrix.innerIndexPtr(),  m_matrix.innerIndexPtr() + start, newdata.indexPtr());
 
-        internal::smart_copy(&tmp.data().value(0),  &tmp.data().value(0) + nnz, &newdata.value(start));
-        internal::smart_copy(&tmp.data().index(0),  &tmp.data().index(0) + nnz, &newdata.index(start));
+        internal::smart_copy(tmp.valuePtr() + tmp_start,      tmp.valuePtr() + tmp_start + nnz,       newdata.valuePtr() + start);
+        internal::smart_copy(tmp.innerIndexPtr() + tmp_start, tmp.innerIndexPtr() + tmp_start + nnz,  newdata.indexPtr() + start);
 
-        internal::smart_copy(&matrix.data().value(end),  &matrix.data().value(end) + tail_size, &newdata.value(start+nnz));
-        internal::smart_copy(&matrix.data().index(end),  &matrix.data().index(end) + tail_size, &newdata.index(start+nnz));
+        internal::smart_copy(matrix.valuePtr()+end,       matrix.valuePtr()+end + tail_size,      newdata.valuePtr()+start+nnz);
+        internal::smart_copy(matrix.innerIndexPtr()+end,  matrix.innerIndexPtr()+end + tail_size, newdata.indexPtr()+start+nnz);
 
         newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz);
 
         matrix.data().swap(newdata);
+
+        update_trailing_pointers = true;
       }
       else
       {
-        // no need to realloc, simply copy the tail at its respective position and insert tmp
-        matrix.data().resize(start + nnz + tail_size);
+        if(m_matrix.isCompressed() && nnz!=block_size)
+        {
+          // no need to realloc, simply copy the tail at its respective position and insert tmp
+          matrix.data().resize(start + nnz + tail_size);
 
-        internal::smart_memmove(&matrix.data().value(end),  &matrix.data().value(end) + tail_size, &matrix.data().value(start + nnz));
-        internal::smart_memmove(&matrix.data().index(end),  &matrix.data().index(end) + tail_size, &matrix.data().index(start + nnz));
+          internal::smart_memmove(matrix.valuePtr()+end,      matrix.valuePtr() + end+tail_size,      matrix.valuePtr() + start+nnz);
+          internal::smart_memmove(matrix.innerIndexPtr()+end, matrix.innerIndexPtr() + end+tail_size, matrix.innerIndexPtr() + start+nnz);
 
-        internal::smart_copy(&tmp.data().value(0),  &tmp.data().value(0) + nnz, &matrix.data().value(start));
-        internal::smart_copy(&tmp.data().index(0),  &tmp.data().index(0) + nnz, &matrix.data().index(start));
+          update_trailing_pointers = true;
+        }
+
+        internal::smart_copy(tmp.valuePtr() + tmp_start,      tmp.valuePtr() + tmp_start + nnz,       matrix.valuePtr() + start);
+        internal::smart_copy(tmp.innerIndexPtr() + tmp_start, tmp.innerIndexPtr() + tmp_start + nnz,  matrix.innerIndexPtr() + start);
       }
 
-      // update innerNonZeros
-      if(!m_matrix.isCompressed())
-        for(Index j=0; j<m_outerSize.value(); ++j)
-          matrix.innerNonZeroPtr()[m_outerStart+j] = tmp.innerVector(j).nonZeros();
-
-      // update outer index pointers
-      Index p = start;
-      for(Index k=0; k<m_outerSize.value(); ++k)
+      // update outer index pointers and innerNonZeros
+      if(IsVectorAtCompileTime)
       {
-        matrix.outerIndexPtr()[m_outerStart+k] = p;
-        p += tmp.innerVector(k).nonZeros();
+        if(!m_matrix.isCompressed())
+          matrix.innerNonZeroPtr()[m_outerStart] = StorageIndex(nnz);
+        matrix.outerIndexPtr()[m_outerStart] = StorageIndex(start);
       }
-      std::ptrdiff_t offset = nnz - block_size;
-      for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+      else
       {
-        matrix.outerIndexPtr()[k] += offset;
+        StorageIndex p = StorageIndex(start);
+        for(Index k=0; k<m_outerSize.value(); ++k)
+        {
+          StorageIndex nnz_k = internal::convert_index<StorageIndex>(tmp.innerVector(k).nonZeros());
+          if(!m_matrix.isCompressed())
+            matrix.innerNonZeroPtr()[m_outerStart+k] = nnz_k;
+          matrix.outerIndexPtr()[m_outerStart+k] = p;
+          p += nnz_k;
+        }
+      }
+
+      if(update_trailing_pointers)
+      {
+        StorageIndex offset = internal::convert_index<StorageIndex>(nnz - block_size);
+        for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+        {
+          matrix.outerIndexPtr()[k] += offset;
+        }
       }
 
       return derived();
@@ -211,35 +217,46 @@
     }
 
     inline const Scalar* valuePtr() const
-    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    { return m_matrix.valuePtr(); }
     inline Scalar* valuePtr()
-    { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    { return m_matrix.valuePtr(); }
 
-    inline const Index* innerIndexPtr() const
-    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-    inline Index* innerIndexPtr()
-    { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    inline const StorageIndex* innerIndexPtr() const
+    { return m_matrix.innerIndexPtr(); }
+    inline StorageIndex* innerIndexPtr()
+    { return m_matrix.innerIndexPtr(); }
 
-    inline const Index* outerIndexPtr() const
+    inline const StorageIndex* outerIndexPtr() const
     { return m_matrix.outerIndexPtr() + m_outerStart; }
-    inline Index* outerIndexPtr()
-    { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
+    inline StorageIndex* outerIndexPtr()
+    { return m_matrix.outerIndexPtr() + m_outerStart; }
 
-    Index nonZeros() const
+    inline const StorageIndex* innerNonZeroPtr() const
+    { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
+    inline StorageIndex* innerNonZeroPtr()
+    { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
+
+    bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
+
+    inline Scalar& coeffRef(Index row, Index col)
     {
-      if(m_matrix.isCompressed())
-        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
-              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
-      else if(m_outerSize.value()==0)
-        return 0;
-      else
-        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
+      return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+
+    inline const Scalar coeff(Index index) const
+    {
+      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
     }
 
     const Scalar& lastCoeff() const
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
-      eigen_assert(nonZeros()>0);
+      eigen_assert(Base::nonZeros()>0);
       if(m_matrix.isCompressed())
         return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
       else
@@ -249,9 +266,16 @@
     EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
     EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
+    inline const SparseMatrixType& nestedExpression() const { return m_matrix; }
+    inline SparseMatrixType& nestedExpression() { return m_matrix; }
+    Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+    Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+    Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
   protected:
 
-    typename SparseMatrixType::Nested m_matrix;
+    typename internal::ref_selector<SparseMatrixType>::non_const_type m_matrix;
     Index m_outerStart;
     const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
 
@@ -259,93 +283,49 @@
 
 } // namespace internal
 
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public internal::sparse_matrix_block_impl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols>
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+class BlockImpl<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true,Sparse>
+  : public internal::sparse_matrix_block_impl<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols>
 {
 public:
-  typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
+  typedef _StorageIndex StorageIndex;
+  typedef SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType;
   typedef internal::sparse_matrix_block_impl<SparseMatrixType,BlockRows,BlockCols> Base;
-  inline BlockImpl(SparseMatrixType& xpr, int i)
+  inline BlockImpl(SparseMatrixType& xpr, Index i)
     : Base(xpr, i)
   {}
 
-  inline BlockImpl(SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
     : Base(xpr, startRow, startCol, blockRows, blockCols)
   {}
 
   using Base::operator=;
 };
 
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public internal::sparse_matrix_block_impl<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols>
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+class BlockImpl<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true,Sparse>
+  : public internal::sparse_matrix_block_impl<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols>
 {
 public:
-  typedef const SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
+  typedef _StorageIndex StorageIndex;
+  typedef const SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType;
   typedef internal::sparse_matrix_block_impl<SparseMatrixType,BlockRows,BlockCols> Base;
-  inline BlockImpl(SparseMatrixType& xpr, int i)
+  inline BlockImpl(SparseMatrixType& xpr, Index i)
     : Base(xpr, i)
   {}
 
-  inline BlockImpl(SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
     : Base(xpr, startRow, startCol, blockRows, blockCols)
   {}
 
   using Base::operator=;
+private:
+  template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr, Index i);
+  template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr);
 };
 
 //----------
 
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major).
-  */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
-{ return InnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major). Read-only.
-  */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
-{ return ConstInnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major).
-  */
-template<typename Derived>
-Block<Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
-{
-  return Block<Derived,Dynamic,Dynamic,true>(derived(),
-                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major). Read-only.
-  */
-template<typename Derived>
-const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
-{
-  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
-                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-namespace internal {
-
-template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
-          bool OuterVector =  (BlockCols==1 && XprType::IsRowMajor)
-                               | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                 // revert to || as soon as not needed anymore.
-                              (BlockRows==1 && !XprType::IsRowMajor)>
-class GenericSparseBlockInnerIteratorImpl;
-
-}
-
 /** Generic implementation of sparse Block expression.
   * Real-only.
   */
@@ -353,7 +333,9 @@
 class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
   : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,InnerPanel> >, internal::no_assignment_operator
 {
-  typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef SparseMatrixBase<BlockType> Base;
+    using Base::convert_index;
 public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
@@ -362,182 +344,224 @@
 
     /** Column or Row constructor
       */
-    inline BlockImpl(const XprType& xpr, int i)
+    inline BlockImpl(XprType& xpr, Index i)
       : m_matrix(xpr),
-        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
-        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0),
         m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
         m_blockCols(BlockCols==1 ? 1 : xpr.cols())
     {}
 
     /** Dynamic-size constructor
       */
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols)
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols))
     {}
 
-    inline int rows() const { return m_blockRows.value(); }
-    inline int cols() const { return m_blockCols.value(); }
+    inline Index rows() const { return m_blockRows.value(); }
+    inline Index cols() const { return m_blockCols.value(); }
 
-    inline Scalar& coeffRef(int row, int col)
+    inline Scalar& coeffRef(Index row, Index col)
     {
-      return m_matrix.const_cast_derived()
-               .coeffRef(row + m_startRow.value(), col + m_startCol.value());
+      return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value());
     }
 
-    inline const Scalar coeff(int row, int col) const
+    inline const Scalar coeff(Index row, Index col) const
     {
       return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
     }
 
-    inline Scalar& coeffRef(int index)
+    inline Scalar& coeffRef(Index index)
     {
-      return m_matrix.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                               m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
-    inline const Scalar coeff(int index) const
+    inline const Scalar coeff(Index index) const
     {
-      return m_matrix
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
-    inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
+    inline const XprType& nestedExpression() const { return m_matrix; }
+    inline XprType& nestedExpression() { return m_matrix; }
+    Index startRow() const { return m_startRow.value(); }
+    Index startCol() const { return m_startCol.value(); }
+    Index blockRows() const { return m_blockRows.value(); }
+    Index blockCols() const { return m_blockCols.value(); }
 
-    typedef internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel> InnerIterator;
-
-    class ReverseInnerIterator : public _MatrixTypeNested::ReverseInnerIterator
-    {
-      typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-      const BlockType& m_block;
-      Index m_begin;
-    public:
-
-      EIGEN_STRONG_INLINE ReverseInnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_begin(IsRowMajor ? block.m_startCol.value() : block.m_startRow.value())
-      {
-        while( (Base::operator bool()) && (Base::index() >= (IsRowMajor ? m_block.m_startCol.value()+block.m_blockCols.value() : m_block.m_startRow.value()+block.m_blockRows.value())) )
-          Base::operator--();
-      }
-
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-
-      inline operator bool() const { return Base::operator bool() && Base::index() >= m_begin; }
-    };
   protected:
-    friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
-    friend class ReverseInnerIterator;
+//     friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
+    friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+    Index nonZeros() const { return Dynamic; }
 
-    typename XprType::Nested m_matrix;
+    typename internal::ref_selector<XprType>::non_const_type m_matrix;
     const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
     const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
     const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
     const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
 
-};
-
-namespace internal {
-  template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
-  class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,false> : public Block<XprType, BlockRows, BlockCols, InnerPanel>::_MatrixTypeNested::InnerIterator
-  {
-    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
-    enum {
-      IsRowMajor = BlockType::IsRowMajor
-    };
-    typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
-    typedef typename BlockType::Index Index;
-    typedef typename _MatrixTypeNested::InnerIterator Base;
-    const BlockType& m_block;
-    Index m_end;
-  public:
-
-    EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer)
-      : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-        m_block(block),
-        m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
+  protected:
+    // Disable assignment with clear error message.
+    // Note that simply removing operator= yields compilation errors with ICC+MSVC
+    template<typename T>
+    BlockImpl& operator=(const T&)
     {
-      while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
-        Base::operator++();
-    }
-
-    inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-    inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-    inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-    inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-
-    inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
-  };
-
-  // Row vector of a column-major sparse matrix or column of a row-major one.
-  template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
-  class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,true>
-  {
-    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
-    enum {
-      IsRowMajor = BlockType::IsRowMajor
-    };
-    typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
-    typedef typename BlockType::Index Index;
-    typedef typename BlockType::Scalar Scalar;
-    const BlockType& m_block;
-    Index m_outerPos;
-    Index m_innerIndex;
-    Scalar m_value;
-    Index m_end;
-  public:
-
-    EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer = 0)
-      :
-        m_block(block),
-        m_outerPos( (IsRowMajor ? block.m_startCol.value() : block.m_startRow.value()) - 1), // -1 so that operator++ finds the first non-zero entry
-        m_innerIndex(IsRowMajor ? block.m_startRow.value() : block.m_startCol.value()),
-        m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-
-      ++(*this);
-    }
-
-    inline Index index()  const { return m_outerPos - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-    inline Index outer()  const { return 0; }
-    inline Index row()    const { return IsRowMajor ? 0 : index(); }
-    inline Index col()    const { return IsRowMajor ? index() : 0; }
-
-    inline Scalar value() const { return m_value; }
-
-    inline GenericSparseBlockInnerIteratorImpl& operator++()
-    {
-      // At end already?
-      if (m_outerPos >= m_end)
-        return *this;
-
-      // search next non-zero entry.
-      while(++m_outerPos<m_end)
-      {
-        typename XprType::InnerIterator it(m_block.m_matrix, m_outerPos);
-        // search for the key m_innerIndex in the current outer-vector
-        while(it && it.index() < m_innerIndex) ++it;
-        if(it && it.index()==m_innerIndex)
-        {
-          m_value = it.value();
-          break;
-        }
-      }
+      EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
       return *this;
     }
 
-    inline operator bool() const { return m_outerPos < m_end; }
-  };
+};
+
+namespace internal {
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
+ : public evaluator_base<Block<ArgType,BlockRows,BlockCols,InnerPanel> >
+{
+    class InnerVectorInnerIterator;
+    class OuterVectorInnerIterator;
+  public:
+    typedef Block<ArgType,BlockRows,BlockCols,InnerPanel> XprType;
+    typedef typename XprType::StorageIndex StorageIndex;
+    typedef typename XprType::Scalar Scalar;
+
+    enum {
+      IsRowMajor = XprType::IsRowMajor,
+
+      OuterVector =  (BlockCols==1 && ArgType::IsRowMajor)
+                    | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                      // revert to || as soon as not needed anymore.
+                     (BlockRows==1 && !ArgType::IsRowMajor),
+
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+
+    typedef typename internal::conditional<OuterVector,OuterVectorInnerIterator,InnerVectorInnerIterator>::type InnerIterator;
+
+    explicit unary_evaluator(const XprType& op)
+      : m_argImpl(op.nestedExpression()), m_block(op)
+    {}
+
+    inline Index nonZerosEstimate() const {
+      const Index nnz = m_block.nonZeros();
+      if(nnz < 0) {
+        // Scale the non-zero estimate for the underlying expression linearly with block size.
+        // Return zero if the underlying block is empty.
+        const Index nested_sz = m_block.nestedExpression().size();        
+        return nested_sz == 0 ? 0 : m_argImpl.nonZerosEstimate() * m_block.size() / nested_sz;
+      }
+      return nnz;
+    }
+
+  protected:
+    typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_block;
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
+ : public EvalIterator
+{
+  // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator
+  //      because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786)
+  // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor
+  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
+  const XprType& m_block;
+  Index m_end;
+public:
+
+  EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+    : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
+      m_block(aEval.m_block),
+      m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+  {
+    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (XprIsRowMajor ? m_block.startCol() : m_block.startRow())) )
+      EvalIterator::operator++();
+  }
+
+  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(XprIsRowMajor ? m_block.startCol() : m_block.startRow()); }
+  inline Index outer()  const { return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol()); }
+  inline Index row()    const { return EvalIterator::row()   - m_block.startRow(); }
+  inline Index col()    const { return EvalIterator::col()   - m_block.startCol(); }
+
+  inline operator bool() const { return EvalIterator::operator bool() && EvalIterator::index() < m_end; }
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
+{
+  // NOTE see above
+  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
+  const unary_evaluator& m_eval;
+  Index m_outerPos;
+  const Index m_innerIndex;
+  Index m_end;
+  EvalIterator m_it;
+public:
+
+  EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+    : m_eval(aEval),
+      m_outerPos( (XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
+      m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+      m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
+      m_it(m_eval.m_argImpl, m_outerPos)
+  {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer==0);
+
+    while(m_it && m_it.index() < m_innerIndex) ++m_it;
+    if((!m_it) || (m_it.index()!=m_innerIndex))
+      ++(*this);
+  }
+
+  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
+  inline Index outer()  const { return 0; }
+  inline Index row()    const { return XprIsRowMajor ? 0 : index(); }
+  inline Index col()    const { return XprIsRowMajor ? index() : 0; }
+
+  inline Scalar value() const { return m_it.value(); }
+  inline Scalar& valueRef() { return m_it.valueRef(); }
+
+  inline OuterVectorInnerIterator& operator++()
+  {
+    // search next non-zero entry
+    while(++m_outerPos<m_end)
+    {
+      // Restart iterator at the next inner-vector:
+      m_it.~EvalIterator();
+      ::new (&m_it) EvalIterator(m_eval.m_argImpl, m_outerPos);
+      // search for the key m_innerIndex in the current outer-vector
+      while(m_it && m_it.index() < m_innerIndex) ++m_it;
+      if(m_it && m_it.index()==m_innerIndex) break;
+    }
+    return *this;
+  }
+
+  inline operator bool() const { return m_outerPos < m_end; }
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true>, IteratorBased>
+  : evaluator<SparseCompressedBase<Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> > >
+{
+  typedef Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType &xpr) : Base(xpr) {}
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true>, IteratorBased>
+  : evaluator<SparseCompressedBase<Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> > >
+{
+  typedef Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType &xpr) : Base(xpr) {}
+};
 
 } // end namespace internal
 

diff --git a/Eigen/src/SparseCore/SparseColEtree.h b/Eigen/src/SparseCore/SparseColEtree.h
index f8745f4..ebe02d1 100644
--- a/Eigen/src/SparseCore/SparseColEtree.h
+++ b/Eigen/src/SparseCore/SparseColEtree.h

@@ -58,30 +58,29 @@
   * \param perm The permutation to apply to the column of \b mat
   */
 template <typename MatrixType, typename IndexVector>
-int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::Index *perm=0)
+int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::StorageIndex *perm=0)
 {
-  typedef typename MatrixType::Index Index;
-  Index nc = mat.cols(); // Number of columns 
-  Index m = mat.rows();
-  Index diagSize = (std::min)(nc,m);
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  StorageIndex nc = convert_index<StorageIndex>(mat.cols()); // Number of columns
+  StorageIndex m = convert_index<StorageIndex>(mat.rows());
+  StorageIndex diagSize = (std::min)(nc,m);
   IndexVector root(nc); // root of subtree of etree 
   root.setZero();
   IndexVector pp(nc); // disjoint sets 
   pp.setZero(); // Initialize disjoint sets 
   parent.resize(mat.cols());
   //Compute first nonzero column in each row 
-  Index row,col; 
   firstRowElt.resize(m);
   firstRowElt.setConstant(nc);
   firstRowElt.segment(0, diagSize).setLinSpaced(diagSize, 0, diagSize-1);
   bool found_diag;
-  for (col = 0; col < nc; col++)
+  for (StorageIndex col = 0; col < nc; col++)
   {
-    Index pcol = col;
+    StorageIndex pcol = col;
     if(perm) pcol  = perm[col];
     for (typename MatrixType::InnerIterator it(mat, pcol); it; ++it)
     { 
-      row = it.row();
+      Index row = it.row();
       firstRowElt(row) = (std::min)(firstRowElt(row), col);
     }
   }
@@ -89,8 +88,8 @@
           except use (firstRowElt[r],c) in place of an edge (r,c) of A.
     Thus each row clique in A'*A is replaced by a star
     centered at its first vertex, which has the same fill. */
-  Index rset, cset, rroot; 
-  for (col = 0; col < nc; col++) 
+  StorageIndex rset, cset, rroot;
+  for (StorageIndex col = 0; col < nc; col++) 
   {
     found_diag = col>=m;
     pp(col) = col; 
@@ -99,7 +98,7 @@
     parent(col) = nc; 
     /* The diagonal element is treated here even if it does not exist in the matrix
      * hence the loop is executed once more */ 
-    Index pcol = col;
+    StorageIndex pcol = col;
     if(perm) pcol  = perm[col];
     for (typename MatrixType::InnerIterator it(mat, pcol); it||!found_diag; ++it)
     { //  A sequence of interleaved find and union is performed 
@@ -107,7 +106,7 @@
       if(it) i = it.index();
       if (i == col) found_diag = true;
       
-      row = firstRowElt(i);
+      StorageIndex row = firstRowElt(i);
       if (row >= col) continue; 
       rset = internal::etree_find(row, pp); // Find the name of the set containing row
       rroot = root(rset);
@@ -127,10 +126,11 @@
   * Depth-first search from vertex n.  No recursion.
   * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France.
 */
-template <typename Index, typename IndexVector>
-void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, Index postnum)
+template <typename IndexVector>
+void nr_etdfs (typename IndexVector::Scalar n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, typename IndexVector::Scalar postnum)
 {
-  Index current = n, first, next;
+  typedef typename IndexVector::Scalar StorageIndex;
+  StorageIndex current = n, first, next;
   while (postnum != n) 
   {
     // No kid for the current node
@@ -174,22 +174,22 @@
   * \param parent Input tree
   * \param post postordered tree
   */
-template <typename Index, typename IndexVector>
-void treePostorder(Index n, IndexVector& parent, IndexVector& post)
+template <typename IndexVector>
+void treePostorder(typename IndexVector::Scalar n, IndexVector& parent, IndexVector& post)
 {
+  typedef typename IndexVector::Scalar StorageIndex;
   IndexVector first_kid, next_kid; // Linked list of children 
-  Index postnum; 
+  StorageIndex postnum; 
   // Allocate storage for working arrays and results 
   first_kid.resize(n+1); 
   next_kid.setZero(n+1);
   post.setZero(n+1);
   
   // Set up structure describing children
-  Index v, dad; 
   first_kid.setConstant(-1); 
-  for (v = n-1; v >= 0; v--) 
+  for (StorageIndex v = n-1; v >= 0; v--) 
   {
-    dad = parent(v);
+    StorageIndex dad = parent(v);
     next_kid(v) = first_kid(dad); 
     first_kid(dad) = v; 
   }

diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
new file mode 100644
index 0000000..6a2c7a8
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h

@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H
+#define EIGEN_SPARSE_COMPRESSED_BASE_H
+
+namespace Eigen { 
+
+template<typename Derived> class SparseCompressedBase;
+  
+namespace internal {
+
+template<typename Derived>
+struct traits<SparseCompressedBase<Derived> > : traits<Derived>
+{};
+
+} // end namespace internal
+
+/** \ingroup SparseCore_Module
+  * \class SparseCompressedBase
+  * \brief Common base class for sparse [compressed]-{row|column}-storage format.
+  *
+  * This class defines the common interface for all derived classes implementing the compressed sparse storage format, such as:
+  *  - SparseMatrix
+  *  - Ref<SparseMatrixType,Options>
+  *  - Map<SparseMatrixType>
+  *
+  */
+template<typename Derived>
+class SparseCompressedBase
+  : public SparseMatrixBase<Derived>
+{
+  public:
+    typedef SparseMatrixBase<Derived> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase)
+    using Base::operator=;
+    using Base::IsRowMajor;
+    
+    class InnerIterator;
+    class ReverseInnerIterator;
+    
+  protected:
+    typedef typename Base::IndexVector IndexVector;
+    Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
+    const  Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
+        
+  public:
+    
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const
+    {
+      if(Derived::IsVectorAtCompileTime && outerIndexPtr()==0)
+        return derived().nonZeros();
+      else if(isCompressed())
+        return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0];
+      else if(derived().outerSize()==0)
+        return 0;
+      else
+        return innerNonZeros().sum();
+    }
+    
+    /** \returns a const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline const Scalar* valuePtr() const { return derived().valuePtr(); }
+    /** \returns a non-const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline Scalar* valuePtr() { return derived().valuePtr(); }
+
+    /** \returns a const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); }
+    /** \returns a non-const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); }
+
+    /** \returns a const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 for SparseVector
+      * \sa valuePtr(), innerIndexPtr() */
+    inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); }
+    /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 for SparseVector
+      * \sa valuePtr(), innerIndexPtr() */
+    inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); }
+
+    /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); }
+    /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); }
+    
+    /** \returns whether \c *this is in compressed form. */
+    inline bool isCompressed() const { return innerNonZeroPtr()==0; }
+
+    /** \returns a read-only view of the stored coefficients as a 1D array expression.
+      *
+      * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+      *
+      * \sa valuePtr(), isCompressed() */
+    const Map<const Array<Scalar,Dynamic,1> > coeffs() const { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+
+    /** \returns a read-write view of the stored coefficients as a 1D array expression
+      *
+      * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+      *
+      * Here is an example:
+      * \include SparseMatrix_coeffs.cpp
+      * and the output is:
+      * \include SparseMatrix_coeffs.out
+      *
+      * \sa valuePtr(), isCompressed() */
+    Map<Array<Scalar,Dynamic,1> > coeffs() { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+
+  protected:
+    /** Default constructor. Do nothing. */
+    SparseCompressedBase() {}
+
+    /** \internal return the index of the coeff at (row,col) or just before if it does not exist.
+      * This is an analogue of std::lower_bound.
+      */
+    internal::LowerBoundIndex lower_bound(Index row, Index col) const
+    {
+      eigen_internal_assert(row>=0 && row<this->rows() && col>=0 && col<this->cols());
+
+      const Index outer = Derived::IsRowMajor ? row : col;
+      const Index inner = Derived::IsRowMajor ? col : row;
+
+      Index start = this->outerIndexPtr()[outer];
+      Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer];
+      eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+      internal::LowerBoundIndex p;
+      p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr();
+      p.found = (p.value<end) && (this->innerIndexPtr()[p.value]==inner);
+      return p;
+    }
+
+    friend struct internal::evaluator<SparseCompressedBase<Derived> >;
+
+  private:
+    template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
+};
+
+template<typename Derived>
+class SparseCompressedBase<Derived>::InnerIterator
+{
+  public:
+    InnerIterator()
+      : m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0)
+    {}
+
+    InnerIterator(const InnerIterator& other)
+      : m_values(other.m_values), m_indices(other.m_indices), m_outer(other.m_outer), m_id(other.m_id), m_end(other.m_end)
+    {}
+
+    InnerIterator& operator=(const InnerIterator& other)
+    {
+      m_values = other.m_values;
+      m_indices = other.m_indices;
+      const_cast<OuterType&>(m_outer).setValue(other.m_outer.value());
+      m_id = other.m_id;
+      m_end = other.m_end;
+      return *this;
+    }
+
+    InnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
+    {
+      if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0)
+      {
+        m_id = 0;
+        m_end = mat.nonZeros();
+      }
+      else
+      {
+        m_id = mat.outerIndexPtr()[outer];
+        if(mat.isCompressed())
+          m_end = mat.outerIndexPtr()[outer+1];
+        else
+          m_end = m_id + mat.innerNonZeroPtr()[outer];
+      }
+    }
+
+    explicit InnerIterator(const SparseCompressedBase& mat)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_id(0), m_end(mat.nonZeros())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    explicit InnerIterator(const internal::CompressedStorage<Scalar,StorageIndex>& data)
+      : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_id(0), m_end(data.size())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    inline InnerIterator& operator++() { m_id++; return *this; }
+    inline InnerIterator& operator+=(Index i) { m_id += i ; return *this; }
+
+    inline InnerIterator operator+(Index i) 
+    { 
+        InnerIterator result = *this;
+        result += i;
+        return result;
+    }
+
+    inline const Scalar& value() const { return m_values[m_id]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
+
+    inline StorageIndex index() const { return m_indices[m_id]; }
+    inline Index outer() const { return m_outer.value(); }
+    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+    inline operator bool() const { return (m_id < m_end); }
+
+  protected:
+    const Scalar* m_values;
+    const StorageIndex* m_indices;
+    typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
+    const OuterType m_outer;
+    Index m_id;
+    Index m_end;
+  private:
+    // If you get here, then you're not using the right InnerIterator type, e.g.:
+    //   SparseMatrix<double,RowMajor> A;
+    //   SparseMatrix<double>::InnerIterator it(A,0);
+    template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer);
+};
+
+template<typename Derived>
+class SparseCompressedBase<Derived>::ReverseInnerIterator
+{
+  public:
+    ReverseInnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
+    {
+      if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0)
+      {
+        m_start = 0;
+        m_id = mat.nonZeros();
+      }
+      else
+      {
+        m_start = mat.outerIndexPtr()[outer];
+        if(mat.isCompressed())
+          m_id = mat.outerIndexPtr()[outer+1];
+        else
+          m_id = m_start + mat.innerNonZeroPtr()[outer];
+      }
+    }
+
+    explicit ReverseInnerIterator(const SparseCompressedBase& mat)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_start(0), m_id(mat.nonZeros())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    explicit ReverseInnerIterator(const internal::CompressedStorage<Scalar,StorageIndex>& data)
+      : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_start(0), m_id(data.size())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
+    inline ReverseInnerIterator& operator-=(Index i) { m_id -= i; return *this; }
+
+    inline ReverseInnerIterator operator-(Index i) 
+    {
+        ReverseInnerIterator result = *this;
+        result -= i;
+        return result;
+    }
+
+    inline const Scalar& value() const { return m_values[m_id-1]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
+
+    inline StorageIndex index() const { return m_indices[m_id-1]; }
+    inline Index outer() const { return m_outer.value(); }
+    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+    inline operator bool() const { return (m_id > m_start); }
+
+  protected:
+    const Scalar* m_values;
+    const StorageIndex* m_indices;
+    typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
+    const OuterType m_outer;
+    Index m_start;
+    Index m_id;
+};
+
+namespace internal {
+
+template<typename Derived>
+struct evaluator<SparseCompressedBase<Derived> >
+  : evaluator_base<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::InnerIterator InnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = Derived::Flags
+  };
+  
+  evaluator() : m_matrix(0), m_zero(0)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  explicit evaluator(const Derived &mat) : m_matrix(&mat), m_zero(0)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_matrix->nonZeros();
+  }
+  
+  operator Derived&() { return m_matrix->const_cast_derived(); }
+  operator const Derived&() const { return *m_matrix; }
+  
+  typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
+  const Scalar& coeff(Index row, Index col) const
+  {
+    Index p = find(row,col);
+
+    if(p==Dynamic)
+      return m_zero;
+    else
+      return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+  Scalar& coeffRef(Index row, Index col)
+  {
+    Index p = find(row,col);
+    eigen_assert(p!=Dynamic && "written coefficient does not exist");
+    return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+protected:
+
+  Index find(Index row, Index col) const
+  {
+    internal::LowerBoundIndex p = m_matrix->lower_bound(row,col);
+    return p.found ? p.value : Dynamic;
+  }
+
+  const Derived *m_matrix;
+  const Scalar m_zero;
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_COMPRESSED_BASE_H

diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index ec86ca9..9b0d3f9 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -28,88 +28,57 @@
 //                         generic      sparse
 //  4 - dense op dense     product      dense
 //                         generic      dense
-
-namespace internal {
-
-template<> struct promote_storage_type<Dense,Sparse>
-{ typedef Sparse ret; };
-
-template<> struct promote_storage_type<Sparse,Dense>
-{ typedef Sparse ret; };
-
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
-  typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
-  typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
-class sparse_cwise_binary_op_inner_iterator_selector;
-
-} // end namespace internal
+//
+// TODO to ease compiler job, we could specialize product/quotient with a scalar
+//      and fallback to cwise-unary evaluator using bind1st_op and bind2nd_op.
 
 template<typename BinaryOp, typename Lhs, typename Rhs>
 class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
   : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   public:
-    class InnerIterator;
-    class ReverseInnerIterator;
     typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+    typedef SparseMatrixBase<Derived> Base;
     EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
     CwiseBinaryOpImpl()
     {
-      typedef typename internal::traits<Lhs>::StorageKind LhsStorageKind;
-      typedef typename internal::traits<Rhs>::StorageKind RhsStorageKind;
       EIGEN_STATIC_ASSERT((
-                (!internal::is_same<LhsStorageKind,RhsStorageKind>::value)
-            ||  ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
+                (!internal::is_same<typename internal::traits<Lhs>::StorageKind,
+                                    typename internal::traits<Rhs>::StorageKind>::value)
+            ||  ((internal::evaluator<Lhs>::Flags&RowMajorBit) == (internal::evaluator<Rhs>::Flags&RowMajorBit))),
             THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
     }
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator
-  : public internal::sparse_cwise_binary_op_inner_iterator_selector<BinaryOp,Lhs,Rhs,typename CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator>
-{
-  public:
-    typedef typename Lhs::Index Index;
-    typedef internal::sparse_cwise_binary_op_inner_iterator_selector<
-      BinaryOp,Lhs,Rhs, InnerIterator> Base;
-
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseBinaryOpImpl& binOp, Index outer)
-      : Base(binOp.derived(),outer)
-    {}
-};
-
-/***************************************************************************
-* Implementation of inner-iterators
-***************************************************************************/
-
-// template<typename T> struct internal::func_is_conjunction { enum { ret = false }; };
-// template<typename T> struct internal::func_is_conjunction<internal::scalar_product_op<T> > { enum { ret = true }; };
-
-// TODO generalize the internal::scalar_product_op specialization to all conjunctions if any !
-
 namespace internal {
 
-// sparse - sparse  (generic)
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived, Sparse, Sparse>
+  
+// Generic "sparse OP sparse"
+template<typename XprType> struct binary_sparse_evaluator;
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename traits<CwiseBinaryXpr>::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+protected:
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
 
+  class InnerIterator
+  {
   public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
     {
       this->operator++();
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index()))
       {
@@ -132,15 +101,16 @@
       }
       else
       {
-        m_value = 0; // this is to avoid a compilation warning
+        m_value = Scalar(0); // this is to avoid a compilation warning
         m_id = -1;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_id; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return Lhs::IsRowMajor ? m_lhsIter.row() : index(); }
     EIGEN_STRONG_INLINE Index col() const { return Lhs::IsRowMajor ? index() : m_lhsIter.col(); }
 
@@ -151,25 +121,301 @@
     RhsIterator m_rhsIter;
     const BinaryOp& m_functor;
     Scalar m_value;
-    Index m_id;
+    StorageIndex m_id;
+  };
+  
+  
+  enum {
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+  
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
-// sparse - sparse  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Sparse>
+// dense op sparse
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+protected:
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
   public:
 
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_value(0), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize())
+    {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    {
+      ++m_id;
+      if(m_id<m_innerSize)
+      {
+        Scalar lhsVal = m_lhsEval.coeff(IsRowMajor?m_rhsIter.outer():m_id,
+                                        IsRowMajor?m_id:m_rhsIter.outer());
+        if(m_rhsIter && m_rhsIter.index()==m_id)
+        {
+          m_value = m_functor(lhsVal, m_rhsIter.value());
+          ++m_rhsIter;
+        }
+        else
+          m_value = m_functor(lhsVal, Scalar(0));
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
+
+  protected:
+    const evaluator<Lhs> &m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+
+  enum {
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()),
+      m_rhsImpl(xpr.rhs()),
+      m_expr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const {
+    return m_expr.size();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+  const XprType &m_expr;
+};
+
+// sparse op dense
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+protected:
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
+  public:
+
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_value(0), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize())
+    {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    {
+      ++m_id;
+      if(m_id<m_innerSize)
+      {
+        Scalar rhsVal = m_rhsEval.coeff(IsRowMajor?m_lhsIter.outer():m_id,
+                                        IsRowMajor?m_id:m_lhsIter.outer());
+        if(m_lhsIter && m_lhsIter.index()==m_id)
+        {
+          m_value = m_functor(m_lhsIter.value(), rhsVal);
+          ++m_lhsIter;
+        }
+        else
+          m_value = m_functor(Scalar(0),rhsVal);
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
+
+  protected:
+    LhsIterator m_lhsIter;
+    const evaluator<Rhs> &m_rhsEval;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+
+  enum {
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()),
+      m_rhsImpl(xpr.rhs()),
+      m_expr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const {
+    return m_expr.size();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+  const XprType &m_expr;
+};
+
+template<typename T,
+         typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+         typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+         typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+         typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct sparse_conjunction_evaluator;
+
+// "sparse .* sparse"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense .* sparse"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IndexBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse .* dense"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse ./ dense"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse && sparse"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense && sparse"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IndexBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse && dense"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse ^ sparse"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IteratorBased>
+  : evaluator_base<XprType>
+{
+protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator  LhsIterator;
+  typedef typename evaluator<RhsArg>::InnerIterator  RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class InnerIterator
+  {
+  public:
+    
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
     {
       while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
       {
@@ -180,7 +426,7 @@
       }
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       ++m_lhsIter;
       ++m_rhsIter;
@@ -191,12 +437,13 @@
         else
           ++m_rhsIter;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
-
+    
     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(m_lhsIter.value(), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
     EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
@@ -205,91 +452,182 @@
   protected:
     LhsIterator m_lhsIter;
     RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    const BinaryOp& m_functor;
+  };
+  
+  
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+  
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate());
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Dense>
+// "dense ^ sparse"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IndexBased, IteratorBased>
+  : evaluator_base<XprType>
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::RhsNested RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename Lhs::Index Index;
-    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
-  public:
+protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef evaluator<LhsArg> LhsEvaluator;
+  typedef typename evaluator<RhsArg>::InnerIterator  RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
 
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_rhs(xpr.rhs()), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(RhsArg::Flags)&RowMajorBit)==RowMajorBit };
+
+  public:
+    
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_outer(outer)
     {}
 
-    EIGEN_STRONG_INLINE Derived& operator++()
-    {
-      ++m_lhsIter;
-      return *static_cast<Derived*>(this);
-    }
-
-    EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_lhsIter.value(),
-                       m_rhs.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
-    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
-    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
-
-  protected:
-    RhsNested m_rhs;
-    LhsIterator m_lhsIter;
-    const BinaryFunc m_functor;
-    const Index m_outer;
-};
-
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Dense, Sparse>
-{
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
-
-    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
-  public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_xpr(xpr), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor()), m_outer(outer)
-    {}
-
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       ++m_rhsIter;
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_xpr.lhs().coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
+    { return m_functor(m_lhsEval.coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
     EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
 
     EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
-
+    
   protected:
-    const CwiseBinaryXpr& m_xpr;
+    const LhsEvaluator &m_lhsEval;
     RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    const BinaryOp& m_functor;
     const Index m_outer;
+  };
+  
+  
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+  
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_rhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-} // end namespace internal
+// "sparse ^ dense"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IndexBased>
+  : evaluator_base<XprType>
+{
+protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef evaluator<RhsArg> RhsEvaluator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(LhsArg::Flags)&RowMajorBit)==RowMajorBit };
+
+  public:
+    
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_outer(outer)
+    {}
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
+    {
+      ++m_lhsIter;
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const
+    { return m_functor(m_lhsIter.value(),
+                       m_rhsEval.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
+    
+  protected:
+    LhsIterator m_lhsIter;
+    const evaluator<RhsArg> &m_rhsEval;
+    const BinaryOp& m_functor;
+    const Index m_outer;
+  };
+  
+  
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+  
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_lhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
+};
+
+}
 
 /***************************************************************************
 * Implementation of SparseMatrixBase and SparseCwise functions/operators
@@ -297,6 +635,22 @@
 
 template<typename Derived>
 template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 SparseMatrixBase<Derived>::operator-=(const SparseMatrixBase<OtherDerived> &other)
 {
@@ -313,10 +667,54 @@
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+Derived& SparseMatrixBase<Derived>::operator+=(const DiagonalBase<OtherDerived>& other)
+{
+  call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const DiagonalBase<OtherDerived>& other)
+{
+  call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+    
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const typename SparseMatrixBase<Derived>::template CwiseProductDenseReturnType<OtherDerived>::Type
 SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) const
 {
-  return EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE(derived(), other.derived());
+  return typename CwiseProductDenseReturnType<OtherDerived>::Type(derived(), other.derived());
+}
+
+template<typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>
+operator+(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template<typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+operator+(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+}
+
+template<typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>
+operator-(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template<typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+operator-(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
+{
+  return CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index 5a50c78..32dac0f 100644
--- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,138 +12,123 @@
 
 namespace Eigen { 
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryOp<UnaryOp, MatrixType> >
+namespace internal {
+  
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
+  : public evaluator_base<CwiseUnaryOp<UnaryOp,ArgType> >
 {
   public:
+    typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
 
     class InnerIterator;
-    class ReverseInnerIterator;
-
-    typedef CwiseUnaryOp<UnaryOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+    
+    enum {
+      CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression())
+    {
+      EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+      EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+    }
+    
+    inline Index nonZerosEstimate() const {
+      return m_argImpl.nonZerosEstimate();
+    }
 
   protected:
-    typedef typename internal::traits<Derived>::_XprTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+    
+    const UnaryOp m_functor;
+    evaluator<ArgType> m_argImpl;
 };
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator
+template<typename UnaryOp, typename ArgType>
+class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator
 {
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+  protected:
+    typedef typename XprType::Scalar Scalar;
+    typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl,outer), m_functor(unaryOp.m_functor)
     {}
 
     EIGEN_STRONG_INLINE InnerIterator& operator++()
     { Base::operator++(); return *this; }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
 
   protected:
     const UnaryOp m_functor;
   private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
+    Scalar& valueRef();
 };
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
-
-  protected:
-    const UnaryOp m_functor;
-  private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
-};
-
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryView<ViewOp, MatrixType> >
+template<typename ViewOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>
+  : public evaluator_base<CwiseUnaryView<ViewOp,ArgType> >
 {
   public:
+    typedef CwiseUnaryView<ViewOp, ArgType> XprType;
 
     class InnerIterator;
-    class ReverseInnerIterator;
-
-    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+    
+    enum {
+      CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<ViewOp>::Cost),
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression())
+    {
+      EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<ViewOp>::Cost);
+      EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+    }
 
   protected:
-    typedef typename internal::traits<Derived>::_MatrixTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+    
+    const ViewOp m_functor;
+    evaluator<ArgType> m_argImpl;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator
+template<typename ViewOp, typename ArgType>
+class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator
 {
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+  protected:
+    typedef typename XprType::Scalar Scalar;
+    typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl,outer), m_functor(unaryOp.m_functor)
     {}
 
     EIGEN_STRONG_INLINE InnerIterator& operator++()
     { Base::operator++(); return *this; }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE Scalar& valueRef() { return m_functor(Base::valueRef()); }
 
   protected:
     const ViewOp m_functor;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
-
-  protected:
-    const ViewOp m_functor;
-};
+} // end namespace internal
 
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator*=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
       i.valueRef() *= other;
   return derived();
 }
@@ -152,8 +137,10 @@
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator/=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
       i.valueRef() /= other;
   return derived();
 }

diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 610833f..f005a18 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,131 +12,10 @@
 
 namespace Eigen { 
 
-template<typename Lhs, typename Rhs, int InnerSize> struct SparseDenseProductReturnType
-{
-  typedef SparseTimeDenseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct SparseDenseProductReturnType<Lhs,Rhs,1>
-{
-  typedef SparseDenseOuterProduct<Lhs,Rhs,false> Type;
-};
-
-template<typename Lhs, typename Rhs, int InnerSize> struct DenseSparseProductReturnType
-{
-  typedef DenseTimeSparseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct DenseSparseProductReturnType<Lhs,Rhs,1>
-{
-  typedef SparseDenseOuterProduct<Rhs,Lhs,true> Type;
-};
-
 namespace internal {
 
-template<typename Lhs, typename Rhs, bool Tr>
-struct traits<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  typedef Sparse StorageKind;
-  typedef typename scalar_product_traits<typename traits<Lhs>::Scalar,
-                                         typename traits<Rhs>::Scalar>::ReturnType Scalar;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::Nested LhsNested;
-  typedef typename Rhs::Nested RhsNested;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-
-  enum {
-    LhsCoeffReadCost = traits<_LhsNested>::CoeffReadCost,
-    RhsCoeffReadCost = traits<_RhsNested>::CoeffReadCost,
-
-    RowsAtCompileTime    = Tr ? int(traits<Rhs>::RowsAtCompileTime)     : int(traits<Lhs>::RowsAtCompileTime),
-    ColsAtCompileTime    = Tr ? int(traits<Lhs>::ColsAtCompileTime)     : int(traits<Rhs>::ColsAtCompileTime),
-    MaxRowsAtCompileTime = Tr ? int(traits<Rhs>::MaxRowsAtCompileTime)  : int(traits<Lhs>::MaxRowsAtCompileTime),
-    MaxColsAtCompileTime = Tr ? int(traits<Lhs>::MaxColsAtCompileTime)  : int(traits<Rhs>::MaxColsAtCompileTime),
-
-    Flags = Tr ? RowMajorBit : 0,
-
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + NumTraits<Scalar>::MulCost
-  };
-};
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs, bool Tr>
-class SparseDenseOuterProduct
- : public SparseMatrixBase<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseDenseOuterProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseDenseOuterProduct)
-    typedef internal::traits<SparseDenseOuterProduct> Traits;
-
-  private:
-
-    typedef typename Traits::LhsNested LhsNested;
-    typedef typename Traits::RhsNested RhsNested;
-    typedef typename Traits::_LhsNested _LhsNested;
-    typedef typename Traits::_RhsNested _RhsNested;
-
-  public:
-
-    class InnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(!Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Rhs& rhs, const Lhs& lhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return Tr ? m_rhs.rows() : m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return Tr ? m_lhs.cols() : m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-};
-
-template<typename Lhs, typename Rhs, bool Transpose>
-class SparseDenseOuterProduct<Lhs,Rhs,Transpose>::InnerIterator : public _LhsNested::InnerIterator
-{
-    typedef typename _LhsNested::InnerIterator Base;
-    typedef typename SparseDenseOuterProduct::Index Index;
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const SparseDenseOuterProduct& prod, Index outer)
-      : Base(prod.lhs(), 0), m_outer(outer), m_factor(prod.rhs().coeff(outer))
-    {
-    }
-
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return Transpose ? Base::row() : m_outer; }
-    inline Index col() const { return Transpose ? m_outer : Base::row(); }
-
-    inline Scalar value() const { return Base::value() * m_factor; }
-
-  protected:
-    Index m_outer;
-    Scalar m_factor;
-};
-
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<SparseTimeDenseProduct<Lhs,Rhs> >
- : traits<ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-  typedef MatrixXpr XprKind;
-};
+template <> struct product_promote_storage_type<Sparse,Dense, OuterProduct> { typedef Sparse ret; };
+template <> struct product_promote_storage_type<Dense,Sparse, OuterProduct> { typedef Sparse ret; };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,
          typename AlphaType,
@@ -150,49 +29,77 @@
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  typedef evaluator<Lhs> LhsEval;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    LhsEval lhsEval(lhs);
+    
+    Index n = lhs.outerSize();
+#ifdef EIGEN_HAS_OPENMP
+    Eigen::initParallel();
+    Index threads = Eigen::nbThreads();
+#endif
+    
     for(Index c=0; c<rhs.cols(); ++c)
     {
-      Index n = lhs.outerSize();
-      for(Index j=0; j<n; ++j)
+#ifdef EIGEN_HAS_OPENMP
+      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+      // It basically represents the minimal amount of work to be done to be worth it.
+      if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
       {
-        typename Res::Scalar tmp(0);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
-          tmp += it.value() * rhs.coeff(it.index(),c);
-        res.coeffRef(j,c) = alpha * tmp;
+        #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+        for(Index i=0; i<n; ++i)
+          processRow(lhsEval,rhs,res,alpha,i,c);
+      }
+      else
+#endif
+      {
+        for(Index i=0; i<n; ++i)
+          processRow(lhsEval,rhs,res,alpha,i,c);
       }
     }
   }
+  
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
+  {
+    typename Res::Scalar tmp(0);
+    for(LhsInnerIterator it(lhsEval,i); it ;++it)
+      tmp += it.value() * rhs.coeff(it.index(),col);
+    res.coeffRef(i,col) += alpha * tmp;
+  }
+  
 };
 
-template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
-struct scalar_product_traits<T1, Ref<T2/*, _Options, _StrideType*/> >
-{
-  enum {
-    Defined = 1
-  };
-  typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
-};
+// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
+// -> let's disable it for now as it is conflicting with generic scalar*matrix and matrix*scalar operators
+// template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
+// struct ScalarBinaryOpTraits<T1, Ref<T2/*, _Options, _StrideType*/> >
+// {
+//   enum {
+//     Defined = 1
+//   };
+//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
+// };
+
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
 struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, AlphaType, ColMajor, true>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
   {
+    LhsEval lhsEval(lhs);
     for(Index c=0; c<rhs.cols(); ++c)
     {
       for(Index j=0; j<lhs.outerSize(); ++j)
       {
 //        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
-        typename internal::scalar_product_traits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j,c));
-        for(LhsInnerIterator it(lhs,j); it ;++it)
+        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j,c));
+        for(LhsInnerIterator it(lhsEval,j); it ;++it)
           res.coeffRef(it.index(),c) += it.value() * rhs_j;
       }
     }
@@ -205,16 +112,37 @@
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
-    for(Index j=0; j<lhs.outerSize(); ++j)
+    Index n = lhs.rows();
+    LhsEval lhsEval(lhs);
+
+#ifdef EIGEN_HAS_OPENMP
+    Eigen::initParallel();
+    Index threads = Eigen::nbThreads();
+    // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+    // It basically represents the minimal amount of work to be done to be worth it.
+    if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
     {
-      typename Res::RowXpr res_j(res.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
-        res_j += (alpha*it.value()) * rhs.row(it.index());
+      #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+      for(Index i=0; i<n; ++i)
+        processRow(lhsEval,rhs,res,alpha,i);
     }
+    else
+#endif
+    {
+      for(Index i=0; i<n; ++i)
+        processRow(lhsEval, rhs, res, alpha, i);
+    }
+  }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha, Index i)
+  {
+    typename Res::RowXpr res_i(res.row(i));
+    for(LhsInnerIterator it(lhsEval,i); it ;++it)
+      res_i += (alpha*it.value()) * rhs.row(it.index());
   }
 };
 
@@ -224,14 +152,14 @@
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index j=0; j<lhs.outerSize(); ++j)
     {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
+      for(LhsInnerIterator it(lhsEval,j); it ;++it)
         res.row(it.index()) += (alpha*it.value()) * rhs_j;
     }
   }
@@ -245,66 +173,169 @@
 
 } // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class SparseTimeDenseProduct
-  : public ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseTimeDenseProduct)
-
-    SparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::sparse_time_dense_product(m_lhs, m_rhs, dest, alpha);
-    }
-
-  private:
-    SparseTimeDenseProduct& operator=(const SparseTimeDenseProduct&);
-};
-
-
-// dense = dense * sparse
 namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<DenseTimeSparseProduct<Lhs,Rhs> >
- : traits<ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-};
-} // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class DenseTimeSparseProduct
-  : public ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs>
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+ : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SparseShape,DenseShape,ProductType> >
 {
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    typedef typename nested_eval<Lhs,((Rhs::Flags&RowMajorBit)==0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested;
+    typedef typename nested_eval<Rhs,((Lhs::Flags&RowMajorBit)==0) ? 1 : Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, ProductType>
+  : generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+{};
+
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SparseShape,ProductType> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    typedef typename nested_eval<Lhs,((Rhs::Flags&RowMajorBit)==0) ? Dynamic : 1>::type LhsNested;
+    typedef typename nested_eval<Rhs,((Lhs::Flags&RowMajorBit)==RowMajorBit) ? 1 : Lhs::RowsAtCompileTime>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    
+    // transpose everything
+    Transpose<Dst> dstT(dst);
+    internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseTriangularShape, ProductType>
+  : generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+{};
+
+template<typename LhsT, typename RhsT, bool NeedToTranspose>
+struct sparse_dense_outer_product_evaluator
+{
+protected:
+  typedef typename conditional<NeedToTranspose,RhsT,LhsT>::type Lhs1;
+  typedef typename conditional<NeedToTranspose,LhsT,RhsT>::type ActualRhs;
+  typedef Product<LhsT,RhsT,DefaultProduct> ProdXprType;
+  
+  // if the actual left-hand side is a dense vector,
+  // then build a sparse-view so that we can seamlessly iterate over it.
+  typedef typename conditional<is_same<typename internal::traits<Lhs1>::StorageKind,Sparse>::value,
+            Lhs1, SparseView<Lhs1> >::type ActualLhs;
+  typedef typename conditional<is_same<typename internal::traits<Lhs1>::StorageKind,Sparse>::value,
+            Lhs1 const&, SparseView<Lhs1> >::type LhsArg;
+            
+  typedef evaluator<ActualLhs> LhsEval;
+  typedef evaluator<ActualRhs> RhsEval;
+  typedef typename evaluator<ActualLhs>::InnerIterator LhsIterator;
+  typedef typename ProdXprType::Scalar Scalar;
+  
+public:
+  enum {
+    Flags = NeedToTranspose ? RowMajorBit : 0,
+    CoeffReadCost = HugeCost
+  };
+  
+  class InnerIterator : public LhsIterator
+  {
   public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseProduct)
-
-    DenseTimeSparseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    InnerIterator(const sparse_dense_outer_product_evaluator &xprEval, Index outer)
+      : LhsIterator(xprEval.m_lhsXprImpl, 0),
+        m_outer(outer),
+        m_empty(false),
+        m_factor(get(xprEval.m_rhsXprImpl, outer, typename internal::traits<ActualRhs>::StorageKind() ))
     {}
+    
+    EIGEN_STRONG_INLINE Index outer() const { return m_outer; }
+    EIGEN_STRONG_INLINE Index row()   const { return NeedToTranspose ? m_outer : LhsIterator::index(); }
+    EIGEN_STRONG_INLINE Index col()   const { return NeedToTranspose ? LhsIterator::index() : m_outer; }
 
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    EIGEN_STRONG_INLINE Scalar value() const { return LhsIterator::value() * m_factor; }
+    EIGEN_STRONG_INLINE operator bool() const { return LhsIterator::operator bool() && (!m_empty); }
+    
+  protected:
+    Scalar get(const RhsEval &rhs, Index outer, Dense = Dense()) const
     {
-      Transpose<const _LhsNested> lhs_t(m_lhs);
-      Transpose<const _RhsNested> rhs_t(m_rhs);
-      Transpose<Dest> dest_t(dest);
-      internal::sparse_time_dense_product(rhs_t, lhs_t, dest_t, alpha);
+      return rhs.coeff(outer);
     }
-
-  private:
-    DenseTimeSparseProduct& operator=(const DenseTimeSparseProduct&);
+    
+    Scalar get(const RhsEval &rhs, Index outer, Sparse = Sparse())
+    {
+      typename RhsEval::InnerIterator it(rhs, outer);
+      if (it && it.index()==0 && it.value()!=Scalar(0))
+        return it.value();
+      m_empty = true;
+      return Scalar(0);
+    }
+    
+    Index m_outer;
+    bool m_empty;
+    Scalar m_factor;
+  };
+  
+  sparse_dense_outer_product_evaluator(const Lhs1 &lhs, const ActualRhs &rhs)
+     : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  // transpose case
+  sparse_dense_outer_product_evaluator(const ActualRhs &rhs, const Lhs1 &lhs)
+     : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+    
+protected:
+  const LhsArg m_lhs;
+  evaluator<ActualLhs> m_lhsXprImpl;
+  evaluator<ActualRhs> m_rhsXprImpl;
 };
 
-// sparse * dense
-template<typename Derived>
-template<typename OtherDerived>
-inline const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
-SparseMatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
+// sparse * dense outer product
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, SparseShape, DenseShape>
+  : sparse_dense_outer_product_evaluator<Lhs,Rhs, Lhs::IsRowMajor>
 {
-  return typename SparseDenseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
-}
+  typedef sparse_dense_outer_product_evaluator<Lhs,Rhs, Lhs::IsRowMajor> Base;
+  
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs())
+  {}
+  
+};
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, DenseShape, SparseShape>
+  : sparse_dense_outer_product_evaluator<Lhs,Rhs, Rhs::IsRowMajor>
+{
+  typedef sparse_dense_outer_product_evaluator<Lhs,Rhs, Rhs::IsRowMajor> Base;
+  
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs())
+  {}
+  
+};
+
+} // end namespace internal
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/SparseCore/SparseDiagonalProduct.h b/Eigen/src/SparseCore/SparseDiagonalProduct.h
index 1bb590e..941c03b 100644
--- a/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/Eigen/src/SparseCore/SparseDiagonalProduct.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,171 +26,113 @@
 
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<SparseDiagonalProduct<Lhs, Rhs> >
+enum {
+  SDP_AsScalarProduct,
+  SDP_AsCwiseProduct
+};
+  
+template<typename SparseXprType, typename DiagonalCoeffType, int SDP_Tag>
+struct sparse_diagonal_product_evaluator;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, DiagonalShape, SparseShape>
+  : public sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct>
 {
-  typedef typename remove_all<Lhs>::type _Lhs;
-  typedef typename remove_all<Rhs>::type _Rhs;
-  typedef typename _Lhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  typedef Sparse StorageKind;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Lhs::RowsAtCompileTime,
-    ColsAtCompileTime = _Rhs::ColsAtCompileTime,
-
-    MaxRowsAtCompileTime = _Lhs::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _Rhs::MaxColsAtCompileTime,
-
-    SparseFlags = is_diagonal<_Lhs>::ret ? int(_Rhs::Flags) : int(_Lhs::Flags),
-    Flags = (SparseFlags&RowMajorBit),
-    CoeffReadCost = Dynamic
-  };
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Rhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
+  
+  typedef sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct> Base;
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
 };
 
-enum {SDP_IsDiagonal, SDP_IsSparseRowMajor, SDP_IsSparseColMajor};
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType, int RhsMode, int LhsMode>
-class sparse_diagonal_product_inner_iterator_selector;
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs>
-class SparseDiagonalProduct
-  : public SparseMatrixBase<SparseDiagonalProduct<Lhs,Rhs> >,
-    internal::no_assignment_operator
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, SparseShape, DiagonalShape>
+  : public sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct>
 {
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename Rhs::Nested RhsNested;
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Lhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
+  
+  typedef sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct> Base;
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal().transpose()) {}
+};
 
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-
-    enum {
-      LhsMode = internal::is_diagonal<_LhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_LhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor,
-      RhsMode = internal::is_diagonal<_RhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_RhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor
-    };
-
+template<typename SparseXprType, typename DiagonalCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagonalCoeffType, SDP_AsScalarProduct>
+{
+protected:
+  typedef typename evaluator<SparseXprType>::InnerIterator SparseXprInnerIterator;
+  typedef typename SparseXprType::Scalar Scalar;
+  
+public:
+  class InnerIterator : public SparseXprInnerIterator
+  {
   public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseDiagonalProduct)
-
-    typedef internal::sparse_diagonal_product_inner_iterator_selector
-                      <_LhsNested,_RhsNested,SparseDiagonalProduct,LhsMode,RhsMode> InnerIterator;
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+      : SparseXprInnerIterator(xprEval.m_sparseXprImpl, outer),
+        m_coeff(xprEval.m_diagCoeffImpl.coeff(outer))
+    {}
     
-    // We do not want ReverseInnerIterator for diagonal-sparse products,
-    // but this dummy declaration is needed to make diag * sparse * diag compile.
-    class ReverseInnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDiagonalProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      eigen_assert(lhs.cols() == rhs.rows() && "invalid sparse matrix * diagonal matrix product");
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
+    EIGEN_STRONG_INLINE Scalar value() const { return m_coeff * SparseXprInnerIterator::value(); }
   protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
+    typename DiagonalCoeffType::Scalar m_coeff;
+  };
+  
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagonalCoeffType &diagCoeff)
+    : m_sparseXprImpl(sparseXpr), m_diagCoeffImpl(diagCoeff)
+  {}
+
+  Index nonZerosEstimate() const { return m_sparseXprImpl.nonZerosEstimate(); }
+    
+protected:
+  evaluator<SparseXprType> m_sparseXprImpl;
+  evaluator<DiagonalCoeffType> m_diagCoeffImpl;
 };
 
-namespace internal {
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseRowMajor>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator
+template<typename SparseXprType, typename DiagCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwiseProduct>
 {
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
+  typedef typename SparseXprType::Scalar Scalar;
+  typedef typename SparseXprType::StorageIndex StorageIndex;
+  
+  typedef typename nested_eval<DiagCoeffType,SparseXprType::IsRowMajor ? SparseXprType::RowsAtCompileTime
+                                                                       : SparseXprType::ColsAtCompileTime>::type DiagCoeffNested;
+  
+  class InnerIterator
+  {
+    typedef typename evaluator<SparseXprType>::InnerIterator SparseXprIter;
   public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs()*(expr.lhs().diagonal().coeff(outer)), outer)
-    {}
-};
-
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseColMajor>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator
-{
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs().innerVector(outer) .cwiseProduct(expr.lhs().diagonal()), 0), m_outer(outer)
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+      : m_sparseIter(xprEval.m_sparseXprEval, outer), m_diagCoeffNested(xprEval.m_diagCoeffNested)
     {}
     
-    inline Index outer() const { return m_outer; }
-    inline Index col() const { return m_outer; }
-};
-
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseColMajor,SDP_IsDiagonal>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator
-{
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs()*expr.rhs().diagonal().coeff(outer), outer)
-    {}
-};
-
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseRowMajor,SDP_IsDiagonal>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator
-{
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs().innerVector(outer) .cwiseProduct(expr.rhs().diagonal().transpose()), 0), m_outer(outer)
-    {}
+    inline Scalar value() const { return m_sparseIter.value() * m_diagCoeffNested.coeff(index()); }
+    inline StorageIndex index() const  { return m_sparseIter.index(); }
+    inline Index outer() const  { return m_sparseIter.outer(); }
+    inline Index col() const    { return SparseXprType::IsRowMajor ? m_sparseIter.index() : m_sparseIter.outer(); }
+    inline Index row() const    { return SparseXprType::IsRowMajor ? m_sparseIter.outer() : m_sparseIter.index(); }
     
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return m_outer; }
+    EIGEN_STRONG_INLINE InnerIterator& operator++() { ++m_sparseIter; return *this; }
+    inline operator bool() const  { return m_sparseIter; }
+    
+  protected:
+    SparseXprIter m_sparseIter;
+    DiagCoeffNested m_diagCoeffNested;
+  };
+  
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagCoeffType &diagCoeff)
+    : m_sparseXprEval(sparseXpr), m_diagCoeffNested(diagCoeff)
+  {}
+
+  Index nonZerosEstimate() const { return m_sparseXprEval.nonZerosEstimate(); }
+    
+protected:
+  evaluator<SparseXprType> m_sparseXprEval;
+  DiagCoeffNested m_diagCoeffNested;
 };
 
 } // end namespace internal
 
-// SparseMatrixBase functions
-
-template<typename Derived>
-template<typename OtherDerived>
-const SparseDiagonalProduct<Derived,OtherDerived>
-SparseMatrixBase<Derived>::operator*(const DiagonalBase<OtherDerived> &other) const
-{
-  return SparseDiagonalProduct<Derived,OtherDerived>(this->derived(), other.derived());
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_DIAGONAL_PRODUCT_H

diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h
index db39c9a..38bc4aa 100644
--- a/Eigen/src/SparseCore/SparseDot.h
+++ b/Eigen/src/SparseCore/SparseDot.h

@@ -26,7 +26,8 @@
   eigen_assert(size() == other.size());
   eigen_assert(other.size()>0 && "you are using a non initialized vector");
 
-  typename Derived::InnerIterator i(derived(),0);
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
   Scalar res(0);
   while (i)
   {
@@ -49,16 +50,12 @@
 
   eigen_assert(size() == other.size());
 
-  typedef typename Derived::Nested  Nested;
-  typedef typename OtherDerived::Nested  OtherNested;
-  typedef typename internal::remove_all<Nested>::type  NestedCleaned;
-  typedef typename internal::remove_all<OtherNested>::type  OtherNestedCleaned;
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
+  
+  internal::evaluator<OtherDerived>  otherEval(other.derived());
+  typename internal::evaluator<OtherDerived>::InnerIterator j(otherEval, 0);
 
-  Nested nthis(derived());
-  OtherNested nother(other.derived());
-
-  typename NestedCleaned::InnerIterator i(nthis,0);
-  typename OtherNestedCleaned::InnerIterator j(nother,0);
   Scalar res(0);
   while (i && j)
   {

diff --git a/Eigen/src/SparseCore/SparseFuzzy.h b/Eigen/src/SparseCore/SparseFuzzy.h
index 45f36e9..7d47eb9 100644
--- a/Eigen/src/SparseCore/SparseFuzzy.h
+++ b/Eigen/src/SparseCore/SparseFuzzy.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,17 +10,20 @@
 #ifndef EIGEN_SPARSE_FUZZY_H
 #define EIGEN_SPARSE_FUZZY_H
 
-// template<typename Derived>
-// template<typename OtherDerived>
-// bool SparseMatrixBase<Derived>::isApprox(
-//   const OtherDerived& other,
-//   typename NumTraits<Scalar>::Real prec
-// ) const
-// {
-//   const typename internal::nested<Derived,2>::type nested(derived());
-//   const typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
-//   return    (nested - otherNested).cwise().abs2().sum()
-//          <= prec * prec * (std::min)(nested.cwise().abs2().sum(), otherNested.cwise().abs2().sum());
-// }
+namespace Eigen {
+  
+template<typename Derived>
+template<typename OtherDerived>
+bool SparseMatrixBase<Derived>::isApprox(const SparseMatrixBase<OtherDerived>& other, const RealScalar &prec) const
+{
+  const typename internal::nested_eval<Derived,2,PlainObject>::type actualA(derived());
+  typename internal::conditional<bool(IsRowMajor)==bool(OtherDerived::IsRowMajor),
+    const typename internal::nested_eval<OtherDerived,2,PlainObject>::type,
+    const PlainObject>::type actualB(other.derived());
+
+  return (actualA - actualB).squaredNorm() <= prec * prec * numext::mini(actualA.squaredNorm(), actualB.squaredNorm());
+}
+
+} // end namespace Eigen
 
 #endif // EIGEN_SPARSE_FUZZY_H

diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h
new file mode 100644
index 0000000..f99be33
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseMap.h

@@ -0,0 +1,305 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MAP_H
+#define EIGEN_SPARSE_MAP_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    Flags = TraitsBase::Flags & (~NestByRefBit)
+  };
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit))
+  };
+};
+
+} // end namespace internal
+
+template<typename Derived,
+         int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
+> class SparseMapBase;
+
+/** \ingroup SparseCore_Module
+  * class SparseMapBase
+  * \brief Common base class for Map and Ref instance of sparse matrix and vector.
+  */
+template<typename Derived>
+class SparseMapBase<Derived,ReadOnlyAccessors>
+  : public SparseCompressedBase<Derived>
+{
+  public:
+    typedef SparseCompressedBase<Derived> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    enum { IsRowMajor = Base::IsRowMajor };
+    using Base::operator=;
+  protected:
+    
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         Scalar *, const Scalar *>::type ScalarPointer;
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         StorageIndex *, const StorageIndex *>::type IndexPointer;
+
+    Index   m_outerSize;
+    Index   m_innerSize;
+    Array<StorageIndex,2,1>  m_zero_nnz;
+    IndexPointer  m_outerIndex;
+    IndexPointer  m_innerIndices;
+    ScalarPointer m_values;
+    IndexPointer  m_innerNonZeros;
+
+  public:
+
+    /** \copydoc SparseMatrixBase::rows() */
+    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+    /** \copydoc SparseMatrixBase::cols() */
+    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+    /** \copydoc SparseMatrixBase::innerSize() */
+    inline Index innerSize() const { return m_innerSize; }
+    /** \copydoc SparseMatrixBase::outerSize() */
+    inline Index outerSize() const { return m_outerSize; }
+    /** \copydoc SparseCompressedBase::nonZeros */
+    inline Index nonZeros() const { return m_zero_nnz[1]; }
+    
+    /** \copydoc SparseCompressedBase::isCompressed */
+    bool isCompressed() const { return m_innerNonZeros==0; }
+
+    //----------------------------------------
+    // direct access interface
+    /** \copydoc SparseMatrix::valuePtr */
+    inline const Scalar* valuePtr() const { return m_values; }
+    /** \copydoc SparseMatrix::innerIndexPtr */
+    inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
+    /** \copydoc SparseMatrix::outerIndexPtr */
+    inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
+    /** \copydoc SparseMatrix::innerNonZeroPtr */
+    inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
+    //----------------------------------------
+
+    /** \copydoc SparseMatrix::coeff */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = m_outerIndex[outer];
+      Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer];
+      if (start==end)
+        return Scalar(0);
+      else if (end>0 && inner==m_innerIndices[end-1])
+        return m_values[end-1];
+      // ^^  optimization: let's first check if it is the last coefficient
+      // (very common in high level algorithms)
+
+      const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
+      const Index id = r-&m_innerIndices[0];
+      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
+    }
+
+    inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
+                              ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
+      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_zero_nnz(0,internal::convert_index<StorageIndex>(nnz)), m_outerIndex(outerIndexPtr),
+        m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr)
+    {}
+
+    // for vectors
+    inline SparseMapBase(Index size, Index nnz, IndexPointer innerIndexPtr, ScalarPointer valuePtr)
+      : m_outerSize(1), m_innerSize(size), m_zero_nnz(0,internal::convert_index<StorageIndex>(nnz)), m_outerIndex(m_zero_nnz.data()),
+        m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(0)
+    {}
+
+    /** Empty destructor */
+    inline ~SparseMapBase() {}
+
+  protected:
+    inline SparseMapBase() {}
+};
+
+/** \ingroup SparseCore_Module
+  * class SparseMapBase
+  * \brief Common base class for writable Map and Ref instance of sparse matrix and vector.
+  */
+template<typename Derived>
+class SparseMapBase<Derived,WriteAccessors>
+  : public SparseMapBase<Derived,ReadOnlyAccessors>
+{
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+    
+  public:
+    typedef SparseMapBase<Derived, ReadOnlyAccessors> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    enum { IsRowMajor = Base::IsRowMajor };
+    
+    using Base::operator=;
+
+  public:
+    
+    //----------------------------------------
+    // direct access interface
+    using Base::valuePtr;
+    using Base::innerIndexPtr;
+    using Base::outerIndexPtr;
+    using Base::innerNonZeroPtr;
+    /** \copydoc SparseMatrix::valuePtr */
+    inline Scalar* valuePtr()              { return Base::m_values; }
+    /** \copydoc SparseMatrix::innerIndexPtr */
+    inline StorageIndex* innerIndexPtr()   { return Base::m_innerIndices; }
+    /** \copydoc SparseMatrix::outerIndexPtr */
+    inline StorageIndex* outerIndexPtr()   { return Base::m_outerIndex; }
+    /** \copydoc SparseMatrix::innerNonZeroPtr */
+    inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
+    //----------------------------------------
+
+    /** \copydoc SparseMatrix::coeffRef */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = Base::m_outerIndex[outer];
+      Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
+      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
+      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
+      StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
+      const Index id = r - &Base::m_innerIndices[0];
+      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
+      return const_cast<Scalar*>(Base::m_values)[id];
+    }
+    
+    inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
+                         Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    // for vectors
+    inline SparseMapBase(Index size, Index nnz, StorageIndex* innerIndexPtr, Scalar* valuePtr)
+      : Base(size, nnz, innerIndexPtr, valuePtr)
+    {}
+
+    /** Empty destructor */
+    inline ~SparseMapBase() {}
+
+  protected:
+    inline SparseMapBase() {}
+};
+
+/** \ingroup SparseCore_Module
+  *
+  * \brief Specialization of class Map for SparseMatrix-like storage.
+  *
+  * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix.
+  *
+  * \sa class Map, class SparseMatrix, class Ref<SparseMatrixType,Options>
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+#else
+template<typename SparseMatrixType>
+class Map<SparseMatrixType>
+  : public SparseMapBase<Derived,WriteAccessors>
+#endif
+{
+  public:
+    typedef SparseMapBase<Map> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  public:
+
+    /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients,
+      * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
+      * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
+      *
+      * This constructor is available only if \c SparseMatrixType is non-const.
+      *
+      * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages".
+      */
+    inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
+               StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Empty destructor */
+    inline ~Map() {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+  public:
+    typedef SparseMapBase<Map> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  public:
+#endif
+    /** This is the const version of the above constructor.
+      *
+      * This constructor is available only if \c SparseMatrixType is const, e.g.:
+      * \code Map<const SparseMatrix<double> >  \endcode
+      */
+    inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr,
+               const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    /** Empty destructor */
+    inline ~Map() {}
+};
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_MAP_H

diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 17e3c66..616b4a0 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,7 +21,7 @@
   * This class implements a more versatile variants of the common \em compressed row/column storage format.
   * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index.
   * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
-  * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+  * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
   * can be done with limited memory reallocation and copies.
   *
   * A call to the function makeCompressed() turns the matrix into the standard \em compressed format
@@ -32,18 +32,22 @@
   * \tparam _Scalar the scalar type, i.e. the type of the coefficients
   * \tparam _Options Union of bit flags controlling the storage scheme. Currently the only possibility
   *                 is ColMajor or RowMajor. The default is 0 which means column-major.
-  * \tparam _Index the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  * \tparam _StorageIndex the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  *
+  * \warning In %Eigen 3.2, the undocumented type \c SparseMatrix::Index was improperly defined as the storage index type (e.g., int),
+  *          whereas it is now (starting from %Eigen 3.3) deprecated and always defined as Eigen::Index.
+  *          Codes making use of \c SparseMatrix::Index, might thus likely have to be changed to use \c SparseMatrix::StorageIndex instead.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
   */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseMatrix<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<SparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -51,22 +55,21 @@
     ColsAtCompileTime = Dynamic,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = Dynamic,
-    Flags = _Options | NestByRefBit | LvalueBit,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
 
-template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
-struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+template<typename _Scalar, int _Options, typename _StorageIndex, int DiagIndex>
+struct traits<Diagonal<SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
 {
-  typedef SparseMatrix<_Scalar, _Options, _Index> MatrixType;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef SparseMatrix<_Scalar, _Options, _StorageIndex> MatrixType;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
 
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef MatrixXpr XprKind;
 
   enum {
@@ -74,47 +77,63 @@
     ColsAtCompileTime = 1,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = 1,
-    Flags = 0,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost*10
+    Flags = LvalueBit
+  };
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int DiagIndex>
+struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
+ : public traits<Diagonal<SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
+{
+  enum {
+    Flags = 0
   };
 };
 
 } // end namespace internal
 
-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
 class SparseMatrix
-  : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
+    typedef SparseCompressedBase<SparseMatrix> Base;
+    using Base::convert_index;
+    friend class SparseVector<_Scalar,0,_StorageIndex>;
+    template<typename, typename, typename, typename, typename>
+    friend struct internal::Assignment;
   public:
+    using Base::isCompressed;
+    using Base::nonZeros;
     EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
+    using Base::operator+=;
+    using Base::operator-=;
 
     typedef MappedSparseMatrix<Scalar,Flags> Map;
+    typedef Diagonal<SparseMatrix> DiagonalReturnType;
+    typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
+    typedef typename Base::InnerIterator InnerIterator;
+    typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
+    
+
     using Base::IsRowMajor;
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    typedef internal::CompressedStorage<Scalar,StorageIndex> Storage;
     enum {
       Options = _Options
     };
 
+    typedef typename Base::IndexVector IndexVector;
+    typedef typename Base::ScalarVector ScalarVector;
   protected:
-
     typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
 
     Index m_outerSize;
     Index m_innerSize;
-    Index* m_outerIndex;
-    Index* m_innerNonZeros;     // optional, if null then the data is compressed
+    StorageIndex* m_outerIndex;
+    StorageIndex* m_innerNonZeros;     // optional, if null then the data is compressed
     Storage m_data;
-    
-    Eigen::Map<Matrix<Index,Dynamic,1> > innerNonZeros() { return Eigen::Map<Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
-    const  Eigen::Map<const Matrix<Index,Dynamic,1> > innerNonZeros() const { return Eigen::Map<const Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
 
   public:
     
-    /** \returns whether \c *this is in compressed form. */
-    inline bool isCompressed() const { return m_innerNonZeros==0; }
-
     /** \returns the number of rows of the matrix */
     inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
     /** \returns the number of columns of the matrix */
@@ -128,38 +147,38 @@
     /** \returns a const pointer to the array of values.
       * This function is aimed at interoperability with other libraries.
       * \sa innerIndexPtr(), outerIndexPtr() */
-    inline const Scalar* valuePtr() const { return &m_data.value(0); }
+    inline const Scalar* valuePtr() const { return m_data.valuePtr(); }
     /** \returns a non-const pointer to the array of values.
       * This function is aimed at interoperability with other libraries.
       * \sa innerIndexPtr(), outerIndexPtr() */
-    inline Scalar* valuePtr() { return &m_data.value(0); }
+    inline Scalar* valuePtr() { return m_data.valuePtr(); }
 
     /** \returns a const pointer to the array of inner indices.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), outerIndexPtr() */
-    inline const Index* innerIndexPtr() const { return &m_data.index(0); }
+    inline const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); }
     /** \returns a non-const pointer to the array of inner indices.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), outerIndexPtr() */
-    inline Index* innerIndexPtr() { return &m_data.index(0); }
+    inline StorageIndex* innerIndexPtr() { return m_data.indexPtr(); }
 
     /** \returns a const pointer to the array of the starting positions of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), innerIndexPtr() */
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
+    inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
     /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), innerIndexPtr() */
-    inline Index* outerIndexPtr() { return m_outerIndex; }
+    inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
 
     /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \warning it returns the null pointer 0 in compressed mode */
-    inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; }
+    inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
     /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \warning it returns the null pointer 0 in compressed mode */
-    inline Index* innerNonZeroPtr() { return m_innerNonZeros; }
+    inline StorageIndex* innerNonZeroPtr() { return m_innerNonZeros; }
 
     /** \internal */
     inline Storage& data() { return m_data; }
@@ -175,7 +194,7 @@
       const Index outer = IsRowMajor ? row : col;
       const Index inner = IsRowMajor ? col : row;
       Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      return m_data.atInRange(m_outerIndex[outer], end, inner);
+      return m_data.atInRange(m_outerIndex[outer], end, StorageIndex(inner));
     }
 
     /** \returns a non-const reference to the value of the matrix at position \a i, \a j
@@ -198,7 +217,7 @@
       eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
       if(end<=start)
         return insert(row,col);
-      const Index p = m_data.searchLowerIndex(start,end-1,inner);
+      const Index p = m_data.searchLowerIndex(start,end-1,StorageIndex(inner));
       if((p<end) && (m_data.index(p)==inner))
         return m_data.value(p);
       else
@@ -209,45 +228,34 @@
       * The non zero coefficient must \b not already exist.
       *
       * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
-      * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first
-      * call reserve(const SizesType &) to reserve a more appropriate number of elements per
-      * inner vector that better match your scenario.
+      * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier.
+      * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to be
+      * inserted by increasing outer-indices.
+      * 
+      * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to first
+      * call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector.
       *
-      * This function performs a sorted insertion in O(1) if the elements of each inner vector are
-      * inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
+      * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1)
+      * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
       *
       */
-    Scalar& insert(Index row, Index col)
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      if(isCompressed())
-      {
-        reserve(Matrix<Index,Dynamic,1>::Constant(outerSize(), 2));
-      }
-      return insertUncompressed(row,col);
-    }
+    Scalar& insert(Index row, Index col);
 
   public:
 
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** Removes all non zeros but keep allocated memory */
+    /** Removes all non zeros but keep allocated memory
+      *
+      * This function does not free the currently allocated memory. To release as much as memory as possible,
+      * call \code mat.data().squeeze(); \endcode after resizing it.
+      * 
+      * \sa resize(Index,Index), data()
+      */
     inline void setZero()
     {
       m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
       if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(Index));
-    }
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const
-    {
-      if(m_innerNonZeros)
-        return innerNonZeros().sum();
-      return static_cast<Index>(m_data.size());
+        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
     }
 
     /** Preallocates \a reserveSize non zeros.
@@ -262,22 +270,25 @@
     #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
       *
-      * This function turns the matrix in non-compressed mode */
+      * This function turns the matrix in non-compressed mode.
+      * 
+      * The type \c SizesType must expose the following interface:
+        \code
+        typedef value_type;
+        const value_type& operator[](i) const;
+        \endcode
+      * for \c i in the [0,this->outerSize()[ range.
+      * Typical choices include std::vector<int>, Eigen::VectorXi, Eigen::VectorXi::Constant, etc.
+      */
     template<class SizesType>
     inline void reserve(const SizesType& reserveSizes);
     #else
     template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = typename SizesType::value_type())
-    {
-      EIGEN_UNUSED_VARIABLE(enableif);
-      reserveInnerVectors(reserveSizes);
-    }
-    template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::Scalar& enableif =
+    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif =
     #if (!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1500) // MSVC 2005 fails to compile with this typename
         typename
     #endif
-        SizesType::Scalar())
+        SizesType::value_type())
     {
       EIGEN_UNUSED_VARIABLE(enableif);
       reserveInnerVectors(reserveSizes);
@@ -289,15 +300,15 @@
     {
       if(isCompressed())
       {
-        std::size_t totalReserveSize = 0;
+        Index totalReserveSize = 0;
         // turn the matrix into non-compressed mode
-        m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
         
         // temporarily use m_innerSizes to hold the new starting points.
-        Index* newOuterIndex = m_innerNonZeros;
+        StorageIndex* newOuterIndex = m_innerNonZeros;
         
-        Index count = 0;
+        StorageIndex count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
           newOuterIndex[j] = count;
@@ -305,10 +316,10 @@
           totalReserveSize += reserveSizes[j];
         }
         m_data.reserve(totalReserveSize);
-        Index previousOuterIndex = m_outerIndex[m_outerSize];
+        StorageIndex previousOuterIndex = m_outerIndex[m_outerSize];
         for(Index j=m_outerSize-1; j>=0; --j)
         {
-          Index innerNNZ = previousOuterIndex - m_outerIndex[j];
+          StorageIndex innerNNZ = previousOuterIndex - m_outerIndex[j];
           for(Index i=innerNNZ-1; i>=0; --i)
           {
             m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
@@ -318,21 +329,22 @@
           m_outerIndex[j] = newOuterIndex[j];
           m_innerNonZeros[j] = innerNNZ;
         }
-        m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
+        if(m_outerSize>0)
+          m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
         
         m_data.resize(m_outerIndex[m_outerSize]);
       }
       else
       {
-        Index* newOuterIndex = static_cast<Index*>(std::malloc((m_outerSize+1)*sizeof(Index)));
+        StorageIndex* newOuterIndex = static_cast<StorageIndex*>(std::malloc((m_outerSize+1)*sizeof(StorageIndex)));
         if (!newOuterIndex) internal::throw_std_bad_alloc();
         
-        Index count = 0;
+        StorageIndex count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
           newOuterIndex[j] = count;
-          Index alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
-          Index toReserve = std::max<Index>(reserveSizes[j], alreadyReserved);
+          StorageIndex alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
+          StorageIndex toReserve = std::max<StorageIndex>(reserveSizes[j], alreadyReserved);
           count += toReserve + m_innerNonZeros[j];
         }
         newOuterIndex[m_outerSize] = count;
@@ -343,7 +355,7 @@
           Index offset = newOuterIndex[j] - m_outerIndex[j];
           if(offset>0)
           {
-            Index innerNNZ = m_innerNonZeros[j];
+            StorageIndex innerNNZ = m_innerNonZeros[j];
             for(Index i=innerNNZ-1; i>=0; --i)
             {
               m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
@@ -380,7 +392,7 @@
       * \sa insertBack, startVec */
     inline Scalar& insertBackByOuterInner(Index outer, Index inner)
     {
-      eigen_assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
+      eigen_assert(Index(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
       eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
       Index p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
@@ -414,7 +426,7 @@
     {
       if(isCompressed())
       {
-        Index size = static_cast<Index>(m_data.size());
+        StorageIndex size = internal::convert_index<StorageIndex>(m_data.size());
         Index i = m_outerSize;
         // find the last filled column
         while (i>=0 && m_outerIndex[i]==0)
@@ -433,7 +445,13 @@
     template<typename InputIterators>
     void setFromTriplets(const InputIterators& begin, const InputIterators& end);
 
-    void sumupDuplicates();
+    template<typename InputIterators,typename DupFunctor>
+    void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+    void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar,Scalar>()); }
+
+    template<typename DupFunctor>
+    void collapseDuplicates(DupFunctor dup_func = DupFunctor());
 
     //---
     
@@ -451,6 +469,8 @@
       if(isCompressed())
         return;
       
+      eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
+      
       Index oldStart = m_outerIndex[1];
       m_outerIndex[1] = m_innerNonZeros[0];
       for(Index j=1; j<m_outerSize; ++j)
@@ -479,14 +499,14 @@
     {
       if(m_innerNonZeros != 0)
         return; 
-      m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
       for (Index i = 0; i < m_outerSize; i++)
       {
         m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
       }
     }
-    
-    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */
+
+    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       prune(default_prunning_func(reference,epsilon));
@@ -503,10 +523,9 @@
     void prune(const KeepFunc& keep = KeepFunc())
     {
       // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
-      // TODO also implement a unit test
       makeCompressed();
 
-      Index k = 0;
+      StorageIndex k = 0;
       for(Index j=0; j<m_outerSize; ++j)
       {
         Index previousStart = m_outerIndex[j];
@@ -527,7 +546,12 @@
     }
 
     /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
+      *
+      * If the sizes of the matrix are decreased, then the matrix is turned to \b uncompressed-mode
+      * and the storage of the out of bounds coefficients is kept and reserved.
+      * Call makeCompressed() to pack the entries and squeeze extra memory.
+      *
+      * \sa reserve(), setZero(), makeCompressed()
       */
     void conservativeResize(Index rows, Index cols) 
     {
@@ -539,13 +563,13 @@
 
       Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
       Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
-      Index newInnerSize = IsRowMajor ? cols : rows;
+      StorageIndex newInnerSize = convert_index(IsRowMajor ? cols : rows);
 
       // Deals with inner non zeros
       if (m_innerNonZeros)
       {
         // Resize m_innerNonZeros
-        Index *newInnerNonZeros = static_cast<Index*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(Index)));
+        StorageIndex *newInnerNonZeros = static_cast<StorageIndex*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(StorageIndex)));
         if (!newInnerNonZeros) internal::throw_std_bad_alloc();
         m_innerNonZeros = newInnerNonZeros;
         
@@ -555,10 +579,12 @@
       else if (innerChange < 0) 
       {
         // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = static_cast<Index*>(std::malloc((m_outerSize+outerChange+1) * sizeof(Index)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize + outerChange) * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
-        for(Index i = 0; i < m_outerSize; i++)
+        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
           m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
+        for(Index i = m_outerSize; i < m_outerSize + outerChange; i++)
+          m_innerNonZeros[i] = 0;
       }
       
       // Change the m_innerNonZeros in case of a decrease of inner size
@@ -566,8 +592,8 @@
       {
         for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
         {
-          Index &n = m_innerNonZeros[i];
-          Index start = m_outerIndex[i];
+          StorageIndex &n = m_innerNonZeros[i];
+          StorageIndex start = m_outerIndex[i];
           while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n; 
         }
       }
@@ -578,20 +604,24 @@
       if (outerChange == 0)
         return;
           
-      Index *newOuterIndex = static_cast<Index*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(Index)));
+      StorageIndex *newOuterIndex = static_cast<StorageIndex*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(StorageIndex)));
       if (!newOuterIndex) internal::throw_std_bad_alloc();
       m_outerIndex = newOuterIndex;
       if (outerChange > 0)
       {
-        Index last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+        StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
         for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
-          m_outerIndex[i] = last; 
+          m_outerIndex[i] = lastIdx; 
       }
       m_outerSize += outerChange;
     }
     
     /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
+      * 
+      * This function does not free the currently allocated memory. To release as much as memory as possible,
+      * call \code mat.data().squeeze(); \endcode after resizing it.
+      * 
+      * \sa reserve(), setZero()
       */
     void resize(Index rows, Index cols)
     {
@@ -601,7 +631,7 @@
       if (m_outerSize != outerSize || m_outerSize==0)
       {
         std::free(m_outerIndex);
-        m_outerIndex = static_cast<Index*>(std::malloc((outerSize + 1) * sizeof(Index)));
+        m_outerIndex = static_cast<StorageIndex*>(std::malloc((outerSize + 1) * sizeof(StorageIndex)));
         if (!m_outerIndex) internal::throw_std_bad_alloc();
         
         m_outerSize = outerSize;
@@ -611,19 +641,24 @@
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
     }
 
     /** \internal
       * Resize the nonzero vector to \a size */
     void resizeNonZeros(Index size)
     {
-      // TODO remove this function
       m_data.resize(size);
     }
 
-    /** \returns a const expression of the diagonal coefficients */
-    const Diagonal<const SparseMatrix> diagonal() const { return *this; }
+    /** \returns a const expression of the diagonal coefficients. */
+    const ConstDiagonalReturnType diagonal() const { return ConstDiagonalReturnType(*this); }
+    
+    /** \returns a read-write expression of the diagonal coefficients.
+      * \warning If the diagonal entries are written, then all diagonal
+      * entries \b must already exist, otherwise an assertion will be raised.
+      */
+    DiagonalReturnType diagonal() { return DiagonalReturnType(*this); }
 
     /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
     inline SparseMatrix()
@@ -649,7 +684,16 @@
       EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
       check_template_parameters();
-      *this = other.derived();
+      const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
+      if (needToTranspose)
+        *this = other.derived();
+      else
+      {
+        #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+          EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        #endif
+        internal::call_assignment_no_alias(*this, other.derived());
+      }
     }
     
     /** Constructs a sparse matrix from the sparse selfadjoint view \a other */
@@ -658,7 +702,7 @@
       : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
     {
       check_template_parameters();
-      *this = other;
+      Base::operator=(other);
     }
 
     /** Copy constructor (it performs a deep copy) */
@@ -678,6 +722,15 @@
       initAssignment(other);
       other.evalTo(*this);
     }
+    
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    explicit SparseMatrix(const DiagonalBase<OtherDerived>& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      *this = other.derived();
+    }
 
     /** Swaps the content of two sparse matrices of the same type.
       * This is a fast operation that simply swaps the underlying pointers and parameters. */
@@ -691,14 +744,17 @@
       m_data.swap(other.m_data);
     }
 
-    /** Sets *this to the identity matrix */
+    /** Sets *this to the identity matrix.
+      * This function also turns the matrix into compressed mode, and drop any reserved memory. */
     inline void setIdentity()
     {
       eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
       this->m_data.resize(rows());
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(&this->m_data.index(0), rows()).setLinSpaced(0, rows()-1);
-      Eigen::Map<Matrix<Scalar, Dynamic, 1> >(&this->m_data.value(0), rows()).setOnes();
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(this->m_outerIndex, rows()+1).setLinSpaced(0, rows());
+      Eigen::Map<IndexVector>(this->m_data.indexPtr(), rows()).setLinSpaced(0, StorageIndex(rows()-1));
+      Eigen::Map<ScalarVector>(this->m_data.valuePtr(), rows()).setOnes();
+      Eigen::Map<IndexVector>(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows()));
+      std::free(m_innerNonZeros);
+      m_innerNonZeros = 0;
     }
     inline SparseMatrix& operator=(const SparseMatrix& other)
     {
@@ -708,6 +764,9 @@
       }
       else if(this!=&other)
       {
+        #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+          EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        #endif
         initAssignment(other);
         if(other.isCompressed())
         {
@@ -722,22 +781,14 @@
       return *this;
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Lhs, typename Rhs>
-    inline SparseMatrix& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-    { return Base::operator=(product); }
-    
-    template<typename OtherDerived>
-    inline SparseMatrix& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      initAssignment(other);
-      return Base::operator=(other.derived());
-    }
-    
+#ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
     inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
     { return Base::operator=(other.derived()); }
-    #endif
+
+    template<typename Lhs, typename Rhs>
+    inline SparseMatrix& operator=(const Product<Lhs,Rhs,AliasFreeProduct>& other);
+#endif // EIGEN_PARSED_BY_DOXYGEN
 
     template<typename OtherDerived>
     EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
@@ -747,30 +798,38 @@
       EIGEN_DBG_SPARSE(
         s << "Nonzero entries:\n";
         if(m.isCompressed())
+        {
           for (Index i=0; i<m.nonZeros(); ++i)
             s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+        }
         else
+        {
           for (Index i=0; i<m.outerSize(); ++i)
           {
             Index p = m.m_outerIndex[i];
             Index pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
             Index k=p;
-            for (; k<pe; ++k)
+            for (; k<pe; ++k) {
               s << "(" << m.m_data.value(k) << "," << m.m_data.index(k) << ") ";
-            for (; k<m.m_outerIndex[i+1]; ++k)
+            }
+            for (; k<m.m_outerIndex[i+1]; ++k) {
               s << "(_,_) ";
+            }
           }
+        }
         s << std::endl;
         s << std::endl;
         s << "Outer pointers:\n";
-        for (Index i=0; i<m.outerSize(); ++i)
+        for (Index i=0; i<m.outerSize(); ++i) {
           s << m.m_outerIndex[i] << " ";
+        }
         s << " $" << std::endl;
         if(!m.isCompressed())
         {
           s << "Inner non zeros:\n";
-          for (Index i=0; i<m.outerSize(); ++i)
+          for (Index i=0; i<m.outerSize(); ++i) {
             s << m.m_innerNonZeros[i] << " ";
+          }
           s << " $" << std::endl;
         }
         s << std::endl;
@@ -786,10 +845,8 @@
       std::free(m_innerNonZeros);
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
     /** Overloaded for performance */
     Scalar sum() const;
-#endif
     
 #   ifdef EIGEN_SPARSEMATRIX_PLUGIN
 #     include EIGEN_SPARSEMATRIX_PLUGIN
@@ -800,9 +857,9 @@
     template<typename Other>
     void initAssignment(const Other& other)
     {
-      resize(static_cast<typename SparseMatrix::Index>(other.rows()),
-             static_cast<typename SparseMatrix::Index>(other.cols()));
-      if (m_innerNonZeros) {
+      resize(other.rows(), other.cols());
+      if(m_innerNonZeros)
+      {
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
@@ -816,15 +873,15 @@
       * A vector object that is equal to 0 everywhere but v at the position i */
     class SingletonVector
     {
-        Index m_index;
-        Index m_value;
+        StorageIndex m_index;
+        StorageIndex m_value;
       public:
-        typedef Index value_type;
+        typedef StorageIndex value_type;
         SingletonVector(Index i, Index v)
-          : m_index(i), m_value(v)
+          : m_index(convert_index(i)), m_value(convert_index(v))
         {}
 
-        Index operator[](Index i) const { return i==m_index ? m_value : 0; }
+        StorageIndex operator[](Index i) const { return i==m_index ? m_value : 0; }
     };
 
     /** \internal
@@ -843,14 +900,121 @@
       eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
 
       Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
-      m_data.index(p) = inner;
-      return (m_data.value(p) = 0);
+      m_data.index(p) = convert_index(inner);
+      return (m_data.value(p) = Scalar(0));
+    }
+protected:
+    struct IndexPosPair {
+      IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {}
+      Index i;
+      Index p;
+    };
+
+    /** \internal assign \a diagXpr to the diagonal of \c *this
+      * There are different strategies:
+      *   1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression.
+      *   2 - otherwise, for each diagonal coeff,
+      *     2.a - if it already exists, then we update it,
+      *     2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion.
+      *     2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
+      *   3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
+      * 
+      * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
+      * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
+      *       then it *might* be better to disable case 2.b since they will have to be copied anyway.
+      */
+    template<typename DiagXpr, typename Func>
+    void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
+    {
+      Index n = diagXpr.size();
+
+      const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
+      if(overwrite)
+      {
+        if((this->rows()!=n) || (this->cols()!=n))
+          this->resize(n, n);
+      }
+
+      if(m_data.size()==0 || overwrite)
+      {
+        typedef Array<StorageIndex,Dynamic,1> ArrayXI;  
+        this->makeCompressed();
+        this->resizeNonZeros(n);
+        Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
+        Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
+        Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
+        values.setZero();
+        internal::call_assignment_no_alias(values, diagXpr, assignFunc);
+      }
+      else
+      {
+        bool isComp = isCompressed();
+        internal::evaluator<DiagXpr> diaEval(diagXpr);
+        std::vector<IndexPosPair> newEntries;
+
+        // 1 - try in-place update and record insertion failures
+        for(Index i = 0; i<n; ++i)
+        {
+          internal::LowerBoundIndex lb = this->lower_bound(i,i);
+          Index p = lb.value;
+          if(lb.found)
+          {
+            // the coeff already exists
+            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          }
+          else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
+          {
+            // non compressed mode with local room for inserting one element
+            m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
+            m_innerNonZeros[i]++;
+            m_data.value(p) = Scalar(0);
+            m_data.index(p) = StorageIndex(i);
+            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          }
+          else
+          {
+            // defer insertion
+            newEntries.push_back(IndexPosPair(i,p));
+          }
+        }
+        // 2 - insert deferred entries
+        Index n_entries = Index(newEntries.size());
+        if(n_entries>0)
+        {
+          Storage newData(m_data.size()+n_entries);
+          Index prev_p = 0;
+          Index prev_i = 0;
+          for(Index k=0; k<n_entries;++k)
+          {
+            Index i = newEntries[k].i;
+            Index p = newEntries[k].p;
+            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
+            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
+            for(Index j=prev_i;j<i;++j)
+              m_outerIndex[j+1] += k;
+            if(!isComp)
+              m_innerNonZeros[i]++;
+            prev_p = p;
+            prev_i = i;
+            newData.value(p+k) = Scalar(0);
+            newData.index(p+k) = StorageIndex(i);
+            assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
+          }
+          {
+            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
+            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
+            for(Index j=prev_i+1;j<=m_outerSize;++j)
+              m_outerIndex[j] += n_entries;
+          }
+          m_data.swap(newData);
+        }
+      }
     }
 
 private:
   static void check_template_parameters()
   {
-    EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+    EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
     EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
   }
 
@@ -865,87 +1029,20 @@
   };
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_end = mat.m_outerIndex[outer+1];
-      else
-        m_end = m_id + mat.m_innerNonZeros[outer];
-    }
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
-
-    inline Index index() const { return m_indices[m_id]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end); }
-
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    Index m_end;
-};
-
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_id = mat.m_outerIndex[outer+1];
-      else
-        m_id = m_start + mat.m_innerNonZeros[outer];
-    }
-
-    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
-
-    inline Index index() const { return m_indices[m_id-1]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id > m_start); }
-
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-};
-
 namespace internal {
 
-template<typename InputIterator, typename SparseMatrixType>
-void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, int Options = 0)
+template<typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, DupFunctor dup_func)
 {
-  EIGEN_UNUSED_VARIABLE(Options);
   enum { IsRowMajor = SparseMatrixType::IsRowMajor };
   typedef typename SparseMatrixType::Scalar Scalar;
-  typedef typename SparseMatrixType::Index Index;
-  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor> trMat(mat.rows(),mat.cols());
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,StorageIndex> trMat(mat.rows(),mat.cols());
 
   if(begin!=end)
   {
     // pass 1: count the nnz per inner-vector
-    Matrix<Index,Dynamic,1> wi(trMat.outerSize());
+    typename SparseMatrixType::IndexVector wi(trMat.outerSize());
     wi.setZero();
     for(InputIterator it(begin); it!=end; ++it)
     {
@@ -959,7 +1056,7 @@
       trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
 
     // pass 3:
-    trMat.sumupDuplicates();
+    trMat.collapseDuplicates(dup_func);
   }
 
   // pass 4: transposed copy -> implicit sorting
@@ -991,7 +1088,7 @@
   * \code
     typedef Triplet<double> T;
     std::vector<T> tripletList;
-    triplets.reserve(estimation_of_entries);
+    tripletList.reserve(estimation_of_entries);
     for(...)
     {
       // ...
@@ -1004,28 +1101,45 @@
   *
   * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
   * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
-  * be explicitely stored into a std::vector for instance.
+  * be explicitly stored into a std::vector for instance.
   */
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename InputIterators>
-void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
+void SparseMatrix<Scalar,_Options,_StorageIndex>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
 {
-  internal::set_from_triplets(begin, end, *this);
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_StorageIndex> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
+}
+
+/** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
+  * \code
+  * value = dup_func(OldValue, NewValue)
+  * \endcode 
+  * Here is a C++11 example keeping the latest entry only:
+  * \code
+  * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+  * \endcode
+  */
+template<typename Scalar, int _Options, typename _StorageIndex>
+template<typename InputIterators,typename DupFunctor>
+void SparseMatrix<Scalar,_Options,_StorageIndex>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+{
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_StorageIndex>, DupFunctor>(begin, end, *this, dup_func);
 }
 
 /** \internal */
-template<typename Scalar, int _Options, typename _Index>
-void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
+template<typename Scalar, int _Options, typename _StorageIndex>
+template<typename DupFunctor>
+void SparseMatrix<Scalar,_Options,_StorageIndex>::collapseDuplicates(DupFunctor dup_func)
 {
   eigen_assert(!isCompressed());
   // TODO, in practice we should be able to use m_innerNonZeros for that task
-  Matrix<Index,Dynamic,1> wi(innerSize());
+  IndexVector wi(innerSize());
   wi.fill(-1);
-  Index count = 0;
+  StorageIndex count = 0;
   // for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
   for(Index j=0; j<outerSize(); ++j)
   {
-    Index start   = count;
+    StorageIndex start   = count;
     Index oldEnd  = m_outerIndex[j]+m_innerNonZeros[j];
     for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
     {
@@ -1033,7 +1147,7 @@
       if(wi(i)>=start)
       {
         // we already meet this entry => accumulate it
-        m_data.value(wi(i)) += m_data.value(k);
+        m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k));
       }
       else
       {
@@ -1053,39 +1167,48 @@
   m_data.resize(m_outerIndex[m_outerSize]);
 }
 
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename OtherDerived>
-EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Options,_Index>::operator=(const SparseMatrixBase<OtherDerived>& other)
+EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_StorageIndex>& SparseMatrix<Scalar,_Options,_StorageIndex>::operator=(const SparseMatrixBase<OtherDerived>& other)
 {
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-  
-  const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+
+  #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+    EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+  #endif
+      
+  const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
   if (needToTranspose)
   {
+    #ifdef EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+      EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+    #endif
     // two passes algorithm:
     //  1 - compute the number of coeffs per dest inner vector
     //  2 - do the actual copy/eval
     // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed
-    typedef typename internal::nested<OtherDerived,2>::type OtherCopy;
+    typedef typename internal::nested_eval<OtherDerived,2,typename internal::plain_matrix_type<OtherDerived>::type >::type OtherCopy;
     typedef typename internal::remove_all<OtherCopy>::type _OtherCopy;
+    typedef internal::evaluator<_OtherCopy> OtherCopyEval;
     OtherCopy otherCopy(other.derived());
+    OtherCopyEval otherCopyEval(otherCopy);
 
-    SparseMatrix dest(static_cast<typename SparseMatrix::Index>(other.rows()),static_cast<typename SparseMatrix::Index>(other.cols()));
-    Eigen::Map<Matrix<Index, Dynamic, 1> > (dest.m_outerIndex,dest.outerSize()).setZero();
+    SparseMatrix dest(other.rows(),other.cols());
+    Eigen::Map<IndexVector> (dest.m_outerIndex,dest.outerSize()).setZero();
 
     // pass 1
     // FIXME the above copy could be merged with that pass
     for (Index j=0; j<otherCopy.outerSize(); ++j)
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it)
         ++dest.m_outerIndex[it.index()];
 
     // prefix sum
-    Index count = 0;
-    Matrix<Index,Dynamic,1> positions(dest.outerSize());
+    StorageIndex count = 0;
+    IndexVector positions(dest.outerSize());
     for (Index j=0; j<dest.outerSize(); ++j)
     {
-      Index tmp = dest.m_outerIndex[j];
+      StorageIndex tmp = dest.m_outerIndex[j];
       dest.m_outerIndex[j] = count;
       positions[j] = count;
       count += tmp;
@@ -1094,9 +1217,9 @@
     // alloc
     dest.m_data.resize(count);
     // pass 2
-    for (Index j=0; j<otherCopy.outerSize(); ++j)
+    for (StorageIndex j=0; j<otherCopy.outerSize(); ++j)
     {
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it)
       {
         Index pos = positions[it.index()]++;
         dest.m_data.index(pos) = j;
@@ -1109,26 +1232,148 @@
   else
   {
     if(other.isRValue())
+    {
       initAssignment(other.derived());
+    }
     // there is no special optimization
     return Base::operator=(other.derived());
   }
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insert(Index row, Index col)
+{
+  eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+  
+  const Index outer = IsRowMajor ? row : col;
+  const Index inner = IsRowMajor ? col : row;
+  
+  if(isCompressed())
+  {
+    if(nonZeros()==0)
+    {
+      // reserve space if not already done
+      if(m_data.allocatedSize()==0)
+        m_data.reserve(2*m_innerSize);
+      
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      
+      memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      
+      // pack all inner-vectors to the end of the pre-allocated space
+      // and allocate the entire free-space to the first inner-vector
+      StorageIndex end = convert_index(m_data.allocatedSize());
+      for(Index j=1; j<=m_outerSize; ++j)
+        m_outerIndex[j] = end;
+    }
+    else
+    {
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      for(Index j=0; j<m_outerSize; ++j)
+        m_innerNonZeros[j] = m_outerIndex[j+1]-m_outerIndex[j];
+    }
+  }
+  
+  // check whether we can do a fast "push back" insertion
+  Index data_end = m_data.allocatedSize();
+  
+  // First case: we are filling a new inner vector which is packed at the end.
+  // We assume that all remaining inner-vectors are also empty and packed to the end.
+  if(m_outerIndex[outer]==data_end)
+  {
+    eigen_internal_assert(m_innerNonZeros[outer]==0);
+    
+    // pack previous empty inner-vectors to end of the used-space
+    // and allocate the entire free-space to the current inner-vector.
+    StorageIndex p = convert_index(m_data.size());
+    Index j = outer;
+    while(j>=0 && m_innerNonZeros[j]==0)
+      m_outerIndex[j--] = p;
+    
+    // push back the new element
+    ++m_innerNonZeros[outer];
+    m_data.append(Scalar(0), inner);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index k=outer+1; k<=m_outerSize; ++k)
+        if(m_outerIndex[k]==data_end)
+          m_outerIndex[k] = new_end;
+    }
+    return m_data.value(p);
+  }
+  
+  // Second case: the next inner-vector is packed to the end
+  // and the current inner-vector end match the used-space.
+  if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size())
+  {
+    eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0);
+    
+    // add space for the new element
+    ++m_innerNonZeros[outer];
+    m_data.resize(m_data.size()+1);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index k=outer+1; k<=m_outerSize; ++k)
+        if(m_outerIndex[k]==data_end)
+          m_outerIndex[k] = new_end;
+    }
+    
+    // and insert it at the right position (sorted insertion)
+    Index startId = m_outerIndex[outer];
+    Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1;
+    while ( (p > startId) && (m_data.index(p-1) > inner) )
+    {
+      m_data.index(p) = m_data.index(p-1);
+      m_data.value(p) = m_data.value(p-1);
+      --p;
+    }
+    
+    m_data.index(p) = convert_index(inner);
+    return (m_data.value(p) = Scalar(0));
+  }
+  
+  if(m_data.size() != m_data.allocatedSize())
+  {
+    // make sure the matrix is compatible to random un-compressed insertion:
+    m_data.resize(m_data.allocatedSize());
+    this->reserveInnerVectors(Array<StorageIndex,Dynamic,1>::Constant(m_outerSize, 2));
+  }
+  
+  return insertUncompressed(row,col);
+}
+    
+template<typename _Scalar, int _Options, typename _StorageIndex>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertUncompressed(Index row, Index col)
 {
   eigen_assert(!isCompressed());
 
   const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
+  const StorageIndex inner = convert_index(IsRowMajor ? col : row);
 
   Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
-  Index innerNNZ = m_innerNonZeros[outer];
+  StorageIndex innerNNZ = m_innerNonZeros[outer];
   if(innerNNZ>=room)
   {
     // this inner vector is full, we need to reallocate the whole buffer :(
-    reserve(SingletonVector(outer,std::max<Index>(2,innerNNZ)));
+    reserve(SingletonVector(outer,std::max<StorageIndex>(2,innerNNZ)));
   }
 
   Index startId = m_outerIndex[outer];
@@ -1144,11 +1389,11 @@
   m_innerNonZeros[outer]++;
 
   m_data.index(p) = inner;
-  return (m_data.value(p) = 0);
+  return (m_data.value(p) = Scalar(0));
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertCompressed(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertCompressed(Index row, Index col)
 {
   eigen_assert(isCompressed());
 
@@ -1161,7 +1406,7 @@
     // we start a new inner vector
     while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
     {
-      m_outerIndex[previousOuter] = static_cast<Index>(m_data.size());
+      m_outerIndex[previousOuter] = convert_index(m_data.size());
       --previousOuter;
     }
     m_outerIndex[outer+1] = m_outerIndex[outer];
@@ -1171,14 +1416,14 @@
   // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g.,
   // the 2nd inner vector...
   bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
-                && (size_t(m_outerIndex[outer+1]) == m_data.size());
+                && (std::size_t(m_outerIndex[outer+1]) == m_data.size());
 
-  size_t startId = m_outerIndex[outer];
-  // FIXME let's make sure sizeof(long int) == sizeof(size_t)
-  size_t p = m_outerIndex[outer+1];
+  std::size_t startId = m_outerIndex[outer];
+  // FIXME let's make sure sizeof(long int) == sizeof(std::size_t)
+  std::size_t p = m_outerIndex[outer+1];
   ++m_outerIndex[outer+1];
 
-  float reallocRatio = 1;
+  double reallocRatio = 1;
   if (m_data.allocatedSize()<=m_data.size())
   {
     // if there is no preallocated memory, let's reserve a minimum of 32 elements
@@ -1190,13 +1435,13 @@
     {
       // we need to reallocate the data, to reduce multiple reallocations
       // we use a smart resize algorithm based on the current filling ratio
-      // in addition, we use float to avoid integers overflows
-      float nnzEstimate = float(m_outerIndex[outer])*float(m_outerSize)/float(outer+1);
-      reallocRatio = (nnzEstimate-float(m_data.size()))/float(m_data.size());
+      // in addition, we use double to avoid integers overflows
+      double nnzEstimate = double(m_outerIndex[outer])*double(m_outerSize)/double(outer+1);
+      reallocRatio = (nnzEstimate-double(m_data.size()))/double(m_data.size());
       // furthermore we bound the realloc ratio to:
       //   1) reduce multiple minor realloc when the matrix is almost filled
       //   2) avoid to allocate too much memory when the matrix is almost empty
-      reallocRatio = (std::min)((std::max)(reallocRatio,1.5f),8.f);
+      reallocRatio = (std::min)((std::max)(reallocRatio,1.5),8.);
     }
   }
   m_data.resize(m_data.size()+1,reallocRatio);
@@ -1251,7 +1496,21 @@
   }
 
   m_data.index(p) = inner;
-  return (m_data.value(p) = 0);
+  return (m_data.value(p) = Scalar(0));
+}
+
+namespace internal {
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<SparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_StorageIndex> > >
+{
+  typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_StorageIndex> > > Base;
+  typedef SparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType;
+  evaluator() : Base() {}
+  explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
+};
+
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 1983f5e..229449f 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,32 +18,41 @@
   *
   * \brief Base class of any sparse matrices or sparse expressions
   *
-  * \tparam Derived
+  * \tparam Derived is the derived type, e.g. a sparse matrix type, or an expression, etc.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
   */
-template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
+template<typename Derived> class SparseMatrixBase
+  : public EigenBase<Derived>
 {
   public:
 
     typedef typename internal::traits<Derived>::Scalar Scalar;
+    
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+    
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+
+    /** The integer type used to \b store indices within a SparseMatrix.
+      * For a \c SparseMatrix<Scalar,Options,IndexType> it an alias of the third template parameter \c IndexType. */
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
     typedef typename internal::add_const_on_value_type_if_arithmetic<
                          typename internal::packet_traits<Scalar>::type
                      >::type PacketReturnType;
 
     typedef SparseMatrixBase StorageBaseType;
-    typedef EigenBase<Derived> Base;
+
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
+    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
     
     template<typename OtherDerived>
-    Derived& operator=(const EigenBase<OtherDerived> &other)
-    {
-      other.derived().evalTo(derived());
-      return derived();
-    }
+    Derived& operator=(const EigenBase<OtherDerived> &other);
 
     enum {
 
@@ -78,16 +87,16 @@
           * we are dealing with a column-vector (if there is only one column) or with
           * a row-vector (if there is only one row). */
 
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+         * and 2 for matrices.
+         */
+
       Flags = internal::traits<Derived>::Flags,
         /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
           * constructed from this one. See the \ref flags "list of flags".
           */
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
       IsRowMajor = Flags&RowMajorBit ? 1 : 0,
       
       InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
@@ -103,10 +112,11 @@
                         CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, Eigen::Transpose<const Derived> >,
                         Transpose<const Derived>
                      >::type AdjointReturnType;
+    typedef Transpose<Derived> TransposeReturnType;
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
 
-
-    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, Index> PlainObject;
-
+    // FIXME storage order do not match evaluator storage order
+    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** This is the "real scalar" type; if the \a Scalar type is already real numbers
@@ -124,6 +134,8 @@
     /** \internal Represents a matrix with all coefficients equal to one another*/
     typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Matrix<Scalar,Dynamic,Dynamic> > ConstantReturnType;
 
+    /** type of the equivalent dense matrix */
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
     /** type of the equivalent square matrix */
     typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
                           EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
@@ -132,9 +144,21 @@
     inline Derived& derived() { return *static_cast<Derived*>(this); }
     inline Derived& const_cast_derived() const
     { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
+
+    typedef EigenBase<Derived> Base;
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -143,8 +167,10 @@
 #   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #     include EIGEN_SPARSEMATRIXBASE_PLUGIN
 #   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
@@ -153,9 +179,6 @@
     /** \returns the number of coefficients, which is \a rows()*cols().
       * \sa rows(), cols(). */
     inline Index size() const { return rows() * cols(); }
-    /** \returns the number of nonzero coefficients which is in practice the number
-      * of stored coefficients. */
-    inline Index nonZeros() const { return derived().nonZeros(); }
     /** \returns true if either the number of rows or the number of columns is equal to 1.
       * In other words, this function returns
       * \code rows()==1 || cols()==1 \endcode
@@ -175,105 +198,23 @@
 
     
     template<typename OtherDerived>
-    Derived& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      other.evalTo(derived());
-      return derived();
-    }
-
+    Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
     template<typename OtherDerived>
-    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      return assign(other.derived());
-    }
+    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other);
 
-    inline Derived& operator=(const Derived& other)
-    {
-//       if (other.isRValue())
-//         derived().swap(other.const_cast_derived());
-//       else
-      return assign(other.derived());
-    }
+    inline Derived& operator=(const Derived& other);
 
   protected:
 
     template<typename OtherDerived>
-    inline Derived& assign(const OtherDerived& other)
-    {
-      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      const Index outerSize =
-          (int(OtherDerived::Flags) & RowMajorBit)
-              ? static_cast<typename SparseMatrixBase::Index>(other.rows())
-              : static_cast<typename SparseMatrixBase::Index>(other.cols());
-      if ((!transpose) && other.isRValue()) {
-        // eval without temporary
-        derived().resize(
-            static_cast<typename SparseMatrixBase::Index>(other.rows()),
-            static_cast<typename SparseMatrixBase::Index>(other.cols()));
-        derived().setZero();
-        derived().reserve((std::max)(this->rows(),this->cols())*2);
-        for (Index j=0; j<outerSize; ++j)
-        {
-          derived().startVec(j);
-          for (typename OtherDerived::InnerIterator it(other, j); it; ++it)
-          {
-            Scalar v = it.value();
-            derived().insertBackByOuterInner(
-                j, static_cast<typename SparseMatrixBase::Index>(it.index())) =
-                v;
-          }
-        }
-        derived().finalize();
-      }
-      else
-      {
-        assignGeneric(other);
-      }
-      return derived();
-    }
+    inline Derived& assign(const OtherDerived& other);
 
     template<typename OtherDerived>
-    inline void assignGeneric(const OtherDerived& other)
-    {
-      //const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      eigen_assert(( ((internal::traits<Derived>::SupportedAccessPatterns&OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
-                  (!((Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit)))) &&
-                  "the transpose operation is supposed to be handled in SparseMatrix::operator=");
-
-      enum { Flip = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit) };
-
-      const Index outerSize =
-          static_cast<typename SparseMatrixBase::Index>(other.outerSize());
-      // typedef typename internal::conditional<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::type TempType;
-      // thanks to shallow copies, we always eval to a tempary
-      Derived temp(static_cast<typename SparseMatrixBase::Index>(other.rows()),
-                   static_cast<typename SparseMatrixBase::Index>(other.cols()));
-
-      temp.reserve((std::max)(this->rows(),this->cols())*2);
-      for (Index j=0; j<outerSize; ++j)
-      {
-        temp.startVec(j);
-        for (typename OtherDerived::InnerIterator it(other.derived(), j); it; ++it)
-        {
-          Scalar v = it.value();
-          temp.insertBackByOuterInner(
-              Flip ? static_cast<typename SparseMatrixBase::Index>(it.index())
-                   : j,
-              Flip ? j : static_cast<typename SparseMatrixBase::Index>(
-                             it.index())) = v;
-        }
-      }
-      temp.finalize();
-
-      derived() = temp.markAsRValue();
-    }
+    inline void assignGeneric(const OtherDerived& other);
 
   public:
 
-    template<typename Lhs, typename Rhs>
-    inline Derived& operator=(const SparseSparseProduct<Lhs,Rhs>& product);
-
     friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
     {
       typedef typename Derived::Nested Nested;
@@ -281,11 +222,12 @@
 
       if (Flags&RowMajorBit)
       {
-        const Nested nm(m.derived());
+        Nested nm(m.derived());
+        internal::evaluator<NestedCleaned> thisEval(nm);
         for (Index row=0; row<nm.outerSize(); ++row)
         {
           Index col = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), row); it; ++it)
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it)
           {
             for ( ; col<it.index(); ++col)
               s << "0 ";
@@ -299,10 +241,11 @@
       }
       else
       {
-        const Nested nm(m.derived());
+        Nested nm(m.derived());
+        internal::evaluator<NestedCleaned> thisEval(nm);
         if (m.cols() == 1) {
           Index row = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), 0); it; ++it)
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it)
           {
             for ( ; row<it.index(); ++row)
               s << "0" << std::endl;
@@ -314,8 +257,8 @@
         }
         else
         {
-          SparseMatrix<Scalar, RowMajorBit, Index> trans = m;
-          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, Index> >&>(trans);
+          SparseMatrix<Scalar, RowMajorBit, StorageIndex> trans = m;
+          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, StorageIndex> >&>(trans);
         }
       }
       return s;
@@ -325,55 +268,65 @@
     Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
     Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);
+    
+    template<typename OtherDerived>
+    Derived& operator+=(const DiagonalBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    Derived& operator-=(const DiagonalBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    Derived& operator+=(const EigenBase<OtherDerived> &other);
+    template<typename OtherDerived>
+    Derived& operator-=(const EigenBase<OtherDerived> &other);
 
     Derived& operator*=(const Scalar& other);
     Derived& operator/=(const Scalar& other);
 
-    #define EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE \
-      CwiseBinaryOp< \
-        internal::scalar_product_op< \
-          typename internal::scalar_product_traits< \
-            typename internal::traits<Derived>::Scalar, \
-            typename internal::traits<OtherDerived>::Scalar \
-          >::ReturnType \
-        >, \
-        const Derived, \
-        const OtherDerived \
-      >
+    template<typename OtherDerived> struct CwiseProductDenseReturnType {
+      typedef CwiseBinaryOp<internal::scalar_product_op<typename ScalarBinaryOpTraits<
+                                                          typename internal::traits<Derived>::Scalar,
+                                                          typename internal::traits<OtherDerived>::Scalar
+                                                        >::ReturnType>,
+                            const Derived,
+                            const OtherDerived
+                          > Type;
+    };
 
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+    EIGEN_STRONG_INLINE const typename CwiseProductDenseReturnType<OtherDerived>::Type
     cwiseProduct(const MatrixBase<OtherDerived> &other) const;
 
-    // sparse * sparse
-    template<typename OtherDerived>
-    const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const SparseMatrixBase<OtherDerived> &other) const;
-
     // sparse * diagonal
     template<typename OtherDerived>
-    const SparseDiagonalProduct<Derived,OtherDerived>
-    operator*(const DiagonalBase<OtherDerived> &other) const;
+    const Product<Derived,OtherDerived>
+    operator*(const DiagonalBase<OtherDerived> &other) const
+    { return Product<Derived,OtherDerived>(derived(), other.derived()); }
 
     // diagonal * sparse
     template<typename OtherDerived> friend
-    const SparseDiagonalProduct<OtherDerived,Derived>
+    const Product<OtherDerived,Derived>
     operator*(const DiagonalBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
-    { return SparseDiagonalProduct<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
-
-    /** dense * sparse (return a dense object unless it is an outer product) */
-    template<typename OtherDerived> friend
-    const typename DenseSparseProductReturnType<OtherDerived,Derived>::Type
-    operator*(const MatrixBase<OtherDerived>& lhs, const Derived& rhs)
-    { return typename DenseSparseProductReturnType<OtherDerived,Derived>::Type(lhs.derived(),rhs); }
-
-    /** sparse * dense (returns a dense object unless it is an outer product) */
+    { return Product<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
+    
+    // sparse * sparse
     template<typename OtherDerived>
-    const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const;
+    const Product<Derived,OtherDerived,AliasFreeProduct>
+    operator*(const SparseMatrixBase<OtherDerived> &other) const;
+    
+    // sparse * dense
+    template<typename OtherDerived>
+    const Product<Derived,OtherDerived>
+    operator*(const MatrixBase<OtherDerived> &other) const
+    { return Product<Derived,OtherDerived>(derived(), other.derived()); }
+    
+    // dense * sparse
+    template<typename OtherDerived> friend
+    const Product<OtherDerived,Derived>
+    operator*(const MatrixBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
+    { return Product<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
     
      /** \returns an expression of P H P^-1 where H is the matrix represented by \c *this */
-    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
     {
       return SparseSymmetricPermutationProduct<Derived,Upper|Lower>(derived(), perm);
     }
@@ -381,22 +334,16 @@
     template<typename OtherDerived>
     Derived& operator*=(const SparseMatrixBase<OtherDerived>& other);
 
-    #ifdef EIGEN2_SUPPORT
-    // deprecated
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    // deprecated
-    template<typename OtherDerived>
-    void solveTriangularInPlace(MatrixBase<OtherDerived>& other) const;
-    #endif // EIGEN2_SUPPORT
-
     template<int Mode>
-    inline const SparseTriangularView<Derived, Mode> triangularView() const;
+    inline const TriangularView<const Derived, Mode> triangularView() const;
+    
+    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SparseSelfAdjointView<Derived, UpLo> Type; };
+    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SparseSelfAdjointView<const Derived, UpLo> Type; };
 
-    template<unsigned int UpLo> inline const SparseSelfAdjointView<Derived, UpLo> selfadjointView() const;
-    template<unsigned int UpLo> inline SparseSelfAdjointView<Derived, UpLo> selfadjointView();
+    template<unsigned int UpLo> inline 
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> inline
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
 
     template<typename OtherDerived> Scalar dot(const MatrixBase<OtherDerived>& other) const;
     template<typename OtherDerived> Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
@@ -404,39 +351,18 @@
     RealScalar norm()  const;
     RealScalar blueNorm() const;
 
-    Transpose<Derived> transpose() { return derived(); }
-    const Transpose<const Derived> transpose() const { return derived(); }
-    const AdjointReturnType adjoint() const { return transpose(); }
+    TransposeReturnType transpose() { return TransposeReturnType(derived()); }
+    const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
+    const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
 
-    // inner-vector
-    typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
-    typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
-    InnerVectorReturnType innerVector(Index outer);
-    const ConstInnerVectorReturnType innerVector(Index outer) const;
-
-    // set of inner-vectors
-    Block<Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize);
-    const Block<const Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize) const;
-
-    /** \internal use operator= */
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& dst) const
+    DenseMatrixType toDense() const
     {
-      dst.setZero();
-      for (Index j=0; j<outerSize(); ++j)
-        for (typename Derived::InnerIterator i(derived(),j); i; ++i)
-          dst.coeffRef(i.row(),i.col()) = i.value();
-    }
-
-    Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> toDense() const
-    {
-      return derived();
+      return DenseMatrixType(derived());
     }
 
     template<typename OtherDerived>
     bool isApprox(const SparseMatrixBase<OtherDerived>& other,
-                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-    { return toDense().isApprox(other.toDense(),prec); }
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
     template<typename OtherDerived>
     bool isApprox(const MatrixBase<OtherDerived>& other,
@@ -452,10 +378,19 @@
     { return typename internal::eval<Derived>::type(derived()); }
 
     Scalar sum() const;
+    
+    inline const SparseView<Derived>
+    pruned(const Scalar& reference = Scalar(0), const RealScalar& epsilon = NumTraits<Scalar>::dummy_precision()) const;
 
   protected:
 
     bool m_isRValue;
+
+    static inline StorageIndex convert_index(const Index idx) {
+      return internal::convert_index<StorageIndex>(idx);
+    }
+  private:
+    template<typename Dest> void evalTo(Dest &) const;
 };
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparsePermutation.h b/Eigen/src/SparseCore/SparsePermutation.h
index b85be93..ef38357 100644
--- a/Eigen/src/SparseCore/SparsePermutation.h
+++ b/Eigen/src/SparseCore/SparsePermutation.h

@@ -16,131 +16,161 @@
 
 namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, SparseShape>
 {
-  typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-  typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-  typedef typename MatrixTypeNestedCleaned::Index Index;
-  enum {
-    SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
-    MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
-  };
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
 
-  typedef typename internal::conditional<MoveOuter,
-        SparseMatrix<Scalar,SrcStorageOrder,Index>,
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> >::type ReturnType;
-};
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_sparsematrix_product_retval
- : public ReturnByValue<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-    typedef typename MatrixTypeNestedCleaned::Index Index;
+    typedef typename MatrixTypeCleaned::Scalar Scalar;
+    typedef typename MatrixTypeCleaned::StorageIndex StorageIndex;
 
     enum {
-      SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
+      SrcStorageOrder = MatrixTypeCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
       MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
     };
+    
+    typedef typename internal::conditional<MoveOuter,
+        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex>,
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> >::type ReturnType;
 
-    permut_sparsematrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
+    template<typename Dest,typename PermutationType>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
     {
+      MatrixType mat(xpr);
       if(MoveOuter)
       {
-        SparseMatrix<Scalar,SrcStorageOrder,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        Matrix<Index,Dynamic,1> sizes(m_matrix.outerSize());
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex> tmp(mat.rows(), mat.cols());
+        Matrix<StorageIndex,Dynamic,1> sizes(mat.outerSize());
+        for(Index j=0; j<mat.outerSize(); ++j)
         {
-          Index jp = m_permutation.indices().coeff(j);
-          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = m_matrix.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).size();
+          Index jp = perm.indices().coeff(j);
+          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = StorageIndex(mat.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros());
         }
         tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        for(Index j=0; j<mat.outerSize(); ++j)
         {
-          Index jp = m_permutation.indices().coeff(j);
+          Index jp = perm.indices().coeff(j);
           Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
           Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,jsrc); it; ++it)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,jsrc); it; ++it)
             tmp.insertByOuterInner(jdst,it.index()) = it.value();
         }
         dst = tmp;
       }
       else
       {
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        Matrix<Index,Dynamic,1> sizes(tmp.outerSize());
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> tmp(mat.rows(), mat.cols());
+        Matrix<StorageIndex,Dynamic,1> sizes(tmp.outerSize());
         sizes.setZero();
-        PermutationMatrix<Dynamic,Dynamic,Index> perm;
+        PermutationMatrix<Dynamic,Dynamic,StorageIndex> perm_cpy;
         if((Side==OnTheLeft) ^ Transposed)
-          perm = m_permutation;
+          perm_cpy = perm;
         else
-          perm = m_permutation.transpose();
+          perm_cpy = perm.transpose();
 
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            sizes[perm.indices().coeff(it.index())]++;
+        for(Index j=0; j<mat.outerSize(); ++j)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
+            sizes[perm_cpy.indices().coeff(it.index())]++;
         tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            tmp.insertByOuterInner(perm.indices().coeff(it.index()),j) = it.value();
+        for(Index j=0; j<mat.outerSize(); ++j)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
+            tmp.insertByOuterInner(perm_cpy.indices().coeff(it.index()),j) = it.value();
         dst = tmp;
       }
     }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
 };
 
 }
 
+namespace internal {
 
+template <int ProductTag> struct product_promote_storage_type<Sparse,             PermutationStorage, ProductTag> { typedef Sparse ret; };
+template <int ProductTag> struct product_promote_storage_type<PermutationStorage, Sparse,             ProductTag> { typedef Sparse ret; };
+
+// TODO, the following two overloads are only needed to define the right temporary type through 
+// typename traits<permutation_sparse_matrix_product<Rhs,Lhs,OnTheRight,false> >::ReturnType
+// whereas it should be correctly handled by traits<Product<> >::PlainObject
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, PermutationShape, SparseShape>
+  : public evaluator<typename permutation_matrix_product<Rhs,OnTheLeft,false,SparseShape>::ReturnType>
+{
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Rhs,OnTheLeft,false,SparseShape>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, Rhs, PermutationShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, SparseShape, PermutationShape >
+  : public evaluator<typename permutation_matrix_product<Lhs,OnTheRight,false,SparseShape>::ReturnType>
+{
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Lhs,OnTheRight,false,SparseShape>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, Rhs, SparseShape, PermutationShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
 
 /** \returns the matrix with the permutation applied to the columns
   */
 template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>
+inline const Product<SparseDerived, PermDerived, AliasFreeProduct>
 operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>(perm, matrix.derived());
-}
+{ return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived()); }
 
 /** \returns the matrix with the permutation applied to the rows
   */
 template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>
+inline const Product<PermDerived, SparseDerived, AliasFreeProduct>
 operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>(perm, matrix.derived());
-}
-
+{ return  Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived()); }
 
 
 /** \returns the matrix with the inverse permutation applied to the columns.
   */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const Transpose<PermutationBase<PermDerived> >& tperm)
+template<typename SparseDerived, typename PermutationType>
+inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>
+operator*(const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm)
 {
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>(tperm.nestedPermutation(), matrix.derived());
+  return Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>(matrix.derived(), tperm.derived());
 }
 
 /** \returns the matrix with the inverse permutation applied to the rows.
   */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>
-operator*(const Transpose<PermutationBase<PermDerived> >& tperm, const SparseMatrixBase<SparseDerived>& matrix)
+template<typename SparseDerived, typename PermutationType>
+inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>
+operator*(const InverseImpl<PermutationType,PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix)
 {
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>(tperm.nestedPermutation(), matrix.derived());
+  return Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h
index cf76630..af8a774 100644
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,164 +12,12 @@
 
 namespace Eigen { 
 
-template<typename Lhs, typename Rhs>
-struct SparseSparseProductReturnType
-{
-  typedef typename internal::traits<Lhs>::Scalar Scalar;
-  typedef typename internal::traits<Lhs>::Index Index;
-  enum {
-    LhsRowMajor = internal::traits<Lhs>::Flags & RowMajorBit,
-    RhsRowMajor = internal::traits<Rhs>::Flags & RowMajorBit,
-    TransposeRhs = (!LhsRowMajor) && RhsRowMajor,
-    TransposeLhs = LhsRowMajor && (!RhsRowMajor)
-  };
-
-  typedef typename internal::conditional<TransposeLhs,
-    SparseMatrix<Scalar,0,Index>,
-    typename internal::nested<Lhs,Rhs::RowsAtCompileTime>::type>::type LhsNested;
-
-  typedef typename internal::conditional<TransposeRhs,
-    SparseMatrix<Scalar,0,Index>,
-    typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type>::type RhsNested;
-
-  typedef SparseSparseProduct<LhsNested, RhsNested> Type;
-};
-
-namespace internal {
-template<typename LhsNested, typename RhsNested>
-struct traits<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  typedef MatrixXpr XprKind;
-  // clean the nested types:
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename _LhsNested::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-
-    RowsAtCompileTime    = _LhsNested::RowsAtCompileTime,
-    ColsAtCompileTime    = _RhsNested::ColsAtCompileTime,
-    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-    EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
-
-    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
-
-    Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeAssigningBit
-          | EvalBeforeNestingBit,
-
-    CoeffReadCost = Dynamic
-  };
-
-  typedef Sparse StorageKind;
-};
-
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested>
-class SparseSparseProduct : internal::no_assignment_operator,
-  public SparseMatrixBase<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseSparseProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseSparseProduct)
-
-  private:
-
-    typedef typename internal::traits<SparseSparseProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<SparseSparseProduct>::_RhsNested _RhsNested;
-
-  public:
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(0), m_conservative(true)
-    {
-      init();
-    }
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs, const RealScalar& tolerance)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(tolerance), m_conservative(false)
-    {
-      init();
-    }
-
-    SparseSparseProduct pruned(const Scalar& reference = 0, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) const
-    {
-      using std::abs;
-      return SparseSparseProduct(m_lhs,m_rhs,abs(reference)*epsilon);
-    }
-
-    template<typename Dest>
-    void evalTo(Dest& result) const
-    {
-      if(m_conservative)
-        internal::conservative_sparse_sparse_product_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result);
-      else
-        internal::sparse_sparse_product_with_pruning_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result,m_tolerance);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    void init()
-    {
-      eigen_assert(m_lhs.cols() == m_rhs.rows());
-
-      enum {
-        ProductIsValid = _LhsNested::ColsAtCompileTime==Dynamic
-                      || _RhsNested::RowsAtCompileTime==Dynamic
-                      || int(_LhsNested::ColsAtCompileTime)==int(_RhsNested::RowsAtCompileTime),
-        AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
-        SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested,_RhsNested)
-      };
-      // note to the lost user:
-      //    * for a dot product use: v1.dot(v2)
-      //    * for a coeff-wise product use: v1.cwise()*v2
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-      EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
-    }
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-    RealScalar m_tolerance;
-    bool m_conservative;
-};
-
-// sparse = sparse * sparse
-template<typename Derived>
-template<typename Lhs, typename Rhs>
-inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-{
-  product.evalTo(derived());
-  return derived();
-}
-
 /** \returns an expression of the product of two sparse matrices.
   * By default a conservative product preserving the symbolic non zeros is performed.
   * The automatic pruning of the small values can be achieved by calling the pruned() function
   * in which case a totally different product algorithm is employed:
   * \code
-  * C = (A*B).pruned();             // supress numerical zeros (exact)
+  * C = (A*B).pruned();             // suppress numerical zeros (exact)
   * C = (A*B).pruned(ref);
   * C = (A*B).pruned(ref,epsilon);
   * \endcode
@@ -177,10 +25,155 @@
   * */
 template<typename Derived>
 template<typename OtherDerived>
-inline const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
+inline const Product<Derived,OtherDerived,AliasFreeProduct>
 SparseMatrixBase<Derived>::operator*(const SparseMatrixBase<OtherDerived> &other) const
 {
-  return typename SparseSparseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived,OtherDerived,AliasFreeProduct>(derived(), other.derived());
+}
+
+namespace internal {
+
+// sparse * sparse
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    evalTo(dst, lhs, rhs, typename evaluator_traits<Dest>::Shape());
+  }
+
+  // dense += sparse * sparse
+  template<typename Dest,typename ActualLhs>
+  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
+  {
+    typedef typename nested_eval<ActualLhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_sparse_to_dense_product_selector<typename remove_all<LhsNested>::type,
+                                                      typename remove_all<RhsNested>::type, Dest>::run(lhsNested,rhsNested,dst);
+  }
+
+  // dense -= sparse * sparse
+  template<typename Dest>
+  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
+  {
+    addTo(dst, -lhs, rhs);
+  }
+
+protected:
+
+  // sparse = sparse * sparse
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, SparseShape)
+  {
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::conservative_sparse_sparse_product_selector<typename remove_all<LhsNested>::type,
+                                                          typename remove_all<RhsNested>::type, Dest>::run(lhsNested,rhsNested,dst);
+  }
+
+  // dense = sparse * sparse
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, DenseShape)
+  {
+    dst.setZero();
+    addTo(dst, lhs, rhs);
+  }
+};
+
+// sparse * sparse-triangular
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseTriangularShape, ProductType>
+ : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{};
+
+// sparse-triangular * sparse
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, SparseShape, ProductType>
+ : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{};
+
+// dense = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs());
+  }
+};
+
+// dense += sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  {
+    generic_product_impl<Lhs, Rhs>::addTo(dst,src.lhs(),src.rhs());
+  }
+};
+
+// dense -= sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::sub_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
+  {
+    generic_product_impl<Lhs, Rhs>::subTo(dst,src.lhs(),src.rhs());
+  }
+};
+
+template<typename Lhs, typename Rhs, int Options>
+struct unary_evaluator<SparseView<Product<Lhs, Rhs, Options> >, IteratorBased>
+ : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject>
+{
+  typedef SparseView<Product<Lhs, Rhs, Options> > XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  explicit unary_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    using std::abs;
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(xpr.nestedExpression().lhs());
+    RhsNested rhsNested(xpr.nestedExpression().rhs());
+
+    internal::sparse_sparse_product_with_pruning_selector<typename remove_all<LhsNested>::type,
+                                                          typename remove_all<RhsNested>::type, PlainObject>::run(lhsNested,rhsNested,m_result,
+                                                                                                                  abs(xpr.reference())*xpr.epsilon());
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
+// sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template<typename Scalar, int _Options, typename _StorageIndex>
+template<typename Lhs, typename Rhs>
+SparseMatrix<Scalar,_Options,_StorageIndex>& SparseMatrix<Scalar,_Options,_StorageIndex>::operator=(const Product<Lhs,Rhs,AliasFreeProduct>& src)
+{
+  // std::cout << "in Assignment : " << DstOptions << "\n";
+  SparseMatrix dst(src.rows(),src.cols());
+  internal::generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs());
+  this->swap(dst);
+  return *this;
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparseRedux.h b/Eigen/src/SparseCore/SparseRedux.h
index f3da93a..4587749 100644
--- a/Eigen/src/SparseCore/SparseRedux.h
+++ b/Eigen/src/SparseCore/SparseRedux.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,8 +18,9 @@
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
   Scalar res(0);
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator iter(derived(),j); iter; ++iter)
+    for (typename internal::evaluator<Derived>::InnerIterator iter(thisEval,j); iter; ++iter)
       res += iter.value();
   return res;
 }
@@ -29,7 +30,10 @@
 SparseMatrix<_Scalar,_Options,_Index>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+  if(this->isCompressed())
+    return Matrix<Scalar,1,Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
+  else
+    return Base::sum();
 }
 
 template<typename _Scalar, int _Options, typename _Index>
@@ -37,7 +41,7 @@
 SparseVector<_Scalar,_Options,_Index>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+  return Matrix<Scalar,1,Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h
new file mode 100644
index 0000000..748f87d
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseRef.h

@@ -0,0 +1,397 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_REF_H
+#define EIGEN_SPARSE_REF_H
+
+namespace Eigen {
+
+enum {
+  StandardCompressedFormat = 2 /**< used by Ref<SparseMatrix> to specify whether the input storage must be in standard compressed form */
+};
+  
+namespace internal {
+
+template<typename Derived> class SparseRefBase;
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  enum {
+    Options = _Options,
+    Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+  
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+{
+  enum {
+    Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<SparseVector<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  enum {
+    Options = _Options,
+    Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && Derived::IsVectorAtCompileTime
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+{
+  enum {
+    Flags = (traits<SparseVector<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template<typename Derived>
+struct traits<SparseRefBase<Derived> > : public traits<Derived> {};
+
+template<typename Derived> class SparseRefBase
+  : public SparseMapBase<Derived>
+{
+public:
+
+  typedef SparseMapBase<Derived> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase)
+
+  SparseRefBase()
+    : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0)
+  {}
+  
+protected:
+
+  template<typename Expression>
+  void construct(Expression& expr)
+  {
+    if(expr.outerIndexPtr()==0)
+      ::new (static_cast<Base*>(this)) Base(expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr());
+    else
+      ::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
+  }
+};
+
+} // namespace internal
+
+
+/** 
+  * \ingroup SparseCore_Module
+  *
+  * \brief A sparse matrix expression referencing an existing sparse expression
+  *
+  * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix.
+  * \tparam Options specifies whether the a standard compressed format is required \c Options is  \c #StandardCompressedFormat, or \c 0.
+  *                The default is \c 0.
+  *
+  * \sa class Ref
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
+  : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
+#else
+template<typename SparseMatrixType, int Options>
+class Ref<SparseMatrixType, Options>
+  : public SparseMapBase<Derived,WriteAccessors> // yes, that's weird to use Derived here, but that works!
+#endif
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+    typedef internal::traits<Ref> Traits;
+    template<int OtherOptions>
+    inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+    template<int OtherOptions>
+    inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<int OtherOptions>
+    inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.derived());
+    }
+    
+    template<int OtherOptions>
+    inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.derived());
+    }
+    
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    /** Implicit constructor from any sparse expression (2D matrix or 1D vector) */
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.const_cast_derived());
+    }
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+    ~Ref() {
+      if(m_hasCopy) {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+        obj->~TPlainObjectType();
+      }
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed()))
+      {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+        ::new (obj) TPlainObjectType(expr);
+        m_hasCopy = true;
+        Base::construct(*obj);
+      }
+      else
+      {
+        Base::construct(expr);
+      }
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+      ::new (obj) TPlainObjectType(expr);
+      m_hasCopy = true;
+      Base::construct(*obj);
+    }
+
+  protected:
+    typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
+    bool m_hasCopy;
+};
+
+
+
+/**
+  * \ingroup SparseCore_Module
+  *
+  * \brief A sparse vector expression referencing an existing sparse vector expression
+  *
+  * \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of class SparseVector.
+  *
+  * \sa class Ref
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType >
+  : public internal::SparseRefBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
+#else
+template<typename SparseVectorType>
+class Ref<SparseVectorType>
+  : public SparseMapBase<Derived,WriteAccessors>
+#endif
+{
+    typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
+    typedef internal::traits<Ref> Traits;
+    template<int OtherOptions>
+    inline Ref(const SparseVector<MatScalar,OtherOptions,MatIndex>& expr);
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<int OtherOptions>
+    inline Ref(SparseVector<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseVector<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.derived());
+    }
+
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    /** Implicit constructor from any 1D sparse vector expression */
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.const_cast_derived());
+    }
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseVector<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+    ~Ref() {
+      if(m_hasCopy) {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+        obj->~TPlainObjectType();
+      }
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      Base::construct(expr);
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+      ::new (obj) TPlainObjectType(expr);
+      m_hasCopy = true;
+      Base::construct(*obj);
+    }
+
+  protected:
+    typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
+    bool m_hasCopy;
+};
+
+namespace internal {
+
+// FIXME shall we introduce a general evaluatior_ref that we can specialize for any sparse object once, and thus remove this copy-pasta thing...
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_REF_H

diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 0eda96b..85b00e1 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,14 +11,14 @@
 #define EIGEN_SPARSE_SELFADJOINTVIEW_H
 
 namespace Eigen { 
-
+  
 /** \ingroup SparseCore_Module
   * \class SparseSelfAdjointView
   *
   * \brief Pseudo expression to manipulate a triangular sparse matrix as a selfadjoint matrix.
   *
   * \param MatrixType the type of the dense matrix storing the coefficients
-  * \param UpLo can be either \c #Lower or \c #Upper
+  * \param Mode can be either \c #Lower or \c #Upper
   *
   * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
   * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
@@ -26,38 +26,40 @@
   *
   * \sa SparseMatrixBase::selfadjointView()
   */
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct;
-
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct;
-
 namespace internal {
   
-template<typename MatrixType, unsigned int UpLo>
-struct traits<SparseSelfAdjointView<MatrixType,UpLo> > : traits<MatrixType> {
+template<typename MatrixType, unsigned int Mode>
+struct traits<SparseSelfAdjointView<MatrixType,Mode> > : traits<MatrixType> {
 };
 
-template<int SrcUpLo,int DstUpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template<int SrcMode,int DstMode,typename MatrixType,int DestOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm = 0);
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template<int Mode,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm = 0);
 
 }
 
-template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
-  : public EigenBase<SparseSelfAdjointView<MatrixType,UpLo> >
+template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
+  : public EigenBase<SparseSelfAdjointView<MatrixType,_Mode> >
 {
   public:
+    
+    enum {
+      Mode = _Mode,
+      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0),
+      RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
+    };
 
+    typedef EigenBase<SparseSelfAdjointView> Base;
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Index,Dynamic,1> VectorI;
-    typedef typename MatrixType::Nested MatrixTypeNested;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
     typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-
-    inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
+    
+    explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
     {
       eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
     }
@@ -67,7 +69,7 @@
 
     /** \internal \returns a reference to the nested matrix */
     const _MatrixTypeNested& matrix() const { return m_matrix; }
-    _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); }
+    typename internal::remove_reference<MatrixTypeNested>::type& matrix() { return m_matrix; }
 
     /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs.
       *
@@ -75,10 +77,10 @@
       * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
       */
     template<typename OtherDerived>
-    SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>
+    Product<SparseSelfAdjointView, OtherDerived>
     operator*(const SparseMatrixBase<OtherDerived>& rhs) const
     {
-      return SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>(*this, rhs.derived());
+      return Product<SparseSelfAdjointView, OtherDerived>(*this, rhs.derived());
     }
 
     /** \returns an expression of the matrix product between a sparse matrix \a lhs and a sparse self-adjoint matrix \a rhs.
@@ -87,26 +89,26 @@
       * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
       */
     template<typename OtherDerived> friend
-    SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject >
+    Product<OtherDerived, SparseSelfAdjointView>
     operator*(const SparseMatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
     {
-      return SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject>(lhs.derived(), rhs);
+      return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
     }
     
     /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
     template<typename OtherDerived>
-    SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>
+    Product<SparseSelfAdjointView,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>(m_matrix, rhs.derived());
+      return Product<SparseSelfAdjointView,OtherDerived>(*this, rhs.derived());
     }
 
     /** Efficient dense vector/matrix times sparse self-adjoint matrix product */
     template<typename OtherDerived> friend
-    DenseTimeSparseSelfAdjointProduct<OtherDerived,MatrixType,UpLo>
+    Product<OtherDerived,SparseSelfAdjointView>
     operator*(const MatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
     {
-      return DenseTimeSparseSelfAdjointProduct<OtherDerived,_MatrixTypeNested,UpLo>(lhs.derived(), rhs.m_matrix);
+      return Product<OtherDerived,SparseSelfAdjointView>(lhs.derived(), rhs);
     }
 
     /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -120,56 +122,51 @@
     template<typename DerivedU>
     SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
     
-    /** \internal triggered by sparse_matrix = SparseSelfadjointView; */
-    template<typename DestScalar,int StorageOrder> void evalTo(SparseMatrix<DestScalar,StorageOrder,Index>& _dest) const
-    {
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, _dest);
-    }
-    
-    template<typename DestScalar> void evalTo(DynamicSparseMatrix<DestScalar,ColMajor,Index>& _dest) const
-    {
-      // TODO directly evaluate into _dest;
-      SparseMatrix<DestScalar,ColMajor,Index> tmp(_dest.rows(),_dest.cols());
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, tmp);
-      _dest = tmp;
-    }
-    
     /** \returns an expression of P H P^-1 */
-    SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    // TODO implement twists in a more evaluator friendly fashion
+    SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
     {
-      return SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo>(m_matrix, perm);
+      return SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode>(m_matrix, perm);
     }
-    
-    template<typename SrcMatrixType,int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcUpLo>& permutedMatrix)
+
+    template<typename SrcMatrixType,int SrcMode>
+    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcMode>& permutedMatrix)
     {
-      permutedMatrix.evalTo(*this);
+      internal::call_assignment_no_alias_no_transpose(*this, permutedMatrix);
       return *this;
     }
 
-
     SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src)
     {
-      PermutationMatrix<Dynamic> pnull;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
       return *this = src.twistedBy(pnull);
     }
 
-    template<typename SrcMatrixType,unsigned int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcUpLo>& src)
+    // Since we override the copy-assignment operator, we need to explicitly re-declare the copy-constructor
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(SparseSelfAdjointView)
+
+    template<typename SrcMatrixType,unsigned int SrcMode>
+    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcMode>& src)
     {
-      PermutationMatrix<Dynamic> pnull;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
       return *this = src.twistedBy(pnull);
     }
     
-
-    // const SparseLLT<PlainObject, UpLo> llt() const;
-    // const SparseLDLT<PlainObject, UpLo> ldlt() const;
-
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
+                && "SparseSelfadjointView::resize() does not actually allow to resize.");
+    }
+    
   protected:
 
-    typename MatrixType::Nested m_matrix;
-    mutable VectorI m_countPerRow;
-    mutable VectorI m_countPerCol;
+    MatrixTypeNested m_matrix;
+    //mutable VectorI m_countPerRow;
+    //mutable VectorI m_countPerCol;
+  private:
+    template<typename Dest> void evalTo(Dest &) const;
 };
 
 /***************************************************************************
@@ -178,146 +175,268 @@
 
 template<typename Derived>
 template<unsigned int UpLo>
-const SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView() const
+typename SparseMatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView() const
 {
-  return derived();
+  return SparseSelfAdjointView<const Derived, UpLo>(derived());
 }
 
 template<typename Derived>
 template<unsigned int UpLo>
-SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView()
+typename SparseMatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView()
 {
-  return derived();
+  return SparseSelfAdjointView<Derived, UpLo>(derived());
 }
 
 /***************************************************************************
 * Implementation of SparseSelfAdjointView methods
 ***************************************************************************/
 
-template<typename MatrixType, unsigned int UpLo>
+template<typename MatrixType, unsigned int Mode>
 template<typename DerivedU>
-SparseSelfAdjointView<MatrixType,UpLo>&
-SparseSelfAdjointView<MatrixType,UpLo>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
+SparseSelfAdjointView<MatrixType,Mode>&
+SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
 {
-  SparseMatrix<Scalar,MatrixType::Flags&RowMajorBit?RowMajor:ColMajor> tmp = u * u.adjoint();
+  SparseMatrix<Scalar,(MatrixType::Flags&RowMajorBit)?RowMajor:ColMajor> tmp = u * u.adjoint();
   if(alpha==Scalar(0))
-    m_matrix.const_cast_derived() = tmp.template triangularView<UpLo>();
+    m_matrix = tmp.template triangularView<Mode>();
   else
-    m_matrix.const_cast_derived() += alpha * tmp.template triangularView<UpLo>();
+    m_matrix += alpha * tmp.template triangularView<Mode>();
 
   return *this;
 }
 
+namespace internal {
+  
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SparseSelfAdjointView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseSelfAdjointShape Shape;
+};
+
+struct SparseSelfAdjoint2Sparse {};
+
+template<> struct AssignmentKind<SparseShape,SparseSelfAdjointShape> { typedef SparseSelfAdjoint2Sparse Kind; };
+template<> struct AssignmentKind<SparseSelfAdjointShape,SparseShape> { typedef Sparse2Sparse Kind; };
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
+{
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> AssignOpType;
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
+  {
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
+  }
+
+  // FIXME: the handling of += and -= in sparse matrices should be cleanup so that next two overloads could be reduced to:
+  template<typename DestScalar,int StorageOrder,typename AssignFunc>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    call_assignment_no_alias_no_transpose(dst, tmp, func);
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst += tmp;
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst -= tmp;
+  }
+  
+  template<typename DestScalar>
+  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const AssignOpType&/*func*/)
+  {
+    // TODO directly evaluate into dst;
+    SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), tmp);
+    dst = tmp;
+  }
+};
+
+} // end namespace internal
+
 /***************************************************************************
 * Implementation of sparse self-adjoint time dense matrix
 ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
+
+template<int Mode, typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
 {
-  typedef Dense StorageKind;
-};
-}
+  EIGEN_ONLY_USED_FOR_DEBUG(alpha);
+  
+  typedef typename internal::nested_eval<SparseLhsType,DenseRhsType::MaxColsAtCompileTime>::type SparseLhsTypeNested;
+  typedef typename internal::remove_all<SparseLhsTypeNested>::type SparseLhsTypeNestedCleaned;
+  typedef evaluator<SparseLhsTypeNestedCleaned> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsIterator;
+  typedef typename SparseLhsType::Scalar LhsScalar;
+  
+  enum {
+    LhsIsRowMajor = (LhsEval::Flags&RowMajorBit)==RowMajorBit,
+    ProcessFirstHalf =
+              ((Mode&(Upper|Lower))==(Upper|Lower))
+          || ( (Mode&Upper) && !LhsIsRowMajor)
+          || ( (Mode&Lower) && LhsIsRowMajor),
+    ProcessSecondHalf = !ProcessFirstHalf
+  };
+  
+  SparseLhsTypeNested lhs_nested(lhs);
+  LhsEval lhsEval(lhs_nested);
 
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct
-  : public ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseSelfAdjointTimeDenseProduct)
-
-    SparseSelfAdjointTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  // work on one column at once
+  for (Index k=0; k<rhs.cols(); ++k)
+  {
+    for (Index j=0; j<lhs.outerSize(); ++j)
     {
-      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
-      // TODO use alpha
-      eigen_assert(alpha==Scalar(1) && "alpha != 1 is not implemented yet, sorry");
-      typedef typename internal::remove_all<Lhs>::type _Lhs;
-      typedef typename _Lhs::InnerIterator LhsInnerIterator;
-      enum {
-        LhsIsRowMajor = (_Lhs::Flags&RowMajorBit)==RowMajorBit,
-        ProcessFirstHalf =
-                 ((UpLo&(Upper|Lower))==(Upper|Lower))
-              || ( (UpLo&Upper) && !LhsIsRowMajor)
-              || ( (UpLo&Lower) && LhsIsRowMajor),
-        ProcessSecondHalf = !ProcessFirstHalf
-      };
-      for (Index j=0; j<m_lhs.outerSize(); ++j)
+      LhsIterator i(lhsEval,j);
+      // handle diagonal coeff
+      if (ProcessSecondHalf)
       {
-        LhsInnerIterator i(m_lhs,j);
-        if (ProcessSecondHalf)
+        while (i && i.index()<j) ++i;
+        if(i && i.index()==j)
         {
-          while (i && i.index()<j) ++i;
-          if(i && i.index()==j)
-          {
-            dest.row(j) += i.value() * m_rhs.row(j);
-            ++i;
-          }
+          res.coeffRef(j,k) += alpha * i.value() * rhs.coeff(j,k);
+          ++i;
         }
-        for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
-        {
-          Index a = LhsIsRowMajor ? j : i.index();
-          Index b = LhsIsRowMajor ? i.index() : j;
-          typename Lhs::Scalar v = i.value();
-          dest.row(a) += (v) * m_rhs.row(b);
-          dest.row(b) += numext::conj(v) * m_rhs.row(a);
-        }
-        if (ProcessFirstHalf && i && (i.index()==j))
-          dest.row(j) += i.value() * m_rhs.row(j);
       }
+
+      // premultiplied rhs for scatters
+      typename ScalarBinaryOpTraits<AlphaType, typename DenseRhsType::Scalar>::ReturnType rhs_j(alpha*rhs(j,k));
+      // accumulator for partial scalar product
+      typename DenseResType::Scalar res_j(0);
+      for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
+      {
+        LhsScalar lhs_ij = i.value();
+        if(!LhsIsRowMajor) lhs_ij = numext::conj(lhs_ij);
+        res_j += lhs_ij * rhs.coeff(i.index(),k);
+        res(i.index(),k) += numext::conj(lhs_ij) * rhs_j;
+      }
+      res.coeffRef(j,k) += alpha * res_j;
+
+      // handle diagonal coeff
+      if (ProcessFirstHalf && i && (i.index()==j))
+        res.coeffRef(j,k) += alpha * i.value() * rhs.coeff(j,k);
     }
-
-  private:
-    SparseSelfAdjointTimeDenseProduct& operator=(const SparseSelfAdjointTimeDenseProduct&);
-};
-
-namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
-{};
+  }
 }
 
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct
-  : public ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+
+template<typename LhsView, typename Rhs, int ProductType>
+struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType>
+: generic_product_impl_base<LhsView, Rhs, generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType> >
 {
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseSelfAdjointProduct)
-
-    DenseTimeSparseSelfAdjointProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& /*dest*/, const Scalar& /*alpha*/) const
-    {
-      // TODO
-    }
-
-  private:
-    DenseTimeSparseSelfAdjointProduct& operator=(const DenseTimeSparseSelfAdjointProduct&);
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs, const typename Dest::Scalar& alpha)
+  {
+    typedef typename LhsView::_MatrixTypeNested Lhs;
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhsView.matrix());
+    RhsNested rhsNested(rhs);
+    
+    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, alpha);
+  }
 };
 
+template<typename Lhs, typename RhsView, int ProductType>
+struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType>
+: generic_product_impl_base<Lhs, RhsView, generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType> >
+{
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView, const typename Dest::Scalar& alpha)
+  {
+    typedef typename RhsView::_MatrixTypeNested Rhs;
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhsView.matrix());
+    
+    // transpose everything
+    Transpose<Dest> dstT(dst);
+    internal::sparse_selfadjoint_time_dense_product<RhsView::TransposeMode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+  }
+};
+
+// NOTE: these two overloads are needed to evaluate the sparse selfadjoint view into a full sparse matrix
+// TODO: maybe the copy could be handled by generic_product_impl so that these overloads would not be needed anymore
+
+template<typename LhsView, typename Rhs, int ProductTag>
+struct product_evaluator<Product<LhsView, Rhs, DefaultProduct>, ProductTag, SparseSelfAdjointShape, SparseShape>
+  : public evaluator<typename Product<typename Rhs::PlainObject, Rhs, DefaultProduct>::PlainObject>
+{
+  typedef Product<LhsView, Rhs, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()), m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<typename Rhs::PlainObject, Rhs, SparseShape, SparseShape, ProductTag>::evalTo(m_result, m_lhs, xpr.rhs());
+  }
+  
+protected:
+  typename Rhs::PlainObject m_lhs;
+  PlainObject m_result;
+};
+
+template<typename Lhs, typename RhsView, int ProductTag>
+struct product_evaluator<Product<Lhs, RhsView, DefaultProduct>, ProductTag, SparseShape, SparseSelfAdjointShape>
+  : public evaluator<typename Product<Lhs, typename Lhs::PlainObject, DefaultProduct>::PlainObject>
+{
+  typedef Product<Lhs, RhsView, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  product_evaluator(const XprType& xpr)
+    : m_rhs(xpr.rhs()), m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, typename Lhs::PlainObject, SparseShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), m_rhs);
+  }
+  
+protected:
+  typename Lhs::PlainObject m_rhs;
+  PlainObject m_result;
+};
+
+} // namespace internal
+
 /***************************************************************************
 * Implementation of symmetric copies and permutations
 ***************************************************************************/
 namespace internal {
-  
-template<typename MatrixType, int UpLo>
-struct traits<SparseSymmetricPermutationProduct<MatrixType,UpLo> > : traits<MatrixType> {
-};
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+template<int Mode,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm)
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  typedef SparseMatrix<Scalar,DestOrder,Index> Dest;
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  typedef SparseMatrix<Scalar,DestOrder,StorageIndex> Dest;
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+  typedef evaluator<MatrixType> MatEval;
+  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
   
+  MatEval matEval(mat);
   Dest& dest(_dest.derived());
   enum {
     StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor)
@@ -331,17 +450,17 @@
   for(Index j = 0; j<size; ++j)
   {
     Index jp = perm ? perm[j] : j;
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    for(MatIterator it(matEval,j); it; ++it)
     {
       Index i = it.index();
       Index r = it.row();
       Index c = it.col();
       Index ip = perm ? perm[i] : i;
-      if(UpLo==(Upper|Lower))
+      if(Mode==int(Upper|Lower))
         count[StorageOrderMatch ? jp : ip]++;
       else if(r==c)
         count[ip]++;
-      else if(( UpLo==Lower && r>c) || ( UpLo==Upper && r<c))
+      else if(( Mode==Lower && r>c) || ( Mode==Upper && r<c))
       {
         count[ip]++;
         count[jp]++;
@@ -359,18 +478,18 @@
     count[j] = dest.outerIndexPtr()[j];
   
   // copy data
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    for(MatIterator it(matEval,j); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = internal::convert_index<StorageIndex>(it.index());
       Index r = it.row();
       Index c = it.col();
       
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm ? perm[i] : i;
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm ? perm[i] : i;
       
-      if(UpLo==(Upper|Lower))
+      if(Mode==int(Upper|Lower))
       {
         Index k = count[StorageOrderMatch ? jp : ip]++;
         dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
@@ -382,7 +501,7 @@
         dest.innerIndexPtr()[k] = ip;
         dest.valuePtr()[k] = it.value();
       }
-      else if(( (UpLo&Lower)==Lower && r>c) || ( (UpLo&Upper)==Upper && r<c))
+      else if(( (Mode&Lower)==Lower && r>c) || ( (Mode&Upper)==Upper && r<c))
       {
         if(!StorageOrderMatch)
           std::swap(ip,jp);
@@ -397,35 +516,40 @@
   }
 }
 
-template<int _SrcUpLo,int _DstUpLo,typename MatrixType,int DstOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+template<int _SrcMode,int _DstMode,typename MatrixType,int DstOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm)
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  SparseMatrix<Scalar,DstOrder,Index>& dest(_dest.derived());
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  SparseMatrix<Scalar,DstOrder,StorageIndex>& dest(_dest.derived());
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+  typedef evaluator<MatrixType> MatEval;
+  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
+
   enum {
     SrcOrder = MatrixType::IsRowMajor ? RowMajor : ColMajor,
     StorageOrderMatch = int(SrcOrder) == int(DstOrder),
-    DstUpLo = DstOrder==RowMajor ? (_DstUpLo==Upper ? Lower : Upper) : _DstUpLo,
-    SrcUpLo = SrcOrder==RowMajor ? (_SrcUpLo==Upper ? Lower : Upper) : _SrcUpLo
+    DstMode = DstOrder==RowMajor ? (_DstMode==Upper ? Lower : Upper) : _DstMode,
+    SrcMode = SrcOrder==RowMajor ? (_SrcMode==Upper ? Lower : Upper) : _SrcMode
   };
+
+  MatEval matEval(mat);
   
   Index size = mat.rows();
   VectorI count(size);
   count.setZero();
   dest.resize(size,size);
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
-    Index jp = perm ? perm[j] : j;
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    StorageIndex jp = perm ? perm[j] : j;
+    for(MatIterator it(matEval,j); it; ++it)
     {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+      StorageIndex i = it.index();
+      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
         continue;
                   
-      Index ip = perm ? perm[i] : i;
-      count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      StorageIndex ip = perm ? perm[i] : i;
+      count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
     }
   }
   dest.outerIndexPtr()[0] = 0;
@@ -435,23 +559,23 @@
   for(Index j=0; j<size; ++j)
     count[j] = dest.outerIndexPtr()[j];
   
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
     
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
+    for(MatIterator it(matEval,j); it; ++it)
     {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+      StorageIndex i = it.index();
+      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
         continue;
                   
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm? perm[i] : i;
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm? perm[i] : i;
       
-      Index k = count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
-      dest.innerIndexPtr()[k] = int(DstUpLo)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
+      Index k = count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      dest.innerIndexPtr()[k] = int(DstMode)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
       
       if(!StorageOrderMatch) std::swap(ip,jp);
-      if( ((int(DstUpLo)==int(Lower) && ip<jp) || (int(DstUpLo)==int(Upper) && ip>jp)))
+      if( ((int(DstMode)==int(Lower) && ip<jp) || (int(DstMode)==int(Upper) && ip>jp)))
         dest.valuePtr()[k] = numext::conj(it.value());
       else
         dest.valuePtr()[k] = it.value();
@@ -461,19 +585,33 @@
 
 }
 
-template<typename MatrixType,int UpLo>
+// TODO implement twists in a more evaluator friendly fashion
+
+namespace internal {
+
+template<typename MatrixType, int Mode>
+struct traits<SparseSymmetricPermutationProduct<MatrixType,Mode> > : traits<MatrixType> {
+};
+
+}
+
+template<typename MatrixType,int Mode>
 class SparseSymmetricPermutationProduct
-  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,UpLo> >
+  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,Mode> >
 {
   public:
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      RowsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::ColsAtCompileTime
+    };
   protected:
-    typedef PermutationMatrix<Dynamic,Dynamic,Index> Perm;
+    typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> Perm;
   public:
-    typedef Matrix<Index,Dynamic,1> VectorI;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
     typedef typename MatrixType::Nested MatrixTypeNested;
-    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixTypeNested>::type NestedExpression;
     
     SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
       : m_matrix(mat), m_perm(perm)
@@ -481,20 +619,9 @@
     
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
-    
-    template<typename DestScalar, int Options, typename DstIndex>
-    void evalTo(SparseMatrix<DestScalar,Options,DstIndex>& _dest) const
-    {
-//       internal::permute_symm_to_fullsymm<UpLo>(m_matrix,_dest,m_perm.indices().data());
-      SparseMatrix<DestScalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix,tmp,m_perm.indices().data());
-      _dest = tmp;
-    }
-    
-    template<typename DestType,unsigned int DestUpLo> void evalTo(SparseSelfAdjointView<DestType,DestUpLo>& dest) const
-    {
-      internal::permute_symm_to_symm<UpLo,DestUpLo>(m_matrix,dest.matrix(),m_perm.indices().data());
-    }
+        
+    const NestedExpression& matrix() const { return m_matrix; }
+    const Perm& perm() const { return m_perm; }
     
   protected:
     MatrixTypeNested m_matrix;
@@ -502,6 +629,31 @@
 
 };
 
+namespace internal {
+  
+template<typename DstXprType, typename MatrixType, int Mode, typename Scalar>
+struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar,typename MatrixType::Scalar>, Sparse2Sparse>
+{
+  typedef SparseSymmetricPermutationProduct<MatrixType,Mode> SrcXprType;
+  typedef typename DstXprType::StorageIndex DstIndex;
+  template<int Options>
+  static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
+  {
+    // internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
+    SparseMatrix<Scalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
+    internal::permute_symm_to_fullsymm<Mode>(src.matrix(),tmp,src.perm().indices().data());
+    dst = tmp;
+  }
+  
+  template<typename DestType,unsigned int DestMode>
+  static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
+  {
+    internal::permute_symm_to_symm<Mode,DestMode>(src.matrix(),dst.matrix(),src.perm().indices().data());
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_SELFADJOINTVIEW_H

diff --git a/Eigen/src/SparseCore/SparseSolverBase.h b/Eigen/src/SparseCore/SparseSolverBase.h
new file mode 100644
index 0000000..b4c9a42
--- /dev/null
+++ b/Eigen/src/SparseCore/SparseSolverBase.h

@@ -0,0 +1,124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSESOLVERBASE_H
+#define EIGEN_SPARSESOLVERBASE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+  /** \internal
+  * Helper functions to solve with a sparse right-hand-side and result.
+  * The rhs is decomposed into small vertical panels which are solved through dense temporaries.
+  */
+template<typename Decomposition, typename Rhs, typename Dest>
+typename enable_if<Rhs::ColsAtCompileTime!=1 && Dest::ColsAtCompileTime!=1>::type
+solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+{
+  EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  typedef typename Dest::Scalar DestScalar;
+  // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
+  static const Index NbColsAtOnce = 4;
+  Index rhsCols = rhs.cols();
+  Index size = rhs.rows();
+  // the temporary matrices do not need more columns than NbColsAtOnce:
+  Index tmpCols = (std::min)(rhsCols, NbColsAtOnce); 
+  Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,tmpCols);
+  Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,tmpCols);
+  for(Index k=0; k<rhsCols; k+=NbColsAtOnce)
+  {
+    Index actualCols = std::min<Index>(rhsCols-k, NbColsAtOnce);
+    tmp.leftCols(actualCols) = rhs.middleCols(k,actualCols);
+    tmpX.leftCols(actualCols) = dec.solve(tmp.leftCols(actualCols));
+    dest.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView();
+  }
+}
+
+// Overload for vector as rhs
+template<typename Decomposition, typename Rhs, typename Dest>
+typename enable_if<Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1>::type
+solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+{
+  typedef typename Dest::Scalar DestScalar;
+  Index size = rhs.rows();
+  Eigen::Matrix<DestScalar,Dynamic,1> rhs_dense(rhs);
+  Eigen::Matrix<DestScalar,Dynamic,1> dest_dense(size);
+  dest_dense = dec.solve(rhs_dense);
+  dest = dest_dense.sparseView();
+}
+
+} // end namespace internal
+
+/** \class SparseSolverBase
+  * \ingroup SparseCore_Module
+  * \brief A base class for sparse solvers
+  *
+  * \tparam Derived the actual type of the solver.
+  *
+  */
+template<typename Derived>
+class SparseSolverBase : internal::noncopyable
+{
+  public:
+
+    /** Default constructor */
+    SparseSolverBase()
+      : m_isInitialized(false)
+    {}
+
+    ~SparseSolverBase()
+    {}
+
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Solver is not initialized.");
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+    
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      *
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Solver is not initialized.");
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal default implementation of solving with a sparse rhs */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(derived(), b.derived(), dest.derived());
+    }
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+
+  protected:
+    
+    mutable bool m_isInitialized;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSESOLVERBASE_H

diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index b78825f..88820a4 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,8 +21,9 @@
 {
   // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
 
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  typedef typename remove_all<ResultType>::type::Scalar ResScalar;
+  typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
@@ -31,29 +32,27 @@
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
   // allocate a temporary buffer
-  AmbiVector<Scalar,Index> tempVector(rows);
-
-  // estimate the number of non zero entries
-  // given a rhs column containing Y non zeros, we assume that the respective Y columns
-  // of the lhs differs in average of one non zeros, thus the number of non zeros for
-  // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
-  // per column of the lhs.
-  // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  // Also, nnz can not be more than the total number of elements, i.e. rows(lhs) * cols(rhs).
-  // For example, in the case of a row vector multiplied by column vector, this limit is 1,
-  // but nnz(lhs) + nnz(rhs) can be more than 1.
-  // We use double to avoid integer overflows.
-  double estimated_nnz_prod = std::min(double(lhs.nonZeros()) + double(rhs.nonZeros()),
-                                       double(lhs.rows()) * double(rhs.cols()) + 1);
+  AmbiVector<ResScalar,StorageIndex> tempVector(rows);
 
   // mimics a resizeByInnerOuter:
   if(ResultType::IsRowMajor)
     res.resize(cols, rows);
   else
     res.resize(rows, cols);
+  
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+  
+  // estimate the number of non zero entries
+  // given a rhs column containing Y non zeros, we assume that the respective Y columns
+  // of the lhs differs in average of one non zeros, thus the number of non zeros for
+  // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
+  // per column of the lhs.
+  // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
-  res.reserve(Index(estimated_nnz_prod));
-  double ratioColRes = double(estimated_nnz_prod)/(double(lhs.rows()) * double(rhs.cols()));
+  res.reserve(estimated_nnz_prod);
+  double ratioColRes = double(estimated_nnz_prod)/(double(lhs.rows())*double(rhs.cols()));
   for (Index j=0; j<cols; ++j)
   {
     // FIXME:
@@ -61,18 +60,18 @@
     // let's do a more accurate determination of the nnz ratio for the current column j of res
     tempVector.init(ratioColRes);
     tempVector.setZero();
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
       // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
       tempVector.restart();
-      Scalar x = rhsIt.value();
-      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
+      RhsScalar x = rhsIt.value();
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
       {
         tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
       }
     }
     res.startVec(j);
-    for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector,tolerance); it; ++it)
+    for (typename AmbiVector<ResScalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
       res.insertBackByOuterInner(j,it.index()) = it.value();
   }
   res.finalize();
@@ -87,7 +86,6 @@
 template<typename Lhs, typename Rhs, typename ResultType>
 struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
 {
-  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
   typedef typename ResultType::RealScalar RealScalar;
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
@@ -105,7 +103,7 @@
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
     // we need a col-major matrix to hold the result
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> SparseTemporaryType;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> SparseTemporaryType;
     SparseTemporaryType _res(res.rows(), res.cols());
     internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
     res = _res;
@@ -131,8 +129,8 @@
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixLhs;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixRhs;
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
     ColMajorMatrixLhs colLhs(lhs);
     ColMajorMatrixRhs colRhs(rhs);
     internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
@@ -145,8 +143,53 @@
   }
 };
 
-// NOTE the 2 others cases (col row *) must never occur since they are caught
-// by ProductReturnType which transforms it to (col col *) by evaluating rhs.
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
+    RowMajorMatrixLhs rowLhs(lhs);
+    sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>(rowLhs,rhs,res,tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
+    RowMajorMatrixRhs rowRhs(rhs);
+    sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>(lhs,rowRhs,res,tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    ColMajorMatrixRhs colRhs(rhs);
+    internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    ColMajorMatrixLhs colLhs(lhs);
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance);
+  }
+};
 
 } // end namespace internal
 

diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h
index 7c300ee..3757d4c 100644
--- a/Eigen/src/SparseCore/SparseTranspose.h
+++ b/Eigen/src/SparseCore/SparseTranspose.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,52 +12,81 @@
 
 namespace Eigen { 
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
-  : public SparseMatrixBase<Transpose<MatrixType> >
-{
-    typedef typename internal::remove_all<typename MatrixType::Nested>::type _MatrixTypeNested;
+namespace internal {
+  template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)>
+  class SparseTransposeImpl
+    : public SparseMatrixBase<Transpose<MatrixType> >
+  {};
+  
+  template<typename MatrixType>
+  class SparseTransposeImpl<MatrixType,CompressedAccessBit>
+    : public SparseCompressedBase<Transpose<MatrixType> >
+  {
+    typedef SparseCompressedBase<Transpose<MatrixType> > Base;
   public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Transpose<MatrixType> )
-
-    class InnerIterator;
-    class ReverseInnerIterator;
+    using Base::derived;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
 
     inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); }
-};
+    
+    inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); }
+    inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); }
+    inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); }
+    inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); }
 
-// NOTE: VC10 trigger an ICE if don't put typename TransposeImpl<MatrixType,Sparse>:: in front of Index,
-// a typedef typename TransposeImpl<MatrixType,Sparse>::Index Index;
-// does not fix the issue.
-// An alternative is to define the nested class in the parent class itself.
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::InnerIterator
-  : public _MatrixTypeNested::InnerIterator
+    inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); }
+    inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); }
+    inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); }
+    inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); }
+  };
+}
+  
+template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
+  : public internal::SparseTransposeImpl<MatrixType>
 {
-    typedef typename _MatrixTypeNested::InnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE InnerIterator(const TransposeImpl& trans, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(trans.derived().nestedExpression(), outer)
-    {}
-    Index row() const { return Base::col(); }
-    Index col() const { return Base::row(); }
+  protected:
+    typedef internal::SparseTransposeImpl<MatrixType> Base;
 };
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::ReverseInnerIterator
-  : public _MatrixTypeNested::ReverseInnerIterator
+namespace internal {
+  
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IteratorBased>
+  : public evaluator_base<Transpose<ArgType> >
 {
-    typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
   public:
+    typedef Transpose<ArgType> XprType;
+    
+    inline Index nonZerosEstimate() const {
+      return m_argImpl.nonZerosEstimate();
+    }
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const TransposeImpl& xpr, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(xpr.derived().nestedExpression(), outer)
-    {}
-    Index row() const { return Base::col(); }
-    Index col() const { return Base::row(); }
+    class InnerIterator : public EvalIterator
+    {
+    public:
+      EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+        : EvalIterator(unaryOp.m_argImpl,outer)
+      {}
+      
+      Index row() const { return EvalIterator::col(); }
+      Index col() const { return EvalIterator::row(); }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) :m_argImpl(op.nestedExpression()) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSETRANSPOSE_H

diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h
index 333127b..9ac1202 100644
--- a/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/Eigen/src/SparseCore/SparseTriangularView.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,19 +11,19 @@
 #ifndef EIGEN_SPARSE_TRIANGULARVIEW_H
 #define EIGEN_SPARSE_TRIANGULARVIEW_H
 
-namespace Eigen { 
+namespace Eigen {
 
-namespace internal {
-  
-template<typename MatrixType, int Mode>
-struct traits<SparseTriangularView<MatrixType,Mode> >
-: public traits<MatrixType>
-{};
-
-} // namespace internal
-
-template<typename MatrixType, int Mode> class SparseTriangularView
-  : public SparseMatrixBase<SparseTriangularView<MatrixType,Mode> >
+/** \ingroup SparseCore_Module
+  *
+  * \brief Base class for a triangular part in a \b sparse matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which are available for sparse expressions only.
+  *
+  * \sa class TriangularView, SparseMatrixBase::triangularView()
+  */
+template<typename MatrixType, unsigned int Mode> class TriangularViewImpl<MatrixType,Mode,Sparse>
+  : public SparseMatrixBase<TriangularView<MatrixType,Mode> >
 {
     enum { SkipFirst = ((Mode&Lower) && !(MatrixType::Flags&RowMajorBit))
                     || ((Mode&Upper) &&  (MatrixType::Flags&RowMajorBit)),
@@ -31,147 +31,157 @@
            SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
            HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
     };
+    
+    typedef TriangularView<MatrixType,Mode> TriangularViewType;
+    
+  protected:
+    // dummy solve function to make TriangularView happy.
+    void solve() const;
 
+    typedef SparseMatrixBase<TriangularViewType> Base;
   public:
     
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseTriangularView)
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
+    EIGEN_SPARSE_PUBLIC_INTERFACE(TriangularViewType)
+    
     typedef typename MatrixType::Nested MatrixTypeNested;
     typedef typename internal::remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
     typedef typename internal::remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
 
-    inline SparseTriangularView(const MatrixType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solve(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
-    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
-
-  protected:
-    MatrixTypeNested m_matrix;
-};
-
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator
-{
-    typedef typename MatrixTypeNestedCleaned::InnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE InnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer), m_returnOne(false)
-    {
-      if(SkipFirst)
-      {
-        while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer))
-          Base::operator++();
-        if(HasUnitDiag)
-          m_returnOne = true;
-      }
-      else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer()))
-      {
-        if((!SkipFirst) && Base::operator bool())
-          Base::operator++();
-        m_returnOne = true;
-      }
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+        dst = rhs;
+      this->solveInPlace(dst);
     }
 
-    EIGEN_STRONG_INLINE InnerIterator& operator++()
-    {
-      if(HasUnitDiag && m_returnOne)
-        m_returnOne = false;
-      else
+    /** Applies the inverse of \c *this to the dense vector or matrix \a other, "in-place" */
+    template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
+
+    /** Applies the inverse of \c *this to the sparse vector or matrix \a other, "in-place" */
+    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
+  
+};
+
+namespace internal {
+
+template<typename ArgType, unsigned int Mode>
+struct unary_evaluator<TriangularView<ArgType,Mode>, IteratorBased>
+ : evaluator_base<TriangularView<ArgType,Mode> >
+{
+  typedef TriangularView<ArgType,Mode> XprType;
+  
+protected:
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+  
+  enum { SkipFirst = ((Mode&Lower) && !(ArgType::Flags&RowMajorBit))
+                    || ((Mode&Upper) &&  (ArgType::Flags&RowMajorBit)),
+         SkipLast = !SkipFirst,
+         SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
+         HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
+  };
+  
+public:
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = XprType::Flags
+  };
+    
+  explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()), m_arg(xpr.nestedExpression()) {}
+  
+  inline Index nonZerosEstimate() const {
+    return m_argImpl.nonZerosEstimate();
+  }
+  
+  class InnerIterator : public EvalIterator
+  {
+      typedef EvalIterator Base;
+    public:
+
+      EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& xprEval, Index outer)
+        : Base(xprEval.m_argImpl,outer), m_returnOne(false), m_containsDiag(Base::outer()<xprEval.m_arg.innerSize())
       {
-        Base::operator++();
-        if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+        if(SkipFirst)
+        {
+          while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer))
+            Base::operator++();
+          if(HasUnitDiag)
+            m_returnOne = m_containsDiag;
+        }
+        else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer()))
         {
           if((!SkipFirst) && Base::operator bool())
             Base::operator++();
-          m_returnOne = true;
+          m_returnOne = m_containsDiag;
         }
       }
-      return *this;
-    }
 
-    inline Index row() const { return (MatrixType::Flags&RowMajorBit ? Base::outer() : this->index()); }
-    inline Index col() const { return (MatrixType::Flags&RowMajorBit ? this->index() : Base::outer()); }
-    inline Index index() const
-    {
-      if(HasUnitDiag && m_returnOne)  return Base::outer();
-      else                            return Base::index();
-    }
-    inline Scalar value() const
-    {
-      if(HasUnitDiag && m_returnOne)  return Scalar(1);
-      else                            return Base::value();
-    }
-
-    EIGEN_STRONG_INLINE operator bool() const
-    {
-      if(HasUnitDiag && m_returnOne)
-        return true;
-      if(SkipFirst) return  Base::operator bool();
-      else
+      EIGEN_STRONG_INLINE InnerIterator& operator++()
       {
-        if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
-        else return (Base::operator bool() && this->index() <= this->outer());
+        if(HasUnitDiag && m_returnOne)
+          m_returnOne = false;
+        else
+        {
+          Base::operator++();
+          if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer()))
+          {
+            if((!SkipFirst) && Base::operator bool())
+              Base::operator++();
+            m_returnOne = m_containsDiag;
+          }
+        }
+        return *this;
       }
-    }
-  protected:
-    bool m_returnOne;
+      
+      EIGEN_STRONG_INLINE operator bool() const
+      {
+        if(HasUnitDiag && m_returnOne)
+          return true;
+        if(SkipFirst) return  Base::operator bool();
+        else
+        {
+          if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
+          else return (Base::operator bool() && this->index() <= this->outer());
+        }
+      }
+
+//       inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
+//       inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
+      inline StorageIndex index() const
+      {
+        if(HasUnitDiag && m_returnOne)  return internal::convert_index<StorageIndex>(Base::outer());
+        else                            return Base::index();
+      }
+      inline Scalar value() const
+      {
+        if(HasUnitDiag && m_returnOne)  return Scalar(1);
+        else                            return Base::value();
+      }
+
+    protected:
+      bool m_returnOne;
+      bool m_containsDiag;
+    private:
+      Scalar& valueRef();
+  };
+  
+protected:
+  evaluator<ArgType> m_argImpl;
+  const ArgType& m_arg;
 };
 
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator
-{
-    typedef typename MatrixTypeNestedCleaned::ReverseInnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer)
-    {
-      eigen_assert((!HasUnitDiag) && "ReverseInnerIterator does not support yet triangular views with a unit diagonal");
-      if(SkipLast) {
-        while((*this) && (SkipDiag ? this->index()>=outer : this->index()>outer))
-          --(*this);
-      }
-    }
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    inline Index row() const { return Base::row(); }
-    inline Index col() const { return Base::col(); }
-
-    EIGEN_STRONG_INLINE operator bool() const
-    {
-      if (SkipLast) return Base::operator bool() ;
-      else
-      {
-        if(SkipDiag) return (Base::operator bool() && this->index() > this->outer());
-        else return (Base::operator bool() && this->index() >= this->outer());
-      }
-    }
-};
+} // end namespace internal
 
 template<typename Derived>
 template<int Mode>
-inline const SparseTriangularView<Derived, Mode>
+inline const TriangularView<const Derived, Mode>
 SparseMatrixBase<Derived>::triangularView() const
 {
-  return derived();
+  return TriangularView<const Derived, Mode>(derived());
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h
index 0502385..ceb9368 100644
--- a/Eigen/src/SparseCore/SparseUtil.h
+++ b/Eigen/src/SparseCore/SparseUtil.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -37,43 +37,23 @@
 }
 
 #define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
+EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =)
 
-#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
-  typedef BaseClass Base; \
-  typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
-  typedef typename Eigen::internal::nested<Derived >::type Nested; \
-  typedef typename Eigen::internal::traits<Derived >::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived >::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived >::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived >::ColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived >::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived >::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
-  using Base::derived; \
-  using Base::const_cast_derived;
 
 #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
-  _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived)
 
+  
 const int CoherentAccessPattern     = 0x1;
 const int InnerRandomAccessPattern  = 0x2 | CoherentAccessPattern;
 const int OuterRandomAccessPattern  = 0x4 | CoherentAccessPattern;
 const int RandomAccessPattern       = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern;
 
-template<typename Derived> class SparseMatrixBase;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class DynamicSparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseVector;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class MappedSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class SparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class DynamicSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class SparseVector;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class MappedSparseMatrix;
 
-template<typename MatrixType, int Mode>           class SparseTriangularView;
 template<typename MatrixType, unsigned int UpLo>  class SparseSelfAdjointView;
 template<typename Lhs, typename Rhs>              class SparseDiagonalProduct;
 template<typename MatrixType> class SparseView;
@@ -84,41 +64,45 @@
 template<typename Lhs, typename Rhs, bool Transpose> class SparseDenseOuterProduct;
 
 template<typename Lhs, typename Rhs> struct SparseSparseProductReturnType;
-template<typename Lhs, typename Rhs, int InnerSize = internal::traits<Lhs>::ColsAtCompileTime> struct DenseSparseProductReturnType;
-template<typename Lhs, typename Rhs, int InnerSize = internal::traits<Lhs>::ColsAtCompileTime> struct SparseDenseProductReturnType;
+template<typename Lhs, typename Rhs,
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct DenseSparseProductReturnType;
+         
+template<typename Lhs, typename Rhs,
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct SparseDenseProductReturnType;
 template<typename MatrixType,int UpLo> class SparseSymmetricPermutationProduct;
 
 namespace internal {
 
-template<typename T,int Rows,int Cols> struct sparse_eval;
+template<typename T,int Rows,int Cols,int Flags> struct sparse_eval;
 
 template<typename T> struct eval<T,Sparse>
-  : public sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime>
+  : sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime,traits<T>::Flags>
 {};
 
-template<typename T,int Cols> struct sparse_eval<T,1,Cols> {
+template<typename T,int Cols,int Flags> struct sparse_eval<T,1,Cols,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
+    typedef typename traits<T>::StorageIndex _StorageIndex;
   public:
-    typedef SparseVector<_Scalar, RowMajor, _Index> type;
+    typedef SparseVector<_Scalar, RowMajor, _StorageIndex> type;
 };
 
-template<typename T,int Rows> struct sparse_eval<T,Rows,1> {
+template<typename T,int Rows,int Flags> struct sparse_eval<T,Rows,1,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
+    typedef typename traits<T>::StorageIndex _StorageIndex;
   public:
-    typedef SparseVector<_Scalar, ColMajor, _Index> type;
+    typedef SparseVector<_Scalar, ColMajor, _StorageIndex> type;
 };
 
-template<typename T,int Rows,int Cols> struct sparse_eval {
+// TODO this seems almost identical to plain_matrix_type<T, Sparse>
+template<typename T,int Rows,int Cols,int Flags> struct sparse_eval {
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
-    enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+    typedef typename traits<T>::StorageIndex _StorageIndex;
+    enum { _Options = ((Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
   public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+    typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type;
 };
 
-template<typename T> struct sparse_eval<T,1,1> {
+template<typename T,int Flags> struct sparse_eval<T,1,1,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
   public:
     typedef Matrix<_Scalar, 1, 1> type;
@@ -127,10 +111,41 @@
 template<typename T> struct plain_matrix_type<T,Sparse>
 {
   typedef typename traits<T>::Scalar _Scalar;
-  typedef typename traits<T>::Index _Index;
-  enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+  typedef typename traits<T>::StorageIndex _StorageIndex;
+  enum { _Options = ((evaluator<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
   public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+    typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type;
+};
+
+template<typename T>
+struct plain_object_eval<T,Sparse>
+  : sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime, evaluator<T>::Flags>
+{};
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Sparse>
+{
+  typedef typename sparse_eval<RhsType, RhsType::RowsAtCompileTime, RhsType::ColsAtCompileTime,traits<RhsType>::Flags>::type PlainObject;
+};
+
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, Sparse>
+{
+  typedef SparseMatrixBase<Derived> type;
+};
+
+struct SparseTriangularShape  { static std::string debugName() { return "SparseTriangularShape"; } };
+struct SparseSelfAdjointShape { static std::string debugName() { return "SparseSelfAdjointShape"; } };
+
+template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type;  };
+template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape  type;  };
+
+// return type of SparseCompressedBase::lower_bound;
+struct LowerBoundIndex {
+  LowerBoundIndex() : value(-1), found(false) {}
+  LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {}
+  Index value;
+  bool found;
 };
 
 } // end namespace internal
@@ -143,26 +158,26 @@
   *
   * \sa SparseMatrix::setFromTriplets()
   */
-template<typename Scalar, typename Index=typename SparseMatrix<Scalar>::Index >
+template<typename Scalar, typename StorageIndex=typename SparseMatrix<Scalar>::StorageIndex >
 class Triplet
 {
 public:
   Triplet() : m_row(0), m_col(0), m_value(0) {}
 
-  Triplet(const Index& i, const Index& j, const Scalar& v = Scalar(0))
+  Triplet(const StorageIndex& i, const StorageIndex& j, const Scalar& v = Scalar(0))
     : m_row(i), m_col(j), m_value(v)
   {}
 
   /** \returns the row index of the element */
-  const Index& row() const { return m_row; }
+  const StorageIndex& row() const { return m_row; }
 
   /** \returns the column index of the element */
-  const Index& col() const { return m_col; }
+  const StorageIndex& col() const { return m_col; }
 
   /** \returns the value of the element */
   const Scalar& value() const { return m_value; }
 protected:
-  Index m_row, m_col;
+  StorageIndex m_row, m_col;
   Scalar m_value;
 };
 

diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 7e15c81..05779be 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,15 +22,15 @@
   * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
   */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseVector<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<SparseVector<_Scalar, _Options, _StorageIndex> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -40,8 +40,7 @@
     ColsAtCompileTime = IsColVector ? 1 : Dynamic,
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
-    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit),
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit) | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
@@ -61,18 +60,18 @@
 
 }
 
-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
 class SparseVector
-  : public SparseMatrixBase<SparseVector<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseVector<_Scalar, _Options, _StorageIndex> >
 {
-    typedef SparseMatrixBase<SparseVector> SparseBase;
-    
+    typedef SparseCompressedBase<SparseVector> Base;
+    using Base::convert_index;
   public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector)
     EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=)
     EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, -=)
     
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    typedef internal::CompressedStorage<Scalar,StorageIndex> Storage;
     enum { IsColVector = internal::traits<SparseVector>::IsColVector };
     
     enum {
@@ -84,11 +83,16 @@
     EIGEN_STRONG_INLINE Index innerSize() const { return m_size; }
     EIGEN_STRONG_INLINE Index outerSize() const { return 1; }
 
-    EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return &m_data.value(0); }
-    EIGEN_STRONG_INLINE Scalar* valuePtr() { return &m_data.value(0); }
+    EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return m_data.valuePtr(); }
+    EIGEN_STRONG_INLINE Scalar* valuePtr() { return m_data.valuePtr(); }
 
-    EIGEN_STRONG_INLINE const Index* innerIndexPtr() const { return &m_data.index(0); }
-    EIGEN_STRONG_INLINE Index* innerIndexPtr() { return &m_data.index(0); }
+    EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); }
+    EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return m_data.indexPtr(); }
+
+    inline const StorageIndex* outerIndexPtr() const { return 0; }
+    inline StorageIndex* outerIndexPtr() { return 0; }
+    inline const StorageIndex* innerNonZeroPtr() const { return 0; }
+    inline StorageIndex* innerNonZeroPtr() { return 0; }
     
     /** \internal */
     inline Storage& data() { return m_data; }
@@ -103,13 +107,13 @@
     inline Scalar coeff(Index i) const
     {
       eigen_assert(i>=0 && i<m_size);
-      return m_data.at(i);
+      return m_data.at(StorageIndex(i));
     }
 
     inline Scalar& coeffRef(Index row, Index col)
     {
       eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
-      return coeff(IsColVector ? row : col);
+      return coeffRef(IsColVector ? row : col);
     }
 
     /** \returns a reference to the coefficient value at given index \a i
@@ -121,18 +125,19 @@
     inline Scalar& coeffRef(Index i)
     {
       eigen_assert(i>=0 && i<m_size);
-      return m_data.atWithInsertion(i);
+
+      return m_data.atWithInsertion(StorageIndex(i));
     }
 
   public:
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+    typedef typename Base::InnerIterator InnerIterator;
+    typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
 
     inline void setZero() { m_data.clear(); }
 
     /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return static_cast<Index>(m_data.size()); }
+    inline Index nonZeros() const  { return m_data.size(); }
 
     inline void startVec(Index outer)
     {
@@ -151,6 +156,18 @@
       m_data.append(0, i);
       return m_data.value(m_data.size()-1);
     }
+    
+    Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+      return insertBackUnordered(inner);
+    }
+    inline Scalar& insertBackUnordered(Index i)
+    {
+      m_data.append(0, i);
+      return m_data.value(m_data.size()-1);
+    }
 
     inline Scalar& insert(Index row, Index col)
     {
@@ -158,6 +175,7 @@
       
       Index inner = IsColVector ? row : col;
       Index outer = IsColVector ? col : row;
+      EIGEN_ONLY_USED_FOR_DEBUG(outer);
       eigen_assert(outer==0);
       return insert(inner);
     }
@@ -176,7 +194,7 @@
         m_data.value(p+1) = m_data.value(p);
         --p;
       }
-      m_data.index(p+1) = i;
+      m_data.index(p+1) = convert_index(i);
       m_data.value(p+1) = 0;
       return m_data.value(p+1);
     }
@@ -188,28 +206,59 @@
 
     inline void finalize() {}
 
+    /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       m_data.prune(reference,epsilon);
     }
 
+    /** Resizes the sparse vector to \a rows x \a cols
+      *
+      * This method is provided for compatibility with matrices.
+      * For a column vector, \a cols must be equal to 1.
+      * For a row vector, \a rows must be equal to 1.
+      *
+      * \sa resize(Index)
+      */
     void resize(Index rows, Index cols)
     {
-      eigen_assert(rows==1 || cols==1);
+      eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1");
       resize(IsColVector ? rows : cols);
     }
 
+    /** Resizes the sparse vector to \a newSize
+      * This method deletes all entries, thus leaving an empty sparse vector
+      *
+      * \sa  conservativeResize(), setZero() */
     void resize(Index newSize)
     {
       m_size = newSize;
       m_data.clear();
     }
 
+    /** Resizes the sparse vector to \a newSize, while leaving old values untouched.
+      *
+      * If the size of the vector is decreased, then the storage of the out-of bounds coefficients is kept and reserved.
+      * Call .data().squeeze() to free extra memory.
+      *
+      * \sa reserve(), setZero()
+      */
+    void conservativeResize(Index newSize)
+    {
+      if (newSize < m_size)
+      {
+        Index i = 0;
+        while (i<m_data.size() && m_data.index(i)<newSize) ++i;
+        m_data.resize(i);
+      }
+      m_size = newSize;
+    }
+
     void resizeNonZeros(Index size) { m_data.resize(size); }
 
     inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
 
-    inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
+    explicit inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
 
     inline SparseVector(Index rows, Index cols) : m_size(0) { check_template_parameters(); resize(rows,cols); }
 
@@ -217,19 +266,22 @@
     inline SparseVector(const SparseMatrixBase<OtherDerived>& other)
       : m_size(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       check_template_parameters();
       *this = other.derived();
     }
 
     inline SparseVector(const SparseVector& other)
-      : SparseBase(other), m_size(0)
+      : Base(other), m_size(0)
     {
       check_template_parameters();
       *this = other.derived();
     }
 
     /** Swaps the values of \c *this and \a other.
-      * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only.
+      * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
       * \sa SparseMatrixBase::swap()
       */
     inline void swap(SparseVector& other)
@@ -238,6 +290,14 @@
       m_data.swap(other.m_data);
     }
 
+    template<int OtherOptions>
+    inline void swap(SparseMatrix<Scalar,OtherOptions,StorageIndex>& other)
+    {
+      eigen_assert(other.outerSize()==1);
+      std::swap(m_size, other.m_innerSize);
+      m_data.swap(other.m_data);
+    }
+
     inline SparseVector& operator=(const SparseVector& other)
     {
       if (other.isRValue())
@@ -336,7 +396,7 @@
   
     static void check_template_parameters()
     {
-      EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+      EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
       EIGEN_STATIC_ASSERT((_Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
     }
     
@@ -344,77 +404,46 @@
     Index m_size;
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
-
-    InnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_data.value(m_id); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id)); }
-
-    inline Index index() const { return m_data.index(m_id); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
-
-    inline operator bool() const { return (m_id < m_end); }
-
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_end;
-};
-
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
-
-    ReverseInnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_data.value(m_id-1); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id-1)); }
-
-    inline Index index() const { return m_data.index(m_id-1); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
-
-    inline operator bool() const { return (m_id > m_start); }
-
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_start;
-};
-
 namespace internal {
 
+template<typename _Scalar, int _Options, typename _Index>
+struct evaluator<SparseVector<_Scalar,_Options,_Index> >
+  : evaluator_base<SparseVector<_Scalar,_Options,_Index> >
+{
+  typedef SparseVector<_Scalar,_Options,_Index> SparseVectorType;
+  typedef evaluator_base<SparseVectorType> Base;
+  typedef typename SparseVectorType::InnerIterator InnerIterator;
+  typedef typename SparseVectorType::ReverseInnerIterator ReverseInnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<_Scalar>::ReadCost,
+    Flags = SparseVectorType::Flags
+  };
+
+  evaluator() : Base() {}
+  
+  explicit evaluator(const SparseVectorType &mat) : m_matrix(&mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_matrix->nonZeros();
+  }
+  
+  operator SparseVectorType&() { return m_matrix->const_cast_derived(); }
+  operator const SparseVectorType&() const { return *m_matrix; }
+  
+  const SparseVectorType *m_matrix;
+};
+
 template< typename Dest, typename Src>
 struct sparse_vector_assign_selector<Dest,Src,SVA_Inner> {
   static void run(Dest& dst, const Src& src) {
     eigen_internal_assert(src.innerSize()==src.size());
-    for(typename Src::InnerIterator it(src, 0); it; ++it)
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for(typename SrcEvaluatorType::InnerIterator it(srcEval, 0); it; ++it)
       dst.insert(it.index()) = it.value();
   }
 };
@@ -423,9 +452,11 @@
 struct sparse_vector_assign_selector<Dest,Src,SVA_Outer> {
   static void run(Dest& dst, const Src& src) {
     eigen_internal_assert(src.outerSize()==src.size());
-    for(typename Dest::Index i=0; i<src.size(); ++i)
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for(Index i=0; i<src.size(); ++i)
     {
-      typename Src::InnerIterator it(src, i);
+      typename SrcEvaluatorType::InnerIterator it(srcEval, i);
       if(it)
         dst.insert(i) = it.value();
     }

diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h
index fd84504..92b3d1f 100644
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Daniel Lowengrub <lowdanie@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -18,7 +18,7 @@
 template<typename MatrixType>
 struct traits<SparseView<MatrixType> > : traits<MatrixType>
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   enum {
     Flags = int(traits<MatrixType>::Flags) & (RowMajorBit)
@@ -27,71 +27,226 @@
 
 } // end namespace internal
 
+/** \ingroup SparseCore_Module
+  * \class SparseView
+  *
+  * \brief Expression of a dense or sparse matrix with zero or too small values removed
+  *
+  * \tparam MatrixType the type of the object of which we are removing the small entries
+  *
+  * This class represents an expression of a given dense or sparse matrix with
+  * entries smaller than \c reference * \c epsilon are removed.
+  * It is the return type of MatrixBase::sparseView() and SparseMatrixBase::pruned()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::sparseView(), SparseMatrixBase::pruned()
+  */
 template<typename MatrixType>
 class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
 {
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef SparseMatrixBase<SparseView > Base;
 public:
   EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
+  typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-  SparseView(const MatrixType& mat, const Scalar& m_reference = Scalar(0),
-             typename NumTraits<Scalar>::Real m_epsilon = NumTraits<Scalar>::dummy_precision()) : 
-    m_matrix(mat), m_reference(m_reference), m_epsilon(m_epsilon) {}
-
-  class InnerIterator;
+  explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
+                      const RealScalar &epsilon = NumTraits<Scalar>::dummy_precision())
+    : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
 
   inline Index rows() const { return m_matrix.rows(); }
   inline Index cols() const { return m_matrix.cols(); }
 
   inline Index innerSize() const { return m_matrix.innerSize(); }
   inline Index outerSize() const { return m_matrix.outerSize(); }
-
+  
+  /** \returns the nested expression */
+  const typename internal::remove_all<MatrixTypeNested>::type&
+  nestedExpression() const { return m_matrix; }
+  
+  Scalar reference() const { return m_reference; }
+  RealScalar epsilon() const { return m_epsilon; }
+  
 protected:
   MatrixTypeNested m_matrix;
   Scalar m_reference;
-  typename NumTraits<Scalar>::Real m_epsilon;
+  RealScalar m_epsilon;
 };
 
-template<typename MatrixType>
-class SparseView<MatrixType>::InnerIterator : public _MatrixTypeNested::InnerIterator
+namespace internal {
+
+// TODO find a way to unify the two following variants
+// This is tricky because implementing an inner iterator on top of an IndexBased evaluator is
+// not easy because the evaluators do not expose the sizes of the underlying expression.
+  
+template<typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IteratorBased>
+  : public evaluator_base<SparseView<ArgType> >
 {
-  typedef typename SparseView::Index Index;
-public:
-  typedef typename _MatrixTypeNested::InnerIterator IterBase;
-  InnerIterator(const SparseView& view, Index outer) :
-  IterBase(view.m_matrix, outer), m_view(view)
-  {
-    incrementToNonZero();
-  }
-
-  EIGEN_STRONG_INLINE InnerIterator& operator++()
-  {
-    IterBase::operator++();
-    incrementToNonZero();
-    return *this;
-  }
-
-  using IterBase::value;
-
-protected:
-  const SparseView& m_view;
-
-private:
-  void incrementToNonZero()
-  {
-    while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.m_reference, m_view.m_epsilon))
+    typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+  public:
+    typedef SparseView<ArgType> XprType;
+    
+    class InnerIterator : public EvalIterator
     {
-      IterBase::operator++();
-    }
-  }
+      protected:
+        typedef typename XprType::Scalar Scalar;
+      public:
+
+        EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+          : EvalIterator(sve.m_argImpl,outer), m_view(sve.m_view)
+        {
+          incrementToNonZero();
+        }
+
+        EIGEN_STRONG_INLINE InnerIterator& operator++()
+        {
+          EvalIterator::operator++();
+          incrementToNonZero();
+          return *this;
+        }
+
+        using EvalIterator::value;
+
+      protected:
+        const XprType &m_view;
+
+      private:
+        void incrementToNonZero()
+        {
+          while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.reference(), m_view.epsilon()))
+          {
+            EvalIterator::operator++();
+          }
+        }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_view;
 };
 
-template<typename Derived>
-const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& m_reference,
-                                                          const typename NumTraits<Scalar>::Real& m_epsilon) const
+template<typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IndexBased>
+  : public evaluator_base<SparseView<ArgType> >
 {
-  return SparseView<Derived>(derived(), m_reference, m_epsilon);
+  public:
+    typedef SparseView<ArgType> XprType;
+  protected:
+    enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+    typedef typename XprType::Scalar Scalar;
+    typedef typename XprType::StorageIndex StorageIndex;
+  public:
+    
+    class InnerIterator
+    {
+      public:
+
+        EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+          : m_sve(sve), m_inner(0), m_outer(outer), m_end(sve.m_view.innerSize())
+        {
+          incrementToNonZero();
+        }
+
+        EIGEN_STRONG_INLINE InnerIterator& operator++()
+        {
+          m_inner++;
+          incrementToNonZero();
+          return *this;
+        }
+
+        EIGEN_STRONG_INLINE Scalar value() const
+        {
+          return (IsRowMajor) ? m_sve.m_argImpl.coeff(m_outer, m_inner)
+                              : m_sve.m_argImpl.coeff(m_inner, m_outer);
+        }
+
+        EIGEN_STRONG_INLINE StorageIndex index() const { return m_inner; }
+        inline Index row() const { return IsRowMajor ? m_outer : index(); }
+        inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+        EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+      protected:
+        const unary_evaluator &m_sve;
+        Index m_inner;
+        const Index m_outer;
+        const Index m_end;
+
+      private:
+        void incrementToNonZero()
+        {
+          while((bool(*this)) && internal::isMuchSmallerThan(value(), m_sve.m_view.reference(), m_sve.m_view.epsilon()))
+          {
+            m_inner++;
+          }
+        }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_view;
+};
+
+} // end namespace internal
+
+/** \ingroup SparseCore_Module
+  *
+  * \returns a sparse expression of the dense expression \c *this with values smaller than
+  * \a reference * \a epsilon removed.
+  *
+  * This method is typically used when prototyping to convert a quickly assembled dense Matrix \c D to a SparseMatrix \c S:
+  * \code
+  * MatrixXd D(n,m);
+  * SparseMatrix<double> S;
+  * S = D.sparseView();             // suppress numerical zeros (exact)
+  * S = D.sparseView(reference);
+  * S = D.sparseView(reference,epsilon);
+  * \endcode
+  * where \a reference is a meaningful non zero reference value,
+  * and \a epsilon is a tolerance factor defaulting to NumTraits<Scalar>::dummy_precision().
+  *
+  * \sa SparseMatrixBase::pruned(), class SparseView */
+template<typename Derived>
+const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& reference,
+                                                          const typename NumTraits<Scalar>::Real& epsilon) const
+{
+  return SparseView<Derived>(derived(), reference, epsilon);
+}
+
+/** \returns an expression of \c *this with values smaller than
+  * \a reference * \a epsilon removed.
+  *
+  * This method is typically used in conjunction with the product of two sparse matrices
+  * to automatically prune the smallest values as follows:
+  * \code
+  * C = (A*B).pruned();             // suppress numerical zeros (exact)
+  * C = (A*B).pruned(ref);
+  * C = (A*B).pruned(ref,epsilon);
+  * \endcode
+  * where \c ref is a meaningful non zero reference value.
+  * */
+template<typename Derived>
+const SparseView<Derived>
+SparseMatrixBase<Derived>::pruned(const Scalar& reference,
+                                  const RealScalar& epsilon) const
+{
+  return SparseView<Derived>(derived(), reference, epsilon);
 }
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h
index cb8ad82..f9c56ba 100644
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h

@@ -28,16 +28,19 @@
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=0; i<lhs.rows(); ++i)
+      for(Index i=0; i<lhs.rows(); ++i)
       {
         Scalar tmp = other.coeff(i,col);
         Scalar lastVal(0);
-        int lastIndex = 0;
-        for(typename Lhs::InnerIterator it(lhs, i); it; ++it)
+        Index lastIndex = 0;
+        for(LhsIterator it(lhsEval, i); it; ++it)
         {
           lastVal = it.value();
           lastIndex = it.index();
@@ -62,15 +65,18 @@
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=lhs.rows()-1 ; i>=0 ; --i)
+      for(Index i=lhs.rows()-1 ; i>=0 ; --i)
       {
         Scalar tmp = other.coeff(i,col);
-        Scalar l_ii = 0;
-        typename Lhs::InnerIterator it(lhs, i);
+        Scalar l_ii(0);
+        LhsIterator it(lhsEval, i);
         while(it && it.index()<i)
           ++it;
         if(!(Mode & UnitDiag))
@@ -86,10 +92,8 @@
           tmp -= it.value() * other.coeff(it.index(),col);
         }
 
-        if (Mode & UnitDiag)
-          other.coeffRef(i,col) = tmp;
-        else
-          other.coeffRef(i,col) = tmp/l_ii;
+        if (Mode & UnitDiag)  other.coeffRef(i,col) = tmp;
+        else                  other.coeffRef(i,col) = tmp/l_ii;
       }
     }
   }
@@ -100,16 +104,19 @@
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=0; i<lhs.cols(); ++i)
+      for(Index i=0; i<lhs.cols(); ++i)
       {
         Scalar& tmp = other.coeffRef(i,col);
         if (tmp!=Scalar(0)) // optimization when other is actually sparse
         {
-          typename Lhs::InnerIterator it(lhs, i);
+          LhsIterator it(lhsEval, i);
           while(it && it.index()<i)
             ++it;
           if(!(Mode & UnitDiag))
@@ -132,11 +139,14 @@
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=lhs.cols()-1; i>=0; --i)
+      for(Index i=lhs.cols()-1; i>=0; --i)
       {
         Scalar& tmp = other.coeffRef(i,col);
         if (tmp!=Scalar(0)) // optimization when other is actually sparse
@@ -144,13 +154,13 @@
           if(!(Mode & UnitDiag))
           {
             // TODO replace this by a binary search. make sure the binary search is safe for partially sorted elements
-            typename Lhs::ReverseInnerIterator it(lhs, i);
+            LhsIterator it(lhsEval, i);
             while(it && it.index()!=i)
-              --it;
+              ++it;
             eigen_assert(it && it.index()==i);
             other.coeffRef(i,col) /= it.value();
           }
-          typename Lhs::InnerIterator it(lhs, i);
+          LhsIterator it(lhsEval, i);
           for(; it && it.index()<i; ++it)
             other.coeffRef(it.index(), col) -= tmp * it.value();
         }
@@ -161,11 +171,13 @@
 
 } // end namespace internal
 
-template<typename ExpressionType,int Mode>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDerived>& other) const
+void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(MatrixBase<OtherDerived>& other) const
 {
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
@@ -174,21 +186,12 @@
     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(m_matrix, otherCopy);
+  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(derived().nestedExpression(), otherCopy);
 
   if (copy)
     other = otherCopy;
 }
-
-template<typename ExpressionType,int Mode>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseTriangularView<ExpressionType,Mode>::solve(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  solveInPlace(res);
-  return res;
-}
+#endif
 
 // pure sparse path
 
@@ -208,18 +211,18 @@
 struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
   static void run(const Lhs& lhs, Rhs& other)
   {
     const bool IsLower = (UpLo==Lower);
-    AmbiVector<Scalar,Index> tempVector(other.rows()*2);
+    AmbiVector<Scalar,StorageIndex> tempVector(other.rows()*2);
     tempVector.setBounds(0,other.rows());
 
     Rhs res(other.rows(), other.cols());
     res.reserve(other.nonZeros());
 
-    for(int col=0 ; col<other.cols() ; ++col)
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
       // FIXME estimate number of non zeros
       tempVector.init(.99/*float(other.col(col).nonZeros())/float(other.rows())*/);
@@ -230,7 +233,7 @@
         tempVector.coeffRef(rhsIt.index()) = rhsIt.value();
       }
 
-      for(int i=IsLower?0:lhs.cols()-1;
+      for(Index i=IsLower?0:lhs.cols()-1;
           IsLower?i<lhs.cols():i>=0;
           i+=IsLower?1:-1)
       {
@@ -267,9 +270,9 @@
       }
 
 
-      int count = 0;
+      Index count = 0;
       // FIXME compute a reference value to filter zeros
-      for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector/*,1e-12*/); it; ++it)
+      for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector/*,1e-12*/); it; ++it)
       {
         ++ count;
 //         std::cerr << "fill " << it.index() << ", " << col << "\n";
@@ -286,11 +289,12 @@
 
 } // end namespace internal
 
-template<typename ExpressionType,int Mode>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
+void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
 {
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert( (!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
 //   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
@@ -299,35 +303,12 @@
 //     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
 //   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(m_matrix, other.derived());
+  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(derived().nestedExpression(), other.derived());
 
 //   if (copy)
 //     other = otherCopy;
 }
-
-#ifdef EIGEN2_SUPPORT
-
-// deprecated stuff:
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-void SparseMatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
-{
-  this->template triangular<Flags&(Upper|Lower)>().solveInPlace(other);
-}
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseMatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  derived().solveTriangularInPlace(res);
-  return res;
-}
-#endif // EIGEN2_SUPPORT
+#endif
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index 7a9aeec..0c8d893 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,10 +14,67 @@
 
 namespace Eigen {
 
-template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::Index> > class SparseLU;
+template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::StorageIndex> > class SparseLU;
 template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
 template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
 
+template <bool Conjugate,class SparseLUType>
+class SparseLUTransposeView : public SparseSolverBase<SparseLUTransposeView<Conjugate,SparseLUType> >
+{
+protected:
+  typedef SparseSolverBase<SparseLUTransposeView<Conjugate,SparseLUType> > APIBase;
+  using APIBase::m_isInitialized;
+public:
+  typedef typename SparseLUType::Scalar Scalar;
+  typedef typename SparseLUType::StorageIndex StorageIndex;
+  typedef typename SparseLUType::MatrixType MatrixType;
+  typedef typename SparseLUType::OrderingType OrderingType;
+
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  SparseLUTransposeView() : m_sparseLU(NULL) {}
+  SparseLUTransposeView(const SparseLUTransposeView& view) {
+    this->m_sparseLU = view.m_sparseLU;
+  }
+  void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;}
+  void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;}
+  using APIBase::_solve_impl;
+  template<typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
+  {
+    Dest& X(X_base.derived());
+    eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first");
+    EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
+                        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+
+
+    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    for(Index j = 0; j < B.cols(); ++j){
+      X.col(j) = m_sparseLU->colsPermutation() * B.const_cast_derived().col(j);
+    }
+    //Forward substitution with transposed or adjoint of U
+    m_sparseLU->matrixU().template solveTransposedInPlace<Conjugate>(X);
+
+    //Backward substitution with transposed or adjoint of L
+    m_sparseLU->matrixL().template solveTransposedInPlace<Conjugate>(X);
+
+    // Permute back the solution
+    for (Index j = 0; j < B.cols(); ++j)
+      X.col(j) = m_sparseLU->rowsPermutation().transpose() * X.col(j);
+    return true;
+  }
+  inline Index rows() const { return m_sparseLU->rows(); }
+  inline Index cols() const { return m_sparseLU->cols(); }
+
+private:
+  SparseLUType *m_sparseLU;
+  SparseLUTransposeView& operator=(const SparseLUTransposeView&);
+};
+
+
 /** \ingroup SparseLU_Module
   * \class SparseLU
   * 
@@ -26,7 +83,7 @@
   * This class implements the supernodal LU factorization for general matrices.
   * It uses the main techniques from the sequential SuperLU package 
   * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real 
-  * and complex arithmetics with single and double precision, depending on the 
+  * and complex arithmetic with single and double precision, depending on the 
   * scalar type of your input matrix. 
   * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. 
   * It benefits directly from the built-in high-performant Eigen BLAS routines. 
@@ -43,8 +100,8 @@
   * Simple example with key steps 
   * \code
   * VectorXd x(n), b(n);
-  * SparseMatrix<double, ColMajor> A;
-  * SparseLU<SparseMatrix<scalar, ColMajor>, COLAMDOrdering<Index> >   solver;
+  * SparseMatrix<double> A;
+  * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> >   solver;
   * // fill A and b;
   * // Compute the ordering permutation vector from the structural pattern of A
   * solver.analyzePattern(A); 
@@ -64,33 +121,46 @@
   * 
   * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<>
   * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD
+  *
+  * \implsparsesolverconcept
   * 
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
+  * \sa \ref TutorialSparseSolverConcept
   * \sa \ref OrderingMethods_Module
   */
 template <typename _MatrixType, typename _OrderingType>
-class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::Index>
+class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >, public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::StorageIndex>
 {
+  protected:
+    typedef SparseSolverBase<SparseLU<_MatrixType,_OrderingType> > APIBase;
+    using APIBase::m_isInitialized;
   public:
+    using APIBase::_solve_impl;
+    
     typedef _MatrixType MatrixType; 
     typedef _OrderingType OrderingType;
     typedef typename MatrixType::Scalar Scalar; 
     typedef typename MatrixType::RealScalar RealScalar; 
-    typedef typename MatrixType::Index Index; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> NCMatrix;
-    typedef internal::MappedSuperNodalMatrix<Scalar, Index> SCMatrix; 
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> NCMatrix;
+    typedef internal::MappedSuperNodalMatrix<Scalar, StorageIndex> SCMatrix;
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    typedef internal::SparseLUImpl<Scalar, Index> Base;
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+    typedef internal::SparseLUImpl<Scalar, StorageIndex> Base;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
     
   public:
-    SparseLU():m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+
+    SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
     }
-    SparseLU(const MatrixType& matrix):m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    explicit SparseLU(const MatrixType& matrix)
+      : m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
       compute(matrix);
@@ -116,6 +186,45 @@
       //Factorize
       factorize(matrix);
     } 
+
+    /** \returns an expression of the transposed of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code
+      * solver.compute(A);
+      * x = solver.transpose().solve(b);
+      * \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    const SparseLUTransposeView<false,SparseLU<_MatrixType,_OrderingType> > transpose()
+    {
+      SparseLUTransposeView<false,  SparseLU<_MatrixType,_OrderingType> > transposeView;
+      transposeView.setSparseLU(this);
+      transposeView.setIsInitialized(this->m_isInitialized);
+      return transposeView;
+    }
+
+
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code
+      * solver.compute(A);
+      * x = solver.adjoint().solve(b);
+      * \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    const SparseLUTransposeView<true, SparseLU<_MatrixType,_OrderingType> > adjoint()
+    {
+      SparseLUTransposeView<true,  SparseLU<_MatrixType,_OrderingType> > adjointView;
+      adjointView.setSparseLU(this);
+      adjointView.setIsInitialized(this->m_isInitialized);
+      return adjointView;
+    }
     
     inline Index rows() const { return m_mat.rows(); }
     inline Index cols() const { return m_mat.cols(); }
@@ -141,9 +250,9 @@
       * y = b; matrixU().solveInPlace(y);
       * \endcode
       */
-    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,Index> > matrixU() const
+    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,StorageIndex> > matrixU() const
     {
-      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,Index> >(m_Lstore, m_Ustore);
+      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,StorageIndex> >(m_Lstore, m_Ustore);
     }
 
     /**
@@ -168,6 +277,7 @@
       m_diagpivotthresh = thresh; 
     }
 
+#ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
       *
       * \warning the destination matrix X in X = this->solve(B) must be colmun-major.
@@ -175,30 +285,12 @@
       * \sa compute()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
-
-    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseLU, Rhs> solve(const SparseMatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::sparse_solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
+    inline const Solve<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const;
+#endif // EIGEN_PARSED_BY_DOXYGEN
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
       *          \c InvalidInput if the input matrix is invalid
       *
@@ -219,7 +311,7 @@
     }
 
     template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
+    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
     {
       Dest& X(X_base.derived());
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first");
@@ -261,14 +353,13 @@
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
       // Initialize with the determinant of the row matrix
       Scalar det = Scalar(1.);
-      //Note that the diagonal blocks of U are stored in supernodes,
+      // Note that the diagonal blocks of U are stored in supernodes,
       // which are available in the  L part :)
       for (Index j = 0; j < this->cols(); ++j)
       {
         for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
         {
-          if(it.row() < j) continue;
-          if(it.row() == j)
+          if(it.index() == j)
           {
             det *= abs(it.value());
             break;
@@ -315,14 +406,60 @@
     Scalar signDeterminant()
     {
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-      return Scalar(m_detPermR);
+      // Initialize with the determinant of the row matrix
+      Index det = 1;
+      // Note that the diagonal blocks of U are stored in supernodes,
+      // which are available in the  L part :)
+      for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.index() == j)
+          {
+            if(it.value()<0)
+              det = -det;
+            else if(it.value()==0)
+              return 0;
+            break;
+          }
+        }
+      }
+      return det * m_detPermR * m_detPermC;
     }
+    
+    /** \returns The determinant of the matrix.
+      *
+      * \sa absDeterminant(), logAbsDeterminant()
+      */
+    Scalar determinant()
+    {
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+      // Initialize with the determinant of the row matrix
+      Scalar det = Scalar(1.);
+      // Note that the diagonal blocks of U are stored in supernodes,
+      // which are available in the  L part :)
+      for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.index() == j)
+          {
+            det *= it.value();
+            break;
+          }
+        }
+      }
+      return (m_detPermR * m_detPermC) > 0 ? det : -det;
+    }
+
+    Index nnzL() const { return m_nnzL; };
+    Index nnzU() const { return m_nnzU; };
 
   protected:
     // Functions 
     void initperfvalues()
     {
-      m_perfv.panel_size = 1;
+      m_perfv.panel_size = 16;
       m_perfv.relax = 1; 
       m_perfv.maxsuper = 128; 
       m_perfv.rowblk = 16; 
@@ -332,13 +469,12 @@
       
     // Variables 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     bool m_factorizationIsOk;
     bool m_analysisIsOk;
     std::string m_lastError;
     NCMatrix m_mat; // The input (permuted ) matrix 
     SCMatrix m_Lstore; // The lower triangular matrix (supernodal)
-    MappedSparseMatrix<Scalar,ColMajor,Index> m_Ustore; // The upper triangular matrix
+    MappedSparseMatrix<Scalar,ColMajor,StorageIndex> m_Ustore; // The upper triangular matrix
     PermutationType m_perm_c; // Column permutation 
     PermutationType m_perm_r ; // Row permutation
     IndexVector m_etree; // Column elimination tree 
@@ -348,14 +484,13 @@
     // SparseLU options 
     bool m_symmetricmode;
     // values for performance 
-    internal::perfvalues<Index> m_perfv; 
+    internal::perfvalues m_perfv;
     RealScalar m_diagpivotthresh; // Specifies the threshold used for a diagonal entry to be an acceptable pivot
-    Index m_nnzL, m_nnzU; // Nonzeros in L and U factors 
-    Index m_detPermR; // Determinant of the coefficient matrix
+    Index m_nnzL, m_nnzU; // Nonzeros in L and U factors
+    Index m_detPermR, m_detPermC; // Determinants of the permutation matrices
   private:
     // Disable copy constructor 
     SparseLU (const SparseLU& );
-  
 }; // End class SparseLU
 
 
@@ -377,30 +512,32 @@
   
   //TODO  It is possible as in SuperLU to compute row and columns scaling vectors to equilibrate the matrix mat.
   
+  // Firstly, copy the whole input matrix. 
+  m_mat = mat;
+  
+  // Compute fill-in ordering
   OrderingType ord; 
-  ord(mat,m_perm_c);
+  ord(m_mat,m_perm_c);
   
   // Apply the permutation to the column of the input  matrix
-  //First copy the whole input matrix. 
-  m_mat = mat;
-  if (m_perm_c.size()) {
+  if (m_perm_c.size())
+  {
     m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This vector is filled but not subsequently used.  
-    //Then, permute only the column pointers
-    const Index * outerIndexPtr;
-    if (mat.isCompressed()) outerIndexPtr = mat.outerIndexPtr();
-    else
-    {
-      Index *outerIndexPtr_t = new Index[mat.cols()+1];
-      for(Index i = 0; i <= mat.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
-      outerIndexPtr = outerIndexPtr_t;
-    }
+    // Then, permute only the column pointers
+    ei_declare_aligned_stack_constructed_variable(StorageIndex,outerIndexPtr,mat.cols()+1,mat.isCompressed()?const_cast<StorageIndex*>(mat.outerIndexPtr()):0);
+    
+    // If the input matrix 'mat' is uncompressed, then the outer-indices do not match the ones of m_mat, and a copy is thus needed.
+    if(!mat.isCompressed()) 
+      IndexVector::Map(outerIndexPtr, mat.cols()+1) = IndexVector::Map(m_mat.outerIndexPtr(),mat.cols()+1);
+    
+    // Apply the permutation and compute the nnz per column.
     for (Index i = 0; i < mat.cols(); i++)
     {
       m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
       m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
     }
-    if(!mat.isCompressed()) delete[] outerIndexPtr;
   }
+  
   // Compute the column elimination tree of the permuted matrix 
   IndexVector firstRowElt;
   internal::coletree(m_mat, m_etree,firstRowElt); 
@@ -409,7 +546,7 @@
   if (!m_symmetricmode) {
     IndexVector post, iwork; 
     // Post order etree
-    internal::treePostorder(m_mat.cols(), m_etree, post); 
+    internal::treePostorder(StorageIndex(m_mat.cols()), m_etree, post); 
       
    
     // Renumber etree in postorder 
@@ -461,8 +598,7 @@
   eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
   eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices");
   
-  typedef typename IndexVector::Scalar Index; 
-  
+  m_isInitialized = true;
   
   // Apply the column permutation computed in analyzepattern()
   //   m_mat = matrix * m_perm_c.inverse(); 
@@ -471,11 +607,11 @@
   {
     m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers.
     //Then, permute only the column pointers
-    const Index * outerIndexPtr;
+    const StorageIndex * outerIndexPtr;
     if (matrix.isCompressed()) outerIndexPtr = matrix.outerIndexPtr();
     else
     {
-      Index* outerIndexPtr_t = new Index[matrix.cols()+1];
+      StorageIndex* outerIndexPtr_t = new StorageIndex[matrix.cols()+1];
       for(Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
       outerIndexPtr = outerIndexPtr_t;
     }
@@ -489,7 +625,7 @@
   else 
   { //FIXME This should not be needed if the empty permutation is handled transparently
     m_perm_c.resize(matrix.cols());
-    for(Index i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
+    for(StorageIndex i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
   }
   
   Index m = m_mat.rows();
@@ -547,7 +683,6 @@
   //  (a) a relaxed supernode at the bottom of the etree, or
   //  (b) panel_size contiguous columns, <panel_size> defined by the user
   Index jcol; 
-  IndexVector panel_histo(n);
   Index pivrow; // Pivotal row number in the original row matrix
   Index nseg1; // Number of segments in U-column above panel row jcol
   Index nseg; // Number of segments in each U-column 
@@ -627,7 +762,8 @@
       }
       
       // Update the determinant of the row permutation matrix
-      if (pivrow != jj) m_detPermR *= -1;
+      // FIXME: the following test is not correct, we should probably take iperm_c into account and pivrow is not directly the row pivot.
+      if (pivrow != jj) m_detPermR = -m_detPermR;
 
       // Prune columns (0:jj-1) using column jj
       Base::pruneL(jj, m_perm_r.indices(), pivrow, nseg, segrep, repfnz_k, xprune, m_glu); 
@@ -642,15 +778,18 @@
     jcol += panel_size;  // Move to the next panel
   } // end for -- end elimination 
   
+  m_detPermR = m_perm_r.determinant();
+  m_detPermC = m_perm_c.determinant();
+  
   // Count the number of nonzeros in factors 
   Base::countnz(n, m_nnzL, m_nnzU, m_glu); 
   // Apply permutation  to the L subscripts 
-  Base::fixupL(n, m_perm_r.indices(), m_glu); 
+  Base::fixupL(n, m_perm_r.indices(), m_glu);
   
   // Create supernode matrix L 
   m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); 
   // Create the column major upper sparse matrix  U; 
-  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, Index> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); 
+  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, StorageIndex> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() );
   
   m_info = Success;
   m_factorizationIsOk = true;
@@ -659,35 +798,39 @@
 template<typename MappedSupernodalType>
 struct SparseLUMatrixLReturnType : internal::no_assignment_operator
 {
-  typedef typename MappedSupernodalType::Index Index;
   typedef typename MappedSupernodalType::Scalar Scalar;
-  SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
+  explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
   { }
-  Index rows() { return m_mapL.rows(); }
-  Index cols() { return m_mapL.cols(); }
+  Index rows() const { return m_mapL.rows(); }
+  Index cols() const { return m_mapL.cols(); }
   template<typename Dest>
   void solveInPlace( MatrixBase<Dest> &X) const
   {
     m_mapL.solveInPlace(X);
   }
+  template<bool Conjugate, typename Dest>
+  void solveTransposedInPlace( MatrixBase<Dest> &X) const
+  {
+    m_mapL.template solveTransposedInPlace<Conjugate>(X);
+  }
+
   const MappedSupernodalType& m_mapL;
 };
 
 template<typename MatrixLType, typename MatrixUType>
 struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 {
-  typedef typename MatrixLType::Index Index;
   typedef typename MatrixLType::Scalar Scalar;
   SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
   : m_mapL(mapL),m_mapU(mapU)
   { }
-  Index rows() { return m_mapL.rows(); }
-  Index cols() { return m_mapL.cols(); }
+  Index rows() const { return m_mapL.rows(); }
+  Index cols() const { return m_mapL.cols(); }
 
   template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
   {
     Index nrhs = X.cols();
-    Index n = X.rows();
+    Index n    = X.rows();
     // Backward solve with U
     for (Index k = m_mapL.nsuper(); k >= 0; k--)
     {
@@ -705,8 +848,9 @@
       }
       else
       {
-        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        // FIXME: the following lines should use Block expressions and not Map!
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<Upper>().solve(U);
       }
 
@@ -724,39 +868,56 @@
       }
     } // End For U-solve
   }
+
+  template<bool Conjugate, typename Dest>   void solveTransposedInPlace(MatrixBase<Dest> &X) const
+  {
+    using numext::conj;
+    Index nrhs = X.cols();
+    Index n    = X.rows();
+    // Forward solve with U
+    for (Index k = 0; k <=  m_mapL.nsuper(); k++)
+    {
+      Index fsupc = m_mapL.supToCol()[k];
+      Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension
+      Index nsupc = m_mapL.supToCol()[k+1] - fsupc;
+      Index luptr = m_mapL.colIndexPtr()[fsupc];
+
+      for (Index j = 0; j < nrhs; ++j)
+      {
+        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++)
+        {
+          typename MatrixUType::InnerIterator it(m_mapU, jcol);
+          for ( ; it; ++it)
+          {
+            Index irow = it.index();
+            X(jcol, j) -= X(irow, j) * (Conjugate? conj(it.value()): it.value());
+          }
+        }
+      }
+      if (nsupc == 1)
+      {
+        for (Index j = 0; j < nrhs; j++)
+        {
+          X(fsupc, j) /= (Conjugate? conj(m_mapL.valuePtr()[luptr]) : m_mapL.valuePtr()[luptr]);
+        }
+      }
+      else
+      {
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        if(Conjugate)
+          U = A.adjoint().template triangularView<Lower>().solve(U);
+        else
+          U = A.transpose().template triangularView<Lower>().solve(U);
+      }
+    }// End For U-solve
+  }
+
+
   const MatrixLType& m_mapL;
   const MatrixUType& m_mapU;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 } // End namespace Eigen 
 
 #endif

diff --git a/Eigen/src/SparseLU/SparseLUImpl.h b/Eigen/src/SparseLU/SparseLUImpl.h
index 14d7089..fc0cfc4 100644
--- a/Eigen/src/SparseLU/SparseLUImpl.h
+++ b/Eigen/src/SparseLU/SparseLUImpl.h

@@ -16,17 +16,19 @@
   * \class SparseLUImpl
   * Base class for sparseLU
   */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 class SparseLUImpl
 {
   public:
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector; 
+    typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> ScalarMatrix;
+    typedef Map<ScalarMatrix, 0,  OuterStride<> > MappedMatrixBlock;
     typedef typename ScalarVector::RealScalar RealScalar; 
     typedef Ref<Matrix<Scalar,Dynamic,1> > BlockScalarVector;
-    typedef Ref<Matrix<Index,Dynamic,1> > BlockIndexVector;
+    typedef Ref<Matrix<StorageIndex,Dynamic,1> > BlockIndexVector;
     typedef LU_GlobalLU_t<IndexVector, ScalarVector> GlobalLU_t; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> MatrixType; 
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> MatrixType; 
     
   protected:
      template <typename VectorType>
@@ -40,7 +42,7 @@
      Index snode_bmod (const Index jcol, const Index fsupc, ScalarVector& dense, GlobalLU_t& glu);
      Index pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu);
      template <typename Traits>
-     void dfs_kernel(const Index jj, IndexVector& perm_r,
+     void dfs_kernel(const StorageIndex jj, IndexVector& perm_r,
                     Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
                     Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
                     IndexVector& xplore, GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits);

diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h
index 1ffa7d5..349bfd5 100644
--- a/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/Eigen/src/SparseLU/SparseLU_Memory.h

@@ -36,13 +36,12 @@
   
 enum { LUNoMarker = 3 };
 enum {emptyIdxLU = -1};
-template<typename Index>
 inline Index LUnumTempV(Index& m, Index& w, Index& t, Index& b)
 {
   return (std::max)(m, (t+b)*w);
 }
 
-template< typename Scalar, typename Index>
+template< typename Scalar>
 inline Index LUTempSpace(Index&m, Index& w)
 {
   return (2*w + 4 + LUNoMarker) * m * sizeof(Index) + (w + 1) * m * sizeof(Scalar);
@@ -52,16 +51,16 @@
 
 
 /** 
-  * Expand the existing storage to accomodate more fill-ins
+  * Expand the existing storage to accommodate more fill-ins
   * \param vec Valid pointer to the vector to allocate or expand
   * \param[in,out] length  At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
   * \param[in] nbElts Current number of elements in the factors
   * \param keep_prev  1: use length  and do not expand the vector; 0: compute new_len and expand
   * \param[in,out] num_expansions Number of times the memory has been expanded
   */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
+Index  SparseLUImpl<Scalar,StorageIndex>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
 {
   
   float alpha = 1.5; // Ratio of the memory increase 
@@ -148,13 +147,13 @@
  * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated memory when allocation failed, and 0 on success
  * \note Unlike SuperLU, this routine does not support successive factorization with the same pattern and the same row permutation
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
 {
   Index& num_expansions = glu.num_expansions; //No memory expansions so far
   num_expansions = 0;
-  glu.nzumax = glu.nzlumax = (std::min)(fillratio * annz / n, m) * n; // estimated number of nonzeros in U 
-  glu.nzlmax = (std::max)(Index(4), fillratio) * annz / 4; // estimated  nnz in L factor
+  glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz+1) / n, m) * n; // estimated number of nonzeros in U 
+  glu.nzlmax = (std::max)(Index(4), fillratio) * (annz+1) / 4; // estimated  nnz in L factor
   // Return the estimated size to the user if necessary
   Index tempSpace;
   tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
@@ -205,9 +204,9 @@
  * \param num_expansions Number of expansions 
  * \return 0 on success, > 0 size of the memory allocated so far
  */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index SparseLUImpl<Scalar,Index>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
+Index SparseLUImpl<Scalar,StorageIndex>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
 {
   Index failed_size; 
   if (memtype == USUB)

diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h
index 24d6bf1..cf5ec44 100644
--- a/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/Eigen/src/SparseLU/SparseLU_Structs.h

@@ -75,7 +75,7 @@
 
 template <typename IndexVector, typename ScalarVector>
 struct LU_GlobalLU_t {
-  typedef typename IndexVector::Scalar Index; 
+  typedef typename IndexVector::Scalar StorageIndex; 
   IndexVector xsup; //First supernode column ... xsup(s) points to the beginning of the s-th supernode
   IndexVector supno; // Supernode number corresponding to this column (column to supernode mapping)
   ScalarVector  lusup; // nonzero values of L ordered by columns 
@@ -93,7 +93,6 @@
 };
 
 // Values to set for performance
-template <typename Index>
 struct perfvalues {
   Index panel_size; // a panel consists of at most <panel_size> consecutive columns
   Index relax; // To control degree of relaxing supernodes. If the number of nodes (columns) 

diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index ad6f218..0be293d 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h

@@ -29,20 +29,20 @@
  * SuperInnerIterator to iterate through all supernodes 
  * Function for triangular solve
  */
-template <typename _Scalar, typename _Index>
+template <typename _Scalar, typename _StorageIndex>
 class MappedSuperNodalMatrix
 {
   public:
     typedef _Scalar Scalar; 
-    typedef _Index Index;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef _StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
   public:
     MappedSuperNodalMatrix()
     {
       
     }
-    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
              IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
     {
       setInfos(m, n, nzval, nzval_colptr, rowind, rowind_colptr, col_to_sup, sup_to_col);
@@ -58,7 +58,7 @@
      * FIXME This class will be modified such that it can be use in the course 
      * of the factorization.
      */
-    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
              IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
     {
       m_row = m;
@@ -75,12 +75,12 @@
     /**
      * Number of rows
      */
-    Index rows() { return m_row; }
+    Index rows() const { return m_row; }
     
     /**
      * Number of columns
      */
-    Index cols() { return m_col; }
+    Index cols() const { return m_col; }
     
     /**
      * Return the array of nonzero values packed by column
@@ -96,12 +96,12 @@
     /**
      * Return the pointers to the beginning of each column in \ref valuePtr()
      */
-    Index* colIndexPtr()
+    StorageIndex* colIndexPtr()
     {
       return m_nzval_colptr; 
     }
     
-    const Index* colIndexPtr() const
+    const StorageIndex* colIndexPtr() const
     {
       return m_nzval_colptr; 
     }
@@ -109,9 +109,9 @@
     /**
      * Return the array of compressed row indices of all supernodes
      */
-    Index* rowIndex()  { return m_rowind; }
+    StorageIndex* rowIndex()  { return m_rowind; }
     
-    const Index* rowIndex() const
+    const StorageIndex* rowIndex() const
     {
       return m_rowind; 
     }
@@ -119,9 +119,9 @@
     /**
      * Return the location in \em rowvaluePtr() which starts each column
      */
-    Index* rowIndexPtr() { return m_rowind_colptr; }
+    StorageIndex* rowIndexPtr() { return m_rowind_colptr; }
     
-    const Index* rowIndexPtr() const 
+    const StorageIndex* rowIndexPtr() const
     {
       return m_rowind_colptr; 
     }
@@ -129,18 +129,18 @@
     /** 
      * Return the array of column-to-supernode mapping 
      */
-    Index* colToSup()  { return m_col_to_sup; }
+    StorageIndex* colToSup()  { return m_col_to_sup; }
     
-    const Index* colToSup() const
+    const StorageIndex* colToSup() const
     {
       return m_col_to_sup;       
     }
     /**
      * Return the array of supernode-to-column mapping
      */
-    Index* supToCol() { return m_sup_to_col; }
+    StorageIndex* supToCol() { return m_sup_to_col; }
     
-    const Index* supToCol() const 
+    const StorageIndex* supToCol() const
     {
       return m_sup_to_col;
     }
@@ -148,7 +148,7 @@
     /**
      * Return the number of supernodes
      */
-    Index nsuper() const 
+    Index nsuper() const
     {
       return m_nsuper; 
     }
@@ -156,20 +156,23 @@
     class InnerIterator; 
     template<typename Dest>
     void solveInPlace( MatrixBase<Dest>&X) const;
+    template<bool Conjugate, typename Dest>
+    void solveTransposedInPlace( MatrixBase<Dest>&X) const;
+
     
       
       
     
   protected:
     Index m_row; // Number of rows
-    Index m_col; // Number of columns 
-    Index m_nsuper; // Number of supernodes 
+    Index m_col; // Number of columns
+    Index m_nsuper; // Number of supernodes
     Scalar* m_nzval; //array of nonzero values packed by column
-    Index* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j 
-    Index* m_rowind; // Array of compressed row indices of rectangular supernodes
-    Index* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
-    Index* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
-    Index* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
+    StorageIndex* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j
+    StorageIndex* m_rowind; // Array of compressed row indices of rectangular supernodes
+    StorageIndex* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
+    StorageIndex* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
+    StorageIndex* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
     
   private :
 };
@@ -178,19 +181,19 @@
   * \brief InnerIterator class to iterate over nonzero values of the current column in the supernodal matrix L
   * 
   */
-template<typename Scalar, typename Index>
-class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
+template<typename Scalar, typename StorageIndex>
+class MappedSuperNodalMatrix<Scalar,StorageIndex>::InnerIterator
 {
   public:
      InnerIterator(const MappedSuperNodalMatrix& mat, Index outer)
       : m_matrix(mat),
-        m_outer(outer), 
+        m_outer(outer),
         m_supno(mat.colToSup()[outer]),
         m_idval(mat.colIndexPtr()[outer]),
         m_startidval(m_idval),
         m_endidval(mat.colIndexPtr()[outer+1]),
-        m_idrow(mat.rowIndexPtr()[outer]),
-        m_endidrow(mat.rowIndexPtr()[outer+1])
+        m_idrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]]),
+        m_endidrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]+1])
     {}
     inline InnerIterator& operator++()
     { 
@@ -229,14 +232,17 @@
  * \brief Solve with the supernode triangular matrix
  * 
  */
-template<typename Scalar, typename Index>
+template<typename Scalar, typename Index_>
 template<typename Dest>
-void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) const
+void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) const
 {
-    Index n = X.rows(); 
-    Index nrhs = X.cols(); 
+    /* Explicit type conversion as the Index type of MatrixBase<Dest> may be wider than Index */
+//    eigen_assert(X.rows() <= NumTraits<Index>::highest());
+//    eigen_assert(X.cols() <= NumTraits<Index>::highest());
+    Index n    = int(X.rows());
+    Index nrhs = Index(X.cols());
     const Scalar * Lval = valuePtr();                 // Nonzero values 
-    Matrix<Scalar,Dynamic,Dynamic> work(n, nrhs);     // working vector
+    Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);     // working vector
     work.setZero();
     for (Index k = 0; k <= nsuper(); k ++)
     {
@@ -267,13 +273,13 @@
         Index lda = colIndexPtr()[fsupc+1] - luptr;
         
         // Triangular solve 
-        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
-        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<UnitLower>().solve(U); 
         
         // Matrix-vector product 
-        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
-        work.block(0, 0, nrow, nrhs) = A * U; 
+        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
+        work.topRows(nrow).noalias() = A * U;
         
         //Begin Scatter 
         for (Index j = 0; j < nrhs; j++)
@@ -291,6 +297,77 @@
     } 
 }
 
+template<typename Scalar, typename Index_>
+template<bool Conjugate, typename Dest>
+void MappedSuperNodalMatrix<Scalar,Index_>::solveTransposedInPlace( MatrixBase<Dest>&X) const
+{
+    using numext::conj;
+  Index n    = int(X.rows());
+  Index nrhs = Index(X.cols());
+  const Scalar * Lval = valuePtr();                 // Nonzero values
+  Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);     // working vector
+  work.setZero();
+  for (Index k = nsuper(); k >= 0; k--)
+  {
+    Index fsupc = supToCol()[k];                    // First column of the current supernode
+    Index istart = rowIndexPtr()[fsupc];            // Pointer index to the subscript of the current column
+    Index nsupr = rowIndexPtr()[fsupc+1] - istart;  // Number of rows in the current supernode
+    Index nsupc = supToCol()[k+1] - fsupc;          // Number of columns in the current supernode
+    Index nrow = nsupr - nsupc;                     // Number of rows in the non-diagonal part of the supernode
+    Index irow;                                     //Current index row
+
+    if (nsupc == 1 )
+    {
+      for (Index j = 0; j < nrhs; j++)
+      {
+        InnerIterator it(*this, fsupc);
+        ++it; // Skip the diagonal element
+        for (; it; ++it)
+        {
+          irow = it.row();
+          X(fsupc,j) -= X(irow, j) * (Conjugate?conj(it.value()):it.value());
+        }
+      }
+    }
+    else
+    {
+      // The supernode has more than one column
+      Index luptr = colIndexPtr()[fsupc];
+      Index lda = colIndexPtr()[fsupc+1] - luptr;
+
+      //Begin Gather
+      for (Index j = 0; j < nrhs; j++)
+      {
+        Index iptr = istart + nsupc;
+        for (Index i = 0; i < nrow; i++)
+        {
+          irow = rowIndex()[iptr];
+          work.topRows(nrow)(i,j)= X(irow,j); // Gather operation
+          iptr++;
+        }
+      }
+
+      // Matrix-vector product with transposed submatrix
+      Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
+      Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+      if(Conjugate)
+        U = U - A.adjoint() * work.topRows(nrow);
+      else
+        U = U - A.transpose() * work.topRows(nrow);
+
+      // Triangular solve (of transposed diagonal block)
+      new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+      if(Conjugate)
+        U = A.adjoint().template triangularView<UnitUpper>().solve(U);
+      else
+        U = A.transpose().template triangularView<UnitUpper>().solve(U);
+
+    }
+
+  }
+}
+
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/SparseLU/SparseLU_Utils.h b/Eigen/src/SparseLU/SparseLU_Utils.h
index 15352ac..9e3dab4 100644
--- a/Eigen/src/SparseLU/SparseLU_Utils.h
+++ b/Eigen/src/SparseLU/SparseLU_Utils.h

@@ -17,8 +17,8 @@
 /**
  * \brief Count Nonzero elements in the factors
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
 {
  nnzL = 0; 
  nnzU = (glu.xusub)(n); 
@@ -48,12 +48,12 @@
  * and applies permutation to the remaining subscripts
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
 {
   Index fsupc, i, j, k, jstart; 
   
-  Index nextl = 0; 
+  StorageIndex nextl = 0; 
   Index nsuper = (glu.supno)(n); 
   
   // For each supernode 

diff --git a/Eigen/src/SparseLU/SparseLU_column_bmod.h b/Eigen/src/SparseLU/SparseLU_column_bmod.h
index f24bd87..b57f068 100644
--- a/Eigen/src/SparseLU/SparseLU_column_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_column_bmod.h

@@ -49,8 +49,9 @@
  *         > 0 - number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv,
+                                                     BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
 {
   Index  jsupno, k, ksub, krep, ksupno; 
   Index lptr, nrow, isub, irow, nextlu, new_next, ufirst; 
@@ -137,7 +138,7 @@
     glu.lusup.segment(nextlu,offset).setZero();
     nextlu += offset;
   }
-  glu.xlusup(jcol + 1) = nextlu;  // close L\U(*,jcol); 
+  glu.xlusup(jcol + 1) = StorageIndex(nextlu);  // close L\U(*,jcol); 
   
   /* For more updates within the panel (also within the current supernode),
    * should start from the first column of the panel, or the first column
@@ -162,11 +163,11 @@
     // points to the beginning of jcol in snode L\U(jsupno) 
     ufirst = glu.xlusup(jcol) + d_fsupc; 
     Index lda = glu.xlusup(jcol+1) - glu.xlusup(jcol);
-    Map<Matrix<Scalar,Dynamic,Dynamic>, 0,  OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
+    MappedMatrixBlock A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
     VectorBlock<ScalarVector> u(glu.lusup, ufirst, nsupc); 
     u = A.template triangularView<UnitLower>().solve(u); 
     
-    new (&A) Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
+    new (&A) MappedMatrixBlock ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
     VectorBlock<ScalarVector> l(glu.lusup, ufirst+nsupc, nrow); 
     l.noalias() -= A * u;
     

diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h
index 4c04b0e..5a2c941 100644
--- a/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h

@@ -30,7 +30,7 @@
 #ifndef SPARSELU_COLUMN_DFS_H
 #define SPARSELU_COLUMN_DFS_H
 
-template <typename Scalar, typename Index> class SparseLUImpl;
+template <typename Scalar, typename StorageIndex> class SparseLUImpl;
 namespace Eigen {
 
 namespace internal {
@@ -39,8 +39,8 @@
 struct column_dfs_traits : no_assignment_operator
 {
   typedef typename ScalarVector::Scalar Scalar;
-  typedef typename IndexVector::Scalar Index;
-  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, Index>::GlobalLU_t& glu, SparseLUImpl<Scalar, Index>& luImpl)
+  typedef typename IndexVector::Scalar StorageIndex;
+  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& glu, SparseLUImpl<Scalar, StorageIndex>& luImpl)
    : m_jcol(jcol), m_jsuper_ref(jsuper), m_glu(glu), m_luImpl(luImpl)
  {}
   bool update_segrep(Index /*krep*/, Index /*jj*/)
@@ -57,8 +57,8 @@
   
   Index m_jcol;
   Index& m_jsuper_ref;
-  typename SparseLUImpl<Scalar, Index>::GlobalLU_t& m_glu;
-  SparseLUImpl<Scalar, Index>& m_luImpl;
+  typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& m_glu;
+  SparseLUImpl<Scalar, StorageIndex>& m_luImpl;
 };
 
 
@@ -89,8 +89,10 @@
  *         > 0 number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,
+                                                    BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune,
+                                                    IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
 {
   
   Index jsuper = glu.supno(jcol); 
@@ -110,13 +112,13 @@
     // krow was visited before, go to the next nonz; 
     if (kmark == jcol) continue;
     
-    dfs_kernel(jcol, perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
+    dfs_kernel(StorageIndex(jcol), perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
                    xplore, glu, nextl, krow, traits);
   } // for each nonzero ... 
   
-  Index fsupc, jptr, jm1ptr, ito, ifrom, istop;
-  Index nsuper = glu.supno(jcol);
-  Index jcolp1 = jcol + 1;
+  Index fsupc;
+  StorageIndex nsuper = glu.supno(jcol);
+  StorageIndex jcolp1 = StorageIndex(jcol) + 1;
   Index jcolm1 = jcol - 1;
   
   // check to see if j belongs in the same supernode as j-1
@@ -127,8 +129,8 @@
   else 
   {
     fsupc = glu.xsup(nsuper); 
-    jptr = glu.xlsub(jcol); // Not yet compressed
-    jm1ptr = glu.xlsub(jcolm1); 
+    StorageIndex jptr = glu.xlsub(jcol); // Not yet compressed
+    StorageIndex jm1ptr = glu.xlsub(jcolm1); 
     
     // Use supernodes of type T2 : see SuperLU paper
     if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = emptyIdxLU;
@@ -146,13 +148,13 @@
     { // starts a new supernode 
       if ( (fsupc < jcolm1-1) ) 
       { // >= 3 columns in nsuper
-        ito = glu.xlsub(fsupc+1);
+        StorageIndex ito = glu.xlsub(fsupc+1);
         glu.xlsub(jcolm1) = ito; 
-        istop = ito + jptr - jm1ptr; 
-        xprune(jcolm1) = istop; // intialize xprune(jcol-1)
+        StorageIndex istop = ito + jptr - jm1ptr; 
+        xprune(jcolm1) = istop; // initialize xprune(jcol-1)
         glu.xlsub(jcol) = istop; 
         
-        for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
+        for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
           glu.lsub(ito) = glu.lsub(ifrom); 
         nextl = ito;  // = istop + length(jcol)
       }
@@ -164,8 +166,8 @@
   // Tidy up the pointers before exit
   glu.xsup(nsuper+1) = jcolp1; 
   glu.supno(jcolp1) = nsuper; 
-  xprune(jcol) = nextl;  // Intialize upper bound for pruning
-  glu.xlsub(jcolp1) = nextl; 
+  xprune(jcol) = StorageIndex(nextl);  // Initialize upper bound for pruning
+  glu.xlsub(jcolp1) = StorageIndex(nextl); 
   
   return 0; 
 }

diff --git a/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h b/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
index 170610d..c32d8d8 100644
--- a/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
+++ b/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h

@@ -46,8 +46,9 @@
  *         > 0 - number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep,
+                                                      BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
 {  
   Index ksub, krep, ksupno; 
     
@@ -55,7 +56,7 @@
   
   // For each nonzero supernode segment of U[*,j] in topological order 
   Index k = nseg - 1, i; 
-  Index nextu = glu.xusub(jcol); 
+  StorageIndex nextu = glu.xusub(jcol); 
   Index kfnz, isub, segsize; 
   Index new_next,irow; 
   Index fsupc, mem; 

diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 9e4e3e7..e37c2fe 100644
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h

@@ -21,7 +21,7 @@
   *  - lda and ldc must be multiples of the respective packet size
   *  - C must have the same alignment as A
   */
-template<typename Scalar,typename Index>
+template<typename Scalar>
 EIGEN_DONT_INLINE
 void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
 {
@@ -39,9 +39,9 @@
   };
   Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
   Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
-  Index i0 = internal::first_aligned(A,m);
+  Index i0 = internal::first_default_aligned(A,m);
   
-  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
+  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
   
   // handle the non aligned rows of A and C without any optimization:
   for(Index i=0; i<i0; ++i)
@@ -72,14 +72,14 @@
         
         // load and expand a RN x RK block of B
         Packet b00, b10, b20, b30, b01, b11, b21, b31;
-                  b00 = pset1<Packet>(Bc0[0]);
-                  b10 = pset1<Packet>(Bc0[1]);
-        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-                  b01 = pset1<Packet>(Bc1[0]);
-                  b11 = pset1<Packet>(Bc1[1]);
-        if(RK==4) b21 = pset1<Packet>(Bc1[2]);
-        if(RK==4) b31 = pset1<Packet>(Bc1[3]);
+                  { b00 = pset1<Packet>(Bc0[0]); }
+                  { b10 = pset1<Packet>(Bc0[1]); }
+        if(RK==4) { b20 = pset1<Packet>(Bc0[2]); }
+        if(RK==4) { b30 = pset1<Packet>(Bc0[3]); }
+                  { b01 = pset1<Packet>(Bc1[0]); }
+                  { b11 = pset1<Packet>(Bc1[1]); }
+        if(RK==4) { b21 = pset1<Packet>(Bc1[2]); }
+        if(RK==4) { b31 = pset1<Packet>(Bc1[3]); }
         
         Packet a0, a1, a2, a3, c0, c1, t0, t1;
         
@@ -106,22 +106,22 @@
         
 #define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
 #define WORK(I)  \
-                    c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                    c1 = pload<Packet>(C1+i+(I)*PacketSize);   \
-                    KMADD(c0, a0, b00, t0)      \
-                    KMADD(c1, a0, b01, t1)      \
-                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                    KMADD(c0, a1, b10, t0)      \
-                    KMADD(c1, a1, b11, t1)       \
-                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a2, b20, t0)       \
-          if(RK==4) KMADD(c1, a2, b21, t1)       \
-          if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a3, b30, t0)       \
-          if(RK==4) KMADD(c1, a3, b31, t1)       \
-          if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                    pstore(C0+i+(I)*PacketSize, c0);           \
-                    pstore(C1+i+(I)*PacketSize, c1)
+                     c0 = pload<Packet>(C0+i+(I)*PacketSize);    \
+                     c1 = pload<Packet>(C1+i+(I)*PacketSize);    \
+                     KMADD(c0, a0, b00, t0)                      \
+                     KMADD(c1, a0, b01, t1)                      \
+                     a0 = pload<Packet>(A0+i+(I+1)*PacketSize);  \
+                     KMADD(c0, a1, b10, t0)                      \
+                     KMADD(c1, a1, b11, t1)                      \
+                     a1 = pload<Packet>(A1+i+(I+1)*PacketSize);  \
+          if(RK==4){ KMADD(c0, a2, b20, t0)                     }\
+          if(RK==4){ KMADD(c1, a2, b21, t1)                     }\
+          if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\
+          if(RK==4){ KMADD(c0, a3, b30, t0)                     }\
+          if(RK==4){ KMADD(c1, a3, b31, t1)                     }\
+          if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
+                     pstore(C0+i+(I)*PacketSize, c0);            \
+                     pstore(C1+i+(I)*PacketSize, c1)
         
         // process rows of A' - C' with aggressive vectorization and peeling 
         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
@@ -131,14 +131,15 @@
                     prefetch((A1+i+(5)*PacketSize));
           if(RK==4) prefetch((A2+i+(5)*PacketSize));
           if(RK==4) prefetch((A3+i+(5)*PacketSize));
-                    WORK(0);
-                    WORK(1);
-                    WORK(2);
-                    WORK(3);
-                    WORK(4);
-                    WORK(5);
-                    WORK(6);
-                    WORK(7);
+
+          WORK(0);
+          WORK(1);
+          WORK(2);
+          WORK(3);
+          WORK(4);
+          WORK(5);
+          WORK(6);
+          WORK(7);
         }
         // process the remaining rows with vectorization only
         for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
@@ -165,7 +166,7 @@
         Bc1 += RK;
       } // peeled loop on k
     } // peeled loop on the columns j
-    // process the last column (we now perform a matrux-vector product)
+    // process the last column (we now perform a matrix-vector product)
     if((n-n_end)>0)
     {
       const Scalar* Bc0 = B+(n-1)*ldb;
@@ -203,18 +204,18 @@
         }
         
 #define WORK(I) \
-                  c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                  KMADD(c0, a0, b00, t0)       \
-                  a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                  KMADD(c0, a1, b10, t0)       \
-                  a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a2, b20, t0)       \
-        if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a3, b30, t0)       \
-        if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                  pstore(C0+i+(I)*PacketSize, c0);
+                   c0 = pload<Packet>(C0+i+(I)*PacketSize);     \
+                   KMADD(c0, a0, b00, t0)                       \
+                   a0 = pload<Packet>(A0+i+(I+1)*PacketSize);   \
+                   KMADD(c0, a1, b10, t0)                       \
+                   a1 = pload<Packet>(A1+i+(I+1)*PacketSize);   \
+        if(RK==4){ KMADD(c0, a2, b20, t0)                      }\
+        if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize);  }\
+        if(RK==4){ KMADD(c0, a3, b30, t0)                      }\
+        if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\
+                   pstore(C0+i+(I)*PacketSize, c0);
         
-        // agressive vectorization and peeling
+        // aggressive vectorization and peeling
         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
         {
           EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");

diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
index 7a4e430..6f75d50 100644
--- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
+++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h

@@ -42,21 +42,20 @@
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
 {
   
   // The etree may not be postordered, but its heap ordered  
   IndexVector post;
-  internal::treePostorder(n, et, post); // Post order etree
+  internal::treePostorder(StorageIndex(n), et, post); // Post order etree
   IndexVector inv_post(n+1); 
-  Index i;
-  for (i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
+  for (StorageIndex i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
   
   // Renumber etree in postorder 
   IndexVector iwork(n);
   IndexVector et_save(n+1);
-  for (i = 0; i < n; ++i)
+  for (Index i = 0; i < n; ++i)
   {
     iwork(post(i)) = post(et(i));
   }
@@ -75,10 +74,10 @@
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
   Index snode_start; // beginning of a snode 
-  Index k;
+  StorageIndex k;
   Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree 
   Index nsuper_et = 0; // Number of relaxed snodes in the original etree 
-  Index l; 
+  StorageIndex l; 
   for (j = 0; j < n; )
   {
     parent = et(j);
@@ -90,8 +89,8 @@
     }
     // Found a supernode in postordered etree, j is the last column 
     ++nsuper_et_post;
-    k = n;
-    for (i = snode_start; i <= j; ++i)
+    k = StorageIndex(n);
+    for (Index i = snode_start; i <= j; ++i)
       k = (std::min)(k, inv_post(i));
     l = inv_post(j);
     if ( (l - k) == (j - snode_start) )  // Same number of columns in the snode
@@ -102,7 +101,7 @@
     }
     else 
     {
-      for (i = snode_start; i <= j; ++i) 
+      for (Index i = snode_start; i <= j; ++i) 
       {
         l = inv_post(i);
         if (descendants(i) == 0) 

diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
index 0d0283b..8c1b3e8 100644
--- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h

@@ -14,30 +14,29 @@
 namespace Eigen {
 namespace internal {
   
-/**
- * \brief Performs numeric block updates from a given supernode to a single column
- * 
- * \param segsize Size of the segment (and blocks ) to use for updates
- * \param[in,out] dense Packed values of the original matrix
- * \param tempv temporary vector to use for updates
- * \param lusup array containing the supernodes
- * \param lda Leading dimension in the supernode
- * \param nrow Number of rows in the rectangular part of the supernode
- * \param lsub compressed row subscripts of supernodes
- * \param lptr pointer to the first column of the current supernode in lsub
- * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
- * \return 0 on success
- */
 template <int SegSizeAtCompileTime> struct LU_kernel_bmod
 {
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+  /** \internal
+    * \brief Performs numeric block updates from a given supernode to a single column
+    *
+    * \param segsize Size of the segment (and blocks ) to use for updates
+    * \param[in,out] dense Packed values of the original matrix
+    * \param tempv temporary vector to use for updates
+    * \param lusup array containing the supernodes
+    * \param lda Leading dimension in the supernode
+    * \param nrow Number of rows in the rectangular part of the supernode
+    * \param lsub compressed row subscripts of supernodes
+    * \param lptr pointer to the first column of the current supernode in lsub
+    * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
+    */
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
                                     const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 template <int SegSizeAtCompileTime>
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
                                                                   const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
 {
   typedef typename ScalarVector::Scalar Scalar;
@@ -45,7 +44,7 @@
   // The result of triangular solve is in tempv[*]; 
     // The result of matric-vector update is in dense[*]
   Index isub = lptr + no_zeros; 
-  int i;
+  Index i;
   Index irow;
   for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
   {
@@ -56,7 +55,7 @@
   // Dense triangular solve -- start effective triangle
   luptr += lda * no_zeros + no_zeros; 
   // Form Eigen matrix and vector 
-  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
+  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
   Map<Matrix<Scalar,SegSizeAtCompileTime,1> > u(tempv.data(), segsize);
   
   u = A.template triangularView<UnitLower>().solve(u); 
@@ -65,9 +64,9 @@
   luptr += segsize;
   const Index PacketSize = internal::packet_traits<Scalar>::size;
   Index ldl = internal::first_multiple(nrow, PacketSize);
-  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
-  Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
-  Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
+  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
+  Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize);
+  Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
   Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
   
   l.setZero();
@@ -91,21 +90,22 @@
 
 template <> struct LU_kernel_bmod<1>
 {
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
                                     const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
                                               const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
 {
   typedef typename ScalarVector::Scalar Scalar;
+  typedef typename IndexVector::Scalar StorageIndex;
   Scalar f = dense(lsub(lptr + no_zeros));
   luptr += lda * no_zeros + no_zeros + 1;
   const Scalar* a(lusup.data() + luptr);
-  const /*typename IndexVector::Scalar*/Index*  irow(lsub.data()+lptr + no_zeros + 1);
+  const StorageIndex*  irow(lsub.data()+lptr + no_zeros + 1);
   Index i = 0;
   for (; i+1 < nrow; i+=2)
   {

diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index da0e0fc..f052001 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h

@@ -38,7 +38,7 @@
  * \brief Performs numeric block updates (sup-panel) in topological order.
  * 
  * Before entering this routine, the original nonzeros in the panel
- * were already copied i nto the spa[m,w]
+ * were already copied into the spa[m,w]
  * 
  * \param m number of rows in the matrix
  * \param w Panel size
@@ -52,8 +52,8 @@
  * 
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const Index jcol, 
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w, const Index jcol, 
                                             const Index nseg, ScalarVector& dense, ScalarVector& tempv,
                                             IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu)
 {
@@ -102,7 +102,7 @@
     if(nsupc >= 2)
     { 
       Index ldu = internal::first_multiple<Index>(u_rows, PacketSize);
-      Map<Matrix<Scalar,Dynamic,Dynamic>, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
+      Map<ScalarMatrix, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
       
       // gather U
       Index u_col = 0;
@@ -136,17 +136,17 @@
       Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc);
       no_zeros = (krep - u_rows + 1) - fsupc;
       luptr += lda * no_zeros + no_zeros;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
       U = A.template triangularView<UnitLower>().solve(U);
       
       // update
       luptr += u_rows;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
       eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
       
       Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
-      Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
+      Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
+      MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
       
       L.setZero();
       internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());

diff --git a/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/Eigen/src/SparseLU/SparseLU_panel_dfs.h
index dc0054e..155df73 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_dfs.h

@@ -37,11 +37,11 @@
 template<typename IndexVector>
 struct panel_dfs_traits
 {
-  typedef typename IndexVector::Scalar Index;
-  panel_dfs_traits(Index jcol, Index* marker)
+  typedef typename IndexVector::Scalar StorageIndex;
+  panel_dfs_traits(Index jcol, StorageIndex* marker)
     : m_jcol(jcol), m_marker(marker)
   {}
-  bool update_segrep(Index krep, Index jj)
+  bool update_segrep(Index krep, StorageIndex jj)
   {
     if(m_marker[krep]<m_jcol)
     {
@@ -53,13 +53,13 @@
   void mem_expand(IndexVector& /*glu.lsub*/, Index /*nextl*/, Index /*chmark*/) {}
   enum { ExpandMem = false };
   Index m_jcol;
-  Index* m_marker;
+  StorageIndex* m_marker;
 };
 
 
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename Traits>
-void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
+void SparseLUImpl<Scalar,StorageIndex>::dfs_kernel(const StorageIndex jj, IndexVector& perm_r,
                    Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
                    Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
                    IndexVector& xplore, GlobalLU_t& glu,
@@ -67,14 +67,14 @@
                   )
 {
   
-  Index kmark = marker(krow);
+  StorageIndex kmark = marker(krow);
       
   // For each unmarked krow of jj
   marker(krow) = jj; 
-  Index kperm = perm_r(krow); 
+  StorageIndex kperm = perm_r(krow); 
   if (kperm == emptyIdxLU ) {
     // krow is in L : place it in structure of L(*, jj)
-    panel_lsub(nextl_col++) = krow;  // krow is indexed into A
+    panel_lsub(nextl_col++) = StorageIndex(krow);  // krow is indexed into A
     
     traits.mem_expand(panel_lsub, nextl_col, kmark);
   }
@@ -83,9 +83,9 @@
     // krow is in U : if its supernode-representative krep
     // has been explored, update repfnz(*)
     // krep = supernode representative of the current row
-    Index krep = glu.xsup(glu.supno(kperm)+1) - 1; 
+    StorageIndex krep = glu.xsup(glu.supno(kperm)+1) - 1; 
     // First nonzero element in the current column:
-    Index myfnz = repfnz_col(krep); 
+    StorageIndex myfnz = repfnz_col(krep); 
     
     if (myfnz != emptyIdxLU )
     {
@@ -96,26 +96,26 @@
     else 
     {
       // Otherwise, perform dfs starting at krep
-      Index oldrep = emptyIdxLU; 
+      StorageIndex oldrep = emptyIdxLU; 
       parent(krep) = oldrep; 
       repfnz_col(krep) = kperm; 
-      Index xdfs =  glu.xlsub(krep); 
+      StorageIndex xdfs =  glu.xlsub(krep); 
       Index maxdfs = xprune(krep); 
       
-      Index kpar;
+      StorageIndex kpar;
       do 
       {
         // For each unmarked kchild of krep
         while (xdfs < maxdfs) 
         {
-          Index kchild = glu.lsub(xdfs); 
+          StorageIndex kchild = glu.lsub(xdfs); 
           xdfs++; 
-          Index chmark = marker(kchild); 
+          StorageIndex chmark = marker(kchild); 
           
           if (chmark != jj ) 
           {
             marker(kchild) = jj; 
-            Index chperm = perm_r(kchild); 
+            StorageIndex chperm = perm_r(kchild); 
             
             if (chperm == emptyIdxLU) 
             {
@@ -128,7 +128,7 @@
               // case kchild is in U :
               // chrep = its supernode-rep. If its rep has been explored, 
               // update its repfnz(*)
-              Index chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
+              StorageIndex chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
               myfnz = repfnz_col(chrep); 
               
               if (myfnz != emptyIdxLU) 
@@ -215,8 +215,8 @@
  * 
  */
 
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
 {
   Index nextl_col; // Next available position in panel_lsub[*,jj] 
   
@@ -227,7 +227,7 @@
   panel_dfs_traits<IndexVector> traits(jcol, marker1.data());
   
   // For each column in the panel 
-  for (Index jj = jcol; jj < jcol + w; jj++) 
+  for (StorageIndex jj = StorageIndex(jcol); jj < jcol + w; jj++) 
   {
     nextl_col = (jj - jcol) * m; 
     
@@ -241,7 +241,7 @@
       Index krow = it.row(); 
       dense_col(krow) = it.value();
       
-      Index kmark = marker(krow); 
+      StorageIndex kmark = marker(krow); 
       if (kmark == jj) 
         continue; // krow visited before, go to the next nonzero
       

diff --git a/Eigen/src/SparseLU/SparseLU_pivotL.h b/Eigen/src/SparseLU/SparseLU_pivotL.h
index 457789c..a86dac9 100644
--- a/Eigen/src/SparseLU/SparseLU_pivotL.h
+++ b/Eigen/src/SparseLU/SparseLU_pivotL.h

@@ -56,8 +56,8 @@
  * \return 0 if success, i > 0 if U(i,i) is exactly zero 
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
 {
   
   Index fsupc = (glu.xsup)((glu.supno)(jcol)); // First column in the supernode containing the column jcol
@@ -67,11 +67,11 @@
   Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); // leading dimension
   Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]); // Start of the current supernode
   Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]); // Start of jcol in the supernode
-  Index* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
+  StorageIndex* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
   
   // Determine the largest abs numerical value for partial pivoting 
   Index diagind = iperm_c(jcol); // diagonal index 
-  RealScalar pivmax = 0.0; 
+  RealScalar pivmax(-1.0);
   Index pivptr = nsupc; 
   Index diag = emptyIdxLU; 
   RealScalar rtemp;
@@ -87,9 +87,10 @@
   }
   
   // Test for singularity
-  if ( pivmax == 0.0 ) {
-    pivrow = lsub_ptr[pivptr];
-    perm_r(pivrow) = jcol;
+  if ( pivmax <= RealScalar(0.0) ) {
+    // if pivmax == -1, the column is structurally empty, otherwise it is only numerically zero
+    pivrow = pivmax < RealScalar(0.0) ? diagind : lsub_ptr[pivptr];
+    perm_r(pivrow) = StorageIndex(jcol);
     return (jcol+1);
   }
   
@@ -104,13 +105,13 @@
       // Diagonal element exists
       using std::abs;
       rtemp = abs(lu_col_ptr[diag]);
-      if (rtemp != 0.0 && rtemp >= thresh) pivptr = diag;
+      if (rtemp != RealScalar(0.0) && rtemp >= thresh) pivptr = diag;
     }
     pivrow = lsub_ptr[pivptr];
   }
   
   // Record pivot row
-  perm_r(pivrow) = jcol; 
+  perm_r(pivrow) = StorageIndex(jcol);
   // Interchange row subscripts
   if (pivptr != nsupc )
   {

diff --git a/Eigen/src/SparseLU/SparseLU_pruneL.h b/Eigen/src/SparseLU/SparseLU_pruneL.h
index 66460d1..ad32fed 100644
--- a/Eigen/src/SparseLU/SparseLU_pruneL.h
+++ b/Eigen/src/SparseLU/SparseLU_pruneL.h

@@ -49,8 +49,9 @@
  * \param glu Global LU data
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg,
+                                               const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
 {
   // For each supernode-rep irep in U(*,j]
   Index jsupno = glu.supno(jcol); 
@@ -123,7 +124,7 @@
           }
         } // end while 
         
-        xprune(irep) = kmin;  //Pruning 
+        xprune(irep) = StorageIndex(kmin);  //Pruning 
       } // end if do_prune 
     } // end pruning 
   } // End for each U-segment

diff --git a/Eigen/src/SparseLU/SparseLU_relax_snode.h b/Eigen/src/SparseLU/SparseLU_relax_snode.h
index 58ec32e..c408d01 100644
--- a/Eigen/src/SparseLU/SparseLU_relax_snode.h
+++ b/Eigen/src/SparseLU/SparseLU_relax_snode.h

@@ -43,15 +43,15 @@
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
 {
   
   // compute the number of descendants of each node in the etree
-  Index j, parent; 
+  Index parent; 
   relax_end.setConstant(emptyIdxLU);
   descendants.setZero();
-  for (j = 0; j < n; j++) 
+  for (Index j = 0; j < n; j++) 
   {
     parent = et(j);
     if (parent != n) // not the dummy root
@@ -59,7 +59,7 @@
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
   Index snode_start; // beginning of a snode 
-  for (j = 0; j < n; )
+  for (Index j = 0; j < n; )
   {
     parent = et(j);
     snode_start = j; 
@@ -69,7 +69,7 @@
       parent = et(j);
     }
     // Found a supernode in postordered etree, j is the last column 
-    relax_end(snode_start) = j; // Record last column
+    relax_end(snode_start) = StorageIndex(j); // Record last column
     j++;
     // Search for a new leaf
     while (descendants(j) != 0 && j < n) j++;

diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 5fb5bc2..d1fb96f 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h

@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012-2013 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
-// Copyright (C) 2012-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,8 +21,12 @@
   template <typename SparseQRType> struct traits<SparseQRMatrixQReturnType<SparseQRType> >
   {
     typedef typename SparseQRType::MatrixType ReturnType;
-    typedef typename ReturnType::Index Index;
+    typedef typename ReturnType::StorageIndex StorageIndex;
     typedef typename ReturnType::StorageKind StorageKind;
+    enum {
+      RowsAtCompileTime = Dynamic,
+      ColsAtCompileTime = Dynamic
+    };
   };
   template <typename SparseQRType> struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> >
   {
@@ -37,18 +41,19 @@
 /**
   * \ingroup SparseQR_Module
   * \class SparseQR
-  * \brief Sparse left-looking rank-revealing QR factorization
+  * \brief Sparse left-looking QR factorization with numerical column pivoting
   * 
-  * This class implements a left-looking rank-revealing QR decomposition 
-  * of sparse matrices. When a column has a norm less than a given tolerance
+  * This class implements a left-looking QR decomposition of sparse matrices
+  * with numerical column pivoting.
+  * When a column has a norm less than a given tolerance
   * it is implicitly permuted to the end. The QR factorization thus obtained is 
   * given by A*P = Q*R where R is upper triangular or trapezoidal. 
   * 
   * P is the column permutation which is the product of the fill-reducing and the
-  * rank-revealing permutations. Use colsPermutation() to get it.
+  * numerical permutations. Use colsPermutation() to get it.
   * 
   * Q is the orthogonal matrix represented as products of Householder reflectors. 
-  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+  * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
   * You can then apply it to a vector.
   * 
   * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient.
@@ -58,24 +63,48 @@
   * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module 
   *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
   * 
+  * \implsparsesolverconcept
+  *
+  * The numerical pivoting strategy and default threshold are the same as in SuiteSparse QR, and
+  * detailed in the following paper:
+  * <i>
+  * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+  * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011.
+  * </i>
+  * Even though it is qualified as "rank-revealing", this strategy might fail for some 
+  * rank deficient problems. When this class is used to solve linear or least-square problems
+  * it is thus strongly recommended to check the accuracy of the computed solution. If it
+  * failed, it usually helps to increase the threshold with setPivotThreshold.
+  * 
   * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
+  * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
   * 
   */
 template<typename _MatrixType, typename _OrderingType>
-class SparseQR
+class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
 {
+  protected:
+    typedef SparseSolverBase<SparseQR<_MatrixType,_OrderingType> > Base;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef _OrderingType OrderingType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> QRMatrixType;
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> QRMatrixType;
+    typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
     typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    
   public:
-    SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
+    SparseQR () :  m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
     { }
     
     /** Construct a QR factorization of the matrix \a mat.
@@ -84,7 +113,7 @@
       * 
       * \sa compute()
       */
-    SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
+    explicit SparseQR(const MatrixType& mat) : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
     {
       compute(mat);
     }
@@ -112,6 +141,17 @@
     inline Index cols() const { return m_pmat.cols();}
     
     /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
+      * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
+      *          expecting sorted entries will fail. This include random coefficient accesses (SpaseMatrix::coeff()),
+      *          and coefficient-wise operations. Matrix products and triangular solves are fine though.
+      *
+      * To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix
+      * is required, you can copy it again:
+      * \code
+      * SparseMatrix<double>          R  = qr.matrixR();  // column-major, not sorted!
+      * SparseMatrix<double,RowMajor> Rr = qr.matrixR();  // row-major, sorted
+      * SparseMatrix<double>          Rc = Rr;            // column-major, sorted
+      * \endcode
       */
     const QRMatrixType& matrixR() const { return m_R; }
     
@@ -119,7 +159,7 @@
       *
       * \sa setPivotThreshold()
       */
-    Index rank() const 
+    Index rank() const
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       return m_nonzeropivots; 
@@ -162,31 +202,30 @@
     
     /** \internal */
     template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
+    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
 
       Index rank = this->rank();
       
-      // Compute Q^T * b;
+      // Compute Q^* * b;
       typename Dest::PlainObject y, b;
-      y = this->matrixQ().transpose() * B; 
+      y = this->matrixQ().adjoint() * B;
       b = y;
       
       // Solve with the triangular matrix R
-      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
+      y.resize((std::max<Index>)(cols(),y.rows()),y.cols());
       y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
       y.bottomRows(y.rows()-rank).setZero();
-
+      
       // Apply the column permutation
-      if (m_perm_c.size())  dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
+      if (m_perm_c.size())  dest = colsPermutation() * y.topRows(cols());
       else                  dest = y.topRows(cols());
       
       m_info = Success;
       return true;
     }
-    
 
     /** Sets the threshold that is used to determine linearly dependent columns during the factorization.
       *
@@ -204,18 +243,18 @@
       * \sa compute()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-      return internal::solve_retval<SparseQR, Rhs>(*this, B.derived());
+      return Solve<SparseQR, Rhs>(*this, B.derived());
     }
     template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
+    inline const Solve<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
     {
           eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
           eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-          return internal::sparse_solve_retval<SparseQR, Rhs>(*this, B.derived());
+          return Solve<SparseQR, Rhs>(*this, B.derived());
     }
     
     /** \brief Reports whether previous computation was successful.
@@ -232,8 +271,9 @@
       return m_info;
     }
 
-  protected:
-    inline void sort_matrix_Q()
+
+    /** \internal */
+    inline void _sort_matrix_Q()
     {
       if(this->m_isQSorted) return;
       // The matrix Q is sorted during the transposition
@@ -244,7 +284,6 @@
 
     
   protected:
-    bool m_isInitialized;
     bool m_analysisIsok;
     bool m_factorizationIsok;
     mutable ComputationInfo m_info;
@@ -258,13 +297,13 @@
     PermutationType m_outputPerm_c; // The final column permutation
     RealScalar m_threshold;         // Threshold to determine null Householder reflections
     bool m_useDefaultThreshold;     // Use default threshold
-    Index m_nonzeropivots;          // Number of non zero pivots found 
+    Index m_nonzeropivots;          // Number of non zero pivots found
     IndexVector m_etree;            // Column elimination tree
     IndexVector m_firstRowElt;      // First element in each row
     bool m_isQSorted;               // whether Q is sorted or not
+    bool m_isEtreeOk;               // whether the elimination tree match the initial input matrix
     
     template <typename, typename > friend struct SparseQR_QProduct;
-    template <typename > friend struct SparseQRMatrixQReturnType;
     
 };
 
@@ -281,29 +320,33 @@
 void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
 {
   eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
+  // Copy to a column major matrix if the input is rowmajor
+  typename internal::conditional<MatrixType::IsRowMajor,QRMatrixType,const MatrixType&>::type matCpy(mat);
   // Compute the column fill reducing ordering
   OrderingType ord; 
-  ord(mat, m_perm_c); 
+  ord(matCpy, m_perm_c); 
   Index n = mat.cols();
   Index m = mat.rows();
+  Index diagSize = (std::min)(m,n);
   
   if (!m_perm_c.size())
   {
     m_perm_c.resize(n);
-    m_perm_c.indices().setLinSpaced(n, 0,n-1);
+    m_perm_c.indices().setLinSpaced(n, 0,StorageIndex(n-1));
   }
   
   // Compute the column elimination tree of the permuted matrix
   m_outputPerm_c = m_perm_c.inverse();
-  internal::coletree(mat, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+  internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+  m_isEtreeOk = true;
   
-  m_R.resize(n, n);
-  m_Q.resize(m, n);
+  m_R.resize(m, n);
+  m_Q.resize(m, diagSize);
   
-  // Allocate space for nonzero elements : rough estimation
+  // Allocate space for nonzero elements: rough estimation
   m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
   m_Q.reserve(2*mat.nonZeros());
-  m_hcoeffs.resize(n);
+  m_hcoeffs.resize(diagSize);
   m_analysisIsok = true;
 }
 
@@ -318,67 +361,92 @@
 void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
 {
   using std::abs;
-  using std::max;
   
   eigen_assert(m_analysisIsok && "analyzePattern() should be called before this step");
-  Index m = mat.rows();
-  Index n = mat.cols();
-  IndexVector mark(m); mark.setConstant(-1);  // Record the visited nodes
-  IndexVector Ridx(n), Qidx(m);               // Store temporarily the row indexes for the current column of R and Q
-  Index nzcolR, nzcolQ;                       // Number of nonzero for the current column of R and Q
-  ScalarVector tval(m);                       // The dense vector used to compute the current column
-  bool found_diag;
-    
+  StorageIndex m = StorageIndex(mat.rows());
+  StorageIndex n = StorageIndex(mat.cols());
+  StorageIndex diagSize = (std::min)(m,n);
+  IndexVector mark((std::max)(m,n)); mark.setConstant(-1);  // Record the visited nodes
+  IndexVector Ridx(n), Qidx(m);                             // Store temporarily the row indexes for the current column of R and Q
+  Index nzcolR, nzcolQ;                                     // Number of nonzero for the current column of R and Q
+  ScalarVector tval(m);                                     // The dense vector used to compute the current column
+  RealScalar pivotThreshold = m_threshold;
+  
+  m_R.setZero();
+  m_Q.setZero();
   m_pmat = mat;
-  m_pmat.uncompress(); // To have the innerNonZeroPtr allocated
-  // Apply the fill-in reducing permutation lazily:
-  for (int i = 0; i < n; i++)
+  if(!m_isEtreeOk)
   {
-    Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
-    m_pmat.outerIndexPtr()[p] = mat.outerIndexPtr()[i]; 
-    m_pmat.innerNonZeroPtr()[p] = mat.outerIndexPtr()[i+1] - mat.outerIndexPtr()[i]; 
+    m_outputPerm_c = m_perm_c.inverse();
+    internal::coletree(m_pmat, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+    m_isEtreeOk = true;
+  }
+
+  m_pmat.uncompress(); // To have the innerNonZeroPtr allocated
+  
+  // Apply the fill-in reducing permutation lazily:
+  {
+    // If the input is row major, copy the original column indices,
+    // otherwise directly use the input matrix
+    // 
+    IndexVector originalOuterIndicesCpy;
+    const StorageIndex *originalOuterIndices = mat.outerIndexPtr();
+    if(MatrixType::IsRowMajor)
+    {
+      originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(),n+1);
+      originalOuterIndices = originalOuterIndicesCpy.data();
+    }
+    
+    for (int i = 0; i < n; i++)
+    {
+      Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
+      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i]; 
+      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i+1] - originalOuterIndices[i]; 
+    }
   }
   
-  /* Compute the default threshold, see : 
+  /* Compute the default threshold as in MatLab, see:
    * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
    * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
    */
   if(m_useDefaultThreshold) 
   {
     RealScalar max2Norm = 0.0;
-    for (int j = 0; j < n; j++) max2Norm = (max)(max2Norm, m_pmat.col(j).norm());
-    m_threshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
+    for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
+    if(max2Norm==RealScalar(0))
+      max2Norm = RealScalar(1);
+    pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
   }
   
   // Initialize the numerical permutation
   m_pivotperm.setIdentity(n);
   
-  Index nonzeroCol = 0; // Record the number of valid pivots
-  
+  StorageIndex nonzeroCol = 0; // Record the number of valid pivots
+  m_Q.startVec(0);
+
   // Left looking rank-revealing QR factorization: compute a column of R and Q at a time
-  for (Index col = 0; col < (std::min)(n,m); ++col)
+  for (StorageIndex col = 0; col < n; ++col)
   {
     mark.setConstant(-1);
     m_R.startVec(col);
-    m_Q.startVec(col);
     mark(nonzeroCol) = col;
     Qidx(0) = nonzeroCol;
     nzcolR = 0; nzcolQ = 1;
-    found_diag = col>=m;
+    bool found_diag = nonzeroCol>=m;
     tval.setZero(); 
     
     // Symbolic factorization: find the nonzero locations of the column k of the factors R and Q, i.e.,
     // all the nodes (with indexes lower than rank) reachable through the column elimination tree (etree) rooted at node k.
     // Note: if the diagonal entry does not exist, then its contribution must be explicitly added,
     // thus the trick with found_diag that permits to do one more iteration on the diagonal element if this one has not been found.
-    for (typename MatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
+    for (typename QRMatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
     {
-      Index curIdx = nonzeroCol ;
-      if(itp) curIdx = itp.row();
+      StorageIndex curIdx = nonzeroCol;
+      if(itp) curIdx = StorageIndex(itp.row());
       if(curIdx == nonzeroCol) found_diag = true;
       
       // Get the nonzeros indexes of the current column of R
-      Index st = m_firstRowElt(curIdx); // The traversal of the etree starts here 
+      StorageIndex st = m_firstRowElt(curIdx); // The traversal of the etree starts here
       if (st < 0 )
       {
         m_lastError = "Empty row found during numerical factorization";
@@ -415,7 +483,7 @@
     // Browse all the indexes of R(:,col) in reverse order
     for (Index i = nzcolR-1; i >= 0; i--)
     {
-      Index curIdx = m_pivotperm.indices()(Ridx(i));
+      Index curIdx = Ridx(i);
       
       // Apply the curIdx-th householder vector to the current column (temporarily stored into tval)
       Scalar tdot(0);
@@ -435,7 +503,7 @@
       {
         for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
         {
-          Index iQ = itq.row();
+          StorageIndex iQ = StorageIndex(itq.row());
           if (mark(iQ) != col)
           {
             Qidx(nzcolQ++) = iQ;  // Add this row to the pattern of Q,
@@ -444,34 +512,36 @@
         }
       }
     } // End update current column
-        
-    // Compute the Householder reflection that eliminate the current column
-    // FIXME this step should call the Householder module.
-    Scalar tau;
-    RealScalar beta;
-    Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
     
-    // First, the squared norm of Q((col+1):m, col)
-    RealScalar sqrNorm = 0.;
-    for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
+    Scalar tau = RealScalar(0);
+    RealScalar beta = 0;
     
-    if(sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0))
+    if(nonzeroCol < diagSize)
     {
-      tau = RealScalar(0);
-      beta = numext::real(c0);
-      tval(Qidx(0)) = 1;
-     }
-    else
-    {
-      using std::sqrt;
-      beta = sqrt(numext::abs2(c0) + sqrNorm);
-      if(numext::real(c0) >= RealScalar(0))
-        beta = -beta;
-      tval(Qidx(0)) = 1;
-      for (Index itq = 1; itq < nzcolQ; ++itq)
-        tval(Qidx(itq)) /= (c0 - beta);
-      tau = numext::conj((beta-c0) / beta);
-        
+      // Compute the Householder reflection that eliminate the current column
+      // FIXME this step should call the Householder module.
+      Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
+      
+      // First, the squared norm of Q((col+1):m, col)
+      RealScalar sqrNorm = 0.;
+      for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
+      if(sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0))
+      {
+        beta = numext::real(c0);
+        tval(Qidx(0)) = 1;
+      }
+      else
+      {
+        using std::sqrt;
+        beta = sqrt(numext::abs2(c0) + sqrNorm);
+        if(numext::real(c0) >= RealScalar(0))
+          beta = -beta;
+        tval(Qidx(0)) = 1;
+        for (Index itq = 1; itq < nzcolQ; ++itq)
+          tval(Qidx(itq)) /= (c0 - beta);
+        tau = numext::conj((beta-c0) / beta);
+          
+      }
     }
 
     // Insert values in R
@@ -485,45 +555,49 @@
       }
     }
 
-    if(abs(beta) >= m_threshold)
+    if(nonzeroCol < diagSize && abs(beta) >= pivotThreshold)
     {
       m_R.insertBackByOuterInner(col, nonzeroCol) = beta;
-      nonzeroCol++;
       // The householder coefficient
-      m_hcoeffs(col) = tau;
+      m_hcoeffs(nonzeroCol) = tau;
       // Record the householder reflections
       for (Index itq = 0; itq < nzcolQ; ++itq)
       {
         Index iQ = Qidx(itq);
-        m_Q.insertBackByOuterInnerUnordered(col,iQ) = tval(iQ);
+        m_Q.insertBackByOuterInnerUnordered(nonzeroCol,iQ) = tval(iQ);
         tval(iQ) = Scalar(0.);
-      }    
+      }
+      nonzeroCol++;
+      if(nonzeroCol<diagSize)
+        m_Q.startVec(nonzeroCol);
     }
     else
     {
       // Zero pivot found: move implicitly this column to the end
-      m_hcoeffs(col) = Scalar(0);
       for (Index j = nonzeroCol; j < n-1; j++) 
         std::swap(m_pivotperm.indices()(j), m_pivotperm.indices()[j+1]);
       
       // Recompute the column elimination tree
       internal::coletree(m_pmat, m_etree, m_firstRowElt, m_pivotperm.indices().data());
+      m_isEtreeOk = false;
     }
   }
   
+  m_hcoeffs.tail(diagSize-nonzeroCol).setZero();
+  
   // Finalize the column pointers of the sparse matrices R and Q
   m_Q.finalize();
   m_Q.makeCompressed();
   m_R.finalize();
   m_R.makeCompressed();
   m_isQSorted = false;
-  
+
   m_nonzeropivots = nonzeroCol;
   
   if(nonzeroCol<n)
   {
     // Permute the triangular factor to put the 'dead' columns to the end
-    MatrixType tempR(m_R);
+    QRMatrixType tempR(m_R);
     m_R = tempR * m_pivotperm;
     
     // Update the column permutation
@@ -535,58 +609,31 @@
   m_info = Success;
 }
 
-namespace internal {
-  
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct solve_retval<SparseQR<_MatrixType,OrderingType>, Rhs>
-  : solve_retval_base<SparseQR<_MatrixType,OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType,OrderingType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct sparse_solve_retval<SparseQR<_MatrixType, OrderingType>, Rhs>
- : sparse_solve_retval_base<SparseQR<_MatrixType, OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType, OrderingType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec, Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 template <typename SparseQRType, typename Derived>
 struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
 {
   typedef typename SparseQRType::QRMatrixType MatrixType;
   typedef typename SparseQRType::Scalar Scalar;
-  typedef typename SparseQRType::Index Index;
   // Get the references 
   SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) : 
   m_qr(qr),m_other(other),m_transpose(transpose) {}
-  inline Index rows() const { return m_transpose ? m_qr.rows() : m_qr.cols(); }
+  inline Index rows() const { return m_qr.matrixQ().rows(); }
   inline Index cols() const { return m_other.cols(); }
   
   // Assign to a vector
   template<typename DesType>
   void evalTo(DesType& res) const
   {
+    Index m = m_qr.rows();
     Index n = m_qr.cols();
+    Index diagSize = (std::min)(m,n);
     res = m_other;
     if (m_transpose)
     {
       eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
       //Compute res = Q' * other column by column
       for(Index j = 0; j < res.cols(); j++){
-        for (Index k = 0; k < n; k++)
+        for (Index k = 0; k < diagSize; k++)
         {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
@@ -598,16 +645,20 @@
     }
     else
     {
-      eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
-      // Compute res = Q' * other column by column
+      eigen_assert(m_qr.matrixQ().cols() == m_other.rows() && "Non conforming object sizes");
+
+      res.conservativeResize(rows(), cols());
+
+      // Compute res = Q * other column by column
       for(Index j = 0; j < res.cols(); j++)
       {
-        for (Index k = n-1; k >=0; k--)
+        Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
+        for (Index k = start_k; k >=0; k--)
         {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
           if(tau==Scalar(0)) continue;
-          tau = tau * m_qr.m_hcoeffs(k);
+          tau = tau * numext::conj(m_qr.m_hcoeffs(k));
           res.col(j) -= tau * m_qr.m_Q.col(k);
         }
       }
@@ -616,52 +667,44 @@
   
   const SparseQRType& m_qr;
   const Derived& m_other;
-  bool m_transpose;
+  bool m_transpose; // TODO this actually means adjoint
 };
 
 template<typename SparseQRType>
 struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> >
 {  
-  typedef typename SparseQRType::Index Index;
   typedef typename SparseQRType::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic
+  };
+  explicit SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
   template<typename Derived>
   SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other)
   {
     return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(),false);
   }
+  // To use for operations with the adjoint of Q
   SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const
   {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
   inline Index rows() const { return m_qr.rows(); }
-  inline Index cols() const { return m_qr.cols(); }
-  // To use for operations with the transpose of Q
+  inline Index cols() const { return m_qr.rows(); }
+  // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment
   SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
   {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
-  template<typename Dest> void evalTo(MatrixBase<Dest>& dest) const
-  {
-    dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows());
-  }
-  template<typename Dest> void evalTo(SparseMatrixBase<Dest>& dest) const
-  {
-    Dest idMat(m_qr.rows(), m_qr.rows());
-    idMat.setIdentity();
-    // Sort the sparse householder reflectors if needed
-    const_cast<SparseQRType *>(&m_qr)->sort_matrix_Q();
-    dest.derived() = SparseQR_QProduct<SparseQRType, Dest>(m_qr, idMat, false);
-  }
-
   const SparseQRType& m_qr;
 };
 
+// TODO this actually represents the adjoint of Q
 template<typename SparseQRType>
 struct SparseQRMatrixQTransposeReturnType
 {
-  SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  explicit SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
   template<typename Derived>
   SparseQR_QProduct<SparseQRType,Derived> operator*(const MatrixBase<Derived>& other)
   {
@@ -670,6 +713,46 @@
   const SparseQRType& m_qr;
 };
 
+namespace internal {
+  
+template<typename SparseQRType>
+struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
+{
+  typedef typename SparseQRType::MatrixType MatrixType;
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseShape Shape;
+};
+
+template< typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>, Sparse2Sparse>
+{
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
+  {
+    typename DstXprType::PlainObject idMat(src.rows(), src.cols());
+    idMat.setIdentity();
+    // Sort the sparse householder reflectors if needed
+    const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
+    dst = SparseQR_QProduct<SparseQRType, DstXprType>(src.m_qr, idMat, false);
+  }
+};
+
+template< typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>, Sparse2Dense>
+{
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
+  {
+    dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif

diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h
index 4ee8e5c..6d47e75 100644
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h

@@ -11,14 +11,7 @@
 #ifndef EIGEN_STDDEQUE_H
 #define EIGEN_STDDEQUE_H
 
-#include "Eigen/src/StlSupport/details.h"
-
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
-#endif
+#include "details.h"
 
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
@@ -26,25 +19,24 @@
  * is used automatically.
  */
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \
 namespace std \
 { \
-  template<typename _Ay> \
-  class deque<__VA_ARGS__, _Ay>  \
+  template<> \
+  class deque<__VA_ARGS__, std::allocator<__VA_ARGS__> >           \
     : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
   { \
     typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \
   public: \
     typedef __VA_ARGS__ value_type; \
-    typedef typename deque_base::allocator_type allocator_type; \
-    typedef typename deque_base::size_type size_type;  \
-    typedef typename deque_base::iterator iterator;  \
+    typedef deque_base::allocator_type allocator_type; \
+    typedef deque_base::size_type size_type;  \
+    typedef deque_base::iterator iterator;  \
     explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
     template<typename InputIterator> \
     deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
     deque(const deque& c) : deque_base(c) {}  \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start, iterator end) : deque_base(start, end) {}  \
+    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}  \
     deque& operator=(const deque& x) {  \
       deque_base::operator=(x);  \
       return *this;  \
@@ -53,7 +45,7 @@
 }
 
 // check whether we really need the std::deque specialization
-#if !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
+#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
 
 namespace std {
 
@@ -70,7 +62,7 @@
     : deque_base(first, last, a) {} \
     deque(const deque& c) : deque_base(c) {}  \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start, iterator end) : deque_base(start, end) {}  \
+    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}  \
     deque& operator=(const deque& x) {  \
       deque_base::operator=(x);  \
       return *this;  \
@@ -106,17 +98,7 @@
   { return deque_base::insert(position,x); }
   void insert(const_iterator position, size_type new_size, const value_type& x)
   { deque_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2)
-  // workaround GCC std::deque implementation
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < deque_base::size())
-      deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
-    else
-      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
-  }
 #else
-  // either GCC 4.1 or non-GCC
   // default implementation which should always work.
   void resize(size_type new_size, const value_type& x)
   {

diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h
index 627381e..8ba3fad 100644
--- a/Eigen/src/StlSupport/StdList.h
+++ b/Eigen/src/StlSupport/StdList.h

@@ -10,14 +10,7 @@
 #ifndef EIGEN_STDLIST_H
 #define EIGEN_STDLIST_H
 
-#include "Eigen/src/StlSupport/details.h"
-
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
-#endif
+#include "details.h"
 
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
@@ -25,25 +18,24 @@
  * is used automatically.
  */
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \
 namespace std \
 { \
-  template<typename _Ay> \
-  class list<__VA_ARGS__, _Ay>  \
+  template<> \
+  class list<__VA_ARGS__, std::allocator<__VA_ARGS__> >           \
     : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
   { \
     typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \
   public: \
     typedef __VA_ARGS__ value_type; \
-    typedef typename list_base::allocator_type allocator_type; \
-    typedef typename list_base::size_type size_type;  \
-    typedef typename list_base::iterator iterator;  \
+    typedef list_base::allocator_type allocator_type; \
+    typedef list_base::size_type size_type;  \
+    typedef list_base::iterator iterator;  \
     explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
     template<typename InputIterator> \
     list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
     list(const list& c) : list_base(c) {}  \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start, iterator end) : list_base(start, end) {}  \
+    list(iterator start_, iterator end_) : list_base(start_, end_) {}  \
     list& operator=(const list& x) {  \
       list_base::operator=(x);  \
       return *this;  \
@@ -51,8 +43,8 @@
   }; \
 }
 
-// check whether we really need the std::vector specialization
-#if !(defined(_GLIBCXX_VECTOR) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
+// check whether we really need the std::list specialization
+#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_LIST) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
 
 namespace std
 {
@@ -70,7 +62,7 @@
     : list_base(first, last, a) {} \
     list(const list& c) : list_base(c) {}  \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start, iterator end) : list_base(start, end) {}  \
+    list(iterator start_, iterator end_) : list_base(start_, end_) {}  \
     list& operator=(const list& x) {  \
     list_base::operator=(x);  \
     return *this; \

diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h
index 40a9abe..9fcf19b 100644
--- a/Eigen/src/StlSupport/StdVector.h
+++ b/Eigen/src/StlSupport/StdVector.h

@@ -11,7 +11,7 @@
 #ifndef EIGEN_STDVECTOR_H
 #define EIGEN_STDVECTOR_H
 
-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"
 
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
@@ -36,7 +36,7 @@
     vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \
     vector(const vector& c) : vector_base(c) {}  \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start, iterator end) : vector_base(start, end) {}  \
+    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}  \
     vector& operator=(const vector& x) {  \
       vector_base::operator=(x);  \
       return *this;  \
@@ -44,6 +44,9 @@
   }; \
 }
 
+// Don't specialize if containers are implemented according to C++11
+#if !EIGEN_HAS_CXX11_CONTAINERS
+
 namespace std {
 
 #define EIGEN_STD_VECTOR_SPECIALIZATION_BODY \
@@ -59,7 +62,7 @@
     : vector_base(first, last, a) {} \
     vector(const vector& c) : vector_base(c) {}  \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start, iterator end) : vector_base(start, end) {}  \
+    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}  \
     vector& operator=(const vector& x) {  \
       vector_base::operator=(x);  \
       return *this;  \
@@ -122,5 +125,7 @@
 #endif
   };
 }
+#endif // !EIGEN_HAS_CXX11_CONTAINERS
+
 
 #endif // EIGEN_STDVECTOR_H

diff --git a/Eigen/src/StlSupport/details.h b/Eigen/src/StlSupport/details.h
index e42ec02..2cfd13e 100644
--- a/Eigen/src/StlSupport/details.h
+++ b/Eigen/src/StlSupport/details.h

@@ -22,13 +22,13 @@
   class aligned_allocator_indirection : public EIGEN_ALIGNED_ALLOCATOR<T>
   {
   public:
-    typedef size_t    size_type;
-    typedef ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
+    typedef std::size_t     size_type;
+    typedef std::ptrdiff_t  difference_type;
+    typedef T*              pointer;
+    typedef const T*        const_pointer;
+    typedef T&              reference;
+    typedef const T&        const_reference;
+    typedef T               value_type;
 
     template<class U>
     struct rebind

diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index bcb3557..d1d3ad7 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,16 +10,16 @@
 #ifndef EIGEN_SUPERLUSUPPORT_H
 #define EIGEN_SUPERLUSUPPORT_H
 
-namespace Eigen { 
+namespace Eigen {
 
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
 #define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
     extern "C" {                                                                                          \
-      typedef struct { FLOATTYPE for_lu; FLOATTYPE total_needed; int expansions; } PREFIX##mem_usage_t;   \
       extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
                                 char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
                                 void *, int, SuperMatrix *, SuperMatrix *,                                \
                                 FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
-                                PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                           \
+                                GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                     \
     }                                                                                                     \
     inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
          int *perm_c, int *perm_r, int *etree, char *equed,                                               \
@@ -29,12 +29,37 @@
          FLOATTYPE *recip_pivot_growth,                                                                   \
          FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
-    PREFIX##mem_usage_t mem_usage;                                                                        \
+    mem_usage_t mem_usage;                                                                                \
+    GlobalLU_t gLU;                                                                                       \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
+         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
+         ferr, berr, &gLU, &mem_usage, stats, info);                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
+  }
+#else // version < 5.0
+#define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
+    extern "C" {                                                                                          \
+      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
+                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
+                                void *, int, SuperMatrix *, SuperMatrix *,                                \
+                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
+                                mem_usage_t *, SuperLUStat_t *, int *);                                   \
+    }                                                                                                     \
+    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
+         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
+         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                                      \
+         SuperMatrix *U, void *work, int lwork,                                                           \
+         SuperMatrix *B, SuperMatrix *X,                                                                  \
+         FLOATTYPE *recip_pivot_growth,                                                                   \
+         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
+         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
+    mem_usage_t mem_usage;                                                                                \
     PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
          ferr, berr, &mem_usage, stats, info);                                                            \
     return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
   }
+#endif
 
 DECL_GSSVX(s,float,float)
 DECL_GSSVX(c,float,std::complex<float>)
@@ -53,7 +78,7 @@
       extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *,        \
                          char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,        \
                          void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, FLOATTYPE *,   \
-                         PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                        \
+                         mem_usage_t *, SuperLUStat_t *, int *);                        \
     }                                                                                           \
     inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A,                      \
          int *perm_c, int *perm_r, int *etree, char *equed,                                     \
@@ -63,7 +88,7 @@
          FLOATTYPE *recip_pivot_growth,                                                         \
          FLOATTYPE *rcond,                                                                      \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                            \
-    PREFIX##mem_usage_t mem_usage;                                                              \
+    mem_usage_t mem_usage;                                                              \
     PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L,                            \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                       \
          &mem_usage, stats, info);                                                              \
@@ -156,47 +181,48 @@
     res.setScalarType<typename MatrixType::Scalar>();
     res.Mtype     = SLU_GE;
 
-    res.nrow      = mat.rows();
-    res.ncol      = mat.cols();
+    res.nrow      = internal::convert_index<int>(mat.rows());
+    res.ncol      = internal::convert_index<int>(mat.cols());
 
-    res.storage.lda       = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride();
+    res.storage.lda       = internal::convert_index<int>(MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride());
     res.storage.values    = (void*)(mat.data());
     return res;
   }
 
   template<typename MatrixType>
-  static SluMatrix Map(SparseMatrixBase<MatrixType>& mat)
+  static SluMatrix Map(SparseMatrixBase<MatrixType>& a_mat)
   {
+    MatrixType &mat(a_mat.derived());
     SluMatrix res;
     if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
     {
       res.setStorageType(SLU_NR);
-      res.nrow      = mat.cols();
-      res.ncol      = mat.rows();
+      res.nrow      = internal::convert_index<int>(mat.cols());
+      res.ncol      = internal::convert_index<int>(mat.rows());
     }
     else
     {
       res.setStorageType(SLU_NC);
-      res.nrow      = mat.rows();
-      res.ncol      = mat.cols();
+      res.nrow      = internal::convert_index<int>(mat.rows());
+      res.ncol      = internal::convert_index<int>(mat.cols());
     }
 
     res.Mtype       = SLU_GE;
 
-    res.storage.nnz       = mat.nonZeros();
-    res.storage.values    = mat.derived().valuePtr();
-    res.storage.innerInd  = mat.derived().innerIndexPtr();
-    res.storage.outerInd  = mat.derived().outerIndexPtr();
+    res.storage.nnz       = internal::convert_index<int>(mat.nonZeros());
+    res.storage.values    = mat.valuePtr();
+    res.storage.innerInd  = mat.innerIndexPtr();
+    res.storage.outerInd  = mat.outerIndexPtr();
 
     res.setScalarType<typename MatrixType::Scalar>();
 
     // FIXME the following is not very accurate
-    if (MatrixType::Flags & Upper)
+    if (int(MatrixType::Flags) & int(Upper))
       res.Mtype = SLU_TRU;
-    if (MatrixType::Flags & Lower)
+    if (int(MatrixType::Flags) & int(Lower))
       res.Mtype = SLU_TRL;
 
-    eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU");
+    eigen_assert(((int(MatrixType::Flags) & int(SelfAdjoint))==0) && "SelfAdjoint matrix shape not supported by SuperLU");
 
     return res;
   }
@@ -271,8 +297,8 @@
 template<typename Scalar, int Flags, typename Index>
 MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
 {
-  eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR
-         || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC);
+  eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR)
+         || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC));
 
   Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;
 
@@ -288,17 +314,26 @@
   * \brief The base class for the direct and incomplete LU factorization of SuperLU
   */
 template<typename _MatrixType, typename Derived>
-class SuperLUBase : internal::noncopyable
+class SuperLUBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;    
+    typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap;
     typedef SparseMatrix<Scalar> LUMatrixType;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
@@ -309,9 +344,6 @@
       clearFactors();
     }
     
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
     
@@ -320,7 +352,7 @@
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -335,33 +367,7 @@
       derived().analyzePattern(matrix);
       derived().factorize(matrix);
     }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SuperLUBase, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SuperLUBase, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
+
     /** Performs a symbolic decomposition on the sparcity of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
@@ -386,7 +392,7 @@
     {
       set_default_options(&this->m_sluOptions);
       
-      const int size = a.rows();
+      const Index size = a.rows();
       m_matrix = a;
 
       m_sluA = internal::asSluMatrix(m_matrix);
@@ -405,7 +411,7 @@
       m_sluB.storage.values = 0;
       m_sluB.nrow           = 0;
       m_sluB.ncol           = 0;
-      m_sluB.storage.lda    = size;
+      m_sluB.storage.lda    = internal::convert_index<int>(size);
       m_sluX                = m_sluB;
       
       m_extractedDataAreDirty = true;
@@ -453,7 +459,6 @@
     mutable char m_sluEqued;
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
     mutable bool m_extractedDataAreDirty;
@@ -473,7 +478,11 @@
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   */
 template<typename _MatrixType>
 class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
@@ -483,18 +492,20 @@
     typedef _MatrixType MatrixType;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
     typedef typename Base::IntRowVectorType IntRowVectorType;
-    typedef typename Base::IntColVectorType IntColVectorType;    
+    typedef typename Base::IntColVectorType IntColVectorType;   
+    typedef typename Base::PermutationMap PermutationMap;
     typedef typename Base::LUMatrixType LUMatrixType;
     typedef TriangularView<LUMatrixType, Lower|UnitDiag>  LMatrixType;
-    typedef TriangularView<LUMatrixType,  Upper>           UMatrixType;
+    typedef TriangularView<LUMatrixType,  Upper>          UMatrixType;
 
   public:
+    using Base::_solve_impl;
 
     SuperLU() : Base() { init(); }
 
-    SuperLU(const MatrixType& matrix) : Base()
+    explicit SuperLU(const MatrixType& matrix) : Base()
     {
       init();
       Base::compute(matrix);
@@ -525,11 +536,9 @@
       */
     void factorize(const MatrixType& matrix);
     
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
-    #endif // EIGEN_PARSED_BY_DOXYGEN
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
     
     inline const LMatrixType& matrixL() const
     {
@@ -637,13 +646,12 @@
 
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
-void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
-  const int rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  const Index rhsCols = b.cols();
+  eigen_assert(m_matrix.rows()==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
@@ -652,8 +660,12 @@
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+  
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+  
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
   
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -676,6 +688,10 @@
                 &m_sluFerr[0], &m_sluBerr[0],
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+  
+  if(x.derived().data() != x_ref.data())
+    x = x_ref;
+  
   m_info = info==0 ? Success : NumericalIssue;
 }
 
@@ -699,7 +715,7 @@
     NCformat    *Ustore = static_cast<NCformat*>(m_sluU.Store);
     Scalar      *SNptr;
 
-    const int size = m_matrix.rows();
+    const Index size = m_matrix.rows();
     m_l.resize(size,size);
     m_l.resizeNonZeros(Lstore->nnz);
     m_u.resize(size,size);
@@ -791,6 +807,8 @@
         det *= m_u.valuePtr()[lastId];
     }
   }
+  if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0)
+    det = -det;
   if(m_sluEqued!='N')
     return det/m_sluRscale.prod()/m_sluCscale.prod();
   else
@@ -810,11 +828,13 @@
   * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization
   * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers.
   *
-  * \warning This class requires SuperLU 4 or later.
+  * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class IncompleteLUT, class ConjugateGradient, class BiCGSTAB
   */
 
 template<typename _MatrixType>
@@ -825,9 +845,9 @@
     typedef _MatrixType MatrixType;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
 
   public:
+    using Base::_solve_impl;
 
     SuperILU() : Base() { init(); }
 
@@ -863,7 +883,7 @@
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
     #endif // EIGEN_PARSED_BY_DOXYGEN
     
   protected:
@@ -946,15 +966,15 @@
   m_factorizationIsOk = true;
 }
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
-void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
   const int rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  eigen_assert(m_matrix.rows()==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
@@ -962,8 +982,12 @@
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+  
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+  
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
 
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -986,40 +1010,15 @@
                 &recip_pivot_growth, &rcond,
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+  
+  if(x.derived().data() != x_ref.data())
+    x = x_ref;
 
   m_info = info==0 ? Success : NumericalIssue;
 }
 #endif
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
+#endif
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 3a48cec..e3a333f 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h

@@ -10,24 +10,102 @@
 #ifndef EIGEN_UMFPACKSUPPORT_H
 #define EIGEN_UMFPACKSUPPORT_H
 
-namespace Eigen { 
+// for compatibility with super old version of umfpack,
+// not sure this is really needed, but this is harmless.
+#ifndef SuiteSparse_long
+#ifdef UF_long
+#define SuiteSparse_long UF_long
+#else
+#error neither SuiteSparse_long nor UF_long are defined
+#endif
+#endif
+
+namespace Eigen {
 
 /* TODO extract L, extract U, compute det, etc... */
 
 // generic double/complex<double> wrapper functions:
 
-inline void umfpack_free_numeric(void **Numeric, double)
+
+ // Defaults
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int)
+{ umfpack_di_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, int)
+{ umfpack_zi_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_defaults(control); }
+
+// Report info
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int)
+{ umfpack_di_report_info(control, info);}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, int)
+{ umfpack_zi_report_info(control, info);}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long)
+{ umfpack_dl_report_info(control, info);}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_info(control, info);}
+
+// Report status
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int)
+{ umfpack_di_report_status(control, status);}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, int)
+{ umfpack_zi_report_status(control, status);}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long)
+{ umfpack_dl_report_status(control, status);}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_status(control, status);}
+
+// report control
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int)
+{ umfpack_di_report_control(control);}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, int)
+{ umfpack_zi_report_control(control);}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_report_control(control);}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_control(control);}
+
+// Free numeric
+inline void umfpack_free_numeric(void **Numeric, double, int)
 { umfpack_di_free_numeric(Numeric); *Numeric = 0; }
 
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, int)
 { umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
 
-inline void umfpack_free_symbolic(void **Symbolic, double)
+inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long)
+{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; }
+
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; }
+
+// Free symbolic
+inline void umfpack_free_symbolic(void **Symbolic, double, int)
 { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
 
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, int)
 { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
 
+inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long)
+{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+// Symbolic
 inline int umfpack_symbolic(int n_row,int n_col,
                             const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
                             const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
@@ -41,7 +119,21 @@
 {
   return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
 }
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+                                          const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic,
+                                          const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+  return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
+}
 
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+                                          const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[], void **Symbolic,
+                                          const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+  return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
+}
+
+// Numeric
 inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
                             void *Symbolic, void **Numeric,
                             const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
@@ -55,7 +147,21 @@
 {
   return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
 }
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+                                        void *Symbolic, void **Numeric,
+                                        const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+  return umfpack_dl_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
+}
 
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+                                        void *Symbolic, void **Numeric,
+                                        const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+  return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
+}
+
+// solve
 inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
                           double X[], const double B[], void *Numeric,
                           const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
@@ -70,6 +176,21 @@
   return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
 }
 
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+                                      double X[], const double B[], void *Numeric,
+                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+  return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
+}
+
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+                                      std::complex<double> X[], const std::complex<double> B[], void *Numeric,
+                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+  return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
+}
+
+// Get Lunz
 inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
 {
   return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
@@ -80,6 +201,19 @@
   return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
 }
 
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+                                          SuiteSparse_long *nz_udiag, void *Numeric, double)
+{
+  return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+                                          SuiteSparse_long *nz_udiag, void *Numeric, std::complex<double>)
+{
+  return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+// Get Numeric
 inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
                                int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
 {
@@ -95,18 +229,46 @@
   return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
                                 Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
 }
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[],
+                                            SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+  return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
+}
 
-inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex<double> Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex<double> Ux[],
+                                            SuiteSparse_long P[], SuiteSparse_long Q[], std::complex<double> Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+  double& lx0_real = numext::real_ref(Lx[0]);
+  double& ux0_real = numext::real_ref(Ux[0]);
+  double& dx0_real = numext::real_ref(Dx[0]);
+  return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
+                                Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
+}
+
+// Get Determinant
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
 {
   return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
 }
 
-inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
 {
   double& mx_real = numext::real_ref(*Mx);
   return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
 }
 
+inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+  return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info);
+}
+
+inline SuiteSparse_long umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+  double& mx_real = numext::real_ref(*Mx);
+  return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
+}
+
+
 /** \ingroup UmfPackSupport_Module
   * \brief A sparse LU factorization and solver based on UmfPack
   *
@@ -118,27 +280,47 @@
   * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
-  * \sa \ref TutorialSparseDirectSolvers
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class SparseLU
   */
 template<typename _MatrixType>
-class UmfPackLU : internal::noncopyable
+class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 {
+  protected:
+    typedef SparseSolverBase<UmfPackLU<_MatrixType> > Base;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
     typedef SparseMatrix<Scalar> LUMatrixType;
-    typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> UmfpackMatrixType;
+    typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
-    UmfPackLU() { init(); }
+    typedef Array<double, UMFPACK_CONTROL, 1> UmfpackControl;
+    typedef Array<double, UMFPACK_INFO, 1> UmfpackInfo;
 
-    UmfPackLU(const MatrixType& matrix)
+    UmfPackLU()
+      : m_dummy(0,0), mp_matrix(m_dummy)
+    {
+      init();
+    }
+
+    template<typename InputMatrixType>
+    explicit UmfPackLU(const InputMatrixType& matrix)
+      : mp_matrix(matrix)
     {
       init();
       compute(matrix);
@@ -146,16 +328,16 @@
 
     ~UmfPackLU()
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex());
     }
 
-    inline Index rows() const { return m_copyMatrix.rows(); }
-    inline Index cols() const { return m_copyMatrix.cols(); }
+    inline Index rows() const { return mp_matrix.rows(); }
+    inline Index cols() const { return mp_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -188,40 +370,18 @@
       return m_q;
     }
 
-    /** Computes the sparse Cholesky decomposition of \a matrix 
+    /** Computes the sparse Cholesky decomposition of \a matrix
      *  Note that the matrix should be column-major, and in compressed format for best performance.
      *  \sa SparseMatrix::makeCompressed().
      */
-    void compute(const MatrixType& matrix)
+    template<typename InputMatrixType>
+    void compute(const InputMatrixType& matrix)
     {
-      analyzePattern(matrix);
-      factorize(matrix);
-    }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<UmfPackLU, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<UmfPackLU, Rhs>(*this, b.derived());
-    }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<UmfPackLU, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<UmfPackLU, Rhs>(*this, b.derived());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
+      grab(matrix.derived());
+      analyzePattern_impl();
+      factorize_impl();
     }
 
     /** Performs a symbolic decomposition on the sparcity of \a matrix.
@@ -230,23 +390,48 @@
       *
       * \sa factorize(), compute()
       */
-    void analyzePattern(const MatrixType& matrix)
+    template<typename InputMatrixType>
+    void analyzePattern(const InputMatrixType& matrix)
     {
-      if(m_symbolic)
-        umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)
-        umfpack_free_numeric(&m_numeric,Scalar());
-      
-      grapInput(matrix);
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
 
-      int errorCode = 0;
-      errorCode = umfpack_symbolic(matrix.rows(), matrix.cols(), m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                   &m_symbolic, 0, 0);
+      grab(matrix.derived());
 
-      m_isInitialized = true;
-      m_info = errorCode ? InvalidInput : Success;
-      m_analysisIsOk = true;
-      m_factorizationIsOk = false;
+      analyzePattern_impl();
+    }
+
+    /** Provides the return status code returned by UmfPack during the numeric
+      * factorization.
+      *
+      * \sa factorize(), compute()
+      */
+    inline int umfpackFactorizeReturncode() const
+    {
+      eigen_assert(m_numeric && "UmfPackLU: you must first call factorize()");
+      return m_fact_errorCode;
+    }
+
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See UMFPACK documentation for details.
+      */
+    inline const UmfpackControl& umfpackControl() const
+    {
+      return m_control;
+    }
+
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See UMFPACK documentation for details.
+      */
+    inline UmfpackControl& umfpackControl()
+    {
+      return m_control;
     }
 
     /** Performs a numeric decomposition of \a matrix
@@ -255,27 +440,49 @@
       *
       * \sa analyzePattern(), compute()
       */
-    void factorize(const MatrixType& matrix)
+    template<typename InputMatrixType>
+    void factorize(const InputMatrixType& matrix)
     {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
       if(m_numeric)
-        umfpack_free_numeric(&m_numeric,Scalar());
+        umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
 
-      grapInput(matrix);
+      grab(matrix.derived());
 
-      int errorCode;
-      errorCode = umfpack_numeric(m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                  m_symbolic, &m_numeric, 0, 0);
-
-      m_info = errorCode ? NumericalIssue : Success;
-      m_factorizationIsOk = true;
+      factorize_impl();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Prints the current UmfPack control settings.
+      *
+      * \sa umfpackControl()
+      */
+    void printUmfpackControl()
+    {
+      umfpack_report_control(m_control.data(), Scalar(),StorageIndex());
+    }
+
+    /** Prints statistics collected by UmfPack.
+      *
+      * \sa analyzePattern(), compute()
+      */
+    void printUmfpackInfo()
+    {
+      eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
+      umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex());
+    }
+
+    /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization).
+      *
+      * \sa analyzePattern(), compute()
+      */
+    void printUmfpackStatus() {
+      eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
+      umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex());
+    }
+
     /** \internal */
     template<typename BDerived,typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
-    #endif
+    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
 
     Scalar determinant() const;
 
@@ -283,58 +490,81 @@
 
   protected:
 
-
     void init()
     {
-      m_info = InvalidInput;
-      m_isInitialized = false;
-      m_numeric = 0;
-      m_symbolic = 0;
-      m_outerIndexPtr = 0;
-      m_innerIndexPtr = 0;
-      m_valuePtr      = 0;
+      m_info                  = InvalidInput;
+      m_isInitialized         = false;
+      m_numeric               = 0;
+      m_symbolic              = 0;
+      m_extractedDataAreDirty = true;
+
+      umfpack_defaults(m_control.data(), Scalar(),StorageIndex());
     }
-    
-    void grapInput(const MatrixType& mat)
+
+    void analyzePattern_impl()
     {
-      m_copyMatrix.resize(mat.rows(), mat.cols());
-      if( ((MatrixType::Flags&RowMajorBit)==RowMajorBit) || sizeof(typename MatrixType::Index)!=sizeof(int) || !mat.isCompressed() )
+      m_fact_errorCode = umfpack_symbolic(internal::convert_index<StorageIndex>(mp_matrix.rows()),
+                                          internal::convert_index<StorageIndex>(mp_matrix.cols()),
+                                          mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                          &m_symbolic, m_control.data(), m_umfpackInfo.data());
+
+      m_isInitialized = true;
+      m_info = m_fact_errorCode ? InvalidInput : Success;
+      m_analysisIsOk = true;
+      m_factorizationIsOk = false;
+      m_extractedDataAreDirty = true;
+    }
+
+    void factorize_impl()
+    {
+
+      m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                         m_symbolic, &m_numeric, m_control.data(), m_umfpackInfo.data());
+
+      m_info = m_fact_errorCode == UMFPACK_OK ? Success : NumericalIssue;
+      m_factorizationIsOk = true;
+      m_extractedDataAreDirty = true;
+    }
+
+    template<typename MatrixDerived>
+    void grab(const EigenBase<MatrixDerived> &A)
+    {
+      mp_matrix.~UmfpackMatrixRef();
+      ::new (&mp_matrix) UmfpackMatrixRef(A.derived());
+    }
+
+    void grab(const UmfpackMatrixRef &A)
+    {
+      if(&(A.derived()) != &mp_matrix)
       {
-        // non supported input -> copy
-        m_copyMatrix = mat;
-        m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
-        m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
-        m_valuePtr      = m_copyMatrix.valuePtr();
-      }
-      else
-      {
-        m_outerIndexPtr = mat.outerIndexPtr();
-        m_innerIndexPtr = mat.innerIndexPtr();
-        m_valuePtr      = mat.valuePtr();
+        mp_matrix.~UmfpackMatrixRef();
+        ::new (&mp_matrix) UmfpackMatrixRef(A);
       }
     }
 
     // cached data to reduce reallocation, etc.
     mutable LUMatrixType m_l;
+    StorageIndex m_fact_errorCode;
+    UmfpackControl m_control;
+    mutable UmfpackInfo m_umfpackInfo;
+
     mutable LUMatrixType m_u;
     mutable IntColVectorType m_p;
     mutable IntRowVectorType m_q;
 
-    UmfpackMatrixType m_copyMatrix;
-    const Scalar* m_valuePtr;
-    const int* m_outerIndexPtr;
-    const int* m_innerIndexPtr;
+    UmfpackMatrixType m_dummy;
+    UmfpackMatrixRef mp_matrix;
+
     void* m_numeric;
     void* m_symbolic;
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
     mutable bool m_extractedDataAreDirty;
-    
+
   private:
-    UmfPackLU(UmfPackLU& ) { }
+    UmfPackLU(const UmfPackLU& ) { }
 };
 
 
@@ -344,7 +574,7 @@
   if (m_extractedDataAreDirty)
   {
     // get size of the data
-    int lnz, unz, rows, cols, nz_udiag;
+    StorageIndex lnz, unz, rows, cols, nz_udiag;
     umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
 
     // allocate data
@@ -370,25 +600,36 @@
 typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
 {
   Scalar det;
-  umfpack_get_determinant(&det, 0, m_numeric, 0);
+  umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex());
   return det;
 }
 
 template<typename MatrixType>
 template<typename BDerived,typename XDerived>
-bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
 {
-  const int rhsCols = b.cols();
+  Index rhsCols = b.cols();
   eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet");
   eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
-  
-  int errorCode;
+
+  Scalar* x_ptr = 0;
+  Matrix<Scalar,Dynamic,1> x_tmp;
+  if(x.innerStride()!=1)
+  {
+    x_tmp.resize(x.rows());
+    x_ptr = x_tmp.data();
+  }
   for (int j=0; j<rhsCols; ++j)
   {
-    errorCode = umfpack_solve(UMFPACK_A,
-        m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-        &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
+    if(x.innerStride()==1)
+      x_ptr = &x.col(j).coeffRef(0);
+    StorageIndex errorCode = umfpack_solve(UMFPACK_A,
+                                mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                x_ptr, &b.const_cast_derived().col(j).coeffRef(0),
+                                m_numeric, m_control.data(), m_umfpackInfo.data());
+    if(x.innerStride()!=1)
+      x.col(j) = x_tmp;
     if (errorCode!=0)
       return false;
   }
@@ -396,37 +637,6 @@
   return true;
 }
 
-
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_UMFPACKSUPPORT_H

diff --git a/Eigen/src/misc/Image.h b/Eigen/src/misc/Image.h
index 75c5f43..b8b8a04 100644
--- a/Eigen/src/misc/Image.h
+++ b/Eigen/src/misc/Image.h

@@ -38,7 +38,6 @@
   typedef _DecompositionType DecompositionType;
   typedef typename DecompositionType::MatrixType MatrixType;
   typedef ReturnByValue<image_retval_base> Base;
-  typedef typename Base::Index Index;
 
   image_retval_base(const DecompositionType& dec, const MatrixType& originalMatrix)
     : m_dec(dec), m_rank(dec.rank()),
@@ -69,7 +68,6 @@
   typedef typename DecompositionType::MatrixType MatrixType; \
   typedef typename MatrixType::Scalar Scalar; \
   typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
   typedef Eigen::internal::image_retval_base<DecompositionType> Base; \
   using Base::dec; \
   using Base::originalMatrix; \

diff --git a/Eigen/src/misc/Kernel.h b/Eigen/src/misc/Kernel.h
index b9e1518..bef5d6f 100644
--- a/Eigen/src/misc/Kernel.h
+++ b/Eigen/src/misc/Kernel.h

@@ -39,9 +39,8 @@
 {
   typedef _DecompositionType DecompositionType;
   typedef ReturnByValue<kernel_retval_base> Base;
-  typedef typename Base::Index Index;
 
-  kernel_retval_base(const DecompositionType& dec)
+  explicit kernel_retval_base(const DecompositionType& dec)
     : m_dec(dec),
       m_rank(dec.rank()),
       m_cols(m_rank==dec.cols() ? 1 : dec.cols() - m_rank)
@@ -68,7 +67,6 @@
   typedef typename DecompositionType::MatrixType MatrixType; \
   typedef typename MatrixType::Scalar Scalar; \
   typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
   typedef Eigen::internal::kernel_retval_base<DecompositionType> Base; \
   using Base::dec; \
   using Base::rank; \

diff --git a/Eigen/src/misc/RealSvd2x2.h b/Eigen/src/misc/RealSvd2x2.h
new file mode 100644
index 0000000..abb4d3c
--- /dev/null
+++ b/Eigen/src/misc/RealSvd2x2.h

@@ -0,0 +1,55 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALSVD2X2_H
+#define EIGEN_REALSVD2X2_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatrixType, typename RealScalar, typename Index>
+void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
+                         JacobiRotation<RealScalar> *j_left,
+                         JacobiRotation<RealScalar> *j_right)
+{
+  using std::sqrt;
+  using std::abs;
+  Matrix<RealScalar,2,2> m;
+  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
+       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
+  JacobiRotation<RealScalar> rot1;
+  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
+  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
+
+  if(abs(d) < (std::numeric_limits<RealScalar>::min)())
+  {
+    rot1.s() = RealScalar(0);
+    rot1.c() = RealScalar(1);
+  }
+  else
+  {
+    // If d!=0, then t/d cannot overflow because the magnitude of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
+    rot1.s() = RealScalar(1) / tmp;
+    rot1.c() = u / tmp;
+  }
+  m.applyOnTheLeft(0,1,rot1);
+  j_right->makeJacobi(m,0,1);
+  *j_left = rot1 * j_right->transpose();
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_REALSVD2X2_H

diff --git a/Eigen/src/misc/blas.h b/Eigen/src/misc/blas.h
index 6fce99e..25215b1 100644
--- a/Eigen/src/misc/blas.h
+++ b/Eigen/src/misc/blas.h

@@ -30,15 +30,15 @@
 int  BLASFUNC(zdotuw)  (int *, double  *, int *, double  *, int *, double*);
 int  BLASFUNC(zdotcw)  (int *, double  *, int *, double  *, int *, double*);
 
-int    BLASFUNC(saxpy) (int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(caxpy) (int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(caxpyc)(int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *);
+int    BLASFUNC(saxpy) (const int *, const float  *, const float  *, const int *, float  *, const int *);
+int    BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(caxpy) (const int *, const float  *, const float  *, const int *, float  *, const int *);
+int    BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(caxpyc)(const int *, const float  *, const float  *, const int *, float  *, const int *);
+int    BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
+int    BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
 
 int    BLASFUNC(scopy) (int *, float  *, int *, float  *, int *);
 int    BLASFUNC(dcopy) (int *, double *, int *, double *, int *);
@@ -177,31 +177,19 @@
 int BLASFUNC(xgerc)(int *,    int *, double *, double *, int *,
 		    double *, int *, double *, int *);
 
-int BLASFUNC(sgemv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(cgemv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
+int BLASFUNC(sgemv)(const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cgemv)(const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
-int BLASFUNC(strsv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(ctrsv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
+int BLASFUNC(strsv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrsv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(ztrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
 
 int BLASFUNC(stpsv) (char *, char *, char *, int *, float  *, float  *, int *);
 int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *);
@@ -210,18 +198,12 @@
 int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *);
 int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *);
 
-int BLASFUNC(strmv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(ctrmv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
+int BLASFUNC(strmv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrmv) (const char *, const char *, const char *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(ztrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
 
 int BLASFUNC(stpmv) (char *, char *, char *, int *, float  *, float  *, int *);
 int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *);
@@ -244,18 +226,9 @@
 int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
 int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
 
-int BLASFUNC(ssymv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(dsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(qsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(csymv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
+int BLASFUNC(ssymv) (const char *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(sspmv) (char *, int *, float  *, float *,
 		     float  *, int *, float *, float *, int *);
@@ -263,38 +236,17 @@
 		     double  *, int *, double *, double *, int *);
 int BLASFUNC(qspmv) (char *, int *, double  *, double *,
 		     double  *, int *, double *, double *, int *);
-int BLASFUNC(cspmv) (char *, int *, float  *, float *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
 
-int BLASFUNC(ssyr) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(dsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(qsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(csyr) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(xsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
+int BLASFUNC(ssyr) (const char *, const int *, const float   *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dsyr) (const char *, const int *, const double  *, const double *, const int *, double *, const int *);
+int BLASFUNC(qsyr) (const char *, const int *, const double  *, const double *, const int *, double *, const int *);
 
-int BLASFUNC(ssyr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(dsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(qsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(csyr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(zsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(xsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
+int BLASFUNC(ssyr2) (const char *, const int *, const float   *, const float  *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(dsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(csyr2) (const char *, const int *, const float   *, const float  *, const int *, const float  *, const int *, float  *, const int *);
+int BLASFUNC(zsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xsyr2) (const char *, const int *, const double  *, const double *, const int *, const double *, const int *, double *, const int *);
 
 int BLASFUNC(sspr) (char *, int *, float   *, float  *, int *,
 		    float  *);
@@ -302,12 +254,6 @@
 		    double *);
 int BLASFUNC(qspr) (char *, int *, double  *, double *, int *,
 		    double *);
-int BLASFUNC(cspr) (char *, int *, float   *, float  *, int *,
-		    float  *);
-int BLASFUNC(zspr) (char *, int *, double  *, double *, int *,
-		    double *);
-int BLASFUNC(xspr) (char *, int *, double  *, double *, int *,
-		    double *);
 
 int BLASFUNC(sspr2) (char *, int *, float   *,
 		     float  *, int *, float  *, int *, float  *);
@@ -347,12 +293,9 @@
 int BLASFUNC(xhpr2) (char *, int *, double  *,
 		     double *, int *, double *, int *, double *);
 
-int BLASFUNC(chemv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zhemv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xhemv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
+int BLASFUNC(chemv) (const char *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(chpmv) (char *, int *, float  *, float *,
 		     float  *, int *, float *, float *, int *);
@@ -401,18 +344,12 @@
 
 /* Level 3 routines */
 
-int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(sgemm)(const char *, const char *, const int *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cgemm)(const char *, const char *, const int *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *,
 	   float  *, int *, float  *, int *, float  *, float  *, int *);
@@ -434,84 +371,48 @@
 		     double *, double  *, int *, double  *, int *,
 		     double *, double  *, int *);
 
-int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
+int BLASFUNC(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
 
-int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
+int BLASFUNC(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,  const float *,  const int *, float *,  const int *);
+int BLASFUNC(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
 
-int BLASFUNC(ssymm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(csymm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
+int BLASFUNC(ssymm)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(csymm)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
-int BLASFUNC(csymm3m)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
+int BLASFUNC(csymm3m)(char *, char *, int *, int *, float  *, float  *, int *, float  *, int *, float  *, float  *, int *);
+int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
 
-int BLASFUNC(ssyrk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(csyrk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
+int BLASFUNC(ssyrk)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(csyrk)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
 
-int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(csyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
+int BLASFUNC(ssyr2k)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(qsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(csyr2k)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(xsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
 
-int BLASFUNC(chemm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
+int BLASFUNC(chemm)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 
 int BLASFUNC(chemm3m)(char *, char *, int *, int *, float  *, float  *, int *,
 	   float  *, int *, float  *, float  *, int *);
@@ -520,136 +421,17 @@
 int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *,
 	   double *, int *, double *, double *, int *);
 
-int BLASFUNC(cherk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
+int BLASFUNC(cherk)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
 
-int BLASFUNC(cher2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
+int BLASFUNC(cher2k)(const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cher2m)(const char *, const char *, const char *, const int *, const int *, const float  *, const float  *, const int *, const float *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(xher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
 
-int BLASFUNC(sgemt)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
-		    double *, int *);
-int BLASFUNC(cgemt)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
-		    double *, int *);
-
-int BLASFUNC(sgema)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgema)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgems)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgems)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgetf2)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetf2)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(sgetrf)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetrf)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(slaswp)(int *, float  *, int *, int *, int *, int *, int *);
-int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(claswp)(int *, float  *, int *, int *, int *, int *, int *);
-int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
-
-int BLASFUNC(sgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
-int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(cgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
-int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-
-int BLASFUNC(sgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
-int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(cgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
-int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-
-int BLASFUNC(spotf2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotf2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotrf)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotrf)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauu2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauu2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauum)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauum)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(strti2)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrti2)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(strtri)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrtri)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotri)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotri)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
 
 #ifdef __cplusplus
 }

diff --git a/Eigen/src/misc/lapack.h b/Eigen/src/misc/lapack.h
new file mode 100644
index 0000000..249f357
--- /dev/null
+++ b/Eigen/src/misc/lapack.h

@@ -0,0 +1,152 @@
+#ifndef LAPACK_H
+#define LAPACK_H
+
+#include "blas.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+int BLASFUNC(csymv) (const char *, const int *, const float  *, const float  *, const int *, const float  *, const int *, const float  *, float  *, const int *);
+int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+
+int BLASFUNC(cspmv) (char *, int *, float  *, float *,
+         float  *, int *, float *, float *, int *);
+int BLASFUNC(zspmv) (char *, int *, double  *, double *,
+         double  *, int *, double *, double *, int *);
+int BLASFUNC(xspmv) (char *, int *, double  *, double *,
+         double  *, int *, double *, double *, int *);
+
+int BLASFUNC(csyr) (char *, int *, float   *, float  *, int *,
+        float  *, int *);
+int BLASFUNC(zsyr) (char *, int *, double  *, double *, int *,
+        double *, int *);
+int BLASFUNC(xsyr) (char *, int *, double  *, double *, int *,
+        double *, int *);
+
+int BLASFUNC(cspr) (char *, int *, float   *, float  *, int *,
+        float  *);
+int BLASFUNC(zspr) (char *, int *, double  *, double *, int *,
+        double *);
+int BLASFUNC(xspr) (char *, int *, double  *, double *, int *,
+        double *);
+
+int BLASFUNC(sgemt)(char *, int *, int *, float  *, float  *, int *,
+        float  *, int *);
+int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
+        double *, int *);
+int BLASFUNC(cgemt)(char *, int *, int *, float  *, float  *, int *,
+        float  *, int *);
+int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
+        double *, int *);
+
+int BLASFUNC(sgema)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgema)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgems)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgems)(char *, char *, int *, int *, float  *,
+        float  *, int *, float *, float  *, int *, float *, int *);
+int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
+        double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgetf2)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetf2)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(sgetrf)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetrf)(int *, int *, float  *, int *, int *, int *);
+int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(slaswp)(int *, float  *, int *, int *, int *, int *, int *);
+int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(claswp)(int *, float  *, int *, int *, int *, int *, int *);
+int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
+
+int BLASFUNC(sgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
+int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(cgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
+int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+
+int BLASFUNC(sgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
+int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(cgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
+int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+
+int BLASFUNC(spotf2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotf2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotrf)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotrf)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauu2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauu2)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauum)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauum)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(strti2)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrti2)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(strtri)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrtri)(char *, char *, int *, float  *, int *, int *);
+int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotri)(char *, int *, float  *, int *, int *);
+int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotri)(char *, int *, float  *, int *, int *);
+int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
new file mode 100755
index 0000000..3d8e24f
--- /dev/null
+++ b/Eigen/src/misc/lapacke.h

@@ -0,0 +1,16292 @@
+/*****************************************************************************
+  Copyright (c) 2010, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************
+* Contents: Native C interface to LAPACK
+* Author: Intel Corporation
+* Generated November, 2011
+*****************************************************************************/
+
+#ifndef _MKL_LAPACKE_H_
+
+#ifndef _LAPACKE_H_
+#define _LAPACKE_H_
+
+/*
+*  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
+*/
+#ifdef HAVE_LAPACK_CONFIG_H
+#include "lapacke_config.h"
+#endif
+
+#include <stdlib.h>
+
+#ifndef lapack_int
+#define lapack_int     int
+#endif
+
+#ifndef lapack_logical
+#define lapack_logical lapack_int
+#endif
+
+/* Complex types are structures equivalent to the
+* Fortran complex types COMPLEX(4) and COMPLEX(8).
+*
+* One can also redefine the types with his own types
+* for example by including in the code definitions like
+*
+* #define lapack_complex_float std::complex<float>
+* #define lapack_complex_double std::complex<double>
+*
+* or define these types in the command line:
+*
+* -Dlapack_complex_float="std::complex<float>"
+* -Dlapack_complex_double="std::complex<double>"
+*/
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+
+/* Complex type (single precision) */
+#ifndef lapack_complex_float
+#include <complex.h>
+#define lapack_complex_float    float _Complex
+#endif
+
+#ifndef lapack_complex_float_real
+#define lapack_complex_float_real(z)       (creal(z))
+#endif
+
+#ifndef lapack_complex_float_imag
+#define lapack_complex_float_imag(z)       (cimag(z))
+#endif
+
+lapack_complex_float lapack_make_complex_float( float re, float im );
+
+/* Complex type (double precision) */
+#ifndef lapack_complex_double
+#include <complex.h>
+#define lapack_complex_double   double _Complex
+#endif
+
+#ifndef lapack_complex_double_real
+#define lapack_complex_double_real(z)      (creal(z))
+#endif
+
+#ifndef lapack_complex_double_imag
+#define lapack_complex_double_imag(z)       (cimag(z))
+#endif
+
+lapack_complex_double lapack_make_complex_double( double re, double im );
+
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef LAPACKE_malloc
+#define LAPACKE_malloc( size ) malloc( size )
+#endif
+#ifndef LAPACKE_free
+#define LAPACKE_free( p )      free( p )
+#endif
+
+#define LAPACK_C2INT( x ) (lapack_int)(*((float*)&x ))
+#define LAPACK_Z2INT( x ) (lapack_int)(*((double*)&x ))
+
+#define LAPACK_ROW_MAJOR               101
+#define LAPACK_COL_MAJOR               102
+
+#define LAPACK_WORK_MEMORY_ERROR       -1010
+#define LAPACK_TRANSPOSE_MEMORY_ERROR  -1011
+
+/* Callback logical functions of one, two, or three arguments are used
+*  to select eigenvalues to sort to the top left of the Schur form.
+*  The value is selected if function returns TRUE (non-zero). */
+
+typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* );
+typedef lapack_logical (*LAPACK_S_SELECT3)
+    ( const float*, const float*, const float* );
+typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* );
+typedef lapack_logical (*LAPACK_D_SELECT3)
+    ( const double*, const double*, const double* );
+
+typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_C_SELECT2)
+    ( const lapack_complex_float*, const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* );
+typedef lapack_logical (*LAPACK_Z_SELECT2)
+    ( const lapack_complex_double*, const lapack_complex_double* );
+
+#include "lapacke_mangling.h"
+
+#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME)
+lapack_logical LAPACK_lsame( char* ca,  char* cb,
+                              lapack_int lca, lapack_int lcb );
+
+/* C-LAPACK function prototypes */
+
+lapack_int LAPACKE_sbdsdc( int matrix_order, char uplo, char compq,
+                           lapack_int n, float* d, float* e, float* u,
+                           lapack_int ldu, float* vt, lapack_int ldvt, float* q,
+                           lapack_int* iq );
+lapack_int LAPACKE_dbdsdc( int matrix_order, char uplo, char compq,
+                           lapack_int n, double* d, double* e, double* u,
+                           lapack_int ldu, double* vt, lapack_int ldvt,
+                           double* q, lapack_int* iq );
+
+lapack_int LAPACKE_sbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           float* d, float* e, float* vt, lapack_int ldvt,
+                           float* u, lapack_int ldu, float* c, lapack_int ldc );
+lapack_int LAPACKE_dbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           double* d, double* e, double* vt, lapack_int ldvt,
+                           double* u, lapack_int ldu, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_cbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           float* d, float* e, lapack_complex_float* vt,
+                           lapack_int ldvt, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_zbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           double* d, double* e, lapack_complex_double* vt,
+                           lapack_int ldvt, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_sdisna( char job, lapack_int m, lapack_int n, const float* d,
+                           float* sep );
+lapack_int LAPACKE_ddisna( char job, lapack_int m, lapack_int n,
+                           const double* d, double* sep );
+
+lapack_int LAPACKE_sgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, float* ab, lapack_int ldab, float* d,
+                           float* e, float* q, lapack_int ldq, float* pt,
+                           lapack_int ldpt, float* c, lapack_int ldc );
+lapack_int LAPACKE_dgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, double* ab, lapack_int ldab,
+                           double* d, double* e, double* q, lapack_int ldq,
+                           double* pt, lapack_int ldpt, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_cgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, lapack_complex_float* ab,
+                           lapack_int ldab, float* d, float* e,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* pt, lapack_int ldpt,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, lapack_complex_double* ab,
+                           lapack_int ldab, double* d, double* e,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* pt, lapack_int ldpt,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, const lapack_int* ipiv, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, const lapack_int* ipiv,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, float* r, float* c, float* rowcnd,
+                           float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, double* r, double* c,
+                           double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           float* r, float* c, float* rowcnd, float* colcnd,
+                           float* amax );
+lapack_int LAPACKE_zgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           double* r, double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+
+lapack_int LAPACKE_sgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku, const float* ab,
+                            lapack_int ldab, float* r, float* c, float* rowcnd,
+                            float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku, const double* ab,
+                            lapack_int ldab, double* r, double* c,
+                            double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku,
+                            const lapack_complex_float* ab, lapack_int ldab,
+                            float* r, float* c, float* rowcnd, float* colcnd,
+                            float* amax );
+lapack_int LAPACKE_zgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku,
+                            const lapack_complex_double* ab, lapack_int ldab,
+                            double* r, double* c, double* rowcnd,
+                            double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, const float* afb,
+                           lapack_int ldafb, const lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, const double* afb,
+                           lapack_int ldafb, const lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const float* ab, lapack_int ldab,
+                            const float* afb, lapack_int ldafb,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const double* ab, lapack_int ldab,
+                            const double* afb, lapack_int ldafb,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const lapack_complex_float* ab,
+                            lapack_int ldab, const lapack_complex_float* afb,
+                            lapack_int ldafb, const lapack_int* ipiv,
+                            const float* r, const float* c,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const lapack_complex_double* ab,
+                            lapack_int ldab, const lapack_complex_double* afb,
+                            lapack_int ldafb, const lapack_int* ipiv,
+                            const double* r, const double* c,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_sgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs, float* ab,
+                          lapack_int ldab, lapack_int* ipiv, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs, double* ab,
+                          lapack_int ldab, lapack_int* ipiv, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, float* ab, lapack_int ldab,
+                           float* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, float* r, float* c, float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_dgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, double* ab, lapack_int ldab,
+                           double* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, double* r, double* c, double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+lapack_int LAPACKE_cgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_float* ab,
+                           lapack_int ldab, lapack_complex_float* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed,
+                           float* r, float* c, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr, float* rpivot );
+lapack_int LAPACKE_zgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_double* ab,
+                           lapack_int ldab, lapack_complex_double* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed,
+                           double* r, double* c, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr, double* rpivot );
+
+lapack_int LAPACKE_sgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, float* ab, lapack_int ldab,
+                            float* afb, lapack_int ldafb, lapack_int* ipiv,
+                            char* equed, float* r, float* c, float* b,
+                            lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, double* ab, lapack_int ldab,
+                            double* afb, lapack_int ldafb, lapack_int* ipiv,
+                            char* equed, double* r, double* c, double* b,
+                            lapack_int ldb, double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_cgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, lapack_complex_float* ab,
+                            lapack_int ldab, lapack_complex_float* afb,
+                            lapack_int ldafb, lapack_int* ipiv, char* equed,
+                            float* r, float* c, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* rpvgrw,
+                            float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, lapack_complex_double* ab,
+                            lapack_int ldab, lapack_complex_double* afb,
+                            lapack_int ldafb, lapack_int* ipiv, char* equed,
+                            double* r, double* c, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_sgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, float* ab,
+                           lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_dgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, double* ab,
+                           lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_cgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_sgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_sgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* scale,
+                           lapack_int m, float* v, lapack_int ldv );
+lapack_int LAPACKE_dgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* scale,
+                           lapack_int m, double* v, lapack_int ldv );
+lapack_int LAPACKE_cgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* scale,
+                           lapack_int m, lapack_complex_float* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_zgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* scale,
+                           lapack_int m, lapack_complex_double* v,
+                           lapack_int ldv );
+
+lapack_int LAPACKE_sgebal( int matrix_order, char job, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* ilo, lapack_int* ihi,
+                           float* scale );
+lapack_int LAPACKE_dgebal( int matrix_order, char job, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* ilo, lapack_int* ihi,
+                           double* scale );
+lapack_int LAPACKE_cgebal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ilo, lapack_int* ihi, float* scale );
+lapack_int LAPACKE_zgebal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ilo, lapack_int* ihi, double* scale );
+
+lapack_int LAPACKE_sgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* d, float* e,
+                           float* tauq, float* taup );
+lapack_int LAPACKE_dgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* d, double* e,
+                           double* tauq, double* taup );
+lapack_int LAPACKE_cgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* d,
+                           float* e, lapack_complex_float* tauq,
+                           lapack_complex_float* taup );
+lapack_int LAPACKE_zgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda, double* d,
+                           double* e, lapack_complex_double* tauq,
+                           lapack_complex_double* taup );
+
+lapack_int LAPACKE_sgecon( int matrix_order, char norm, lapack_int n,
+                           const float* a, lapack_int lda, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dgecon( int matrix_order, char norm, lapack_int n,
+                           const double* a, lapack_int lda, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cgecon( int matrix_order, char norm, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_zgecon( int matrix_order, char norm, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double anorm, double* rcond );
+
+lapack_int LAPACKE_sgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, float* r, float* c,
+                           float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, double* r,
+                           double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+lapack_int LAPACKE_cgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* r, float* c, float* rowcnd, float* colcnd,
+                           float* amax );
+lapack_int LAPACKE_zgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double* r, double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+
+lapack_int LAPACKE_sgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const float* a, lapack_int lda, float* r, float* c,
+                            float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const double* a, lapack_int lda, double* r,
+                            double* c, double* rowcnd, double* colcnd,
+                            double* amax );
+lapack_int LAPACKE_cgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* r, float* c, float* rowcnd, float* colcnd,
+                            float* amax );
+lapack_int LAPACKE_zgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* r, double* c, double* rowcnd,
+                            double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                          lapack_int lda, lapack_int* sdim, float* wr,
+                          float* wi, float* vs, lapack_int ldvs );
+lapack_int LAPACKE_dgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                          lapack_int lda, lapack_int* sdim, double* wr,
+                          double* wi, double* vs, lapack_int ldvs );
+lapack_int LAPACKE_cgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_C_SELECT1 select, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_int* sdim, lapack_complex_float* w,
+                          lapack_complex_float* vs, lapack_int ldvs );
+lapack_int LAPACKE_zgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_Z_SELECT1 select, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_int* sdim, lapack_complex_double* w,
+                          lapack_complex_double* vs, lapack_int ldvs );
+
+lapack_int LAPACKE_sgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_S_SELECT2 select, char sense, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* sdim,
+                           float* wr, float* wi, float* vs, lapack_int ldvs,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_D_SELECT2 select, char sense, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* sdim,
+                           double* wr, double* wi, double* vs, lapack_int ldvs,
+                           double* rconde, double* rcondv );
+lapack_int LAPACKE_cgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_C_SELECT1 select, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* sdim, lapack_complex_float* w,
+                           lapack_complex_float* vs, lapack_int ldvs,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_zgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_Z_SELECT1 select, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* sdim, lapack_complex_double* w,
+                           lapack_complex_double* vs, lapack_int ldvs,
+                           double* rconde, double* rcondv );
+
+lapack_int LAPACKE_sgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, float* a, lapack_int lda, float* wr,
+                          float* wi, float* vl, lapack_int ldvl, float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_dgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, double* a, lapack_int lda, double* wr,
+                          double* wi, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_cgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* w, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_zgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* w,
+                          lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr );
+
+lapack_int LAPACKE_sgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, float* a,
+                           lapack_int lda, float* wr, float* wi, float* vl,
+                           lapack_int ldvl, float* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, float* scale,
+                           float* abnrm, float* rconde, float* rcondv );
+lapack_int LAPACKE_dgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, double* a,
+                           lapack_int lda, double* wr, double* wi, double* vl,
+                           lapack_int ldvl, double* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, double* scale,
+                           double* abnrm, double* rconde, double* rcondv );
+lapack_int LAPACKE_cgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* w, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           float* scale, float* abnrm, float* rconde,
+                           float* rcondv );
+lapack_int LAPACKE_zgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* w, lapack_complex_double* vl,
+                           lapack_int ldvl, lapack_complex_double* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           double* scale, double* abnrm, double* rconde,
+                           double* rcondv );
+
+lapack_int LAPACKE_sgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, float* a, lapack_int lda,
+                           float* tau );
+lapack_int LAPACKE_dgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, double* a, lapack_int lda,
+                           double* tau );
+lapack_int LAPACKE_cgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* tau );
+lapack_int LAPACKE_zgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgejsv( int matrix_order, char joba, char jobu, char jobv,
+                           char jobr, char jobt, char jobp, lapack_int m,
+                           lapack_int n, float* a, lapack_int lda, float* sva,
+                           float* u, lapack_int ldu, float* v, lapack_int ldv,
+                           float* stat, lapack_int* istat );
+lapack_int LAPACKE_dgejsv( int matrix_order, char joba, char jobu, char jobv,
+                           char jobr, char jobt, char jobp, lapack_int m,
+                           lapack_int n, double* a, lapack_int lda, double* sva,
+                           double* u, lapack_int ldu, double* v, lapack_int ldv,
+                           double* stat, lapack_int* istat );
+
+lapack_int LAPACKE_sgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs, float* a,
+                          lapack_int lda, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs, double* a,
+                          lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_cgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+
+lapack_int LAPACKE_sgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_cgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+
+lapack_int LAPACKE_sgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, lapack_int* jpvt, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, lapack_int* jpvt,
+                           double rcond, lapack_int* rank );
+lapack_int LAPACKE_cgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_int* jpvt, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_int* jpvt, double rcond,
+                           lapack_int* rank );
+
+lapack_int LAPACKE_sgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* jpvt,
+                           float* tau );
+lapack_int LAPACKE_dgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* jpvt,
+                           double* tau );
+lapack_int LAPACKE_cgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* jpvt,
+                           float* tau );
+lapack_int LAPACKE_dgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* jpvt,
+                           double* tau );
+lapack_int LAPACKE_cgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_sgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_sgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, float* a, lapack_int lda, float* s,
+                           float* u, lapack_int ldu, float* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_dgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, double* a, lapack_int lda, double* s,
+                           double* u, lapack_int ldu, double* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_cgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_zgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* vt,
+                           lapack_int ldvt );
+
+lapack_int LAPACKE_sgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* a, lapack_int lda, lapack_int* ipiv, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* a, lapack_int lda, lapack_int* ipiv,
+                          double* b, lapack_int ldb );
+lapack_int LAPACKE_cgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dsgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                           double* a, lapack_int lda, lapack_int* ipiv,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           lapack_int* iter );
+lapack_int LAPACKE_zcgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* iter );
+
+lapack_int LAPACKE_sgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* s, float* u, lapack_int ldu, float* vt,
+                           lapack_int ldvt, float* superb );
+lapack_int LAPACKE_dgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* s, double* u, lapack_int ldu,
+                           double* vt, lapack_int ldvt, double* superb );
+lapack_int LAPACKE_cgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* vt,
+                           lapack_int ldvt, float* superb );
+lapack_int LAPACKE_zgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* vt,
+                           lapack_int ldvt, double* superb );
+
+lapack_int LAPACKE_sgesvj( int matrix_order, char joba, char jobu, char jobv,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* sva, lapack_int mv, float* v, lapack_int ldv,
+                           float* stat );
+lapack_int LAPACKE_dgesvj( int matrix_order, char joba, char jobu, char jobv,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* sva, lapack_int mv,
+                           double* v, lapack_int ldv, double* stat );
+
+lapack_int LAPACKE_sgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, float* a,
+                           lapack_int lda, float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c,
+                           float* b, lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_dgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, double* a,
+                           lapack_int lda, double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+lapack_int LAPACKE_cgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_zgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+
+lapack_int LAPACKE_sgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* r, float* c,
+                            float* b, lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* r, double* c,
+                            double* b, lapack_int ldb, double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* r, float* c,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* r, double* c,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_sgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetri( int matrix_order, lapack_int n, float* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dgetri( int matrix_order, lapack_int n, double* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_cgetri( int matrix_order, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zgetri( int matrix_order, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* lscale,
+                           const float* rscale, lapack_int m, float* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_dggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* lscale,
+                           const double* rscale, lapack_int m, double* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_cggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* lscale,
+                           const float* rscale, lapack_int m,
+                           lapack_complex_float* v, lapack_int ldv );
+lapack_int LAPACKE_zggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* lscale,
+                           const double* rscale, lapack_int m,
+                           lapack_complex_double* v, lapack_int ldv );
+
+lapack_int LAPACKE_sggbal( int matrix_order, char job, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale );
+lapack_int LAPACKE_dggbal( int matrix_order, char job, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale );
+lapack_int LAPACKE_cggbal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale );
+lapack_int LAPACKE_zggbal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale );
+
+lapack_int LAPACKE_sgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_S_SELECT3 selctg, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb,
+                          lapack_int* sdim, float* alphar, float* alphai,
+                          float* beta, float* vsl, lapack_int ldvsl, float* vsr,
+                          lapack_int ldvsr );
+lapack_int LAPACKE_dgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_D_SELECT3 selctg, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb,
+                          lapack_int* sdim, double* alphar, double* alphai,
+                          double* beta, double* vsl, lapack_int ldvsl,
+                          double* vsr, lapack_int ldvsr );
+lapack_int LAPACKE_cgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_C_SELECT2 selctg, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb,
+                          lapack_int* sdim, lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* vsl,
+                          lapack_int ldvsl, lapack_complex_float* vsr,
+                          lapack_int ldvsr );
+lapack_int LAPACKE_zgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_Z_SELECT2 selctg, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb,
+                          lapack_int* sdim, lapack_complex_double* alpha,
+                          lapack_complex_double* beta,
+                          lapack_complex_double* vsl, lapack_int ldvsl,
+                          lapack_complex_double* vsr, lapack_int ldvsr );
+
+lapack_int LAPACKE_sggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_S_SELECT3 selctg, char sense,
+                           lapack_int n, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, lapack_int* sdim, float* alphar,
+                           float* alphai, float* beta, float* vsl,
+                           lapack_int ldvsl, float* vsr, lapack_int ldvsr,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_D_SELECT3 selctg, char sense,
+                           lapack_int n, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, lapack_int* sdim, double* alphar,
+                           double* alphai, double* beta, double* vsl,
+                           lapack_int ldvsl, double* vsr, lapack_int ldvsr,
+                           double* rconde, double* rcondv );
+lapack_int LAPACKE_cggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_C_SELECT2 selctg, char sense,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_int* sdim,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta,
+                           lapack_complex_float* vsl, lapack_int ldvsl,
+                           lapack_complex_float* vsr, lapack_int ldvsr,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_zggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_int* sdim,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* vsl, lapack_int ldvsl,
+                           lapack_complex_double* vsr, lapack_int ldvsr,
+                           double* rconde, double* rcondv );
+
+lapack_int LAPACKE_sggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, float* a, lapack_int lda, float* b,
+                          lapack_int ldb, float* alphar, float* alphai,
+                          float* beta, float* vl, lapack_int ldvl, float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_dggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, double* a, lapack_int lda, double* b,
+                          lapack_int ldb, double* alphar, double* alphai,
+                          double* beta, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_cggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_zggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* alpha,
+                          lapack_complex_double* beta,
+                          lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr );
+
+lapack_int LAPACKE_sggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alphar, float* alphai, float* beta, float* vl,
+                           lapack_int ldvl, float* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale, float* abnrm, float* bbnrm,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alphar, double* alphai, double* beta,
+                           double* vl, lapack_int ldvl, double* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           double* lscale, double* rscale, double* abnrm,
+                           double* bbnrm, double* rconde, double* rcondv );
+lapack_int LAPACKE_cggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           float* lscale, float* rscale, float* abnrm,
+                           float* bbnrm, float* rconde, float* rcondv );
+lapack_int LAPACKE_zggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale, double* abnrm, double* bbnrm,
+                           double* rconde, double* rcondv );
+
+lapack_int LAPACKE_sggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* d, float* x, float* y );
+lapack_int LAPACKE_dggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* d, double* x, double* y );
+lapack_int LAPACKE_cggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* d,
+                           lapack_complex_float* x, lapack_complex_float* y );
+lapack_int LAPACKE_zggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* d,
+                           lapack_complex_double* x, lapack_complex_double* y );
+
+lapack_int LAPACKE_sgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           float* a, lapack_int lda, float* b, lapack_int ldb,
+                           float* q, lapack_int ldq, float* z, lapack_int ldz );
+lapack_int LAPACKE_dgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           double* q, lapack_int ldq, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_cgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* c, float* d, float* x );
+lapack_int LAPACKE_dgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* c, double* d, double* x );
+lapack_int LAPACKE_cgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* c,
+                           lapack_complex_float* d, lapack_complex_float* x );
+lapack_int LAPACKE_zgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* c,
+                           lapack_complex_double* d, lapack_complex_double* x );
+
+lapack_int LAPACKE_sggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, float* a, lapack_int lda, float* taua,
+                           float* b, lapack_int ldb, float* taub );
+lapack_int LAPACKE_dggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, double* a, lapack_int lda,
+                           double* taua, double* b, lapack_int ldb,
+                           double* taub );
+lapack_int LAPACKE_cggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* taua,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* taub );
+lapack_int LAPACKE_zggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* taua,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* taub );
+
+lapack_int LAPACKE_sggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, float* a, lapack_int lda, float* taua,
+                           float* b, lapack_int ldb, float* taub );
+lapack_int LAPACKE_dggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, double* a, lapack_int lda,
+                           double* taua, double* b, lapack_int ldb,
+                           double* taub );
+lapack_int LAPACKE_cggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* taua,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* taub );
+lapack_int LAPACKE_zggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* taua,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* taub );
+
+lapack_int LAPACKE_sggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* iwork );
+lapack_int LAPACKE_dggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alpha, double* beta, double* u,
+                           lapack_int ldu, double* v, lapack_int ldv, double* q,
+                           lapack_int ldq, lapack_int* iwork );
+lapack_int LAPACKE_cggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           float* alpha, float* beta, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* v,
+                           lapack_int ldv, lapack_complex_float* q,
+                           lapack_int ldq, lapack_int* iwork );
+lapack_int LAPACKE_zggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* iwork );
+
+lapack_int LAPACKE_sggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l, float* u,
+                           lapack_int ldu, float* v, lapack_int ldv, float* q,
+                           lapack_int ldq );
+lapack_int LAPACKE_dggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq );
+lapack_int LAPACKE_cggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l,
+                           lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* v,
+                           lapack_int ldv, lapack_complex_double* q,
+                           lapack_int ldq );
+
+lapack_int LAPACKE_sgtcon( char norm, lapack_int n, const float* dl,
+                           const float* d, const float* du, const float* du2,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_dgtcon( char norm, lapack_int n, const double* dl,
+                           const double* d, const double* du, const double* du2,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cgtcon( char norm, lapack_int n,
+                           const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zgtcon( char norm, lapack_int n,
+                           const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* dl, const float* d,
+                           const float* du, const float* dlf, const float* df,
+                           const float* duf, const float* du2,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* dl, const double* d,
+                           const double* du, const double* dlf,
+                           const double* df, const double* duf,
+                           const double* du2, const lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* dlf,
+                           const lapack_complex_float* df,
+                           const lapack_complex_float* duf,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* dlf,
+                           const lapack_complex_double* df,
+                           const lapack_complex_double* duf,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* dl, float* d, float* du, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* dl, double* d, double* du, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* dl, lapack_complex_float* d,
+                          lapack_complex_float* du, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* dl, lapack_complex_double* d,
+                          lapack_complex_double* du, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, const float* dl,
+                           const float* d, const float* du, float* dlf,
+                           float* df, float* duf, float* du2, lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, const double* dl,
+                           const double* d, const double* du, double* dlf,
+                           double* df, double* duf, double* du2,
+                           lapack_int* ipiv, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           lapack_complex_float* dlf, lapack_complex_float* df,
+                           lapack_complex_float* duf, lapack_complex_float* du2,
+                           lapack_int* ipiv, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           lapack_complex_double* dlf,
+                           lapack_complex_double* df,
+                           lapack_complex_double* duf,
+                           lapack_complex_double* du2, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_sgttrf( lapack_int n, float* dl, float* d, float* du,
+                           float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_dgttrf( lapack_int n, double* dl, double* d, double* du,
+                           double* du2, lapack_int* ipiv );
+lapack_int LAPACKE_cgttrf( lapack_int n, lapack_complex_float* dl,
+                           lapack_complex_float* d, lapack_complex_float* du,
+                           lapack_complex_float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_zgttrf( lapack_int n, lapack_complex_double* dl,
+                           lapack_complex_double* d, lapack_complex_double* du,
+                           lapack_complex_double* du2, lapack_int* ipiv );
+
+lapack_int LAPACKE_sgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* dl, const float* d,
+                           const float* du, const float* du2,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* dl, const double* d,
+                           const double* du, const double* du2,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_chbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, lapack_complex_float* ab,
+                          lapack_int ldab, float* w, lapack_complex_float* z,
+                          lapack_int ldz );
+lapack_int LAPACKE_zhbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, lapack_complex_double* ab,
+                          lapack_int ldab, double* w, lapack_complex_double* z,
+                          lapack_int ldz );
+
+lapack_int LAPACKE_chbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab, float* w, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab, double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_chbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* q, lapack_int ldq, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* bb, lapack_int ldbb,
+                           lapack_complex_float* x, lapack_int ldx );
+lapack_int LAPACKE_zhbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* bb, lapack_int ldbb,
+                           lapack_complex_double* x, lapack_int ldx );
+
+lapack_int LAPACKE_chbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_complex_float* bb, lapack_int ldbb, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_complex_double* bb, lapack_int ldbb, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* bb, lapack_int ldbb, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* bb, lapack_int ldbb,
+                           double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_chbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* bb, lapack_int ldbb,
+                           lapack_complex_float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* bb, lapack_int ldbb,
+                           lapack_complex_double* q, lapack_int ldq, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab, float* d, float* e,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zhbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab, double* d, double* e,
+                           lapack_complex_double* q, lapack_int ldq );
+
+lapack_int LAPACKE_checon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zhecon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_cheequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zheequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_cheev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_zheev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, double* w );
+
+lapack_int LAPACKE_cheevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_zheevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           double* w );
+
+lapack_int LAPACKE_cheevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_zheevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* isuppz );
+
+lapack_int LAPACKE_cheevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_zheevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chegst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zhegst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_chegv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb, float* w );
+lapack_int LAPACKE_zhegv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, double* w );
+
+lapack_int LAPACKE_chegvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* w );
+lapack_int LAPACKE_zhegvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* w );
+
+lapack_int LAPACKE_chegvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhegvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_cherfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zherfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_cherfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zherfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_chesv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zhesv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chesvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zhesvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_chesvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zhesvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_chetrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* d,
+                           float* e, lapack_complex_float* tau );
+lapack_int LAPACKE_zhetrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda, double* d,
+                           double* e, lapack_complex_double* tau );
+
+lapack_int LAPACKE_chetrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zhetrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_chetri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zhetri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+
+lapack_int LAPACKE_chetrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, float alpha,
+                          const lapack_complex_float* a, lapack_int lda,
+                          float beta, lapack_complex_float* c );
+lapack_int LAPACKE_zhfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, double alpha,
+                          const lapack_complex_double* a, lapack_int lda,
+                          double beta, lapack_complex_double* c );
+
+lapack_int LAPACKE_shgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           float* h, lapack_int ldh, float* t, lapack_int ldt,
+                           float* alphar, float* alphai, float* beta, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz );
+lapack_int LAPACKE_dhgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           double* h, lapack_int ldh, double* t, lapack_int ldt,
+                           double* alphar, double* alphai, double* beta,
+                           double* q, lapack_int ldq, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_chgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* q,
+                           lapack_int ldq, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zhpcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_chpev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* ap, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* ap, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_float* ap, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_double* ap, double* w,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* ap, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhpevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* ap, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chpgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_float* ap,
+                           const lapack_complex_float* bp );
+lapack_int LAPACKE_zhpgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_double* ap,
+                           const lapack_complex_double* bp );
+
+lapack_int LAPACKE_chpgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_float* ap,
+                          lapack_complex_float* bp, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_double* ap,
+                          lapack_complex_double* bp, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_float* ap,
+                           lapack_complex_float* bp, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_double* ap,
+                           lapack_complex_double* bp, double* w,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_complex_float* bp,
+                           float vl, float vu, lapack_int il, lapack_int iu,
+                           float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_zhpgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_complex_double* bp,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zhprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_chpsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zhpsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_chpsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* afp, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zhpsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* afp, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_chptrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, float* d, float* e,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zhptrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, double* d, double* e,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_chptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zhptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_chptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_zhptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, const lapack_int* ipiv );
+
+lapack_int LAPACKE_chptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zhptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_shsein( int matrix_order, char job, char eigsrc, char initv,
+                           lapack_logical* select, lapack_int n, const float* h,
+                           lapack_int ldh, float* wr, const float* wi,
+                           float* vl, lapack_int ldvl, float* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_dhsein( int matrix_order, char job, char eigsrc, char initv,
+                           lapack_logical* select, lapack_int n,
+                           const double* h, lapack_int ldh, double* wr,
+                           const double* wi, double* vl, lapack_int ldvl,
+                           double* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m, lapack_int* ifaill,
+                           lapack_int* ifailr );
+lapack_int LAPACKE_chsein( int matrix_order, char job, char eigsrc, char initv,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* w, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_zhsein( int matrix_order, char job, char eigsrc, char initv,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* w, lapack_complex_double* vl,
+                           lapack_int ldvl, lapack_complex_double* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+
+lapack_int LAPACKE_shseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, float* h,
+                           lapack_int ldh, float* wr, float* wi, float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_dhseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, double* h,
+                           lapack_int ldh, double* wr, double* wi, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_chseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* w, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
+                           lapack_int incx );
+lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
+                           lapack_int incx );
+
+lapack_int LAPACKE_slacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const float* a, lapack_int lda, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dlacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const double* a, lapack_int lda, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_clacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zlacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_zlag2c( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_float* sa, lapack_int ldsa );
+
+lapack_int LAPACKE_slag2d( int matrix_order, lapack_int m, lapack_int n,
+                           const float* sa, lapack_int ldsa, double* a,
+                           lapack_int lda );
+
+lapack_int LAPACKE_dlag2s( int matrix_order, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, float* sa,
+                           lapack_int ldsa );
+
+lapack_int LAPACKE_clag2z( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_float* sa, lapack_int ldsa,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* d,
+                           float* a, lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_dlagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* d,
+                           double* a, lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_clagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* d,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_zlagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* d,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* iseed );
+
+float LAPACKE_slamch( char cmach );
+double LAPACKE_dlamch( char cmach );
+
+float LAPACKE_slange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const float* a, lapack_int lda );
+double LAPACKE_dlange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const double* a, lapack_int lda );
+float LAPACKE_clange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda );
+double LAPACKE_zlange( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda );
+
+float LAPACKE_clanhe( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlanhe( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda );
+
+float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const float* a, lapack_int lda );
+double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const double* a, lapack_int lda );
+float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda );
+
+float LAPACKE_slantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda );
+double LAPACKE_dlantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda );
+float LAPACKE_clantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda );
+double LAPACKE_zlantr( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int m, lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda );
+
+
+lapack_int LAPACKE_slarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const float* v, lapack_int ldv,
+                           const float* t, lapack_int ldt, float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_dlarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const double* v, lapack_int ldv,
+                           const double* t, lapack_int ldt, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_clarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const lapack_complex_float* v,
+                           lapack_int ldv, const lapack_complex_float* t,
+                           lapack_int ldt, lapack_complex_float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_zlarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const lapack_complex_double* v,
+                           lapack_int ldv, const lapack_complex_double* t,
+                           lapack_int ldt, lapack_complex_double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
+                           lapack_int incx, float* tau );
+lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
+                           lapack_int incx, double* tau );
+lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
+                           lapack_complex_float* x, lapack_int incx,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
+                           lapack_complex_double* x, lapack_int incx,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_slarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k, const float* v,
+                           lapack_int ldv, const float* tau, float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_dlarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k, const double* v,
+                           lapack_int ldv, const double* tau, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_clarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k,
+                           const lapack_complex_float* v, lapack_int ldv,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zlarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k,
+                           const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_slarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const float* v, float tau, float* c,
+                           lapack_int ldc, float* work );
+lapack_int LAPACKE_dlarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const double* v, double tau, double* c,
+                           lapack_int ldc, double* work );
+lapack_int LAPACKE_clarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const lapack_complex_float* v,
+                           lapack_complex_float tau, lapack_complex_float* c,
+                           lapack_int ldc, lapack_complex_float* work );
+lapack_int LAPACKE_zlarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const lapack_complex_double* v,
+                           lapack_complex_double tau, lapack_complex_double* c,
+                           lapack_int ldc, lapack_complex_double* work );
+
+lapack_int LAPACKE_slarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           float* x );
+lapack_int LAPACKE_dlarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           double* x );
+lapack_int LAPACKE_clarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           lapack_complex_float* x );
+lapack_int LAPACKE_zlarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           lapack_complex_double* x );
+
+lapack_int LAPACKE_slaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, float alpha, float beta, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, double alpha, double beta, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_claset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, lapack_complex_float alpha,
+                           lapack_complex_float beta, lapack_complex_float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_zlaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, lapack_complex_double alpha,
+                           lapack_complex_double beta, lapack_complex_double* a,
+                           lapack_int lda );
+
+lapack_int LAPACKE_slasrt( char id, lapack_int n, float* d );
+lapack_int LAPACKE_dlasrt( char id, lapack_int n, double* d );
+
+lapack_int LAPACKE_slaswp( int matrix_order, lapack_int n, float* a,
+                           lapack_int lda, lapack_int k1, lapack_int k2,
+                           const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_dlaswp( int matrix_order, lapack_int n, double* a,
+                           lapack_int lda, lapack_int k1, lapack_int k2,
+                           const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_claswp( int matrix_order, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int k1, lapack_int k2, const lapack_int* ipiv,
+                           lapack_int incx );
+lapack_int LAPACKE_zlaswp( int matrix_order, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int k1, lapack_int k2, const lapack_int* ipiv,
+                           lapack_int incx );
+
+lapack_int LAPACKE_slatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, float* d,
+                           lapack_int mode, float cond, float dmax,
+                           lapack_int kl, lapack_int ku, char pack, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, double* d,
+                           lapack_int mode, double cond, double dmax,
+                           lapack_int kl, lapack_int ku, char pack, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_clatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, float* d,
+                           lapack_int mode, float cond, float dmax,
+                           lapack_int kl, lapack_int ku, char pack,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, double* d,
+                           lapack_int mode, double cond, double dmax,
+                           lapack_int kl, lapack_int ku, char pack,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slauum( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlauum( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_clauum( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlauum( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_sopgtr( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, const float* tau, float* q,
+                           lapack_int ldq );
+lapack_int LAPACKE_dopgtr( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, const double* tau, double* q,
+                           lapack_int ldq );
+
+lapack_int LAPACKE_sopmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const float* ap,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dopmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const double* ap,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sorgbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, double* a,
+                           lapack_int lda, const double* tau );
+
+lapack_int LAPACKE_sorghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgtr( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, const float* tau );
+lapack_int LAPACKE_dorgtr( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, const double* tau );
+
+lapack_int LAPACKE_sormbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const float* a, lapack_int lda,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dormhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const double* a, lapack_int lda,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc );
+lapack_int LAPACKE_dormrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const float* a, lapack_int lda,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dormrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const double* a, lapack_int lda,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sormtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda, const float* tau, float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_dormtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda, const double* tau, double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_spbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const float* ab, lapack_int ldab,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_dpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const double* ab, lapack_int ldab,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_float* ab,
+                           lapack_int ldab, float anorm, float* rcond );
+lapack_int LAPACKE_zpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_double* ab,
+                           lapack_int ldab, double anorm, double* rcond );
+
+lapack_int LAPACKE_spbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const float* ab, lapack_int ldab,
+                           float* s, float* scond, float* amax );
+lapack_int LAPACKE_dpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const double* ab, lapack_int ldab,
+                           double* s, double* scond, double* amax );
+lapack_int LAPACKE_cpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_float* ab,
+                           lapack_int ldab, float* s, float* scond,
+                           float* amax );
+lapack_int LAPACKE_zpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_double* ab,
+                           lapack_int ldab, double* s, double* scond,
+                           double* amax );
+
+lapack_int LAPACKE_spbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const float* ab,
+                           lapack_int ldab, const float* afb, lapack_int ldafb,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const double* ab,
+                           lapack_int ldab, const double* afb, lapack_int ldafb,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_spbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, float* bb, lapack_int ldbb );
+lapack_int LAPACKE_dpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, double* bb, lapack_int ldbb );
+lapack_int LAPACKE_cpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, lapack_complex_float* bb,
+                           lapack_int ldbb );
+lapack_int LAPACKE_zpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, lapack_complex_double* bb,
+                           lapack_int ldbb );
+
+lapack_int LAPACKE_spbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs, float* ab,
+                          lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs, double* ab,
+                          lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, float* ab,
+                           lapack_int ldab, float* afb, lapack_int ldafb,
+                           char* equed, float* s, float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, double* ab,
+                           lapack_int ldab, double* afb, lapack_int ldafb,
+                           char* equed, double* s, double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* afb, lapack_int ldafb,
+                           char* equed, float* s, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* afb, lapack_int ldafb,
+                           char* equed, double* s, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr );
+
+lapack_int LAPACKE_spbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab );
+lapack_int LAPACKE_dpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab );
+lapack_int LAPACKE_cpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab );
+lapack_int LAPACKE_zpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab );
+
+lapack_int LAPACKE_spbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const float* ab,
+                           lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const double* ab,
+                           lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_cpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_cpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_cpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spocon( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dpocon( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cpocon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_zpocon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double anorm, double* rcond );
+
+lapack_int LAPACKE_spoequ( int matrix_order, lapack_int n, const float* a,
+                           lapack_int lda, float* s, float* scond,
+                           float* amax );
+lapack_int LAPACKE_dpoequ( int matrix_order, lapack_int n, const double* a,
+                           lapack_int lda, double* s, double* scond,
+                           double* amax );
+lapack_int LAPACKE_cpoequ( int matrix_order, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequ( int matrix_order, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_spoequb( int matrix_order, lapack_int n, const float* a,
+                            lapack_int lda, float* s, float* scond,
+                            float* amax );
+lapack_int LAPACKE_dpoequb( int matrix_order, lapack_int n, const double* a,
+                            lapack_int lda, double* s, double* scond,
+                            double* amax );
+lapack_int LAPACKE_cpoequb( int matrix_order, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequb( int matrix_order, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_sporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_zporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+
+lapack_int LAPACKE_sporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const float* s, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const double* s, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const float* s, const lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const double* s, const lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_sposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* a, lapack_int lda, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* a, lapack_int lda, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dsposv( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           lapack_int* iter );
+lapack_int LAPACKE_zcposv( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* iter );
+
+lapack_int LAPACKE_sposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* af,
+                           lapack_int ldaf, char* equed, float* s, float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_dposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* af, lapack_int ldaf, char* equed, double* s,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+lapack_int LAPACKE_cposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, char* equed, float* s,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, char* equed, double* s,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_sposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            char* equed, float* s, float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond,
+                            float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_dposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            char* equed, double* s, double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* rpvgrw, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            char* equed, float* s, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* rpvgrw,
+                            float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            char* equed, double* s, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_spotrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dpotrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_cpotrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotri( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dpotri( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_cpotri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_cpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_sppcon( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float anorm, float* rcond );
+lapack_int LAPACKE_dppcon( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double anorm, double* rcond );
+lapack_int LAPACKE_cppcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_zppcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sppequ( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float* s, float* scond,
+                           float* amax );
+lapack_int LAPACKE_dppequ( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double* s, double* scond,
+                           double* amax );
+lapack_int LAPACKE_cppequ( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap, float* s,
+                           float* scond, float* amax );
+lapack_int LAPACKE_zppequ( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap, double* s,
+                           double* scond, double* amax );
+
+lapack_int LAPACKE_spprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, const float* afp,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, const double* afp,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* ap, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* ap, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, float* ap, float* afp, char* equed,
+                           float* s, float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, double* ap, double* afp,
+                           char* equed, double* s, double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* ap,
+                           lapack_complex_float* afp, char* equed, float* s,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* ap,
+                           lapack_complex_double* afp, char* equed, double* s,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_spptrf( int matrix_order, char uplo, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dpptrf( int matrix_order, char uplo, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_cpptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_zpptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptri( int matrix_order, char uplo, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dpptri( int matrix_order, char uplo, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_cpptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_zpptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_cpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spstrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* piv, lapack_int* rank,
+                           float tol );
+lapack_int LAPACKE_dpstrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* piv, lapack_int* rank,
+                           double tol );
+lapack_int LAPACKE_cpstrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* piv, lapack_int* rank, float tol );
+lapack_int LAPACKE_zpstrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* piv, lapack_int* rank, double tol );
+
+lapack_int LAPACKE_sptcon( lapack_int n, const float* d, const float* e,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_dptcon( lapack_int n, const double* d, const double* e,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cptcon( lapack_int n, const float* d,
+                           const lapack_complex_float* e, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_zptcon( lapack_int n, const double* d,
+                           const lapack_complex_double* e, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_spteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dpteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_cpteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zpteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_sptrfs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const float* d, const float* e, const float* df,
+                           const float* ef, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dptrfs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const double* d, const double* e, const double* df,
+                           const double* ef, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_cptrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e, const float* df,
+                           const lapack_complex_float* ef,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zptrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e, const double* df,
+                           const lapack_complex_double* ef,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* d, float* e, float* b, lapack_int ldb );
+lapack_int LAPACKE_dptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* d, double* e, double* b, lapack_int ldb );
+lapack_int LAPACKE_cptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* d, lapack_complex_float* e,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* d, lapack_complex_double* e,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const float* d, const float* e,
+                           float* df, float* ef, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const double* d, const double* e,
+                           double* df, double* ef, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+lapack_int LAPACKE_cptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e, float* df,
+                           lapack_complex_float* ef,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e, double* df,
+                           lapack_complex_double* ef,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_spttrf( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dpttrf( lapack_int n, double* d, double* e );
+lapack_int LAPACKE_cpttrf( lapack_int n, float* d, lapack_complex_float* e );
+lapack_int LAPACKE_zpttrf( lapack_int n, double* d, lapack_complex_double* e );
+
+lapack_int LAPACKE_spttrs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const float* d, const float* e, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dpttrs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const double* d, const double* e, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_cpttrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpttrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, float* ab, lapack_int ldab, float* w,
+                          float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, double* ab, lapack_int ldab, double* w,
+                          double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab, float* w,
+                           float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab,
+                           double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd, float* ab,
+                           lapack_int ldab, float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd, double* ab,
+                           lapack_int ldab, double* q, lapack_int ldq,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, float* ab,
+                           lapack_int ldab, const float* bb, lapack_int ldbb,
+                           float* x, lapack_int ldx );
+lapack_int LAPACKE_dsbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, double* ab,
+                           lapack_int ldab, const double* bb, lapack_int ldbb,
+                           double* x, lapack_int ldx );
+
+lapack_int LAPACKE_ssbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb, float* ab,
+                          lapack_int ldab, float* bb, lapack_int ldbb, float* w,
+                          float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb, double* ab,
+                          lapack_int ldab, double* bb, lapack_int ldbb,
+                          double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, float* ab,
+                           lapack_int ldab, float* bb, lapack_int ldbb,
+                           float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, double* ab,
+                           lapack_int ldab, double* bb, lapack_int ldbb,
+                           double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_ssbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           float* ab, lapack_int ldab, float* bb,
+                           lapack_int ldbb, float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           double* ab, lapack_int ldab, double* bb,
+                           lapack_int ldbb, double* q, lapack_int ldq,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab, float* d,
+                           float* e, float* q, lapack_int ldq );
+lapack_int LAPACKE_dsbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab,
+                           double* d, double* e, double* q, lapack_int ldq );
+
+lapack_int LAPACKE_ssfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, float alpha,
+                          const float* a, lapack_int lda, float beta,
+                          float* c );
+lapack_int LAPACKE_dsfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, double alpha,
+                          const double* a, lapack_int lda, double beta,
+                          double* c );
+
+lapack_int LAPACKE_sspcon( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, const lapack_int* ipiv, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dspcon( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, const lapack_int* ipiv,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cspcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zspcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sspev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          float* ap, float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          double* ap, double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           float* ap, float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           double* ap, double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* ap, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dspevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* ap, double vl, double vu,
+                           lapack_int il, lapack_int iu, double abstol,
+                           lapack_int* m, double* w, double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_sspgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, float* ap, const float* bp );
+lapack_int LAPACKE_dspgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, double* ap, const double* bp );
+
+lapack_int LAPACKE_sspgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, float* ap, float* bp,
+                          float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, double* ap, double* bp,
+                          double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, float* ap, float* bp,
+                           float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, double* ap, double* bp,
+                           double* w, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sspgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, float* ap,
+                           float* bp, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           float* z, lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_dspgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, double* ap,
+                           double* bp, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_ssprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, const float* afp,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dsprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, const double* afp,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_csprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zsprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* ap, lapack_int* ipiv,
+                          float* b, lapack_int ldb );
+lapack_int LAPACKE_dspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* ap, lapack_int* ipiv,
+                          double* b, lapack_int ldb );
+lapack_int LAPACKE_cspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, float* afp,
+                           lapack_int* ipiv, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, double* afp,
+                           lapack_int* ipiv, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* afp, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* afp, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_ssptrd( int matrix_order, char uplo, lapack_int n, float* ap,
+                           float* d, float* e, float* tau );
+lapack_int LAPACKE_dsptrd( int matrix_order, char uplo, lapack_int n,
+                           double* ap, double* d, double* e, double* tau );
+
+lapack_int LAPACKE_ssptrf( int matrix_order, char uplo, lapack_int n, float* ap,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_dsptrf( int matrix_order, char uplo, lapack_int n,
+                           double* ap, lapack_int* ipiv );
+lapack_int LAPACKE_csptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zsptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_ssptri( int matrix_order, char uplo, lapack_int n, float* ap,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_dsptri( int matrix_order, char uplo, lapack_int n,
+                           double* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_csptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_zsptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, const lapack_int* ipiv );
+
+lapack_int LAPACKE_ssptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zsptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_sstebz( char range, char order, lapack_int n, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           const float* d, const float* e, lapack_int* m,
+                           lapack_int* nsplit, float* w, lapack_int* iblock,
+                           lapack_int* isplit );
+lapack_int LAPACKE_dstebz( char range, char order, lapack_int n, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, const double* d, const double* e,
+                           lapack_int* m, lapack_int* nsplit, double* w,
+                           lapack_int* iblock, lapack_int* isplit );
+
+lapack_int LAPACKE_sstedc( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstedc( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_cstedc( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zstedc( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_sstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+lapack_int LAPACKE_cstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* isuppz );
+lapack_int LAPACKE_zstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* isuppz );
+
+lapack_int LAPACKE_sstein( int matrix_order, lapack_int n, const float* d,
+                           const float* e, lapack_int m, const float* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           float* z, lapack_int ldz, lapack_int* ifailv );
+lapack_int LAPACKE_dstein( int matrix_order, lapack_int n, const double* d,
+                           const double* e, lapack_int m, const double* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           double* z, lapack_int ldz, lapack_int* ifailv );
+lapack_int LAPACKE_cstein( int matrix_order, lapack_int n, const float* d,
+                           const float* e, lapack_int m, const float* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifailv );
+lapack_int LAPACKE_zstein( int matrix_order, lapack_int n, const double* d,
+                           const double* e, lapack_int m, const double* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifailv );
+
+lapack_int LAPACKE_sstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, lapack_int* m,
+                           float* w, float* z, lapack_int ldz, lapack_int nzc,
+                           lapack_int* isuppz, lapack_logical* tryrac );
+lapack_int LAPACKE_dstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           lapack_int* m, double* w, double* z, lapack_int ldz,
+                           lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+lapack_int LAPACKE_cstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, lapack_int* m,
+                           float* w, lapack_complex_float* z, lapack_int ldz,
+                           lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+lapack_int LAPACKE_zstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           lapack_int* m, double* w, lapack_complex_double* z,
+                           lapack_int ldz, lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+
+lapack_int LAPACKE_ssteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dsteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_csteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zsteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_ssterf( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dsterf( lapack_int n, double* d, double* e );
+
+lapack_int LAPACKE_sstev( int matrix_order, char jobz, lapack_int n, float* d,
+                          float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstev( int matrix_order, char jobz, lapack_int n, double* d,
+                          double* e, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sstevd( int matrix_order, char jobz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstevd( int matrix_order, char jobz, lapack_int n, double* d,
+                           double* e, double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sstevr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dstevr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+
+lapack_int LAPACKE_sstevx( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dstevx( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssycon( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_dsycon( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_csycon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zsycon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_ssyequb( int matrix_order, char uplo, lapack_int n,
+                            const float* a, lapack_int lda, float* s,
+                            float* scond, float* amax );
+lapack_int LAPACKE_dsyequb( int matrix_order, char uplo, lapack_int n,
+                            const double* a, lapack_int lda, double* s,
+                            double* scond, double* amax );
+lapack_int LAPACKE_csyequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zsyequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_ssyev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_dsyev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          double* a, lapack_int lda, double* w );
+
+lapack_int LAPACKE_ssyevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_dsyevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           double* a, lapack_int lda, double* w );
+
+lapack_int LAPACKE_ssyevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* a, lapack_int lda, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dsyevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* a, lapack_int lda, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+
+lapack_int LAPACKE_ssyevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* a, lapack_int lda, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsyevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* a, lapack_int lda, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssygst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, float* a, lapack_int lda,
+                           const float* b, lapack_int ldb );
+lapack_int LAPACKE_dsygst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, double* a, lapack_int lda,
+                           const double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssygv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* w );
+lapack_int LAPACKE_dsygv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* w );
+
+lapack_int LAPACKE_ssygvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, float* a, lapack_int lda,
+                           float* b, lapack_int ldb, float* w );
+lapack_int LAPACKE_dsygvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* w );
+
+lapack_int LAPACKE_ssygvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsygvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+
+lapack_int LAPACKE_ssyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dsyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_csyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zsyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_ssyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const float* b, lapack_int ldb, float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dsyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const double* b, lapack_int ldb, double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_csyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zsyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_ssysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* a, lapack_int lda,
+                          lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* a, lapack_int lda,
+                          lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           float* af, lapack_int ldaf, lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dsysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           double* af, lapack_int ldaf, lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_csysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zsysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_ssysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s, float* b,
+                            lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dsysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s, double* b,
+                            lapack_int ldb, double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_csysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zsysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+
+lapack_int LAPACKE_ssytrd( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, float* d, float* e, float* tau );
+lapack_int LAPACKE_dsytrd( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, double* d, double* e, double* tau );
+
+lapack_int LAPACKE_ssytrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dsytrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_csytrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zsytrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+
+lapack_int LAPACKE_ssytri( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsytri( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_csytri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zsytri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+
+lapack_int LAPACKE_ssytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd, const float* ab,
+                           lapack_int ldab, float* rcond );
+lapack_int LAPACKE_dtbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd, const double* ab,
+                           lapack_int ldab, double* rcond );
+lapack_int LAPACKE_ctbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           float* rcond );
+lapack_int LAPACKE_ztbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           double* rcond );
+
+lapack_int LAPACKE_stbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, const float* b,
+                           lapack_int ldb, const float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dtbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, const double* b,
+                           lapack_int ldb, const double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_ctbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_stbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dtbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_ctbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          float alpha, const float* a, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dtfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          double alpha, const double* a, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_ctfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          lapack_complex_float alpha,
+                          const lapack_complex_float* a,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          lapack_complex_double alpha,
+                          const lapack_complex_double* a,
+                          lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dtftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_ctftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_ztftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_stfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* arf, float* ap );
+lapack_int LAPACKE_dtfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* arf, double* ap );
+lapack_int LAPACKE_ctfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* arf,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* arf,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_stfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* arf, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dtfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* arf, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_ctfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* arf,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* arf,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_stgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* s, lapack_int lds, const float* p,
+                           lapack_int ldp, float* vl, lapack_int ldvl,
+                           float* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_dtgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* s, lapack_int lds, const double* p,
+                           lapack_int ldp, double* vl, lapack_int ldvl,
+                           double* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_ctgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* s, lapack_int lds,
+                           const lapack_complex_float* p, lapack_int ldp,
+                           lapack_complex_float* vl, lapack_int ldvl,
+                           lapack_complex_float* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* s, lapack_int lds,
+                           const lapack_complex_double* p, lapack_int ldp,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+
+lapack_int LAPACKE_stgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_dtgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double* q,
+                           lapack_int ldq, double* z, lapack_int ldz,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_ctgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_stgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alphar, float* alphai, float* beta, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz,
+                           lapack_int* m, float* pl, float* pr, float* dif );
+lapack_int LAPACKE_dtgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           double* alphar, double* alphai, double* beta,
+                           double* q, lapack_int ldq, double* z, lapack_int ldz,
+                           lapack_int* m, double* pl, double* pr, double* dif );
+lapack_int LAPACKE_ctgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* q,
+                           lapack_int ldq, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* m, float* pl, float* pr,
+                           float* dif );
+lapack_int LAPACKE_ztgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* m, double* pl, double* pr, double* dif );
+
+lapack_int LAPACKE_stgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, float* a, lapack_int lda,
+                           float* b, lapack_int ldb, float tola, float tolb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_dtgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, double* alpha,
+                           double* beta, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_ctgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float tola, float tolb, float* alpha,
+                           float* beta, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_ztgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double tola, double tolb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* ncycle );
+
+lapack_int LAPACKE_stgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* a, lapack_int lda, const float* b,
+                           lapack_int ldb, const float* vl, lapack_int ldvl,
+                           const float* vr, lapack_int ldvr, float* s,
+                           float* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_dtgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* a, lapack_int lda, const double* b,
+                           lapack_int ldb, const double* vl, lapack_int ldvl,
+                           const double* vr, lapack_int ldvr, double* s,
+                           double* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ctgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* vl, lapack_int ldvl,
+                           const lapack_complex_float* vr, lapack_int ldvr,
+                           float* s, float* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* vl, lapack_int ldvl,
+                           const lapack_complex_double* vr, lapack_int ldvr,
+                           double* s, double* dif, lapack_int mm,
+                           lapack_int* m );
+
+lapack_int LAPACKE_stgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda, const float* b, lapack_int ldb,
+                           float* c, lapack_int ldc, const float* d,
+                           lapack_int ldd, const float* e, lapack_int lde,
+                           float* f, lapack_int ldf, float* scale, float* dif );
+lapack_int LAPACKE_dtgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda, const double* b, lapack_int ldb,
+                           double* c, lapack_int ldc, const double* d,
+                           lapack_int ldd, const double* e, lapack_int lde,
+                           double* f, lapack_int ldf, double* scale,
+                           double* dif );
+lapack_int LAPACKE_ctgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* c, lapack_int ldc,
+                           const lapack_complex_float* d, lapack_int ldd,
+                           const lapack_complex_float* e, lapack_int lde,
+                           lapack_complex_float* f, lapack_int ldf,
+                           float* scale, float* dif );
+lapack_int LAPACKE_ztgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* c, lapack_int ldc,
+                           const lapack_complex_double* d, lapack_int ldd,
+                           const lapack_complex_double* e, lapack_int lde,
+                           lapack_complex_double* f, lapack_int ldf,
+                           double* scale, double* dif );
+
+lapack_int LAPACKE_stpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const float* ap, float* rcond );
+lapack_int LAPACKE_dtpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const double* ap, double* rcond );
+lapack_int LAPACKE_ctpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_float* ap,
+                           float* rcond );
+lapack_int LAPACKE_ztpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_double* ap,
+                           double* rcond );
+
+lapack_int LAPACKE_stprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* ap,
+                           const float* b, lapack_int ldb, const float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dtprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* ap,
+                           const double* b, lapack_int ldb, const double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_ctprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_stptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dtptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_ctptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_stptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* ap,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dtptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* ap,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_ctptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* ap,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* ap,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* ap, float* arf );
+lapack_int LAPACKE_dtpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* ap, double* arf );
+lapack_int LAPACKE_ctpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* ap,
+                           lapack_complex_float* arf );
+lapack_int LAPACKE_ztpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* ap,
+                           lapack_complex_double* arf );
+
+lapack_int LAPACKE_stpttr( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float* a, lapack_int lda );
+lapack_int LAPACKE_dtpttr( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double* a, lapack_int lda );
+lapack_int LAPACKE_ctpttr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztpttr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_strcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* rcond );
+lapack_int LAPACKE_dtrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* rcond );
+lapack_int LAPACKE_ctrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, float* rcond );
+lapack_int LAPACKE_ztrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, double* rcond );
+
+lapack_int LAPACKE_strevc( int matrix_order, char side, char howmny,
+                           lapack_logical* select, lapack_int n, const float* t,
+                           lapack_int ldt, float* vl, lapack_int ldvl,
+                           float* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_dtrevc( int matrix_order, char side, char howmny,
+                           lapack_logical* select, lapack_int n,
+                           const double* t, lapack_int ldt, double* vl,
+                           lapack_int ldvl, double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ctrevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* vl, lapack_int ldvl,
+                           lapack_complex_float* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztrevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+
+lapack_int LAPACKE_strexc( int matrix_order, char compq, lapack_int n, float* t,
+                           lapack_int ldt, float* q, lapack_int ldq,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_dtrexc( int matrix_order, char compq, lapack_int n,
+                           double* t, lapack_int ldt, double* q, lapack_int ldq,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_ctrexc( int matrix_order, char compq, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztrexc( int matrix_order, char compq, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_strrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, const float* b, lapack_int ldb,
+                           const float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dtrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, const double* b, lapack_int ldb,
+                           const double* x, lapack_int ldx, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_ctrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_strsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n, float* t,
+                           lapack_int ldt, float* q, lapack_int ldq, float* wr,
+                           float* wi, lapack_int* m, float* s, float* sep );
+lapack_int LAPACKE_dtrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           double* t, lapack_int ldt, double* q, lapack_int ldq,
+                           double* wr, double* wi, lapack_int* m, double* s,
+                           double* sep );
+lapack_int LAPACKE_ctrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* w, lapack_int* m, float* s,
+                           float* sep );
+lapack_int LAPACKE_ztrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* w, lapack_int* m, double* s,
+                           double* sep );
+
+lapack_int LAPACKE_strsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* t, lapack_int ldt, const float* vl,
+                           lapack_int ldvl, const float* vr, lapack_int ldvr,
+                           float* s, float* sep, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_dtrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* t, lapack_int ldt, const double* vl,
+                           lapack_int ldvl, const double* vr, lapack_int ldvr,
+                           double* s, double* sep, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_ctrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* t, lapack_int ldt,
+                           const lapack_complex_float* vl, lapack_int ldvl,
+                           const lapack_complex_float* vr, lapack_int ldvr,
+                           float* s, float* sep, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* t, lapack_int ldt,
+                           const lapack_complex_double* vl, lapack_int ldvl,
+                           const lapack_complex_double* vr, lapack_int ldvr,
+                           double* s, double* sep, lapack_int mm,
+                           lapack_int* m );
+
+lapack_int LAPACKE_strsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, const float* b,
+                           lapack_int ldb, float* c, lapack_int ldc,
+                           float* scale );
+lapack_int LAPACKE_dtrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, const double* b,
+                           lapack_int ldb, double* c, lapack_int ldc,
+                           double* scale );
+lapack_int LAPACKE_ctrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* c, lapack_int ldc,
+                           float* scale );
+lapack_int LAPACKE_ztrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* c, lapack_int ldc,
+                           double* scale );
+
+lapack_int LAPACKE_strtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           float* a, lapack_int lda );
+lapack_int LAPACKE_dtrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           double* a, lapack_int lda );
+lapack_int LAPACKE_ctrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_strtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_strttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* arf );
+lapack_int LAPACKE_dtrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* arf );
+lapack_int LAPACKE_ctrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* arf );
+lapack_int LAPACKE_ztrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* arf );
+
+lapack_int LAPACKE_strttp( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda, float* ap );
+lapack_int LAPACKE_dtrttp( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda, double* ap );
+lapack_int LAPACKE_ctrttp( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztrttp( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* ap );
+
+lapack_int LAPACKE_stzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dtzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_ctzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_ztzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cunghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zunghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cunglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zunglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cungtr( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau );
+lapack_int LAPACKE_zungtr( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau );
+
+lapack_int LAPACKE_cunmbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cunmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_cupgtr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zupgtr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* q, lapack_int ldq );
+
+lapack_int LAPACKE_cupmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zupmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sbdsdc_work( int matrix_order, char uplo, char compq,
+                                lapack_int n, float* d, float* e, float* u,
+                                lapack_int ldu, float* vt, lapack_int ldvt,
+                                float* q, lapack_int* iq, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dbdsdc_work( int matrix_order, char uplo, char compq,
+                                lapack_int n, double* d, double* e, double* u,
+                                lapack_int ldu, double* vt, lapack_int ldvt,
+                                double* q, lapack_int* iq, double* work,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                float* d, float* e, float* vt, lapack_int ldvt,
+                                float* u, lapack_int ldu, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                double* d, double* e, double* vt,
+                                lapack_int ldvt, double* u, lapack_int ldu,
+                                double* c, lapack_int ldc, double* work );
+lapack_int LAPACKE_cbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                float* d, float* e, lapack_complex_float* vt,
+                                lapack_int ldvt, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_zbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                double* d, double* e, lapack_complex_double* vt,
+                                lapack_int ldvt, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* c,
+                                lapack_int ldc, double* work );
+
+lapack_int LAPACKE_sdisna_work( char job, lapack_int m, lapack_int n,
+                                const float* d, float* sep );
+lapack_int LAPACKE_ddisna_work( char job, lapack_int m, lapack_int n,
+                                const double* d, double* sep );
+
+lapack_int LAPACKE_sgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, float* ab, lapack_int ldab,
+                                float* d, float* e, float* q, lapack_int ldq,
+                                float* pt, lapack_int ldpt, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, double* ab, lapack_int ldab,
+                                double* d, double* e, double* q, lapack_int ldq,
+                                double* pt, lapack_int ldpt, double* c,
+                                lapack_int ldc, double* work );
+lapack_int LAPACKE_cgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, lapack_complex_float* ab,
+                                lapack_int ldab, float* d, float* e,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* pt, lapack_int ldpt,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, lapack_complex_double* ab,
+                                lapack_int ldab, double* d, double* e,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* pt, lapack_int ldpt,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* ab,
+                                lapack_int ldab, float* r, float* c,
+                                float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* ab,
+                                lapack_int ldab, double* r, double* c,
+                                double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax );
+lapack_int LAPACKE_zgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, double* r, double* c,
+                                double* rowcnd, double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku, const float* ab,
+                                 lapack_int ldab, float* r, float* c,
+                                 float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku, const double* ab,
+                                 lapack_int ldab, double* r, double* c,
+                                 double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku,
+                                 const lapack_complex_float* ab,
+                                 lapack_int ldab, float* r, float* c,
+                                 float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_zgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku,
+                                 const lapack_complex_double* ab,
+                                 lapack_int ldab, double* r, double* c,
+                                 double* rowcnd, double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const float* ab, lapack_int ldab,
+                                const float* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const double* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb,
+                                lapack_int ldafb, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab,
+                                const lapack_complex_double* afb,
+                                lapack_int ldafb, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, const float* ab,
+                                 lapack_int ldab, const float* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const float* r, const float* c, const float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, const double* ab,
+                                 lapack_int ldab, const double* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs,
+                                 const lapack_complex_float* ab,
+                                 lapack_int ldab,
+                                 const lapack_complex_float* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const float* r, const float* c,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs,
+                                 const lapack_complex_double* ab,
+                                 lapack_int ldab,
+                                 const lapack_complex_double* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs, float* ab,
+                               lapack_int ldab, lapack_int* ipiv, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs, double* ab,
+                               lapack_int ldab, lapack_int* ipiv, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_sgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, float* ab, lapack_int ldab,
+                                float* afb, lapack_int ldafb, lapack_int* ipiv,
+                                char* equed, float* r, float* c, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, double* ab, lapack_int ldab,
+                                double* afb, lapack_int ldafb, lapack_int* ipiv,
+                                char* equed, double* r, double* c, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed,
+                                float* r, float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed,
+                                double* r, double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_sgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, float* ab, lapack_int ldab,
+                                 float* afb, lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, float* r, float* c, float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, double* ab, lapack_int ldab,
+                                 double* afb, lapack_int ldafb,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, double* b, lapack_int ldb,
+                                 double* x, lapack_int ldx, double* rcond,
+                                 double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, lapack_complex_float* ab,
+                                 lapack_int ldab, lapack_complex_float* afb,
+                                 lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, float* r, float* c,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, lapack_complex_double* ab,
+                                 lapack_int ldab, lapack_complex_double* afb,
+                                 lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, double* r, double* c,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, float* ab,
+                                lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_dgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, double* ab,
+                                lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_cgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_int* ipiv );
+
+lapack_int LAPACKE_sgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const lapack_int* ipiv, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* scale, lapack_int m, float* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_dgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* scale, lapack_int m, double* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_cgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* scale, lapack_int m,
+                                lapack_complex_float* v, lapack_int ldv );
+lapack_int LAPACKE_zgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* scale, lapack_int m,
+                                lapack_complex_double* v, lapack_int ldv );
+
+lapack_int LAPACKE_sgebal_work( int matrix_order, char job, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ilo,
+                                lapack_int* ihi, float* scale );
+lapack_int LAPACKE_dgebal_work( int matrix_order, char job, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ilo,
+                                lapack_int* ihi, double* scale );
+lapack_int LAPACKE_cgebal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ilo, lapack_int* ihi,
+                                float* scale );
+lapack_int LAPACKE_zgebal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* scale );
+
+lapack_int LAPACKE_sgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* d, float* e,
+                                float* tauq, float* taup, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* d, double* e,
+                                double* tauq, double* taup, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* d, float* e, lapack_complex_float* tauq,
+                                lapack_complex_float* taup,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* d, double* e,
+                                lapack_complex_double* tauq,
+                                lapack_complex_double* taup,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgecon_work( int matrix_order, char norm, lapack_int n,
+                                const float* a, lapack_int lda, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgecon_work( int matrix_order, char norm, lapack_int n,
+                                const double* a, lapack_int lda, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgecon_work( int matrix_order, char norm, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgecon_work( int matrix_order, char norm, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda, float* r,
+                                float* c, float* rowcnd, float* colcnd,
+                                float* amax );
+lapack_int LAPACKE_dgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, double* r,
+                                double* c, double* rowcnd, double* colcnd,
+                                double* amax );
+lapack_int LAPACKE_cgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax );
+lapack_int LAPACKE_zgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* r, double* c, double* rowcnd,
+                                double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const float* a, lapack_int lda, float* r,
+                                 float* c, float* rowcnd, float* colcnd,
+                                 float* amax );
+lapack_int LAPACKE_dgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const double* a, lapack_int lda, double* r,
+                                 double* c, double* rowcnd, double* colcnd,
+                                 double* amax );
+lapack_int LAPACKE_cgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* r, float* c, float* rowcnd,
+                                 float* colcnd, float* amax );
+lapack_int LAPACKE_zgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* r, double* c, double* rowcnd,
+                                 double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                               lapack_int lda, lapack_int* sdim, float* wr,
+                               float* wi, float* vs, lapack_int ldvs,
+                               float* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_dgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                               lapack_int lda, lapack_int* sdim, double* wr,
+                               double* wi, double* vs, lapack_int ldvs,
+                               double* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_cgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_C_SELECT1 select, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_int* sdim, lapack_complex_float* w,
+                               lapack_complex_float* vs, lapack_int ldvs,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_Z_SELECT1 select, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_int* sdim, lapack_complex_double* w,
+                               lapack_complex_double* vs, lapack_int ldvs,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_S_SELECT2 select, char sense,
+                                lapack_int n, float* a, lapack_int lda,
+                                lapack_int* sdim, float* wr, float* wi,
+                                float* vs, lapack_int ldvs, float* rconde,
+                                float* rcondv, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_dgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_D_SELECT2 select, char sense,
+                                lapack_int n, double* a, lapack_int lda,
+                                lapack_int* sdim, double* wr, double* wi,
+                                double* vs, lapack_int ldvs, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_cgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_C_SELECT1 select, char sense,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* sdim,
+                                lapack_complex_float* w,
+                                lapack_complex_float* vs, lapack_int ldvs,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_Z_SELECT1 select, char sense,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* sdim,
+                                lapack_complex_double* w,
+                                lapack_complex_double* vs, lapack_int ldvs,
+                                double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, float* a, lapack_int lda,
+                               float* wr, float* wi, float* vl, lapack_int ldvl,
+                               float* vr, lapack_int ldvr, float* work,
+                               lapack_int lwork );
+lapack_int LAPACKE_dgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* wr, double* wi, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* w,
+                               lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* w,
+                               lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+
+lapack_int LAPACKE_sgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, float* a,
+                                lapack_int lda, float* wr, float* wi, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* scale,
+                                float* abnrm, float* rconde, float* rcondv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, double* a,
+                                lapack_int lda, double* wr, double* wi,
+                                double* vl, lapack_int ldvl, double* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, double* scale, double* abnrm,
+                                double* rconde, double* rcondv, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* w,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* scale,
+                                float* abnrm, float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* w,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, double* scale,
+                                double* abnrm, double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_sgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* a, lapack_int lda,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* a, lapack_int lda,
+                                double* tau, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgejsv_work( int matrix_order, char joba, char jobu,
+                                char jobv, char jobr, char jobt, char jobp,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* sva, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgejsv_work( int matrix_order, char joba, char jobu,
+                                char jobv, char jobr, char jobt, char jobp,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* sva, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work );
+lapack_int LAPACKE_dgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work );
+lapack_int LAPACKE_cgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_sgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs, float* a,
+                               lapack_int lda, float* b, lapack_int ldb,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs, double* a,
+                               lapack_int lda, double* b, lapack_int ldb,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* s,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_zgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double* s, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* s,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double* s, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_sgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, lapack_int* jpvt,
+                                float rcond, lapack_int* rank, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, lapack_int* jpvt,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_int* jpvt, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_int* jpvt, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_sgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* jpvt,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* jpvt,
+                                double* tau, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_sgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* jpvt,
+                                float* tau, float* work );
+lapack_int LAPACKE_dgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* jpvt,
+                                double* tau, double* work );
+lapack_int LAPACKE_cgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_float* tau,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_double* tau,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work );
+lapack_int LAPACKE_dgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work );
+lapack_int LAPACKE_cgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_sgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* tau,
+                                 float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* tau,
+                                 double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* tau,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* tau,
+                                 lapack_complex_double* work,
+                                 lapack_int lwork );
+
+lapack_int LAPACKE_sgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* r, const float* c, const float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* r, const float* c,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* s, float* u, lapack_int ldu, float* vt,
+                                lapack_int ldvt, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* s, double* u, lapack_int ldu,
+                                double* vt, lapack_int ldvt, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, float* s,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* vt, lapack_int ldvt,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork );
+lapack_int LAPACKE_zgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, double* s,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* vt, lapack_int ldvt,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork );
+
+lapack_int LAPACKE_sgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* a, lapack_int lda, lapack_int* ipiv,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* a, lapack_int lda, lapack_int* ipiv,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dsgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                double* a, lapack_int lda, lapack_int* ipiv,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* work, float* swork,
+                                lapack_int* iter );
+lapack_int LAPACKE_zcgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                lapack_complex_float* swork, double* rwork,
+                                lapack_int* iter );
+
+lapack_int LAPACKE_sgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* s, float* u,
+                                lapack_int ldu, float* vt, lapack_int ldvt,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* s, double* u,
+                                lapack_int ldu, double* vt, lapack_int ldvt,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* s, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* vt,
+                                lapack_int ldvt, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* s, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* vt,
+                                lapack_int ldvt, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_sgesvj_work( int matrix_order, char joba, char jobu,
+                                char jobv, lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* sva, lapack_int mv,
+                                float* v, lapack_int ldv, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgesvj_work( int matrix_order, char joba, char jobu,
+                                char jobv, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* sva,
+                                lapack_int mv, double* v, lapack_int ldv,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r,
+                                float* c, float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r,
+                                double* c, double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r,
+                                float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r,
+                                double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_sgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* r,
+                                 float* c, float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, double* b, lapack_int ldb,
+                                 double* x, lapack_int ldx, double* rcond,
+                                 double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* r,
+                                 float* c, lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params,
+                                 lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, lapack_complex_double* b,
+                                 lapack_int ldb, lapack_complex_double* x,
+                                 lapack_int ldx, double* rcond, double* rpvgrw,
+                                 double* berr, lapack_int n_err_bnds,
+                                 double* err_bnds_norm, double* err_bnds_comp,
+                                 lapack_int nparams, double* params,
+                                 lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv );
+
+lapack_int LAPACKE_sgetri_work( int matrix_order, lapack_int n, float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgetri_work( int matrix_order, lapack_int n, double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgetri_work( int matrix_order, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgetri_work( int matrix_order, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* lscale, const float* rscale,
+                                lapack_int m, float* v, lapack_int ldv );
+lapack_int LAPACKE_dggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* lscale, const double* rscale,
+                                lapack_int m, double* v, lapack_int ldv );
+lapack_int LAPACKE_cggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* lscale, const float* rscale,
+                                lapack_int m, lapack_complex_float* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_zggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* lscale, const double* rscale,
+                                lapack_int m, lapack_complex_double* v,
+                                lapack_int ldv );
+
+lapack_int LAPACKE_sggbal_work( int matrix_order, char job, lapack_int n,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, lapack_int* ilo,
+                                lapack_int* ihi, float* lscale, float* rscale,
+                                float* work );
+lapack_int LAPACKE_dggbal_work( int matrix_order, char job, lapack_int n,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, lapack_int* ilo,
+                                lapack_int* ihi, double* lscale, double* rscale,
+                                double* work );
+lapack_int LAPACKE_cggbal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_int* ilo, lapack_int* ihi, float* lscale,
+                                float* rscale, float* work );
+lapack_int LAPACKE_zggbal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* lscale, double* rscale, double* work );
+
+lapack_int LAPACKE_sgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_S_SELECT3 selctg, lapack_int n,
+                               float* a, lapack_int lda, float* b,
+                               lapack_int ldb, lapack_int* sdim, float* alphar,
+                               float* alphai, float* beta, float* vsl,
+                               lapack_int ldvsl, float* vsr, lapack_int ldvsr,
+                               float* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_dgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_D_SELECT3 selctg, lapack_int n,
+                               double* a, lapack_int lda, double* b,
+                               lapack_int ldb, lapack_int* sdim, double* alphar,
+                               double* alphai, double* beta, double* vsl,
+                               lapack_int ldvsl, double* vsr, lapack_int ldvsr,
+                               double* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_cgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_C_SELECT2 selctg, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_int* sdim, lapack_complex_float* alpha,
+                               lapack_complex_float* beta,
+                               lapack_complex_float* vsl, lapack_int ldvsl,
+                               lapack_complex_float* vsr, lapack_int ldvsr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_Z_SELECT2 selctg, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_int* sdim, lapack_complex_double* alpha,
+                               lapack_complex_double* beta,
+                               lapack_complex_double* vsl, lapack_int ldvsl,
+                               lapack_complex_double* vsr, lapack_int ldvsr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_S_SELECT3 selctg, char sense,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, lapack_int* sdim,
+                                float* alphar, float* alphai, float* beta,
+                                float* vsl, lapack_int ldvsl, float* vsr,
+                                lapack_int ldvsr, float* rconde, float* rcondv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_dggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_D_SELECT3 selctg, char sense,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, lapack_int* sdim,
+                                double* alphar, double* alphai, double* beta,
+                                double* vsl, lapack_int ldvsl, double* vsr,
+                                lapack_int ldvsr, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_cggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_C_SELECT2 selctg, char sense,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_int* sdim,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* vsl, lapack_int ldvsl,
+                                lapack_complex_float* vsr, lapack_int ldvsr,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork,
+                                lapack_int liwork, lapack_logical* bwork );
+lapack_int LAPACKE_zggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_int* sdim,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* vsl, lapack_int ldvsl,
+                                lapack_complex_double* vsr, lapack_int ldvsr,
+                                double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork,
+                                lapack_int liwork, lapack_logical* bwork );
+
+lapack_int LAPACKE_sggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, float* alphar, float* alphai,
+                               float* beta, float* vl, lapack_int ldvl,
+                               float* vr, lapack_int ldvr, float* work,
+                               lapack_int lwork );
+lapack_int LAPACKE_dggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* alphar,
+                               double* alphai, double* beta, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* alpha,
+                               lapack_complex_float* beta,
+                               lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* alpha,
+                               lapack_complex_double* beta,
+                               lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+
+lapack_int LAPACKE_sggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* alphar, float* alphai, float* beta,
+                                float* vl, lapack_int ldvl, float* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, float* lscale, float* rscale,
+                                float* abnrm, float* bbnrm, float* rconde,
+                                float* rcondv, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_logical* bwork );
+lapack_int LAPACKE_dggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* alphar, double* alphai, double* beta,
+                                double* vl, lapack_int ldvl, double* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, double* lscale, double* rscale,
+                                double* abnrm, double* bbnrm, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_logical* bwork );
+lapack_int LAPACKE_cggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* lscale,
+                                float* rscale, float* abnrm, float* bbnrm,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_zggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* lscale, double* rscale, double* abnrm,
+                                double* bbnrm, double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork,
+                                lapack_logical* bwork );
+
+lapack_int LAPACKE_sggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* d, float* x,
+                                float* y, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* d, double* x,
+                                double* y, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* d,
+                                lapack_complex_float* x,
+                                lapack_complex_float* y,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* d,
+                                lapack_complex_double* x,
+                                lapack_complex_double* y,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* q, lapack_int ldq,
+                                float* z, lapack_int ldz );
+lapack_int LAPACKE_dgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* q, lapack_int ldq,
+                                double* z, lapack_int ldz );
+lapack_int LAPACKE_cgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_sgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* c, float* d,
+                                float* x, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* c, double* d,
+                                double* x, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* c,
+                                lapack_complex_float* d,
+                                lapack_complex_float* x,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* c,
+                                lapack_complex_double* d,
+                                lapack_complex_double* x,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* taua, float* b, lapack_int ldb,
+                                float* taub, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* taua, double* b, lapack_int ldb,
+                                double* taub, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* taua,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* taub,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* taua,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* taub,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* taua, float* b, lapack_int ldb,
+                                float* taub, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* taua, double* b, lapack_int ldb,
+                                double* taub, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* taua,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* taub,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* taua,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* taub,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* alpha, float* beta,
+                                float* u, lapack_int ldu, float* v,
+                                lapack_int ldv, float* q, lapack_int ldq,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* alpha, double* beta,
+                                double* u, lapack_int ldu, double* v,
+                                lapack_int ldv, double* q, lapack_int ldq,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float* alpha, float* beta,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* v, lapack_int ldv,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_zggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double* alpha, double* beta,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* v, lapack_int ldv,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_sggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float tola,
+                                float tolb, lapack_int* k, lapack_int* l,
+                                float* u, lapack_int ldu, float* v,
+                                lapack_int ldv, float* q, lapack_int ldq,
+                                lapack_int* iwork, float* tau, float* work );
+lapack_int LAPACKE_dggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double tola,
+                                double tolb, lapack_int* k, lapack_int* l,
+                                double* u, lapack_int ldu, double* v,
+                                lapack_int ldv, double* q, lapack_int ldq,
+                                lapack_int* iwork, double* tau, double* work );
+lapack_int LAPACKE_cggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float tola, float tolb,
+                                lapack_int* k, lapack_int* l,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* v, lapack_int ldv,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_int* iwork, float* rwork,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double tola, double tolb,
+                                lapack_int* k, lapack_int* l,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* v, lapack_int ldv,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_int* iwork, double* rwork,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_sgtcon_work( char norm, lapack_int n, const float* dl,
+                                const float* d, const float* du,
+                                const float* du2, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgtcon_work( char norm, lapack_int n, const double* dl,
+                                const double* d, const double* du,
+                                const double* du2, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgtcon_work( char norm, lapack_int n,
+                                const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zgtcon_work( char norm, lapack_int n,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_sgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* dl,
+                                const float* d, const float* du,
+                                const float* dlf, const float* df,
+                                const float* duf, const float* du2,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* dl,
+                                const double* d, const double* du,
+                                const double* dlf, const double* df,
+                                const double* duf, const double* du2,
+                                const lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* dlf,
+                                const lapack_complex_float* df,
+                                const lapack_complex_float* duf,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* dlf,
+                                const lapack_complex_double* df,
+                                const lapack_complex_double* duf,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* dl, float* d, float* du, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* dl, double* d, double* du, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* dl,
+                               lapack_complex_float* d,
+                               lapack_complex_float* du,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* dl,
+                               lapack_complex_double* d,
+                               lapack_complex_double* du,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, const float* dl,
+                                const float* d, const float* du, float* dlf,
+                                float* df, float* duf, float* du2,
+                                lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, const double* dl,
+                                const double* d, const double* du, double* dlf,
+                                double* df, double* duf, double* du2,
+                                lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                lapack_complex_float* dlf,
+                                lapack_complex_float* df,
+                                lapack_complex_float* duf,
+                                lapack_complex_float* du2, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                lapack_complex_double* dlf,
+                                lapack_complex_double* df,
+                                lapack_complex_double* duf,
+                                lapack_complex_double* du2, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sgttrf_work( lapack_int n, float* dl, float* d, float* du,
+                                float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_dgttrf_work( lapack_int n, double* dl, double* d, double* du,
+                                double* du2, lapack_int* ipiv );
+lapack_int LAPACKE_cgttrf_work( lapack_int n, lapack_complex_float* dl,
+                                lapack_complex_float* d,
+                                lapack_complex_float* du,
+                                lapack_complex_float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_zgttrf_work( lapack_int n, lapack_complex_double* dl,
+                                lapack_complex_double* d,
+                                lapack_complex_double* du,
+                                lapack_complex_double* du2, lapack_int* ipiv );
+
+lapack_int LAPACKE_sgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* dl,
+                                const float* d, const float* du,
+                                const float* du2, const lapack_int* ipiv,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* dl,
+                                const double* d, const double* du,
+                                const double* du2, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work,
+                               float* rwork );
+lapack_int LAPACKE_zhbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+
+lapack_int LAPACKE_chbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* q, lapack_int ldq,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                float* rwork, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_zhbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* q, lapack_int ldq,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                double* rwork, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_chbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* bb, lapack_int ldbb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                const lapack_complex_double* bb,
+                                lapack_int ldbb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_chbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* bb, lapack_int ldbb,
+                               float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work,
+                               float* rwork );
+lapack_int LAPACKE_zhbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* bb, lapack_int ldbb,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+
+lapack_int LAPACKE_chbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* bb, lapack_int ldbb,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* bb, lapack_int ldbb,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* bb,
+                                lapack_int ldbb, lapack_complex_float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* bb,
+                                lapack_int ldbb, lapack_complex_double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                float* d, float* e, lapack_complex_float* q,
+                                lapack_int ldq, lapack_complex_float* work );
+lapack_int LAPACKE_zhbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                double* d, double* e, lapack_complex_double* q,
+                                lapack_int ldq, lapack_complex_double* work );
+
+lapack_int LAPACKE_checon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zhecon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_cheequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_zheequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax,
+                                 lapack_complex_double* work );
+
+lapack_int LAPACKE_cheev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, float* w,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zheev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, double* w,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+
+lapack_int LAPACKE_cheevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, float* w,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zheevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, double* w,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_cheevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zheevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_cheevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zheevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chegst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zhegst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_chegv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, float* w,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zhegv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               double* w, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork );
+
+lapack_int LAPACKE_chegvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float* w, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhegvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double* w, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chegvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhegvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_cherfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zherfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_cherfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zherfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_chesv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhesv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_chesvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zhesvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_chesvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zhesvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_chetrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* d, float* e, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhetrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* d, double* e,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_chetrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_zhetrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_chetri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zhetri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_chetrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               float alpha, const lapack_complex_float* a,
+                               lapack_int lda, float beta,
+                               lapack_complex_float* c );
+lapack_int LAPACKE_zhfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               double alpha, const lapack_complex_double* a,
+                               lapack_int lda, double beta,
+                               lapack_complex_double* c );
+
+lapack_int LAPACKE_shgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* h, lapack_int ldh,
+                                float* t, lapack_int ldt, float* alphar,
+                                float* alphai, float* beta, float* q,
+                                lapack_int ldq, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dhgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* h, lapack_int ldh,
+                                double* t, lapack_int ldt, double* alphar,
+                                double* alphai, double* beta, double* q,
+                                lapack_int ldq, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_chgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* h,
+                                lapack_int ldh, lapack_complex_float* t,
+                                lapack_int ldt, lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zhgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* h,
+                                lapack_int ldh, lapack_complex_double* t,
+                                lapack_int ldt, lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_chpcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zhpcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_chpev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_float* ap, float* w,
+                               lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_double* ap,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+
+lapack_int LAPACKE_chpevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_float* ap,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhpevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_double* ap,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chpevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* ap, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhpevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* ap, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chpgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_float* ap,
+                                const lapack_complex_float* bp );
+lapack_int LAPACKE_zhpgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_double* ap,
+                                const lapack_complex_double* bp );
+
+lapack_int LAPACKE_chpgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_float* ap,
+                               lapack_complex_float* bp, float* w,
+                               lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_double* ap,
+                               lapack_complex_double* bp, double* w,
+                               lapack_complex_double* z, lapack_int ldz,
+                               lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chpgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* bp, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zhpgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* bp, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_chpgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* bp, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhpgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* bp, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chpsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zhpsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_chpsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* afp, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* afp, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chptrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, float* d, float* e,
+                                lapack_complex_float* tau );
+lapack_int LAPACKE_zhptrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, double* d, double* e,
+                                lapack_complex_double* tau );
+
+lapack_int LAPACKE_chptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zhptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_chptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zhptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_chptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zhptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_shsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, lapack_logical* select,
+                                lapack_int n, const float* h, lapack_int ldh,
+                                float* wr, const float* wi, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_dhsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, lapack_logical* select,
+                                lapack_int n, const double* h, lapack_int ldh,
+                                double* wr, const double* wi, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_chsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, const lapack_logical* select,
+                                lapack_int n, const lapack_complex_float* h,
+                                lapack_int ldh, lapack_complex_float* w,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_zhsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, const lapack_logical* select,
+                                lapack_int n, const lapack_complex_double* h,
+                                lapack_int ldh, lapack_complex_double* w,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* ifaill, lapack_int* ifailr );
+
+lapack_int LAPACKE_shseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                float* h, lapack_int ldh, float* wr, float* wi,
+                                float* z, lapack_int ldz, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dhseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                double* h, lapack_int ldh, double* wr,
+                                double* wi, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_chseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_float* h, lapack_int ldh,
+                                lapack_complex_float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_double* h, lapack_int ldh,
+                                lapack_complex_double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_clacgv_work( lapack_int n, lapack_complex_float* x,
+                                lapack_int incx );
+lapack_int LAPACKE_zlacgv_work( lapack_int n, lapack_complex_double* x,
+                                lapack_int incx );
+
+lapack_int LAPACKE_slacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dlacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_clacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zlacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_zlag2c_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_float* sa, lapack_int ldsa );
+
+lapack_int LAPACKE_slag2d_work( int matrix_order, lapack_int m, lapack_int n,
+                                const float* sa, lapack_int ldsa, double* a,
+                                lapack_int lda );
+
+lapack_int LAPACKE_dlag2s_work( int matrix_order, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, float* sa,
+                                lapack_int ldsa );
+
+lapack_int LAPACKE_clag2z_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_float* sa, lapack_int ldsa,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* d,
+                                float* a, lapack_int lda, lapack_int* iseed,
+                                float* work );
+lapack_int LAPACKE_dlagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* d,
+                                double* a, lapack_int lda, lapack_int* iseed,
+                                double* work );
+lapack_int LAPACKE_clagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* d,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* iseed, lapack_complex_float* work );
+lapack_int LAPACKE_zlagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* d,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* iseed,
+                                lapack_complex_double* work );
+                                
+lapack_int LAPACKE_claghe_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlaghe_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, float* a, lapack_int lda,
+                                lapack_int* iseed, float* work );
+lapack_int LAPACKE_dlagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, double* a, lapack_int lda,
+                                lapack_int* iseed, double* work );
+lapack_int LAPACKE_clagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n, float* x,
+                                lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_dlapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n, double* x,
+                                lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_clapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_int* k );
+lapack_int LAPACKE_zlapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* x, lapack_int ldx,
+                                lapack_int* k );
+
+lapack_int LAPACKE_slartgp_work( float f, float g, float* cs, float* sn,
+                                 float* r );
+lapack_int LAPACKE_dlartgp_work( double f, double g, double* cs, double* sn,
+                                 double* r );
+
+lapack_int LAPACKE_slartgs_work( float x, float y, float sigma, float* cs,
+                                 float* sn );
+lapack_int LAPACKE_dlartgs_work( double x, double y, double sigma, double* cs,
+                                 double* sn );
+                                
+float LAPACKE_slapy2_work( float x, float y );
+double LAPACKE_dlapy2_work( double x, double y );
+
+float LAPACKE_slapy3_work( float x, float y, float z );
+double LAPACKE_dlapy3_work( double x, double y, double z );
+
+float LAPACKE_slamch_work( char cmach );
+double LAPACKE_dlamch_work( char cmach );
+
+float LAPACKE_slange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* work );
+double LAPACKE_dlange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* work );
+float LAPACKE_clange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_zlange_work( int matrix_order, char norm, lapack_int m,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* work );
+
+float LAPACKE_clanhe_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_zlanhe_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* work );
+
+float LAPACKE_slansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* work );
+double LAPACKE_dlansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* work );
+float LAPACKE_clansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_zlansy_work( int matrix_order, char norm, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* work );
+
+float LAPACKE_slantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n, const float* a,
+                                lapack_int lda, float* work );
+double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, double* work );
+float LAPACKE_clantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* work );
+double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* work );
+
+lapack_int LAPACKE_slarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, const float* v,
+                                lapack_int ldv, const float* t, lapack_int ldt,
+                                float* c, lapack_int ldc, float* work,
+                                lapack_int ldwork );
+lapack_int LAPACKE_dlarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, const double* v,
+                                lapack_int ldv, const double* t, lapack_int ldt,
+                                double* c, lapack_int ldc, double* work,
+                                lapack_int ldwork );
+lapack_int LAPACKE_clarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int ldwork );
+lapack_int LAPACKE_zlarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work,
+                                lapack_int ldwork );
+
+lapack_int LAPACKE_slarfg_work( lapack_int n, float* alpha, float* x,
+                                lapack_int incx, float* tau );
+lapack_int LAPACKE_dlarfg_work( lapack_int n, double* alpha, double* x,
+                                lapack_int incx, double* tau );
+lapack_int LAPACKE_clarfg_work( lapack_int n, lapack_complex_float* alpha,
+                                lapack_complex_float* x, lapack_int incx,
+                                lapack_complex_float* tau );
+lapack_int LAPACKE_zlarfg_work( lapack_int n, lapack_complex_double* alpha,
+                                lapack_complex_double* x, lapack_int incx,
+                                lapack_complex_double* tau );
+
+lapack_int LAPACKE_slarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k, const float* v,
+                                lapack_int ldv, const float* tau, float* t,
+                                lapack_int ldt );
+lapack_int LAPACKE_dlarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k, const double* v,
+                                lapack_int ldv, const double* tau, double* t,
+                                lapack_int ldt );
+lapack_int LAPACKE_clarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zlarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_slarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const float* v, float tau,
+                                float* c, lapack_int ldc, float* work );
+lapack_int LAPACKE_dlarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const double* v, double tau,
+                                double* c, lapack_int ldc, double* work );
+lapack_int LAPACKE_clarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const lapack_complex_float* v,
+                                lapack_complex_float tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const lapack_complex_double* v,
+                                lapack_complex_double tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, float* x );
+lapack_int LAPACKE_dlarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, double* x );
+lapack_int LAPACKE_clarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, lapack_complex_float* x );
+lapack_int LAPACKE_zlarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, lapack_complex_double* x );
+
+lapack_int LAPACKE_slaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, float alpha, float beta, float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_dlaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, double alpha, double beta,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_claset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, lapack_complex_float alpha,
+                                lapack_complex_float beta,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, lapack_complex_double alpha,
+                                lapack_complex_double beta,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slasrt_work( char id, lapack_int n, float* d );
+lapack_int LAPACKE_dlasrt_work( char id, lapack_int n, double* d );
+
+lapack_int LAPACKE_slaswp_work( int matrix_order, lapack_int n, float* a,
+                                lapack_int lda, lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_dlaswp_work( int matrix_order, lapack_int n, double* a,
+                                lapack_int lda, lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_claswp_work( int matrix_order, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_zlaswp_work( int matrix_order, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+
+lapack_int LAPACKE_slatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                float* d, lapack_int mode, float cond,
+                                float dmax, lapack_int kl, lapack_int ku,
+                                char pack, float* a, lapack_int lda,
+                                float* work );
+lapack_int LAPACKE_dlatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                double* d, lapack_int mode, double cond,
+                                double dmax, lapack_int kl, lapack_int ku,
+                                char pack, double* a, lapack_int lda,
+                                double* work );
+lapack_int LAPACKE_clatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                float* d, lapack_int mode, float cond,
+                                float dmax, lapack_int kl, lapack_int ku,
+                                char pack, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* work );
+lapack_int LAPACKE_zlatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                double* d, lapack_int mode, double cond,
+                                double dmax, lapack_int kl, lapack_int ku,
+                                char pack, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* work );
+
+lapack_int LAPACKE_slauum_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dlauum_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_clauum_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlauum_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_sopgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, const float* tau, float* q,
+                                lapack_int ldq, float* work );
+lapack_int LAPACKE_dopgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, const double* tau, double* q,
+                                lapack_int ldq, double* work );
+
+lapack_int LAPACKE_sopmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const float* ap, const float* tau, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dopmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const double* ap, const double* tau, double* c,
+                                lapack_int ldc, double* work );
+
+lapack_int LAPACKE_sorgbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k, float* a,
+                                lapack_int lda, const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k, double* a,
+                                lapack_int lda, const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_sorgtr_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, const float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dorgtr_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, const double* tau,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_sormtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+
+lapack_int LAPACKE_spbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const float* ab, lapack_int ldab,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const double* ab,
+                                lapack_int ldab, double anorm, double* rcond,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_float* ab,
+                                lapack_int ldab, float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_double* ab,
+                                lapack_int ldab, double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const float* ab, lapack_int ldab,
+                                float* s, float* scond, float* amax );
+lapack_int LAPACKE_dpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const double* ab,
+                                lapack_int ldab, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_float* ab,
+                                lapack_int ldab, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_zpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_double* ab,
+                                lapack_int ldab, double* s, double* scond,
+                                double* amax );
+
+lapack_int LAPACKE_spbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs, const float* ab,
+                                lapack_int ldab, const float* afb,
+                                lapack_int ldafb, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const double* afb, lapack_int ldafb,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb,
+                                lapack_int ldafb, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab,
+                                const lapack_complex_double* afb,
+                                lapack_int ldafb,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, float* bb, lapack_int ldbb );
+lapack_int LAPACKE_dpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, double* bb, lapack_int ldbb );
+lapack_int LAPACKE_cpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, lapack_complex_float* bb,
+                                lapack_int ldbb );
+lapack_int LAPACKE_zpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, lapack_complex_double* bb,
+                                lapack_int ldbb );
+
+lapack_int LAPACKE_spbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs, float* ab,
+                               lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs, double* ab,
+                               lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                float* ab, lapack_int ldab, float* afb,
+                                lapack_int ldafb, char* equed, float* s,
+                                float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                double* ab, lapack_int ldab, double* afb,
+                                lapack_int ldafb, char* equed, double* s,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* afb, lapack_int ldafb,
+                                char* equed, float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* afb, lapack_int ldafb,
+                                char* equed, double* s,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, float* ab, lapack_int ldab );
+lapack_int LAPACKE_dpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, double* ab, lapack_int ldab );
+lapack_int LAPACKE_cpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_complex_float* ab,
+                                lapack_int ldab );
+lapack_int LAPACKE_zpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_complex_double* ab,
+                                lapack_int ldab );
+
+lapack_int LAPACKE_spbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs, const float* ab,
+                                lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const double* ab, lapack_int ldab, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_spftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, float* a );
+lapack_int LAPACKE_dpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, double* a );
+lapack_int LAPACKE_cpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, float* a );
+lapack_int LAPACKE_dpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, double* a );
+lapack_int LAPACKE_cpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_double* a );
+
+lapack_int LAPACKE_spftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* a,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* a,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spocon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spoequ_work( int matrix_order, lapack_int n, const float* a,
+                                lapack_int lda, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_dpoequ_work( int matrix_order, lapack_int n, const double* a,
+                                lapack_int lda, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cpoequ_work( int matrix_order, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequ_work( int matrix_order, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_spoequb_work( int matrix_order, lapack_int n, const float* a,
+                                 lapack_int lda, float* s, float* scond,
+                                 float* amax );
+lapack_int LAPACKE_dpoequb_work( int matrix_order, lapack_int n,
+                                 const double* a, lapack_int lda, double* s,
+                                 double* scond, double* amax );
+lapack_int LAPACKE_cpoequb_work( int matrix_order, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequb_work( int matrix_order, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax );
+
+lapack_int LAPACKE_sporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const float* s,
+                                 const float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const double* s,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const float* s,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_sposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* a, lapack_int lda,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dsposv_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* work, float* swork,
+                                lapack_int* iter );
+lapack_int LAPACKE_zcposv_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                lapack_complex_float* swork, double* rwork,
+                                lapack_int* iter );
+
+lapack_int LAPACKE_sposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                char* equed, float* s, float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* rcond,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                char* equed, float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                char* equed, double* s,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 char* equed, float* s, float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 char* equed, double* s, double* b,
+                                 lapack_int ldb, double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 char* equed, float* s, lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params,
+                                 lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_spotrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_cpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotri_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dpotri_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_cpotri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_spotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_sppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float anorm, float* rcond,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double anorm, double* rcond,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap, float anorm,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap, double anorm,
+                                double* rcond, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_sppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_dppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap, float* s,
+                                float* scond, float* amax );
+lapack_int LAPACKE_zppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap, double* s,
+                                double* scond, double* amax );
+
+lapack_int LAPACKE_spprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const float* afp, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const double* afp, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* ap, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* ap, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, float* ap,
+                                float* afp, char* equed, float* s, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, double* ap,
+                                double* afp, char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* afp, char* equed,
+                                float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* afp, char* equed,
+                                double* s, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_spptrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap );
+lapack_int LAPACKE_dpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap );
+lapack_int LAPACKE_cpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_zpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptri_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap );
+lapack_int LAPACKE_dpptri_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap );
+lapack_int LAPACKE_cpptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_zpptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_spptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_spstrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* piv,
+                                lapack_int* rank, float tol, float* work );
+lapack_int LAPACKE_dpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* piv,
+                                lapack_int* rank, double tol, double* work );
+lapack_int LAPACKE_cpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* piv, lapack_int* rank, float tol,
+                                float* work );
+lapack_int LAPACKE_zpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* piv, lapack_int* rank, double tol,
+                                double* work );
+
+lapack_int LAPACKE_sptcon_work( lapack_int n, const float* d, const float* e,
+                                float anorm, float* rcond, float* work );
+lapack_int LAPACKE_dptcon_work( lapack_int n, const double* d, const double* e,
+                                double anorm, double* rcond, double* work );
+lapack_int LAPACKE_cptcon_work( lapack_int n, const float* d,
+                                const lapack_complex_float* e, float anorm,
+                                float* rcond, float* work );
+lapack_int LAPACKE_zptcon_work( lapack_int n, const double* d,
+                                const lapack_complex_double* e, double anorm,
+                                double* rcond, double* work );
+
+lapack_int LAPACKE_spteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work );
+lapack_int LAPACKE_dpteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work );
+lapack_int LAPACKE_cpteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, float* work );
+lapack_int LAPACKE_zpteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, double* work );
+
+lapack_int LAPACKE_sptrfs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const float* d, const float* e, const float* df,
+                                const float* ef, const float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work );
+lapack_int LAPACKE_dptrfs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const double* d, const double* e,
+                                const double* df, const double* ef,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work );
+lapack_int LAPACKE_cptrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e, const float* df,
+                                const lapack_complex_float* ef,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zptrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e,
+                                const double* df,
+                                const lapack_complex_double* ef,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* d, float* e, float* b, lapack_int ldb );
+lapack_int LAPACKE_dptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* d, double* e, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* d, lapack_complex_float* e,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* d, lapack_complex_double* e,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const float* d, const float* e,
+                                float* df, float* ef, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work );
+lapack_int LAPACKE_dptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const double* e, double* df, double* ef,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work );
+lapack_int LAPACKE_cptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e, float* df,
+                                lapack_complex_float* ef,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e, double* df,
+                                lapack_complex_double* ef,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_spttrf_work( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dpttrf_work( lapack_int n, double* d, double* e );
+lapack_int LAPACKE_cpttrf_work( lapack_int n, float* d,
+                                lapack_complex_float* e );
+lapack_int LAPACKE_zpttrf_work( lapack_int n, double* d,
+                                lapack_complex_double* e );
+
+lapack_int LAPACKE_spttrs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const float* d, const float* e, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dpttrs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const double* d, const double* e, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpttrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpttrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd, float* ab,
+                               lapack_int ldab, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dsbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd, double* ab,
+                               lapack_int ldab, double* w, double* z,
+                               lapack_int ldz, double* work );
+
+lapack_int LAPACKE_ssbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd, float* ab,
+                                lapack_int ldab, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd, double* ab,
+                                lapack_int ldab, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                float* ab, lapack_int ldab, float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                double* ab, lapack_int ldab, double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_ssbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                float* ab, lapack_int ldab, const float* bb,
+                                lapack_int ldbb, float* x, lapack_int ldx,
+                                float* work );
+lapack_int LAPACKE_dsbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                double* ab, lapack_int ldab, const double* bb,
+                                lapack_int ldbb, double* x, lapack_int ldx,
+                                double* work );
+
+lapack_int LAPACKE_ssbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               float* ab, lapack_int ldab, float* bb,
+                               lapack_int ldbb, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dsbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               double* ab, lapack_int ldab, double* bb,
+                               lapack_int ldbb, double* w, double* z,
+                               lapack_int ldz, double* work );
+
+lapack_int LAPACKE_ssbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                float* ab, lapack_int ldab, float* bb,
+                                lapack_int ldbb, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                double* ab, lapack_int ldab, double* bb,
+                                lapack_int ldbb, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, float* ab, lapack_int ldab,
+                                float* bb, lapack_int ldbb, float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, double* ab, lapack_int ldab,
+                                double* bb, lapack_int ldbb, double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_ssbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd, float* ab,
+                                lapack_int ldab, float* d, float* e, float* q,
+                                lapack_int ldq, float* work );
+lapack_int LAPACKE_dsbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd, double* ab,
+                                lapack_int ldab, double* d, double* e,
+                                double* q, lapack_int ldq, double* work );
+
+lapack_int LAPACKE_ssfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               float alpha, const float* a, lapack_int lda,
+                               float beta, float* c );
+lapack_int LAPACKE_dsfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               double alpha, const double* a, lapack_int lda,
+                               double beta, double* c );
+
+lapack_int LAPACKE_sspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_sspev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, float* ap, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dspev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, double* ap, double* w, double* z,
+                               lapack_int ldz, double* work );
+
+lapack_int LAPACKE_sspevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, float* ap, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dspevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, double* ap, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sspevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* ap, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dspevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* ap, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, double* work,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_sspgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, float* ap, const float* bp );
+lapack_int LAPACKE_dspgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, double* ap, const double* bp );
+
+lapack_int LAPACKE_sspgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, float* ap, float* bp,
+                               float* w, float* z, lapack_int ldz,
+                               float* work );
+lapack_int LAPACKE_dspgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, double* ap, double* bp,
+                               double* w, double* z, lapack_int ldz,
+                               double* work );
+
+lapack_int LAPACKE_sspgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, float* ap, float* bp,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dspgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, double* ap, double* bp,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sspgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, float* ap,
+                                float* bp, float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_dspgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, double* ap,
+                                double* bp, double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+
+lapack_int LAPACKE_ssprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const float* afp, const lapack_int* ipiv,
+                                const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dsprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const double* afp, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_csprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zsprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_sspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* ap, lapack_int* ipiv,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* ap, lapack_int* ipiv,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_sspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* ap,
+                                float* afp, lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* ap,
+                                double* afp, lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* afp, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* afp, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_ssptrd_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, float* d, float* e, float* tau );
+lapack_int LAPACKE_dsptrd_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, double* d, double* e, double* tau );
+
+lapack_int LAPACKE_ssptrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_dsptrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, lapack_int* ipiv );
+lapack_int LAPACKE_csptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zsptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_ssptri_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, const lapack_int* ipiv,
+                                float* work );
+lapack_int LAPACKE_dsptri_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, const lapack_int* ipiv,
+                                double* work );
+lapack_int LAPACKE_csptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zsptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_ssptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dsptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const lapack_int* ipiv, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_csptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zsptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_sstebz_work( char range, char order, lapack_int n, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, const float* d, const float* e,
+                                lapack_int* m, lapack_int* nsplit, float* w,
+                                lapack_int* iblock, lapack_int* isplit,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dstebz_work( char range, char order, lapack_int n, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, const double* d, const double* e,
+                                lapack_int* m, lapack_int* nsplit, double* w,
+                                lapack_int* iblock, lapack_int* isplit,
+                                double* work, lapack_int* iwork );
+
+lapack_int LAPACKE_sstedc_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstedc_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstedc_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstedc_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_sstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int* isuppz, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_sstein_work( int matrix_order, lapack_int n, const float* d,
+                                const float* e, lapack_int m, const float* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_dstein_work( int matrix_order, lapack_int n, const double* d,
+                                const double* e, lapack_int m, const double* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_cstein_work( int matrix_order, lapack_int n, const float* d,
+                                const float* e, lapack_int m, const float* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit,
+                                lapack_complex_float* z, lapack_int ldz,
+                                float* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_zstein_work( int matrix_order, lapack_int n, const double* d,
+                                const double* e, lapack_int m, const double* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit,
+                                lapack_complex_double* z, lapack_int ldz,
+                                double* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+
+lapack_int LAPACKE_sstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int nzc,
+                                lapack_int* isuppz, lapack_logical* tryrac,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, lapack_int nzc,
+                                lapack_int* isuppz, lapack_logical* tryrac,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int nzc, lapack_int* isuppz,
+                                lapack_logical* tryrac, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int nzc, lapack_int* isuppz,
+                                lapack_logical* tryrac, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_ssteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work );
+lapack_int LAPACKE_dsteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work );
+lapack_int LAPACKE_csteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, float* work );
+lapack_int LAPACKE_zsteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, double* work );
+
+lapack_int LAPACKE_ssterf_work( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dsterf_work( lapack_int n, double* d, double* e );
+
+lapack_int LAPACKE_sstev_work( int matrix_order, char jobz, lapack_int n,
+                               float* d, float* e, float* z, lapack_int ldz,
+                               float* work );
+lapack_int LAPACKE_dstev_work( int matrix_order, char jobz, lapack_int n,
+                               double* d, double* e, double* z, lapack_int ldz,
+                               double* work );
+
+lapack_int LAPACKE_sstevd_work( int matrix_order, char jobz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstevd_work( int matrix_order, char jobz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sstevr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dstevr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sstevx_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dstevx_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, double* work,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_ssycon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dsycon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_csycon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zsycon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+
+lapack_int LAPACKE_ssyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const float* a, lapack_int lda, float* s,
+                                 float* scond, float* amax, float* work );
+lapack_int LAPACKE_dsyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const double* a, lapack_int lda, double* s,
+                                 double* scond, double* amax, double* work );
+lapack_int LAPACKE_csyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_zsyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax,
+                                 lapack_complex_double* work );
+
+lapack_int LAPACKE_ssyev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, float* a, lapack_int lda, float* w,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dsyev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* w, double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssyevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* w, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsyevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* w, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssyevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dsyevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssyevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_dsyevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_ssygst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, float* a, lapack_int lda,
+                                const float* b, lapack_int ldb );
+lapack_int LAPACKE_dsygst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, double* a, lapack_int lda,
+                                const double* b, lapack_int ldb );
+
+lapack_int LAPACKE_ssygv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, float* a,
+                               lapack_int lda, float* b, lapack_int ldb,
+                               float* w, float* work, lapack_int lwork );
+lapack_int LAPACKE_dsygv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, double* a,
+                               lapack_int lda, double* b, lapack_int ldb,
+                               double* w, double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssygvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* w, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsygvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* w, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_ssygvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsygvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_ssyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_csyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zsyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_ssyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const float* b, lapack_int ldb,
+                                 float* x, lapack_int ldx, float* rcond,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dsyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s, const double* b,
+                                 lapack_int ldb, double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_csyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zsyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_ssysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* a, lapack_int lda,
+                               lapack_int* ipiv, float* b, lapack_int ldb,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dsysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* a, lapack_int lda,
+                               lapack_int* ipiv, double* b, lapack_int ldb,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_csysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zsysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_csysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zsysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+
+lapack_int LAPACKE_ssysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dsysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* rpvgrw,
+                                 double* berr, lapack_int n_err_bnds,
+                                 double* err_bnds_norm, double* err_bnds_comp,
+                                 lapack_int nparams, double* params,
+                                 double* work, lapack_int* iwork );
+lapack_int LAPACKE_csysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zsysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+
+lapack_int LAPACKE_ssytrd_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, float* d, float* e,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dsytrd_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, double* d, double* e,
+                                double* tau, double* work, lapack_int lwork );
+
+lapack_int LAPACKE_ssytrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dsytrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_csytrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_zsytrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* work,
+                                lapack_int lwork );
+
+lapack_int LAPACKE_ssytri_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* work );
+lapack_int LAPACKE_dsytri_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda,
+                                const lapack_int* ipiv, double* work );
+lapack_int LAPACKE_csytri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zsytri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_ssytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dsytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const float* ab, lapack_int ldab, float* rcond,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const double* ab, lapack_int ldab,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_ztbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const float* ab,
+                                lapack_int ldab, const float* b, lapack_int ldb,
+                                const float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const double* ab,
+                                lapack_int ldab, const double* b,
+                                lapack_int ldb, const double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const lapack_complex_float* ab,
+                                lapack_int ldab, const lapack_complex_float* b,
+                                lapack_int ldb, const lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_complex_double* b,
+                                lapack_int ldb, const lapack_complex_double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const float* ab,
+                                lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const double* ab,
+                                lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_ztbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_stfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, float alpha, const float* a,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dtfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, double alpha, const double* a,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_ctfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, lapack_complex_float alpha,
+                               const lapack_complex_float* a,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, lapack_complex_double alpha,
+                               const lapack_complex_double* a,
+                               lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n, float* a );
+lapack_int LAPACKE_dtftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n, double* a );
+lapack_int LAPACKE_ctftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n,
+                                lapack_complex_float* a );
+lapack_int LAPACKE_ztftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n,
+                                lapack_complex_double* a );
+
+lapack_int LAPACKE_stfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* arf, float* ap );
+lapack_int LAPACKE_dtfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* arf, double* ap );
+lapack_int LAPACKE_ctfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* arf,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_ztfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* arf,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_stfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* arf, float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_dtfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* arf, double* a,
+                                lapack_int lda );
+lapack_int LAPACKE_ctfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* arf,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* arf,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_stgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* s, lapack_int lds, const float* p,
+                                lapack_int ldp, float* vl, lapack_int ldvl,
+                                float* vr, lapack_int ldvr, lapack_int mm,
+                                lapack_int* m, float* work );
+lapack_int LAPACKE_dtgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* s, lapack_int lds,
+                                const double* p, lapack_int ldp, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work );
+lapack_int LAPACKE_ctgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* s, lapack_int lds,
+                                const lapack_complex_float* p, lapack_int ldp,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* s, lapack_int lds,
+                                const lapack_complex_double* p, lapack_int ldp,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* q, lapack_int ldq, float* z,
+                                lapack_int ldz, lapack_int* ifst,
+                                lapack_int* ilst, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dtgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* q, lapack_int ldq, double* z,
+                                lapack_int ldz, lapack_int* ifst,
+                                lapack_int* ilst, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_ctgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_stgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* alphar, float* alphai,
+                                float* beta, float* q, lapack_int ldq, float* z,
+                                lapack_int ldz, lapack_int* m, float* pl,
+                                float* pr, float* dif, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dtgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* alphar, double* alphai,
+                                double* beta, double* q, lapack_int ldq,
+                                double* z, lapack_int ldz, lapack_int* m,
+                                double* pl, double* pr, double* dif,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ctgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int* m, float* pl, float* pr, float* dif,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ztgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int* m, double* pl, double* pr,
+                                double* dif, lapack_complex_double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_stgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float tola, float tolb,
+                                float* alpha, float* beta, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                float* q, lapack_int ldq, float* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_dtgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double tola, double tolb,
+                                double* alpha, double* beta, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                double* q, lapack_int ldq, double* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_ctgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float tola, float tolb, float* alpha,
+                                float* beta, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* v,
+                                lapack_int ldv, lapack_complex_float* q,
+                                lapack_int ldq, lapack_complex_float* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_ztgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double tola, double tolb, double* alpha,
+                                double* beta, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* v,
+                                lapack_int ldv, lapack_complex_double* q,
+                                lapack_int ldq, lapack_complex_double* work,
+                                lapack_int* ncycle );
+
+lapack_int LAPACKE_stgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, const float* vl,
+                                lapack_int ldvl, const float* vr,
+                                lapack_int ldvr, float* s, float* dif,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_dtgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb,
+                                const double* vl, lapack_int ldvl,
+                                const double* vr, lapack_int ldvr, double* s,
+                                double* dif, lapack_int mm, lapack_int* m,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* vl, lapack_int ldvl,
+                                const lapack_complex_float* vr, lapack_int ldvr,
+                                float* s, float* dif, lapack_int mm,
+                                lapack_int* m, lapack_complex_float* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_ztgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* vl,
+                                lapack_int ldvl,
+                                const lapack_complex_double* vr,
+                                lapack_int ldvr, double* s, double* dif,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_stgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n, const float* a,
+                                lapack_int lda, const float* b, lapack_int ldb,
+                                float* c, lapack_int ldc, const float* d,
+                                lapack_int ldd, const float* e, lapack_int lde,
+                                float* f, lapack_int ldf, float* scale,
+                                float* dif, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n, const double* a,
+                                lapack_int lda, const double* b, lapack_int ldb,
+                                double* c, lapack_int ldc, const double* d,
+                                lapack_int ldd, const double* e, lapack_int lde,
+                                double* f, lapack_int ldf, double* scale,
+                                double* dif, double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* c, lapack_int ldc,
+                                const lapack_complex_float* d, lapack_int ldd,
+                                const lapack_complex_float* e, lapack_int lde,
+                                lapack_complex_float* f, lapack_int ldf,
+                                float* scale, float* dif,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ztgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* c, lapack_int ldc,
+                                const lapack_complex_double* d, lapack_int ldd,
+                                const lapack_complex_double* e, lapack_int lde,
+                                lapack_complex_double* f, lapack_int ldf,
+                                double* scale, double* dif,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork );
+
+lapack_int LAPACKE_stpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const float* ap,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const double* ap,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_float* ap, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_double* ap, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* ap, const float* b, lapack_int ldb,
+                                const float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* ap, const double* b,
+                                lapack_int ldb, const double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_stptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, float* ap );
+lapack_int LAPACKE_dtptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, double* ap );
+lapack_int LAPACKE_ctptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_float* ap );
+lapack_int LAPACKE_ztptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_double* ap );
+
+lapack_int LAPACKE_stptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* ap, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* ap, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_stpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* ap, float* arf );
+lapack_int LAPACKE_dtpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* ap, double* arf );
+lapack_int LAPACKE_ctpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* ap,
+                                lapack_complex_float* arf );
+lapack_int LAPACKE_ztpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* ap,
+                                lapack_complex_double* arf );
+
+lapack_int LAPACKE_stpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float* a, lapack_int lda );
+lapack_int LAPACKE_dtpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double* a, lapack_int lda );
+lapack_int LAPACKE_ctpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_strcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const float* a,
+                                lapack_int lda, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const double* a,
+                                lapack_int lda, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_ztrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* rcond, lapack_complex_double* work,
+                                double* rwork );
+
+lapack_int LAPACKE_strevc_work( int matrix_order, char side, char howmny,
+                                lapack_logical* select, lapack_int n,
+                                const float* t, lapack_int ldt, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, float* work );
+lapack_int LAPACKE_dtrevc_work( int matrix_order, char side, char howmny,
+                                lapack_logical* select, lapack_int n,
+                                const double* t, lapack_int ldt, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work );
+lapack_int LAPACKE_ctrevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztrevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_strexc_work( int matrix_order, char compq, lapack_int n,
+                                float* t, lapack_int ldt, float* q,
+                                lapack_int ldq, lapack_int* ifst,
+                                lapack_int* ilst, float* work );
+lapack_int LAPACKE_dtrexc_work( int matrix_order, char compq, lapack_int n,
+                                double* t, lapack_int ldt, double* q,
+                                lapack_int ldq, lapack_int* ifst,
+                                lapack_int* ilst, double* work );
+lapack_int LAPACKE_ctrexc_work( int matrix_order, char compq, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztrexc_work( int matrix_order, char compq, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_int ifst, lapack_int ilst );
+
+lapack_int LAPACKE_strrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, const float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb,
+                                const double* x, lapack_int ldx, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_ctrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_strsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                float* t, lapack_int ldt, float* q,
+                                lapack_int ldq, float* wr, float* wi,
+                                lapack_int* m, float* s, float* sep,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dtrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                double* t, lapack_int ldt, double* q,
+                                lapack_int ldq, double* wr, double* wi,
+                                lapack_int* m, double* s, double* sep,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ctrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* w, lapack_int* m,
+                                float* s, float* sep,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ztrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* w, lapack_int* m,
+                                double* s, double* sep,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_strsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* t, lapack_int ldt, const float* vl,
+                                lapack_int ldvl, const float* vr,
+                                lapack_int ldvr, float* s, float* sep,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int ldwork, lapack_int* iwork );
+lapack_int LAPACKE_dtrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* t, lapack_int ldt,
+                                const double* vl, lapack_int ldvl,
+                                const double* vr, lapack_int ldvr, double* s,
+                                double* sep, lapack_int mm, lapack_int* m,
+                                double* work, lapack_int ldwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                const lapack_complex_float* vl, lapack_int ldvl,
+                                const lapack_complex_float* vr, lapack_int ldvr,
+                                float* s, float* sep, lapack_int mm,
+                                lapack_int* m, lapack_complex_float* work,
+                                lapack_int ldwork, float* rwork );
+lapack_int LAPACKE_ztrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                const lapack_complex_double* vl,
+                                lapack_int ldvl,
+                                const lapack_complex_double* vr,
+                                lapack_int ldvr, double* s, double* sep,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, lapack_int ldwork,
+                                double* rwork );
+
+lapack_int LAPACKE_strsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, float* c, lapack_int ldc,
+                                float* scale );
+lapack_int LAPACKE_dtrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb, double* c,
+                                lapack_int ldc, double* scale );
+lapack_int LAPACKE_ctrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* c, lapack_int ldc,
+                                float* scale );
+lapack_int LAPACKE_ztrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* c, lapack_int ldc,
+                                double* scale );
+
+lapack_int LAPACKE_strtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, float* a, lapack_int lda );
+lapack_int LAPACKE_dtrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, double* a, lapack_int lda );
+lapack_int LAPACKE_ctrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_ztrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda );
+
+lapack_int LAPACKE_strtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* a, lapack_int lda, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dtrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_ctrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_strttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* arf );
+lapack_int LAPACKE_dtrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* arf );
+lapack_int LAPACKE_ctrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* arf );
+lapack_int LAPACKE_ztrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* arf );
+
+lapack_int LAPACKE_strttp_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda, float* ap );
+lapack_int LAPACKE_dtrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda, double* ap );
+lapack_int LAPACKE_ctrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_ztrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_stzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dtzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_ctzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ztzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungtr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungtr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cupgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zupgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_cupmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zupmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_claghe( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, lapack_complex_float* a,
+                           lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_zlaghe( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, lapack_complex_double* a,
+                           lapack_int lda, lapack_int* iseed );
+
+lapack_int LAPACKE_slagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, float* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_dlagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, double* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_clagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, lapack_complex_float* a,
+                           lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_zlagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, lapack_complex_double* a,
+                           lapack_int lda, lapack_int* iseed );
+
+lapack_int LAPACKE_slapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, float* x, lapack_int ldx,
+                           lapack_int* k );
+lapack_int LAPACKE_dlapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, double* x,
+                           lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_clapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, lapack_complex_float* x,
+                           lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_zlapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* k );
+
+
+float LAPACKE_slapy2( float x, float y );
+double LAPACKE_dlapy2( double x, double y );
+
+float LAPACKE_slapy3( float x, float y, float z );
+double LAPACKE_dlapy3( double x, double y, double z );
+
+lapack_int LAPACKE_slartgp( float f, float g, float* cs, float* sn, float* r );
+lapack_int LAPACKE_dlartgp( double f, double g, double* cs, double* sn,
+                            double* r );
+
+lapack_int LAPACKE_slartgs( float x, float y, float sigma, float* cs,
+                            float* sn );
+lapack_int LAPACKE_dlartgs( double x, double y, double sigma, double* cs,
+                            double* sn );
+
+
+//LAPACK 3.3.0
+lapack_int LAPACKE_cbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, float* theta, float* phi,
+                           lapack_complex_float* u1, lapack_int ldu1,
+                           lapack_complex_float* u2, lapack_int ldu2,
+                           lapack_complex_float* v1t, lapack_int ldv1t,
+                           lapack_complex_float* v2t, lapack_int ldv2t,
+                           float* b11d, float* b11e, float* b12d, float* b12e,
+                           float* b21d, float* b21e, float* b22d, float* b22e );
+lapack_int LAPACKE_cbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* theta, float* phi,
+                                lapack_complex_float* u1, lapack_int ldu1,
+                                lapack_complex_float* u2, lapack_int ldu2,
+                                lapack_complex_float* v1t, lapack_int ldv1t,
+                                lapack_complex_float* v2t, lapack_int ldv2t,
+                                float* b11d, float* b11e, float* b12d,
+                                float* b12e, float* b21d, float* b21e,
+                                float* b22d, float* b22e, float* rwork,
+                                lapack_int lrwork );
+lapack_int LAPACKE_cheswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_cheswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_chetri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_chetri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_chetri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_chetri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_float* work, lapack_int nb );
+lapack_int LAPACKE_chetrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_chetrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_csyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_csyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_csyswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_csyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_csytri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_csytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_csytri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_csytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_float* work, lapack_int nb );
+lapack_int LAPACKE_csytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_cunbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_float* x11, lapack_int ldx11,
+                           lapack_complex_float* x12, lapack_int ldx12,
+                           lapack_complex_float* x21, lapack_int ldx21,
+                           lapack_complex_float* x22, lapack_int ldx22,
+                           float* theta, float* phi,
+                           lapack_complex_float* taup1,
+                           lapack_complex_float* taup2,
+                           lapack_complex_float* tauq1,
+                           lapack_complex_float* tauq2 );
+lapack_int LAPACKE_cunbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                lapack_complex_float* x11, lapack_int ldx11,
+                                lapack_complex_float* x12, lapack_int ldx12,
+                                lapack_complex_float* x21, lapack_int ldx21,
+                                lapack_complex_float* x22, lapack_int ldx22,
+                                float* theta, float* phi,
+                                lapack_complex_float* taup1,
+                                lapack_complex_float* taup2,
+                                lapack_complex_float* tauq1,
+                                lapack_complex_float* tauq2,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_cuncsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_float* x11, lapack_int ldx11,
+                           lapack_complex_float* x12, lapack_int ldx12,
+                           lapack_complex_float* x21, lapack_int ldx21,
+                           lapack_complex_float* x22, lapack_int ldx22,
+                           float* theta, lapack_complex_float* u1,
+                           lapack_int ldu1, lapack_complex_float* u2,
+                           lapack_int ldu2, lapack_complex_float* v1t,
+                           lapack_int ldv1t, lapack_complex_float* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_cuncsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, lapack_complex_float* x11,
+                                lapack_int ldx11, lapack_complex_float* x12,
+                                lapack_int ldx12, lapack_complex_float* x21,
+                                lapack_int ldx21, lapack_complex_float* x22,
+                                lapack_int ldx22, float* theta,
+                                lapack_complex_float* u1, lapack_int ldu1,
+                                lapack_complex_float* u2, lapack_int ldu2,
+                                lapack_complex_float* v1t, lapack_int ldv1t,
+                                lapack_complex_float* v2t, lapack_int ldv2t,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, double* theta,
+                           double* phi, double* u1, lapack_int ldu1, double* u2,
+                           lapack_int ldu2, double* v1t, lapack_int ldv1t,
+                           double* v2t, lapack_int ldv2t, double* b11d,
+                           double* b11e, double* b12d, double* b12e,
+                           double* b21d, double* b21e, double* b22d,
+                           double* b22e );
+lapack_int LAPACKE_dbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* theta, double* phi, double* u1,
+                                lapack_int ldu1, double* u2, lapack_int ldu2,
+                                double* v1t, lapack_int ldv1t, double* v2t,
+                                lapack_int ldv2t, double* b11d, double* b11e,
+                                double* b12d, double* b12e, double* b21d,
+                                double* b21e, double* b22d, double* b22e,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_dorbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           double* x11, lapack_int ldx11, double* x12,
+                           lapack_int ldx12, double* x21, lapack_int ldx21,
+                           double* x22, lapack_int ldx22, double* theta,
+                           double* phi, double* taup1, double* taup2,
+                           double* tauq1, double* tauq2 );
+lapack_int LAPACKE_dorbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* x11, lapack_int ldx11, double* x12,
+                                lapack_int ldx12, double* x21, lapack_int ldx21,
+                                double* x22, lapack_int ldx22, double* theta,
+                                double* phi, double* taup1, double* taup2,
+                                double* tauq1, double* tauq2, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           double* x11, lapack_int ldx11, double* x12,
+                           lapack_int ldx12, double* x21, lapack_int ldx21,
+                           double* x22, lapack_int ldx22, double* theta,
+                           double* u1, lapack_int ldu1, double* u2,
+                           lapack_int ldu2, double* v1t, lapack_int ldv1t,
+                           double* v2t, lapack_int ldv2t );
+lapack_int LAPACKE_dorcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, double* x11, lapack_int ldx11,
+                                double* x12, lapack_int ldx12, double* x21,
+                                lapack_int ldx21, double* x22, lapack_int ldx22,
+                                double* theta, double* u1, lapack_int ldu1,
+                                double* u2, lapack_int ldu2, double* v1t,
+                                lapack_int ldv1t, double* v2t, lapack_int ldv2t,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            double* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, double* a, lapack_int lda,
+                                 const lapack_int* ipiv, double* work );
+lapack_int LAPACKE_dsyswapr( int matrix_order, char uplo, lapack_int n,
+                             double* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_dsyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  double* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_dsytri2( int matrix_order, char uplo, lapack_int n,
+                            double* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_dsytri2x( int matrix_order, char uplo, lapack_int n,
+                             double* a, lapack_int lda, const lapack_int* ipiv,
+                             lapack_int nb );
+lapack_int LAPACKE_dsytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  double* a, lapack_int lda,
+                                  const lapack_int* ipiv, double* work,
+                                  lapack_int nb );
+lapack_int LAPACKE_dsytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const double* a, lapack_int lda,
+                            const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_dsytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 double* b, lapack_int ldb, double* work );
+lapack_int LAPACKE_sbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, float* theta, float* phi,
+                           float* u1, lapack_int ldu1, float* u2,
+                           lapack_int ldu2, float* v1t, lapack_int ldv1t,
+                           float* v2t, lapack_int ldv2t, float* b11d,
+                           float* b11e, float* b12d, float* b12e, float* b21d,
+                           float* b21e, float* b22d, float* b22e );
+lapack_int LAPACKE_sbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* theta, float* phi, float* u1,
+                                lapack_int ldu1, float* u2, lapack_int ldu2,
+                                float* v1t, lapack_int ldv1t, float* v2t,
+                                lapack_int ldv2t, float* b11d, float* b11e,
+                                float* b12d, float* b12e, float* b21d,
+                                float* b21e, float* b22d, float* b22e,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_sorbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q, float* x11,
+                           lapack_int ldx11, float* x12, lapack_int ldx12,
+                           float* x21, lapack_int ldx21, float* x22,
+                           lapack_int ldx22, float* theta, float* phi,
+                           float* taup1, float* taup2, float* tauq1,
+                           float* tauq2 );
+lapack_int LAPACKE_sorbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* x11, lapack_int ldx11, float* x12,
+                                lapack_int ldx12, float* x21, lapack_int ldx21,
+                                float* x22, lapack_int ldx22, float* theta,
+                                float* phi, float* taup1, float* taup2,
+                                float* tauq1, float* tauq2, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_sorcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q, float* x11,
+                           lapack_int ldx11, float* x12, lapack_int ldx12,
+                           float* x21, lapack_int ldx21, float* x22,
+                           lapack_int ldx22, float* theta, float* u1,
+                           lapack_int ldu1, float* u2, lapack_int ldu2,
+                           float* v1t, lapack_int ldv1t, float* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_sorcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, float* x11, lapack_int ldx11,
+                                float* x12, lapack_int ldx12, float* x21,
+                                lapack_int ldx21, float* x22, lapack_int ldx22,
+                                float* theta, float* u1, lapack_int ldu1,
+                                float* u2, lapack_int ldu2, float* v1t,
+                                lapack_int ldv1t, float* v2t, lapack_int ldv2t,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ssyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            float* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_ssyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, float* a, lapack_int lda,
+                                 const lapack_int* ipiv, float* work );
+lapack_int LAPACKE_ssyswapr( int matrix_order, char uplo, lapack_int n,
+                             float* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_ssyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  float* a, lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_ssytri2( int matrix_order, char uplo, lapack_int n, float* a,
+                            lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_ssytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ssytri2x( int matrix_order, char uplo, lapack_int n,
+                             float* a, lapack_int lda, const lapack_int* ipiv,
+                             lapack_int nb );
+lapack_int LAPACKE_ssytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  float* a, lapack_int lda,
+                                  const lapack_int* ipiv, float* work,
+                                  lapack_int nb );
+lapack_int LAPACKE_ssytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const float* a, lapack_int lda,
+                            const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_ssytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 float* b, lapack_int ldb, float* work );
+lapack_int LAPACKE_zbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, double* theta,
+                           double* phi, lapack_complex_double* u1,
+                           lapack_int ldu1, lapack_complex_double* u2,
+                           lapack_int ldu2, lapack_complex_double* v1t,
+                           lapack_int ldv1t, lapack_complex_double* v2t,
+                           lapack_int ldv2t, double* b11d, double* b11e,
+                           double* b12d, double* b12e, double* b21d,
+                           double* b21e, double* b22d, double* b22e );
+lapack_int LAPACKE_zbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* theta, double* phi,
+                                lapack_complex_double* u1, lapack_int ldu1,
+                                lapack_complex_double* u2, lapack_int ldu2,
+                                lapack_complex_double* v1t, lapack_int ldv1t,
+                                lapack_complex_double* v2t, lapack_int ldv2t,
+                                double* b11d, double* b11e, double* b12d,
+                                double* b12e, double* b21d, double* b21e,
+                                double* b22d, double* b22e, double* rwork,
+                                lapack_int lrwork );
+lapack_int LAPACKE_zheswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_zheswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_zhetri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_zhetri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_zhetri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_zhetri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_double* work, lapack_int nb );
+lapack_int LAPACKE_zhetrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_double* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_double* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+lapack_int LAPACKE_zsyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_zsyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, lapack_complex_double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_double* work );
+lapack_int LAPACKE_zsyswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_zsyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_zsytri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_zsytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_zsytri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_zsytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_double* work, lapack_int nb );
+lapack_int LAPACKE_zsytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_double* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_double* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_double* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+lapack_int LAPACKE_zunbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_double* x11, lapack_int ldx11,
+                           lapack_complex_double* x12, lapack_int ldx12,
+                           lapack_complex_double* x21, lapack_int ldx21,
+                           lapack_complex_double* x22, lapack_int ldx22,
+                           double* theta, double* phi,
+                           lapack_complex_double* taup1,
+                           lapack_complex_double* taup2,
+                           lapack_complex_double* tauq1,
+                           lapack_complex_double* tauq2 );
+lapack_int LAPACKE_zunbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                lapack_complex_double* x11, lapack_int ldx11,
+                                lapack_complex_double* x12, lapack_int ldx12,
+                                lapack_complex_double* x21, lapack_int ldx21,
+                                lapack_complex_double* x22, lapack_int ldx22,
+                                double* theta, double* phi,
+                                lapack_complex_double* taup1,
+                                lapack_complex_double* taup2,
+                                lapack_complex_double* tauq1,
+                                lapack_complex_double* tauq2,
+                                lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_zuncsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_double* x11, lapack_int ldx11,
+                           lapack_complex_double* x12, lapack_int ldx12,
+                           lapack_complex_double* x21, lapack_int ldx21,
+                           lapack_complex_double* x22, lapack_int ldx22,
+                           double* theta, lapack_complex_double* u1,
+                           lapack_int ldu1, lapack_complex_double* u2,
+                           lapack_int ldu2, lapack_complex_double* v1t,
+                           lapack_int ldv1t, lapack_complex_double* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_zuncsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, lapack_complex_double* x11,
+                                lapack_int ldx11, lapack_complex_double* x12,
+                                lapack_int ldx12, lapack_complex_double* x21,
+                                lapack_int ldx21, lapack_complex_double* x22,
+                                lapack_int ldx22, double* theta,
+                                lapack_complex_double* u1, lapack_int ldu1,
+                                lapack_complex_double* u2, lapack_int ldu2,
+                                lapack_complex_double* v1t, lapack_int ldv1t,
+                                lapack_complex_double* v2t, lapack_int ldv2t,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork );
+//LAPACK 3.4.0
+lapack_int LAPACKE_sgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const float* v, lapack_int ldv,
+                            const float* t, lapack_int ldt, float* c,
+                            lapack_int ldc );
+lapack_int LAPACKE_dgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const double* v, lapack_int ldv,
+                            const double* t, lapack_int ldt, double* c,
+                            lapack_int ldc );
+lapack_int LAPACKE_cgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const lapack_complex_float* v,
+                            lapack_int ldv, const lapack_complex_float* t,
+                            lapack_int ldt, lapack_complex_float* c,
+                            lapack_int ldc );
+lapack_int LAPACKE_zgemqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int nb, const lapack_complex_double* v,
+                            lapack_int ldv, const lapack_complex_double* t,
+                            lapack_int ldt, lapack_complex_double* c,
+                            lapack_int ldc );
+
+lapack_int LAPACKE_sgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, float* a, lapack_int lda, float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_dgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, double* a, lapack_int lda, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_cgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_zgeqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nb, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* t,
+                           lapack_int ldt );
+
+lapack_int LAPACKE_sgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_dgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_cgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_dgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* t,
+                            lapack_int ldt );
+lapack_int LAPACKE_cgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt3( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb, const float* v,
+                            lapack_int ldv, const float* t, lapack_int ldt,
+                            float* a, lapack_int lda, float* b,
+                            lapack_int ldb );
+lapack_int LAPACKE_dtpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb, const double* v,
+                            lapack_int ldv, const double* t, lapack_int ldt,
+                            double* a, lapack_int lda, double* b,
+                            lapack_int ldb );
+lapack_int LAPACKE_ctpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb,
+                            const lapack_complex_float* v, lapack_int ldv,
+                            const lapack_complex_float* t, lapack_int ldt,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztpmqrt( int matrix_order, char side, char trans,
+                            lapack_int m, lapack_int n, lapack_int k,
+                            lapack_int l, lapack_int nb,
+                            const lapack_complex_double* v, lapack_int ldv,
+                            const lapack_complex_double* t, lapack_int ldt,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_dtpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_ctpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* t,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int ldt );
+lapack_int LAPACKE_ztpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* b, lapack_int ldb,
+                            float* t, lapack_int ldt );
+lapack_int LAPACKE_dtpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* b,
+                            lapack_int ldb, double* t, lapack_int ldt );
+lapack_int LAPACKE_ctpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_ztpqrt2( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l, const float* v,
+                           lapack_int ldv, const float* t, lapack_int ldt,
+                           float* a, lapack_int lda, float* b, lapack_int ldb,
+                           lapack_int myldwork );
+lapack_int LAPACKE_dtprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l, const double* v,
+                           lapack_int ldv, const double* t, lapack_int ldt,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           lapack_int myldwork );
+lapack_int LAPACKE_ctprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l,
+                           const lapack_complex_float* v, lapack_int ldv,
+                           const lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int myldwork );
+lapack_int LAPACKE_ztprfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_int l,
+                           const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_int myldwork );
+
+lapack_int LAPACKE_sgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const float* v, lapack_int ldv,
+                                 const float* t, lapack_int ldt, float* c,
+                                 lapack_int ldc, float* work );
+lapack_int LAPACKE_dgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const double* v, lapack_int ldv,
+                                 const double* t, lapack_int ldt, double* c,
+                                 lapack_int ldc, double* work );
+lapack_int LAPACKE_cgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const lapack_complex_float* v,
+                                 lapack_int ldv, const lapack_complex_float* t,
+                                 lapack_int ldt, lapack_complex_float* c,
+                                 lapack_int ldc, lapack_complex_float* work );
+lapack_int LAPACKE_zgemqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int nb, const lapack_complex_double* v,
+                                 lapack_int ldv, const lapack_complex_double* t,
+                                 lapack_int ldt, lapack_complex_double* c,
+                                 lapack_int ldc, lapack_complex_double* work );
+
+lapack_int LAPACKE_sgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, float* a, lapack_int lda,
+                                float* t, lapack_int ldt, float* work );
+lapack_int LAPACKE_dgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, double* a, lapack_int lda,
+                                double* t, lapack_int ldt, double* work );
+lapack_int LAPACKE_cgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* t,
+                                lapack_int ldt, lapack_complex_float* work );
+lapack_int LAPACKE_zgeqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nb, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* t,
+                                lapack_int ldt, lapack_complex_double* work );
+
+lapack_int LAPACKE_sgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_dgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_cgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_dgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_cgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb, const float* v,
+                                 lapack_int ldv, const float* t, lapack_int ldt,
+                                 float* a, lapack_int lda, float* b,
+                                 lapack_int ldb, float* work );
+lapack_int LAPACKE_dtpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb, const double* v,
+                                 lapack_int ldv, const double* t,
+                                 lapack_int ldt, double* a, lapack_int lda,
+                                 double* b, lapack_int ldb, double* work );
+lapack_int LAPACKE_ctpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb,
+                                 const lapack_complex_float* v, lapack_int ldv,
+                                 const lapack_complex_float* t, lapack_int ldt,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_ztpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb,
+                                 const lapack_complex_double* v, lapack_int ldv,
+                                 const lapack_complex_double* t, lapack_int ldt,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+
+lapack_int LAPACKE_dtpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* t, lapack_int ldt, double* work );
+lapack_int LAPACKE_ctpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* t,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_int ldt, lapack_complex_float* work );
+lapack_int LAPACKE_ztpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_stpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* b,
+                                 lapack_int ldb, float* t, lapack_int ldt );
+lapack_int LAPACKE_dtpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* b,
+                                 lapack_int ldb, double* t, lapack_int ldt );
+lapack_int LAPACKE_ctpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_ztpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_stprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const float* v, lapack_int ldv, const float* t,
+                                lapack_int ldt, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, const float* mywork,
+                                lapack_int myldwork );
+lapack_int LAPACKE_dtprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const double* v, lapack_int ldv,
+                                const double* t, lapack_int ldt, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                const double* mywork, lapack_int myldwork );
+lapack_int LAPACKE_ctprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                const float* mywork, lapack_int myldwork );
+lapack_int LAPACKE_ztprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                const double* mywork, lapack_int myldwork );
+//LAPACK 3.X.X
+lapack_int LAPACKE_csyr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float alpha,
+                             const lapack_complex_float* x, lapack_int incx,
+                             lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zsyr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double alpha,
+                             const lapack_complex_double* x, lapack_int incx,
+                             lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_csyr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float alpha,
+                                  const lapack_complex_float* x,
+                                  lapack_int incx, lapack_complex_float* a,
+                                  lapack_int lda );
+lapack_int LAPACKE_zsyr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double alpha,
+                                  const lapack_complex_double* x,
+                                  lapack_int incx, lapack_complex_double* a,
+                                  lapack_int lda );
+
+
+
+#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF)
+#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF)
+#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF)
+#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF)
+#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF)
+#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF)
+#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF)
+#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF)
+#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF)
+#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF)
+#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF)
+#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF)
+#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF)
+#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF)
+#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF)
+#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF)
+#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF)
+#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF)
+#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF)
+#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF)
+#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF)
+#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF)
+#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF)
+#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF)
+#define LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF)
+#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF)
+#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF)
+#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF)
+#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF)
+#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF)
+#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF)
+#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF)
+#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF)
+#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF)
+#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF)
+#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF)
+#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF)
+#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF)
+#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF)
+#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF)
+#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF)
+#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF)
+#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF)
+#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF)
+#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF)
+#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF)
+#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF)
+#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF)
+#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS)
+#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS)
+#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS)
+#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS)
+#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS)
+#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS)
+#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS)
+#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS)
+#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS)
+#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS)
+#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS)
+#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS)
+#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS)
+#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS)
+#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS)
+#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS)
+#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS)
+#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS)
+#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS)
+#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS)
+#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS)
+#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS)
+#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS)
+#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS)
+#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS)
+#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS)
+#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS)
+#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS)
+#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS)
+#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS)
+#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS)
+#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS)
+#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS)
+#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS)
+#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS)
+#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS)
+#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS)
+#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS)
+#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS)
+#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS)
+#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS)
+#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS)
+#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS)
+#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS)
+#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS)
+#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS)
+#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS)
+#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS)
+#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS)
+#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS)
+#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS)
+#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS)
+#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS)
+#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS)
+#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS)
+#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS)
+#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON)
+#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON)
+#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON)
+#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON)
+#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON)
+#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON)
+#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON)
+#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON)
+#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON)
+#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON)
+#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON)
+#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON)
+#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON)
+#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON)
+#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON)
+#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON)
+#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON)
+#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON)
+#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON)
+#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON)
+#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON)
+#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON)
+#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon,CPBCON)
+#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON)
+#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON)
+#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON)
+#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON)
+#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON)
+#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON)
+#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON)
+#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON)
+#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON)
+#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON)
+#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON)
+#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON)
+#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON)
+#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON)
+#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON)
+#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON)
+#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON)
+#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON)
+#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON)
+#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON)
+#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON)
+#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON)
+#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON)
+#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON)
+#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON)
+#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON)
+#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON)
+#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON)
+#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON)
+#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS)
+#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS)
+#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS)
+#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS)
+#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX)
+#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX)
+#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX)
+#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX)
+#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS)
+#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS)
+#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS)
+#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS)
+#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX)
+#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX)
+#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX)
+#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX)
+#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS)
+#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS)
+#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS)
+#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS)
+#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS)
+#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS)
+#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS)
+#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS)
+#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX)
+#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX)
+#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX)
+#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX)
+#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS)
+#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS)
+#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS)
+#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS)
+#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS)
+#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS)
+#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS)
+#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs,ZPBRFS)
+#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS)
+#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS)
+#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS)
+#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS)
+#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS)
+#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS)
+#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS)
+#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS)
+#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX)
+#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX)
+#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX)
+#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX)
+#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS)
+#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS)
+#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX)
+#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX)
+#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS)
+#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS)
+#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS)
+#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS)
+#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS)
+#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS)
+#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS)
+#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS)
+#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS)
+#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS)
+#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS)
+#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS)
+#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS)
+#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS)
+#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS)
+#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS)
+#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS)
+#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS)
+#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI)
+#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI)
+#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI)
+#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI)
+#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI)
+#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI)
+#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI)
+#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI)
+#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI)
+#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI)
+#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI)
+#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI)
+#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI)
+#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI)
+#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI)
+#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI)
+#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI)
+#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI)
+#define LAPACK_csytri LAPACK_GLOBAL(csytri,CSYTRI)
+#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI)
+#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI)
+#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI)
+#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI)
+#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI)
+#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI)
+#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI)
+#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI)
+#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI)
+#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI)
+#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI)
+#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI)
+#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI)
+#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI)
+#define LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI)
+#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI)
+#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI)
+#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI)
+#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI)
+#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI)
+#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI)
+#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU)
+#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU)
+#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU)
+#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU)
+#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB)
+#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB)
+#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB)
+#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB)
+#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU)
+#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU)
+#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU)
+#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU)
+#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB)
+#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB)
+#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB)
+#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB)
+#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU)
+#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU)
+#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU)
+#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU)
+#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB)
+#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB)
+#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB)
+#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB)
+#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU)
+#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU)
+#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU)
+#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU)
+#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU)
+#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU)
+#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU)
+#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU)
+#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB)
+#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB)
+#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB)
+#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB)
+#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB)
+#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB)
+#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV)
+#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV)
+#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
+#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV)
+#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV)
+#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV)
+#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX)
+#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX)
+#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX)
+#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX)
+#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX)
+#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX)
+#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX)
+#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX)
+#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV)
+#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV)
+#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV)
+#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV)
+#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX)
+#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX)
+#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX)
+#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX)
+#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX)
+#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX)
+#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX)
+#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX)
+#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV)
+#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV)
+#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV)
+#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV)
+#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX)
+#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX)
+#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX)
+#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX)
+#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV)
+#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV)
+#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV)
+#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV)
+#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV)
+#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV)
+#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX)
+#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX)
+#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX)
+#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX)
+#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX)
+#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX)
+#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX)
+#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX)
+#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV)
+#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV)
+#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV)
+#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV)
+#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX)
+#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX)
+#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX)
+#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX)
+#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV)
+#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV)
+#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV)
+#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV)
+#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX)
+#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX)
+#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX)
+#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX)
+#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV)
+#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV)
+#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV)
+#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV)
+#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX)
+#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX)
+#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX)
+#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX)
+#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV)
+#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV)
+#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV)
+#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV)
+#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX)
+#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX)
+#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX)
+#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX)
+#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX)
+#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX)
+#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX)
+#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX)
+#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV)
+#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV)
+#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX)
+#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX)
+#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX)
+#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX)
+#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV)
+#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV)
+#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV)
+#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV)
+#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX)
+#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX)
+#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX)
+#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX)
+#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV)
+#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV)
+#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX)
+#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX)
+#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF)
+#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF)
+#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF)
+#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF)
+#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF)
+#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF)
+#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF)
+#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF)
+#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3)
+#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3)
+#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3)
+#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3)
+#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR)
+#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR)
+#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR)
+#define LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR)
+#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR)
+#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR)
+#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR)
+#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR)
+#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF)
+#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF)
+#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF)
+#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF)
+#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ)
+#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ)
+#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ)
+#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ)
+#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ)
+#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ)
+#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ)
+#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ)
+#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF)
+#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF)
+#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF)
+#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF)
+#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL)
+#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL)
+#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL)
+#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL)
+#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL)
+#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL)
+#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL)
+#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL)
+#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF)
+#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF)
+#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF)
+#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF)
+#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ)
+#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ)
+#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ)
+#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ)
+#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ)
+#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ)
+#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ)
+#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ)
+#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF)
+#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF)
+#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF)
+#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF)
+#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ)
+#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ)
+#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ)
+#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ)
+#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF)
+#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF)
+#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF)
+#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF)
+#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF)
+#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF)
+#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF)
+#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF)
+#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD)
+#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD)
+#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD)
+#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD)
+#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD)
+#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD)
+#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD)
+#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD)
+#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR)
+#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR)
+#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR)
+#define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
+#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR)
+#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR)
+#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
+#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR)
+#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR)
+#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR)
+#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR)
+#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR)
+#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC)
+#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC)
+#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD)
+#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD)
+#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR)
+#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR)
+#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR)
+#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR)
+#define LAPACK_chetrd LAPACK_GLOBAL(chetrd,CHETRD)
+#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD)
+#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR)
+#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR)
+#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR)
+#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR)
+#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD)
+#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD)
+#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR)
+#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR)
+#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR)
+#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR)
+#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD)
+#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD)
+#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR)
+#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR)
+#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR)
+#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR)
+#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD)
+#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD)
+#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD)
+#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD)
+#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF)
+#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF)
+#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR)
+#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR)
+#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR)
+#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR)
+#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR)
+#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR)
+#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR)
+#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR)
+#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC)
+#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC)
+#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC)
+#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC)
+#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR)
+#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR)
+#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR)
+#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR)
+#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR)
+#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR)
+#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR)
+#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR)
+#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ)
+#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ)
+#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN)
+#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN)
+#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN)
+#define LAPACK_zstein LAPACK_GLOBAL(zstein,ZSTEIN)
+#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA)
+#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA)
+#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST)
+#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST)
+#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST)
+#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
+#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST)
+#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST)
+#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST)
+#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST)
+#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST)
+#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST)
+#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST)
+#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST)
+#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF)
+#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF)
+#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF)
+#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF)
+#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD)
+#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD)
+#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD)
+#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD)
+#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR)
+#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR)
+#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR)
+#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR)
+#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR)
+#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR)
+#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR)
+#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR)
+#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL)
+#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL)
+#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL)
+#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL)
+#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK)
+#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK)
+#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK)
+#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK)
+#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR)
+#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR)
+#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR)
+#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR)
+#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN)
+#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN)
+#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN)
+#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN)
+#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC)
+#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC)
+#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC)
+#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC)
+#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA)
+#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA)
+#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA)
+#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA)
+#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC)
+#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC)
+#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC)
+#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC)
+#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN)
+#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN)
+#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN)
+#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN)
+#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL)
+#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL)
+#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL)
+#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL)
+#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD)
+#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd,DGGHRD)
+#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD)
+#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD)
+#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL)
+#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL)
+#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL)
+#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL)
+#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK)
+#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK)
+#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
+#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK)
+#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ)
+#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ)
+#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ)
+#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ)
+#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC)
+#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC)
+#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC)
+#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC)
+#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC)
+#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC)
+#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC)
+#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC)
+#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN)
+#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN)
+#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN)
+#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN)
+#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL)
+#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL)
+#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL)
+#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL)
+#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA)
+#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA)
+#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA)
+#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA)
+#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
+#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
+#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
+#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
+#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA)
+#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA)
+#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA)
+#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA)
+#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS)
+#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS)
+#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS)
+#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS)
+#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY)
+#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY)
+#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY)
+#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy,ZGELSY)
+#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS)
+#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS)
+#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS)
+#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS)
+#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD)
+#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD)
+#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD)
+#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD)
+#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE)
+#define LAPACK_dgglse LAPACK_GLOBAL(dgglse,DGGLSE)
+#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE)
+#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE)
+#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM)
+#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM)
+#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM)
+#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM)
+#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV)
+#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV)
+#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV)
+#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV)
+#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD)
+#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD)
+#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD)
+#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD)
+#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX)
+#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX)
+#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX)
+#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX)
+#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR)
+#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR)
+#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR)
+#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR)
+#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV)
+#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV)
+#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV)
+#define LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV)
+#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD)
+#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD)
+#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD)
+#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD)
+#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX)
+#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX)
+#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX)
+#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX)
+#define LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV)
+#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV)
+#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV)
+#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV)
+#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD)
+#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD)
+#define LAPACK_chbevd LAPACK_GLOBAL(chbevd,CHBEVD)
+#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD)
+#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX)
+#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX)
+#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX)
+#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX)
+#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV)
+#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV)
+#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD)
+#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD)
+#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX)
+#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX)
+#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR)
+#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR)
+#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES)
+#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES)
+#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES)
+#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES)
+#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX)
+#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX)
+#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX)
+#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX)
+#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV)
+#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV)
+#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV)
+#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV)
+#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX)
+#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX)
+#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX)
+#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX)
+#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
+#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
+#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
+#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
+#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
+#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
+#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
+#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
+#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV)
+#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV)
+#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
+#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ)
+#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
+#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
+#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
+#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
+#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV)
+#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV)
+#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
+#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV)
+#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD)
+#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD)
+#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD)
+#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD)
+#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX)
+#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX)
+#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX)
+#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX)
+#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV)
+#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV)
+#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV)
+#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV)
+#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD)
+#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD)
+#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD)
+#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD)
+#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX)
+#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX)
+#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX)
+#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX)
+#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV)
+#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV)
+#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV)
+#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV)
+#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD)
+#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD)
+#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD)
+#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD)
+#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX)
+#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX)
+#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX)
+#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX)
+#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES)
+#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES)
+#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES)
+#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES)
+#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX)
+#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX)
+#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX)
+#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX)
+#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV)
+#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV)
+#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV)
+#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV)
+#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX)
+#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX)
+#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX)
+#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX)
+#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK)
+#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK)
+#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK)
+#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK)
+#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM)
+#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM)
+#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM)
+#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM)
+#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP)
+#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP)
+#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP)
+#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP)
+#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR)
+#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR)
+#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR)
+#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR)
+#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF)
+#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF)
+#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF)
+#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF)
+#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR)
+#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR)
+#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR)
+#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR)
+#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF)
+#define LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF)
+#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF)
+#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF)
+#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP)
+#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP)
+#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP)
+#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP)
+#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP)
+#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP)
+#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP)
+#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP)
+#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV)
+#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV)
+#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV)
+#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV)
+#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV)
+#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV)
+#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2,SGEQR2)
+#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2)
+#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2)
+#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2)
+#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY)
+#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY)
+#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY)
+#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY)
+#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2)
+#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2)
+#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2)
+#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2)
+#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP)
+#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP)
+#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP)
+#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP)
+#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE)
+#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE)
+#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE)
+#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE)
+#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE)
+#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE)
+#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY)
+#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY)
+#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY)
+#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY)
+#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR)
+#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR)
+#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR)
+#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR)
+#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH)
+#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH)
+#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2)
+#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2)
+#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2)
+#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2)
+#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB)
+#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB)
+#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB)
+#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB)
+#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG)
+#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG)
+#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG)
+#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG)
+#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT)
+#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT)
+#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT)
+#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT)
+#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX)
+#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX)
+#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX)
+#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX)
+#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS)
+#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS)
+#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS)
+#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS)
+#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D)
+#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S)
+#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z)
+#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C)
+#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM)
+#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM)
+#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM)
+#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM)
+#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE)
+#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE)
+#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE)
+#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE)
+#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET)
+#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET)
+#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET)
+#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET)
+#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT)
+#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT)
+#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY)
+#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY)
+#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY)
+#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY)
+#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE)
+#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE)
+#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR)
+#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR)
+#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR)
+#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR)
+#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2)
+#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2)
+#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3)
+#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3)
+#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP)
+#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP)
+#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS)
+#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS)
+// LAPACK 3.3.0
+#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD)
+#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR)
+#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2)
+#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X)
+#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2)
+#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV)
+#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR)
+#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2)
+#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X)
+#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2)
+#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB)
+#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD)
+#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD)
+#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB)
+#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD)
+#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV)
+#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR)
+#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2)
+#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X)
+#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2)
+#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD)
+#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB)
+#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD)
+#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV)
+#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR)
+#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2)
+#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X)
+#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2)
+#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD)
+#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR)
+#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2)
+#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X)
+#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2)
+#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV)
+#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR)
+#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2)
+#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X)
+#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2)
+#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB)
+#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD)
+// LAPACK 3.4.0
+#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT)
+#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT)
+#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt,CGEMQRT)
+#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT)
+#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT)
+#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT)
+#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT)
+#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT)
+#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2)
+#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2)
+#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2)
+#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2)
+#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3)
+#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3)
+#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3)
+#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3)
+#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT)
+#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT)
+#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT)
+#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT)
+#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT)
+#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT)
+#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT)
+#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2)
+#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2)
+#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2)
+#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2)
+#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB)
+#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB)
+#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB)
+#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB)
+// LAPACK 3.X.X
+#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR)
+#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR)
+
+
+void LAPACK_sgetrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgetrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgetrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_zgetrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_sgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, float* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, double* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_zgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_sgttrf( lapack_int* n, float* dl, float* d, float* du, float* du2,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgttrf( lapack_int* n, double* dl, double* d, double* du,
+                    double* du2, lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgttrf( lapack_int* n, lapack_complex_float* dl,
+                    lapack_complex_float* d, lapack_complex_float* du,
+                    lapack_complex_float* du2, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_zgttrf( lapack_int* n, lapack_complex_double* dl,
+                    lapack_complex_double* d, lapack_complex_double* du,
+                    lapack_complex_double* du2, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_spotrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dpotrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_cpotrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_zpotrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dpstrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* piv, lapack_int* rank, double* tol,
+                    double* work, lapack_int *info );
+void LAPACK_spstrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* piv, lapack_int* rank, float* tol, float* work,
+                    lapack_int *info );
+void LAPACK_zpstrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                    double* tol, double* work, lapack_int *info );
+void LAPACK_cpstrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                    float* tol, float* work, lapack_int *info );
+void LAPACK_dpftrf( char* transr, char* uplo, lapack_int* n, double* a,
+                    lapack_int *info );
+void LAPACK_spftrf( char* transr, char* uplo, lapack_int* n, float* a,
+                    lapack_int *info );
+void LAPACK_zpftrf( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info );
+void LAPACK_cpftrf( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info );
+void LAPACK_spptrf( char* uplo, lapack_int* n, float* ap, lapack_int *info );
+void LAPACK_dpptrf( char* uplo, lapack_int* n, double* ap, lapack_int *info );
+void LAPACK_cpptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_zpptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_spbtrf( char* uplo, lapack_int* n, lapack_int* kd, float* ab,
+                    lapack_int* ldab, lapack_int *info );
+void LAPACK_dpbtrf( char* uplo, lapack_int* n, lapack_int* kd, double* ab,
+                    lapack_int* ldab, lapack_int *info );
+void LAPACK_cpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_int *info );
+void LAPACK_zpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_int *info );
+void LAPACK_spttrf( lapack_int* n, float* d, float* e, lapack_int *info );
+void LAPACK_dpttrf( lapack_int* n, double* d, double* e, lapack_int *info );
+void LAPACK_cpttrf( lapack_int* n, float* d, lapack_complex_float* e,
+                    lapack_int *info );
+void LAPACK_zpttrf( lapack_int* n, double* d, lapack_complex_double* e,
+                    lapack_int *info );
+void LAPACK_ssytrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dsytrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_csytrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zsytrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_chetrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zhetrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ssptrf( char* uplo, lapack_int* n, float* ap, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_dsptrf( char* uplo, lapack_int* n, double* ap, lapack_int* ipiv,
+                    lapack_int *info );
+void LAPACK_csptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_zsptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_chptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_zhptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_sgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, const lapack_int* ipiv,
+                    float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_sgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_cgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_float* ab,
+                    lapack_int* ldab, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_double* ab,
+                    lapack_int* ldab, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    const float* du2, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    const double* du2, const lapack_int* ipiv, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_spotrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_spftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* a, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_spptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_cpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_spbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_cpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_spttrs( lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dpttrs( lapack_int* n, lapack_int* nrhs, const double* d,
+                    const double* e, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_cpttrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zpttrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ssytrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_csytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_chetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_zhetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_ssptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const lapack_int* ipiv, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_csptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_chptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zhptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_strtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dtrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_ctrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ztrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_stptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* ap, float* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_dtptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* ap, double* b,
+                    lapack_int* ldb, lapack_int *info );
+void LAPACK_ctptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* ap,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ztptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* ap,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_stbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const float* ab,
+                    lapack_int* ldab, float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dtbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const double* ab,
+                    lapack_int* ldab, double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ctbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_ztbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sgecon( char* norm, lapack_int* n, const float* a, lapack_int* lda,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgecon( char* norm, lapack_int* n, const double* a, lapack_int* lda,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgecon( char* norm, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgecon( char* norm, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const float* ab, lapack_int* ldab, const lapack_int* ipiv,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const double* ab, lapack_int* ldab, const lapack_int* ipiv,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgtcon( char* norm, lapack_int* n, const float* dl, const float* d,
+                    const float* du, const float* du2, const lapack_int* ipiv,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgtcon( char* norm, lapack_int* n, const double* dl,
+                    const double* d, const double* du, const double* du2,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgtcon( char* norm, lapack_int* n, const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zgtcon( char* norm, lapack_int* n, const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_spocon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dpocon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cpocon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zpocon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sppcon( char* uplo, lapack_int* n, const float* ap, float* anorm,
+                    float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dppcon( char* uplo, lapack_int* n, const double* ap, double* anorm,
+                    double* rcond, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cppcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zppcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_spbcon( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
+                    lapack_int* ldab, float* anorm, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dpbcon( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
+                    lapack_int* ldab, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cpbcon( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zpbcon( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sptcon( lapack_int* n, const float* d, const float* e, float* anorm,
+                    float* rcond, float* work, lapack_int *info );
+void LAPACK_dptcon( lapack_int* n, const double* d, const double* e,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int *info );
+void LAPACK_cptcon( lapack_int* n, const float* d,
+                    const lapack_complex_float* e, float* anorm, float* rcond,
+                    float* work, lapack_int *info );
+void LAPACK_zptcon( lapack_int* n, const double* d,
+                    const lapack_complex_double* e, double* anorm,
+                    double* rcond, double* work, lapack_int *info );
+void LAPACK_ssycon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dsycon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_csycon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                    float* rcond, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zsycon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                    double* rcond, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_checon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                    float* rcond, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zhecon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                    double* rcond, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_sspcon( char* uplo, lapack_int* n, const float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dspcon( char* uplo, lapack_int* n, const double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cspcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zspcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_chpcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zhpcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_strcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const float* a, lapack_int* lda, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const double* a, lapack_int* lda, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    float* rcond, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    double* rcond, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const float* ap, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const double* ap, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_float* ap, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_double* ap, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const float* ab, lapack_int* ldab,
+                    float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dtbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const double* ab, lapack_int* ldab,
+                    double* rcond, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_ztbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, const float* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const double* r,
+                     const double* c, const double* b, lapack_int* ldb,
+                     double* x, lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const float* r,
+                     const float* c, const float* b, lapack_int* ldb, float* x,
+                     lapack_int* ldx, float* rcond, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                    const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                    const double* afb, lapack_int* ldafb,
+                    const lapack_int* ipiv, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_float* ab,
+                    lapack_int* ldab, const lapack_complex_float* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_double* ab,
+                    lapack_int* ldab, const lapack_complex_double* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_dgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, const double* ab,
+                     lapack_int* ldab, const double* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const double* b, lapack_int* ldb, double* x,
+                     lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, const float* ab,
+                     lapack_int* ldab, const float* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_zgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     const lapack_complex_double* ab, lapack_int* ldab,
+                     const lapack_complex_double* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     const lapack_complex_float* ab, lapack_int* ldab,
+                     const lapack_complex_float* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    const float* dlf, const float* df, const float* duf,
+                    const float* du2, const lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    const double* dlf, const double* df, const double* duf,
+                    const double* du2, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* dlf,
+                    const lapack_complex_float* df,
+                    const lapack_complex_float* duf,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* dlf,
+                    const lapack_complex_double* df,
+                    const lapack_complex_double* duf,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sporfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const float* af, lapack_int* ldaf,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_dporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const double* s, const double* b,
+                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_sporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const float* s, const float* b,
+                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const double* s, const lapack_complex_double* b,
+                     lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                     double* rcond, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const float* s, const lapack_complex_float* b,
+                     lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_spprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const float* afp, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const double* afp, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, const float* afb,
+                    lapack_int* ldafb, const float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, const double* afb,
+                    lapack_int* ldafb, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* afb, lapack_int* ldafb,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* afb, lapack_int* ldafb,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sptrfs( lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, const float* df, const float* ef,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int *info );
+void LAPACK_dptrfs( lapack_int* n, lapack_int* nrhs, const double* d,
+                    const double* e, const double* df, const double* ef,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int *info );
+void LAPACK_cptrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, const float* df,
+                    const lapack_complex_float* ef,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zptrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e,
+                    const double* df, const lapack_complex_double* ef,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_ssyrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_csyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const double* s,
+                     const double* b, lapack_int* ldb, double* x,
+                     lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_ssyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const float* s,
+                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_zsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* s,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_csyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* s,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_cherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_zherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* s,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* s,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_ssprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const float* afp, const lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const double* afp, const lapack_int* ipiv,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_csprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_chprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_strrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, const float* x,
+                    lapack_int* ldx, float* ferr, float* berr, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, const double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, const lapack_complex_float* x,
+                    lapack_int* ldx, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, const lapack_complex_double* x,
+                    lapack_int* ldx, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* ap, const float* b,
+                    lapack_int* ldb, const float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dtprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* ap, const double* b,
+                    lapack_int* ldb, const double* x, lapack_int* ldx,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* ap,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* ap,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_stbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const float* ab,
+                    lapack_int* ldab, const float* b, lapack_int* ldb,
+                    const float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dtbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const double* ab,
+                    lapack_int* ldab, const double* b, lapack_int* ldb,
+                    const double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgetri( lapack_int* n, float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgetri( lapack_int* n, double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgetri( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgetri( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_spotri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dpotri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_cpotri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_zpotri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dpftri( char* transr, char* uplo, lapack_int* n, double* a,
+                    lapack_int *info );
+void LAPACK_spftri( char* transr, char* uplo, lapack_int* n, float* a,
+                    lapack_int *info );
+void LAPACK_zpftri( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info );
+void LAPACK_cpftri( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info );
+void LAPACK_spptri( char* uplo, lapack_int* n, float* ap, lapack_int *info );
+void LAPACK_dpptri( char* uplo, lapack_int* n, double* ap, lapack_int *info );
+void LAPACK_cpptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_zpptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_ssytri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* work, lapack_int *info );
+void LAPACK_dsytri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* work, lapack_int *info );
+void LAPACK_csytri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zsytri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_chetri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zhetri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_ssptri( char* uplo, lapack_int* n, float* ap,
+                    const lapack_int* ipiv, float* work, lapack_int *info );
+void LAPACK_dsptri( char* uplo, lapack_int* n, double* ap,
+                    const lapack_int* ipiv, double* work, lapack_int *info );
+void LAPACK_csptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zsptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_chptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zhptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_strtri( char* uplo, char* diag, lapack_int* n, float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dtrtri( char* uplo, char* diag, lapack_int* n, double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_ctrtri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_ztrtri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dtftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    double* a, lapack_int *info );
+void LAPACK_stftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    float* a, lapack_int *info );
+void LAPACK_ztftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info );
+void LAPACK_ctftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info );
+void LAPACK_stptri( char* uplo, char* diag, lapack_int* n, float* ap,
+                    lapack_int *info );
+void LAPACK_dtptri( char* uplo, char* diag, lapack_int* n, double* ap,
+                    lapack_int *info );
+void LAPACK_ctptri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* ap, lapack_int *info );
+void LAPACK_ztptri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* ap, lapack_int *info );
+void LAPACK_sgeequ( lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info );
+void LAPACK_dgeequ( lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* r, double* c, double* rowcnd,
+                    double* colcnd, double* amax, lapack_int *info );
+void LAPACK_cgeequ( lapack_int* m, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info );
+void LAPACK_zgeequ( lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* r,
+                    double* c, double* rowcnd, double* colcnd, double* amax,
+                    lapack_int *info );
+void LAPACK_dgeequb( lapack_int* m, lapack_int* n, const double* a,
+                     lapack_int* lda, double* r, double* c, double* rowcnd,
+                     double* colcnd, double* amax, lapack_int *info );
+void LAPACK_sgeequb( lapack_int* m, lapack_int* n, const float* a,
+                     lapack_int* lda, float* r, float* c, float* rowcnd,
+                     float* colcnd, float* amax, lapack_int *info );
+void LAPACK_zgeequb( lapack_int* m, lapack_int* n,
+                     const lapack_complex_double* a, lapack_int* lda, double* r,
+                     double* c, double* rowcnd, double* colcnd, double* amax,
+                     lapack_int *info );
+void LAPACK_cgeequb( lapack_int* m, lapack_int* n,
+                     const lapack_complex_float* a, lapack_int* lda, float* r,
+                     float* c, float* rowcnd, float* colcnd, float* amax,
+                     lapack_int *info );
+void LAPACK_sgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* ab, lapack_int* ldab, float* r,
+                    float* c, float* rowcnd, float* colcnd, float* amax,
+                    lapack_int *info );
+void LAPACK_dgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* ab, lapack_int* ldab,
+                    double* r, double* c, double* rowcnd, double* colcnd,
+                    double* amax, lapack_int *info );
+void LAPACK_cgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info );
+void LAPACK_zgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* r, double* c, double* rowcnd,
+                    double* colcnd, double* amax, lapack_int *info );
+void LAPACK_dgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const double* ab, lapack_int* ldab,
+                     double* r, double* c, double* rowcnd, double* colcnd,
+                     double* amax, lapack_int *info );
+void LAPACK_sgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const float* ab, lapack_int* ldab,
+                     float* r, float* c, float* rowcnd, float* colcnd,
+                     float* amax, lapack_int *info );
+void LAPACK_zgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const lapack_complex_double* ab,
+                     lapack_int* ldab, double* r, double* c, double* rowcnd,
+                     double* colcnd, double* amax, lapack_int *info );
+void LAPACK_cgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const lapack_complex_float* ab,
+                     lapack_int* ldab, float* r, float* c, float* rowcnd,
+                     float* colcnd, float* amax, lapack_int *info );
+void LAPACK_spoequ( lapack_int* n, const float* a, lapack_int* lda, float* s,
+                    float* scond, float* amax, lapack_int *info );
+void LAPACK_dpoequ( lapack_int* n, const double* a, lapack_int* lda, double* s,
+                    double* scond, double* amax, lapack_int *info );
+void LAPACK_cpoequ( lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* s, float* scond, float* amax,
+                    lapack_int *info );
+void LAPACK_zpoequ( lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* s, double* scond, double* amax,
+                    lapack_int *info );
+void LAPACK_dpoequb( lapack_int* n, const double* a, lapack_int* lda, double* s,
+                     double* scond, double* amax, lapack_int *info );
+void LAPACK_spoequb( lapack_int* n, const float* a, lapack_int* lda, float* s,
+                     float* scond, float* amax, lapack_int *info );
+void LAPACK_zpoequb( lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_int *info );
+void LAPACK_cpoequb( lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_int *info );
+void LAPACK_sppequ( char* uplo, lapack_int* n, const float* ap, float* s,
+                    float* scond, float* amax, lapack_int *info );
+void LAPACK_dppequ( char* uplo, lapack_int* n, const double* ap, double* s,
+                    double* scond, double* amax, lapack_int *info );
+void LAPACK_cppequ( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    float* s, float* scond, float* amax, lapack_int *info );
+void LAPACK_zppequ( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    double* s, double* scond, double* amax, lapack_int *info );
+void LAPACK_spbequ( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
+                    lapack_int* ldab, float* s, float* scond, float* amax,
+                    lapack_int *info );
+void LAPACK_dpbequ( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
+                    lapack_int* ldab, double* s, double* scond, double* amax,
+                    lapack_int *info );
+void LAPACK_cpbequ( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_float* ab, lapack_int* ldab, float* s,
+                    float* scond, float* amax, lapack_int *info );
+void LAPACK_zpbequ( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    double* s, double* scond, double* amax, lapack_int *info );
+void LAPACK_dsyequb( char* uplo, lapack_int* n, const double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     double* work, lapack_int *info );
+void LAPACK_ssyequb( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                     float* s, float* scond, float* amax, float* work,
+                     lapack_int *info );
+void LAPACK_zsyequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_csyequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_zheequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_cheequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_sgesv( lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* a,
+                   lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_zgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                   lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_dsgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
+                    lapack_int* ipiv, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* work, float* swork,
+                    lapack_int* iter, lapack_int *info );
+void LAPACK_zcgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, lapack_complex_float* swork,
+                    double* rwork, lapack_int* iter, lapack_int *info );
+void LAPACK_sgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, float* r, float* c,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                     double* rcond, double* rpvgrw, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, double* ab, lapack_int* ldab,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_zgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_sgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, float* ab,
+                    lapack_int* ldab, float* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, double* ab,
+                    lapack_int* ldab, double* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* afb,
+                    lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
+                    float* c, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* afb,
+                    lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
+                    double* c, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, double* ab,
+                     lapack_int* ldab, double* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                     double* rcond, double* rpvgrw, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, float* ab,
+                     lapack_int* ldab, float* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     lapack_complex_double* ab, lapack_int* ldab,
+                     lapack_complex_double* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                     lapack_int* ldab, lapack_complex_float* afb,
+                     lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
+                     float* c, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sgtsv( lapack_int* n, lapack_int* nrhs, float* dl, float* d,
+                   float* du, float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dgtsv( lapack_int* n, lapack_int* nrhs, double* dl, double* d,
+                   double* du, double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl,
+                   lapack_complex_float* d, lapack_complex_float* du,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl,
+                   lapack_complex_double* d, lapack_complex_double* du,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_sgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    float* dlf, float* df, float* duf, float* du2,
+                    lapack_int* ipiv, const float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    double* dlf, double* df, double* duf, double* du2,
+                    lapack_int* ipiv, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du, lapack_complex_float* dlf,
+                    lapack_complex_float* df, lapack_complex_float* duf,
+                    lapack_complex_float* du2, lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du, lapack_complex_double* dlf,
+                    lapack_complex_double* df, lapack_complex_double* duf,
+                    lapack_complex_double* du2, lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sposv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dsposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* work, float* swork,
+                    lapack_int* iter, lapack_int *info );
+void LAPACK_zcposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, lapack_complex_float* swork,
+                    double* rwork, lapack_int* iter, lapack_int *info );
+void LAPACK_sposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                    char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                    char* equed, double* s, double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, char* equed,
+                    float* s, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, char* equed,
+                    double* s, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_dposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     char* equed, double* s, double* b, lapack_int* ldb,
+                     double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_sposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                     lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_zposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf, char* equed,
+                     double* s, lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_cposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf, char* equed,
+                     float* s, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sppsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
+                   float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dppsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
+                   double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_zppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_sppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    float* ap, float* afp, char* equed, float* s, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    double* ap, double* afp, char* equed, double* s, double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* ap, lapack_complex_float* afp,
+                    char* equed, float* s, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* ap, lapack_complex_double* afp,
+                    char* equed, double* s, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   float* ab, lapack_int* ldab, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   double* ab, lapack_int* ldab, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_spbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, float* ab, lapack_int* ldab, float* afb,
+                    lapack_int* ldafb, char* equed, float* s, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, double* ab, lapack_int* ldab, double* afb,
+                    lapack_int* ldafb, char* equed, double* s, double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_cpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* afb,
+                    lapack_int* ldafb, char* equed, float* s,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* afb,
+                    lapack_int* ldafb, char* equed, double* s,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sptsv( lapack_int* n, lapack_int* nrhs, float* d, float* e,
+                   float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_dptsv( lapack_int* n, lapack_int* nrhs, double* d, double* e,
+                   double* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_cptsv( lapack_int* n, lapack_int* nrhs, float* d,
+                   lapack_complex_float* e, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_zptsv( lapack_int* n, lapack_int* nrhs, double* d,
+                   lapack_complex_double* e, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info );
+void LAPACK_sptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, float* df, float* ef, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int *info );
+void LAPACK_dptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const double* e, double* df, double* ef,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                    double* work, lapack_int *info );
+void LAPACK_cptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, float* df,
+                    lapack_complex_float* ef, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e, double* df,
+                    lapack_complex_double* ef, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_ssysv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb,
+                   float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dsysv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, lapack_int* ipiv, double* b,
+                   lapack_int* ldb, double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_csysv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_zsysv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_ssysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, float* af,
+                    lapack_int* ldaf, lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, double* af,
+                    lapack_int* ldaf, lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_csysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_dsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s, double* b,
+                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_ssysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s, float* b,
+                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info );
+void LAPACK_zsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_csysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_chesv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_zhesv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_chesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_zhesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info );
+void LAPACK_chesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info );
+void LAPACK_sspsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_dspsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_cspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_sspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, float* afp, lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr, float* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, double* afp, lapack_int* ipiv,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* afp,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* afp,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_chpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
+void LAPACK_zhpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info );
+void LAPACK_chpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* afp,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* afp,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sgeqrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgeqrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgeqrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgeqrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgeqpf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int *info );
+void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int *info );
+void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgeqp3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgeqp3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgeqp3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info );
+void LAPACK_zgeqp3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int *info );
+void LAPACK_sorgqr( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgqr( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungqr( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungqr( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgelqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgelqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgelqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgelqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sorglq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorglq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cunglq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunglq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgeqlf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgeqlf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgeqlf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgeqlf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sorgql( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgql( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungql( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungql( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgerqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgerqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgerqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgerqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sorgrq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgrq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungrq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungrq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_stzrzf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dtzrzf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ctzrzf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ztzrzf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sggqrf( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
+                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
+                    float* taub, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dggqrf( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
+                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
+                    double* taub, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* taua, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* taub,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* taua, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* taub,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sggrqf( lapack_int* m, lapack_int* p, lapack_int* n, float* a,
+                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
+                    float* taub, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dggrqf( lapack_int* m, lapack_int* p, lapack_int* n, double* a,
+                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
+                    double* taub, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* taua, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* taub,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* taua, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* taub,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgebrd( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* d, float* e, float* tauq, float* taup, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgebrd( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* d, double* e, double* tauq, double* taup,
+                    double* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_cgebrd( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, float* d, float* e,
+                    lapack_complex_float* tauq, lapack_complex_float* taup,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zgebrd( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, double* d, double* e,
+                    lapack_complex_double* tauq, lapack_complex_double* taup,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab,
+                    float* d, float* e, float* q, lapack_int* ldq, float* pt,
+                    lapack_int* ldpt, float* c, lapack_int* ldc, float* work,
+                    lapack_int *info );
+void LAPACK_dgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, double* ab,
+                    lapack_int* ldab, double* d, double* e, double* q,
+                    lapack_int* ldq, double* pt, lapack_int* ldpt, double* c,
+                    lapack_int* ldc, double* work, lapack_int *info );
+void LAPACK_cgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, lapack_complex_float* ab,
+                    lapack_int* ldab, float* d, float* e,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* pt, lapack_int* ldpt,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, lapack_complex_double* ab,
+                    lapack_int* ldab, double* d, double* e,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* pt, lapack_int* ldpt,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_sorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    float* a, lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    double* a, lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dormbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                    float* vt, lapack_int* ldvt, float* u, lapack_int* ldu,
+                    float* c, lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                    double* vt, lapack_int* ldvt, double* u, lapack_int* ldu,
+                    double* c, lapack_int* ldc, double* work,
+                    lapack_int *info );
+void LAPACK_cbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* c, lapack_int* ldc, float* work,
+                    lapack_int *info );
+void LAPACK_zbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* c, lapack_int* ldc, double* work,
+                    lapack_int *info );
+void LAPACK_sbdsdc( char* uplo, char* compq, lapack_int* n, float* d, float* e,
+                    float* u, lapack_int* ldu, float* vt, lapack_int* ldvt,
+                    float* q, lapack_int* iq, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dbdsdc( char* uplo, char* compq, lapack_int* n, double* d,
+                    double* e, double* u, lapack_int* ldu, double* vt,
+                    lapack_int* ldvt, double* q, lapack_int* iq, double* work,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ssytrd( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    float* d, float* e, float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dsytrd( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    double* d, double* e, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sorgtr( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    const float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dorgtr( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    const double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_sormtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dormtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_chetrd( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, float* d, float* e,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zhetrd( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, double* d, double* e,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cungtr( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zungtr( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_zunmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ssptrd( char* uplo, lapack_int* n, float* ap, float* d, float* e,
+                    float* tau, lapack_int *info );
+void LAPACK_dsptrd( char* uplo, lapack_int* n, double* ap, double* d, double* e,
+                    double* tau, lapack_int *info );
+void LAPACK_sopgtr( char* uplo, lapack_int* n, const float* ap,
+                    const float* tau, float* q, lapack_int* ldq, float* work,
+                    lapack_int *info );
+void LAPACK_dopgtr( char* uplo, lapack_int* n, const double* ap,
+                    const double* tau, double* q, lapack_int* ldq, double* work,
+                    lapack_int *info );
+void LAPACK_sopmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const float* ap, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dopmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const double* ap, const double* tau,
+                    double* c, lapack_int* ldc, double* work,
+                    lapack_int *info );
+void LAPACK_chptrd( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    float* d, float* e, lapack_complex_float* tau,
+                    lapack_int *info );
+void LAPACK_zhptrd( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    double* d, double* e, lapack_complex_double* tau,
+                    lapack_int *info );
+void LAPACK_cupgtr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_complex_float* tau, lapack_complex_float* q,
+                    lapack_int* ldq, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zupgtr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_complex_double* tau, lapack_complex_double* q,
+                    lapack_int* ldq, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_cupmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_complex_float* tau, lapack_complex_float* c,
+                    lapack_int* ldc, lapack_complex_float* work,
+                    lapack_int *info );
+void LAPACK_zupmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_ssbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    float* ab, lapack_int* ldab, float* d, float* e, float* q,
+                    lapack_int* ldq, float* work, lapack_int *info );
+void LAPACK_dsbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    double* ab, lapack_int* ldab, double* d, double* e,
+                    double* q, lapack_int* ldq, double* work,
+                    lapack_int *info );
+void LAPACK_chbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab, float* d,
+                    float* e, lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zhbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab, double* d,
+                    double* e, lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_ssterf( lapack_int* n, float* d, float* e, lapack_int *info );
+void LAPACK_dsterf( lapack_int* n, double* d, double* e, lapack_int *info );
+void LAPACK_ssteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dsteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_csteqr( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz, float* work,
+                    lapack_int *info );
+void LAPACK_zsteqr( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz, double* work,
+                    lapack_int *info );
+void LAPACK_sstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    lapack_int* nzc, lapack_int* isuppz, lapack_logical* tryrac,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dstemr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, lapack_int* m, double* w, double* z,
+                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
+                    lapack_logical* tryrac, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
+                    lapack_logical* tryrac, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zstemr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc,
+                    lapack_int* isuppz, lapack_logical* tryrac, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_sstedc( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dstedc( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cstedc( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zstedc( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, lapack_int* isuppz, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_dstegr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, lapack_int* isuppz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_cstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_int* isuppz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zstegr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_int* isuppz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_spteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dpteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_cpteqr( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz, float* work,
+                    lapack_int *info );
+void LAPACK_zpteqr( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz, double* work,
+                    lapack_int *info );
+void LAPACK_sstebz( char* range, char* order, lapack_int* n, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    const float* d, const float* e, lapack_int* m,
+                    lapack_int* nsplit, float* w, lapack_int* iblock,
+                    lapack_int* isplit, float* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dstebz( char* range, char* order, lapack_int* n, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    const double* d, const double* e, lapack_int* m,
+                    lapack_int* nsplit, double* w, lapack_int* iblock,
+                    lapack_int* isplit, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_sstein( lapack_int* n, const float* d, const float* e,
+                    lapack_int* m, const float* w, const lapack_int* iblock,
+                    const lapack_int* isplit, float* z, lapack_int* ldz,
+                    float* work, lapack_int* iwork, lapack_int* ifailv,
+                    lapack_int *info );
+void LAPACK_dstein( lapack_int* n, const double* d, const double* e,
+                    lapack_int* m, const double* w, const lapack_int* iblock,
+                    const lapack_int* isplit, double* z, lapack_int* ldz,
+                    double* work, lapack_int* iwork, lapack_int* ifailv,
+                    lapack_int *info );
+void LAPACK_cstein( lapack_int* n, const float* d, const float* e,
+                    lapack_int* m, const float* w, const lapack_int* iblock,
+                    const lapack_int* isplit, lapack_complex_float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifailv, lapack_int *info );
+void LAPACK_zstein( lapack_int* n, const double* d, const double* e,
+                    lapack_int* m, const double* w, const lapack_int* iblock,
+                    const lapack_int* isplit, lapack_complex_double* z,
+                    lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifailv, lapack_int *info );
+void LAPACK_sdisna( char* job, lapack_int* m, lapack_int* n, const float* d,
+                    float* sep, lapack_int *info );
+void LAPACK_ddisna( char* job, lapack_int* m, lapack_int* n, const double* d,
+                    double* sep, lapack_int *info );
+void LAPACK_ssygst( lapack_int* itype, char* uplo, lapack_int* n, float* a,
+                    lapack_int* lda, const float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_dsygst( lapack_int* itype, char* uplo, lapack_int* n, double* a,
+                    lapack_int* lda, const double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_chegst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zhegst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sspgst( lapack_int* itype, char* uplo, lapack_int* n, float* ap,
+                    const float* bp, lapack_int *info );
+void LAPACK_dspgst( lapack_int* itype, char* uplo, lapack_int* n, double* ap,
+                    const double* bp, lapack_int *info );
+void LAPACK_chpgst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, const lapack_complex_float* bp,
+                    lapack_int *info );
+void LAPACK_zhpgst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, const lapack_complex_double* bp,
+                    lapack_int *info );
+void LAPACK_ssbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, float* ab, lapack_int* ldab,
+                    const float* bb, lapack_int* ldbb, float* x,
+                    lapack_int* ldx, float* work, lapack_int *info );
+void LAPACK_dsbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, double* ab, lapack_int* ldab,
+                    const double* bb, lapack_int* ldbb, double* x,
+                    lapack_int* ldx, double* work, lapack_int *info );
+void LAPACK_chbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* bb, lapack_int* ldbb,
+                    lapack_complex_float* x, lapack_int* ldx,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* bb, lapack_int* ldbb,
+                    lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbstf( char* uplo, lapack_int* n, lapack_int* kb, float* bb,
+                    lapack_int* ldbb, lapack_int *info );
+void LAPACK_dpbstf( char* uplo, lapack_int* n, lapack_int* kb, double* bb,
+                    lapack_int* ldbb, lapack_int *info );
+void LAPACK_cpbstf( char* uplo, lapack_int* n, lapack_int* kb,
+                    lapack_complex_float* bb, lapack_int* ldbb,
+                    lapack_int *info );
+void LAPACK_zpbstf( char* uplo, lapack_int* n, lapack_int* kb,
+                    lapack_complex_double* bb, lapack_int* ldbb,
+                    lapack_int *info );
+void LAPACK_sgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
+                    lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                    lapack_int* lda, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* c,
+                    lapack_int* ldc, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sgebal( char* job, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ilo, lapack_int* ihi, float* scale,
+                    lapack_int *info );
+void LAPACK_dgebal( char* job, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ilo, lapack_int* ihi, double* scale,
+                    lapack_int *info );
+void LAPACK_cgebal( char* job, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, lapack_int *info );
+void LAPACK_zgebal( char* job, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, lapack_int *info );
+void LAPACK_sgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* scale, lapack_int* m,
+                    float* v, lapack_int* ldv, lapack_int *info );
+void LAPACK_dgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* scale, lapack_int* m,
+                    double* v, lapack_int* ldv, lapack_int *info );
+void LAPACK_cgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* scale, lapack_int* m,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_zgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* scale, lapack_int* m,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_shseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, float* h, lapack_int* ldh, float* wr,
+                    float* wi, float* z, lapack_int* ldz, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, double* h, lapack_int* ldh, double* wr,
+                    double* wi, double* z, lapack_int* ldz, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_chseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_float* h, lapack_int* ldh,
+                    lapack_complex_float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_double* h, lapack_int* ldh,
+                    lapack_complex_double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_shsein( char* job, char* eigsrc, char* initv,
+                    lapack_logical* select, lapack_int* n, const float* h,
+                    lapack_int* ldh, float* wr, const float* wi, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_dhsein( char* job, char* eigsrc, char* initv,
+                    lapack_logical* select, lapack_int* n, const double* h,
+                    lapack_int* ldh, double* wr, const double* wi, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_chsein( char* job, char* eigsrc, char* initv,
+                    const lapack_logical* select, lapack_int* n,
+                    const lapack_complex_float* h, lapack_int* ldh,
+                    lapack_complex_float* w, lapack_complex_float* vl,
+                    lapack_int* ldvl, lapack_complex_float* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_zhsein( char* job, char* eigsrc, char* initv,
+                    const lapack_logical* select, lapack_int* n,
+                    const lapack_complex_double* h, lapack_int* ldh,
+                    lapack_complex_double* w, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_strevc( char* side, char* howmny, lapack_logical* select,
+                    lapack_int* n, const float* t, lapack_int* ldt, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int *info );
+void LAPACK_dtrevc( char* side, char* howmny, lapack_logical* select,
+                    lapack_int* n, const double* t, lapack_int* ldt, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int *info );
+void LAPACK_ctrevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztrevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* vl, lapack_int* ldvl,
+                    lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_strsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* t, lapack_int* ldt,
+                    const float* vl, lapack_int* ldvl, const float* vr,
+                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
+                    lapack_int* m, float* work, lapack_int* ldwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dtrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* t, lapack_int* ldt,
+                    const double* vl, lapack_int* ldvl, const double* vr,
+                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
+                    lapack_int* m, double* work, lapack_int* ldwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* t,
+                    lapack_int* ldt, const lapack_complex_float* vl,
+                    lapack_int* ldvl, const lapack_complex_float* vr,
+                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work,
+                    lapack_int* ldwork, float* rwork, lapack_int *info );
+void LAPACK_ztrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* t,
+                    lapack_int* ldt, const lapack_complex_double* vl,
+                    lapack_int* ldvl, const lapack_complex_double* vr,
+                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work,
+                    lapack_int* ldwork, double* rwork, lapack_int *info );
+void LAPACK_strexc( char* compq, lapack_int* n, float* t, lapack_int* ldt,
+                    float* q, lapack_int* ldq, lapack_int* ifst,
+                    lapack_int* ilst, float* work, lapack_int *info );
+void LAPACK_dtrexc( char* compq, lapack_int* n, double* t, lapack_int* ldt,
+                    double* q, lapack_int* ldq, lapack_int* ifst,
+                    lapack_int* ilst, double* work, lapack_int *info );
+void LAPACK_ctrexc( char* compq, lapack_int* n, lapack_complex_float* t,
+                    lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq,
+                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
+void LAPACK_ztrexc( char* compq, lapack_int* n, lapack_complex_double* t,
+                    lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq,
+                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
+void LAPACK_strsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, float* t, lapack_int* ldt, float* q,
+                    lapack_int* ldq, float* wr, float* wi, lapack_int* m,
+                    float* s, float* sep, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dtrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, double* t, lapack_int* ldt, double* q,
+                    lapack_int* ldq, double* wr, double* wi, lapack_int* m,
+                    double* s, double* sep, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ctrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* w, lapack_int* m, float* s,
+                    float* sep, lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_ztrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* w, lapack_int* m, double* s,
+                    double* sep, lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_strsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, float* c, lapack_int* ldc,
+                    float* scale, lapack_int *info );
+void LAPACK_dtrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, double* c,
+                    lapack_int* ldc, double* scale, lapack_int *info );
+void LAPACK_ctrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* c, lapack_int* ldc,
+                    float* scale, lapack_int *info );
+void LAPACK_ztrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* c, lapack_int* ldc,
+                    double* scale, lapack_int *info );
+void LAPACK_sgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* q, lapack_int* ldq, float* z,
+                    lapack_int* ldz, lapack_int *info );
+void LAPACK_dgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* q, lapack_int* ldq, double* z,
+                    lapack_int* ldz, lapack_int *info );
+void LAPACK_cgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_int *info );
+void LAPACK_zgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_int *info );
+void LAPACK_sggbal( char* job, lapack_int* n, float* a, lapack_int* lda,
+                    float* b, lapack_int* ldb, lapack_int* ilo, lapack_int* ihi,
+                    float* lscale, float* rscale, float* work,
+                    lapack_int *info );
+void LAPACK_dggbal( char* job, lapack_int* n, double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, lapack_int* ilo,
+                    lapack_int* ihi, double* lscale, double* rscale,
+                    double* work, lapack_int *info );
+void LAPACK_cggbal( char* job, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int* ilo, lapack_int* ihi, float* lscale,
+                    float* rscale, float* work, lapack_int *info );
+void LAPACK_zggbal( char* job, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int* ilo, lapack_int* ihi, double* lscale,
+                    double* rscale, double* work, lapack_int *info );
+void LAPACK_sggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* lscale, const float* rscale,
+                    lapack_int* m, float* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_dggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* lscale, const double* rscale,
+                    lapack_int* m, double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_cggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* lscale, const float* rscale,
+                    lapack_int* m, lapack_complex_float* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_zggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* lscale, const double* rscale,
+                    lapack_int* m, lapack_complex_double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_shgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh,
+                    float* t, lapack_int* ldt, float* alphar, float* alphai,
+                    float* beta, float* q, lapack_int* ldq, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dhgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, double* h,
+                    lapack_int* ldh, double* t, lapack_int* ldt, double* alphar,
+                    double* alphai, double* beta, double* q, lapack_int* ldq,
+                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_chgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h,
+                    lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zhgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h,
+                    lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_stgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* s, lapack_int* lds,
+                    const float* p, lapack_int* ldp, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int *info );
+void LAPACK_dtgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* s, lapack_int* lds,
+                    const double* p, lapack_int* ldp, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int *info );
+void LAPACK_ctgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* s,
+                    lapack_int* lds, const lapack_complex_float* p,
+                    lapack_int* ldp, lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work, float* rwork,
+                    lapack_int *info );
+void LAPACK_ztgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* s,
+                    lapack_int* lds, const lapack_complex_double* p,
+                    lapack_int* ldp, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
+                    lapack_int* ifst, lapack_int* ilst, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dtgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* q, lapack_int* ldq, double* z, lapack_int* ldz,
+                    lapack_int* ifst, lapack_int* ilst, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_ctgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst,
+                    lapack_int* ilst, lapack_int *info );
+void LAPACK_ztgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst,
+                    lapack_int* ilst, lapack_int *info );
+void LAPACK_stgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
+                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
+                    lapack_int* m, float* pl, float* pr, float* dif,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dtgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* alphar, double* alphai,
+                    double* beta, double* q, lapack_int* ldq, double* z,
+                    lapack_int* ldz, lapack_int* m, double* pl, double* pr,
+                    double* dif, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ctgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz, lapack_int* m,
+                    float* pl, float* pr, float* dif,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ztgsen( lapack_int* ijob, lapack_logical* wantq,
+                    lapack_logical* wantz, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* m,
+                    double* pl, double* pr, double* dif,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_stgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const float* a, lapack_int* lda, const float* b,
+                    lapack_int* ldb, float* c, lapack_int* ldc, const float* d,
+                    lapack_int* ldd, const float* e, lapack_int* lde, float* f,
+                    lapack_int* ldf, float* scale, float* dif, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_dtgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const double* a, lapack_int* lda, const double* b,
+                    lapack_int* ldb, double* c, lapack_int* ldc,
+                    const double* d, lapack_int* ldd, const double* e,
+                    lapack_int* lde, double* f, lapack_int* ldf, double* scale,
+                    double* dif, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    const lapack_complex_float* d, lapack_int* ldd,
+                    const lapack_complex_float* e, lapack_int* lde,
+                    lapack_complex_float* f, lapack_int* ldf, float* scale,
+                    float* dif, lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ztgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    const lapack_complex_double* d, lapack_int* ldd,
+                    const lapack_complex_double* e, lapack_int* lde,
+                    lapack_complex_double* f, lapack_int* ldf, double* scale,
+                    double* dif, lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_stgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, const float* vl,
+                    lapack_int* ldvl, const float* vr, lapack_int* ldvr,
+                    float* s, float* dif, lapack_int* mm, lapack_int* m,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dtgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, const double* vl,
+                    lapack_int* ldvl, const double* vr, lapack_int* ldvr,
+                    double* s, double* dif, lapack_int* mm, lapack_int* m,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_ctgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, const lapack_complex_float* vl,
+                    lapack_int* ldvl, const lapack_complex_float* vr,
+                    lapack_int* ldvr, float* s, float* dif, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_ztgsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, const lapack_complex_double* vl,
+                    lapack_int* ldvl, const lapack_complex_double* vr,
+                    lapack_int* ldvr, double* s, double* dif, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_sggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
+                    float* b, lapack_int* ldb, float* tola, float* tolb,
+                    lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+                    lapack_int* iwork, float* tau, float* work,
+                    lapack_int *info );
+void LAPACK_dggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, double* tola, double* tolb,
+                    lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                    lapack_int* iwork, double* tau, double* work,
+                    lapack_int *info );
+void LAPACK_cggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    float* tola, float* tolb, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
+                    float* rwork, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    double* tola, double* tolb, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_int* iwork, double* rwork,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_stgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* tola, float* tolb, float* alpha, float* beta,
+                    float* u, lapack_int* ldu, float* v, lapack_int* ldv,
+                    float* q, lapack_int* ldq, float* work, lapack_int* ncycle,
+                    lapack_int *info );
+void LAPACK_dtgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* tola, double* tolb, double* alpha, double* beta,
+                    double* u, lapack_int* ldu, double* v, lapack_int* ldv,
+                    double* q, lapack_int* ldq, double* work,
+                    lapack_int* ncycle, lapack_int *info );
+void LAPACK_ctgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* tola,
+                    float* tolb, float* alpha, float* beta,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, lapack_int* ncycle,
+                    lapack_int *info );
+void LAPACK_ztgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* tola,
+                    double* tolb, double* alpha, double* beta,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, lapack_int* ncycle,
+                    lapack_int *info );
+void LAPACK_sgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_cgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_zgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_sgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int* jpvt, float* rcond, lapack_int* rank,
+                    float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb,
+                    lapack_int* jpvt, double* rcond, lapack_int* rank,
+                    double* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_cgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt,
+                    float* rcond, lapack_int* rank, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info );
+void LAPACK_zgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt,
+                    double* rcond, lapack_int* rank,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info );
+void LAPACK_zgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_dgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* s,
+                    float* rcond, lapack_int* rank, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_zgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* s,
+                    double* rcond, lapack_int* rank,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_sgglse( lapack_int* m, lapack_int* n, lapack_int* p, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* c,
+                    float* d, float* x, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dgglse( lapack_int* m, lapack_int* n, lapack_int* p, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* c,
+                    double* d, double* x, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cgglse( lapack_int* m, lapack_int* n, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* c, lapack_complex_float* d,
+                    lapack_complex_float* x, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgglse( lapack_int* m, lapack_int* n, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* c, lapack_complex_double* d,
+                    lapack_complex_double* x, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sggglm( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb, float* d,
+                    float* x, float* y, float* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_dggglm( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* d,
+                    double* x, double* y, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cggglm( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* d, lapack_complex_float* x,
+                    lapack_complex_float* y, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zggglm( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* d, lapack_complex_double* x,
+                    lapack_complex_double* y, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_ssyev( char* jobz, char* uplo, lapack_int* n, float* a,
+                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_dsyev( char* jobz, char* uplo, lapack_int* n, double* a,
+                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_cheev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zheev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, double* w,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info );
+void LAPACK_ssyevd( char* jobz, char* uplo, lapack_int* n, float* a,
+                    lapack_int* lda, float* w, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dsyevd( char* jobz, char* uplo, lapack_int* n, double* a,
+                    lapack_int* lda, double* w, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cheevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* w,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zheevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* w,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssyevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dsyevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_cheevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_zheevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_ssyevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    lapack_int* isuppz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dsyevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    lapack_int* isuppz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_cheevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_int* isuppz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zheevr( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_int* isuppz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sspev( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
+                   float* z, lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dspev( char* jobz, char* uplo, lapack_int* n, double* ap, double* w,
+                   double* z, lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_chpev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                   lapack_int *info );
+void LAPACK_zhpev( char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_sspevd( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dspevd( char* jobz, char* uplo, lapack_int* n, double* ap,
+                    double* w, double* z, lapack_int* ldz, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_chpevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* lrwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zhpevd( char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sspevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    float* ap, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dspevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    double* ap, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_chpevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_zhpevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_ssbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   float* ab, lapack_int* ldab, float* w, float* z,
+                   lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dsbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   double* ab, lapack_int* ldab, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_chbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   lapack_complex_float* ab, lapack_int* ldab, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info );
+void LAPACK_zhbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   lapack_complex_double* ab, lapack_int* ldab, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_ssbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    float* ab, lapack_int* ldab, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dsbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    double* ab, lapack_int* ldab, double* w, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_chbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, float* ab, lapack_int* ldab, float* q,
+                    lapack_int* ldq, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dsbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, double* ab, lapack_int* ldab, double* q,
+                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_chbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* q, lapack_int* ldq, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_zhbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* q, lapack_int* ldq, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_sstev( char* jobz, lapack_int* n, float* d, float* e, float* z,
+                   lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dstev( char* jobz, lapack_int* n, double* d, double* e, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_sstevd( char* jobz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dstevd( char* jobz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_sstevx( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dstevx( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_sstevr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, lapack_int* isuppz, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_dstevr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, lapack_int* isuppz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sgees( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
+                   lapack_int* n, float* a, lapack_int* lda, lapack_int* sdim,
+                   float* wr, float* wi, float* vs, lapack_int* ldvs,
+                   float* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info );
+void LAPACK_dgees( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
+                   lapack_int* n, double* a, lapack_int* lda, lapack_int* sdim,
+                   double* wr, double* wi, double* vs, lapack_int* ldvs,
+                   double* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info );
+void LAPACK_cgees( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
+                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   lapack_int* sdim, lapack_complex_float* w,
+                   lapack_complex_float* vs, lapack_int* ldvs,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_logical* bwork, lapack_int *info );
+void LAPACK_zgees( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
+                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   lapack_int* sdim, lapack_complex_double* w,
+                   lapack_complex_double* vs, lapack_int* ldvs,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_logical* bwork, lapack_int *info );
+void LAPACK_sgeesx( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
+                    char* sense, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* sdim, float* wr, float* wi, float* vs,
+                    lapack_int* ldvs, float* rconde, float* rcondv, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_logical* bwork, lapack_int *info );
+void LAPACK_dgeesx( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
+                    char* sense, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* sdim, double* wr, double* wi, double* vs,
+                    lapack_int* ldvs, double* rconde, double* rcondv,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_cgeesx( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
+                    char* sense, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
+                    lapack_complex_float* vs, lapack_int* ldvs, float* rconde,
+                    float* rcondv, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_zgeesx( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
+                    char* sense, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
+                    lapack_complex_double* vs, lapack_int* ldvs, double* rconde,
+                    double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_sgeev( char* jobvl, char* jobvr, lapack_int* n, float* a,
+                   lapack_int* lda, float* wr, float* wi, float* vl,
+                   lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work,
+                   lapack_int* lwork, lapack_int *info );
+void LAPACK_dgeev( char* jobvl, char* jobvr, lapack_int* n, double* a,
+                   lapack_int* lda, double* wr, double* wi, double* vl,
+                   lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work,
+                   lapack_int* lwork, lapack_int *info );
+void LAPACK_cgeev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* w, lapack_complex_float* vl,
+                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zgeev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* w, lapack_complex_double* vl,
+                   lapack_int* ldvl, lapack_complex_double* vr,
+                   lapack_int* ldvr, lapack_complex_double* work,
+                   lapack_int* lwork, double* rwork, lapack_int *info );
+void LAPACK_sgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, float* a, lapack_int* lda, float* wr,
+                    float* wi, float* vl, lapack_int* ldvl, float* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, float* abnrm, float* rconde, float* rcondv,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_dgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, double* a, lapack_int* lda, double* wr,
+                    double* wi, double* vl, lapack_int* ldvl, double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, double* abnrm, double* rconde,
+                    double* rcondv, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_cgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* w, lapack_complex_float* vl,
+                    lapack_int* ldvl, lapack_complex_float* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, float* abnrm, float* rconde, float* rcondv,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* w, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, double* abnrm, double* rconde,
+                    double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int *info );
+void LAPACK_sgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    float* a, lapack_int* lda, float* s, float* u,
+                    lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_dgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    double* a, lapack_int* lda, double* s, double* u,
+                    lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* s,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info );
+void LAPACK_zgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* s,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_sgesdd( char* jobz, lapack_int* m, lapack_int* n, float* a,
+                    lapack_int* lda, float* s, float* u, lapack_int* ldu,
+                    float* vt, lapack_int* ldvt, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dgesdd( char* jobz, lapack_int* m, lapack_int* n, double* a,
+                    lapack_int* lda, double* s, double* u, lapack_int* ldu,
+                    double* vt, lapack_int* ldvt, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_cgesdd( char* jobz, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* s,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_zgesdd( char* jobz, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* s,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* iwork, lapack_int *info );
+void LAPACK_dgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
+                    char* jobp, lapack_int* m, lapack_int* n, double* a,
+                    lapack_int* lda, double* sva, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_sgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
+                    char* jobp, lapack_int* m, lapack_int* n, float* a,
+                    lapack_int* lda, float* sva, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_dgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
+                    lapack_int* n, double* a, lapack_int* lda, double* sva,
+                    lapack_int* mv, double* v, lapack_int* ldv, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
+                    lapack_int* n, float* a, lapack_int* lda, float* sva,
+                    lapack_int* mv, float* v, lapack_int* ldv, float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* alpha, float* beta, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+                    float* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_dggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* alpha, double* beta, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                    double* work, lapack_int* iwork, lapack_int *info );
+void LAPACK_cggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* alpha,
+                    float* beta, lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_zggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* alpha,
+                    double* beta, lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ssygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* w, float* work, lapack_int* lwork, lapack_int *info );
+void LAPACK_dsygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* w, double* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_chegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zhegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* w,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info );
+void LAPACK_ssygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* w, float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dsygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* w, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_chegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* w,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* w,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_dsygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_chegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_zhegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_sspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   float* ap, float* bp, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int *info );
+void LAPACK_dspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   double* ap, double* bp, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_chpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, lapack_complex_float* bp, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info );
+void LAPACK_zhpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, lapack_complex_double* bp,
+                   double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_sspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    float* ap, float* bp, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    double* ap, double* bp, double* w, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_chpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, lapack_complex_float* bp,
+                    float* w, lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, lapack_complex_double* bp,
+                    double* w, lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_sspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, float* ap, float* bp, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* iwork, lapack_int* ifail,
+                    lapack_int *info );
+void LAPACK_dspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, double* ap, double* bp, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* iwork, lapack_int* ifail,
+                    lapack_int *info );
+void LAPACK_chpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_float* ap,
+                    lapack_complex_float* bp, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_zhpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_double* ap,
+                    lapack_complex_double* bp, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_ssbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
+                   lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int *info );
+void LAPACK_dsbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
+                   lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int *info );
+void LAPACK_chbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_complex_float* bb, lapack_int* ldbb, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info );
+void LAPACK_zhbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_complex_double* bb, lapack_int* ldbb, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info );
+void LAPACK_ssbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
+                    lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_dsbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
+                    lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_chbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* bb, lapack_int* ldbb, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info );
+void LAPACK_zhbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* bb, lapack_int* ldbb, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_ssbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
+                    float* bb, lapack_int* ldbb, float* q, lapack_int* ldq,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_dsbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, double* ab,
+                    lapack_int* ldab, double* bb, lapack_int* ldbb, double* q,
+                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_chbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* bb,
+                    lapack_int* ldbb, lapack_complex_float* q, lapack_int* ldq,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info );
+void LAPACK_zhbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* bb,
+                    lapack_int* ldbb, lapack_complex_double* q, lapack_int* ldq,
+                    double* vl, double* vu, lapack_int* il, lapack_int* iu,
+                    double* abstol, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
+void LAPACK_sgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim,
+                   float* alphar, float* alphai, float* beta, float* vsl,
+                   lapack_int* ldvsl, float* vsr, lapack_int* ldvsr,
+                   float* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info );
+void LAPACK_dgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int* sdim, double* alphar, double* alphai,
+                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
+                   lapack_int* ldvsr, double* work, lapack_int* lwork,
+                   lapack_logical* bwork, lapack_int *info );
+void LAPACK_cgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_C_SELECT2 selctg, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* vsl, lapack_int* ldvsl,
+                   lapack_complex_float* vsr, lapack_int* ldvsr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_logical* bwork, lapack_int *info );
+void LAPACK_zgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_Z_SELECT2 selctg, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vsl, lapack_int* ldvsl,
+                   lapack_complex_double* vsr, lapack_int* ldvsr,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_logical* bwork, lapack_int *info );
+void LAPACK_sggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int* sdim, float* alphar, float* alphai, float* beta,
+                    float* vsl, lapack_int* ldvsl, float* vsr,
+                    lapack_int* ldvsr, float* rconde, float* rcondv,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_dggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    lapack_int* sdim, double* alphar, double* alphai,
+                    double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
+                    lapack_int* ldvsr, double* rconde, double* rcondv,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_cggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* vsl, lapack_int* ldvsl,
+                    lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde,
+                    float* rcondv, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_zggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* vsl, lapack_int* ldvsl,
+                    lapack_complex_double* vsr, lapack_int* ldvsr,
+                    double* rconde, double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_sggev( char* jobvl, char* jobvr, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
+                   float* alphai, float* beta, float* vl, lapack_int* ldvl,
+                   float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
+                   lapack_int *info );
+void LAPACK_dggev( char* jobvl, char* jobvr, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
+                   double* alphai, double* beta, double* vl, lapack_int* ldvl,
+                   double* vr, lapack_int* ldvr, double* work,
+                   lapack_int* lwork, lapack_int *info );
+void LAPACK_cggev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* vl, lapack_int* ldvl,
+                   lapack_complex_float* vr, lapack_int* ldvr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info );
+void LAPACK_zggev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vl, lapack_int* ldvl,
+                   lapack_complex_double* vr, lapack_int* ldvr,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info );
+void LAPACK_sggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
+                    float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* ilo, lapack_int* ihi, float* lscale,
+                    float* rscale, float* abnrm, float* bbnrm, float* rconde,
+                    float* rcondv, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_dggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* alphar, double* alphai,
+                    double* beta, double* vl, lapack_int* ldvl, double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* lscale, double* rscale, double* abnrm,
+                    double* bbnrm, double* rconde, double* rcondv, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_cggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo,
+                    lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
+                    float* bbnrm, float* rconde, float* rcondv,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info );
+void LAPACK_zggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* vl, lapack_int* ldvl,
+                    lapack_complex_double* vr, lapack_int* ldvr,
+                    lapack_int* ilo, lapack_int* ihi, double* lscale,
+                    double* rscale, double* abnrm, double* bbnrm,
+                    double* rconde, double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_logical* bwork, lapack_int *info );
+void LAPACK_dsfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, double* alpha, const double* a,
+                   lapack_int* lda, double* beta, double* c );
+void LAPACK_ssfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, float* alpha, const float* a, lapack_int* lda,
+                   float* beta, float* c );
+void LAPACK_zhfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, double* alpha, const lapack_complex_double* a,
+                   lapack_int* lda, double* beta, lapack_complex_double* c );
+void LAPACK_chfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, float* alpha, const lapack_complex_float* a,
+                   lapack_int* lda, float* beta, lapack_complex_float* c );
+void LAPACK_dtfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n, double* alpha,
+                   const double* a, double* b, lapack_int* ldb );
+void LAPACK_stfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n, float* alpha,
+                   const float* a, float* b, lapack_int* ldb );
+void LAPACK_ztfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n,
+                   lapack_complex_double* alpha, const lapack_complex_double* a,
+                   lapack_complex_double* b, lapack_int* ldb );
+void LAPACK_ctfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n,
+                   lapack_complex_float* alpha, const lapack_complex_float* a,
+                   lapack_complex_float* b, lapack_int* ldb );
+void LAPACK_dtfttp( char* transr, char* uplo, lapack_int* n, const double* arf,
+                    double* ap, lapack_int *info );
+void LAPACK_stfttp( char* transr, char* uplo, lapack_int* n, const float* arf,
+                    float* ap, lapack_int *info );
+void LAPACK_ztfttp( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* arf, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_ctfttp( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* arf, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_dtfttr( char* transr, char* uplo, lapack_int* n, const double* arf,
+                    double* a, lapack_int* lda, lapack_int *info );
+void LAPACK_stfttr( char* transr, char* uplo, lapack_int* n, const float* arf,
+                    float* a, lapack_int* lda, lapack_int *info );
+void LAPACK_ztfttr( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* arf, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_ctfttr( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* arf, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_dtpttf( char* transr, char* uplo, lapack_int* n, const double* ap,
+                    double* arf, lapack_int *info );
+void LAPACK_stpttf( char* transr, char* uplo, lapack_int* n, const float* ap,
+                    float* arf, lapack_int *info );
+void LAPACK_ztpttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* ap, lapack_complex_double* arf,
+                    lapack_int *info );
+void LAPACK_ctpttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* ap, lapack_complex_float* arf,
+                    lapack_int *info );
+void LAPACK_dtpttr( char* uplo, lapack_int* n, const double* ap, double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_stpttr( char* uplo, lapack_int* n, const float* ap, float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_ztpttr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_ctpttr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dtrttf( char* transr, char* uplo, lapack_int* n, const double* a,
+                    lapack_int* lda, double* arf, lapack_int *info );
+void LAPACK_strttf( char* transr, char* uplo, lapack_int* n, const float* a,
+                    lapack_int* lda, float* arf, lapack_int *info );
+void LAPACK_ztrttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* arf, lapack_int *info );
+void LAPACK_ctrttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* arf, lapack_int *info );
+void LAPACK_dtrttp( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    double* ap, lapack_int *info );
+void LAPACK_strttp( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    float* ap, lapack_int *info );
+void LAPACK_ztrttp( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* ap,
+                    lapack_int *info );
+void LAPACK_ctrttp( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* ap,
+                    lapack_int *info );
+void LAPACK_sgeqrfp( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* tau, float* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_dgeqrfp( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* tau, double* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_cgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* tau,
+                     lapack_complex_float* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_zgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* tau,
+                     lapack_complex_double* work, lapack_int* lwork,
+                     lapack_int *info );
+void LAPACK_clacgv( lapack_int* n, lapack_complex_float* x, lapack_int* incx );
+void LAPACK_zlacgv( lapack_int* n, lapack_complex_double* x, lapack_int* incx );
+void LAPACK_slarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    float* x );
+void LAPACK_dlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    double* x );
+void LAPACK_clarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    lapack_complex_float* x );
+void LAPACK_zlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    lapack_complex_double* x );
+void LAPACK_sgeqr2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int *info );
+void LAPACK_dgeqr2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int *info );
+void LAPACK_cgeqr2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgeqr2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slacpy( char* uplo, lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb );
+void LAPACK_dlacpy( char* uplo, lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb );
+void LAPACK_clacpy( char* uplo, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb );
+void LAPACK_zlacpy( char* uplo, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb );
+void LAPACK_sgetf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_dgetf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info );
+void LAPACK_cgetf2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_zgetf2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
+void LAPACK_slaswp( lapack_int* n, float* a, lapack_int* lda, lapack_int* k1,
+                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx );
+void LAPACK_dlaswp( lapack_int* n, double* a, lapack_int* lda, lapack_int* k1,
+                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx );
+void LAPACK_claswp( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                    lapack_int* incx );
+void LAPACK_zlaswp( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                    lapack_int* incx );
+float LAPACK_slange( char* norm, lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* work );
+double LAPACK_dlange( char* norm, lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* work );
+float LAPACK_clange( char* norm, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work );
+double LAPACK_zlange( char* norm, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work );
+float LAPACK_clanhe( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work );
+double LAPACK_zlanhe( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work );
+float LAPACK_slansy( char* norm, char* uplo, lapack_int* n, const float* a,
+                    lapack_int* lda, float* work );
+double LAPACK_dlansy( char* norm, char* uplo, lapack_int* n, const double* a,
+                    lapack_int* lda, double* work );
+float LAPACK_clansy( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work );
+double LAPACK_zlansy( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work );
+float LAPACK_slantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda, float* work );
+double LAPACK_dlantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda, double* work );
+float LAPACK_clantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                    float* work );
+double LAPACK_zlantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                    double* work );
+float LAPACK_slamch( char* cmach );
+double LAPACK_dlamch( char* cmach );
+void LAPACK_sgelq2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int *info );
+void LAPACK_dgelq2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int *info );
+void LAPACK_cgelq2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgelq2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, const float* v,
+                    lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
+                    lapack_int* ldc, float* work, lapack_int* ldwork );
+void LAPACK_dlarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const double* v, lapack_int* ldv, const double* t,
+                    lapack_int* ldt, double* c, lapack_int* ldc, double* work,
+                    lapack_int* ldwork );
+void LAPACK_clarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* ldwork );
+void LAPACK_zlarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* ldwork );
+void LAPACK_slarfg( lapack_int* n, float* alpha, float* x, lapack_int* incx,
+                    float* tau );
+void LAPACK_dlarfg( lapack_int* n, double* alpha, double* x, lapack_int* incx,
+                    double* tau );
+void LAPACK_clarfg( lapack_int* n, lapack_complex_float* alpha,
+                    lapack_complex_float* x, lapack_int* incx,
+                    lapack_complex_float* tau );
+void LAPACK_zlarfg( lapack_int* n, lapack_complex_double* alpha,
+                    lapack_complex_double* x, lapack_int* incx,
+                    lapack_complex_double* tau );
+void LAPACK_slarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const float* v, lapack_int* ldv, const float* tau, float* t,
+                    lapack_int* ldt );
+void LAPACK_dlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const double* v, lapack_int* ldv, const double* tau,
+                    double* t, lapack_int* ldt );
+void LAPACK_clarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* tau, lapack_complex_float* t,
+                    lapack_int* ldt );
+void LAPACK_zlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* tau, lapack_complex_double* t,
+                    lapack_int* ldt );
+void LAPACK_slarfx( char* side, lapack_int* m, lapack_int* n, const float* v,
+                    float* tau, float* c, lapack_int* ldc, float* work );
+void LAPACK_dlarfx( char* side, lapack_int* m, lapack_int* n, const double* v,
+                    double* tau, double* c, lapack_int* ldc, double* work );
+void LAPACK_clarfx( char* side, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* v, lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work );
+void LAPACK_zlarfx( char* side, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* v, lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work );
+void LAPACK_slatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, float* d, lapack_int* mode, float* cond,
+                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    float* a, lapack_int* lda, float* work, lapack_int *info );
+void LAPACK_dlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, double* d, lapack_int* mode, double* cond,
+                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    double* a, lapack_int* lda, double* work,
+                    lapack_int *info );
+void LAPACK_clatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, float* d, lapack_int* mode, float* cond,
+                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, double* d, lapack_int* mode, double* cond,
+                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slag2d( lapack_int* m, lapack_int* n, const float* sa,
+                    lapack_int* ldsa, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dlag2s( lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, float* sa, lapack_int* ldsa,
+                    lapack_int *info );
+void LAPACK_clag2z( lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* sa, lapack_int* ldsa,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_zlag2c( lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_float* sa, lapack_int* ldsa,
+                    lapack_int *info );
+void LAPACK_slauum( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_dlauum( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info );
+void LAPACK_clauum( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_zlauum( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info );
+void LAPACK_slagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* d, float* a, lapack_int* lda,
+                    lapack_int* iseed, float* work, lapack_int *info );
+void LAPACK_dlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* d, double* a, lapack_int* lda,
+                    lapack_int* iseed, double* work, lapack_int *info );
+void LAPACK_clagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* d, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* d, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_slaset( char* uplo, lapack_int* m, lapack_int* n, float* alpha,
+                    float* beta, float* a, lapack_int* lda );
+void LAPACK_dlaset( char* uplo, lapack_int* m, lapack_int* n, double* alpha,
+                    double* beta, double* a, lapack_int* lda );
+void LAPACK_claset( char* uplo, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* a, lapack_int* lda );
+void LAPACK_zlaset( char* uplo, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* a, lapack_int* lda );
+void LAPACK_slasrt( char* id, lapack_int* n, float* d, lapack_int *info );
+void LAPACK_dlasrt( char* id, lapack_int* n, double* d, lapack_int *info );
+void LAPACK_claghe( lapack_int* n, lapack_int* k, const float* d,
+                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlaghe( lapack_int* n, lapack_int* k, const double* d,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* iseed, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_slagsy( lapack_int* n, lapack_int* k, const float* d, float* a,
+                    lapack_int* lda, lapack_int* iseed, float* work,
+                    lapack_int *info );
+void LAPACK_dlagsy( lapack_int* n, lapack_int* k, const double* d, double* a,
+                    lapack_int* lda, lapack_int* iseed, double* work,
+                    lapack_int *info );
+void LAPACK_clagsy( lapack_int* n, lapack_int* k, const float* d,
+                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zlagsy( lapack_int* n, lapack_int* k, const double* d,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* iseed, lapack_complex_double* work,
+                    lapack_int *info );
+void LAPACK_slapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    float* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_dlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    double* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_clapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_zlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* x, lapack_int* ldx, lapack_int* k );
+float LAPACK_slapy2( float* x, float* y );
+double LAPACK_dlapy2( double* x, double* y );
+float LAPACK_slapy3( float* x, float* y, float* z );
+double LAPACK_dlapy3( double* x, double* y, double* z );
+void LAPACK_slartgp( float* f, float* g, float* cs, float* sn, float* r );
+void LAPACK_dlartgp( double* f, double* g, double* cs, double* sn, double* r );
+void LAPACK_slartgs( float* x, float* y, float* sigma, float* cs, float* sn );
+void LAPACK_dlartgs( double* x, double* y, double* sigma, double* cs,
+                     double* sn );
+// LAPACK 3.3.0
+void LAPACK_cbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* theta, float* phi,
+                    lapack_complex_float* u1, lapack_int* ldu1,
+                    lapack_complex_float* u2, lapack_int* ldu2,
+                    lapack_complex_float* v1t, lapack_int* ldv1t,
+                    lapack_complex_float* v2t, lapack_int* ldv2t,
+                    float* b11d, float* b11e, float* b12d,
+                    float* b12e, float* b21d, float* b21e,
+                    float* b22d, float* b22e, float* rwork,
+                    lapack_int* lrwork , lapack_int *info );
+void LAPACK_cheswapr( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_chetri2( char* uplo, lapack_int* n,
+                     lapack_complex_float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_chetri2x( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
+void LAPACK_chetrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_csyconv( char* uplo, char* way,
+                     lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_csyswapr( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_csytri2( char* uplo, lapack_int* n,
+                     lapack_complex_float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_csytri2x( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
+void LAPACK_csytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_cunbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    lapack_complex_float* x11, lapack_int* ldx11,
+                    lapack_complex_float* x12, lapack_int* ldx12,
+                    lapack_complex_float* x21, lapack_int* ldx21,
+                    lapack_complex_float* x22, lapack_int* ldx22,
+                    float* theta, float* phi,
+                    lapack_complex_float* taup1,
+                    lapack_complex_float* taup2,
+                    lapack_complex_float* tauq1,
+                    lapack_complex_float* tauq2,
+                    lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_cuncsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, lapack_complex_float* x11,
+                    lapack_int* ldx11, lapack_complex_float* x12,
+                    lapack_int* ldx12, lapack_complex_float* x21,
+                    lapack_int* ldx21, lapack_complex_float* x22,
+                    lapack_int* ldx22, float* theta,
+                    lapack_complex_float* u1, lapack_int* ldu1,
+                    lapack_complex_float* u2, lapack_int* ldu2,
+                    lapack_complex_float* v1t, lapack_int* ldv1t,
+                    lapack_complex_float* v2t, lapack_int* ldv2t,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    float* rwork, lapack_int* lrwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_dbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* theta, double* phi, double* u1,
+                    lapack_int* ldu1, double* u2, lapack_int* ldu2,
+                    double* v1t, lapack_int* ldv1t, double* v2t,
+                    lapack_int* ldv2t, double* b11d, double* b11e,
+                    double* b12d, double* b12e, double* b21d,
+                    double* b21e, double* b22d, double* b22e,
+                    double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_dorbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* x11, lapack_int* ldx11, double* x12,
+                    lapack_int* ldx12, double* x21, lapack_int* ldx21,
+                    double* x22, lapack_int* ldx22, double* theta,
+                    double* phi, double* taup1, double* taup2,
+                    double* tauq1, double* tauq2, double* work,
+                    lapack_int* lwork , lapack_int *info );
+void LAPACK_dorcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, double* x11, lapack_int* ldx11,
+                    double* x12, lapack_int* ldx12, double* x21,
+                    lapack_int* ldx21, double* x22, lapack_int* ldx22,
+                    double* theta, double* u1, lapack_int* ldu1,
+                    double* u2, lapack_int* ldu2, double* v1t,
+                    lapack_int* ldv1t, double* v2t, lapack_int* ldv2t,
+                    double* work, lapack_int* lwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_dsyconv( char* uplo, char* way,
+                     lapack_int* n, double* a, lapack_int* lda,
+                     const lapack_int* ipiv, double* work , lapack_int *info );
+void LAPACK_dsyswapr( char* uplo, lapack_int* n,
+                      double* a, lapack_int* i1, lapack_int* i2 );
+void LAPACK_dsytri2( char* uplo, lapack_int* n,
+                     double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_dsytri2x( char* uplo, lapack_int* n,
+                      double* a, lapack_int* lda,
+                      const lapack_int* ipiv, double* work,
+                      lapack_int* nb , lapack_int *info );
+void LAPACK_dsytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const double* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     double* b, lapack_int* ldb, double* work , lapack_int *info );
+void LAPACK_sbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* theta, float* phi, float* u1,
+                    lapack_int* ldu1, float* u2, lapack_int* ldu2,
+                    float* v1t, lapack_int* ldv1t, float* v2t,
+                    lapack_int* ldv2t, float* b11d, float* b11e,
+                    float* b12d, float* b12e, float* b21d,
+                    float* b21e, float* b22d, float* b22e,
+                    float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_sorbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* x11, lapack_int* ldx11, float* x12,
+                    lapack_int* ldx12, float* x21, lapack_int* ldx21,
+                    float* x22, lapack_int* ldx22, float* theta,
+                    float* phi, float* taup1, float* taup2,
+                    float* tauq1, float* tauq2, float* work,
+                    lapack_int* lwork , lapack_int *info );
+void LAPACK_sorcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, float* x11, lapack_int* ldx11,
+                    float* x12, lapack_int* ldx12, float* x21,
+                    lapack_int* ldx21, float* x22, lapack_int* ldx22,
+                    float* theta, float* u1, lapack_int* ldu1,
+                    float* u2, lapack_int* ldu2, float* v1t,
+                    lapack_int* ldv1t, float* v2t, lapack_int* ldv2t,
+                    float* work, lapack_int* lwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_ssyconv( char* uplo, char* way,
+                     lapack_int* n, float* a, lapack_int* lda,
+                     const lapack_int* ipiv, float* work , lapack_int *info );
+void LAPACK_ssyswapr( char* uplo, lapack_int* n,
+                      float* a, lapack_int* i1, lapack_int* i2 );
+void LAPACK_ssytri2( char* uplo, lapack_int* n,
+                     float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_ssytri2x( char* uplo, lapack_int* n,
+                      float* a, lapack_int* lda,
+                      const lapack_int* ipiv, float* work,
+                      lapack_int* nb , lapack_int *info );
+void LAPACK_ssytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     float* b, lapack_int* ldb, float* work , lapack_int *info );
+void LAPACK_zbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* theta, double* phi,
+                    lapack_complex_double* u1, lapack_int* ldu1,
+                    lapack_complex_double* u2, lapack_int* ldu2,
+                    lapack_complex_double* v1t, lapack_int* ldv1t,
+                    lapack_complex_double* v2t, lapack_int* ldv2t,
+                    double* b11d, double* b11e, double* b12d,
+                    double* b12e, double* b21d, double* b21e,
+                    double* b22d, double* b22e, double* rwork,
+                    lapack_int* lrwork , lapack_int *info );
+void LAPACK_zheswapr( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_zhetri2( char* uplo, lapack_int* n,
+                     lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zhetri2x( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
+void LAPACK_zhetrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zsyconv( char* uplo, char* way,
+                     lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zsyswapr( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_zsytri2( char* uplo, lapack_int* n,
+                     lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zsytri2x( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
+void LAPACK_zsytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zunbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    lapack_complex_double* x11, lapack_int* ldx11,
+                    lapack_complex_double* x12, lapack_int* ldx12,
+                    lapack_complex_double* x21, lapack_int* ldx21,
+                    lapack_complex_double* x22, lapack_int* ldx22,
+                    double* theta, double* phi,
+                    lapack_complex_double* taup1,
+                    lapack_complex_double* taup2,
+                    lapack_complex_double* tauq1,
+                    lapack_complex_double* tauq2,
+                    lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zuncsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, lapack_complex_double* x11,
+                    lapack_int* ldx11, lapack_complex_double* x12,
+                    lapack_int* ldx12, lapack_complex_double* x21,
+                    lapack_int* ldx21, lapack_complex_double* x22,
+                    lapack_int* ldx22, double* theta,
+                    lapack_complex_double* u1, lapack_int* ldu1,
+                    lapack_complex_double* u2, lapack_int* ldu2,
+                    lapack_complex_double* v1t, lapack_int* ldv1t,
+                    lapack_complex_double* v2t, lapack_int* ldv2t,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork,
+                    lapack_int* iwork , lapack_int *info );
+// LAPACK 3.4.0
+void LAPACK_sgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb, const float* v,
+                     lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
+                     lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb, const double* v,
+                     lapack_int* ldv, const double* t, lapack_int* ldt,
+                     double* c, lapack_int* ldc, double* work,
+                     lapack_int *info );
+void LAPACK_cgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb,
+                     const lapack_complex_float* v, lapack_int* ldv,
+                     const lapack_complex_float* t, lapack_int* ldt,
+                     lapack_complex_float* c, lapack_int* ldc,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb,
+                     const lapack_complex_double* v, lapack_int* ldv,
+                     const lapack_complex_double* t, lapack_int* ldt,
+                     lapack_complex_double* c, lapack_int* ldc,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_sgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, float* a,
+                    lapack_int* lda, float* t, lapack_int* ldt, float* work,
+                    lapack_int *info );
+void LAPACK_dgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, double* a,
+                    lapack_int* lda, double* t, lapack_int* ldt, double* work,
+                    lapack_int *info );
+void LAPACK_cgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_sgeqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_dgeqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_cgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_zgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_sgeqrt3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_dgeqrt3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_cgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_zgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_stpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const float* v, lapack_int* ldv, const float* t,
+                     lapack_int* ldt, float* a, lapack_int* lda, float* b,
+                     lapack_int* ldb, float* work, lapack_int *info );
+void LAPACK_dtpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const double* v, lapack_int* ldv, const double* t,
+                     lapack_int* ldt, double* a, lapack_int* lda, double* b,
+                     lapack_int* ldb, double* work, lapack_int *info );
+void LAPACK_ctpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const lapack_complex_float* v, lapack_int* ldv,
+                     const lapack_complex_float* t, lapack_int* ldt,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_ztpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const lapack_complex_double* v, lapack_int* ldv,
+                     const lapack_complex_double* t, lapack_int* ldt,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_dtpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* t, lapack_int* ldt, double* work,
+                    lapack_int *info );
+void LAPACK_ctpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* t, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int* ldt,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_ztpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_stpqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* b, lapack_int* ldb, float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_dtpqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* b, lapack_int* ldb, double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_ctpqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_ztpqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_stprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const float* v, lapack_int* ldv, const float* t,
+                    lapack_int* ldt, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, const float* mywork,
+                    lapack_int* myldwork );
+void LAPACK_dtprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const double* v, lapack_int* ldv, const double* t,
+                    lapack_int* ldt, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, const double* mywork,
+                    lapack_int* myldwork );
+void LAPACK_ctprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    const float* mywork, lapack_int* myldwork );
+void LAPACK_ztprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    const double* mywork, lapack_int* myldwork );
+// LAPACK 3.X.X
+void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha,
+                      const lapack_complex_float* x, lapack_int* incx,
+                      lapack_complex_float* a, lapack_int* lda );
+void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha,
+                      const lapack_complex_double* x, lapack_int* incx,
+                      lapack_complex_double* a, lapack_int* lda );
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LAPACKE_H_ */
+
+#endif /* _MKL_LAPACKE_H_ */

diff --git a/Eigen/src/misc/lapacke_mangling.h b/Eigen/src/misc/lapacke_mangling.h
new file mode 100644
index 0000000..6211fd1
--- /dev/null
+++ b/Eigen/src/misc/lapacke_mangling.h

@@ -0,0 +1,17 @@
+#ifndef LAPACK_HEADER_INCLUDED
+#define LAPACK_HEADER_INCLUDED
+
+#ifndef LAPACK_GLOBAL
+#if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_)
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER)
+#define LAPACK_GLOBAL(lcname,UCNAME)  UCNAME
+#elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE)
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname
+#else
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#endif
+#endif
+
+#endif
+

diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 5b36420..1b422e2 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h

@@ -1,13 +1,14 @@
+
 /** \returns an expression of the coefficient wise product of \c *this and \a other
   *
   * \sa MatrixBase::cwiseProduct
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)
 operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient wise quotient of \c *this and \a other
@@ -16,10 +17,10 @@
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,typename OtherDerived::Scalar>, const Derived, const OtherDerived>
 operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar,typename OtherDerived::Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of \c *this and \a other
@@ -29,15 +30,40 @@
   *
   * \sa max()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(min,internal::scalar_min_op)
+template <int NaNPropagation, typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const OtherDerived &other) const
+{
+  return (min<PropagateFast>)(other);
+}
 
 /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
   *
   * \sa max()
   */
+template <int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived,
-                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived,
+    const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 min
 #else
@@ -45,7 +71,20 @@
 #endif
 (const Scalar &other) const
 {
-  return (min)(Derived::PlainObject::Constant(rows(), cols(), other));
+  return (min<NaNPropagation>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived,
+    const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const Scalar &other) const
+{
+  return (min<PropagateFast>)(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
 /** \returns an expression of the coefficient-wise max of \c *this and \a other
@@ -55,14 +94,39 @@
   *
   * \sa min()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(max,internal::scalar_max_op)
+template <int NaNPropagation, typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const OtherDerived &other) const
+{
+  return (max<PropagateFast>)(other);
+}
 
 /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
   *
   * \sa min()
   */
+template <int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived,
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 max
@@ -71,7 +135,46 @@
 #endif
 (const Scalar &other) const
 {
-  return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
+  return (max<NaNPropagation>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const Scalar &other) const
+{
+  return (max<PropagateFast>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
+  *
+  * Example: \include Cwise_absolute_difference.cpp
+  * Output: \verbinclude Cwise_absolute_difference.out
+  *
+  * \sa absolute_difference()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference)
+
+/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other
+  *
+  * \sa absolute_difference()
+  */
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_absolute_difference_op<Scalar,Scalar>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+absolute_difference
+#else
+(absolute_difference)
+#endif
+(const Scalar &other) const
+{
+  return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
 /** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents.
@@ -81,17 +184,66 @@
   * Example: \include Cwise_array_power_array.cpp
   * Output: \verbinclude Cwise_array_power_array.out
   */
-template<typename ExponentDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const CwiseBinaryOp<internal::scalar_binary_pow_op<Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
-pow(const ArrayBase<ExponentDerived>& exponents) const
-{
-  return CwiseBinaryOp<internal::scalar_binary_pow_op<Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
-    this->derived(),
-    exponents.derived()
-  );
+EIGEN_MAKE_CWISE_BINARY_OP(pow,pow)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(pow,pow)
+#else
+/** \returns an expression of the coefficients of \c *this rasied to the constant power \a exponent
+  *
+  * \tparam T is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression.
+  *
+  * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
+  * unsupported module MatrixFunctions computes the matrix power.
+  *
+  * Example: \include Cwise_pow.cpp
+  * Output: \verbinclude Cwise_pow.out
+  *
+  * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_pow_op<Scalar,T>,Derived,Constant<T> > pow(const T& exponent) const;
+#endif
+
+
+// TODO code generating macros could be moved to Macros.h and could include generation of documentation
+#define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \
+template<typename OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
+OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+{ \
+  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
+}\
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
+OP(const Scalar& s) const { \
+  return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
+} \
+EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
+OP(const Scalar& s, const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& d) { \
+  return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
 }
 
+#define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \
+template<typename OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
+OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+{ \
+  return CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
+} \
+EIGEN_DEVICE_FUNC \
+inline const RCmp ## RCOMPARATOR ## ReturnType \
+OP(const Scalar& s) const { \
+  return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this); \
+} \
+friend inline const Cmp ## RCOMPARATOR ## ReturnType \
+OP(const Scalar& s, const Derived& d) { \
+  return d.R_OP(Derived::PlainObject::Constant(d.rows(), d.cols(), s)); \
+}
+
+
+
 /** \returns an expression of the coefficient-wise \< operator of *this and \a other
   *
   * Example: \include Cwise_less.cpp
@@ -99,7 +251,7 @@
   *
   * \sa all(), any(), operator>(), operator<=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator<,std::less)
+EIGEN_MAKE_CWISE_COMP_OP(operator<, LT)
 
 /** \returns an expression of the coefficient-wise \<= operator of *this and \a other
   *
@@ -108,7 +260,7 @@
   *
   * \sa all(), any(), operator>=(), operator<()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator<=,std::less_equal)
+EIGEN_MAKE_CWISE_COMP_OP(operator<=, LE)
 
 /** \returns an expression of the coefficient-wise \> operator of *this and \a other
   *
@@ -117,7 +269,7 @@
   *
   * \sa all(), any(), operator>=(), operator<()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator>,std::greater)
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>, operator<, LT)
 
 /** \returns an expression of the coefficient-wise \>= operator of *this and \a other
   *
@@ -126,7 +278,7 @@
   *
   * \sa all(), any(), operator>(), operator<=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator>=,std::greater_equal)
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>=, operator<=, LE)
 
 /** \returns an expression of the coefficient-wise == operator of *this and \a other
   *
@@ -140,7 +292,7 @@
   *
   * \sa all(), any(), isApprox(), isMuchSmallerThan()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator==,std::equal_to)
+EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ)
 
 /** \returns an expression of the coefficient-wise != operator of *this and \a other
   *
@@ -154,99 +306,80 @@
   *
   * \sa all(), any(), isApprox(), isMuchSmallerThan()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator!=,std::not_equal_to)
+EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
+
+
+#undef EIGEN_MAKE_CWISE_COMP_OP
+#undef EIGEN_MAKE_CWISE_COMP_R_OP
 
 // scalar addition
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator+,sum)
+#else
 /** \returns an expression of \c *this with each coeff incremented by the constant \a scalar
   *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  *
   * Example: \include Cwise_plus.cpp
   * Output: \verbinclude Cwise_plus.out
   *
   * \sa operator+=(), operator-()
   */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>(derived(), internal::scalar_add_op<Scalar>(scalar));
-}
+template<typename T>
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar,T>,Derived,Constant<T> > operator+(const T& scalar) const;
+/** \returns an expression of \a expr with each coeff incremented by the constant \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_sum_op<T,Scalar>,Constant<T>,Derived> operator+(const T& scalar, const StorageBaseType& expr);
+#endif
 
-EIGEN_DEVICE_FUNC
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return other + scalar;
-}
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator-,difference)
+#else
 /** \returns an expression of \c *this with each coeff decremented by the constant \a scalar
   *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  *
   * Example: \include Cwise_minus.cpp
   * Output: \verbinclude Cwise_minus.out
   *
-  * \sa operator+(), operator-=()
+  * \sa operator+=(), operator-()
   */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
-operator-(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>(derived(), internal::scalar_sub_op<Scalar>(scalar));;
-}
-
-EIGEN_DEVICE_FUNC
-friend inline const CwiseUnaryOp<internal::scalar_rsub_op<Scalar>, const Derived>
-operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return CwiseUnaryOp<internal::scalar_rsub_op<Scalar>, const Derived>(other.derived(), internal::scalar_rsub_op<Scalar>(scalar));;
-}
-
-/** \returns an expression of the coefficient-wise && operator of *this and \a other
+template<typename T>
+const CwiseBinaryOp<internal::scalar_difference_op<Scalar,T>,Derived,Constant<T> > operator-(const T& scalar) const;
+/** \returns an expression of the constant matrix of value \a scalar decremented by the coefficients of \a expr
   *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_and.cpp
-  * Output: \verbinclude Cwise_boolean_and.out
-  *
-  * \sa operator||(), select()
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
   */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
-operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_difference_op<T,Scalar>,Constant<T>,Derived> operator-(const T& scalar, const StorageBaseType& expr);
+#endif
 
-/** \returns an expression of the coefficient-wise || operator of *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_or.cpp
-  * Output: \verbinclude Cwise_boolean_or.out
-  *
-  * \sa operator&&(), select()
-  */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
-operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(operator/,quotient)
+#else
+  /**
+    * \brief Component-wise division of the scalar \a s by array elements of \a a.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
+    */
+  template<typename T> friend
+  inline const CwiseBinaryOp<internal::scalar_quotient_op<T,Scalar>,Constant<T>,Derived>
+  operator/(const T& s,const StorageBaseType& a);
+#endif
 
 /** \returns an expression of the coefficient-wise ^ operator of *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_xor.cpp
-  * Output: \verbinclude Cwise_boolean_xor.out
-  *
-  * \sa operator^(), select()
-  */
+ *
+ * \warning this operator is for expression of bool only.
+ *
+ * Example: \include Cwise_boolean_xor.cpp
+ * Output: \verbinclude Cwise_boolean_xor.out
+ *
+ * \sa operator&&(), select()
+ */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
@@ -261,6 +394,8 @@
 #if 0
 /** \cpp11 \returns an expression of the coefficient-wise polygamma function.
   *
+  * \specialfunctions_module
+  *
   * It returns the \a n -th derivative of the digamma(psi) evaluated at \c *this.
   *
   * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
@@ -274,3 +409,26 @@
   return CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>(n.derived(), this->derived());
 }
 #endif
+
+/** \returns an expression of the coefficient-wise zeta function.
+  *
+  * \specialfunctions_module
+  *
+  * It returns the Riemann zeta function of two arguments \c *this and \a q:
+  *
+  * \param q is the shift, it must be > 0
+  *
+  * \note *this is the exponent, it must be > 1.
+  * \note This function supports only float and double scalar types. To support other scalar types, the user has
+  * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+  *
+  * This method is an alias for zeta(*this,q);
+  *
+  * \sa Eigen::zeta()
+  */
+template<typename DerivedQ>
+inline const CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>
+zeta(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedQ> &q) const
+{
+  return CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>(this->derived(), q.derived());
+}

diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 3d95804..13c55f4 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h

@@ -1,17 +1,71 @@
 
 
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> AbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> ArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
+typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
+
+typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
+typedef CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> Expm1ReturnType;
+typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
+typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
+typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
+typedef CwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived> Log2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
+typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
+typedef CwiseUnaryOp<internal::scalar_tan_op<Scalar>, const Derived> TanReturnType;
+typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturnType;
+typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
+typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
+typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
+typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+#if EIGEN_HAS_CXX11_MATH
+typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
+#endif
+typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
+typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
+typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
+typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
+typedef CwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived> RintReturnType;
+typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
+typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
+typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
+typedef CwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived> IsInfReturnType;
+typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFiniteReturnType;
+
 /** \returns an expression of the coefficient-wise absolute value of \c *this
   *
   * Example: \include Cwise_abs.cpp
   * Output: \verbinclude Cwise_abs.out
   *
-  * \sa abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs">Math functions</a>, abs2()
   */
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+EIGEN_STRONG_INLINE const AbsReturnType
 abs() const
 {
-  return derived();
+  return AbsReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise phase angle of \c *this
+  *
+  * Example: \include Cwise_arg.cpp
+  * Output: \verbinclude Cwise_arg.out
+  *
+  * \sa abs()
+  */
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const ArgReturnType
+arg() const
+{
+  return ArgReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
@@ -19,155 +73,188 @@
   * Example: \include Cwise_abs2.cpp
   * Output: \verbinclude Cwise_abs2.out
   *
-  * \sa abs(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs2">Math functions</a>, abs(), square()
   */
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
+EIGEN_STRONG_INLINE const Abs2ReturnType
 abs2() const
 {
-  return derived();
+  return Abs2ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise exponential of *this.
   *
+  * This function computes the coefficient-wise exponential. The function MatrixBase::exp() in the
+  * unsupported module MatrixFunctions computes the matrix exponential.
+  *
   * Example: \include Cwise_exp.cpp
   * Output: \verbinclude Cwise_exp.out
   *
-  * \sa pow(), log(), sin(), cos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, pow(), log(), sin(), cos()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+inline const ExpReturnType
 exp() const
 {
-  return derived();
+  return ExpReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise exponential of *this - 1.
+/** \returns an expression of the coefficient-wise exponential of *this minus 1.
   *
-  * Example: \include Cwise_expm1.cpp
-  * Output: \verbinclude Cwise_exp.out
-  * In exact arithmetic, \c (x-1).exp() is equivalent to \c x.expm1(),
-  * however, with finite precision, this function is much more accurate when \c
-  * x is close to zero.
+  * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1,
+  * however, with finite precision, this function is much more accurate when \c x is close to zero.
   *
-  * \sa exp()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_expm1">Math functions</a>, exp()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
+inline const Expm1ReturnType
 expm1() const
 {
-  return derived();
+  return Expm1ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise logarithm of *this.
   *
+  * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the
+  * unsupported module MatrixFunctions computes the matrix logarithm.
+  *
   * Example: \include Cwise_log.cpp
   * Output: \verbinclude Cwise_log.out
   *
-  * \sa exp()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, log()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+inline const LogReturnType
 log() const
 {
-  return derived();
+  return LogReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise logarithm of 1 plus \c *this.
   *
   * In exact arithmetic, \c x.log() is equivalent to \c (x+1).log(),
-  * however, with finite precision, this function is much more accurate when \c
- * x is close to zero.
+  * however, with finite precision, this function is much more accurate when \c x is close to zero.
   *
-  * \sa log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log1p">Math functions</a>, log()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
-log1p() const {
-  return derived();
+inline const Log1pReturnType
+log1p() const
+{
+  return Log1pReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise base-10 logarithm of *this.
+  *
+  * This function computes the coefficient-wise base-10 logarithm.
+  *
+  * Example: \include Cwise_log10.cpp
+  * Output: \verbinclude Cwise_log10.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log10">Math functions</a>, log()
+  */
+EIGEN_DEVICE_FUNC
+inline const Log10ReturnType
+log10() const
+{
+  return Log10ReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise base-2 logarithm of *this.
+  *
+  * This function computes the coefficient-wise base-2 logarithm.
+  *
+  */
+EIGEN_DEVICE_FUNC
+inline const Log2ReturnType
+log2() const
+{
+  return Log2ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise square root of *this.
   *
+  * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the
+  * unsupported module MatrixFunctions computes the matrix square root.
+  *
   * Example: \include Cwise_sqrt.cpp
   * Output: \verbinclude Cwise_sqrt.out
   *
-  * \sa rsqrt(), pow(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+inline const SqrtReturnType
 sqrt() const
 {
-  return derived();
+  return SqrtReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise reciprocal square root of *this.
+/** \returns an expression of the coefficient-wise inverse square root of *this.
   *
-  * \sa sqrt(), pow(), square()
+  * This function computes the coefficient-wise inverse square root.
+  *
+  * Example: \include Cwise_sqrt.cpp
+  * Output: \verbinclude Cwise_sqrt.out
+  *
+  * \sa pow(), square()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
+inline const RsqrtReturnType
 rsqrt() const
 {
-  return derived();
+  return RsqrtReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise signum of *this.
+  *
+  * This function computes the coefficient-wise signum.
+  *
+  * Example: \include Cwise_sign.cpp
+  * Output: \verbinclude Cwise_sign.out
+  *
+  * \sa pow(), square()
+  */
+EIGEN_DEVICE_FUNC
+inline const SignReturnType
+sign() const
+{
+  return SignReturnType(derived());
 }
 
 
 /** \returns an expression of the coefficient-wise cosine of *this.
   *
+  * This function computes the coefficient-wise cosine. The function MatrixBase::cos() in the
+  * unsupported module MatrixFunctions computes the matrix cosine.
+  *
   * Example: \include Cwise_cos.cpp
   * Output: \verbinclude Cwise_cos.out
   *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cos">Math functions</a>, sin(), acos()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived>
+inline const CosReturnType
 cos() const
 {
-  return derived();
+  return CosReturnType(derived());
 }
 
 
 /** \returns an expression of the coefficient-wise sine of *this.
   *
+  * This function computes the coefficient-wise sine. The function MatrixBase::sin() in the
+  * unsupported module MatrixFunctions computes the matrix sine.
+  *
   * Example: \include Cwise_sin.cpp
   * Output: \verbinclude Cwise_sin.out
   *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sin">Math functions</a>, cos(), asin()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived>
+inline const SinReturnType
 sin() const
 {
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise arc cosine of *this.
-  *
-  * Example: \include Cwise_acos.cpp
-  * Output: \verbinclude Cwise_acos.out
-  *
-  * \sa cos(), asin()
-  */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived>
-acos() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise arc sine of *this.
-  *
-  * Example: \include Cwise_asin.cpp
-  * Output: \verbinclude Cwise_asin.out
-  *
-  * \sa sin(), acos()
-  */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived>
-asin() const
-{
-  return derived();
+  return SinReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise tan of *this.
@@ -175,13 +262,13 @@
   * Example: \include Cwise_tan.cpp
   * Output: \verbinclude Cwise_tan.out
   *
-  * \sa cos(), sin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tan">Math functions</a>, cos(), sin()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_tan_op<Scalar>, Derived>
+inline const TanReturnType
 tan() const
 {
-  return derived();
+  return TanReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise arc tan of *this.
@@ -189,42 +276,127 @@
   * Example: \include Cwise_atan.cpp
   * Output: \verbinclude Cwise_atan.out
   *
-  * \sa cos(), sin(), tan()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atan">Math functions</a>, tan(), asin(), acos()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_atan_op<Scalar>, Derived>
+inline const AtanReturnType
 atan() const
 {
-  return derived();
+  return AtanReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise hyperbolic tangent of *this.
+/** \returns an expression of the coefficient-wise arc cosine of *this.
+  *
+  * Example: \include Cwise_acos.cpp
+  * Output: \verbinclude Cwise_acos.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acos">Math functions</a>, cos(), asin()
+  */
+EIGEN_DEVICE_FUNC
+inline const AcosReturnType
+acos() const
+{
+  return AcosReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise arc sine of *this.
+  *
+  * Example: \include Cwise_asin.cpp
+  * Output: \verbinclude Cwise_asin.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asin">Math functions</a>, sin(), acos()
+  */
+EIGEN_DEVICE_FUNC
+inline const AsinReturnType
+asin() const
+{
+  return AsinReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise hyperbolic tan of *this.
   *
   * Example: \include Cwise_tanh.cpp
   * Output: \verbinclude Cwise_tanh.out
   *
-  * \sa cos(), sin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tanh">Math functions</a>, tan(), sinh(), cosh()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, Derived>
+inline const TanhReturnType
 tanh() const
 {
-  return derived();
+  return TanhReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise power of *this to the given exponent.
+/** \returns an expression of the coefficient-wise hyperbolic sin of *this.
   *
-  * Example: \include Cwise_pow.cpp
-  * Output: \verbinclude Cwise_pow.out
+  * Example: \include Cwise_sinh.cpp
+  * Output: \verbinclude Cwise_sinh.out
   *
-  * \sa exp(), log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sinh">Math functions</a>, sin(), tanh(), cosh()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-pow(const Scalar& exponent) const
+inline const SinhReturnType
+sinh() const
 {
-  return CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-          (derived(), internal::scalar_pow_op<Scalar>(exponent));
+  return SinhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise hyperbolic cos of *this.
+  *
+  * Example: \include Cwise_cosh.cpp
+  * Output: \verbinclude Cwise_cosh.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const CoshReturnType
+cosh() const
+{
+  return CoshReturnType(derived());
+}
+
+#if EIGEN_HAS_CXX11_MATH
+/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AtanhReturnType
+atanh() const
+{
+  return AtanhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AsinhReturnType
+asinh() const
+{
+  return AsinhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AcoshReturnType
+acosh() const
+{
+  return AcoshReturnType(derived());
+}
+#endif
+
+/** \returns an expression of the coefficient-wise logistic of *this.
+  */
+EIGEN_DEVICE_FUNC
+inline const LogisticReturnType
+logistic() const
+{
+  return LogisticReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise inverse of *this.
@@ -235,10 +407,10 @@
   * \sa operator/(), operator*()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+inline const InverseReturnType
 inverse() const
 {
-  return derived();
+  return InverseReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise square of *this.
@@ -246,13 +418,13 @@
   * Example: \include Cwise_square.cpp
   * Output: \verbinclude Cwise_square.out
   *
-  * \sa operator/(), operator*(), abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_squareE">Math functions</a>, abs2(), cube(), pow()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+inline const SquareReturnType
 square() const
 {
-  return derived();
+  return SquareReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise cube of *this.
@@ -260,35 +432,265 @@
   * Example: \include Cwise_cube.cpp
   * Output: \verbinclude Cwise_cube.out
   *
-  * \sa square(), pow()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cube">Math functions</a>, square(), pow()
   */
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+inline const CubeReturnType
 cube() const
 {
-  return derived();
+  return CubeReturnType(derived());
 }
 
-#if 0
-// TODO(b/140237672) upstream discarded deprecated std::binder1st/2nd
+/** \returns an expression of the coefficient-wise rint of *this.
+  *
+  * Example: \include Cwise_rint.cpp
+  * Output: \verbinclude Cwise_rint.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_rint">Math functions</a>, ceil(), floor()
+  */
+EIGEN_DEVICE_FUNC
+inline const RintReturnType
+rint() const
+{
+  return RintReturnType(derived());
+}
 
-#define EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(METHOD_NAME,FUNCTOR) \
-  EIGEN_DEVICE_FUNC \
-  inline const CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
-  METHOD_NAME(const Scalar& s) const { \
-    return CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
-            (derived(), std::bind2nd(FUNCTOR<Scalar>(), s)); \
-  } \
-  friend inline const CwiseUnaryOp<std::binder1st<FUNCTOR<Scalar> >, const Derived> \
-  METHOD_NAME(const Scalar& s, const Derived& d) { \
-    return CwiseUnaryOp<std::binder1st<FUNCTOR<Scalar> >, const Derived> \
-            (d, std::bind1st(FUNCTOR<Scalar>(), s)); \
-  }
+/** \returns an expression of the coefficient-wise round of *this.
+  *
+  * Example: \include Cwise_round.cpp
+  * Output: \verbinclude Cwise_round.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_round">Math functions</a>, ceil(), floor()
+  */
+EIGEN_DEVICE_FUNC
+inline const RoundReturnType
+round() const
+{
+  return RoundReturnType(derived());
+}
 
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator==,  std::equal_to)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator!=,  std::not_equal_to)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<,   std::less)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<=,  std::less_equal)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>,   std::greater)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>=,  std::greater_equal)
-#endif
+/** \returns an expression of the coefficient-wise floor of *this.
+  *
+  * Example: \include Cwise_floor.cpp
+  * Output: \verbinclude Cwise_floor.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_floor">Math functions</a>, ceil(), round()
+  */
+EIGEN_DEVICE_FUNC
+inline const FloorReturnType
+floor() const
+{
+  return FloorReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise ceil of *this.
+  *
+  * Example: \include Cwise_ceil.cpp
+  * Output: \verbinclude Cwise_ceil.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ceil">Math functions</a>, floor(), round()
+  */
+EIGEN_DEVICE_FUNC
+inline const CeilReturnType
+ceil() const
+{
+  return CeilReturnType(derived());
+}
+
+template<int N> struct ShiftRightXpr {
+  typedef CwiseUnaryOp<internal::scalar_shift_right_op<Scalar, N>, const Derived> Type;
+};
+
+/** \returns an expression of \c *this with the \a Scalar type arithmetically
+  * shifted right by \a N bit positions.
+  *
+  * The template parameter \a N specifies the number of bit positions to shift.
+  * 
+  * \sa shiftLeft()
+  */
+template<int N>
+EIGEN_DEVICE_FUNC
+typename ShiftRightXpr<N>::Type
+shiftRight() const
+{
+  return typename ShiftRightXpr<N>::Type(derived());
+}
+
+
+template<int N> struct ShiftLeftXpr {
+  typedef CwiseUnaryOp<internal::scalar_shift_left_op<Scalar, N>, const Derived> Type;
+};
+
+/** \returns an expression of \c *this with the \a Scalar type logically
+  * shifted left by \a N bit positions.
+  *
+  * The template parameter \a N specifies the number of bit positions to shift.
+  *
+  * \sa shiftRight()
+  */
+template<int N>
+EIGEN_DEVICE_FUNC
+typename ShiftLeftXpr<N>::Type
+shiftLeft() const
+{
+  return typename ShiftLeftXpr<N>::Type(derived());
+}
+
+/** \returns an expression of the coefficient-wise isnan of *this.
+  *
+  * Example: \include Cwise_isNaN.cpp
+  * Output: \verbinclude Cwise_isNaN.out
+  *
+  * \sa isfinite(), isinf()
+  */
+EIGEN_DEVICE_FUNC
+inline const IsNaNReturnType
+isNaN() const
+{
+  return IsNaNReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise isinf of *this.
+  *
+  * Example: \include Cwise_isInf.cpp
+  * Output: \verbinclude Cwise_isInf.out
+  *
+  * \sa isnan(), isfinite()
+  */
+EIGEN_DEVICE_FUNC
+inline const IsInfReturnType
+isInf() const
+{
+  return IsInfReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise isfinite of *this.
+  *
+  * Example: \include Cwise_isFinite.cpp
+  * Output: \verbinclude Cwise_isFinite.out
+  *
+  * \sa isnan(), isinf()
+  */
+EIGEN_DEVICE_FUNC
+inline const IsFiniteReturnType
+isFinite() const
+{
+  return IsFiniteReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise ! operator of *this
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_not.cpp
+  * Output: \verbinclude Cwise_boolean_not.out
+  *
+  * \sa operator!=()
+  */
+EIGEN_DEVICE_FUNC
+inline const BooleanNotReturnType
+operator!() const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return BooleanNotReturnType(derived());
+}
+
+
+// --- SpecialFunctions module ---
+
+typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
+typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
+
+/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_lgamma">Math functions</a>, digamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const LgammaReturnType
+lgamma() const
+{
+  return LgammaReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of digamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_digamma">Math functions</a>, Eigen::digamma(), Eigen::polygamma(), lgamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const DigammaReturnType
+digamma() const
+{
+  return DigammaReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Gauss error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erf">Math functions</a>, erfc()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfReturnType
+erf() const
+{
+  return ErfReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erfc">Math functions</a>, erf()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfcReturnType
+erfc() const
+{
+  return ErfcReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function
+  * function of *this.
+  *
+  * \specialfunctions_module
+  * 
+  * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the
+  * Gaussian probability density function (integrated from minus infinity to x) is equal to y.
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of ndtri(T) for any scalar type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>
+  */
+EIGEN_DEVICE_FUNC
+inline const NdtriReturnType
+ndtri() const
+{
+  return NdtriReturnType(derived());
+}

diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 9b7fdc4..63a52a6 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h

@@ -8,988 +8,1435 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal expression type of a column */
+/// \internal expression type of a column */
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
-/** \internal expression type of a row */
+/// \internal expression type of a row */
 typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
 typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns */
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr;
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows */
 typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
 typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns */
 template<int N> struct NColsBlockXpr { typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
 template<int N> struct ConstNColsBlockXpr { typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows */
 template<int N> struct NRowsBlockXpr { typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
 template<int N> struct ConstNRowsBlockXpr { typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
+/// \internal expression of a block */
+typedef Block<Derived> BlockXpr;
+typedef const Block<const Derived> ConstBlockXpr;
+/// \internal expression of a block of fixed sizes */
+template<int Rows, int Cols> struct FixedBlockXpr { typedef Block<Derived,Rows,Cols> Type; };
+template<int Rows, int Cols> struct ConstFixedBlockXpr { typedef Block<const Derived,Rows,Cols> Type; };
 
 typedef VectorBlock<Derived> SegmentReturnType;
 typedef const VectorBlock<const Derived> ConstSegmentReturnType;
 template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
 template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
 
+/// \internal inner-vector
+typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
+typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
+
+/// \internal set of inner-vectors
+typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns a dynamic-size expression of a block in *this.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  * \param blockRows the number of rows in the block
-  * \param blockCols the number of columns in the block
-  *
-  * Example: \include MatrixBase_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline Block<Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols)
+/// \returns an expression of a block in \c *this with either dynamic or fixed sizes.
+///
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in the block, specified at either run-time or compile-time
+/// \param  blockCols number of columns in the block, specified at either run-time or compile-time
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int_int_int.out
+///
+/// \newin{3.4}:
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic.
+/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols:
+/// \code
+/// mat.block(i,j,fix<NRows>,cols)
+/// \endcode
+///
+/// This function thus fully covers the features offered by the following overloads block<NRows,NCols>(Index, Index),
+/// and block<NRows,NCols>(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids
+/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code.
+///
+/// but with less redundancy and more consistency as it does not modify the argument order
+/// and seamlessly enable hybrid fixed/dynamic sizes.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, fix, fix<N>(int)
+///
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols)
 {
-  return Block<Derived>(derived(), startRow, startCol, blockRows, blockCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type(
+            derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols));
 }
 
-/** This is the const version of block(Index,Index,Index,Index). */
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
+/// This is the const version of block(Index,Index,NRowsType,NColsType)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const
 {
-  return Block<const Derived>(derived(), startRow, startCol, blockRows, blockCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type(
+            derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols));
 }
 
 
 
-
-/** \returns a dynamic-size expression of a top-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline Block<Derived> topRightCorner(Index cRows, Index cCols)
+/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+topRightCorner(NRowsType cRows, NColsType cCols)
 {
-  return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** This is the const version of topRightCorner(Index, Index).*/
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived> topRightCorner(Index cRows, Index cCols) const
+/// This is the const version of topRightCorner(NRowsType, NColsType).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+topRightCorner(NRowsType cRows, NColsType cCols) const
 {
-  return Block<const Derived>(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** \returns an expression of a fixed-size top-right corner of *this.
-  *
-  * \tparam CRows the number of rows in the corner
-  * \tparam CCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
-  *
-  * \sa class Block, block<int,int>(Index,Index)
-  */
+/// \returns an expression of a fixed-size top-right corner of \c *this.
+///
+/// \tparam CRows the number of rows in the corner
+/// \tparam CCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block<int,int>(Index,Index)
+///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline Block<Derived, CRows, CCols> topRightCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** This is the const version of topRightCorner<int, int>().*/
+/// This is the const version of topRightCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived, CRows, CCols> topRightCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
 {
-  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - CCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** \returns an expression of a top-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a top-right corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of topRightCorner<int, int>(Index, Index).*/
+/// This is the const version of topRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 
 
-/** \returns a dynamic-size expression of a top-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline Block<Derived> topLeftCorner(Index cRows, Index cCols)
+/// \returns an expression of a top-left corner of \c *this  with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+topLeftCorner(NRowsType cRows, NColsType cCols)
 {
-  return Block<Derived>(derived(), 0, 0, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** This is the const version of topLeftCorner(Index, Index).*/
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived> topLeftCorner(Index cRows, Index cCols) const
+/// This is the const version of topLeftCorner(Index, Index).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+topLeftCorner(NRowsType cRows, NColsType cCols) const
 {
-  return Block<const Derived>(derived(), 0, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** \returns an expression of a fixed-size top-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size top-left corner of \c *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline Block<Derived, CRows, CCols> topLeftCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, 0);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** This is the const version of topLeftCorner<int, int>().*/
+/// This is the const version of topLeftCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived, CRows, CCols> topLeftCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
 {
-  return Block<const Derived, CRows, CCols>(derived(), 0, 0);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** \returns an expression of a top-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a top-left corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of topLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 
 
-/** \returns a dynamic-size expression of a bottom-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline Block<Derived> bottomRightCorner(Index cRows, Index cCols)
+/// \returns an expression of a bottom-right corner of \c *this  with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+bottomRightCorner(NRowsType cRows, NColsType cCols)
 {
-  return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** This is the const version of bottomRightCorner(Index, Index).*/
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived> bottomRightCorner(Index cRows, Index cCols) const
+/// This is the const version of bottomRightCorner(NRowsType, NColsType).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+bottomRightCorner(NRowsType cRows, NColsType cCols) const
 {
-  return Block<const Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** \returns an expression of a fixed-size bottom-right corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-right corner of \c *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline Block<Derived, CRows, CCols> bottomRightCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>().*/
+/// This is the const version of bottomRightCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived, CRows, CCols> bottomRightCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
 {
-  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** \returns an expression of a bottom-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a bottom-right corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 
 
-/** \returns a dynamic-size expression of a bottom-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline Block<Derived> bottomLeftCorner(Index cRows, Index cCols)
+/// \returns an expression of a bottom-left corner of \c *this  with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+bottomLeftCorner(NRowsType cRows, NColsType cCols)
 {
-  return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), 0,
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** This is the const version of bottomLeftCorner(Index, Index).*/
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived> bottomLeftCorner(Index cRows, Index cCols) const
+/// This is the const version of bottomLeftCorner(NRowsType, NColsType).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename ConstFixedBlockXpr<...,...>::Type
+#endif
+bottomLeftCorner(NRowsType cRows, NColsType cCols) const
 {
-  return Block<const Derived>(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), 0,
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/** \returns an expression of a fixed-size bottom-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-left corner of \c *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline Block<Derived, CRows, CCols> bottomLeftCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>().*/
+/// This is the const version of bottomLeftCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived, CRows, CCols> bottomLeftCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
 {
-  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, 0);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** \returns an expression of a bottom-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a bottom-left corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols)
+EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
 {
-  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols) const
+EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
 {
-  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_topRows_int.cpp
-  * Output: \verbinclude MatrixBase_topRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline RowsBlockXpr topRows(Index n)
+/// \returns a block consisting of the top rows of \c *this.
+///
+/// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+///
+/// Example: \include MatrixBase_topRows_int.cpp
+/// Output: \verbinclude MatrixBase_topRows_int.out
+///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+typename NRowsBlockXpr<...>::Type
+#endif
+topRows(NRowsType n)
 {
-  return RowsBlockXpr(derived(), 0, 0, n, cols());
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(n), cols());
 }
 
-/** This is the const version of topRows(Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstRowsBlockXpr topRows(Index n) const
+/// This is the const version of topRows(NRowsType).
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+const typename ConstNRowsBlockXpr<...>::Type
+#endif
+topRows(NRowsType n) const
 {
-  return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(n), cols());
 }
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_topRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_topRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of \c *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_topRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_topRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NRowsBlockXpr<N>::Type topRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows<int>().*/
+/// This is the const version of topRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_bottomRows_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline RowsBlockXpr bottomRows(Index n)
+/// \returns a block consisting of the bottom rows of \c *this.
+///
+/// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+///
+/// Example: \include MatrixBase_bottomRows_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRows_int.out
+///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+typename NRowsBlockXpr<...>::Type
+#endif
+bottomRows(NRowsType n)
 {
-  return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
 }
 
-/** This is the const version of bottomRows(Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstRowsBlockXpr bottomRows(Index n) const
+/// This is the const version of bottomRows(NRowsType).
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+const typename ConstNRowsBlockXpr<...>::Type
+#endif
+bottomRows(NRowsType n) const
 {
-  return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
 }
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_bottomRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_bottomRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of \c *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_bottomRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_bottomRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows<int>().*/
+/// This is the const version of bottomRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block
-  *
-  * Example: \include DenseBase_middleRows_int.cpp
-  * Output: \verbinclude DenseBase_middleRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline RowsBlockXpr middleRows(Index startRow, Index n)
+/// \returns a block consisting of a range of rows of \c *this.
+///
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+///
+/// Example: \include DenseBase_middleRows_int.cpp
+/// Output: \verbinclude DenseBase_middleRows_int.out
+///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+typename NRowsBlockXpr<...>::Type
+#endif
+middleRows(Index startRow, NRowsType n)
 {
-  return RowsBlockXpr(derived(), startRow, 0, n, cols());
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), startRow, 0, internal::get_runtime_value(n), cols());
 }
 
-/** This is the const version of middleRows(Index,Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
+/// This is the const version of middleRows(Index,NRowsType).
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+const typename ConstNRowsBlockXpr<...>::Type
+#endif
+middleRows(Index startRow, NRowsType n) const
 {
-  return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), startRow, 0, internal::get_runtime_value(n), cols());
 }
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleRows.cpp
-  * Output: \verbinclude DenseBase_template_int_middleRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of \c *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleRows.cpp
+/// Output: \verbinclude DenseBase_template_int_middleRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows<int>().*/
+/// This is the const version of middleRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_leftCols_int.cpp
-  * Output: \verbinclude MatrixBase_leftCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline ColsBlockXpr leftCols(Index n)
+/// \returns a block consisting of the left columns of \c *this.
+///
+/// \param n the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_leftCols_int.cpp
+/// Output: \verbinclude MatrixBase_leftCols_int.out
+///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename NColsBlockXpr<...>::Type
+#endif
+leftCols(NColsType n)
 {
-  return ColsBlockXpr(derived(), 0, 0, rows(), n);
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, rows(), internal::get_runtime_value(n));
 }
 
-/** This is the const version of leftCols(Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstColsBlockXpr leftCols(Index n) const
+/// This is the const version of leftCols(NColsType).
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstNColsBlockXpr<...>::Type
+#endif
+leftCols(NColsType n) const
 {
-  return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, rows(), internal::get_runtime_value(n));
 }
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_leftCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_leftCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of \c *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_leftCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_leftCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NColsBlockXpr<N>::Type leftCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols<int>().*/
+/// This is the const version of leftCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_rightCols_int.cpp
-  * Output: \verbinclude MatrixBase_rightCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline ColsBlockXpr rightCols(Index n)
+/// \returns a block consisting of the right columns of \c *this.
+///
+/// \param n the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_rightCols_int.cpp
+/// Output: \verbinclude MatrixBase_rightCols_int.out
+///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename NColsBlockXpr<...>::Type
+#endif
+rightCols(NColsType n)
 {
-  return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
 }
 
-/** This is the const version of rightCols(Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstColsBlockXpr rightCols(Index n) const
+/// This is the const version of rightCols(NColsType).
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstNColsBlockXpr<...>::Type
+#endif
+rightCols(NColsType n) const
 {
-  return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
 }
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_rightCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_rightCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of \c *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_rightCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_rightCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NColsBlockXpr<N>::Type rightCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols<int>().*/
+/// This is the const version of rightCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \param startCol the index of the first column in the block
-  * \param numCols the number of columns in the block
-  *
-  * Example: \include DenseBase_middleCols_int.cpp
-  * Output: \verbinclude DenseBase_middleCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline ColsBlockXpr middleCols(Index startCol, Index numCols)
+/// \returns a block consisting of a range of columns of \c *this.
+///
+/// \param startCol the index of the first column in the block
+/// \param numCols the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include DenseBase_middleCols_int.cpp
+/// Output: \verbinclude DenseBase_middleCols_int.out
+///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename NColsBlockXpr<...>::Type
+#endif
+middleCols(Index startCol, NColsType numCols)
 {
-  return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
 }
 
-/** This is the const version of middleCols(Index,Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
+/// This is the const version of middleCols(Index,NColsType).
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstNColsBlockXpr<...>::Type
+#endif
+middleCols(Index startCol, NColsType numCols) const
 {
-  return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
 }
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param startCol the index of the first column in the block
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleCols.cpp
-  * Output: \verbinclude DenseBase_template_int_middleCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of \c *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param startCol the index of the first column in the block
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleCols.cpp
+/// Output: \verbinclude DenseBase_template_int_middleCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
-/** This is the const version of middleCols<int>().*/
+/// This is the const version of middleCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 
 
-/** \returns a fixed-size expression of a block in *this.
-  *
-  * The template parameters \a BlockRows and \a BlockCols are the number of
-  * rows and columns in the block.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  *
-  * Example: \include MatrixBase_block_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int.out
-  *
-  * \note since block is a templated member, the keyword template has to be used
-  * if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int BlockRows, int BlockCols>
-EIGEN_DEVICE_FUNC
-inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol)
+/// \returns a fixed-size expression of a block of \c *this.
+///
+/// The template parameters \a NRows and \a NCols are the number of
+/// rows and columns in the block.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+///
+/// Example: \include MatrixBase_block_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int.out
+///
+/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence:
+/// \code
+/// mat.template block<NRows,NCols>(i,j)  <-->  mat.block(i,j,fix<NRows>,fix<NCols>)
+/// \endcode
+///
+/// \note since block is a templated member, the keyword template has to be used
+/// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
 {
-  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
+  return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/** This is the const version of block<>(Index, Index). */
-template<int BlockRows, int BlockCols>
-EIGEN_DEVICE_FUNC
-inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol) const
+/// This is the const version of block<>(Index, Index). */
+template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
 {
-  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
+  return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/** \returns an expression of a block in *this.
-  *
-  * \tparam BlockRows number of rows in block as specified at compile-time
-  * \tparam BlockCols number of columns in block as specified at compile-time
-  * \param  startRow  the first row in the block
-  * \param  startCol  the first column in the block
-  * \param  blockRows number of rows in block as specified at run-time
-  * \param  blockCols number of columns in block as specified at run-time
-  *
-  * This function is mainly useful for blocks where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a blockRows should equal \a BlockRows unless
-  * \a BlockRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int BlockRows, int BlockCols>
-inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol, 
+/// \returns an expression of a block of \c *this.
+///
+/// \tparam NRows number of rows in block as specified at compile-time
+/// \tparam NCols number of columns in block as specified at compile-time
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in block as specified at run-time
+/// \param  blockCols number of columns in block as specified at run-time
+///
+/// This function is mainly useful for blocks where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a blockRows should equal \a NRows unless
+/// \a NRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
+///
+/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
+/// \code
+/// mat.template block<NRows,NCols>(i,j,rows,cols)     <-->  mat.block(i,j,fix<NRows>(rows),fix<NCols>(cols))
+/// \endcode
+/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes:
+/// \code
+/// mat.template block<Dynamic,NCols>(i,j,rows,NCols)  <-->  mat.block(i,j,rows,fix<NCols>)
+/// \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                   Index blockRows, Index blockCols)
 {
-  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
+  return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** This is the const version of block<>(Index, Index, Index, Index). */
-template<int BlockRows, int BlockCols>
-inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol,
+/// This is the const version of block<>(Index, Index, Index, Index).
+template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                               Index blockRows, Index blockCols) const
 {
-  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
+  return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_col.cpp
-  * Output: \verbinclude MatrixBase_col.out
-  *
+/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_col.cpp
+/// Output: \verbinclude MatrixBase_col.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+/**
   * \sa row(), class Block */
-EIGEN_DEVICE_FUNC
-inline ColXpr col(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
-/** This is the const version of col(). */
-EIGEN_DEVICE_FUNC
-inline ConstColXpr col(Index i) const
+/// This is the const version of col().
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
 }
 
-/** \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_row.cpp
-  * Output: \verbinclude MatrixBase_row.out
-  *
+/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_row.cpp
+/// Output: \verbinclude MatrixBase_row.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+/**
   * \sa col(), class Block */
-EIGEN_DEVICE_FUNC
-inline RowXpr row(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
-/** This is the const version of row(). */
-EIGEN_DEVICE_FUNC
-inline ConstRowXpr row(Index i) const
+/// This is the const version of row(). */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
 }
 
-/** \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
-  *
-  * \only_for_vectors
-  *
-  * \param start the first coefficient in the segment
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_segment_int_int.cpp
-  * Output: \verbinclude MatrixBase_segment_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, segment(Index)
-  */
-EIGEN_DEVICE_FUNC
-inline SegmentReturnType segment(Index start, Index n)
+/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes.
+///
+/// \only_for_vectors
+///
+/// \param start the first coefficient in the segment
+/// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
+///
+/// Example: \include MatrixBase_segment_int_int.cpp
+/// Output: \verbinclude MatrixBase_segment_int_int.out
+///
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block
+///
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+typename FixedSegmentReturnType<...>::Type
+#endif
+segment(Index start, NType n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), start, n);
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), start, internal::get_runtime_value(n));
 }
 
 
-/** This is the const version of segment(Index,Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstSegmentReturnType segment(Index start, Index n) const
+/// This is the const version of segment(Index,NType).
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+segment(Index start, NType n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), start, n);
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), start, internal::get_runtime_value(n));
 }
 
-/** \returns a dynamic-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_start_int.cpp
-  * Output: \verbinclude MatrixBase_start_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline SegmentReturnType head(Index n)
+/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
+///
+/// Example: \include MatrixBase_start_int.cpp
+/// Output: \verbinclude MatrixBase_start_int.out
+///
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+typename FixedSegmentReturnType<...>::Type
+#endif
+head(NType n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), 0, n);
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+              (derived(), 0, internal::get_runtime_value(n));
 }
 
-/** This is the const version of head(Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstSegmentReturnType head(Index n) const
+/// This is the const version of head(NType).
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+head(NType n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), 0, n);
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), 0, internal::get_runtime_value(n));
 }
 
-/** \returns a dynamic-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_end_int.cpp
-  * Output: \verbinclude MatrixBase_end_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-EIGEN_DEVICE_FUNC
-inline SegmentReturnType tail(Index n)
+/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
+///
+/// Example: \include MatrixBase_end_int.cpp
+/// Output: \verbinclude MatrixBase_end_int.out
+///
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+typename FixedSegmentReturnType<...>::Type
+#endif
+tail(NType n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), this->size() - n, n);
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
 }
 
-/** This is the const version of tail(Index).*/
-EIGEN_DEVICE_FUNC
-inline ConstSegmentReturnType tail(Index n) const
+/// This is the const version of tail(Index).
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+tail(NType n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), this->size() - n, n);
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
 }
 
-/** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param start the index of the first element in the segment
-  * \param n the number of coefficients in the segment as specified at compile-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_segment.cpp
-  * Output: \verbinclude MatrixBase_template_int_segment.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param start the index of the first element in the segment
+/// \param n the number of coefficients in the segment as specified at compile-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_segment.cpp
+/// Output: \verbinclude MatrixBase_template_int_segment.out
+///
+/// \sa segment(Index,NType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** This is the const version of segment<int>(Index).*/
+/// This is the const version of segment<int>(Index).
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** \returns a fixed-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_start.cpp
-  * Output: \verbinclude MatrixBase_template_int_start.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the first coefficients of \c *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_start.cpp
+/// Output: \verbinclude MatrixBase_template_int_start.out
+///
+/// \sa head(NType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedSegmentReturnType<N>::Type head(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** This is the const version of head<int>().*/
+/// This is the const version of head<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** \returns a fixed-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_end.cpp
-  * Output: \verbinclude MatrixBase_template_int_end.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the last coefficients of \c *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_end.cpp
+/// Output: \verbinclude MatrixBase_template_int_end.out
+///
+/// \sa tail(NType), class Block
+///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
 
-/** This is the const version of tail<int>.*/
+/// This is the const version of tail<int>.
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+InnerVectorReturnType innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const ConstInnerVectorReturnType innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+InnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize)
+{
+  return Block<Derived,Dynamic,Dynamic,true>(derived(),
+                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const ConstInnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize) const
+{
+  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/** \returns the i-th subvector (column or vector) according to the \c Direction
+  * \sa subVectors()
+  */
+template<DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type
+subVector(Index i)
+{
+  return typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type(derived(),i);
+}
+
+/** This is the const version of subVector(Index) */
+template<DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type
+subVector(Index i) const
+{
+  return typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type(derived(),i);
+}
+
+/** \returns the number of subvectors (rows or columns) in the direction \c Direction
+  * \sa subVector(Index)
+  */
+template<DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+Index subVectors() const
+{ return (Direction==Vertical)?cols():rows(); }

diff --git a/Eigen/src/plugins/CommonCwiseBinaryOps.h b/Eigen/src/plugins/CommonCwiseBinaryOps.h
index 23c968d..8b6730e 100644
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -16,7 +16,7 @@
   *
   * \sa class CwiseBinaryOp, operator-=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
+EIGEN_MAKE_CWISE_BINARY_OP(operator-,difference)
 
 /** \returns an expression of the sum of \c *this and \a other
   *
@@ -24,7 +24,7 @@
   *
   * \sa class CwiseBinaryOp, operator+=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
+EIGEN_MAKE_CWISE_BINARY_OP(operator+,sum)
 
 /** \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
   *
@@ -44,3 +44,72 @@
 {
   return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
 }
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator*,product)
+#else
+/** \returns an expression of \c *this scaled by the scalar factor \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_product_op<Scalar,T>,Derived,Constant<T> > operator*(const T& scalar) const;
+/** \returns an expression of \a expr scaled by the scalar factor \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_product_op<T,Scalar>,Constant<T>,Derived> operator*(const T& scalar, const StorageBaseType& expr);
+#endif
+
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/,quotient)
+#else
+/** \returns an expression of \c *this divided by the scalar value \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,T>,Derived,Constant<T> > operator/(const T& scalar) const;
+#endif
+
+/** \returns an expression of the coefficient-wise boolean \b and operator of \c *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_and.cpp
+  * Output: \verbinclude Cwise_boolean_and.out
+  *
+  * \sa operator||(), select()
+  */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
+
+/** \returns an expression of the coefficient-wise boolean \b or operator of \c *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_or.cpp
+  * Output: \verbinclude Cwise_boolean_or.out
+  *
+  * \sa operator&&(), select()
+  */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
+}

diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 82f6c46..5418dc4 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h

@@ -12,10 +12,6 @@
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal Represents a scalar multiple of an expression */
-typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> ScalarMultipleReturnType;
-/** \internal Represents a quotient of an expression by a scalar*/
-typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> ScalarQuotient1ReturnType;
 /** \internal the return type of conjugate() */
 typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                     const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
@@ -36,74 +32,43 @@
 /** \internal the return type of imag() */
 typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
 
+typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> NegativeReturnType;
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns an expression of the opposite of \c *this
-  */
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
+///
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_opposite_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator-() const { return derived(); }
+inline const NegativeReturnType
+operator-() const { return NegativeReturnType(derived()); }
 
 
-/** \returns an expression of \c *this scaled by the scalar factor \a scalar */
-EIGEN_DEVICE_FUNC
-inline const ScalarMultipleReturnType
-operator*(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
-    (derived(), internal::scalar_multiple_op<Scalar>(scalar));
-}
+template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
 
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
-#endif
-
-/** \returns an expression of \c *this divided by the scalar value \a scalar */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator/(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
-    (derived(), internal::scalar_quotient1_op<Scalar>(scalar));
-}
-
-/** Overloaded for efficient real matrix times complex scalar value */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-    (*static_cast<const Derived*>(this), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
-}
-
-EIGEN_DEVICE_FUNC
-inline friend const ScalarMultipleReturnType
-operator*(const Scalar& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-EIGEN_DEVICE_FUNC
-inline friend const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
+/// \returns an expression of \c *this with the \a Scalar type casted to
+/// \a NewScalar.
+///
+/// The template parameter \a NewScalar is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
 template<typename NewType>
 EIGEN_DEVICE_FUNC
-typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<typename internal::traits<Derived>::Scalar, NewType>, const Derived> >::type
+typename CastXpr<NewType>::Type
 cast() const
 {
-  return derived();
+  return typename CastXpr<NewType>::Type(derived());
 }
 
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa adjoint() */
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
 EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
@@ -111,39 +76,59 @@
   return ConjugateReturnType(derived());
 }
 
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa conjugate()
+template<bool Cond>
+EIGEN_DEVICE_FUNC
+inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type
+conjugateIf() const
+{
+  typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType;
+  return ReturnType(derived());
+}
+
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline RealReturnType
-real() const { return derived(); }
+real() const { return RealReturnType(derived()); }
 
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns an read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline const ImagReturnType
-imag() const { return derived(); }
+imag() const { return ImagReturnType(derived()); }
 
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in]  func  Functor implementing the unary operator
-  * \tparam  CustomUnaryOp Type of \a func
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in]  func  Functor implementing the unary operator
+/// \tparam  CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities, for instance it may contain a state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomUnaryOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -152,17 +137,19 @@
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
 
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
+///
+/// \sa unaryExpr, binaryExpr class CwiseUnaryOp
+///
 template<typename CustomViewOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -171,16 +158,20 @@
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
 
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
-real() { return derived(); }
+real() { return NonConstRealReturnType(derived()); }
 
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
-imag() { return derived(); }
+imag() { return NonConstImagReturnType(derived()); }

diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h
new file mode 100644
index 0000000..5bfb19a
--- /dev/null
+++ b/Eigen/src/plugins/IndexedViewMethods.h

@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if !defined(EIGEN_PARSED_BY_DOXYGEN)
+
+// This file is automatically included twice to generate const and non-const versions
+
+#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#define EIGEN_INDEXED_VIEW_METHOD_CONST const
+#define EIGEN_INDEXED_VIEW_METHOD_TYPE  ConstIndexedViewType
+#else
+#define EIGEN_INDEXED_VIEW_METHOD_CONST
+#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType
+#endif
+
+#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+protected:
+
+// define some aliases to ease readability
+
+template<typename Indices>
+struct IvcRowType : public internal::IndexedViewCompatibleType<Indices,RowsAtCompileTime> {};
+
+template<typename Indices>
+struct IvcColType : public internal::IndexedViewCompatibleType<Indices,ColsAtCompileTime> {};
+
+template<typename Indices>
+struct IvcType : public internal::IndexedViewCompatibleType<Indices,SizeAtCompileTime> {};
+
+typedef typename internal::IndexedViewCompatibleType<Index,1>::type IvcIndex;
+
+template<typename Indices>
+typename IvcRowType<Indices>::type
+ivcRow(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,RowsAtCompileTime>(derived().rows()),Specialized);
+}
+
+template<typename Indices>
+typename IvcColType<Indices>::type
+ivcCol(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,ColsAtCompileTime>(derived().cols()),Specialized);
+}
+
+template<typename Indices>
+typename IvcColType<Indices>::type
+ivcSize(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,SizeAtCompileTime>(derived().size()),Specialized);
+}
+
+public:
+
+#endif
+
+template<typename RowIndices, typename ColIndices>
+struct EIGEN_INDEXED_VIEW_METHOD_TYPE {
+  typedef IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,
+                      typename IvcRowType<RowIndices>::type,
+                      typename IvcColType<ColIndices>::type> type;
+};
+
+// This is the generic version
+
+template<typename RowIndices, typename ColIndices>
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
+  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsIndexedView,
+  typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type >::type
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type
+            (derived(), ivcRow(rowIndices), ivcCol(colIndices));
+}
+
+// The following overload returns a Block<> object
+
+template<typename RowIndices, typename ColIndices>
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
+  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsBlock,
+  typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType>::type
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  typedef typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType BlockType;
+  typename IvcRowType<RowIndices>::type actualRowIndices = ivcRow(rowIndices);
+  typename IvcColType<ColIndices>::type actualColIndices = ivcCol(colIndices);
+  return BlockType(derived(),
+                   internal::first(actualRowIndices),
+                   internal::first(actualColIndices),
+                   internal::size(actualRowIndices),
+                   internal::size(actualColIndices));
+}
+
+// The following overload returns a Scalar
+
+template<typename RowIndices, typename ColIndices>
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
+  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsScalar,
+  CoeffReturnType >::type
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols()));
+}
+
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+// The following three overloads are needed to handle raw Index[N] arrays.
+
+template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
+IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
+operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
+                    (derived(), rowIndices, ivcCol(colIndices));
+}
+
+template<typename RowIndices, typename ColIndicesT, std::size_t ColIndicesN>
+IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type, const ColIndicesT (&)[ColIndicesN]>
+operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type,const ColIndicesT (&)[ColIndicesN]>
+                    (derived(), ivcRow(rowIndices), colIndices);
+}
+
+template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndicesT, std::size_t ColIndicesN>
+IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN], const ColIndicesT (&)[ColIndicesN]>
+operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],const ColIndicesT (&)[ColIndicesN]>
+                    (derived(), rowIndices, colIndices);
+}
+
+#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+// Overloads for 1D vectors/arrays
+
+template<typename Indices>
+typename internal::enable_if<
+  IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type
+operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type>
+            (derived(), IvcIndex(0), ivcCol(indices));
+}
+
+template<typename Indices>
+typename internal::enable_if<
+  (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type
+operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex>
+            (derived(), ivcRow(indices), IvcIndex(0));
+}
+
+template<typename Indices>
+typename internal::enable_if<
+  (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
+  VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type
+operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  typename IvcType<Indices>::type actualIndices = ivcSize(indices);
+  return VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value>
+            (derived(), internal::first(actualIndices), internal::size(actualIndices));
+}
+
+template<typename IndexType>
+typename internal::enable_if<symbolic::is_symbolic<IndexType>::value, CoeffReturnType >::type
+operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return Base::operator()(internal::eval_expr_given_size(id,size()));
+}
+
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+template<typename IndicesT, std::size_t IndicesN>
+typename internal::enable_if<IsRowMajor,
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]> >::type
+operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]>
+            (derived(), IvcIndex(0), indices);
+}
+
+template<typename IndicesT, std::size_t IndicesN>
+typename internal::enable_if<!IsRowMajor,
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex> >::type
+operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex>
+            (derived(), indices, IvcIndex(0));
+}
+
+#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+#undef EIGEN_INDEXED_VIEW_METHOD_CONST
+#undef EIGEN_INDEXED_VIEW_METHOD_TYPE
+
+#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#include "IndexedViewMethods.h"
+#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#endif
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+/**
+  * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively.
+  *
+  * Each parameter must either be:
+  *  - An integer indexing a single row or column
+  *  - Eigen::all indexing the full set of respective rows or columns in increasing order
+  *  - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions
+  *  - Any %Eigen's vector/array of integers or expressions
+  *  - Plain C arrays: \c int[N]
+  *  - And more generally any type exposing the following two member functions:
+  * \code
+  * <integral type> operator[](<integral type>) const;
+  * <integral type> size() const;
+  * \endcode
+  * where \c <integral \c type>  stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t).
+  *
+  * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc.
+  *
+  * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this
+  * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case
+  * when all arguments are either:
+  *  - An integer
+  *  - Eigen::all
+  *  - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N).
+  *
+  * Otherwise a more general IndexedView<Derived,RowIndices',ColIndices'> object will be returned, after conversion of the inputs
+  * to more suitable types \c RowIndices' and \c ColIndices'.
+  *
+  * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter.
+  *
+  * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients.
+  *
+  * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
+  */
+template<typename RowIndices, typename ColIndices>
+IndexedView_or_Block
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices);
+
+/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays
+  *
+  * \only_for_vectors
+  */
+template<typename Indices>
+IndexedView_or_VectorBlock
+operator()(const Indices& indices);
+
+#endif  // EIGEN_PARSED_BY_DOXYGEN

diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index 49430fb..514d83a 100644
--- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h

@@ -19,10 +19,10 @@
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)
 cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise == operator of *this and \a other
@@ -39,10 +39,10 @@
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
+inline const CwiseBinaryOp<numext::equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<numext::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise != operator of *this and \a other
@@ -59,10 +59,10 @@
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
+inline const CwiseBinaryOp<numext::not_equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<numext::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of *this and \a other
@@ -72,23 +72,39 @@
   *
   * \sa class CwiseBinaryOp, max()
   */
-template<typename OtherDerived>
+template<int NaNPropagation, typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
 cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return cwiseMin<PropagateFast>(other);
 }
 
 /** \returns an expression of the coefficient-wise min of *this and scalar \a other
   *
   * \sa class CwiseBinaryOp, min()
   */
+template<int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const ConstantReturnType>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const ConstantReturnType>
 cwiseMin(const Scalar &other) const
 {
-  return cwiseMin(Derived::Constant(rows(), cols(), other));
+  return cwiseMin<NaNPropagation>(Derived::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived, const ConstantReturnType>
+cwiseMin(const Scalar &other) const
+{
+  return cwiseMin<PropagateFast>(Derived::Constant(rows(), cols(), other));
 }
 
 /** \returns an expression of the coefficient-wise max of *this and \a other
@@ -98,23 +114,39 @@
   *
   * \sa class CwiseBinaryOp, min()
   */
-template<typename OtherDerived>
+template<int NaNPropagation, typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
 cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return cwiseMax<PropagateFast>(other);
 }
 
 /** \returns an expression of the coefficient-wise max of *this and scalar \a other
   *
   * \sa class CwiseBinaryOp, min()
   */
+template<int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const ConstantReturnType>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const ConstantReturnType>
 cwiseMax(const Scalar &other) const
 {
-  return cwiseMax(Derived::Constant(rows(), cols(), other));
+  return cwiseMax<NaNPropagation>(Derived::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived, const ConstantReturnType>
+cwiseMax(const Scalar &other) const
+{
+  return cwiseMax<PropagateFast>(Derived::Constant(rows(), cols(), other));
 }
 
 
@@ -133,24 +165,20 @@
   return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
-/** \returns an expression of the coefficient-wise == operator of \c *this and a
- * scalar \a s
- *
- * \warning this performs an exact comparison, which is generally a bad idea
- * with floating-point types. In order to check for equality between two vectors
- * or matrices with floating-point coefficients, it is generally a far better
- * idea to use a fuzzy comparison as provided by isApprox() and
- * isMuchSmallerThan().
- *
- * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
- */
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
+
+/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+  */
 EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>,
-                           const Derived,
-                           const ConstantReturnType>
-cwiseEqual(const Scalar& s) const {
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>,
-                       const Derived, const ConstantReturnType>(
-      derived(), Derived::Constant(rows(), cols(), s),
-      internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+inline const CwiseScalarEqualReturnType
+cwiseEqual(const Scalar& s) const
+{
+  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,Scalar,internal::cmp_EQ>());
 }

diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index a2b1446..0514d8f 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h

@@ -8,48 +8,88 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// This file is a base class plugin containing matrix specifics coefficient wise functions.
+// This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
+// This include MatrixBase and SparseMatrixBase.
 
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
-cwiseAbs() const { return derived(); }
 
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
-cwiseAbs2() const { return derived(); }
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> CwiseArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
+///
+/// \sa cwiseAbs2()
+///
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
-cwiseSqrt() const { return derived(); }
+EIGEN_STRONG_INLINE const CwiseAbsReturnType
+cwiseAbs() const { return CwiseAbsReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
+///
+/// \sa cwiseAbs()
+///
 EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
-cwiseInverse() const { return derived(); }
+EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
+cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
+///
+/// \sa cwisePow(), cwiseSquare()
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseSqrtReturnType
+cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseSignReturnType
+cwiseSign() const { return CwiseSignReturnType(derived()); }
+
+
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
+///
+/// \sa cwiseProduct()
+///
+EIGEN_DEVICE_FUNC
+inline const CwiseInverseReturnType
+cwiseInverse() const { return CwiseInverseReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise phase angle of \c *this
+///
+/// Example: \include MatrixBase_cwiseArg.cpp
+/// Output: \verbinclude MatrixBase_cwiseArg.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseArg,arg)
+
+EIGEN_DEVICE_FUNC
+inline const CwiseArgReturnType
+cwiseArg() const { return CwiseArgReturnType(derived()); }

diff --git a/Eigen/src/plugins/ReshapedMethods.h b/Eigen/src/plugins/ReshapedMethods.h
new file mode 100644
index 0000000..482a6b0
--- /dev/null
+++ b/Eigen/src/plugins/ReshapedMethods.h

@@ -0,0 +1,149 @@
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/// \returns an expression of \c *this with reshaped sizes.
+///
+/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize
+/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize
+/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor),
+///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
+/// \tparam NRowsType the type of the value handling the number of rows, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns, typically Index.
+///
+/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp
+/// Output: \verbinclude MatrixBase_reshaped_int_int.out
+///
+/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic.
+/// Here is an example with a fixed number of rows and columns:
+/// \include MatrixBase_reshaped_fixed.cpp
+/// Output: \verbinclude MatrixBase_reshaped_fixed.out
+///
+/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example:
+/// \include MatrixBase_reshaped_auto.cpp
+/// Output: \verbinclude MatrixBase_reshaped_auto.out
+/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and
+/// that the other size is passed at compile-time using Eigen::fix<N> as above.
+///
+/// \sa class Reshaped, fix, fix<N>(int)
+///
+template<int Order = ColMajor, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline Reshaped<Derived,...>
+reshaped(NRowsType nRows, NColsType nCols);
+
+/// This is the const version of reshaped(NRowsType,NColsType).
+template<int Order = ColMajor, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline const Reshaped<const Derived,...>
+reshaped(NRowsType nRows, NColsType nCols) const;
+
+/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector
+///
+/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor),
+///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
+///
+/// This overloads is essentially a shortcut for `A.reshaped<Order>(AutoSize,fix<1>)`.
+///
+/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this.
+/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this.
+/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this.
+///   This mode is the recommended one when the particular ordering of the element is not relevant.
+///
+/// Example:
+/// \include MatrixBase_reshaped_to_vector.cpp
+/// Output: \verbinclude MatrixBase_reshaped_to_vector.out
+///
+/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType).
+///
+/// \sa reshaped(NRowsType,NColsType), class Reshaped
+///
+template<int Order = ColMajor>
+EIGEN_DEVICE_FUNC
+inline Reshaped<Derived,...>
+reshaped();
+
+/// This is the const version of reshaped().
+template<int Order = ColMajor>
+EIGEN_DEVICE_FUNC
+inline const Reshaped<const Derived,...>
+reshaped() const;
+
+#else
+
+// This file is automatically included twice to generate const and non-const versions
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+#define EIGEN_RESHAPED_METHOD_CONST const
+#else
+#define EIGEN_RESHAPED_METHOD_CONST
+#endif
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+
+// This part is included once
+
+#endif
+
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value>
+reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
+{
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                  internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value>
+                (derived(),
+                 internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
+                 internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
+}
+
+template<int Order, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
+                internal::get_compiletime_reshape_order<Flags,Order>::value>
+reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
+{
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                  internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_order<Flags,Order>::value>
+                (derived(),
+                 internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
+                 internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
+}
+
+// Views as linear vectors
+
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,SizeAtCompileTime,1>
+reshaped() EIGEN_RESHAPED_METHOD_CONST
+{
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,SizeAtCompileTime,1>(derived(),size(),1);
+}
+
+template<int Order>
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                internal::get_compiletime_reshape_order<Flags,Order>::value>
+reshaped() EIGEN_RESHAPED_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER);
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                  internal::get_compiletime_reshape_order<Flags,Order>::value>
+                (derived(), size(), 1);
+}
+
+#undef EIGEN_RESHAPED_METHOD_CONST
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+#define EIGEN_RESHAPED_METHOD_2ND_PASS
+#include "ReshapedMethods.h"
+#undef EIGEN_RESHAPED_METHOD_2ND_PASS
+#endif
+
+#endif // EIGEN_PARSED_BY_DOXYGEN

diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..4f717e9
--- /dev/null
+++ b/INSTALL

@@ -0,0 +1,35 @@
+Installation instructions for Eigen
+***********************************
+
+Explanation before starting
+***************************
+
+Eigen consists only of header files, hence there is nothing to compile
+before you can use it. Moreover, these header files do not depend on your
+platform, they are the same for everybody.
+
+Method 1. Installing without using CMake
+****************************************
+
+You can use right away the headers in the Eigen/ subdirectory. In order
+to install, just copy this Eigen/ subdirectory to your favorite location.
+If you also want the unsupported features, copy the unsupported/
+subdirectory too.
+
+Method 2. Installing using CMake
+********************************
+
+Let's call this directory 'source_dir' (where this INSTALL file is).
+Before starting, create another directory which we will call 'build_dir'.
+
+Do:
+
+  cd build_dir
+  cmake source_dir
+  make install
+
+The "make install" step may require administrator privileges.
+
+You can adjust the installation destination (the "prefix")
+by passing the -DCMAKE_INSTALL_PREFIX=myprefix option to cmake, as is
+explained in the message that cmake prints at the end.

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9b40e9e
--- /dev/null
+++ b/README.md

@@ -0,0 +1,5 @@
+**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
+
+For more information go to http://eigen.tuxfamily.org/.
+
+For ***pull request***, ***bug reports***, and ***feature requests***, go to https://gitlab.com/libeigen/eigen.

diff --git a/blas/BandTriangularSolver.h b/blas/BandTriangularSolver.h
new file mode 100644
index 0000000..ce2d74d
--- /dev/null
+++ b/blas/BandTriangularSolver.h

@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BAND_TRIANGULARSOLVER_H
+#define EIGEN_BAND_TRIANGULARSOLVER_H
+
+namespace internal {
+
+ /* \internal
+  * Solve Ax=b with A a band triangular matrix
+  * TODO: extend it to matrices for x abd b */
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, int StorageOrder>
+struct band_solve_triangular_selector;
+
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar>
+struct band_solve_triangular_selector<Index,Mode,LhsScalar,ConjLhs,RhsScalar,RowMajor>
+{
+  typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+  typedef Map<Matrix<RhsScalar,Dynamic,1> > RhsMap;
+  enum { IsLower = (Mode&Lower) ? 1 : 0 };
+  static void run(Index size, Index k, const LhsScalar* _lhs, Index lhsStride, RhsScalar* _other)
+  {
+    const LhsMap lhs(_lhs,size,k+1,OuterStride<>(lhsStride));
+    RhsMap other(_other,size,1);
+    typename internal::conditional<
+                          ConjLhs,
+                          const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+                          const LhsMap&>
+                        ::type cjLhs(lhs);
+                        
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      for(int ii=0; ii<size; ++ii)
+      {
+        int i = IsLower ? ii : size-ii-1;
+        int actual_k = (std::min)(k,ii);
+        int actual_start = IsLower ? k-actual_k : 1;
+        
+        if(actual_k>0)
+          other.coeffRef(i,col) -= cjLhs.row(i).segment(actual_start,actual_k).transpose()
+                                  .cwiseProduct(other.col(col).segment(IsLower ? i-actual_k : i+1,actual_k)).sum();
+
+        if((Mode&UnitDiag)==0)
+          other.coeffRef(i,col) /= cjLhs(i,IsLower ? k : 0);
+      }
+    }
+  }
+  
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar>
+struct band_solve_triangular_selector<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ColMajor>
+{
+  typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+  typedef Map<Matrix<RhsScalar,Dynamic,1> > RhsMap;
+  enum { IsLower = (Mode&Lower) ? 1 : 0 };
+  static void run(Index size, Index k, const LhsScalar* _lhs, Index lhsStride, RhsScalar* _other)
+  {
+    const LhsMap lhs(_lhs,k+1,size,OuterStride<>(lhsStride));
+    RhsMap other(_other,size,1);
+    typename internal::conditional<
+                          ConjLhs,
+                          const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+                          const LhsMap&>
+                        ::type cjLhs(lhs);
+                        
+    for(int col=0 ; col<other.cols() ; ++col)
+    {
+      for(int ii=0; ii<size; ++ii)
+      {
+        int i = IsLower ? ii : size-ii-1;
+        int actual_k = (std::min)(k,size-ii-1);
+        int actual_start = IsLower ? 1 : k-actual_k;
+        
+        if((Mode&UnitDiag)==0)
+          other.coeffRef(i,col) /= cjLhs(IsLower ? 0 : k, i);
+
+        if(actual_k>0)
+          other.col(col).segment(IsLower ? i+1 : i-actual_k, actual_k)
+              -= other.coeff(i,col) * cjLhs.col(i).segment(actual_start,actual_k);
+        
+      }
+    }
+  }
+};
+
+
+} // end namespace internal
+
+#endif // EIGEN_BAND_TRIANGULARSOLVER_H

diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt
new file mode 100644
index 0000000..f3a94ec
--- /dev/null
+++ b/blas/CMakeLists.txt

@@ -0,0 +1,62 @@
+
+project(EigenBlas CXX)
+
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
+endif()
+
+add_custom_target(blas)
+
+set(EigenBlas_SRCS  single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
+                    f2c/srotm.c   f2c/srotmg.c  f2c/drotm.c f2c/drotmg.c
+                    f2c/lsame.c   f2c/dspmv.c   f2c/ssbmv.c f2c/chbmv.c
+                    f2c/sspmv.c   f2c/zhbmv.c   f2c/chpmv.c f2c/dsbmv.c
+                    f2c/zhpmv.c   f2c/dtbmv.c   f2c/stbmv.c f2c/ctbmv.c
+                    f2c/ztbmv.c   f2c/d_cnjg.c  f2c/r_cnjg.c
+   )
+
+if (EIGEN_Fortran_COMPILER_WORKS)
+  set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f)
+else()
+  set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c)
+endif()
+
+set(EIGEN_BLAS_TARGETS "")
+
+add_library(eigen_blas_static ${EigenBlas_SRCS})
+list(APPEND EIGEN_BLAS_TARGETS eigen_blas_static)
+
+if (EIGEN_BUILD_SHARED_LIBS)
+  add_library(eigen_blas SHARED ${EigenBlas_SRCS})
+  list(APPEND EIGEN_BLAS_TARGETS eigen_blas)
+endif()
+
+foreach(target IN LISTS EIGEN_BLAS_TARGETS)
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+      target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+
+  add_dependencies(blas ${target})
+  install(TARGETS ${target}
+          RUNTIME DESTINATION bin
+          LIBRARY DESTINATION lib
+          ARCHIVE DESTINATION lib)
+endforeach()
+
+if(EIGEN_Fortran_COMPILER_WORKS)
+
+if(BUILD_TESTING)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(testing) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(testing EXCLUDE_FROM_ALL)
+  endif()
+endif()
+
+endif()
+

diff --git a/blas/GeneralRank1Update.h b/blas/GeneralRank1Update.h
new file mode 100644
index 0000000..07d388c
--- /dev/null
+++ b/blas/GeneralRank1Update.h

@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_RANK1UPDATE_H
+#define EIGEN_GENERAL_RANK1UPDATE_H
+
+namespace internal {
+
+/* Optimized matrix += alpha * uv' */
+template<typename Scalar, typename Index, int StorageOrder, bool ConjLhs, bool ConjRhs>
+struct general_rank1_update;
+
+template<typename Scalar, typename Index, bool ConjLhs, bool ConjRhs>
+struct general_rank1_update<Scalar,Index,ColMajor,ConjLhs,ConjRhs>
+{
+  static void run(Index rows, Index cols, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, Scalar alpha)
+  {
+    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+    typedef typename conj_expr_if<ConjLhs,OtherMap>::type ConjRhsType;
+    conj_if<ConjRhs> cj;
+
+    for (Index i=0; i<cols; ++i)
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i,rows) += alpha * cj(v[i]) * ConjRhsType(OtherMap(u,rows));
+  }
+};
+
+template<typename Scalar, typename Index, bool ConjLhs, bool ConjRhs>
+struct general_rank1_update<Scalar,Index,RowMajor,ConjLhs,ConjRhs>
+{
+  static void run(Index rows, Index cols, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, Scalar alpha)
+  {
+    general_rank1_update<Scalar,Index,ColMajor,ConjRhs,ConjRhs>::run(rows,cols,mat,stride,u,v,alpha);
+  }
+};
+
+} // end namespace internal
+
+#endif // EIGEN_GENERAL_RANK1UPDATE_H

diff --git a/blas/PackedSelfadjointProduct.h b/blas/PackedSelfadjointProduct.h
new file mode 100644
index 0000000..07327a2
--- /dev/null
+++ b/blas/PackedSelfadjointProduct.h

@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_PACKED_PRODUCT_H
+#define EIGEN_SELFADJOINT_PACKED_PRODUCT_H
+
+namespace internal {
+
+/* Optimized matrix += alpha * uv'
+ * The matrix is in packed form.
+ */
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_packed_rank1_update;
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_packed_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static void run(Index size, Scalar* mat, const Scalar* vec, RealScalar alpha)
+  {
+    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+    typedef typename conj_expr_if<ConjLhs,OtherMap>::type ConjRhsType;
+    conj_if<ConjRhs> cj;
+
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat, UpLo==Lower ? size-i : (i+1)) += alpha * cj(vec[i]) * ConjRhsType(OtherMap(vec+(UpLo==Lower ? i : 0), UpLo==Lower ? size-i : (i+1)));
+      //FIXME This should be handled outside.
+      mat[UpLo==Lower ? 0 : i] = numext::real(mat[UpLo==Lower ? 0 : i]);
+      mat += UpLo==Lower ? size-i : (i+1);
+    }
+  }
+};
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_packed_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static void run(Index size, Scalar* mat, const Scalar* vec, RealScalar alpha)
+  {
+    selfadjoint_packed_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,vec,alpha);
+  }
+};
+
+} // end namespace internal
+
+#endif // EIGEN_SELFADJOINT_PACKED_PRODUCT_H

diff --git a/blas/PackedTriangularMatrixVector.h b/blas/PackedTriangularMatrixVector.h
new file mode 100644
index 0000000..0039536
--- /dev/null
+++ b/blas/PackedTriangularMatrixVector.h

@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKED_TRIANGULAR_MATRIX_VECTOR_H
+#define EIGEN_PACKED_TRIANGULAR_MATRIX_VECTOR_H
+
+namespace internal {
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct packed_triangular_matrix_vector_product;
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs>
+struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  enum {
+    IsLower     = (Mode & Lower)   ==Lower,
+    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+  };
+  static void run(Index size, const LhsScalar* lhs, const RhsScalar* rhs, ResScalar* res, ResScalar alpha)
+  {
+    internal::conj_if<ConjRhs> cj;
+    typedef Map<const Matrix<LhsScalar,Dynamic,1> > LhsMap;
+    typedef typename conj_expr_if<ConjLhs,LhsMap>::type ConjLhsType;
+    typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
+
+    for (Index i=0; i<size; ++i)
+    {
+      Index s = IsLower&&(HasUnitDiag||HasZeroDiag) ? 1 : 0;
+      Index r = IsLower ? size-i: i+1;
+      if (EIGEN_IMPLIES(HasUnitDiag||HasZeroDiag, (--r)>0))
+	ResMap(res+(IsLower ? s+i : 0),r) += alpha * cj(rhs[i]) * ConjLhsType(LhsMap(lhs+s,r));
+      if (HasUnitDiag)
+	res[i] += alpha * cj(rhs[i]);
+      lhs += IsLower ? size-i: i+1;
+    }
+  };
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs>
+struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  enum {
+    IsLower     = (Mode & Lower)   ==Lower,
+    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+  };
+  static void run(Index size, const LhsScalar* lhs, const RhsScalar* rhs, ResScalar* res, ResScalar alpha)
+  {
+    internal::conj_if<ConjRhs> cj;
+    typedef Map<const Matrix<LhsScalar,Dynamic,1> > LhsMap;
+    typedef typename conj_expr_if<ConjLhs,LhsMap>::type ConjLhsType;
+    typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
+    typedef typename conj_expr_if<ConjRhs,RhsMap>::type ConjRhsType;
+
+    for (Index i=0; i<size; ++i)
+    {
+      Index s = !IsLower&&(HasUnitDiag||HasZeroDiag) ? 1 : 0;
+      Index r = IsLower ? i+1 : size-i;
+      if (EIGEN_IMPLIES(HasUnitDiag||HasZeroDiag, (--r)>0))
+	res[i] += alpha * (ConjLhsType(LhsMap(lhs+s,r)).cwiseProduct(ConjRhsType(RhsMap(rhs+(IsLower ? 0 : s+i),r)))).sum();
+      if (HasUnitDiag)
+	res[i] += alpha * cj(rhs[i]);
+      lhs += IsLower ? i+1 : size-i;
+    }
+  };
+};
+
+} // end namespace internal
+
+#endif // EIGEN_PACKED_TRIANGULAR_MATRIX_VECTOR_H

diff --git a/blas/PackedTriangularSolverVector.h b/blas/PackedTriangularSolverVector.h
new file mode 100644
index 0000000..5c0bb4b
--- /dev/null
+++ b/blas/PackedTriangularSolverVector.h

@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKED_TRIANGULAR_SOLVER_VECTOR_H
+#define EIGEN_PACKED_TRIANGULAR_SOLVER_VECTOR_H
+
+namespace internal {
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
+struct packed_triangular_solve_vector;
+
+// forward and backward substitution, row-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct packed_triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
+{
+  enum {
+    IsLower = (Mode&Lower)==Lower
+  };
+  static void run(Index size, const LhsScalar* lhs, RhsScalar* rhs)
+  {
+    internal::conj_if<Conjugate> cj;
+    typedef Map<const Matrix<LhsScalar,Dynamic,1> > LhsMap;
+    typedef typename conj_expr_if<Conjugate,LhsMap>::type ConjLhsType;
+
+    lhs += IsLower ? 0 : (size*(size+1)>>1)-1;
+    for(Index pi=0; pi<size; ++pi)
+    {
+      Index i = IsLower ? pi : size-pi-1;
+      Index s = IsLower ? 0 : 1;
+      if (pi>0)
+	rhs[i] -= (ConjLhsType(LhsMap(lhs+s,pi))
+	    .cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+(IsLower ? 0 : i+1),pi))).sum();
+      if (!(Mode & UnitDiag))
+	rhs[i] /= cj(lhs[IsLower ? i : 0]);
+      IsLower ? lhs += pi+1 : lhs -= pi+2;
+    }
+  }
+};
+
+// forward and backward substitution, column-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct packed_triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
+{
+  enum {
+    IsLower = (Mode&Lower)==Lower
+  };
+  static void run(Index size, const LhsScalar* lhs, RhsScalar* rhs)
+  {
+    internal::conj_if<Conjugate> cj;
+    typedef Map<const Matrix<LhsScalar,Dynamic,1> > LhsMap;
+    typedef typename conj_expr_if<Conjugate,LhsMap>::type ConjLhsType;
+
+    lhs += IsLower ? 0 : size*(size-1)>>1;
+    for(Index pi=0; pi<size; ++pi)
+    {
+      Index i = IsLower ? pi : size-pi-1;
+      Index r = size - pi - 1;
+      if (!(Mode & UnitDiag))
+	rhs[i] /= cj(lhs[IsLower ? 0 : i]);
+      if (r>0)
+	Map<Matrix<RhsScalar,Dynamic,1> >(rhs+(IsLower? i+1 : 0),r) -=
+	    rhs[i] * ConjLhsType(LhsMap(lhs+(IsLower? 1 : 0),r));
+      IsLower ? lhs += size-pi : lhs -= r;
+    }
+  }
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct packed_triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
+{
+  static void run(Index size, const LhsScalar* lhs, RhsScalar* rhs)
+  {
+    packed_triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
+	((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+	Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
+      >::run(size, lhs, rhs);
+  }
+};
+
+} // end namespace internal
+
+#endif // EIGEN_PACKED_TRIANGULAR_SOLVER_VECTOR_H

diff --git a/blas/README.txt b/blas/README.txt
new file mode 100644
index 0000000..63a5203
--- /dev/null
+++ b/blas/README.txt

@@ -0,0 +1,6 @@
+
+This directory contains a BLAS library built on top of Eigen.
+
+This module is not built by default. In order to compile it, you need to
+type 'make blas' from within your build dir.
+

diff --git a/blas/Rank2Update.h b/blas/Rank2Update.h
new file mode 100644
index 0000000..138d70f
--- /dev/null
+++ b/blas/Rank2Update.h

@@ -0,0 +1,57 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANK2UPDATE_H
+#define EIGEN_RANK2UPDATE_H
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * This is the low-level version of SelfadjointRank2Update.h
+ */
+template<typename Scalar, typename Index, int UpLo>
+struct rank2_update_selector
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, Scalar alpha)
+  {
+    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), UpLo==Lower ? size-i : (i+1)) +=
+      numext::conj(alpha) * numext::conj(u[i]) * OtherMap(v+(UpLo==Lower ? i : 0), UpLo==Lower ? size-i : (i+1))
+                + alpha * numext::conj(v[i]) * OtherMap(u+(UpLo==Lower ? i : 0), UpLo==Lower ? size-i : (i+1));
+    }
+  }
+};
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * The matrix is in packed form.
+ */
+template<typename Scalar, typename Index, int UpLo>
+struct packed_rank2_update_selector
+{
+  static void run(Index size, Scalar* mat, const Scalar* u, const Scalar* v, Scalar alpha)
+  {
+    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+    Index offset = 0;
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat+offset, UpLo==Lower ? size-i : (i+1)) +=
+      numext::conj(alpha) * numext::conj(u[i]) * OtherMap(v+(UpLo==Lower ? i : 0), UpLo==Lower ? size-i : (i+1))
+                + alpha * numext::conj(v[i]) * OtherMap(u+(UpLo==Lower ? i : 0), UpLo==Lower ? size-i : (i+1));
+      //FIXME This should be handled outside.
+      mat[offset+(UpLo==Lower ? 0 : i)] = numext::real(mat[offset+(UpLo==Lower ? 0 : i)]);
+      offset += UpLo==Lower ? size-i : (i+1);
+    }
+  }
+};
+
+} // end namespace internal
+
+#endif // EIGEN_RANK2UPDATE_H

diff --git a/blas/common.h b/blas/common.h
new file mode 100644
index 0000000..a9b6978
--- /dev/null
+++ b/blas/common.h

@@ -0,0 +1,175 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLAS_COMMON_H
+#define EIGEN_BLAS_COMMON_H
+
+#ifdef __GNUC__
+# if __GNUC__<5
+// GCC < 5.0 does not like the global Scalar typedef
+// we just keep shadow-warnings disabled permanently
+#  define EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+# endif
+#endif
+
+#include "../Eigen/Core"
+#include "../Eigen/Jacobi"
+
+#include <complex>
+
+#ifndef SCALAR
+#error the token SCALAR must be defined to compile this file
+#endif
+
+#include "../Eigen/src/misc/blas.h"
+
+#define NOTR    0
+#define TR      1
+#define ADJ     2
+
+#define LEFT    0
+#define RIGHT   1
+
+#define UP      0
+#define LO      1
+
+#define NUNIT   0
+#define UNIT    1
+
+#define INVALID 0xff
+
+#define OP(X)   (   ((X)=='N' || (X)=='n') ? NOTR   \
+                  : ((X)=='T' || (X)=='t') ? TR     \
+                  : ((X)=='C' || (X)=='c') ? ADJ    \
+                  : INVALID)
+
+#define SIDE(X) (   ((X)=='L' || (X)=='l') ? LEFT   \
+                  : ((X)=='R' || (X)=='r') ? RIGHT  \
+                  : INVALID)
+
+#define UPLO(X) (   ((X)=='U' || (X)=='u') ? UP     \
+                  : ((X)=='L' || (X)=='l') ? LO     \
+                  : INVALID)
+
+#define DIAG(X) (   ((X)=='N' || (X)=='n') ? NUNIT  \
+                  : ((X)=='U' || (X)=='u') ? UNIT   \
+                  : INVALID)
+
+
+inline bool check_op(const char* op)
+{
+  return OP(*op)!=0xff;
+}
+
+inline bool check_side(const char* side)
+{
+  return SIDE(*side)!=0xff;
+}
+
+inline bool check_uplo(const char* uplo)
+{
+  return UPLO(*uplo)!=0xff;
+}
+
+
+namespace Eigen {
+#include "BandTriangularSolver.h"
+#include "GeneralRank1Update.h"
+#include "PackedSelfadjointProduct.h"
+#include "PackedTriangularMatrixVector.h"
+#include "PackedTriangularSolverVector.h"
+#include "Rank2Update.h"
+}
+
+using namespace Eigen;
+
+typedef SCALAR Scalar;
+typedef NumTraits<Scalar>::Real RealScalar;
+typedef std::complex<RealScalar> Complex;
+
+enum
+{
+  IsComplex = Eigen::NumTraits<SCALAR>::IsComplex,
+  Conj = IsComplex
+};
+
+typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> PlainMatrixType;
+typedef Map<Matrix<Scalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > MatrixType;
+typedef Map<const Matrix<Scalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > ConstMatrixType;
+typedef Map<Matrix<Scalar,Dynamic,1>, 0, InnerStride<Dynamic> > StridedVectorType;
+typedef Map<Matrix<Scalar,Dynamic,1> > CompactVectorType;
+
+template<typename T>
+Map<Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >
+matrix(T* data, int rows, int cols, int stride)
+{
+  return Map<Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >(data, rows, cols, OuterStride<>(stride));
+}
+
+template<typename T>
+Map<const Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >
+matrix(const T* data, int rows, int cols, int stride)
+{
+  return Map<const Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >(data, rows, cols, OuterStride<>(stride));
+}
+
+template<typename T>
+Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
+{
+  return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
+}
+
+template<typename T>
+Map<const Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(const T* data, int size, int incr)
+{
+  return Map<const Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
+}
+
+template<typename T>
+Map<Matrix<T,Dynamic,1> > make_vector(T* data, int size)
+{
+  return Map<Matrix<T,Dynamic,1> >(data, size);
+}
+
+template<typename T>
+Map<const Matrix<T,Dynamic,1> > make_vector(const T* data, int size)
+{
+  return Map<const Matrix<T,Dynamic,1> >(data, size);
+}
+
+template<typename T>
+T* get_compact_vector(T* x, int n, int incx)
+{
+  if(incx==1)
+    return x;
+
+  typename Eigen::internal::remove_const<T>::type* ret = new Scalar[n];
+  if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse();
+  else       make_vector(ret,n) = make_vector(x,n, incx);
+  return ret;
+}
+
+template<typename T>
+T* copy_back(T* x_cpy, T* x, int n, int incx)
+{
+  if(x_cpy==x)
+    return 0;
+
+  if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n);
+  else       make_vector(x,n, incx)           = make_vector(x_cpy,n);
+  return x_cpy;
+}
+
+#ifndef EIGEN_BLAS_FUNC_SUFFIX
+#define EIGEN_BLAS_FUNC_SUFFIX _
+#endif
+
+#define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX, EIGEN_CAT(X, EIGEN_BLAS_FUNC_SUFFIX))
+
+#endif // EIGEN_BLAS_COMMON_H

diff --git a/blas/complex_double.cpp b/blas/complex_double.cpp
new file mode 100644
index 0000000..648c6d4
--- /dev/null
+++ b/blas/complex_double.cpp

@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        std::complex<double>
+#define SCALAR_SUFFIX z
+#define SCALAR_SUFFIX_UP "Z"
+#define REAL_SCALAR_SUFFIX d
+#define ISCOMPLEX     1
+
+#include "level1_impl.h"
+#include "level1_cplx_impl.h"
+#include "level2_impl.h"
+#include "level2_cplx_impl.h"
+#include "level3_impl.h"

diff --git a/blas/complex_single.cpp b/blas/complex_single.cpp
new file mode 100644
index 0000000..7786519
--- /dev/null
+++ b/blas/complex_single.cpp

@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        std::complex<float>
+#define SCALAR_SUFFIX c
+#define SCALAR_SUFFIX_UP "C"
+#define REAL_SCALAR_SUFFIX s
+#define ISCOMPLEX     1
+
+#include "level1_impl.h"
+#include "level1_cplx_impl.h"
+#include "level2_impl.h"
+#include "level2_cplx_impl.h"
+#include "level3_impl.h"

diff --git a/blas/double.cpp b/blas/double.cpp
new file mode 100644
index 0000000..eb2e573
--- /dev/null
+++ b/blas/double.cpp

@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        double
+#define SCALAR_SUFFIX d
+#define SCALAR_SUFFIX_UP "D"
+#define ISCOMPLEX     0
+
+#include "level1_impl.h"
+#include "level1_real_impl.h"
+#include "level2_impl.h"
+#include "level2_real_impl.h"
+#include "level3_impl.h"
+
+double EIGEN_BLAS_FUNC(sdot)(int* n, float* x, int* incx, float* y, int* incy)
+{
+  if(*n<=0) return 0;
+
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cast<double>().cwiseProduct(make_vector(y,*n).cast<double>())).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else return 0;
+}

diff --git a/blas/f2c/chbmv.c b/blas/f2c/chbmv.c
new file mode 100644
index 0000000..f218fe3
--- /dev/null
+++ b/blas/f2c/chbmv.c

@@ -0,0 +1,487 @@
+/* chbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex *
+	alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
+	beta, complex *y, integer *incy, ftnlen uplo_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+    real r__1;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables */
+    integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
+    complex temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  CHBMV  performs the matrix-vector  operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n hermitian band matrix, with k super-diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the band matrix A is being supplied as */
+/*           follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  being supplied. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  being supplied. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry, K specifies the number of super-diagonals of the */
+/*           matrix A. K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - COMPLEX         . */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  A      - COMPLEX          array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the hermitian matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer the upper */
+/*           triangular part of a hermitian band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the hermitian matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer the lower */
+/*           triangular part of a hermitian band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that the imaginary parts of the diagonal elements need */
+/*           not be set and are assumed to be zero. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX          array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the */
+/*           vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - COMPLEX         . */
+/*           On entry, BETA specifies the scalar beta. */
+/*           Unchanged on exit. */
+
+/*  Y      - COMPLEX          array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the */
+/*           vector y. On exit, Y is overwritten by the updated vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+    --y;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*k < 0) {
+	info = 3;
+    } else if (*lda < *k + 1) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("CHBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && 
+                                                           beta->i == 0.f))) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array A */
+/*     are accessed sequentially with one pass through A. */
+
+/*     First form  y := beta*y. */
+
+    if (beta->r != 1.f || beta->i != 0.f) {
+	if (*incy == 1) {
+	    if (beta->r == 0.f && beta->i == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    y[i__2].r = 0.f, y[i__2].i = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    i__3 = i__;
+		    q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (beta->r == 0.f && beta->i == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    y[i__2].r = 0.f, y[i__2].i = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    i__3 = iy;
+		    q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (alpha->r == 0.f && alpha->i == 0.f) {
+	return 0;
+    }
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when upper triangle of A is stored. */
+
+	kplus1 = *k + 1;
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = j;
+		q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__2 = 1, i__3 = j - *k;
+		i__4 = j - 1;
+		for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+		    i__2 = i__;
+		    i__3 = i__;
+		    i__5 = l + i__ + j * a_dim1;
+		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+		    i__2 = i__;
+		    q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i =
+			     q__3.r * x[i__2].i + q__3.i * x[i__2].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+/* L50: */
+		}
+		i__4 = j;
+		i__2 = j;
+		i__3 = kplus1 + j * a_dim1;
+		r__1 = a[i__3].r;
+		q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
+		q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
+		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+		y[i__4].r = q__1.r, y[i__4].i = q__1.i;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__4 = jx;
+		q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i =
+			 alpha->r * x[i__4].i + alpha->i * x[i__4].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		ix = kx;
+		iy = ky;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__4 = 1, i__2 = j - *k;
+		i__3 = j - 1;
+		for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+		    i__4 = iy;
+		    i__2 = iy;
+		    i__5 = l + i__ + j * a_dim1;
+		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
+		    y[i__4].r = q__1.r, y[i__4].i = q__1.i;
+		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+		    i__4 = ix;
+		    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
+			     q__3.r * x[i__4].i + q__3.i * x[i__4].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		i__3 = jy;
+		i__4 = jy;
+		i__2 = kplus1 + j * a_dim1;
+		r__1 = a[i__2].r;
+		q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
+		q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
+		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		jx += *incx;
+		jy += *incy;
+		if (j > *k) {
+		    kx += *incx;
+		    ky += *incy;
+		}
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when lower triangle of A is stored. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__3 = j;
+		q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
+			 alpha->r * x[i__3].i + alpha->i * x[i__3].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		i__3 = j;
+		i__4 = j;
+		i__2 = j * a_dim1 + 1;
+		r__1 = a[i__2].r;
+		q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
+		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		l = 1 - j;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    i__4 = i__;
+		    i__2 = i__;
+		    i__5 = l + i__ + j * a_dim1;
+		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
+		    y[i__4].r = q__1.r, y[i__4].i = q__1.i;
+		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+		    i__4 = i__;
+		    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
+			     q__3.r * x[i__4].i + q__3.i * x[i__4].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+/* L90: */
+		}
+		i__3 = j;
+		i__4 = j;
+		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__3 = jx;
+		q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
+			 alpha->r * x[i__3].i + alpha->i * x[i__3].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		i__3 = jy;
+		i__4 = jy;
+		i__2 = j * a_dim1 + 1;
+		r__1 = a[i__2].r;
+		q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
+		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		l = 1 - j;
+		ix = jx;
+		iy = jy;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    ix += *incx;
+		    iy += *incy;
+		    i__4 = iy;
+		    i__2 = iy;
+		    i__5 = l + i__ + j * a_dim1;
+		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
+		    y[i__4].r = q__1.r, y[i__4].i = q__1.i;
+		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+		    i__4 = ix;
+		    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
+			     q__3.r * x[i__4].i + q__3.i * x[i__4].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+/* L110: */
+		}
+		i__3 = jy;
+		i__4 = jy;
+		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		jx += *incx;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of CHBMV . */
+
+} /* chbmv_ */
+

diff --git a/blas/f2c/chpmv.c b/blas/f2c/chpmv.c
new file mode 100644
index 0000000..65bab1c
--- /dev/null
+++ b/blas/f2c/chpmv.c

@@ -0,0 +1,438 @@
+/* chpmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int chpmv_(char *uplo, integer *n, complex *alpha, complex *
+	ap, complex *x, integer *incx, complex *beta, complex *y, integer *
+	incy, ftnlen uplo_len)
+{
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4, i__5;
+    real r__1;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables */
+    integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
+    complex temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  CHPMV  performs the matrix-vector operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n hermitian matrix, supplied in packed form. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the matrix A is supplied in the packed */
+/*           array AP as follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  supplied in AP. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  supplied in AP. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - COMPLEX         . */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  AP     - COMPLEX          array of DIMENSION at least */
+/*           ( ( n*( n + 1 ) )/2 ). */
+/*           Before entry with UPLO = 'U' or 'u', the array AP must */
+/*           contain the upper triangular part of the hermitian matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
+/*           and a( 2, 2 ) respectively, and so on. */
+/*           Before entry with UPLO = 'L' or 'l', the array AP must */
+/*           contain the lower triangular part of the hermitian matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
+/*           and a( 3, 1 ) respectively, and so on. */
+/*           Note that the imaginary parts of the diagonal elements need */
+/*           not be set and are assumed to be zero. */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX          array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - COMPLEX         . */
+/*           On entry, BETA specifies the scalar beta. When BETA is */
+/*           supplied as zero then Y need not be set on input. */
+/*           Unchanged on exit. */
+
+/*  Y      - COMPLEX          array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the n */
+/*           element vector y. On exit, Y is overwritten by the updated */
+/*           vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    --y;
+    --x;
+    --ap;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 6;
+    } else if (*incy == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("CHPMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && 
+                                                           beta->i == 0.f))) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array AP */
+/*     are accessed sequentially with one pass through AP. */
+
+/*     First form  y := beta*y. */
+
+    if (beta->r != 1.f || beta->i != 0.f) {
+	if (*incy == 1) {
+	    if (beta->r == 0.f && beta->i == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    y[i__2].r = 0.f, y[i__2].i = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    i__3 = i__;
+		    q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (beta->r == 0.f && beta->i == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    y[i__2].r = 0.f, y[i__2].i = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    i__3 = iy;
+		    q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (alpha->r == 0.f && alpha->i == 0.f) {
+	return 0;
+    }
+    kk = 1;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when AP contains the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = j;
+		q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		k = kk;
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    i__3 = i__;
+		    i__4 = i__;
+		    i__5 = k;
+		    q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		    y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		    r_cnjg(&q__3, &ap[k]);
+		    i__3 = i__;
+		    q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+			     q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    ++k;
+/* L50: */
+		}
+		i__2 = j;
+		i__3 = j;
+		i__4 = kk + j - 1;
+		r__1 = ap[i__4].r;
+		q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
+		q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
+		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+		y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		kk += j;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = jx;
+		q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		ix = kx;
+		iy = ky;
+		i__2 = kk + j - 2;
+		for (k = kk; k <= i__2; ++k) {
+		    i__3 = iy;
+		    i__4 = iy;
+		    i__5 = k;
+		    q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		    y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		    r_cnjg(&q__3, &ap[k]);
+		    i__3 = ix;
+		    q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+			     q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		i__2 = jy;
+		i__3 = jy;
+		i__4 = kk + j - 1;
+		r__1 = ap[i__4].r;
+		q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
+		q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
+		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+		y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		jx += *incx;
+		jy += *incy;
+		kk += j;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when AP contains the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = j;
+		q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		i__2 = j;
+		i__3 = j;
+		i__4 = kk;
+		r__1 = ap[i__4].r;
+		q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
+		q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+		y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		k = kk + 1;
+		i__2 = *n;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    i__3 = i__;
+		    i__4 = i__;
+		    i__5 = k;
+		    q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		    y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		    r_cnjg(&q__3, &ap[k]);
+		    i__3 = i__;
+		    q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+			     q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+		    ++k;
+/* L90: */
+		}
+		i__2 = j;
+		i__3 = j;
+		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+		y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		kk += *n - j + 1;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = jx;
+		q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = q__1.r, temp1.i = q__1.i;
+		temp2.r = 0.f, temp2.i = 0.f;
+		i__2 = jy;
+		i__3 = jy;
+		i__4 = kk;
+		r__1 = ap[i__4].r;
+		q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
+		q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+		y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		ix = jx;
+		iy = jy;
+		i__2 = kk + *n - j;
+		for (k = kk + 1; k <= i__2; ++k) {
+		    ix += *incx;
+		    iy += *incy;
+		    i__3 = iy;
+		    i__4 = iy;
+		    i__5 = k;
+		    q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+		    y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+		    r_cnjg(&q__3, &ap[k]);
+		    i__3 = ix;
+		    q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+			     q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+		    temp2.r = q__1.r, temp2.i = q__1.i;
+/* L110: */
+		}
+		i__2 = jy;
+		i__3 = jy;
+		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+		y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+		jx += *incx;
+		jy += *incy;
+		kk += *n - j + 1;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of CHPMV . */
+
+} /* chpmv_ */
+

diff --git a/blas/f2c/complexdots.c b/blas/f2c/complexdots.c
new file mode 100644
index 0000000..a856a23
--- /dev/null
+++ b/blas/f2c/complexdots.c

@@ -0,0 +1,84 @@
+/* This file has been modified to use the standard gfortran calling
+   convention, rather than the f2c calling convention.
+
+   It does not require -ff2c when compiled with gfortran.
+*/
+
+/* complexdots.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+complex cdotc_(integer *n, complex *cx, integer 
+	*incx, complex *cy, integer *incy)
+{
+    complex res;
+    extern /* Subroutine */ int cdotcw_(integer *, complex *, integer *, 
+	    complex *, integer *, complex *);
+
+    /* Parameter adjustments */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
+    return res;
+} /* cdotc_ */
+
+complex cdotu_(integer *n, complex *cx, integer 
+	*incx, complex *cy, integer *incy)
+{
+    complex res;
+    extern /* Subroutine */ int cdotuw_(integer *, complex *, integer *, 
+	    complex *, integer *, complex *);
+
+    /* Parameter adjustments */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
+    return res;
+} /* cdotu_ */
+
+doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, 
+                     doublecomplex *cy, integer *incy)
+{
+    doublecomplex res;
+    extern /* Subroutine */ int zdotcw_(integer *, doublecomplex *, integer *,
+	     doublecomplex *, integer *, doublecomplex *);
+
+    /* Parameter adjustments */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
+    return res;
+} /* zdotc_ */
+
+doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, 
+                     doublecomplex *cy, integer *incy)
+{
+    doublecomplex res;
+    extern /* Subroutine */ int zdotuw_(integer *, doublecomplex *, integer *,
+	     doublecomplex *, integer *, doublecomplex *);
+
+    /* Parameter adjustments */
+    --cy;
+    --cx;
+
+    /* Function Body */
+    zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
+    return res;
+} /* zdotu_ */
+

diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c
new file mode 100644
index 0000000..a6e0dae
--- /dev/null
+++ b/blas/f2c/ctbmv.c

@@ -0,0 +1,647 @@
+/* ctbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int ctbmv_(char *uplo, char *trans, char *diag, integer *n, 
+	integer *k, complex *a, integer *lda, complex *x, integer *incx, 
+	ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+    complex q__1, q__2, q__3;
+
+    /* Builtin functions */
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables */
+    integer i__, j, l, ix, jx, kx, info;
+    complex temp;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+    logical noconj, nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  CTBMV  performs one of the matrix-vector operations */
+
+/*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x, */
+
+/*  where x is an n element vector and  A is an n by n unit, or non-unit, */
+/*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANS  - CHARACTER*1. */
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := conjg( A' )*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - COMPLEX          array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX          array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+	    ftnlen)1)) {
+	info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, 
+	    "N", (ftnlen)1, (ftnlen)1)) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < *k + 1) {
+	info = 7;
+    } else if (*incx == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("CTBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be  ( N - 1 )*INCX   too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+    if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
+
+/*         Form  x := A*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = j;
+		    if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+			i__2 = j;
+			temp.r = x[i__2].r, temp.i = x[i__2].i;
+			l = kplus1 - j;
+/* Computing MAX */
+			i__2 = 1, i__3 = j - *k;
+			i__4 = j - 1;
+			for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+			    i__2 = i__;
+			    i__3 = i__;
+			    i__5 = l + i__ + j * a_dim1;
+			    q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, 
+				    q__2.i = temp.r * a[i__5].i + temp.i * a[
+				    i__5].r;
+			    q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + 
+				    q__2.i;
+			    x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+/* L10: */
+			}
+			if (nounit) {
+			    i__4 = j;
+			    i__2 = j;
+			    i__3 = kplus1 + j * a_dim1;
+			    q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
+				    i__3].i, q__1.i = x[i__2].r * a[i__3].i + 
+				    x[i__2].i * a[i__3].r;
+			    x[i__4].r = q__1.r, x[i__4].i = q__1.i;
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__4 = jx;
+		    if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
+			i__4 = jx;
+			temp.r = x[i__4].r, temp.i = x[i__4].i;
+			ix = kx;
+			l = kplus1 - j;
+/* Computing MAX */
+			i__4 = 1, i__2 = j - *k;
+			i__3 = j - 1;
+			for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+			    i__4 = ix;
+			    i__2 = ix;
+			    i__5 = l + i__ + j * a_dim1;
+			    q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, 
+				    q__2.i = temp.r * a[i__5].i + temp.i * a[
+				    i__5].r;
+			    q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + 
+				    q__2.i;
+			    x[i__4].r = q__1.r, x[i__4].i = q__1.i;
+			    ix += *incx;
+/* L30: */
+			}
+			if (nounit) {
+			    i__3 = jx;
+			    i__4 = jx;
+			    i__2 = kplus1 + j * a_dim1;
+			    q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[
+				    i__2].i, q__1.i = x[i__4].r * a[i__2].i + 
+				    x[i__4].i * a[i__2].r;
+			    x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+			}
+		    }
+		    jx += *incx;
+		    if (j > *k) {
+			kx += *incx;
+		    }
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    i__1 = j;
+		    if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
+			i__1 = j;
+			temp.r = x[i__1].r, temp.i = x[i__1].i;
+			l = 1 - j;
+/* Computing MIN */
+			i__1 = *n, i__3 = j + *k;
+			i__4 = j + 1;
+			for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
+			    i__1 = i__;
+			    i__3 = i__;
+			    i__2 = l + i__ + j * a_dim1;
+			    q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, 
+				    q__2.i = temp.r * a[i__2].i + temp.i * a[
+				    i__2].r;
+			    q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + 
+				    q__2.i;
+			    x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+/* L50: */
+			}
+			if (nounit) {
+			    i__4 = j;
+			    i__1 = j;
+			    i__3 = j * a_dim1 + 1;
+			    q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[
+				    i__3].i, q__1.i = x[i__1].r * a[i__3].i + 
+				    x[i__1].i * a[i__3].r;
+			    x[i__4].r = q__1.r, x[i__4].i = q__1.i;
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    i__4 = jx;
+		    if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
+			i__4 = jx;
+			temp.r = x[i__4].r, temp.i = x[i__4].i;
+			ix = kx;
+			l = 1 - j;
+/* Computing MIN */
+			i__4 = *n, i__1 = j + *k;
+			i__3 = j + 1;
+			for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
+			    i__4 = ix;
+			    i__1 = ix;
+			    i__2 = l + i__ + j * a_dim1;
+			    q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, 
+				    q__2.i = temp.r * a[i__2].i + temp.i * a[
+				    i__2].r;
+			    q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + 
+				    q__2.i;
+			    x[i__4].r = q__1.r, x[i__4].i = q__1.i;
+			    ix -= *incx;
+/* L70: */
+			}
+			if (nounit) {
+			    i__3 = jx;
+			    i__4 = jx;
+			    i__1 = j * a_dim1 + 1;
+			    q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[
+				    i__1].i, q__1.i = x[i__4].r * a[i__1].i + 
+				    x[i__4].i * a[i__1].r;
+			    x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+			}
+		    }
+		    jx -= *incx;
+		    if (*n - j >= *k) {
+			kx -= *incx;
+		    }
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := A'*x  or  x := conjg( A' )*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    i__3 = j;
+		    temp.r = x[i__3].r, temp.i = x[i__3].i;
+		    l = kplus1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__3 = kplus1 + j * a_dim1;
+			    q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, 
+				    q__1.i = temp.r * a[i__3].i + temp.i * a[
+				    i__3].r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    i__4 = l + i__ + j * a_dim1;
+			    i__1 = i__;
+			    q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
+				    i__1].i, q__2.i = a[i__4].r * x[i__1].i + 
+				    a[i__4].i * x[i__1].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+			}
+		    } else {
+			if (nounit) {
+			    r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
+			    q__1.r = temp.r * q__2.r - temp.i * q__2.i, 
+				    q__1.i = temp.r * q__2.i + temp.i * 
+				    q__2.r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+			    i__4 = i__;
+			    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, 
+				    q__2.i = q__3.r * x[i__4].i + q__3.i * x[
+				    i__4].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+			}
+		    }
+		    i__3 = j;
+		    x[i__3].r = temp.r, x[i__3].i = temp.i;
+/* L110: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    i__3 = jx;
+		    temp.r = x[i__3].r, temp.i = x[i__3].i;
+		    kx -= *incx;
+		    ix = kx;
+		    l = kplus1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__3 = kplus1 + j * a_dim1;
+			    q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, 
+				    q__1.i = temp.r * a[i__3].i + temp.i * a[
+				    i__3].r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    i__4 = l + i__ + j * a_dim1;
+			    i__1 = ix;
+			    q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
+				    i__1].i, q__2.i = a[i__4].r * x[i__1].i + 
+				    a[i__4].i * x[i__1].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix -= *incx;
+/* L120: */
+			}
+		    } else {
+			if (nounit) {
+			    r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
+			    q__1.r = temp.r * q__2.r - temp.i * q__2.i, 
+				    q__1.i = temp.r * q__2.i + temp.i * 
+				    q__2.r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+			    i__4 = ix;
+			    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, 
+				    q__2.i = q__3.r * x[i__4].i + q__3.i * x[
+				    i__4].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix -= *incx;
+/* L130: */
+			}
+		    }
+		    i__3 = jx;
+		    x[i__3].r = temp.r, x[i__3].i = temp.i;
+		    jx -= *incx;
+/* L140: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    i__4 = j;
+		    temp.r = x[i__4].r, temp.i = x[i__4].i;
+		    l = 1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__4 = j * a_dim1 + 1;
+			    q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, 
+				    q__1.i = temp.r * a[i__4].i + temp.i * a[
+				    i__4].r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    i__1 = l + i__ + j * a_dim1;
+			    i__2 = i__;
+			    q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+				    i__2].i, q__2.i = a[i__1].r * x[i__2].i + 
+				    a[i__1].i * x[i__2].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L150: */
+			}
+		    } else {
+			if (nounit) {
+			    r_cnjg(&q__2, &a[j * a_dim1 + 1]);
+			    q__1.r = temp.r * q__2.r - temp.i * q__2.i, 
+				    q__1.i = temp.r * q__2.i + temp.i * 
+				    q__2.r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+			    i__1 = i__;
+			    q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, 
+				    q__2.i = q__3.r * x[i__1].i + q__3.i * x[
+				    i__1].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+/* L160: */
+			}
+		    }
+		    i__4 = j;
+		    x[i__4].r = temp.r, x[i__4].i = temp.i;
+/* L170: */
+		}
+	    } else {
+		jx = kx;
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    i__4 = jx;
+		    temp.r = x[i__4].r, temp.i = x[i__4].i;
+		    kx += *incx;
+		    ix = kx;
+		    l = 1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__4 = j * a_dim1 + 1;
+			    q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, 
+				    q__1.i = temp.r * a[i__4].i + temp.i * a[
+				    i__4].r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    i__1 = l + i__ + j * a_dim1;
+			    i__2 = ix;
+			    q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+				    i__2].i, q__2.i = a[i__1].r * x[i__2].i + 
+				    a[i__1].i * x[i__2].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix += *incx;
+/* L180: */
+			}
+		    } else {
+			if (nounit) {
+			    r_cnjg(&q__2, &a[j * a_dim1 + 1]);
+			    q__1.r = temp.r * q__2.r - temp.i * q__2.i, 
+				    q__1.i = temp.r * q__2.i + temp.i * 
+				    q__2.r;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
+			    i__1 = ix;
+			    q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, 
+				    q__2.i = q__3.r * x[i__1].i + q__3.i * x[
+				    i__1].r;
+			    q__1.r = temp.r + q__2.r, q__1.i = temp.i + 
+				    q__2.i;
+			    temp.r = q__1.r, temp.i = q__1.i;
+			    ix += *incx;
+/* L190: */
+			}
+		    }
+		    i__4 = jx;
+		    x[i__4].r = temp.r, x[i__4].i = temp.i;
+		    jx += *incx;
+/* L200: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of CTBMV . */
+
+} /* ctbmv_ */
+

diff --git a/blas/f2c/d_cnjg.c b/blas/f2c/d_cnjg.c
new file mode 100644
index 0000000..623090c
--- /dev/null
+++ b/blas/f2c/d_cnjg.c

@@ -0,0 +1,6 @@
+#include "datatypes.h"    
+
+void d_cnjg(doublecomplex *r, doublecomplex *z) {
+    r->r = z->r;
+    r->i = -(z->i);
+}

diff --git a/blas/f2c/datatypes.h b/blas/f2c/datatypes.h
new file mode 100644
index 0000000..63232b2
--- /dev/null
+++ b/blas/f2c/datatypes.h

@@ -0,0 +1,24 @@
+/* This contains a limited subset of the typedefs exposed by f2c
+   for use by the Eigen BLAS C-only implementation.
+*/
+
+#ifndef __EIGEN_DATATYPES_H__
+#define __EIGEN_DATATYPES_H__
+
+typedef int integer;
+typedef unsigned int uinteger;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+typedef int ftnlen;
+typedef int logical;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (doublereal)abs(x)
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (doublereal)min(a,b)
+#define dmax(a,b) (doublereal)max(a,b)
+
+#endif

diff --git a/blas/f2c/drotm.c b/blas/f2c/drotm.c
new file mode 100644
index 0000000..17a779b
--- /dev/null
+++ b/blas/f2c/drotm.c

@@ -0,0 +1,215 @@
+/* drotm.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int drotm_(integer *n, doublereal *dx, integer *incx, 
+	doublereal *dy, integer *incy, doublereal *dparam)
+{
+    /* Initialized data */
+
+    static doublereal zero = 0.;
+    static doublereal two = 2.;
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i__;
+    doublereal w, z__;
+    integer kx, ky;
+    doublereal dh11, dh12, dh21, dh22, dflag;
+    integer nsteps;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
+
+/*     (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
+/*     (DY**T) */
+
+/*     DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
+/*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
+/*     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
+
+/*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0 */
+
+/*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0) */
+/*     H=(          )    (          )    (          )    (          ) */
+/*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0). */
+/*     SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  N      (input) INTEGER */
+/*         number of elements in input vector(s) */
+
+/*  DX     (input/output) DOUBLE PRECISION array, dimension N */
+/*         double precision vector with N elements */
+
+/*  INCX   (input) INTEGER */
+/*         storage spacing between elements of DX */
+
+/*  DY     (input/output) DOUBLE PRECISION array, dimension N */
+/*         double precision vector with N elements */
+
+/*  INCY   (input) INTEGER */
+/*         storage spacing between elements of DY */
+
+/*  DPARAM (input/output)  DOUBLE PRECISION array, dimension 5 */
+/*     DPARAM(1)=DFLAG */
+/*     DPARAM(2)=DH11 */
+/*     DPARAM(3)=DH21 */
+/*     DPARAM(4)=DH12 */
+/*     DPARAM(5)=DH22 */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Data statements .. */
+    /* Parameter adjustments */
+    --dparam;
+    --dy;
+    --dx;
+
+    /* Function Body */
+/*     .. */
+
+    dflag = dparam[1];
+    if (*n <= 0 || dflag + two == zero) {
+	goto L140;
+    }
+    if (! (*incx == *incy && *incx > 0)) {
+	goto L70;
+    }
+
+    nsteps = *n * *incx;
+    if (dflag < 0.) {
+	goto L50;
+    } else if (dflag == 0) {
+	goto L10;
+    } else {
+	goto L30;
+    }
+L10:
+    dh12 = dparam[4];
+    dh21 = dparam[3];
+    i__1 = nsteps;
+    i__2 = *incx;
+    for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	w = dx[i__];
+	z__ = dy[i__];
+	dx[i__] = w + z__ * dh12;
+	dy[i__] = w * dh21 + z__;
+/* L20: */
+    }
+    goto L140;
+L30:
+    dh11 = dparam[2];
+    dh22 = dparam[5];
+    i__2 = nsteps;
+    i__1 = *incx;
+    for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+	w = dx[i__];
+	z__ = dy[i__];
+	dx[i__] = w * dh11 + z__;
+	dy[i__] = -w + dh22 * z__;
+/* L40: */
+    }
+    goto L140;
+L50:
+    dh11 = dparam[2];
+    dh12 = dparam[4];
+    dh21 = dparam[3];
+    dh22 = dparam[5];
+    i__1 = nsteps;
+    i__2 = *incx;
+    for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	w = dx[i__];
+	z__ = dy[i__];
+	dx[i__] = w * dh11 + z__ * dh12;
+	dy[i__] = w * dh21 + z__ * dh22;
+/* L60: */
+    }
+    goto L140;
+L70:
+    kx = 1;
+    ky = 1;
+    if (*incx < 0) {
+	kx = (1 - *n) * *incx + 1;
+    }
+    if (*incy < 0) {
+	ky = (1 - *n) * *incy + 1;
+    }
+
+    if (dflag < 0.) {
+	goto L120;
+    } else if (dflag == 0) {
+	goto L80;
+    } else {
+	goto L100;
+    }
+L80:
+    dh12 = dparam[4];
+    dh21 = dparam[3];
+    i__2 = *n;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	w = dx[kx];
+	z__ = dy[ky];
+	dx[kx] = w + z__ * dh12;
+	dy[ky] = w * dh21 + z__;
+	kx += *incx;
+	ky += *incy;
+/* L90: */
+    }
+    goto L140;
+L100:
+    dh11 = dparam[2];
+    dh22 = dparam[5];
+    i__2 = *n;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	w = dx[kx];
+	z__ = dy[ky];
+	dx[kx] = w * dh11 + z__;
+	dy[ky] = -w + dh22 * z__;
+	kx += *incx;
+	ky += *incy;
+/* L110: */
+    }
+    goto L140;
+L120:
+    dh11 = dparam[2];
+    dh12 = dparam[4];
+    dh21 = dparam[3];
+    dh22 = dparam[5];
+    i__2 = *n;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	w = dx[kx];
+	z__ = dy[ky];
+	dx[kx] = w * dh11 + z__ * dh12;
+	dy[ky] = w * dh21 + z__ * dh22;
+	kx += *incx;
+	ky += *incy;
+/* L130: */
+    }
+L140:
+    return 0;
+} /* drotm_ */
+

diff --git a/blas/f2c/drotmg.c b/blas/f2c/drotmg.c
new file mode 100644
index 0000000..a63eb10
--- /dev/null
+++ b/blas/f2c/drotmg.c

@@ -0,0 +1,293 @@
+/* drotmg.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int drotmg_(doublereal *dd1, doublereal *dd2, doublereal *
+	dx1, doublereal *dy1, doublereal *dparam)
+{
+    /* Initialized data */
+
+    static doublereal zero = 0.;
+    static doublereal one = 1.;
+    static doublereal two = 2.;
+    static doublereal gam = 4096.;
+    static doublereal gamsq = 16777216.;
+    static doublereal rgamsq = 5.9604645e-8;
+
+    /* Format strings */
+    static char fmt_120[] = "";
+    static char fmt_150[] = "";
+    static char fmt_180[] = "";
+    static char fmt_210[] = "";
+
+    /* System generated locals */
+    doublereal d__1;
+
+    /* Local variables */
+    doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
+    integer igo;
+    doublereal dflag, dtemp;
+
+    /* Assigned format variables */
+    static char *igo_fmt;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
+/*     THE SECOND COMPONENT OF THE 2-VECTOR  (DSQRT(DD1)*DX1,DSQRT(DD2)* */
+/*     DY2)**T. */
+/*     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
+
+/*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0 */
+
+/*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0) */
+/*     H=(          )    (          )    (          )    (          ) */
+/*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0). */
+/*     LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
+/*     RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
+/*     VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */
+
+/*     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
+/*     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
+/*     OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
+
+
+/*  Arguments */
+/*  ========= */
+
+/*  DD1    (input/output) DOUBLE PRECISION */
+
+/*  DD2    (input/output) DOUBLE PRECISION */
+
+/*  DX1    (input/output) DOUBLE PRECISION */
+
+/*  DY1    (input) DOUBLE PRECISION */
+
+/*  DPARAM (input/output)  DOUBLE PRECISION array, dimension 5 */
+/*     DPARAM(1)=DFLAG */
+/*     DPARAM(2)=DH11 */
+/*     DPARAM(3)=DH21 */
+/*     DPARAM(4)=DH12 */
+/*     DPARAM(5)=DH22 */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Data statements .. */
+
+    /* Parameter adjustments */
+    --dparam;
+
+    /* Function Body */
+/*     .. */
+    if (! (*dd1 < zero)) {
+	goto L10;
+    }
+/*       GO ZERO-H-D-AND-DX1.. */
+    goto L60;
+L10:
+/*     CASE-DD1-NONNEGATIVE */
+    dp2 = *dd2 * *dy1;
+    if (! (dp2 == zero)) {
+	goto L20;
+    }
+    dflag = -two;
+    goto L260;
+/*     REGULAR-CASE.. */
+L20:
+    dp1 = *dd1 * *dx1;
+    dq2 = dp2 * *dy1;
+    dq1 = dp1 * *dx1;
+
+    if (! (abs(dq1) > abs(dq2))) {
+	goto L40;
+    }
+    dh21 = -(*dy1) / *dx1;
+    dh12 = dp2 / dp1;
+
+    du = one - dh12 * dh21;
+
+    if (! (du <= zero)) {
+	goto L30;
+    }
+/*         GO ZERO-H-D-AND-DX1.. */
+    goto L60;
+L30:
+    dflag = zero;
+    *dd1 /= du;
+    *dd2 /= du;
+    *dx1 *= du;
+/*         GO SCALE-CHECK.. */
+    goto L100;
+L40:
+    if (! (dq2 < zero)) {
+	goto L50;
+    }
+/*         GO ZERO-H-D-AND-DX1.. */
+    goto L60;
+L50:
+    dflag = one;
+    dh11 = dp1 / dp2;
+    dh22 = *dx1 / *dy1;
+    du = one + dh11 * dh22;
+    dtemp = *dd2 / du;
+    *dd2 = *dd1 / du;
+    *dd1 = dtemp;
+    *dx1 = *dy1 * du;
+/*         GO SCALE-CHECK */
+    goto L100;
+/*     PROCEDURE..ZERO-H-D-AND-DX1.. */
+L60:
+    dflag = -one;
+    dh11 = zero;
+    dh12 = zero;
+    dh21 = zero;
+    dh22 = zero;
+
+    *dd1 = zero;
+    *dd2 = zero;
+    *dx1 = zero;
+/*         RETURN.. */
+    goto L220;
+/*     PROCEDURE..FIX-H.. */
+L70:
+    if (! (dflag >= zero)) {
+	goto L90;
+    }
+
+    if (! (dflag == zero)) {
+	goto L80;
+    }
+    dh11 = one;
+    dh22 = one;
+    dflag = -one;
+    goto L90;
+L80:
+    dh21 = -one;
+    dh12 = one;
+    dflag = -one;
+L90:
+    switch (igo) {
+	case 0: goto L120;
+	case 1: goto L150;
+	case 2: goto L180;
+	case 3: goto L210;
+    }
+/*     PROCEDURE..SCALE-CHECK */
+L100:
+L110:
+    if (! (*dd1 <= rgamsq)) {
+	goto L130;
+    }
+    if (*dd1 == zero) {
+	goto L160;
+    }
+    igo = 0;
+    igo_fmt = fmt_120;
+/*              FIX-H.. */
+    goto L70;
+L120:
+/* Computing 2nd power */
+    d__1 = gam;
+    *dd1 *= d__1 * d__1;
+    *dx1 /= gam;
+    dh11 /= gam;
+    dh12 /= gam;
+    goto L110;
+L130:
+L140:
+    if (! (*dd1 >= gamsq)) {
+	goto L160;
+    }
+    igo = 1;
+    igo_fmt = fmt_150;
+/*              FIX-H.. */
+    goto L70;
+L150:
+/* Computing 2nd power */
+    d__1 = gam;
+    *dd1 /= d__1 * d__1;
+    *dx1 *= gam;
+    dh11 *= gam;
+    dh12 *= gam;
+    goto L140;
+L160:
+L170:
+    if (! (abs(*dd2) <= rgamsq)) {
+	goto L190;
+    }
+    if (*dd2 == zero) {
+	goto L220;
+    }
+    igo = 2;
+    igo_fmt = fmt_180;
+/*              FIX-H.. */
+    goto L70;
+L180:
+/* Computing 2nd power */
+    d__1 = gam;
+    *dd2 *= d__1 * d__1;
+    dh21 /= gam;
+    dh22 /= gam;
+    goto L170;
+L190:
+L200:
+    if (! (abs(*dd2) >= gamsq)) {
+	goto L220;
+    }
+    igo = 3;
+    igo_fmt = fmt_210;
+/*              FIX-H.. */
+    goto L70;
+L210:
+/* Computing 2nd power */
+    d__1 = gam;
+    *dd2 /= d__1 * d__1;
+    dh21 *= gam;
+    dh22 *= gam;
+    goto L200;
+L220:
+    if (dflag < 0.) {
+	goto L250;
+    } else if (dflag == 0) {
+	goto L230;
+    } else {
+	goto L240;
+    }
+L230:
+    dparam[3] = dh21;
+    dparam[4] = dh12;
+    goto L260;
+L240:
+    dparam[2] = dh11;
+    dparam[5] = dh22;
+    goto L260;
+L250:
+    dparam[2] = dh11;
+    dparam[3] = dh21;
+    dparam[4] = dh12;
+    dparam[5] = dh22;
+L260:
+    dparam[1] = dflag;
+    return 0;
+} /* drotmg_ */
+

diff --git a/blas/f2c/dsbmv.c b/blas/f2c/dsbmv.c
new file mode 100644
index 0000000..c6b4b21
--- /dev/null
+++ b/blas/f2c/dsbmv.c

@@ -0,0 +1,366 @@
+/* dsbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int dsbmv_(char *uplo, integer *n, integer *k, doublereal *
+	alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, 
+	doublereal *beta, doublereal *y, integer *incy, ftnlen uplo_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
+    doublereal temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DSBMV  performs the matrix-vector  operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n symmetric band matrix, with k super-diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the band matrix A is being supplied as */
+/*           follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  being supplied. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  being supplied. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry, K specifies the number of super-diagonals of the */
+/*           matrix A. K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the symmetric matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer the upper */
+/*           triangular part of a symmetric band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the symmetric matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer the lower */
+/*           triangular part of a symmetric band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the */
+/*           vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - DOUBLE PRECISION. */
+/*           On entry, BETA specifies the scalar beta. */
+/*           Unchanged on exit. */
+
+/*  Y      - DOUBLE PRECISION array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the */
+/*           vector y. On exit, Y is overwritten by the updated vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+    --y;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*k < 0) {
+	info = 3;
+    } else if (*lda < *k + 1) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("DSBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array A */
+/*     are accessed sequentially with one pass through A. */
+
+/*     First form  y := beta*y. */
+
+    if (*beta != 1.) {
+	if (*incy == 1) {
+	    if (*beta == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = 0.;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = *beta * y[i__];
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = 0.;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = *beta * y[iy];
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.) {
+	return 0;
+    }
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when upper triangle of A is stored. */
+
+	kplus1 = *k + 1;
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__2 = 1, i__3 = j - *k;
+		i__4 = j - 1;
+		for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+		    y[i__] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[i__];
+/* L50: */
+		}
+		y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.;
+		ix = kx;
+		iy = ky;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__4 = 1, i__2 = j - *k;
+		i__3 = j - 1;
+		for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+		    y[iy] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[ix];
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * 
+			temp2;
+		jx += *incx;
+		jy += *incy;
+		if (j > *k) {
+		    kx += *incx;
+		    ky += *incy;
+		}
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when lower triangle of A is stored. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.;
+		y[j] += temp1 * a[j * a_dim1 + 1];
+		l = 1 - j;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    y[i__] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[i__];
+/* L90: */
+		}
+		y[j] += *alpha * temp2;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.;
+		y[jy] += temp1 * a[j * a_dim1 + 1];
+		l = 1 - j;
+		ix = jx;
+		iy = jy;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    ix += *incx;
+		    iy += *incy;
+		    y[iy] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[ix];
+/* L110: */
+		}
+		y[jy] += *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DSBMV . */
+
+} /* dsbmv_ */
+

diff --git a/blas/f2c/dspmv.c b/blas/f2c/dspmv.c
new file mode 100644
index 0000000..0b4e92d
--- /dev/null
+++ b/blas/f2c/dspmv.c

@@ -0,0 +1,316 @@
+/* dspmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int dspmv_(char *uplo, integer *n, doublereal *alpha, 
+	doublereal *ap, doublereal *x, integer *incx, doublereal *beta, 
+	doublereal *y, integer *incy, ftnlen uplo_len)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
+    doublereal temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DSPMV  performs the matrix-vector operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n symmetric matrix, supplied in packed form. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the matrix A is supplied in the packed */
+/*           array AP as follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  supplied in AP. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  supplied in AP. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  AP     - DOUBLE PRECISION array of DIMENSION at least */
+/*           ( ( n*( n + 1 ) )/2 ). */
+/*           Before entry with UPLO = 'U' or 'u', the array AP must */
+/*           contain the upper triangular part of the symmetric matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
+/*           and a( 2, 2 ) respectively, and so on. */
+/*           Before entry with UPLO = 'L' or 'l', the array AP must */
+/*           contain the lower triangular part of the symmetric matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
+/*           and a( 3, 1 ) respectively, and so on. */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - DOUBLE PRECISION. */
+/*           On entry, BETA specifies the scalar beta. When BETA is */
+/*           supplied as zero then Y need not be set on input. */
+/*           Unchanged on exit. */
+
+/*  Y      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the n */
+/*           element vector y. On exit, Y is overwritten by the updated */
+/*           vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    --y;
+    --x;
+    --ap;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 6;
+    } else if (*incy == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("DSPMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array AP */
+/*     are accessed sequentially with one pass through AP. */
+
+/*     First form  y := beta*y. */
+
+    if (*beta != 1.) {
+	if (*incy == 1) {
+	    if (*beta == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = 0.;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = *beta * y[i__];
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = 0.;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = *beta * y[iy];
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.) {
+	return 0;
+    }
+    kk = 1;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when AP contains the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.;
+		k = kk;
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    y[i__] += temp1 * ap[k];
+		    temp2 += ap[k] * x[i__];
+		    ++k;
+/* L50: */
+		}
+		y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
+		kk += j;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.;
+		ix = kx;
+		iy = ky;
+		i__2 = kk + j - 2;
+		for (k = kk; k <= i__2; ++k) {
+		    y[iy] += temp1 * ap[k];
+		    temp2 += ap[k] * x[ix];
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+		kk += j;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when AP contains the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.;
+		y[j] += temp1 * ap[kk];
+		k = kk + 1;
+		i__2 = *n;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    y[i__] += temp1 * ap[k];
+		    temp2 += ap[k] * x[i__];
+		    ++k;
+/* L90: */
+		}
+		y[j] += *alpha * temp2;
+		kk += *n - j + 1;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.;
+		y[jy] += temp1 * ap[kk];
+		ix = jx;
+		iy = jy;
+		i__2 = kk + *n - j;
+		for (k = kk + 1; k <= i__2; ++k) {
+		    ix += *incx;
+		    iy += *incy;
+		    y[iy] += temp1 * ap[k];
+		    temp2 += ap[k] * x[ix];
+/* L110: */
+		}
+		y[jy] += *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+		kk += *n - j + 1;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DSPMV . */
+
+} /* dspmv_ */
+

diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c
new file mode 100644
index 0000000..aa67d19
--- /dev/null
+++ b/blas/f2c/dtbmv.c

@@ -0,0 +1,428 @@
+/* dtbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int dtbmv_(char *uplo, char *trans, char *diag, integer *n, 
+	integer *k, doublereal *a, integer *lda, doublereal *x, integer *incx,
+	 ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    integer i__, j, l, ix, jx, kx, info;
+    doublereal temp;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+    logical nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DTBMV  performs one of the matrix-vector operations */
+
+/*     x := A*x,   or   x := A'*x, */
+
+/*  where x is an n element vector and  A is an n by n unit, or non-unit, */
+/*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANS  - CHARACTER*1. */
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := A'*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+	    ftnlen)1)) {
+	info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, 
+	    "N", (ftnlen)1, (ftnlen)1)) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < *k + 1) {
+	info = 7;
+    } else if (*incx == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("DTBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be  ( N - 1 )*INCX   too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+    if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
+
+/*         Form  x := A*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (x[j] != 0.) {
+			temp = x[j];
+			l = kplus1 - j;
+/* Computing MAX */
+			i__2 = 1, i__3 = j - *k;
+			i__4 = j - 1;
+			for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+			    x[i__] += temp * a[l + i__ + j * a_dim1];
+/* L10: */
+			}
+			if (nounit) {
+			    x[j] *= a[kplus1 + j * a_dim1];
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (x[jx] != 0.) {
+			temp = x[jx];
+			ix = kx;
+			l = kplus1 - j;
+/* Computing MAX */
+			i__4 = 1, i__2 = j - *k;
+			i__3 = j - 1;
+			for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+			    x[ix] += temp * a[l + i__ + j * a_dim1];
+			    ix += *incx;
+/* L30: */
+			}
+			if (nounit) {
+			    x[jx] *= a[kplus1 + j * a_dim1];
+			}
+		    }
+		    jx += *incx;
+		    if (j > *k) {
+			kx += *incx;
+		    }
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (x[j] != 0.) {
+			temp = x[j];
+			l = 1 - j;
+/* Computing MIN */
+			i__1 = *n, i__3 = j + *k;
+			i__4 = j + 1;
+			for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
+			    x[i__] += temp * a[l + i__ + j * a_dim1];
+/* L50: */
+			}
+			if (nounit) {
+			    x[j] *= a[j * a_dim1 + 1];
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    if (x[jx] != 0.) {
+			temp = x[jx];
+			ix = kx;
+			l = 1 - j;
+/* Computing MIN */
+			i__4 = *n, i__1 = j + *k;
+			i__3 = j + 1;
+			for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
+			    x[ix] += temp * a[l + i__ + j * a_dim1];
+			    ix -= *incx;
+/* L70: */
+			}
+			if (nounit) {
+			    x[jx] *= a[j * a_dim1 + 1];
+			}
+		    }
+		    jx -= *incx;
+		    if (*n - j >= *k) {
+			kx -= *incx;
+		    }
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := A'*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp = x[j];
+		    l = kplus1 - j;
+		    if (nounit) {
+			temp *= a[kplus1 + j * a_dim1];
+		    }
+/* Computing MAX */
+		    i__4 = 1, i__1 = j - *k;
+		    i__3 = max(i__4,i__1);
+		    for (i__ = j - 1; i__ >= i__3; --i__) {
+			temp += a[l + i__ + j * a_dim1] * x[i__];
+/* L90: */
+		    }
+		    x[j] = temp;
+/* L100: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    temp = x[jx];
+		    kx -= *incx;
+		    ix = kx;
+		    l = kplus1 - j;
+		    if (nounit) {
+			temp *= a[kplus1 + j * a_dim1];
+		    }
+/* Computing MAX */
+		    i__4 = 1, i__1 = j - *k;
+		    i__3 = max(i__4,i__1);
+		    for (i__ = j - 1; i__ >= i__3; --i__) {
+			temp += a[l + i__ + j * a_dim1] * x[ix];
+			ix -= *incx;
+/* L110: */
+		    }
+		    x[jx] = temp;
+		    jx -= *incx;
+/* L120: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    temp = x[j];
+		    l = 1 - j;
+		    if (nounit) {
+			temp *= a[j * a_dim1 + 1];
+		    }
+/* Computing MIN */
+		    i__1 = *n, i__2 = j + *k;
+		    i__4 = min(i__1,i__2);
+		    for (i__ = j + 1; i__ <= i__4; ++i__) {
+			temp += a[l + i__ + j * a_dim1] * x[i__];
+/* L130: */
+		    }
+		    x[j] = temp;
+/* L140: */
+		}
+	    } else {
+		jx = kx;
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    temp = x[jx];
+		    kx += *incx;
+		    ix = kx;
+		    l = 1 - j;
+		    if (nounit) {
+			temp *= a[j * a_dim1 + 1];
+		    }
+/* Computing MIN */
+		    i__1 = *n, i__2 = j + *k;
+		    i__4 = min(i__1,i__2);
+		    for (i__ = j + 1; i__ <= i__4; ++i__) {
+			temp += a[l + i__ + j * a_dim1] * x[ix];
+			ix += *incx;
+/* L150: */
+		    }
+		    x[jx] = temp;
+		    jx += *incx;
+/* L160: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DTBMV . */
+
+} /* dtbmv_ */
+

diff --git a/blas/f2c/lsame.c b/blas/f2c/lsame.c
new file mode 100644
index 0000000..46324d9
--- /dev/null
+++ b/blas/f2c/lsame.c

@@ -0,0 +1,117 @@
+/* lsame.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+logical lsame_(char *ca, char *cb, ftnlen ca_len, ftnlen cb_len)
+{
+    /* System generated locals */
+    logical ret_val;
+
+    /* Local variables */
+    integer inta, intb, zcode;
+
+
+/*  -- LAPACK auxiliary routine (version 3.1) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  LSAME returns .TRUE. if CA is the same letter as CB regardless of */
+/*  case. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  CA      (input) CHARACTER*1 */
+
+/*  CB      (input) CHARACTER*1 */
+/*          CA and CB specify the single characters to be compared. */
+
+/* ===================================================================== */
+
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+
+/*     Test if the characters are equal */
+
+    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
+    if (ret_val) {
+	return ret_val;
+    }
+
+/*     Now test for equivalence if both characters are alphabetic. */
+
+    zcode = 'Z';
+
+/*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
+/*     machines, on which ICHAR returns a value with bit 8 set. */
+/*     ICHAR('A') on Prime machines returns 193 which is the same as */
+/*     ICHAR('A') on an EBCDIC machine. */
+
+    inta = *(unsigned char *)ca;
+    intb = *(unsigned char *)cb;
+
+    if (zcode == 90 || zcode == 122) {
+
+/*        ASCII is assumed - ZCODE is the ASCII code of either lower or */
+/*        upper case 'Z'. */
+
+	if (inta >= 97 && inta <= 122) {
+	    inta += -32;
+	}
+	if (intb >= 97 && intb <= 122) {
+	    intb += -32;
+	}
+
+    } else if (zcode == 233 || zcode == 169) {
+
+/*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
+/*        upper case 'Z'. */
+
+	if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || 
+            (inta >= 162 && inta <= 169)) {
+	    inta += 64;
+	}
+	if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || 
+            (intb >= 162 && intb <= 169)) {
+	    intb += 64;
+	}
+
+    } else if (zcode == 218 || zcode == 250) {
+
+/*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
+/*        plus 128 of either lower or upper case 'Z'. */
+
+	if (inta >= 225 && inta <= 250) {
+	    inta += -32;
+	}
+	if (intb >= 225 && intb <= 250) {
+	    intb += -32;
+	}
+    }
+    ret_val = inta == intb;
+
+/*     RETURN */
+
+/*     End of LSAME */
+
+    return ret_val;
+} /* lsame_ */
+

diff --git a/blas/f2c/r_cnjg.c b/blas/f2c/r_cnjg.c
new file mode 100644
index 0000000..c08182f
--- /dev/null
+++ b/blas/f2c/r_cnjg.c

@@ -0,0 +1,6 @@
+#include "datatypes.h"    
+
+void r_cnjg(complex *r, complex *z) {
+    r->r = z->r;
+    r->i = -(z->i);
+}

diff --git a/blas/f2c/srotm.c b/blas/f2c/srotm.c
new file mode 100644
index 0000000..bd5944a
--- /dev/null
+++ b/blas/f2c/srotm.c

@@ -0,0 +1,216 @@
+/* srotm.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int srotm_(integer *n, real *sx, integer *incx, real *sy, 
+	integer *incy, real *sparam)
+{
+    /* Initialized data */
+
+    static real zero = 0.f;
+    static real two = 2.f;
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i__;
+    real w, z__;
+    integer kx, ky;
+    real sh11, sh12, sh21, sh22, sflag;
+    integer nsteps;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
+
+/*     (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
+/*     (DX**T) */
+
+/*     SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
+/*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. */
+/*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
+
+/*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0 */
+
+/*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0) */
+/*     H=(          )    (          )    (          )    (          ) */
+/*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0). */
+/*     SEE  SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */
+
+
+/*  Arguments */
+/*  ========= */
+
+/*  N      (input) INTEGER */
+/*         number of elements in input vector(s) */
+
+/*  SX     (input/output) REAL array, dimension N */
+/*         double precision vector with N elements */
+
+/*  INCX   (input) INTEGER */
+/*         storage spacing between elements of SX */
+
+/*  SY     (input/output) REAL array, dimension N */
+/*         double precision vector with N elements */
+
+/*  INCY   (input) INTEGER */
+/*         storage spacing between elements of SY */
+
+/*  SPARAM (input/output)  REAL array, dimension 5 */
+/*     SPARAM(1)=SFLAG */
+/*     SPARAM(2)=SH11 */
+/*     SPARAM(3)=SH21 */
+/*     SPARAM(4)=SH12 */
+/*     SPARAM(5)=SH22 */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Data statements .. */
+    /* Parameter adjustments */
+    --sparam;
+    --sy;
+    --sx;
+
+    /* Function Body */
+/*     .. */
+
+    sflag = sparam[1];
+    if (*n <= 0 || sflag + two == zero) {
+	goto L140;
+    }
+    if (! (*incx == *incy && *incx > 0)) {
+	goto L70;
+    }
+
+    nsteps = *n * *incx;
+    if (sflag < 0.f) {
+	goto L50;
+    } else if (sflag == 0) {
+	goto L10;
+    } else {
+	goto L30;
+    }
+L10:
+    sh12 = sparam[4];
+    sh21 = sparam[3];
+    i__1 = nsteps;
+    i__2 = *incx;
+    for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	w = sx[i__];
+	z__ = sy[i__];
+	sx[i__] = w + z__ * sh12;
+	sy[i__] = w * sh21 + z__;
+/* L20: */
+    }
+    goto L140;
+L30:
+    sh11 = sparam[2];
+    sh22 = sparam[5];
+    i__2 = nsteps;
+    i__1 = *incx;
+    for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+	w = sx[i__];
+	z__ = sy[i__];
+	sx[i__] = w * sh11 + z__;
+	sy[i__] = -w + sh22 * z__;
+/* L40: */
+    }
+    goto L140;
+L50:
+    sh11 = sparam[2];
+    sh12 = sparam[4];
+    sh21 = sparam[3];
+    sh22 = sparam[5];
+    i__1 = nsteps;
+    i__2 = *incx;
+    for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	w = sx[i__];
+	z__ = sy[i__];
+	sx[i__] = w * sh11 + z__ * sh12;
+	sy[i__] = w * sh21 + z__ * sh22;
+/* L60: */
+    }
+    goto L140;
+L70:
+    kx = 1;
+    ky = 1;
+    if (*incx < 0) {
+	kx = (1 - *n) * *incx + 1;
+    }
+    if (*incy < 0) {
+	ky = (1 - *n) * *incy + 1;
+    }
+
+    if (sflag < 0.f) {
+	goto L120;
+    } else if (sflag == 0) {
+	goto L80;
+    } else {
+	goto L100;
+    }
+L80:
+    sh12 = sparam[4];
+    sh21 = sparam[3];
+    i__2 = *n;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	w = sx[kx];
+	z__ = sy[ky];
+	sx[kx] = w + z__ * sh12;
+	sy[ky] = w * sh21 + z__;
+	kx += *incx;
+	ky += *incy;
+/* L90: */
+    }
+    goto L140;
+L100:
+    sh11 = sparam[2];
+    sh22 = sparam[5];
+    i__2 = *n;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	w = sx[kx];
+	z__ = sy[ky];
+	sx[kx] = w * sh11 + z__;
+	sy[ky] = -w + sh22 * z__;
+	kx += *incx;
+	ky += *incy;
+/* L110: */
+    }
+    goto L140;
+L120:
+    sh11 = sparam[2];
+    sh12 = sparam[4];
+    sh21 = sparam[3];
+    sh22 = sparam[5];
+    i__2 = *n;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	w = sx[kx];
+	z__ = sy[ky];
+	sx[kx] = w * sh11 + z__ * sh12;
+	sy[ky] = w * sh21 + z__ * sh22;
+	kx += *incx;
+	ky += *incy;
+/* L130: */
+    }
+L140:
+    return 0;
+} /* srotm_ */
+

diff --git a/blas/f2c/srotmg.c b/blas/f2c/srotmg.c
new file mode 100644
index 0000000..75f789f
--- /dev/null
+++ b/blas/f2c/srotmg.c

@@ -0,0 +1,295 @@
+/* srotmg.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real 
+	*sparam)
+{
+    /* Initialized data */
+
+    static real zero = 0.f;
+    static real one = 1.f;
+    static real two = 2.f;
+    static real gam = 4096.f;
+    static real gamsq = 16777200.f;
+    static real rgamsq = 5.96046e-8f;
+
+    /* Format strings */
+    static char fmt_120[] = "";
+    static char fmt_150[] = "";
+    static char fmt_180[] = "";
+    static char fmt_210[] = "";
+
+    /* System generated locals */
+    real r__1;
+
+    /* Local variables */
+    real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
+    integer igo;
+    real sflag, stemp;
+
+    /* Assigned format variables */
+    static char *igo_fmt;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
+/*     THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(SD1)*SX1,SQRT(SD2)* */
+/*     SY2)**T. */
+/*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
+
+/*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0 */
+
+/*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0) */
+/*     H=(          )    (          )    (          )    (          ) */
+/*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0). */
+/*     LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
+/*     RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
+/*     VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */
+
+/*     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
+/*     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
+/*     OF SD1 AND SD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
+
+
+/*  Arguments */
+/*  ========= */
+
+
+/*  SD1    (input/output) REAL */
+
+/*  SD2    (input/output) REAL */
+
+/*  SX1    (input/output) REAL */
+
+/*  SY1    (input) REAL */
+
+
+/*  SPARAM (input/output)  REAL array, dimension 5 */
+/*     SPARAM(1)=SFLAG */
+/*     SPARAM(2)=SH11 */
+/*     SPARAM(3)=SH21 */
+/*     SPARAM(4)=SH12 */
+/*     SPARAM(5)=SH22 */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Data statements .. */
+
+    /* Parameter adjustments */
+    --sparam;
+
+    /* Function Body */
+/*     .. */
+    if (! (*sd1 < zero)) {
+	goto L10;
+    }
+/*       GO ZERO-H-D-AND-SX1.. */
+    goto L60;
+L10:
+/*     CASE-SD1-NONNEGATIVE */
+    sp2 = *sd2 * *sy1;
+    if (! (sp2 == zero)) {
+	goto L20;
+    }
+    sflag = -two;
+    goto L260;
+/*     REGULAR-CASE.. */
+L20:
+    sp1 = *sd1 * *sx1;
+    sq2 = sp2 * *sy1;
+    sq1 = sp1 * *sx1;
+
+    if (! (dabs(sq1) > dabs(sq2))) {
+	goto L40;
+    }
+    sh21 = -(*sy1) / *sx1;
+    sh12 = sp2 / sp1;
+
+    su = one - sh12 * sh21;
+
+    if (! (su <= zero)) {
+	goto L30;
+    }
+/*         GO ZERO-H-D-AND-SX1.. */
+    goto L60;
+L30:
+    sflag = zero;
+    *sd1 /= su;
+    *sd2 /= su;
+    *sx1 *= su;
+/*         GO SCALE-CHECK.. */
+    goto L100;
+L40:
+    if (! (sq2 < zero)) {
+	goto L50;
+    }
+/*         GO ZERO-H-D-AND-SX1.. */
+    goto L60;
+L50:
+    sflag = one;
+    sh11 = sp1 / sp2;
+    sh22 = *sx1 / *sy1;
+    su = one + sh11 * sh22;
+    stemp = *sd2 / su;
+    *sd2 = *sd1 / su;
+    *sd1 = stemp;
+    *sx1 = *sy1 * su;
+/*         GO SCALE-CHECK */
+    goto L100;
+/*     PROCEDURE..ZERO-H-D-AND-SX1.. */
+L60:
+    sflag = -one;
+    sh11 = zero;
+    sh12 = zero;
+    sh21 = zero;
+    sh22 = zero;
+
+    *sd1 = zero;
+    *sd2 = zero;
+    *sx1 = zero;
+/*         RETURN.. */
+    goto L220;
+/*     PROCEDURE..FIX-H.. */
+L70:
+    if (! (sflag >= zero)) {
+	goto L90;
+    }
+
+    if (! (sflag == zero)) {
+	goto L80;
+    }
+    sh11 = one;
+    sh22 = one;
+    sflag = -one;
+    goto L90;
+L80:
+    sh21 = -one;
+    sh12 = one;
+    sflag = -one;
+L90:
+    switch (igo) {
+	case 0: goto L120;
+	case 1: goto L150;
+	case 2: goto L180;
+	case 3: goto L210;
+    }
+/*     PROCEDURE..SCALE-CHECK */
+L100:
+L110:
+    if (! (*sd1 <= rgamsq)) {
+	goto L130;
+    }
+    if (*sd1 == zero) {
+	goto L160;
+    }
+    igo = 0;
+    igo_fmt = fmt_120;
+/*              FIX-H.. */
+    goto L70;
+L120:
+/* Computing 2nd power */
+    r__1 = gam;
+    *sd1 *= r__1 * r__1;
+    *sx1 /= gam;
+    sh11 /= gam;
+    sh12 /= gam;
+    goto L110;
+L130:
+L140:
+    if (! (*sd1 >= gamsq)) {
+	goto L160;
+    }
+    igo = 1;
+    igo_fmt = fmt_150;
+/*              FIX-H.. */
+    goto L70;
+L150:
+/* Computing 2nd power */
+    r__1 = gam;
+    *sd1 /= r__1 * r__1;
+    *sx1 *= gam;
+    sh11 *= gam;
+    sh12 *= gam;
+    goto L140;
+L160:
+L170:
+    if (! (dabs(*sd2) <= rgamsq)) {
+	goto L190;
+    }
+    if (*sd2 == zero) {
+	goto L220;
+    }
+    igo = 2;
+    igo_fmt = fmt_180;
+/*              FIX-H.. */
+    goto L70;
+L180:
+/* Computing 2nd power */
+    r__1 = gam;
+    *sd2 *= r__1 * r__1;
+    sh21 /= gam;
+    sh22 /= gam;
+    goto L170;
+L190:
+L200:
+    if (! (dabs(*sd2) >= gamsq)) {
+	goto L220;
+    }
+    igo = 3;
+    igo_fmt = fmt_210;
+/*              FIX-H.. */
+    goto L70;
+L210:
+/* Computing 2nd power */
+    r__1 = gam;
+    *sd2 /= r__1 * r__1;
+    sh21 *= gam;
+    sh22 *= gam;
+    goto L200;
+L220:
+    if (sflag < 0.f) {
+	goto L250;
+    } else if (sflag == 0) {
+	goto L230;
+    } else {
+	goto L240;
+    }
+L230:
+    sparam[3] = sh21;
+    sparam[4] = sh12;
+    goto L260;
+L240:
+    sparam[2] = sh11;
+    sparam[5] = sh22;
+    goto L260;
+L250:
+    sparam[2] = sh11;
+    sparam[3] = sh21;
+    sparam[4] = sh12;
+    sparam[5] = sh22;
+L260:
+    sparam[1] = sflag;
+    return 0;
+} /* srotmg_ */
+

diff --git a/blas/f2c/ssbmv.c b/blas/f2c/ssbmv.c
new file mode 100644
index 0000000..8599325
--- /dev/null
+++ b/blas/f2c/ssbmv.c

@@ -0,0 +1,368 @@
+/* ssbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int ssbmv_(char *uplo, integer *n, integer *k, real *alpha, 
+	real *a, integer *lda, real *x, integer *incx, real *beta, real *y, 
+	integer *incy, ftnlen uplo_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
+    real temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  SSBMV  performs the matrix-vector  operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n symmetric band matrix, with k super-diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the band matrix A is being supplied as */
+/*           follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  being supplied. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  being supplied. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry, K specifies the number of super-diagonals of the */
+/*           matrix A. K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - REAL            . */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  A      - REAL             array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the symmetric matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer the upper */
+/*           triangular part of a symmetric band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the symmetric matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer the lower */
+/*           triangular part of a symmetric band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - REAL             array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the */
+/*           vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - REAL            . */
+/*           On entry, BETA specifies the scalar beta. */
+/*           Unchanged on exit. */
+
+/*  Y      - REAL             array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the */
+/*           vector y. On exit, Y is overwritten by the updated vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+    --y;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*k < 0) {
+	info = 3;
+    } else if (*lda < *k + 1) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("SSBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array A */
+/*     are accessed sequentially with one pass through A. */
+
+/*     First form  y := beta*y. */
+
+    if (*beta != 1.f) {
+	if (*incy == 1) {
+	    if (*beta == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = *beta * y[i__];
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = *beta * y[iy];
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.f) {
+	return 0;
+    }
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when upper triangle of A is stored. */
+
+	kplus1 = *k + 1;
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.f;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__2 = 1, i__3 = j - *k;
+		i__4 = j - 1;
+		for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+		    y[i__] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[i__];
+/* L50: */
+		}
+		y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.f;
+		ix = kx;
+		iy = ky;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__4 = 1, i__2 = j - *k;
+		i__3 = j - 1;
+		for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+		    y[iy] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[ix];
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * 
+			temp2;
+		jx += *incx;
+		jy += *incy;
+		if (j > *k) {
+		    kx += *incx;
+		    ky += *incy;
+		}
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when lower triangle of A is stored. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.f;
+		y[j] += temp1 * a[j * a_dim1 + 1];
+		l = 1 - j;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    y[i__] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[i__];
+/* L90: */
+		}
+		y[j] += *alpha * temp2;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.f;
+		y[jy] += temp1 * a[j * a_dim1 + 1];
+		l = 1 - j;
+		ix = jx;
+		iy = jy;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    ix += *incx;
+		    iy += *incy;
+		    y[iy] += temp1 * a[l + i__ + j * a_dim1];
+		    temp2 += a[l + i__ + j * a_dim1] * x[ix];
+/* L110: */
+		}
+		y[jy] += *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of SSBMV . */
+
+} /* ssbmv_ */
+

diff --git a/blas/f2c/sspmv.c b/blas/f2c/sspmv.c
new file mode 100644
index 0000000..47858ec
--- /dev/null
+++ b/blas/f2c/sspmv.c

@@ -0,0 +1,316 @@
+/* sspmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int sspmv_(char *uplo, integer *n, real *alpha, real *ap, 
+	real *x, integer *incx, real *beta, real *y, integer *incy, ftnlen 
+	uplo_len)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
+    real temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  SSPMV  performs the matrix-vector operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n symmetric matrix, supplied in packed form. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the matrix A is supplied in the packed */
+/*           array AP as follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  supplied in AP. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  supplied in AP. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - REAL            . */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  AP     - REAL             array of DIMENSION at least */
+/*           ( ( n*( n + 1 ) )/2 ). */
+/*           Before entry with UPLO = 'U' or 'u', the array AP must */
+/*           contain the upper triangular part of the symmetric matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
+/*           and a( 2, 2 ) respectively, and so on. */
+/*           Before entry with UPLO = 'L' or 'l', the array AP must */
+/*           contain the lower triangular part of the symmetric matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
+/*           and a( 3, 1 ) respectively, and so on. */
+/*           Unchanged on exit. */
+
+/*  X      - REAL             array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - REAL            . */
+/*           On entry, BETA specifies the scalar beta. When BETA is */
+/*           supplied as zero then Y need not be set on input. */
+/*           Unchanged on exit. */
+
+/*  Y      - REAL             array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the n */
+/*           element vector y. On exit, Y is overwritten by the updated */
+/*           vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    --y;
+    --x;
+    --ap;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 6;
+    } else if (*incy == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("SSPMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array AP */
+/*     are accessed sequentially with one pass through AP. */
+
+/*     First form  y := beta*y. */
+
+    if (*beta != 1.f) {
+	if (*incy == 1) {
+	    if (*beta == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = 0.f;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = *beta * y[i__];
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.f) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = 0.f;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = *beta * y[iy];
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.f) {
+	return 0;
+    }
+    kk = 1;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when AP contains the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.f;
+		k = kk;
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    y[i__] += temp1 * ap[k];
+		    temp2 += ap[k] * x[i__];
+		    ++k;
+/* L50: */
+		}
+		y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
+		kk += j;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.f;
+		ix = kx;
+		iy = ky;
+		i__2 = kk + j - 2;
+		for (k = kk; k <= i__2; ++k) {
+		    y[iy] += temp1 * ap[k];
+		    temp2 += ap[k] * x[ix];
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+		kk += j;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when AP contains the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[j];
+		temp2 = 0.f;
+		y[j] += temp1 * ap[kk];
+		k = kk + 1;
+		i__2 = *n;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    y[i__] += temp1 * ap[k];
+		    temp2 += ap[k] * x[i__];
+		    ++k;
+/* L90: */
+		}
+		y[j] += *alpha * temp2;
+		kk += *n - j + 1;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp1 = *alpha * x[jx];
+		temp2 = 0.f;
+		y[jy] += temp1 * ap[kk];
+		ix = jx;
+		iy = jy;
+		i__2 = kk + *n - j;
+		for (k = kk + 1; k <= i__2; ++k) {
+		    ix += *incx;
+		    iy += *incy;
+		    y[iy] += temp1 * ap[k];
+		    temp2 += ap[k] * x[ix];
+/* L110: */
+		}
+		y[jy] += *alpha * temp2;
+		jx += *incx;
+		jy += *incy;
+		kk += *n - j + 1;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of SSPMV . */
+
+} /* sspmv_ */
+

diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c
new file mode 100644
index 0000000..b5a68b5
--- /dev/null
+++ b/blas/f2c/stbmv.c

@@ -0,0 +1,428 @@
+/* stbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int stbmv_(char *uplo, char *trans, char *diag, integer *n, 
+	integer *k, real *a, integer *lda, real *x, integer *incx, ftnlen 
+	uplo_len, ftnlen trans_len, ftnlen diag_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    integer i__, j, l, ix, jx, kx, info;
+    real temp;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+    logical nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  STBMV  performs one of the matrix-vector operations */
+
+/*     x := A*x,   or   x := A'*x, */
+
+/*  where x is an n element vector and  A is an n by n unit, or non-unit, */
+/*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANS  - CHARACTER*1. */
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := A'*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - REAL             array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - REAL             array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+	    ftnlen)1)) {
+	info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, 
+	    "N", (ftnlen)1, (ftnlen)1)) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < *k + 1) {
+	info = 7;
+    } else if (*incx == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("STBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be  ( N - 1 )*INCX   too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+    if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
+
+/*         Form  x := A*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (x[j] != 0.f) {
+			temp = x[j];
+			l = kplus1 - j;
+/* Computing MAX */
+			i__2 = 1, i__3 = j - *k;
+			i__4 = j - 1;
+			for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+			    x[i__] += temp * a[l + i__ + j * a_dim1];
+/* L10: */
+			}
+			if (nounit) {
+			    x[j] *= a[kplus1 + j * a_dim1];
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (x[jx] != 0.f) {
+			temp = x[jx];
+			ix = kx;
+			l = kplus1 - j;
+/* Computing MAX */
+			i__4 = 1, i__2 = j - *k;
+			i__3 = j - 1;
+			for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+			    x[ix] += temp * a[l + i__ + j * a_dim1];
+			    ix += *incx;
+/* L30: */
+			}
+			if (nounit) {
+			    x[jx] *= a[kplus1 + j * a_dim1];
+			}
+		    }
+		    jx += *incx;
+		    if (j > *k) {
+			kx += *incx;
+		    }
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (x[j] != 0.f) {
+			temp = x[j];
+			l = 1 - j;
+/* Computing MIN */
+			i__1 = *n, i__3 = j + *k;
+			i__4 = j + 1;
+			for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
+			    x[i__] += temp * a[l + i__ + j * a_dim1];
+/* L50: */
+			}
+			if (nounit) {
+			    x[j] *= a[j * a_dim1 + 1];
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    if (x[jx] != 0.f) {
+			temp = x[jx];
+			ix = kx;
+			l = 1 - j;
+/* Computing MIN */
+			i__4 = *n, i__1 = j + *k;
+			i__3 = j + 1;
+			for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
+			    x[ix] += temp * a[l + i__ + j * a_dim1];
+			    ix -= *incx;
+/* L70: */
+			}
+			if (nounit) {
+			    x[jx] *= a[j * a_dim1 + 1];
+			}
+		    }
+		    jx -= *incx;
+		    if (*n - j >= *k) {
+			kx -= *incx;
+		    }
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := A'*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp = x[j];
+		    l = kplus1 - j;
+		    if (nounit) {
+			temp *= a[kplus1 + j * a_dim1];
+		    }
+/* Computing MAX */
+		    i__4 = 1, i__1 = j - *k;
+		    i__3 = max(i__4,i__1);
+		    for (i__ = j - 1; i__ >= i__3; --i__) {
+			temp += a[l + i__ + j * a_dim1] * x[i__];
+/* L90: */
+		    }
+		    x[j] = temp;
+/* L100: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    temp = x[jx];
+		    kx -= *incx;
+		    ix = kx;
+		    l = kplus1 - j;
+		    if (nounit) {
+			temp *= a[kplus1 + j * a_dim1];
+		    }
+/* Computing MAX */
+		    i__4 = 1, i__1 = j - *k;
+		    i__3 = max(i__4,i__1);
+		    for (i__ = j - 1; i__ >= i__3; --i__) {
+			temp += a[l + i__ + j * a_dim1] * x[ix];
+			ix -= *incx;
+/* L110: */
+		    }
+		    x[jx] = temp;
+		    jx -= *incx;
+/* L120: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    temp = x[j];
+		    l = 1 - j;
+		    if (nounit) {
+			temp *= a[j * a_dim1 + 1];
+		    }
+/* Computing MIN */
+		    i__1 = *n, i__2 = j + *k;
+		    i__4 = min(i__1,i__2);
+		    for (i__ = j + 1; i__ <= i__4; ++i__) {
+			temp += a[l + i__ + j * a_dim1] * x[i__];
+/* L130: */
+		    }
+		    x[j] = temp;
+/* L140: */
+		}
+	    } else {
+		jx = kx;
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    temp = x[jx];
+		    kx += *incx;
+		    ix = kx;
+		    l = 1 - j;
+		    if (nounit) {
+			temp *= a[j * a_dim1 + 1];
+		    }
+/* Computing MIN */
+		    i__1 = *n, i__2 = j + *k;
+		    i__4 = min(i__1,i__2);
+		    for (i__ = j + 1; i__ <= i__4; ++i__) {
+			temp += a[l + i__ + j * a_dim1] * x[ix];
+			ix += *incx;
+/* L150: */
+		    }
+		    x[jx] = temp;
+		    jx += *incx;
+/* L160: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of STBMV . */
+
+} /* stbmv_ */
+

diff --git a/blas/f2c/zhbmv.c b/blas/f2c/zhbmv.c
new file mode 100644
index 0000000..42da13d
--- /dev/null
+++ b/blas/f2c/zhbmv.c

@@ -0,0 +1,488 @@
+/* zhbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex 
+	*alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *
+	incx, doublecomplex *beta, doublecomplex *y, integer *incy, ftnlen 
+	uplo_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+    doublereal d__1;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
+    doublecomplex temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  ZHBMV  performs the matrix-vector  operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n hermitian band matrix, with k super-diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the band matrix A is being supplied as */
+/*           follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  being supplied. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  being supplied. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry, K specifies the number of super-diagonals of the */
+/*           matrix A. K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - COMPLEX*16      . */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the hermitian matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer the upper */
+/*           triangular part of a hermitian band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the hermitian matrix, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer the lower */
+/*           triangular part of a hermitian band matrix from conventional */
+/*           full matrix storage to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that the imaginary parts of the diagonal elements need */
+/*           not be set and are assumed to be zero. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX*16       array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the */
+/*           vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - COMPLEX*16      . */
+/*           On entry, BETA specifies the scalar beta. */
+/*           Unchanged on exit. */
+
+/*  Y      - COMPLEX*16       array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the */
+/*           vector y. On exit, Y is overwritten by the updated vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+    --y;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*k < 0) {
+	info = 3;
+    } else if (*lda < *k + 1) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("ZHBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && 
+                                                         beta->i == 0.))) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array A */
+/*     are accessed sequentially with one pass through A. */
+
+/*     First form  y := beta*y. */
+
+    if (beta->r != 1. || beta->i != 0.) {
+	if (*incy == 1) {
+	    if (beta->r == 0. && beta->i == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    y[i__2].r = 0., y[i__2].i = 0.;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    i__3 = i__;
+		    z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (beta->r == 0. && beta->i == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    y[i__2].r = 0., y[i__2].i = 0.;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    i__3 = iy;
+		    z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (alpha->r == 0. && alpha->i == 0.) {
+	return 0;
+    }
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when upper triangle of A is stored. */
+
+	kplus1 = *k + 1;
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = j;
+		z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__2 = 1, i__3 = j - *k;
+		i__4 = j - 1;
+		for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+		    i__2 = i__;
+		    i__3 = i__;
+		    i__5 = l + i__ + j * a_dim1;
+		    z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+		    y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+		    i__2 = i__;
+		    z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i =
+			     z__3.r * x[i__2].i + z__3.i * x[i__2].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+/* L50: */
+		}
+		i__4 = j;
+		i__2 = j;
+		i__3 = kplus1 + j * a_dim1;
+		d__1 = a[i__3].r;
+		z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
+		z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
+		z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+		y[i__4].r = z__1.r, y[i__4].i = z__1.i;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__4 = jx;
+		z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i =
+			 alpha->r * x[i__4].i + alpha->i * x[i__4].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		ix = kx;
+		iy = ky;
+		l = kplus1 - j;
+/* Computing MAX */
+		i__4 = 1, i__2 = j - *k;
+		i__3 = j - 1;
+		for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+		    i__4 = iy;
+		    i__2 = iy;
+		    i__5 = l + i__ + j * a_dim1;
+		    z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
+		    y[i__4].r = z__1.r, y[i__4].i = z__1.i;
+		    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+		    i__4 = ix;
+		    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
+			     z__3.r * x[i__4].i + z__3.i * x[i__4].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		i__3 = jy;
+		i__4 = jy;
+		i__2 = kplus1 + j * a_dim1;
+		d__1 = a[i__2].r;
+		z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
+		z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
+		z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+		y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		jx += *incx;
+		jy += *incy;
+		if (j > *k) {
+		    kx += *incx;
+		    ky += *incy;
+		}
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when lower triangle of A is stored. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__3 = j;
+		z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i =
+			 alpha->r * x[i__3].i + alpha->i * x[i__3].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		i__3 = j;
+		i__4 = j;
+		i__2 = j * a_dim1 + 1;
+		d__1 = a[i__2].r;
+		z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
+		z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		l = 1 - j;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    i__4 = i__;
+		    i__2 = i__;
+		    i__5 = l + i__ + j * a_dim1;
+		    z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
+		    y[i__4].r = z__1.r, y[i__4].i = z__1.i;
+		    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+		    i__4 = i__;
+		    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
+			     z__3.r * x[i__4].i + z__3.i * x[i__4].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+/* L90: */
+		}
+		i__3 = j;
+		i__4 = j;
+		z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__3 = jx;
+		z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i =
+			 alpha->r * x[i__3].i + alpha->i * x[i__3].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		i__3 = jy;
+		i__4 = jy;
+		i__2 = j * a_dim1 + 1;
+		d__1 = a[i__2].r;
+		z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
+		z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		l = 1 - j;
+		ix = jx;
+		iy = jy;
+/* Computing MIN */
+		i__4 = *n, i__2 = j + *k;
+		i__3 = min(i__4,i__2);
+		for (i__ = j + 1; i__ <= i__3; ++i__) {
+		    ix += *incx;
+		    iy += *incy;
+		    i__4 = iy;
+		    i__2 = iy;
+		    i__5 = l + i__ + j * a_dim1;
+		    z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, 
+			    z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+			    .r;
+		    z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
+		    y[i__4].r = z__1.r, y[i__4].i = z__1.i;
+		    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+		    i__4 = ix;
+		    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
+			     z__3.r * x[i__4].i + z__3.i * x[i__4].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+/* L110: */
+		}
+		i__3 = jy;
+		i__4 = jy;
+		z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		jx += *incx;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZHBMV . */
+
+} /* zhbmv_ */
+

diff --git a/blas/f2c/zhpmv.c b/blas/f2c/zhpmv.c
new file mode 100644
index 0000000..fbe2f42
--- /dev/null
+++ b/blas/f2c/zhpmv.c

@@ -0,0 +1,438 @@
+/* zhpmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int zhpmv_(char *uplo, integer *n, doublecomplex *alpha, 
+	doublecomplex *ap, doublecomplex *x, integer *incx, doublecomplex *
+	beta, doublecomplex *y, integer *incy, ftnlen uplo_len)
+{
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4, i__5;
+    doublereal d__1;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
+    doublecomplex temp1, temp2;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  ZHPMV  performs the matrix-vector operation */
+
+/*     y := alpha*A*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are n element vectors and */
+/*  A is an n by n hermitian matrix, supplied in packed form. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the upper or lower */
+/*           triangular part of the matrix A is supplied in the packed */
+/*           array AP as follows: */
+
+/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
+/*                                  supplied in AP. */
+
+/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
+/*                                  supplied in AP. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - COMPLEX*16      . */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  AP     - COMPLEX*16       array of DIMENSION at least */
+/*           ( ( n*( n + 1 ) )/2 ). */
+/*           Before entry with UPLO = 'U' or 'u', the array AP must */
+/*           contain the upper triangular part of the hermitian matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
+/*           and a( 2, 2 ) respectively, and so on. */
+/*           Before entry with UPLO = 'L' or 'l', the array AP must */
+/*           contain the lower triangular part of the hermitian matrix */
+/*           packed sequentially, column by column, so that AP( 1 ) */
+/*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
+/*           and a( 3, 1 ) respectively, and so on. */
+/*           Note that the imaginary parts of the diagonal elements need */
+/*           not be set and are assumed to be zero. */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX*16       array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - COMPLEX*16      . */
+/*           On entry, BETA specifies the scalar beta. When BETA is */
+/*           supplied as zero then Y need not be set on input. */
+/*           Unchanged on exit. */
+
+/*  Y      - COMPLEX*16       array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the n */
+/*           element vector y. On exit, Y is overwritten by the updated */
+/*           vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    --y;
+    --x;
+    --ap;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 6;
+    } else if (*incy == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("ZHPMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && 
+                                                         beta->i == 0.))) {
+	return 0;
+    }
+
+/*     Set up the start points in  X  and  Y. */
+
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (*n - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (*n - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of the array AP */
+/*     are accessed sequentially with one pass through AP. */
+
+/*     First form  y := beta*y. */
+
+    if (beta->r != 1. || beta->i != 0.) {
+	if (*incy == 1) {
+	    if (beta->r == 0. && beta->i == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    y[i__2].r = 0., y[i__2].i = 0.;
+/* L10: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = i__;
+		    i__3 = i__;
+		    z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (beta->r == 0. && beta->i == 0.) {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    y[i__2].r = 0., y[i__2].i = 0.;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = *n;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    i__2 = iy;
+		    i__3 = iy;
+		    z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, 
+			    z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+			    .r;
+		    y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (alpha->r == 0. && alpha->i == 0.) {
+	return 0;
+    }
+    kk = 1;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+
+/*        Form  y  when AP contains the upper triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = j;
+		z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		k = kk;
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    i__3 = i__;
+		    i__4 = i__;
+		    i__5 = k;
+		    z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		    y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		    d_cnjg(&z__3, &ap[k]);
+		    i__3 = i__;
+		    z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+			     z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+		    ++k;
+/* L50: */
+		}
+		i__2 = j;
+		i__3 = j;
+		i__4 = kk + j - 1;
+		d__1 = ap[i__4].r;
+		z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
+		z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
+		z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+		y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		kk += j;
+/* L60: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = jx;
+		z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		ix = kx;
+		iy = ky;
+		i__2 = kk + j - 2;
+		for (k = kk; k <= i__2; ++k) {
+		    i__3 = iy;
+		    i__4 = iy;
+		    i__5 = k;
+		    z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		    y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		    d_cnjg(&z__3, &ap[k]);
+		    i__3 = ix;
+		    z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+			     z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+		    ix += *incx;
+		    iy += *incy;
+/* L70: */
+		}
+		i__2 = jy;
+		i__3 = jy;
+		i__4 = kk + j - 1;
+		d__1 = ap[i__4].r;
+		z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
+		z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
+		z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+		y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		jx += *incx;
+		jy += *incy;
+		kk += j;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y  when AP contains the lower triangle. */
+
+	if (*incx == 1 && *incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = j;
+		z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		i__2 = j;
+		i__3 = j;
+		i__4 = kk;
+		d__1 = ap[i__4].r;
+		z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
+		z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+		y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		k = kk + 1;
+		i__2 = *n;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    i__3 = i__;
+		    i__4 = i__;
+		    i__5 = k;
+		    z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		    y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		    d_cnjg(&z__3, &ap[k]);
+		    i__3 = i__;
+		    z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+			     z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+		    ++k;
+/* L90: */
+		}
+		i__2 = j;
+		i__3 = j;
+		z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+		y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		kk += *n - j + 1;
+/* L100: */
+	    }
+	} else {
+	    jx = kx;
+	    jy = ky;
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = jx;
+		z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+		temp1.r = z__1.r, temp1.i = z__1.i;
+		temp2.r = 0., temp2.i = 0.;
+		i__2 = jy;
+		i__3 = jy;
+		i__4 = kk;
+		d__1 = ap[i__4].r;
+		z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
+		z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+		y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		ix = jx;
+		iy = jy;
+		i__2 = kk + *n - j;
+		for (k = kk + 1; k <= i__2; ++k) {
+		    ix += *incx;
+		    iy += *incy;
+		    i__3 = iy;
+		    i__4 = iy;
+		    i__5 = k;
+		    z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, 
+			    z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
+			    .r;
+		    z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+		    y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+		    d_cnjg(&z__3, &ap[k]);
+		    i__3 = ix;
+		    z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+			     z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+		    temp2.r = z__1.r, temp2.i = z__1.i;
+/* L110: */
+		}
+		i__2 = jy;
+		i__3 = jy;
+		z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = 
+			alpha->r * temp2.i + alpha->i * temp2.r;
+		z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+		y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+		jx += *incx;
+		jy += *incy;
+		kk += *n - j + 1;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZHPMV . */
+
+} /* zhpmv_ */
+

diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c
new file mode 100644
index 0000000..3bf0beb
--- /dev/null
+++ b/blas/f2c/ztbmv.c

@@ -0,0 +1,647 @@
+/* ztbmv.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int ztbmv_(char *uplo, char *trans, char *diag, integer *n, 
+	integer *k, doublecomplex *a, integer *lda, doublecomplex *x, integer 
+	*incx, ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+    doublecomplex z__1, z__2, z__3;
+
+    /* Builtin functions */
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    integer i__, j, l, ix, jx, kx, info;
+    doublecomplex temp;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    integer kplus1;
+    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
+    logical noconj, nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  ZTBMV  performs one of the matrix-vector operations */
+
+/*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x, */
+
+/*  where x is an n element vector and  A is an n by n unit, or non-unit, */
+/*  upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANS  - CHARACTER*1. */
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := conjg( A' )*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX*16       array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+	    ftnlen)1)) {
+	info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, 
+	    "N", (ftnlen)1, (ftnlen)1)) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < *k + 1) {
+	info = 7;
+    } else if (*incx == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("ZTBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be  ( N - 1 )*INCX   too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+    if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
+
+/*         Form  x := A*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = j;
+		    if (x[i__2].r != 0. || x[i__2].i != 0.) {
+			i__2 = j;
+			temp.r = x[i__2].r, temp.i = x[i__2].i;
+			l = kplus1 - j;
+/* Computing MAX */
+			i__2 = 1, i__3 = j - *k;
+			i__4 = j - 1;
+			for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
+			    i__2 = i__;
+			    i__3 = i__;
+			    i__5 = l + i__ + j * a_dim1;
+			    z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, 
+				    z__2.i = temp.r * a[i__5].i + temp.i * a[
+				    i__5].r;
+			    z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + 
+				    z__2.i;
+			    x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+/* L10: */
+			}
+			if (nounit) {
+			    i__4 = j;
+			    i__2 = j;
+			    i__3 = kplus1 + j * a_dim1;
+			    z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
+				    i__3].i, z__1.i = x[i__2].r * a[i__3].i + 
+				    x[i__2].i * a[i__3].r;
+			    x[i__4].r = z__1.r, x[i__4].i = z__1.i;
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__4 = jx;
+		    if (x[i__4].r != 0. || x[i__4].i != 0.) {
+			i__4 = jx;
+			temp.r = x[i__4].r, temp.i = x[i__4].i;
+			ix = kx;
+			l = kplus1 - j;
+/* Computing MAX */
+			i__4 = 1, i__2 = j - *k;
+			i__3 = j - 1;
+			for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
+			    i__4 = ix;
+			    i__2 = ix;
+			    i__5 = l + i__ + j * a_dim1;
+			    z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, 
+				    z__2.i = temp.r * a[i__5].i + temp.i * a[
+				    i__5].r;
+			    z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + 
+				    z__2.i;
+			    x[i__4].r = z__1.r, x[i__4].i = z__1.i;
+			    ix += *incx;
+/* L30: */
+			}
+			if (nounit) {
+			    i__3 = jx;
+			    i__4 = jx;
+			    i__2 = kplus1 + j * a_dim1;
+			    z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[
+				    i__2].i, z__1.i = x[i__4].r * a[i__2].i + 
+				    x[i__4].i * a[i__2].r;
+			    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+			}
+		    }
+		    jx += *incx;
+		    if (j > *k) {
+			kx += *incx;
+		    }
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    i__1 = j;
+		    if (x[i__1].r != 0. || x[i__1].i != 0.) {
+			i__1 = j;
+			temp.r = x[i__1].r, temp.i = x[i__1].i;
+			l = 1 - j;
+/* Computing MIN */
+			i__1 = *n, i__3 = j + *k;
+			i__4 = j + 1;
+			for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
+			    i__1 = i__;
+			    i__3 = i__;
+			    i__2 = l + i__ + j * a_dim1;
+			    z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, 
+				    z__2.i = temp.r * a[i__2].i + temp.i * a[
+				    i__2].r;
+			    z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + 
+				    z__2.i;
+			    x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+/* L50: */
+			}
+			if (nounit) {
+			    i__4 = j;
+			    i__1 = j;
+			    i__3 = j * a_dim1 + 1;
+			    z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[
+				    i__3].i, z__1.i = x[i__1].r * a[i__3].i + 
+				    x[i__1].i * a[i__3].r;
+			    x[i__4].r = z__1.r, x[i__4].i = z__1.i;
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    i__4 = jx;
+		    if (x[i__4].r != 0. || x[i__4].i != 0.) {
+			i__4 = jx;
+			temp.r = x[i__4].r, temp.i = x[i__4].i;
+			ix = kx;
+			l = 1 - j;
+/* Computing MIN */
+			i__4 = *n, i__1 = j + *k;
+			i__3 = j + 1;
+			for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
+			    i__4 = ix;
+			    i__1 = ix;
+			    i__2 = l + i__ + j * a_dim1;
+			    z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, 
+				    z__2.i = temp.r * a[i__2].i + temp.i * a[
+				    i__2].r;
+			    z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + 
+				    z__2.i;
+			    x[i__4].r = z__1.r, x[i__4].i = z__1.i;
+			    ix -= *incx;
+/* L70: */
+			}
+			if (nounit) {
+			    i__3 = jx;
+			    i__4 = jx;
+			    i__1 = j * a_dim1 + 1;
+			    z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[
+				    i__1].i, z__1.i = x[i__4].r * a[i__1].i + 
+				    x[i__4].i * a[i__1].r;
+			    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+			}
+		    }
+		    jx -= *incx;
+		    if (*n - j >= *k) {
+			kx -= *incx;
+		    }
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := A'*x  or  x := conjg( A' )*x. */
+
+	if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	    kplus1 = *k + 1;
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    i__3 = j;
+		    temp.r = x[i__3].r, temp.i = x[i__3].i;
+		    l = kplus1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__3 = kplus1 + j * a_dim1;
+			    z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, 
+				    z__1.i = temp.r * a[i__3].i + temp.i * a[
+				    i__3].r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    i__4 = l + i__ + j * a_dim1;
+			    i__1 = i__;
+			    z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
+				    i__1].i, z__2.i = a[i__4].r * x[i__1].i + 
+				    a[i__4].i * x[i__1].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L90: */
+			}
+		    } else {
+			if (nounit) {
+			    d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
+			    z__1.r = temp.r * z__2.r - temp.i * z__2.i, 
+				    z__1.i = temp.r * z__2.i + temp.i * 
+				    z__2.r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+			    i__4 = i__;
+			    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, 
+				    z__2.i = z__3.r * x[i__4].i + z__3.i * x[
+				    i__4].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+			}
+		    }
+		    i__3 = j;
+		    x[i__3].r = temp.r, x[i__3].i = temp.i;
+/* L110: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    i__3 = jx;
+		    temp.r = x[i__3].r, temp.i = x[i__3].i;
+		    kx -= *incx;
+		    ix = kx;
+		    l = kplus1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__3 = kplus1 + j * a_dim1;
+			    z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, 
+				    z__1.i = temp.r * a[i__3].i + temp.i * a[
+				    i__3].r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    i__4 = l + i__ + j * a_dim1;
+			    i__1 = ix;
+			    z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
+				    i__1].i, z__2.i = a[i__4].r * x[i__1].i + 
+				    a[i__4].i * x[i__1].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix -= *incx;
+/* L120: */
+			}
+		    } else {
+			if (nounit) {
+			    d_cnjg(&z__2, &a[kplus1 + j * a_dim1]);
+			    z__1.r = temp.r * z__2.r - temp.i * z__2.i, 
+				    z__1.i = temp.r * z__2.i + temp.i * 
+				    z__2.r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MAX */
+			i__4 = 1, i__1 = j - *k;
+			i__3 = max(i__4,i__1);
+			for (i__ = j - 1; i__ >= i__3; --i__) {
+			    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+			    i__4 = ix;
+			    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, 
+				    z__2.i = z__3.r * x[i__4].i + z__3.i * x[
+				    i__4].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix -= *incx;
+/* L130: */
+			}
+		    }
+		    i__3 = jx;
+		    x[i__3].r = temp.r, x[i__3].i = temp.i;
+		    jx -= *incx;
+/* L140: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    i__4 = j;
+		    temp.r = x[i__4].r, temp.i = x[i__4].i;
+		    l = 1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__4 = j * a_dim1 + 1;
+			    z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, 
+				    z__1.i = temp.r * a[i__4].i + temp.i * a[
+				    i__4].r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    i__1 = l + i__ + j * a_dim1;
+			    i__2 = i__;
+			    z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+				    i__2].i, z__2.i = a[i__1].r * x[i__2].i + 
+				    a[i__1].i * x[i__2].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L150: */
+			}
+		    } else {
+			if (nounit) {
+			    d_cnjg(&z__2, &a[j * a_dim1 + 1]);
+			    z__1.r = temp.r * z__2.r - temp.i * z__2.i, 
+				    z__1.i = temp.r * z__2.i + temp.i * 
+				    z__2.r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+			    i__1 = i__;
+			    z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, 
+				    z__2.i = z__3.r * x[i__1].i + z__3.i * x[
+				    i__1].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+/* L160: */
+			}
+		    }
+		    i__4 = j;
+		    x[i__4].r = temp.r, x[i__4].i = temp.i;
+/* L170: */
+		}
+	    } else {
+		jx = kx;
+		i__3 = *n;
+		for (j = 1; j <= i__3; ++j) {
+		    i__4 = jx;
+		    temp.r = x[i__4].r, temp.i = x[i__4].i;
+		    kx += *incx;
+		    ix = kx;
+		    l = 1 - j;
+		    if (noconj) {
+			if (nounit) {
+			    i__4 = j * a_dim1 + 1;
+			    z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, 
+				    z__1.i = temp.r * a[i__4].i + temp.i * a[
+				    i__4].r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    i__1 = l + i__ + j * a_dim1;
+			    i__2 = ix;
+			    z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+				    i__2].i, z__2.i = a[i__1].r * x[i__2].i + 
+				    a[i__1].i * x[i__2].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix += *incx;
+/* L180: */
+			}
+		    } else {
+			if (nounit) {
+			    d_cnjg(&z__2, &a[j * a_dim1 + 1]);
+			    z__1.r = temp.r * z__2.r - temp.i * z__2.i, 
+				    z__1.i = temp.r * z__2.i + temp.i * 
+				    z__2.r;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			}
+/* Computing MIN */
+			i__1 = *n, i__2 = j + *k;
+			i__4 = min(i__1,i__2);
+			for (i__ = j + 1; i__ <= i__4; ++i__) {
+			    d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
+			    i__1 = ix;
+			    z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, 
+				    z__2.i = z__3.r * x[i__1].i + z__3.i * x[
+				    i__1].r;
+			    z__1.r = temp.r + z__2.r, z__1.i = temp.i + 
+				    z__2.i;
+			    temp.r = z__1.r, temp.i = z__1.i;
+			    ix += *incx;
+/* L190: */
+			}
+		    }
+		    i__4 = jx;
+		    x[i__4].r = temp.r, x[i__4].i = temp.i;
+		    jx += *incx;
+/* L200: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of ZTBMV . */
+
+} /* ztbmv_ */
+

diff --git a/blas/fortran/complexdots.f b/blas/fortran/complexdots.f
new file mode 100644
index 0000000..a7da51d
--- /dev/null
+++ b/blas/fortran/complexdots.f

@@ -0,0 +1,43 @@
+      COMPLEX FUNCTION CDOTC(N,CX,INCX,CY,INCY)
+      INTEGER INCX,INCY,N
+      COMPLEX CX(*),CY(*)
+      COMPLEX RES
+      EXTERNAL CDOTCW
+      
+      CALL CDOTCW(N,CX,INCX,CY,INCY,RES)
+      CDOTC = RES
+      RETURN
+      END
+      
+      COMPLEX FUNCTION CDOTU(N,CX,INCX,CY,INCY)
+      INTEGER INCX,INCY,N
+      COMPLEX CX(*),CY(*)
+      COMPLEX RES
+      EXTERNAL CDOTUW
+      
+      CALL CDOTUW(N,CX,INCX,CY,INCY,RES)
+      CDOTU = RES
+      RETURN
+      END
+      
+      DOUBLE COMPLEX FUNCTION ZDOTC(N,CX,INCX,CY,INCY)
+      INTEGER INCX,INCY,N
+      DOUBLE COMPLEX CX(*),CY(*)
+      DOUBLE COMPLEX RES
+      EXTERNAL ZDOTCW
+      
+      CALL ZDOTCW(N,CX,INCX,CY,INCY,RES)
+      ZDOTC = RES
+      RETURN
+      END
+      
+      DOUBLE COMPLEX FUNCTION ZDOTU(N,CX,INCX,CY,INCY)
+      INTEGER INCX,INCY,N
+      DOUBLE COMPLEX CX(*),CY(*)
+      DOUBLE COMPLEX RES
+      EXTERNAL ZDOTUW
+      
+      CALL ZDOTUW(N,CX,INCX,CY,INCY,RES)
+      ZDOTU = RES
+      RETURN
+      END

diff --git a/blas/level1_cplx_impl.h b/blas/level1_cplx_impl.h
new file mode 100644
index 0000000..6c7edd7
--- /dev/null
+++ b/blas/level1_cplx_impl.h

@@ -0,0 +1,155 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+
+struct scalar_norm1_op {
+  typedef RealScalar result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_norm1_op)
+  inline RealScalar operator() (const Scalar& a) const { return numext::norm1(a); }
+};
+namespace Eigen {
+  namespace internal {
+    template<> struct functor_traits<scalar_norm1_op >
+    {
+      enum { Cost = 3 * NumTraits<Scalar>::AddCost, PacketAccess = 0 };
+    };
+  }
+}
+
+// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
+// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
+RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC(asum))(int *n, RealScalar *px, int *incx)
+{
+//   std::cerr << "__asum " << *n << " " << *incx << "\n";
+  Complex* x = reinterpret_cast<Complex*>(px);
+
+  if(*n<=0) return 0;
+
+  if(*incx==1)  return make_vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
+  else          return make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+}
+
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amax))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).unaryExpr<scalar_norm1_op>().maxCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().maxCoeff(&ret);
+  return int(ret)+1;
+}
+
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amin))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).unaryExpr<scalar_norm1_op>().minCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().minCoeff(&ret);
+  return int(ret)+1;
+}
+
+// computes a dot product of a conjugated vector with another vector.
+int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
+{
+//   std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n";
+  Scalar* res = reinterpret_cast<Scalar*>(pres);
+
+  if(*n<=0)
+  {
+    *res = Scalar(0);
+    return 0;
+  }
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).dot(make_vector(y,*n)));
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy)));
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy)));
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse()));
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse()));
+  return 0;
+}
+
+// computes a vector-vector dot product without complex conjugation.
+int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
+{
+  Scalar* res = reinterpret_cast<Scalar*>(pres);
+
+  if(*n<=0)
+  {
+    *res = Scalar(0);
+    return 0;
+  }
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  return 0;
+}
+
+RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC(nrm2))(int *n, RealScalar *px, int *incx)
+{
+//   std::cerr << "__nrm2 " << *n << " " << *incx << "\n";
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  if(*incx==1)
+    return make_vector(x,*n).stableNorm();
+
+  return make_vector(x,*n,*incx).stableNorm();
+}
+
+int EIGEN_BLAS_FUNC(EIGEN_CAT(REAL_SCALAR_SUFFIX, rot))(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
+{
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  RealScalar c = *pc;
+  RealScalar s = *ps;
+
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));
+
+  Reverse<StridedVectorType> rvx(vx);
+  Reverse<StridedVectorType> rvy(vy);
+
+  // TODO implement mixed real-scalar rotations
+       if(*incx<0 && *incy>0) internal::apply_rotation_in_the_plane(rvx, vy, JacobiRotation<Scalar>(c,s));
+  else if(*incx>0 && *incy<0) internal::apply_rotation_in_the_plane(vx, rvy, JacobiRotation<Scalar>(c,s));
+  else                        internal::apply_rotation_in_the_plane(vx, vy,  JacobiRotation<Scalar>(c,s));
+
+  return 0;
+}
+
+int EIGEN_BLAS_FUNC(EIGEN_CAT(REAL_SCALAR_SUFFIX, scal))(int *n, RealScalar *palpha, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  RealScalar alpha = *palpha;
+
+//   std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n";
+
+  if(*incx==1)  make_vector(x,*n) *= alpha;
+  else          make_vector(x,*n,std::abs(*incx)) *= alpha;
+
+  return 0;
+}

diff --git a/blas/level1_impl.h b/blas/level1_impl.h
new file mode 100644
index 0000000..71bd534
--- /dev/null
+++ b/blas/level1_impl.h

@@ -0,0 +1,144 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+
+int EIGEN_BLAS_FUNC(axpy)(const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *py, const int *incy)
+{
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+
+  if(*n<=0) return 0;
+
+  if(*incx==1 && *incy==1)    make_vector(y,*n) += alpha * make_vector(x,*n);
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx);
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse();
+
+  return 0;
+}
+
+int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
+{
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  // be careful, *incx==0 is allowed !!
+  if(*incx==1 && *incy==1)
+    make_vector(y,*n) = make_vector(x,*n);
+  else
+  {
+    if(*incx<0) x = x - (*n-1)*(*incx);
+    if(*incy<0) y = y - (*n-1)*(*incy);
+    for(int i=0;i<*n;++i)
+    {
+      *y = *x;
+      x += *incx;
+      y += *incy;
+    }
+  }
+
+  return 0;
+}
+
+int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps)
+{
+  using std::sqrt;
+  using std::abs;
+
+  Scalar& a = *reinterpret_cast<Scalar*>(pa);
+  Scalar& b = *reinterpret_cast<Scalar*>(pb);
+  RealScalar* c = pc;
+  Scalar* s = reinterpret_cast<Scalar*>(ps);
+
+  #if !ISCOMPLEX
+  Scalar r,z;
+  Scalar aa = abs(a);
+  Scalar ab = abs(b);
+  if((aa+ab)==Scalar(0))
+  {
+    *c = 1;
+    *s = 0;
+    r = 0;
+    z = 0;
+  }
+  else
+  {
+    r = sqrt(a*a + b*b);
+    Scalar amax = aa>ab ? a : b;
+    r = amax>0 ? r : -r;
+    *c = a/r;
+    *s = b/r;
+    z = 1;
+    if (aa > ab) z = *s;
+    if (ab > aa && *c!=RealScalar(0))
+      z = Scalar(1)/ *c;
+  }
+  *pa = r;
+  *pb = z;
+  #else
+  Scalar alpha;
+  RealScalar norm,scale;
+  if(abs(a)==RealScalar(0))
+  {
+    *c = RealScalar(0);
+    *s = Scalar(1);
+    a = b;
+  }
+  else
+  {
+    scale = abs(a) + abs(b);
+    norm = scale*sqrt((numext::abs2(a/scale)) + (numext::abs2(b/scale)));
+    alpha = a/abs(a);
+    *c = abs(a)/norm;
+    *s = alpha*numext::conj(b)/norm;
+    a = alpha*norm;
+  }
+  #endif
+
+//   JacobiRotation<Scalar> r;
+//   r.makeGivens(a,b);
+//   *c = r.c();
+//   *s = r.s();
+
+  return 0;
+}
+
+int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  if(*incx==1)  make_vector(x,*n) *= alpha;
+  else          make_vector(x,*n,std::abs(*incx)) *= alpha;
+
+  return 0;
+}
+
+int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
+{
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  if(*incx==1 && *incy==1)    make_vector(y,*n).swap(make_vector(x,*n));
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx));
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx));
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse());
+
+  return 1;
+}

diff --git a/blas/level1_real_impl.h b/blas/level1_real_impl.h
new file mode 100644
index 0000000..c587711
--- /dev/null
+++ b/blas/level1_real_impl.h

@@ -0,0 +1,122 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+
+// computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
+// res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
+RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)
+{
+//   std::cerr << "_asum " << *n << " " << *incx << "\n";
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  if(*n<=0) return 0;
+
+  if(*incx==1)  return make_vector(x,*n).cwiseAbs().sum();
+  else          return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
+}
+
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amax))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  return int(ret)+1;
+}
+
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amin))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).cwiseAbs().minCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
+  return int(ret)+1;
+}
+
+// computes a vector-vector dot product.
+Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
+{
+//   std::cerr << "_dot " << *n << " " << *incx << " " << *incy << "\n";
+
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else return 0;
+}
+
+// computes the Euclidean norm of a vector.
+// FIXME
+Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx)
+{
+//   std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  if(*incx==1)  return make_vector(x,*n).stableNorm();
+  else          return make_vector(x,*n,std::abs(*incx)).stableNorm();
+}
+
+int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
+{
+//   std::cerr << "_rot " << *n << " " << *incx << " " << *incy << "\n";
+  if(*n<=0) return 0;
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar c = *reinterpret_cast<Scalar*>(pc);
+  Scalar s = *reinterpret_cast<Scalar*>(ps);
+
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));
+
+  Reverse<StridedVectorType> rvx(vx);
+  Reverse<StridedVectorType> rvy(vy);
+
+       if(*incx<0 && *incy>0) internal::apply_rotation_in_the_plane(rvx, vy, JacobiRotation<Scalar>(c,s));
+  else if(*incx>0 && *incy<0) internal::apply_rotation_in_the_plane(vx, rvy, JacobiRotation<Scalar>(c,s));
+  else                        internal::apply_rotation_in_the_plane(vx, vy,  JacobiRotation<Scalar>(c,s));
+
+
+  return 0;
+}
+
+/*
+// performs rotation of points in the modified plane.
+int EIGEN_BLAS_FUNC(rotm)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *param)
+{
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  // TODO
+
+  return 0;
+}
+
+// computes the modified parameters for a Givens rotation.
+int EIGEN_BLAS_FUNC(rotmg)(RealScalar *d1, RealScalar *d2, RealScalar *x1, RealScalar *x2, RealScalar *param)
+{
+  // TODO
+
+  return 0;
+}
+*/

diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h
new file mode 100644
index 0000000..e3ce614
--- /dev/null
+++ b/blas/level2_cplx_impl.h

@@ -0,0 +1,360 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+
+/**  ZHEMV  performs the matrix-vector  operation
+  *
+  *     y := alpha*A*x + beta*y,
+  *
+  *  where alpha and beta are scalars, x and y are n element vectors and
+  *  A is an n by n hermitian matrix.
+  */
+int EIGEN_BLAS_FUNC(hemv)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+                          const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
+{
+  typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
+    // array index: LO
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
+
+  // check arguments
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)        info = 1;
+  else if(*n<0)                   info = 2;
+  else if(*lda<std::max(1,*n))    info = 5;
+  else if(*incx==0)               info = 7;
+  else if(*incy==0)               info = 10;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HEMV ",&info,6);
+
+  if(*n==0)
+    return 1;
+
+  const Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  Scalar* actual_y = get_compact_vector(y,*n,*incy);
+
+  if(beta!=Scalar(1))
+  {
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else                make_vector(actual_y, *n) *= beta;
+  }
+
+  if(alpha!=Scalar(0))
+  {
+    int code = UPLO(*uplo);
+    if(code>=2 || func[code]==0)
+      return 0;
+
+    func[code](*n, a, *lda, actual_x, actual_y, alpha);
+  }
+
+  if(actual_x!=x) delete[] actual_x;
+  if(actual_y!=y) delete[] copy_back(actual_y,y,*n,*incy);
+
+  return 1;
+}
+
+/**  ZHBMV  performs the matrix-vector  operation
+  *
+  *     y := alpha*A*x + beta*y,
+  *
+  *  where alpha and beta are scalars, x and y are n element vectors and
+  *  A is an n by n hermitian band matrix, with k super-diagonals.
+  */
+// int EIGEN_BLAS_FUNC(hbmv)(char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
+//                           RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
+// {
+//   return 1;
+// }
+
+/**  ZHPMV  performs the matrix-vector operation
+  *
+  *     y := alpha*A*x + beta*y,
+  *
+  *  where alpha and beta are scalars, x and y are n element vectors and
+  *  A is an n by n hermitian matrix, supplied in packed form.
+  */
+// int EIGEN_BLAS_FUNC(hpmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
+// {
+//   return 1;
+// }
+
+/**  ZHPR    performs the hermitian rank 1 operation
+  *
+  *     A := alpha*x*conjg( x' ) + A,
+  *
+  *  where alpha is a real scalar, x is an n element vector and A is an
+  *  n by n hermitian matrix, supplied in packed form.
+  */
+int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pap)
+{
+  typedef void (*functype)(int, Scalar*, const Scalar*, RealScalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+    // array index: LO
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+  };
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* ap = reinterpret_cast<Scalar*>(pap);
+  RealScalar alpha = *palpha;
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HPR  ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x, *n, *incx);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, ap, x_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+
+  return 1;
+}
+
+/**  ZHPR2  performs the hermitian rank 2 operation
+  *
+  *     A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+  *
+  *  where alpha is a scalar, x and y are n element vectors and A is an
+  *  n by n hermitian matrix, supplied in packed form.
+  */
+int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
+{
+  typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
+  };
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* ap = reinterpret_cast<Scalar*>(pap);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HPR2 ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x, *n, *incx);
+  Scalar* y_cpy = get_compact_vector(y, *n, *incy);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, ap, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}
+
+/**  ZHER   performs the hermitian rank 1 operation
+  *
+  *     A := alpha*x*conjg( x' ) + A,
+  *
+  *  where alpha is a real scalar, x is an n element vector and A is an
+  *  n by n hermitian matrix.
+  */
+int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda)
+{
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
+  static const functype func[2] = {
+    // array index: UP
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+    // array index: LO
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+  };
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  RealScalar alpha = *reinterpret_cast<RealScalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*lda<std::max(1,*n))                                        info = 7;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HER  ",&info,6);
+
+  if(alpha==RealScalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x, *n, *incx);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, a, *lda, x_cpy, x_cpy, alpha);
+
+  matrix(a,*n,*n,*lda).diagonal().imag().setZero();
+
+  if(x_cpy!=x)  delete[] x_cpy;
+
+  return 1;
+}
+
+/**  ZHER2  performs the hermitian rank 2 operation
+  *
+  *     A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+  *
+  *  where alpha is a scalar, x and y are n element vectors and A is an n
+  *  by n hermitian matrix.
+  */
+int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
+{
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::rank2_update_selector<Scalar,int,Lower>::run),
+  };
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  else if(*lda<std::max(1,*n))                                        info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HER2 ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x, *n, *incx);
+  Scalar* y_cpy = get_compact_vector(y, *n, *incy);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, a, *lda, x_cpy, y_cpy, alpha);
+
+  matrix(a,*n,*n,*lda).diagonal().imag().setZero();
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}
+
+/**  ZGERU  performs the rank 1 operation
+  *
+  *     A := alpha*x*y' + A,
+  *
+  *  where alpha is a scalar, x is an m element vector, y is an n element
+  *  vector and A is an m by n matrix.
+  */
+int EIGEN_BLAS_FUNC(geru)(int *m, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
+{
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+       if(*m<0)                                                       info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  else if(*lda<std::max(1,*m))                                        info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GERU ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x,*m,*incx);
+  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+
+  internal::general_rank1_update<Scalar,int,ColMajor,false,false>::run(*m, *n, a, *lda, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}
+
+/**  ZGERC  performs the rank 1 operation
+  *
+  *     A := alpha*x*conjg( y' ) + A,
+  *
+  *  where alpha is a scalar, x is an m element vector, y is an n element
+  *  vector and A is an m by n matrix.
+  */
+int EIGEN_BLAS_FUNC(gerc)(int *m, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
+{
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+       if(*m<0)                                                       info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  else if(*lda<std::max(1,*m))                                        info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GERC ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x,*m,*incx);
+  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+
+  internal::general_rank1_update<Scalar,int,ColMajor,false,Conj>::run(*m, *n, a, *lda, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}

diff --git a/blas/level2_impl.h b/blas/level2_impl.h
new file mode 100644
index 0000000..173f40b
--- /dev/null
+++ b/blas/level2_impl.h

@@ -0,0 +1,553 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+
+template<typename Index, typename Scalar, int StorageOrder, bool ConjugateLhs, bool ConjugateRhs>
+struct general_matrix_vector_product_wrapper
+{
+  static void run(Index rows, Index cols,const Scalar *lhs, Index lhsStride, const Scalar *rhs, Index rhsIncr, Scalar* res, Index resIncr, Scalar alpha)
+  {
+    typedef internal::const_blas_data_mapper<Scalar,Index,StorageOrder> LhsMapper;
+    typedef internal::const_blas_data_mapper<Scalar,Index,RowMajor> RhsMapper;
+    
+    internal::general_matrix_vector_product
+        <Index,Scalar,LhsMapper,StorageOrder,ConjugateLhs,Scalar,RhsMapper,ConjugateRhs>::run(
+        rows, cols, LhsMapper(lhs, lhsStride), RhsMapper(rhs, rhsIncr), res, resIncr, alpha);
+  }
+};
+
+int EIGEN_BLAS_FUNC(gemv)(const char *opa, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *incb, const RealScalar *pbeta, RealScalar *pc, const int *incc)
+{
+  typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
+  static const functype func[4] = {
+    // array index: NOTR
+    (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run),
+    // array index: TR  
+    (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run),
+    // array index: ADJ 
+    (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
+
+  // check arguments
+  int info = 0;
+  if(OP(*opa)==INVALID)           info = 1;
+  else if(*m<0)                   info = 2;
+  else if(*n<0)                   info = 3;
+  else if(*lda<std::max(1,*m))    info = 6;
+  else if(*incb==0)               info = 8;
+  else if(*incc==0)               info = 11;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GEMV ",&info,6);
+
+  if(*m==0 || *n==0 || (alpha==Scalar(0) && beta==Scalar(1)))
+    return 0;
+
+  int actual_m = *m;
+  int actual_n = *n;
+  int code = OP(*opa);
+  if(code!=NOTR)
+    std::swap(actual_m,actual_n);
+
+  const Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
+  Scalar* actual_c = get_compact_vector(c,actual_m,*incc);
+
+  if(beta!=Scalar(1))
+  {
+    if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero();
+    else                make_vector(actual_c, actual_m) *= beta;
+  }
+
+  if(code>=4 || func[code]==0)
+    return 0;
+
+  func[code](actual_m, actual_n, a, *lda, actual_b, 1, actual_c, 1, alpha);
+
+  if(actual_b!=b) delete[] actual_b;
+  if(actual_c!=c) delete[] copy_back(actual_c,c,actual_m,*incc);
+
+  return 1;
+}
+
+int EIGEN_BLAS_FUNC(trsv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
+{
+  typedef void (*functype)(int, const Scalar *, int, Scalar *);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  Scalar* b = reinterpret_cast<Scalar*>(pb);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(OP(*opa)==INVALID)                                          info = 2;
+  else if(DIAG(*diag)==INVALID)                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*lda<std::max(1,*n))                                        info = 6;
+  else if(*incb==0)                                                   info = 8;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TRSV ",&info,6);
+
+  Scalar* actual_b = get_compact_vector(b,*n,*incb);
+
+  int code = OP(*opa) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3);
+  func[code](*n, a, *lda, actual_b);
+
+  if(actual_b!=b) delete[] copy_back(actual_b,b,*n,*incb);
+
+  return 0;
+}
+
+
+
+int EIGEN_BLAS_FUNC(trmv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
+{
+  typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  Scalar* b = reinterpret_cast<Scalar*>(pb);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(OP(*opa)==INVALID)                                          info = 2;
+  else if(DIAG(*diag)==INVALID)                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*lda<std::max(1,*n))                                        info = 6;
+  else if(*incb==0)                                                   info = 8;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TRMV ",&info,6);
+
+  if(*n==0)
+    return 1;
+
+  Scalar* actual_b = get_compact_vector(b,*n,*incb);
+  Matrix<Scalar,Dynamic,1> res(*n);
+  res.setZero();
+
+  int code = OP(*opa) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3);
+  if(code>=16 || func[code]==0)
+    return 0;
+
+  func[code](*n, *n, a, *lda, actual_b, 1, res.data(), 1, Scalar(1));
+
+  copy_back(res.data(),b,*n,*incb);
+  if(actual_b!=b) delete[] actual_b;
+
+  return 1;
+}
+
+/**  GBMV  performs one of the matrix-vector operations
+  *
+  *     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+  *
+  *  where alpha and beta are scalars, x and y are vectors and A is an
+  *  m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+  */
+int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealScalar *palpha, RealScalar *pa, int *lda,
+                          RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+{
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
+  int coeff_rows = *kl+*ku+1;
+
+  int info = 0;
+       if(OP(*trans)==INVALID)                                        info = 1;
+  else if(*m<0)                                                       info = 2;
+  else if(*n<0)                                                       info = 3;
+  else if(*kl<0)                                                      info = 4;
+  else if(*ku<0)                                                      info = 5;
+  else if(*lda<coeff_rows)                                            info = 8;
+  else if(*incx==0)                                                   info = 10;
+  else if(*incy==0)                                                   info = 13;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GBMV ",&info,6);
+
+  if(*m==0 || *n==0 || (alpha==Scalar(0) && beta==Scalar(1)))
+    return 0;
+
+  int actual_m = *m;
+  int actual_n = *n;
+  if(OP(*trans)!=NOTR)
+    std::swap(actual_m,actual_n);
+
+  const Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+  Scalar* actual_y = get_compact_vector(y,actual_m,*incy);
+
+  if(beta!=Scalar(1))
+  {
+    if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero();
+    else                make_vector(actual_y, actual_m) *= beta;
+  }
+
+  ConstMatrixType mat_coeffs(a,coeff_rows,*n,*lda);
+
+  int nb = std::min(*n,(*m)+(*ku));
+  for(int j=0; j<nb; ++j)
+  {
+    int start = std::max(0,j - *ku);
+    int end = std::min((*m)-1,j + *kl);
+    int len = end - start + 1;
+    int offset = (*ku) - j + start;
+    if(OP(*trans)==NOTR)
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+    else if(OP(*trans)==TR)
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
+    else
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * make_vector(actual_x+start,len) ).value();
+  }
+
+  if(actual_x!=x) delete[] actual_x;
+  if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
+
+  return 0;
+}
+
+#if 0
+/**  TBMV  performs one of the matrix-vector operations
+  *
+  *     x := A*x,   or   x := A'*x,
+  *
+  *  where x is an n element vector and  A is an n by n unit, or non-unit,
+  *  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+  */
+int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
+{
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  int coeff_rows = *k + 1;
+
+  int info = 0;
+       if(UPLO(*uplo)==INVALID)                                       info = 1;
+  else if(OP(*opa)==INVALID)                                          info = 2;
+  else if(DIAG(*diag)==INVALID)                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*k<0)                                                       info = 5;
+  else if(*lda<coeff_rows)                                            info = 7;
+  else if(*incx==0)                                                   info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6);
+
+  if(*n==0)
+    return 0;
+
+  int actual_n = *n;
+
+  Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+
+  MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
+
+  int ku = UPLO(*uplo)==UPPER ? *k : 0;
+  int kl = UPLO(*uplo)==LOWER ? *k : 0;
+
+  for(int j=0; j<*n; ++j)
+  {
+    int start = std::max(0,j - ku);
+    int end = std::min((*m)-1,j + kl);
+    int len = end - start + 1;
+    int offset = (ku) - j + start;
+
+    if(OP(*trans)==NOTR)
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+    else if(OP(*trans)==TR)
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
+    else
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint()   * make_vector(actual_x+start,len) ).value();
+  }
+
+  if(actual_x!=x) delete[] actual_x;
+  if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy);
+
+  return 0;
+}
+#endif
+
+/**  DTBSV  solves one of the systems of equations
+  *
+  *     A*x = b,   or   A'*x = b,
+  *
+  *  where b and x are n element vectors and A is an n by n unit, or
+  *  non-unit, upper or lower triangular band matrix, with ( k + 1 )
+  *  diagonals.
+  *
+  *  No test for singularity or near-singularity is included in this
+  *  routine. Such tests must be performed before calling this routine.
+  */
+int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
+{
+  typedef void (*functype)(int, int, const Scalar *, int, Scalar *);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Lower|0,       Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::band_solve_triangular_selector<int,Upper|0,       Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
+    0,
+  };
+
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  int coeff_rows = *k+1;
+
+  int info = 0;
+       if(UPLO(*uplo)==INVALID)                                       info = 1;
+  else if(OP(*op)==INVALID)                                           info = 2;
+  else if(DIAG(*diag)==INVALID)                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*k<0)                                                       info = 5;
+  else if(*lda<coeff_rows)                                            info = 7;
+  else if(*incx==0)                                                   info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TBSV ",&info,6);
+
+  if(*n==0 || (*k==0 && DIAG(*diag)==UNIT))
+    return 0;
+
+  int actual_n = *n;
+
+  Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+
+  int code = OP(*op) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3);
+  if(code>=16 || func[code]==0)
+    return 0;
+
+  func[code](*n, *k, a, *lda, actual_x);
+
+  if(actual_x!=x) delete[] copy_back(actual_x,x,actual_n,*incx);
+
+  return 0;
+}
+
+/**  DTPMV  performs one of the matrix-vector operations
+  *
+  *     x := A*x,   or   x := A'*x,
+  *
+  *  where x is an n element vector and  A is an n by n unit, or non-unit,
+  *  upper or lower triangular matrix, supplied in packed form.
+  */
+int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
+{
+  typedef void (*functype)(int, const Scalar*, const Scalar*, Scalar*, Scalar);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|0,       Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|0,       Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+    0
+  };
+
+  Scalar* ap = reinterpret_cast<Scalar*>(pap);
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(OP(*opa)==INVALID)                                          info = 2;
+  else if(DIAG(*diag)==INVALID)                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*incx==0)                                                   info = 7;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TPMV ",&info,6);
+
+  if(*n==0)
+    return 1;
+
+  Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  Matrix<Scalar,Dynamic,1> res(*n);
+  res.setZero();
+
+  int code = OP(*opa) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3);
+  if(code>=16 || func[code]==0)
+    return 0;
+
+  func[code](*n, ap, actual_x, res.data(), Scalar(1));
+
+  copy_back(res.data(),x,*n,*incx);
+  if(actual_x!=x) delete[] actual_x;
+
+  return 1;
+}
+
+/**  DTPSV  solves one of the systems of equations
+  *
+  *     A*x = b,   or   A'*x = b,
+  *
+  *  where b and x are n element vectors and A is an n by n unit, or
+  *  non-unit, upper or lower triangular matrix, supplied in packed form.
+  *
+  *  No test for singularity or near-singularity is included in this
+  *  routine. Such tests must be performed before calling this routine.
+  */
+int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
+{
+  typedef void (*functype)(int, const Scalar*, Scalar*);
+  static const functype func[16] = {
+    // array index: NOTR  | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0,       false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (NUNIT << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0,       Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (UP << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
+    0,
+    // array index: NOTR  | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
+    // array index: TR    | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
+    // array index: ADJ   | (LO << 2) | (UNIT  << 3)
+    (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
+    0
+  };
+
+  Scalar* ap = reinterpret_cast<Scalar*>(pap);
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(OP(*opa)==INVALID)                                          info = 2;
+  else if(DIAG(*diag)==INVALID)                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*incx==0)                                                   info = 7;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TPSV ",&info,6);
+
+  Scalar* actual_x = get_compact_vector(x,*n,*incx);
+
+  int code = OP(*opa) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3);
+  func[code](*n, ap, actual_x);
+
+  if(actual_x!=x) delete[] copy_back(actual_x,x,*n,*incx);
+
+  return 1;
+}

diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h
new file mode 100644
index 0000000..7620f0a
--- /dev/null
+++ b/blas/level2_real_impl.h

@@ -0,0 +1,306 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+
+// y = alpha*A*x + beta*y
+int EIGEN_BLAS_FUNC(symv) (const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+                           const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
+{
+  typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
+    // array index: LO
+    (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
+
+  // check arguments
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)        info = 1;
+  else if(*n<0)                   info = 2;
+  else if(*lda<std::max(1,*n))    info = 5;
+  else if(*incx==0)               info = 7;
+  else if(*incy==0)               info = 10;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SYMV ",&info,6);
+
+  if(*n==0)
+    return 0;
+
+  const Scalar* actual_x = get_compact_vector(x,*n,*incx);
+  Scalar* actual_y = get_compact_vector(y,*n,*incy);
+
+  if(beta!=Scalar(1))
+  {
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else                make_vector(actual_y, *n) *= beta;
+  }
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, a, *lda, actual_x, actual_y, alpha);
+
+  if(actual_x!=x) delete[] actual_x;
+  if(actual_y!=y) delete[] copy_back(actual_y,y,*n,*incy);
+
+  return 1;
+}
+
+// C := alpha*x*x' + C
+int EIGEN_BLAS_FUNC(syr)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *pc, const int *ldc)
+{
+
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
+  static const functype func[2] = {
+    // array index: UP
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+    // array index: LO
+    (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+  };
+
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*ldc<std::max(1,*n))                                        info = 7;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SYR  ",&info,6);
+
+  if(*n==0 || alpha==Scalar(0)) return 1;
+
+  // if the increment is not 1, let's copy it to a temporary vector to enable vectorization
+  const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, c, *ldc, x_cpy, x_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+
+  return 1;
+}
+
+// C := alpha*x*y' + alpha*y*x' + C
+int EIGEN_BLAS_FUNC(syr2)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, const RealScalar *py, const int *incy, RealScalar *pc, const int *ldc)
+{
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::rank2_update_selector<Scalar,int,Lower>::run),
+  };
+
+  const Scalar* x = reinterpret_cast<const Scalar*>(px);
+  const Scalar* y = reinterpret_cast<const Scalar*>(py);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  else if(*ldc<std::max(1,*n))                                        info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SYR2 ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+  const Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, c, *ldc, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+//   int code = UPLO(*uplo);
+//   if(code>=2 || func[code]==0)
+//     return 0;
+
+//   func[code](*n, a, *inca, b, *incb, c, *ldc, alpha);
+  return 1;
+}
+
+/**  DSBMV  performs the matrix-vector  operation
+  *
+  *     y := alpha*A*x + beta*y,
+  *
+  *  where alpha and beta are scalars, x and y are n element vectors and
+  *  A is an n by n symmetric band matrix, with k super-diagonals.
+  */
+// int EIGEN_BLAS_FUNC(sbmv)( char *uplo, int *n, int *k, RealScalar *alpha, RealScalar *a, int *lda,
+//                            RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
+// {
+//   return 1;
+// }
+
+
+/**  DSPMV  performs the matrix-vector operation
+  *
+  *     y := alpha*A*x + beta*y,
+  *
+  *  where alpha and beta are scalars, x and y are n element vectors and
+  *  A is an n by n symmetric matrix, supplied in packed form.
+  *
+  */
+// int EIGEN_BLAS_FUNC(spmv)(char *uplo, int *n, RealScalar *alpha, RealScalar *ap, RealScalar *x, int *incx, RealScalar *beta, RealScalar *y, int *incy)
+// {
+//   return 1;
+// }
+
+/**  DSPR    performs the symmetric rank 1 operation
+  *
+  *     A := alpha*x*x' + A,
+  *
+  *  where alpha is a real scalar, x is an n element vector and A is an
+  *  n by n symmetric matrix, supplied in packed form.
+  */
+int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *pap)
+{
+  typedef void (*functype)(int, Scalar*, const Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run),
+    // array index: LO
+    (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run),
+  };
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* ap = reinterpret_cast<Scalar*>(pap);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SPR  ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x, *n, *incx);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, ap, x_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+
+  return 1;
+}
+
+/**  DSPR2  performs the symmetric rank 2 operation
+  *
+  *     A := alpha*x*y' + alpha*y*x' + A,
+  *
+  *  where alpha is a scalar, x and y are n element vectors and A is an
+  *  n by n symmetric matrix, supplied in packed form.
+  */
+int EIGEN_BLAS_FUNC(spr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
+{
+  typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
+  static const functype func[2] = {
+    // array index: UP
+    (internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
+    // array index: LO
+    (internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
+  };
+
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* ap = reinterpret_cast<Scalar*>(pap);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SPR2 ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x, *n, *incx);
+  Scalar* y_cpy = get_compact_vector(y, *n, *incy);
+
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, ap, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}
+
+/**  DGER   performs the rank 1 operation
+  *
+  *     A := alpha*x*y' + A,
+  *
+  *  where alpha is a scalar, x is an m element vector, y is an n element
+  *  vector and A is an m by n matrix.
+  */
+int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *pa, int *lda)
+{
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+       if(*m<0)                                                       info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  else if(*lda<std::max(1,*m))                                        info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GER  ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x,*m,*incx);
+  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+
+  internal::general_rank1_update<Scalar,int,ColMajor,false,false>::run(*m, *n, a, *lda, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}

diff --git a/blas/level3_impl.h b/blas/level3_impl.h
new file mode 100644
index 0000000..6dd6338
--- /dev/null
+++ b/blas/level3_impl.h

@@ -0,0 +1,702 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <iostream>
+#include "common.h"
+
+int EIGEN_BLAS_FUNC(gemm)(const char *opa, const char *opb, const int *m, const int *n, const int *k, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+//   std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n";
+  typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, Scalar, internal::level3_blocking<Scalar,Scalar>&, Eigen::internal::GemmParallelInfo<DenseIndex>*);
+  static const functype func[12] = {
+    // array index: NOTR  | (NOTR << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (NOTR << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (NOTR << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (TR   << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor,1>::run),
+    // array index: TR    | (TR   << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (TR   << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (ADJ  << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,1>::run),
+    // array index: TR    | (ADJ  << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor,1>::run),
+    // array index: ADJ   | (ADJ  << 2)
+    (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor,1>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha  = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta   = *reinterpret_cast<const Scalar*>(pbeta);
+
+  int info = 0;
+  if(OP(*opa)==INVALID)                                               info = 1;
+  else if(OP(*opb)==INVALID)                                          info = 2;
+  else if(*m<0)                                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*k<0)                                                       info = 5;
+  else if(*lda<std::max(1,(OP(*opa)==NOTR)?*m:*k))                    info = 8;
+  else if(*ldb<std::max(1,(OP(*opb)==NOTR)?*k:*n))                    info = 10;
+  else if(*ldc<std::max(1,*m))                                        info = 13;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GEMM ",&info,6);
+
+  if (*m == 0 || *n == 0)
+    return 0;
+
+  if(beta!=Scalar(1))
+  {
+    if(beta==Scalar(0)) matrix(c, *m, *n, *ldc).setZero();
+    else                matrix(c, *m, *n, *ldc) *= beta;
+  }
+
+  if(*k == 0)
+    return 0;
+
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true);
+
+  int code = OP(*opa) | (OP(*opb) << 2);
+  func[code](*m, *n, *k, a, *lda, b, *ldb, c, 1, *ldc, alpha, blocking, 0);
+  return 0;
+}
+
+int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n,
+                          const RealScalar *palpha,  const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb)
+{
+//   std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n";
+  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, internal::level3_blocking<Scalar,Scalar>&);
+  static const functype func[32] = {
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          Conj, RowMajor,ColMajor,1>::run),\
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          Conj, RowMajor,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0,          false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0,          Conj, RowMajor,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0,          false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0,          Conj, RowMajor,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor,1>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  Scalar  alpha = *reinterpret_cast<const Scalar*>(palpha);
+
+  int info = 0;
+  if(SIDE(*side)==INVALID)                                            info = 1;
+  else if(UPLO(*uplo)==INVALID)                                       info = 2;
+  else if(OP(*opa)==INVALID)                                          info = 3;
+  else if(DIAG(*diag)==INVALID)                                       info = 4;
+  else if(*m<0)                                                       info = 5;
+  else if(*n<0)                                                       info = 6;
+  else if(*lda<std::max(1,(SIDE(*side)==LEFT)?*m:*n))                 info = 9;
+  else if(*ldb<std::max(1,*m))                                        info = 11;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TRSM ",&info,6);
+
+  if(*m==0 || *n==0)
+    return 0;
+
+  int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
+
+  if(SIDE(*side)==LEFT)
+  {
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
+    func[code](*m, *n, a, *lda, b, 1, *ldb, blocking);
+  }
+  else
+  {
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
+    func[code](*n, *m, a, *lda, b, 1, *ldb, blocking);
+  }
+
+  if(alpha!=Scalar(1))
+    matrix(b,*m,*n,*ldb) *= alpha;
+
+  return 0;
+}
+
+
+// b = alpha*op(a)*b  for side = 'L'or'l'
+// b = alpha*b*op(a)  for side = 'R'or'r'
+int EIGEN_BLAS_FUNC(trmm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n,
+                          const RealScalar *palpha, const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb)
+{
+//   std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n";
+  typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+  static const functype func[32] = {
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, RowMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, RowMajor,Conj, ColMajor,false,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,RowMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,RowMajor,Conj, ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          true, ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, RowMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          true, RowMajor,Conj, ColMajor,false,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0,          false,ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,RowMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0,          false,ColMajor,false,RowMajor,Conj, ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor,1>::run),
+    0,
+    // array index: NOTR  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor,1>::run),
+    // array index: TR    | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor,1>::run),
+    // array index: ADJ   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)
+    (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor,1>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  Scalar  alpha = *reinterpret_cast<const Scalar*>(palpha);
+
+  int info = 0;
+  if(SIDE(*side)==INVALID)                                            info = 1;
+  else if(UPLO(*uplo)==INVALID)                                       info = 2;
+  else if(OP(*opa)==INVALID)                                          info = 3;
+  else if(DIAG(*diag)==INVALID)                                       info = 4;
+  else if(*m<0)                                                       info = 5;
+  else if(*n<0)                                                       info = 6;
+  else if(*lda<std::max(1,(SIDE(*side)==LEFT)?*m:*n))                 info = 9;
+  else if(*ldb<std::max(1,*m))                                        info = 11;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"TRMM ",&info,6);
+
+  int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
+
+  if(*m==0 || *n==0)
+    return 1;
+
+  // FIXME find a way to avoid this copy
+  Matrix<Scalar,Dynamic,Dynamic,ColMajor> tmp = matrix(b,*m,*n,*ldb);
+  matrix(b,*m,*n,*ldb).setZero();
+
+  if(SIDE(*side)==LEFT)
+  {
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
+    func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, 1, *ldb, alpha, blocking);
+  }
+  else
+  {
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
+    func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, 1, *ldb, alpha, blocking);
+  }
+  return 1;
+}
+
+// c = alpha*a*b + beta*c  for side = 'L'or'l'
+// c = alpha*b*a + beta*c  for side = 'R'or'r
+int EIGEN_BLAS_FUNC(symm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+//   std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << "\n";
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
+
+  int info = 0;
+  if(SIDE(*side)==INVALID)                                            info = 1;
+  else if(UPLO(*uplo)==INVALID)                                       info = 2;
+  else if(*m<0)                                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*lda<std::max(1,(SIDE(*side)==LEFT)?*m:*n))                 info = 7;
+  else if(*ldb<std::max(1,*m))                                        info = 9;
+  else if(*ldc<std::max(1,*m))                                        info = 12;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SYMM ",&info,6);
+
+  if(beta!=Scalar(1))
+  {
+    if(beta==Scalar(0)) matrix(c, *m, *n, *ldc).setZero();
+    else                matrix(c, *m, *n, *ldc) *= beta;
+  }
+
+  if(*m==0 || *n==0)
+  {
+    return 1;
+  }
+
+  int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
+  #if ISCOMPLEX
+  // FIXME add support for symmetric complex matrix
+  Matrix<Scalar,Dynamic,Dynamic,ColMajor> matA(size,size);
+  if(UPLO(*uplo)==UP)
+  {
+    matA.triangularView<Upper>() = matrix(a,size,size,*lda);
+    matA.triangularView<Lower>() = matrix(a,size,size,*lda).transpose();
+  }
+  else if(UPLO(*uplo)==LO)
+  {
+    matA.triangularView<Lower>() = matrix(a,size,size,*lda);
+    matA.triangularView<Upper>() = matrix(a,size,size,*lda).transpose();
+  }
+  if(SIDE(*side)==LEFT)
+    matrix(c, *m, *n, *ldc) += alpha * matA * matrix(b, *m, *n, *ldb);
+  else if(SIDE(*side)==RIGHT)
+    matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA;
+  #else
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
+
+  if(SIDE(*side)==LEFT)
+    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor,1>::run(*m, *n, a, *lda, b, *ldb, c, 1, *ldc, alpha, blocking);
+    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor,1>::run(*m, *n, a, *lda, b, *ldb, c, 1, *ldc, alpha, blocking);
+    else                      return 0;
+  else if(SIDE(*side)==RIGHT)
+    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor,1>::run(*m, *n, b, *ldb, a, *lda, c, 1, *ldc, alpha, blocking);
+    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor,1>::run(*m, *n, b, *ldb, a, *lda, c, 1, *ldc, alpha, blocking);
+    else                      return 0;
+  else
+    return 0;
+  #endif
+
+  return 0;
+}
+
+// c = alpha*a*a' + beta*c  for op = 'N'or'n'
+// c = alpha*a'*a + beta*c  for op = 'T'or't','C'or'c'
+int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const int *k,
+                          const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+//   std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
+  #if !ISCOMPLEX
+  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+  static const functype func[8] = {
+    // array index: NOTR  | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, 1, Upper>::run),
+    // array index: TR    | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, 1, Upper>::run),
+    // array index: ADJ   | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,1, Upper>::run),
+    0,
+    // array index: NOTR  | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, 1, Lower>::run),
+    // array index: TR    | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, 1, Lower>::run),
+    // array index: ADJ   | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,1, Lower>::run),
+    0
+  };
+  #endif
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) )           info = 2;
+  else if(*n<0)                                                       info = 3;
+  else if(*k<0)                                                       info = 4;
+  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
+  else if(*ldc<std::max(1,*n))                                        info = 10;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SYRK ",&info,6);
+
+  if(beta!=Scalar(1))
+  {
+    if(UPLO(*uplo)==UP)
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Upper>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<Upper>() *= beta;
+    else
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<Lower>() *= beta;
+  }
+
+  if(*n==0 || *k==0)
+    return 0;
+
+  #if ISCOMPLEX
+  // FIXME add support for symmetric complex matrix
+  if(UPLO(*uplo)==UP)
+  {
+    if(OP(*op)==NOTR)
+      matrix(c, *n, *n, *ldc).triangularView<Upper>() += alpha * matrix(a,*n,*k,*lda) * matrix(a,*n,*k,*lda).transpose();
+    else
+      matrix(c, *n, *n, *ldc).triangularView<Upper>() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda);
+  }
+  else
+  {
+    if(OP(*op)==NOTR)
+      matrix(c, *n, *n, *ldc).triangularView<Lower>() += alpha * matrix(a,*n,*k,*lda) * matrix(a,*n,*k,*lda).transpose();
+    else
+      matrix(c, *n, *n, *ldc).triangularView<Lower>() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda);
+  }
+  #else
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
+
+  int code = OP(*op) | (UPLO(*uplo) << 2);
+  func[code](*n, *k, a, *lda, a, *lda, c, 1, *ldc, alpha, blocking);
+  #endif
+
+  return 0;
+}
+
+// c = alpha*a*b' + alpha*b*a' + beta*c  for op = 'N'or'n'
+// c = alpha*a'*b + alpha*b'*a + beta*c  for op = 'T'or't'
+int EIGEN_BLAS_FUNC(syr2k)(const char *uplo, const char *op, const int *n, const int *k, const RealScalar *palpha,
+                           const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
+
+//   std::cerr << "in syr2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n";
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) )           info = 2;
+  else if(*n<0)                                                       info = 3;
+  else if(*k<0)                                                       info = 4;
+  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
+  else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 9;
+  else if(*ldc<std::max(1,*n))                                        info = 12;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"SYR2K",&info,6);
+
+  if(beta!=Scalar(1))
+  {
+    if(UPLO(*uplo)==UP)
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Upper>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<Upper>() *= beta;
+    else
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<Lower>() *= beta;
+  }
+
+  if(*k==0)
+    return 1;
+
+  if(OP(*op)==NOTR)
+  {
+    if(UPLO(*uplo)==UP)
+    {
+      matrix(c, *n, *n, *ldc).triangularView<Upper>()
+        += alpha *matrix(a, *n, *k, *lda)*matrix(b, *n, *k, *ldb).transpose()
+        +  alpha*matrix(b, *n, *k, *ldb)*matrix(a, *n, *k, *lda).transpose();
+    }
+    else if(UPLO(*uplo)==LO)
+      matrix(c, *n, *n, *ldc).triangularView<Lower>()
+        += alpha*matrix(a, *n, *k, *lda)*matrix(b, *n, *k, *ldb).transpose()
+        +  alpha*matrix(b, *n, *k, *ldb)*matrix(a, *n, *k, *lda).transpose();
+  }
+  else if(OP(*op)==TR || OP(*op)==ADJ)
+  {
+    if(UPLO(*uplo)==UP)
+      matrix(c, *n, *n, *ldc).triangularView<Upper>()
+        += alpha*matrix(a, *k, *n, *lda).transpose()*matrix(b, *k, *n, *ldb)
+        +  alpha*matrix(b, *k, *n, *ldb).transpose()*matrix(a, *k, *n, *lda);
+    else if(UPLO(*uplo)==LO)
+      matrix(c, *n, *n, *ldc).triangularView<Lower>()
+        += alpha*matrix(a, *k, *n, *lda).transpose()*matrix(b, *k, *n, *ldb)
+        +  alpha*matrix(b, *k, *n, *ldb).transpose()*matrix(a, *k, *n, *lda);
+  }
+
+  return 0;
+}
+
+
+#if ISCOMPLEX
+
+// c = alpha*a*b + beta*c  for side = 'L'or'l'
+// c = alpha*b*a + beta*c  for side = 'R'or'r
+int EIGEN_BLAS_FUNC(hemm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha,
+                          const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  Scalar beta  = *reinterpret_cast<const Scalar*>(pbeta);
+
+//   std::cerr << "in hemm " << *side << " " << *uplo << " " << *m << " " << *n << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n";
+
+  int info = 0;
+  if(SIDE(*side)==INVALID)                                            info = 1;
+  else if(UPLO(*uplo)==INVALID)                                       info = 2;
+  else if(*m<0)                                                       info = 3;
+  else if(*n<0)                                                       info = 4;
+  else if(*lda<std::max(1,(SIDE(*side)==LEFT)?*m:*n))                 info = 7;
+  else if(*ldb<std::max(1,*m))                                        info = 9;
+  else if(*ldc<std::max(1,*m))                                        info = 12;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HEMM ",&info,6);
+
+  if(beta==Scalar(0))       matrix(c, *m, *n, *ldc).setZero();
+  else if(beta!=Scalar(1))  matrix(c, *m, *n, *ldc) *= beta;
+
+  if(*m==0 || *n==0)
+  {
+    return 1;
+  }
+
+  int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
+
+  if(SIDE(*side)==LEFT)
+  {
+    if(UPLO(*uplo)==UP)       internal::product_selfadjoint_matrix<Scalar,DenseIndex,RowMajor,true,Conj,  ColMajor,false,false, ColMajor, 1>
+                                ::run(*m, *n, a, *lda, b, *ldb, c, 1, *ldc, alpha, blocking);
+    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,true,false, ColMajor,false,false, ColMajor,1>
+                                ::run(*m, *n, a, *lda, b, *ldb, c, 1, *ldc, alpha, blocking);
+    else                      return 0;
+  }
+  else if(SIDE(*side)==RIGHT)
+  {
+    if(UPLO(*uplo)==UP)       matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView<Upper>();/*internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, RowMajor,true,Conj,  ColMajor, 1>
+                                ::run(*m, *n, b, *ldb, a, *lda, c, 1, *ldc, alpha, blocking);*/
+    else if(UPLO(*uplo)==LO)  internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, ColMajor,true,false, ColMajor,1>
+                                ::run(*m, *n, b, *ldb, a, *lda, c, 1, *ldc, alpha, blocking);
+    else                      return 0;
+  }
+  else
+  {
+    return 0;
+  }
+
+  return 0;
+}
+
+// c = alpha*a*conj(a') + beta*c  for op = 'N'or'n'
+// c = alpha*conj(a')*a + beta*c  for op  = 'C'or'c'
+int EIGEN_BLAS_FUNC(herk)(const char *uplo, const char *op, const int *n, const int *k,
+                          const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+//   std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
+
+  typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+  static const functype func[8] = {
+    // array index: NOTR  | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,1,Upper>::run),
+    0,
+    // array index: ADJ   | (UP << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,1,Upper>::run),
+    0,
+    // array index: NOTR  | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,1,Lower>::run),
+    0,
+    // array index: ADJ   | (LO << 2)
+    (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,1,Lower>::run),
+    0
+  };
+
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  RealScalar alpha = *palpha;
+  RealScalar beta  = *pbeta;
+
+//   std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n";
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if((OP(*op)==INVALID) || (OP(*op)==TR))                        info = 2;
+  else if(*n<0)                                                       info = 3;
+  else if(*k<0)                                                       info = 4;
+  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
+  else if(*ldc<std::max(1,*n))                                        info = 10;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HERK ",&info,6);
+
+  int code = OP(*op) | (UPLO(*uplo) << 2);
+
+  if(beta!=RealScalar(1))
+  {
+    if(UPLO(*uplo)==UP)
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Upper>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<StrictlyUpper>() *= beta;
+    else
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta;
+
+    if(beta!=Scalar(0))
+    {
+      matrix(c, *n, *n, *ldc).diagonal().real() *= beta;
+      matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
+    }
+  }
+
+  if(*k>0 && alpha!=RealScalar(0))
+  {
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
+    func[code](*n, *k, a, *lda, a, *lda, c, 1, *ldc, alpha, blocking);
+    matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
+  }
+  return 0;
+}
+
+// c = alpha*a*conj(b') + conj(alpha)*b*conj(a') + beta*c,  for op = 'N'or'n'
+// c = alpha*conj(a')*b + conj(alpha)*conj(b')*a + beta*c,  for op = 'C'or'c'
+int EIGEN_BLAS_FUNC(her2k)(const char *uplo, const char *op, const int *n, const int *k,
+                           const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
+{
+  const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+  const Scalar* b = reinterpret_cast<const Scalar*>(pb);
+  Scalar* c = reinterpret_cast<Scalar*>(pc);
+  Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+  RealScalar beta  = *pbeta;
+
+//   std::cerr << "in her2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n";
+
+  int info = 0;
+  if(UPLO(*uplo)==INVALID)                                            info = 1;
+  else if((OP(*op)==INVALID) || (OP(*op)==TR))                        info = 2;
+  else if(*n<0)                                                       info = 3;
+  else if(*k<0)                                                       info = 4;
+  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 7;
+  else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k))                     info = 9;
+  else if(*ldc<std::max(1,*n))                                        info = 12;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6);
+
+  if(beta!=RealScalar(1))
+  {
+    if(UPLO(*uplo)==UP)
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Upper>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<StrictlyUpper>() *= beta;
+    else
+      if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
+      else                matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta;
+
+    if(beta!=Scalar(0))
+    {
+      matrix(c, *n, *n, *ldc).diagonal().real() *= beta;
+      matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
+    }
+  }
+  else if(*k>0 && alpha!=Scalar(0))
+    matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
+
+  if(*k==0)
+    return 1;
+
+  if(OP(*op)==NOTR)
+  {
+    if(UPLO(*uplo)==UP)
+    {
+      matrix(c, *n, *n, *ldc).triangularView<Upper>()
+        +=            alpha *matrix(a, *n, *k, *lda)*matrix(b, *n, *k, *ldb).adjoint()
+        +  numext::conj(alpha)*matrix(b, *n, *k, *ldb)*matrix(a, *n, *k, *lda).adjoint();
+    }
+    else if(UPLO(*uplo)==LO)
+      matrix(c, *n, *n, *ldc).triangularView<Lower>()
+        += alpha*matrix(a, *n, *k, *lda)*matrix(b, *n, *k, *ldb).adjoint()
+        +  numext::conj(alpha)*matrix(b, *n, *k, *ldb)*matrix(a, *n, *k, *lda).adjoint();
+  }
+  else if(OP(*op)==ADJ)
+  {
+    if(UPLO(*uplo)==UP)
+      matrix(c, *n, *n, *ldc).triangularView<Upper>()
+        +=             alpha*matrix(a, *k, *n, *lda).adjoint()*matrix(b, *k, *n, *ldb)
+        +  numext::conj(alpha)*matrix(b, *k, *n, *ldb).adjoint()*matrix(a, *k, *n, *lda);
+    else if(UPLO(*uplo)==LO)
+      matrix(c, *n, *n, *ldc).triangularView<Lower>()
+        +=             alpha*matrix(a, *k, *n, *lda).adjoint()*matrix(b, *k, *n, *ldb)
+        +  numext::conj(alpha)*matrix(b, *k, *n, *ldb).adjoint()*matrix(a, *k, *n, *lda);
+  }
+
+  return 1;
+}
+
+#endif // ISCOMPLEX

diff --git a/blas/single.cpp b/blas/single.cpp
new file mode 100644
index 0000000..e66879a
--- /dev/null
+++ b/blas/single.cpp

@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        float
+#define SCALAR_SUFFIX s
+#define SCALAR_SUFFIX_UP "S"
+#define ISCOMPLEX     0
+
+#include "level1_impl.h"
+#include "level1_real_impl.h"
+#include "level2_impl.h"
+#include "level2_real_impl.h"
+#include "level3_impl.h"
+
+float EIGEN_BLAS_FUNC(dsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
+{ return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); }

diff --git a/blas/testing/CMakeLists.txt b/blas/testing/CMakeLists.txt
new file mode 100644
index 0000000..52c23ac
--- /dev/null
+++ b/blas/testing/CMakeLists.txt

@@ -0,0 +1,40 @@
+
+macro(ei_add_blas_test testname)
+
+  set(targetname ${testname})
+
+  set(filename ${testname}.f)
+  add_executable(${targetname} ${filename})
+
+  target_link_libraries(${targetname} eigen_blas)
+
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+
+  target_link_libraries(${targetname} ${EXTERNAL_LIBS})
+
+  add_test(${testname} "${Eigen_SOURCE_DIR}/blas/testing/runblastest.sh" "${testname}" "${Eigen_SOURCE_DIR}/blas/testing/${testname}.dat")
+  add_dependencies(buildtests ${targetname})
+  
+endmacro()
+
+ei_add_blas_test(sblat1)
+ei_add_blas_test(sblat2)
+ei_add_blas_test(sblat3)
+
+ei_add_blas_test(dblat1)
+ei_add_blas_test(dblat2)
+ei_add_blas_test(dblat3)
+
+ei_add_blas_test(cblat1)
+ei_add_blas_test(cblat2)
+ei_add_blas_test(cblat3)
+
+ei_add_blas_test(zblat1)
+ei_add_blas_test(zblat2)
+ei_add_blas_test(zblat3)
+
+# add_custom_target(level1)
+# add_dependencies(level1 sblat1)
+

diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f
new file mode 100644
index 0000000..73015f5
--- /dev/null
+++ b/blas/testing/cblat1.f

@@ -0,0 +1,724 @@
+*> \brief \b CBLAT1
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM CBLAT1
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>    Test program for the COMPLEX Level 1 BLAS.
+*>    Based upon the original BLAS test routine together with:
+*>
+*>    F06GAF Example Program Text
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex_blas_testing
+*
+*  =====================================================================
+      PROGRAM CBLAT1
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, MODE, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      REAL             SFAC
+      INTEGER          IC
+*     .. External Subroutines ..
+      EXTERNAL         CHECK1, CHECK2, HEADER
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA             SFAC/9.765625E-4/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999)
+      DO 20 IC = 1, 10
+         ICASE = IC
+         CALL HEADER
+*
+*        Initialize PASS, INCX, INCY, and MODE for a new case.
+*        The value 9999 for INCX, INCY or MODE will appear in the
+*        detailed  output, if any, for cases that do not involve
+*        these parameters.
+*
+         PASS = .TRUE.
+         INCX = 9999
+         INCY = 9999
+         MODE = 9999
+         IF (ICASE.LE.5) THEN
+            CALL CHECK2(SFAC)
+         ELSE IF (ICASE.GE.6) THEN
+            CALL CHECK1(SFAC)
+         END IF
+*        -- Print
+         IF (PASS) WRITE (NOUT,99998)
+   20 CONTINUE
+      STOP
+*
+99999 FORMAT (' Complex BLAS Test Program Results',/1X)
+99998 FORMAT ('                                    ----- PASS -----')
+      END
+      SUBROUTINE HEADER
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, MODE, N
+      LOGICAL          PASS
+*     .. Local Arrays ..
+      CHARACTER*6      L(10)
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA             L(1)/'CDOTC '/
+      DATA             L(2)/'CDOTU '/
+      DATA             L(3)/'CAXPY '/
+      DATA             L(4)/'CCOPY '/
+      DATA             L(5)/'CSWAP '/
+      DATA             L(6)/'SCNRM2'/
+      DATA             L(7)/'SCASUM'/
+      DATA             L(8)/'CSCAL '/
+      DATA             L(9)/'CSSCAL'/
+      DATA             L(10)/'ICAMAX'/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999) ICASE, L(ICASE)
+      RETURN
+*
+99999 FORMAT (/' Test of subprogram number',I3,12X,A6)
+      END
+      SUBROUTINE CHECK1(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      REAL              SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, MODE, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      COMPLEX           CA
+      REAL              SA
+      INTEGER           I, J, LEN, NP1
+*     .. Local Arrays ..
+      COMPLEX           CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8),
+     +                  MWPCS(5), MWPCT(5)
+      REAL              STRUE2(5), STRUE4(5)
+      INTEGER           ITRUE3(5)
+*     .. External Functions ..
+      REAL              SCASUM, SCNRM2
+      INTEGER           ICAMAX
+      EXTERNAL          SCASUM, SCNRM2, ICAMAX
+*     .. External Subroutines ..
+      EXTERNAL          CSCAL, CSSCAL, CTEST, ITEST1, STEST1
+*     .. Intrinsic Functions ..
+      INTRINSIC         MAX
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA              SA, CA/0.3E0, (0.4E0,-0.7E0)/
+      DATA              ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0),
+     +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
+     +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
+     +                  (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0),
+     +                  (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0),
+     +                  (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0),
+     +                  (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0),
+     +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0),
+     +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0),
+     +                  (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0),
+     +                  (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
+     +                  (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0),
+     +                  (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0),
+     +                  (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/
+      DATA              ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0),
+     +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
+     +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
+     +                  (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0),
+     +                  (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0),
+     +                  (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0),
+     +                  (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0),
+     +                  (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0),
+     +                  (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0),
+     +                  (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0),
+     +                  (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0),
+     +                  (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0),
+     +                  (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0),
+     +                  (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/
+      DATA              STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/
+      DATA              STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/
+      DATA              ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0),
+     +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
+     +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
+     +                  (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0),
+     +                  (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0),
+     +                  (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0),
+     +                  (-0.17E0,-0.19E0), (0.13E0,-0.39E0),
+     +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0),
+     +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0),
+     +                  (0.11E0,-0.03E0), (-0.17E0,0.46E0),
+     +                  (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
+     +                  (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
+     +                  (0.19E0,-0.17E0), (0.20E0,-0.35E0),
+     +                  (0.35E0,0.20E0), (0.14E0,0.08E0),
+     +                  (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0),
+     +                  (2.0E0,3.0E0)/
+      DATA              ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0),
+     +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
+     +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
+     +                  (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0),
+     +                  (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0),
+     +                  (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0),
+     +                  (-0.17E0,-0.19E0), (8.0E0,9.0E0),
+     +                  (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0),
+     +                  (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0),
+     +                  (0.11E0,-0.03E0), (3.0E0,6.0E0),
+     +                  (-0.17E0,0.46E0), (4.0E0,7.0E0),
+     +                  (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0),
+     +                  (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0),
+     +                  (0.20E0,-0.35E0), (6.0E0,9.0E0),
+     +                  (0.35E0,0.20E0), (8.0E0,3.0E0),
+     +                  (0.14E0,0.08E0), (9.0E0,4.0E0)/
+      DATA              ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0),
+     +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
+     +                  (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0),
+     +                  (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0),
+     +                  (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0),
+     +                  (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0),
+     +                  (0.03E0,-0.09E0), (0.15E0,-0.03E0),
+     +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0),
+     +                  (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0),
+     +                  (0.03E0,0.03E0), (-0.18E0,0.03E0),
+     +                  (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
+     +                  (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0),
+     +                  (0.09E0,0.03E0), (0.15E0,0.00E0),
+     +                  (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0),
+     +                  (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/
+      DATA              ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0),
+     +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
+     +                  (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0),
+     +                  (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0),
+     +                  (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0),
+     +                  (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0),
+     +                  (0.03E0,-0.09E0), (8.0E0,9.0E0),
+     +                  (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0),
+     +                  (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0),
+     +                  (0.03E0,0.03E0), (3.0E0,6.0E0),
+     +                  (-0.18E0,0.03E0), (4.0E0,7.0E0),
+     +                  (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0),
+     +                  (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0),
+     +                  (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0),
+     +                  (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/
+      DATA              ITRUE3/0, 1, 2, 2, 2/
+*     .. Executable Statements ..
+      DO 60 INCX = 1, 2
+         DO 40 NP1 = 1, 5
+            N = NP1 - 1
+            LEN = 2*MAX(N,1)
+*           .. Set vector arguments ..
+            DO 20 I = 1, LEN
+               CX(I) = CV(I,NP1,INCX)
+   20       CONTINUE
+            IF (ICASE.EQ.6) THEN
+*              .. SCNRM2 ..
+               CALL STEST1(SCNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1),
+     +                     SFAC)
+            ELSE IF (ICASE.EQ.7) THEN
+*              .. SCASUM ..
+               CALL STEST1(SCASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1),
+     +                     SFAC)
+            ELSE IF (ICASE.EQ.8) THEN
+*              .. CSCAL ..
+               CALL CSCAL(N,CA,CX,INCX)
+               CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
+     +                    SFAC)
+            ELSE IF (ICASE.EQ.9) THEN
+*              .. CSSCAL ..
+               CALL CSSCAL(N,SA,CX,INCX)
+               CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX),
+     +                    SFAC)
+            ELSE IF (ICASE.EQ.10) THEN
+*              .. ICAMAX ..
+               CALL ITEST1(ICAMAX(N,CX,INCX),ITRUE3(NP1))
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
+               STOP
+            END IF
+*
+   40    CONTINUE
+   60 CONTINUE
+*
+      INCX = 1
+      IF (ICASE.EQ.8) THEN
+*        CSCAL
+*        Add a test for alpha equal to zero.
+         CA = (0.0E0,0.0E0)
+         DO 80 I = 1, 5
+            MWPCT(I) = (0.0E0,0.0E0)
+            MWPCS(I) = (1.0E0,1.0E0)
+   80    CONTINUE
+         CALL CSCAL(5,CA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+      ELSE IF (ICASE.EQ.9) THEN
+*        CSSCAL
+*        Add a test for alpha equal to zero.
+         SA = 0.0E0
+         DO 100 I = 1, 5
+            MWPCT(I) = (0.0E0,0.0E0)
+            MWPCS(I) = (1.0E0,1.0E0)
+  100    CONTINUE
+         CALL CSSCAL(5,SA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+*        Add a test for alpha equal to one.
+         SA = 1.0E0
+         DO 120 I = 1, 5
+            MWPCT(I) = CX(I)
+            MWPCS(I) = CX(I)
+  120    CONTINUE
+         CALL CSSCAL(5,SA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+*        Add a test for alpha equal to minus one.
+         SA = -1.0E0
+         DO 140 I = 1, 5
+            MWPCT(I) = -CX(I)
+            MWPCS(I) = -CX(I)
+  140    CONTINUE
+         CALL CSSCAL(5,SA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+      END IF
+      RETURN
+      END
+      SUBROUTINE CHECK2(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      REAL              SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, MODE, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      COMPLEX           CA
+      INTEGER           I, J, KI, KN, KSIZE, LENX, LENY, MX, MY
+*     .. Local Arrays ..
+      COMPLEX           CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14),
+     +                  CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4),
+     +                  CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7)
+      INTEGER           INCXS(4), INCYS(4), LENS(4,2), NS(4)
+*     .. External Functions ..
+      COMPLEX           CDOTC, CDOTU
+      EXTERNAL          CDOTC, CDOTU
+*     .. External Subroutines ..
+      EXTERNAL          CAXPY, CCOPY, CSWAP, CTEST
+*     .. Intrinsic Functions ..
+      INTRINSIC         ABS, MIN
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA              CA/(0.4E0,-0.7E0)/
+      DATA              INCXS/1, 2, -2, -1/
+      DATA              INCYS/1, -2, 1, -2/
+      DATA              LENS/1, 1, 2, 4, 1, 1, 3, 7/
+      DATA              NS/0, 1, 2, 4/
+      DATA              CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0),
+     +                  (-0.1E0,-0.9E0), (0.2E0,-0.8E0),
+     +                  (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/
+      DATA              CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0),
+     +                  (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0),
+     +                  (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/
+      DATA              ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.32E0,-1.41E0),
+     +                  (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.32E0,-1.41E0), (-1.55E0,0.5E0),
+     +                  (0.03E0,-0.89E0), (-0.38E0,-0.96E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/
+      DATA              ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (-0.07E0,-0.89E0),
+     +                  (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.78E0,0.06E0), (-0.9E0,0.5E0),
+     +                  (0.06E0,-0.13E0), (0.1E0,-0.5E0),
+     +                  (-0.77E0,-0.49E0), (-0.5E0,-0.3E0),
+     +                  (0.52E0,-1.51E0)/
+      DATA              ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (-0.07E0,-0.89E0),
+     +                  (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.78E0,0.06E0), (-1.54E0,0.97E0),
+     +                  (0.03E0,-0.89E0), (-0.18E0,-1.31E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/
+      DATA              ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0),
+     +                  (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0),
+     +                  (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0),
+     +                  (-0.77E0,-0.49E0), (-0.5E0,-0.3E0),
+     +                  (0.32E0,-1.16E0)/
+      DATA              CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0),
+     +                  (0.65E0,-0.47E0), (-0.34E0,-1.22E0),
+     +                  (0.0E0,0.0E0), (-0.06E0,-0.90E0),
+     +                  (-0.59E0,-1.46E0), (-1.04E0,-0.04E0),
+     +                  (0.0E0,0.0E0), (-0.06E0,-0.90E0),
+     +                  (-0.83E0,0.59E0), (0.07E0,-0.37E0),
+     +                  (0.0E0,0.0E0), (-0.06E0,-0.90E0),
+     +                  (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/
+      DATA              CT6/(0.0E0,0.0E0), (0.90E0,0.06E0),
+     +                  (0.91E0,-0.77E0), (1.80E0,-0.10E0),
+     +                  (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0),
+     +                  (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0),
+     +                  (-0.55E0,0.23E0), (0.83E0,-0.39E0),
+     +                  (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0),
+     +                  (1.95E0,1.22E0)/
+      DATA              ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0),
+     +                  (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/
+      DATA              ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0),
+     +                  (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0),
+     +                  (-0.4E0,-0.7E0), (-0.1E0,-0.2E0),
+     +                  (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0),
+     +                  (0.6E0,-0.6E0)/
+      DATA              ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0),
+     +                  (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0),
+     +                  (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0),
+     +                  (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/
+      DATA              ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0),
+     +                  (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/
+      DATA              ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0),
+     +                  (-0.4E0,-0.7E0), (-0.1E0,-0.9E0),
+     +                  (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0)/
+      DATA              ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0),
+     +                  (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0),
+     +                  (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0),
+     +                  (-0.1E0,-0.9E0), (-0.5E0,-0.3E0),
+     +                  (0.7E0,-0.8E0)/
+      DATA              ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0),
+     +                  (-0.9E0,-0.4E0), (-0.1E0,-0.9E0),
+     +                  (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0)/
+      DATA              ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0),
+     +                  (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0),
+     +                  (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0),
+     +                  (-0.1E0,-0.9E0), (-0.5E0,-0.3E0),
+     +                  (0.2E0,-0.8E0)/
+      DATA              CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0),
+     +                  (1.63E0,1.73E0), (2.90E0,2.78E0)/
+      DATA              CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0),
+     +                  (1.17E0,1.17E0), (1.17E0,1.17E0),
+     +                  (1.17E0,1.17E0), (1.17E0,1.17E0),
+     +                  (1.17E0,1.17E0), (1.17E0,1.17E0)/
+      DATA              CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0),
+     +                  (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0),
+     +                  (1.54E0,1.54E0), (1.54E0,1.54E0),
+     +                  (1.54E0,1.54E0), (1.54E0,1.54E0),
+     +                  (1.54E0,1.54E0), (1.54E0,1.54E0)/
+*     .. Executable Statements ..
+      DO 60 KI = 1, 4
+         INCX = INCXS(KI)
+         INCY = INCYS(KI)
+         MX = ABS(INCX)
+         MY = ABS(INCY)
+*
+         DO 40 KN = 1, 4
+            N = NS(KN)
+            KSIZE = MIN(2,KN)
+            LENX = LENS(KN,MX)
+            LENY = LENS(KN,MY)
+*           .. initialize all argument arrays ..
+            DO 20 I = 1, 7
+               CX(I) = CX1(I)
+               CY(I) = CY1(I)
+   20       CONTINUE
+            IF (ICASE.EQ.1) THEN
+*              .. CDOTC ..
+               CDOT(1) = CDOTC(N,CX,INCX,CY,INCY)
+               CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC)
+            ELSE IF (ICASE.EQ.2) THEN
+*              .. CDOTU ..
+               CDOT(1) = CDOTU(N,CX,INCX,CY,INCY)
+               CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC)
+            ELSE IF (ICASE.EQ.3) THEN
+*              .. CAXPY ..
+               CALL CAXPY(N,CA,CX,INCX,CY,INCY)
+               CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC)
+            ELSE IF (ICASE.EQ.4) THEN
+*              .. CCOPY ..
+               CALL CCOPY(N,CX,INCX,CY,INCY)
+               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
+            ELSE IF (ICASE.EQ.5) THEN
+*              .. CSWAP ..
+               CALL CSWAP(N,CX,INCX,CY,INCY)
+               CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0)
+               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
+               STOP
+            END IF
+*
+   40    CONTINUE
+   60 CONTINUE
+      RETURN
+      END
+      SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC)
+*     ********************************* STEST **************************
+*
+*     THIS SUBR COMPARES ARRAYS  SCOMP() AND STRUE() OF LENGTH LEN TO
+*     SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE
+*     NEGLIGIBLE.
+*
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      REAL             ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0E0)
+*     .. Scalar Arguments ..
+      REAL             SFAC
+      INTEGER          LEN
+*     .. Array Arguments ..
+      REAL             SCOMP(LEN), SSIZE(LEN), STRUE(LEN)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, MODE, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      REAL             SD
+      INTEGER          I
+*     .. External Functions ..
+      REAL             SDIFF
+      EXTERNAL         SDIFF
+*     .. Intrinsic Functions ..
+      INTRINSIC        ABS
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Executable Statements ..
+*
+      DO 40 I = 1, LEN
+         SD = SCOMP(I) - STRUE(I)
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO))
+     +       GO TO 40
+*
+*                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
+*
+         IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+         PASS = .FALSE.
+         WRITE (NOUT,99999)
+         WRITE (NOUT,99998)
+   20    WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I),
+     +     STRUE(I), SD, SSIZE(I)
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY MODE  I                            ',
+     +       ' COMP(I)                             TRUE(I)  DIFFERENCE',
+     +       '     SIZE(I)',/1X)
+99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4)
+      END
+      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
+*     ************************* STEST1 *****************************
+*
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
+*     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
+*     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
+*
+*     C.L. LAWSON, JPL, 1978 DEC 6
+*
+*     .. Scalar Arguments ..
+      REAL              SCOMP1, SFAC, STRUE1
+*     .. Array Arguments ..
+      REAL              SSIZE(*)
+*     .. Local Arrays ..
+      REAL              SCOMP(1), STRUE(1)
+*     .. External Subroutines ..
+      EXTERNAL          STEST
+*     .. Executable Statements ..
+*
+      SCOMP(1) = SCOMP1
+      STRUE(1) = STRUE1
+      CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC)
+*
+      RETURN
+      END
+      REAL             FUNCTION SDIFF(SA,SB)
+*     ********************************* SDIFF **************************
+*     COMPUTES DIFFERENCE OF TWO NUMBERS.  C. L. LAWSON, JPL 1974 FEB 15
+*
+*     .. Scalar Arguments ..
+      REAL                            SA, SB
+*     .. Executable Statements ..
+      SDIFF = SA - SB
+      RETURN
+      END
+      SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC)
+*     **************************** CTEST *****************************
+*
+*     C.L. LAWSON, JPL, 1978 DEC 6
+*
+*     .. Scalar Arguments ..
+      REAL             SFAC
+      INTEGER          LEN
+*     .. Array Arguments ..
+      COMPLEX          CCOMP(LEN), CSIZE(LEN), CTRUE(LEN)
+*     .. Local Scalars ..
+      INTEGER          I
+*     .. Local Arrays ..
+      REAL             SCOMP(20), SSIZE(20), STRUE(20)
+*     .. External Subroutines ..
+      EXTERNAL         STEST
+*     .. Intrinsic Functions ..
+      INTRINSIC        AIMAG, REAL
+*     .. Executable Statements ..
+      DO 20 I = 1, LEN
+         SCOMP(2*I-1) = REAL(CCOMP(I))
+         SCOMP(2*I) = AIMAG(CCOMP(I))
+         STRUE(2*I-1) = REAL(CTRUE(I))
+         STRUE(2*I) = AIMAG(CTRUE(I))
+         SSIZE(2*I-1) = REAL(CSIZE(I))
+         SSIZE(2*I) = AIMAG(CSIZE(I))
+   20 CONTINUE
+*
+      CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC)
+      RETURN
+      END
+      SUBROUTINE ITEST1(ICOMP,ITRUE)
+*     ********************************* ITEST1 *************************
+*
+*     THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR
+*     EQUALITY.
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      INTEGER           ICOMP, ITRUE
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, MODE, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      INTEGER           ID
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Executable Statements ..
+      IF (ICOMP.EQ.ITRUE) GO TO 40
+*
+*                            HERE ICOMP IS NOT EQUAL TO ITRUE.
+*
+      IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+      PASS = .FALSE.
+      WRITE (NOUT,99999)
+      WRITE (NOUT,99998)
+   20 ID = ICOMP - ITRUE
+      WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY MODE                               ',
+     +       ' COMP                                TRUE     DIFFERENCE',
+     +       /1X)
+99997 FORMAT (1X,I4,I3,3I5,2I36,I12)
+      END

diff --git a/blas/testing/cblat2.dat b/blas/testing/cblat2.dat
new file mode 100644
index 0000000..ae98730
--- /dev/null
+++ b/blas/testing/cblat2.dat

@@ -0,0 +1,35 @@
+'cblat2.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'cblat2.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+4                 NUMBER OF VALUES OF K
+0 1 2 4           VALUES OF K
+4                 NUMBER OF VALUES OF INCX AND INCY
+1 2 -1 -2         VALUES OF INCX AND INCY
+3                 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+CGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+CTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+CTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+CTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+CGERC  T PUT F FOR NO TEST. SAME COLUMNS.
+CGERU  T PUT F FOR NO TEST. SAME COLUMNS.
+CHER   T PUT F FOR NO TEST. SAME COLUMNS.
+CHPR   T PUT F FOR NO TEST. SAME COLUMNS.
+CHER2  T PUT F FOR NO TEST. SAME COLUMNS.
+CHPR2  T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/cblat2.f b/blas/testing/cblat2.f
new file mode 100644
index 0000000..5833ea8
--- /dev/null
+++ b/blas/testing/cblat2.f

@@ -0,0 +1,3279 @@
+*> \brief \b CBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM CBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX          Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 17 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 35 lines:
+*> 'cblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'CBLA2T.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> CGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CGERC  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CGERU  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHER2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex_blas_testing
+*
+*  =====================================================================
+      PROGRAM CBLAT2
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 17 )
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+      INTEGER            NMAX, INCMAX
+      PARAMETER          ( NMAX = 65, INCMAX = 2 )
+      INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
+      PARAMETER          ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
+     $                   NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      REAL               EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANS
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ),
+     $                   X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( 2*NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      REAL               SDIFF
+      LOGICAL            LCE
+      EXTERNAL           SDIFF, LCE
+*     .. External Subroutines ..
+      EXTERNAL           CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6,
+     $                   CCHKE, CMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'CGEMV ', 'CGBMV ', 'CHEMV ', 'CHBMV ',
+     $                   'CHPMV ', 'CTRMV ', 'CTBMV ', 'CTPMV ',
+     $                   'CTRSV ', 'CTBSV ', 'CTPSV ', 'CGERC ',
+     $                   'CGERU ', 'CHER  ', 'CHPR  ', 'CHER2 ',
+     $                   'CHPR2 '/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 230
+         END IF
+   10 CONTINUE
+*     Values of K
+      READ( NIN, FMT = * )NKB
+      IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'K', NKBMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( KB( I ), I = 1, NKB )
+      DO 20 I = 1, NKB
+         IF( KB( I ).LT.0 )THEN
+            WRITE( NOUT, FMT = 9995 )
+            GO TO 230
+         END IF
+   20 CONTINUE
+*     Values of INCX and INCY
+      READ( NIN, FMT = * )NINC
+      IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( INC( I ), I = 1, NINC )
+      DO 30 I = 1, NINC
+         IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN
+            WRITE( NOUT, FMT = 9994 )INCMAX
+            GO TO 230
+         END IF
+   30 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9993 )
+      WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB )
+      WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC )
+      WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9980 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 40 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   40 CONTINUE
+   50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT
+      DO 60 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 70
+   60 CONTINUE
+      WRITE( NOUT, FMT = 9986 )SNAMET
+      STOP
+   70 LTEST( I ) = LTESTT
+      GO TO 50
+*
+   80 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(RZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of CMVCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 120 J = 1, N
+         DO 110 I = 1, N
+            A( I, J ) = MAX( I - J + 1, 0 )
+  110    CONTINUE
+         X( J ) = J
+         Y( J ) = ZERO
+  120 CONTINUE
+      DO 130 J = 1, N
+         YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+*     YY holds the exact result. On exit from CMVCH YT holds
+*     the result computed by CMVCH.
+      TRANS = 'N'
+      CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LCE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+      TRANS = 'T'
+      CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LCE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 210 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 140, 150, 150, 150, 160, 160,
+     $              160, 160, 160, 160, 170, 170, 180,
+     $              180, 190, 190 )ISNUM
+*           Test CGEMV, 01, and CGBMV, 02.
+  140       CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test CHEMV, 03, CHBMV, 04, and CHPMV, 05.
+  150       CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test CTRMV, 06, CTBMV, 07, CTPMV, 08,
+*           CTRSV, 09, CTBSV, 10, and CTPSV, 11.
+  160       CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z )
+            GO TO 200
+*           Test CGERC, 12, CGERU, 13.
+  170       CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test CHER, 14, and CHPR, 15.
+  180       CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test CHER2, 16, and CHPR2, 17.
+  190       CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+*
+  200       IF( FATAL.AND.SFATAL )
+     $         GO TO 220
+         END IF
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9982 )
+      GO TO 240
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9981 )
+      GO TO 240
+*
+  230 CONTINUE
+      WRITE( NOUT, FMT = 9987 )
+*
+  240 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' )
+ 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ',
+     $      I2 )
+ 9993 FORMAT( ' TESTS OF THE COMPLEX          LEVEL 2 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9992 FORMAT( '   FOR N              ', 9I6 )
+ 9991 FORMAT( '   FOR K              ', 7I6 )
+ 9990 FORMAT( '   FOR INCX AND INCY  ', 7I6 )
+ 9989 FORMAT( '   FOR ALPHA          ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9988 FORMAT( '   FOR BETA           ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9985 FORMAT( ' ERROR IN CMVCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1,
+     $      ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', /
+     $   ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.'
+     $      , /' ******* TESTS ABANDONED *******' )
+ 9984 FORMAT( A6, L2 )
+ 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9982 FORMAT( /' END OF TESTS' )
+ 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of CBLAT2.
+*
+      END
+      SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests CGEMV and CGBMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, HALF
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, BETA, BLS, TRANSL
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA,
+     $                   LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK,
+     $                   NL, NS
+      LOGICAL            BANDED, FULL, NULL, RESET, SAME, TRAN
+      CHARACTER*1        TRANS, TRANSS
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CGBMV, CGEMV, CMAKE, CMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 11
+      ELSE IF( BANDED )THEN
+         NARGS = 13
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+            IF( BANDED )THEN
+               NK = NKB
+            ELSE
+               NK = 1
+            END IF
+            DO 100 IKU = 1, NK
+               IF( BANDED )THEN
+                  KU = KB( IKU )
+                  KL = MAX( KU - 1, 0 )
+               ELSE
+                  KU = N - 1
+                  KL = M - 1
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               IF( BANDED )THEN
+                  LDA = KL + KU + 1
+               ELSE
+                  LDA = M
+               END IF
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 100
+               LAA = LDA*N
+               NULL = N.LE.0.OR.M.LE.0
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA,
+     $                     LDA, KL, KU, RESET, TRANSL )
+*
+               DO 90 IC = 1, 3
+                  TRANS = ICH( IC: IC )
+                  TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+*
+                  IF( TRAN )THEN
+                     ML = N
+                     NL = M
+                  ELSE
+                     ML = M
+                     NL = N
+                  END IF
+*
+                  DO 80 IX = 1, NINC
+                     INCX = INC( IX )
+                     LX = ABS( INCX )*NL
+*
+*                    Generate the vector X.
+*
+                     TRANSL = HALF
+                     CALL CMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX,
+     $                           ABS( INCX ), 0, NL - 1, RESET, TRANSL )
+                     IF( NL.GT.1 )THEN
+                        X( NL/2 ) = ZERO
+                        XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO
+                     END IF
+*
+                     DO 70 IY = 1, NINC
+                        INCY = INC( IY )
+                        LY = ABS( INCY )*ML
+*
+                        DO 60 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+                           DO 50 IB = 1, NBET
+                              BETA = BET( IB )
+*
+*                             Generate the vector Y.
+*
+                              TRANSL = ZERO
+                              CALL CMAKE( 'GE', ' ', ' ', 1, ML, Y, 1,
+     $                                    YY, ABS( INCY ), 0, ML - 1,
+     $                                    RESET, TRANSL )
+*
+                              NC = NC + 1
+*
+*                             Save every datum before calling the
+*                             subroutine.
+*
+                              TRANSS = TRANS
+                              MS = M
+                              NS = N
+                              KLS = KL
+                              KUS = KU
+                              ALS = ALPHA
+                              DO 10 I = 1, LAA
+                                 AS( I ) = AA( I )
+   10                         CONTINUE
+                              LDAS = LDA
+                              DO 20 I = 1, LX
+                                 XS( I ) = XX( I )
+   20                         CONTINUE
+                              INCXS = INCX
+                              BLS = BETA
+                              DO 30 I = 1, LY
+                                 YS( I ) = YY( I )
+   30                         CONTINUE
+                              INCYS = INCY
+*
+*                             Call the subroutine.
+*
+                              IF( FULL )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                              TRANS, M, N, ALPHA, LDA, INCX, BETA,
+     $                              INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL CGEMV( TRANS, M, N, ALPHA, AA,
+     $                                       LDA, XX, INCX, BETA, YY,
+     $                                       INCY )
+                              ELSE IF( BANDED )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                              TRANS, M, N, KL, KU, ALPHA, LDA,
+     $                              INCX, BETA, INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL CGBMV( TRANS, M, N, KL, KU, ALPHA,
+     $                                       AA, LDA, XX, INCX, BETA,
+     $                                       YY, INCY )
+                              END IF
+*
+*                             Check if error-exit was taken incorrectly.
+*
+                              IF( .NOT.OK )THEN
+                                 WRITE( NOUT, FMT = 9993 )
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+*                             See what data changed inside subroutines.
+*
+                              ISAME( 1 ) = TRANS.EQ.TRANSS
+                              ISAME( 2 ) = MS.EQ.M
+                              ISAME( 3 ) = NS.EQ.N
+                              IF( FULL )THEN
+                                 ISAME( 4 ) = ALS.EQ.ALPHA
+                                 ISAME( 5 ) = LCE( AS, AA, LAA )
+                                 ISAME( 6 ) = LDAS.EQ.LDA
+                                 ISAME( 7 ) = LCE( XS, XX, LX )
+                                 ISAME( 8 ) = INCXS.EQ.INCX
+                                 ISAME( 9 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 10 ) = LCE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 10 ) = LCERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 11 ) = INCYS.EQ.INCY
+                              ELSE IF( BANDED )THEN
+                                 ISAME( 4 ) = KLS.EQ.KL
+                                 ISAME( 5 ) = KUS.EQ.KU
+                                 ISAME( 6 ) = ALS.EQ.ALPHA
+                                 ISAME( 7 ) = LCE( AS, AA, LAA )
+                                 ISAME( 8 ) = LDAS.EQ.LDA
+                                 ISAME( 9 ) = LCE( XS, XX, LX )
+                                 ISAME( 10 ) = INCXS.EQ.INCX
+                                 ISAME( 11 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 12 ) = LCE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 12 ) = LCERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 13 ) = INCYS.EQ.INCY
+                              END IF
+*
+*                             If data was incorrectly changed, report
+*                             and return.
+*
+                              SAME = .TRUE.
+                              DO 40 I = 1, NARGS
+                                 SAME = SAME.AND.ISAME( I )
+                                 IF( .NOT.ISAME( I ) )
+     $                              WRITE( NOUT, FMT = 9998 )I
+   40                         CONTINUE
+                              IF( .NOT.SAME )THEN
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+                              IF( .NOT.NULL )THEN
+*
+*                                Check the result.
+*
+                                 CALL CMVCH( TRANS, M, N, ALPHA, A,
+     $                                       NMAX, X, INCX, BETA, Y,
+     $                                       INCY, YT, G, YY, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                                 ERRMAX = MAX( ERRMAX, ERR )
+*                                If got really bad answer, report and
+*                                return.
+                                 IF( FATAL )
+     $                              GO TO 130
+                              ELSE
+*                                Avoid repeating tests with M.le.0 or
+*                                N.le.0.
+                                 GO TO 110
+                              END IF
+*
+   50                      CONTINUE
+*
+   60                   CONTINUE
+*
+   70                CONTINUE
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 140
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU,
+     $      ALPHA, LDA, INCX, BETA, INCY
+      END IF
+*
+  140 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',',
+     $      F4.1, '), Y,', I2, ') .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',',
+     $      F4.1, '), Y,', I2, ')         .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK1.
+*
+      END
+      SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests CHEMV, CHBMV and CHPMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, HALF
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, BETA, BLS, TRANSL
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, IB, IC, IK, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY,
+     $                   N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CHBMV, CHEMV, CHPMV, CMAKE, CMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 10
+      ELSE IF( BANDED )THEN
+         NARGS = 11
+      ELSE IF( PACKED )THEN
+         NARGS = 9
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 IC = 1, 2
+               UPLO = ICH( IC: IC )
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA,
+     $                     LDA, K, K, RESET, TRANSL )
+*
+               DO 80 IX = 1, NINC
+                  INCX = INC( IX )
+                  LX = ABS( INCX )*N
+*
+*                 Generate the vector X.
+*
+                  TRANSL = HALF
+                  CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                        ABS( INCX ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     X( N/2 ) = ZERO
+                     XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 70 IY = 1, NINC
+                     INCY = INC( IY )
+                     LY = ABS( INCY )*N
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the vector Y.
+*
+                           TRANSL = ZERO
+                           CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                                 ABS( INCY ), 0, N - 1, RESET,
+     $                                 TRANSL )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           UPLOS = UPLO
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LX
+                              XS( I ) = XX( I )
+   20                      CONTINUE
+                           INCXS = INCX
+                           BLS = BETA
+                           DO 30 I = 1, LY
+                              YS( I ) = YY( I )
+   30                      CONTINUE
+                           INCYS = INCY
+*
+*                          Call the subroutine.
+*
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, N, ALPHA, LDA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CHEMV( UPLO, N, ALPHA, AA, LDA, XX,
+     $                                    INCX, BETA, YY, INCY )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, N, K, ALPHA, LDA, INCX, BETA,
+     $                           INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CHBMV( UPLO, N, K, ALPHA, AA, LDA,
+     $                                    XX, INCX, BETA, YY, INCY )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, N, ALPHA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CHPMV( UPLO, N, ALPHA, AA, XX, INCX,
+     $                                    BETA, YY, INCY )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9992 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = UPLO.EQ.UPLOS
+                           ISAME( 2 ) = NS.EQ.N
+                           IF( FULL )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LCE( AS, AA, LAA )
+                              ISAME( 5 ) = LDAS.EQ.LDA
+                              ISAME( 6 ) = LCE( XS, XX, LX )
+                              ISAME( 7 ) = INCXS.EQ.INCX
+                              ISAME( 8 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 9 ) = LCE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 9 ) = LCERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 10 ) = INCYS.EQ.INCY
+                           ELSE IF( BANDED )THEN
+                              ISAME( 3 ) = KS.EQ.K
+                              ISAME( 4 ) = ALS.EQ.ALPHA
+                              ISAME( 5 ) = LCE( AS, AA, LAA )
+                              ISAME( 6 ) = LDAS.EQ.LDA
+                              ISAME( 7 ) = LCE( XS, XX, LX )
+                              ISAME( 8 ) = INCXS.EQ.INCX
+                              ISAME( 9 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 10 ) = LCE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 10 ) = LCERES( 'GE', ' ', 1, N,
+     $                                         YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 11 ) = INCYS.EQ.INCY
+                           ELSE IF( PACKED )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LCE( AS, AA, LAA )
+                              ISAME( 5 ) = LCE( XS, XX, LX )
+                              ISAME( 6 ) = INCXS.EQ.INCX
+                              ISAME( 7 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 8 ) = LCE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 8 ) = LCERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 9 ) = INCYS.EQ.INCY
+                           END IF
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X,
+     $                                    INCX, BETA, Y, INCY, YT, G,
+     $                                    YY, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           ELSE
+*                             Avoid repeating tests with N.le.0
+                              GO TO 110
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX,
+     $      BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      BETA, INCY
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2,
+     $      ')                .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',',
+     $      F4.1, '), Y,', I2, ')         .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ',
+     $      'Y,', I2, ')             .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK2.
+*
+      END
+      SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, XT, G, Z )
+*
+*  Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ),
+     $                   ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX            TRANSL
+      REAL               ERR, ERRMAX
+      INTEGER            I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K,
+     $                   KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHD, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CMAKE, CMVCH, CTBMV, CTBSV, CTPMV, CTPSV,
+     $                   CTRMV, CTRSV
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'R'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 8
+      ELSE IF( BANDED )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 7
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*     Set up zero vector for CMVCH.
+      DO 10 I = 1, NMAX
+         Z( I ) = ZERO
+   10 CONTINUE
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 ICU = 1, 2
+               UPLO = ICHU( ICU: ICU )
+*
+               DO 80 ICT = 1, 3
+                  TRANS = ICHT( ICT: ICT )
+*
+                  DO 70 ICD = 1, 2
+                     DIAG = ICHD( ICD: ICD )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL CMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A,
+     $                           NMAX, AA, LDA, K, K, RESET, TRANSL )
+*
+                     DO 60 IX = 1, NINC
+                        INCX = INC( IX )
+                        LX = ABS( INCX )*N
+*
+*                       Generate the vector X.
+*
+                        TRANSL = HALF
+                        CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                              ABS( INCX ), 0, N - 1, RESET,
+     $                              TRANSL )
+                        IF( N.GT.1 )THEN
+                           X( N/2 ) = ZERO
+                           XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                        END IF
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        DIAGS = DIAG
+                        NS = N
+                        KS = K
+                        DO 20 I = 1, LAA
+                           AS( I ) = AA( I )
+   20                   CONTINUE
+                        LDAS = LDA
+                        DO 30 I = 1, LX
+                           XS( I ) = XX( I )
+   30                   CONTINUE
+                        INCXS = INCX
+*
+*                       Call the subroutine.
+*
+                        IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTRMV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTBMV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTPMV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTRSV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTBSV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTPSV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLO.EQ.UPLOS
+                        ISAME( 2 ) = TRANS.EQ.TRANSS
+                        ISAME( 3 ) = DIAG.EQ.DIAGS
+                        ISAME( 4 ) = NS.EQ.N
+                        IF( FULL )THEN
+                           ISAME( 5 ) = LCE( AS, AA, LAA )
+                           ISAME( 6 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 7 ) = LCE( XS, XX, LX )
+                           ELSE
+                              ISAME( 7 ) = LCERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 8 ) = INCXS.EQ.INCX
+                        ELSE IF( BANDED )THEN
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = LCE( AS, AA, LAA )
+                           ISAME( 7 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 8 ) = LCE( XS, XX, LX )
+                           ELSE
+                              ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 9 ) = INCXS.EQ.INCX
+                        ELSE IF( PACKED )THEN
+                           ISAME( 5 ) = LCE( AS, AA, LAA )
+                           IF( NULL )THEN
+                              ISAME( 6 ) = LCE( XS, XX, LX )
+                           ELSE
+                              ISAME( 6 ) = LCERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 7 ) = INCXS.EQ.INCX
+                        END IF
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+                           IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+*
+*                             Check the result.
+*
+                              CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X,
+     $                                    INCX, ZERO, Z, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+*
+*                             Compute approximation to original vector.
+*
+                              DO 50 I = 1, N
+                                 Z( I ) = XX( 1 + ( I - 1 )*
+     $                                    ABS( INCX ) )
+                                 XX( 1 + ( I - 1 )*ABS( INCX ) )
+     $                              = X( I )
+   50                         CONTINUE
+                              CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z,
+     $                                    INCX, ZERO, X, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .FALSE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 120
+                        ELSE
+*                          Avoid repeating tests with N.le.0.
+                           GO TO 110
+                        END IF
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA,
+     $      INCX
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K,
+     $      LDA, INCX
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ',
+     $      'X,', I2, ')                                      .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ),
+     $      ' A,', I3, ', X,', I2, ')                               .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,',
+     $      I3, ', X,', I2, ')                                   .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK3.
+*
+      END
+      SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests CGERC and CGERU.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ),
+     $                   ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, TRANSL
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS,
+     $                   NC, ND, NS
+      LOGICAL            CONJ, NULL, RESET, SAME
+*     .. Local Arrays ..
+      COMPLEX            W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CGERC, CGERU, CMAKE, CMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, CONJG, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+      CONJ = SNAME( 5: 5 ).EQ.'C'
+*     Define the number of arguments.
+      NARGS = 9
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+*           Set LDA to 1 more than minimum value if room.
+            LDA = M
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 110
+            LAA = LDA*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 100 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*M
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL CMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ),
+     $                     0, M - 1, RESET, TRANSL )
+               IF( M.GT.1 )THEN
+                  X( M/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 90 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 80 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX,
+     $                           AA, LDA, M - 1, N - 1, RESET, TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     MS = M
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N,
+     $                  ALPHA, INCX, INCY, LDA
+                     IF( CONJ )THEN
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL CGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA,
+     $                              LDA )
+                     ELSE
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL CGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA,
+     $                              LDA )
+                     END IF
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9993 )
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+*                    See what data changed inside subroutine.
+*
+                     ISAME( 1 ) = MS.EQ.M
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LCE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LCE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LCE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LCERES( 'GE', ' ', M, N, AS, AA,
+     $                               LDA )
+                     END IF
+                     ISAME( 9 ) = LDAS.EQ.LDA
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, M
+                              Z( I ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, M
+                              Z( I ) = X( M - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        DO 70 J = 1, N
+                           IF( INCY.GT.0 )THEN
+                              W( 1 ) = Y( J )
+                           ELSE
+                              W( 1 ) = Y( N - J + 1 )
+                           END IF
+                           IF( CONJ )
+     $                        W( 1 ) = CONJG( W( 1 ) )
+                           CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1,
+     $                                 ONE, A( 1, J ), 1, YT, G,
+     $                                 AA( 1 + ( J - 1 )*LDA ), EPS,
+     $                                 ERR, FATAL, NOUT, .TRUE. )
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 130
+   70                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with M.le.0 or N.le.0.
+                        GO TO 110
+                     END IF
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 150
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  140 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA
+*
+  150 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1,
+     $      '), X,', I2, ', Y,', I2, ', A,', I3, ')                   ',
+     $      '      .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK4.
+*
+      END
+      SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests CHER and CHPR.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ),
+     $                   ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, TRANSL
+      REAL               ERR, ERRMAX, RALPHA, RALS
+      INTEGER            I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA,
+     $                   LDA, LDAS, LJ, LX, N, NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      COMPLEX            W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CHER, CHPR, CMAKE, CMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, CMPLX, CONJG, MAX, REAL
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 7
+      ELSE IF( PACKED )THEN
+         NARGS = 6
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 100
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 90 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 80 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 70 IA = 1, NALF
+                  RALPHA = REAL( ALF( IA ) )
+                  ALPHA = CMPLX( RALPHA, RZERO )
+                  NULL = N.LE.0.OR.RALPHA.EQ.RZERO
+*
+*                 Generate the matrix A.
+*
+                  TRANSL = ZERO
+                  CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX,
+     $                        AA, LDA, N - 1, N - 1, RESET, TRANSL )
+*
+                  NC = NC + 1
+*
+*                 Save every datum before calling the subroutine.
+*
+                  UPLOS = UPLO
+                  NS = N
+                  RALS = RALPHA
+                  DO 10 I = 1, LAA
+                     AS( I ) = AA( I )
+   10             CONTINUE
+                  LDAS = LDA
+                  DO 20 I = 1, LX
+                     XS( I ) = XX( I )
+   20             CONTINUE
+                  INCXS = INCX
+*
+*                 Call the subroutine.
+*
+                  IF( FULL )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                  RALPHA, INCX, LDA
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL CHER( UPLO, N, RALPHA, XX, INCX, AA, LDA )
+                  ELSE IF( PACKED )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                  RALPHA, INCX
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL CHPR( UPLO, N, RALPHA, XX, INCX, AA )
+                  END IF
+*
+*                 Check if error-exit was taken incorrectly.
+*
+                  IF( .NOT.OK )THEN
+                     WRITE( NOUT, FMT = 9992 )
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+*                 See what data changed inside subroutines.
+*
+                  ISAME( 1 ) = UPLO.EQ.UPLOS
+                  ISAME( 2 ) = NS.EQ.N
+                  ISAME( 3 ) = RALS.EQ.RALPHA
+                  ISAME( 4 ) = LCE( XS, XX, LX )
+                  ISAME( 5 ) = INCXS.EQ.INCX
+                  IF( NULL )THEN
+                     ISAME( 6 ) = LCE( AS, AA, LAA )
+                  ELSE
+                     ISAME( 6 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, AS,
+     $                            AA, LDA )
+                  END IF
+                  IF( .NOT.PACKED )THEN
+                     ISAME( 7 ) = LDAS.EQ.LDA
+                  END IF
+*
+*                 If data was incorrectly changed, report and return.
+*
+                  SAME = .TRUE.
+                  DO 30 I = 1, NARGS
+                     SAME = SAME.AND.ISAME( I )
+                     IF( .NOT.ISAME( I ) )
+     $                  WRITE( NOUT, FMT = 9998 )I
+   30             CONTINUE
+                  IF( .NOT.SAME )THEN
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+                  IF( .NOT.NULL )THEN
+*
+*                    Check the result column by column.
+*
+                     IF( INCX.GT.0 )THEN
+                        DO 40 I = 1, N
+                           Z( I ) = X( I )
+   40                   CONTINUE
+                     ELSE
+                        DO 50 I = 1, N
+                           Z( I ) = X( N - I + 1 )
+   50                   CONTINUE
+                     END IF
+                     JA = 1
+                     DO 60 J = 1, N
+                        W( 1 ) = CONJG( Z( J ) )
+                        IF( UPPER )THEN
+                           JJ = 1
+                           LJ = J
+                        ELSE
+                           JJ = J
+                           LJ = N - J + 1
+                        END IF
+                        CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W,
+     $                              1, ONE, A( JJ, J ), 1, YT, G,
+     $                              AA( JA ), EPS, ERR, FATAL, NOUT,
+     $                              .TRUE. )
+                        IF( FULL )THEN
+                           IF( UPPER )THEN
+                              JA = JA + LDA
+                           ELSE
+                              JA = JA + LDA + 1
+                           END IF
+                        ELSE
+                           JA = JA + LJ
+                        END IF
+                        ERRMAX = MAX( ERRMAX, ERR )
+*                       If got really bad answer, report and return.
+                        IF( FATAL )
+     $                     GO TO 110
+   60                CONTINUE
+                  ELSE
+*                    Avoid repeating tests if N.le.0.
+                     IF( N.LE.0 )
+     $                  GO TO 100
+                  END IF
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', AP)                                         .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', A,', I3, ')                                      .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK5.
+*
+      END
+      SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests CHER2 and CHPR2.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ),
+     $                   ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX, 2 )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, TRANSL
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N,
+     $                   NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      COMPLEX            W( 2 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CHER2, CHPR2, CMAKE, CMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, CONJG, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 8
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 140 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 140
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 130 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 120 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 110 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 100 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     NULL = N.LE.0.OR.ALPHA.EQ.ZERO
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A,
+     $                           NMAX, AA, LDA, N - 1, N - 1, RESET,
+     $                           TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     UPLOS = UPLO
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( FULL )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY, LDA
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL CHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA, LDA )
+                     ELSE IF( PACKED )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL CHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA )
+                     END IF
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9992 )
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+*                    See what data changed inside subroutines.
+*
+                     ISAME( 1 ) = UPLO.EQ.UPLOS
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LCE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LCE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LCE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N,
+     $                               AS, AA, LDA )
+                     END IF
+                     IF( .NOT.PACKED )THEN
+                        ISAME( 9 ) = LDAS.EQ.LDA
+                     END IF
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, N
+                              Z( I, 1 ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, N
+                              Z( I, 1 ) = X( N - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        IF( INCY.GT.0 )THEN
+                           DO 70 I = 1, N
+                              Z( I, 2 ) = Y( I )
+   70                      CONTINUE
+                        ELSE
+                           DO 80 I = 1, N
+                              Z( I, 2 ) = Y( N - I + 1 )
+   80                      CONTINUE
+                        END IF
+                        JA = 1
+                        DO 90 J = 1, N
+                           W( 1 ) = ALPHA*CONJG( Z( J, 2 ) )
+                           W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) )
+                           IF( UPPER )THEN
+                              JJ = 1
+                              LJ = J
+                           ELSE
+                              JJ = J
+                              LJ = N - J + 1
+                           END IF
+                           CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ),
+     $                                 NMAX, W, 1, ONE, A( JJ, J ), 1,
+     $                                 YT, G, AA( JA ), EPS, ERR, FATAL,
+     $                                 NOUT, .TRUE. )
+                           IF( FULL )THEN
+                              IF( UPPER )THEN
+                                 JA = JA + LDA
+                              ELSE
+                                 JA = JA + LDA + 1
+                              END IF
+                           ELSE
+                              JA = JA + LJ
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 150
+   90                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with N.le.0.
+                        IF( N.LE.0 )
+     $                     GO TO 140
+                     END IF
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 170
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  160 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      INCY, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY
+      END IF
+*
+  170 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), X,', I2, ', Y,', I2, ', AP)                     ',
+     $      '       .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ')             ',
+     $      '            .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK6.
+*
+      END
+      SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 2 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  ALPHA, RALPHA, BETA, A, X and Y should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, BETA
+      REAL               RALPHA
+*     .. Local Arrays ..
+      COMPLEX            A( 1, 1 ), X( 1 ), Y( 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CGBMV, CGEMV, CGERC, CGERU, CHBMV, CHEMV, CHER,
+     $                   CHER2, CHKXER, CHPMV, CHPR, CHPR2, CTBMV,
+     $                   CTBSV, CTPMV, CTPSV, CTRMV, CTRSV
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+      GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+     $        90, 100, 110, 120, 130, 140, 150, 160,
+     $        170 )ISNUM
+   10 INFOT = 1
+      CALL CGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   20 INFOT = 1
+      CALL CGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   30 INFOT = 1
+      CALL CHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   40 INFOT = 1
+      CALL CHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   50 INFOT = 1
+      CALL CHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   60 INFOT = 1
+      CALL CTRMV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTRMV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTRMV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   70 INFOT = 1
+      CALL CTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   80 INFOT = 1
+      CALL CTPMV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTPMV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTPMV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTPMV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CTPMV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   90 INFOT = 1
+      CALL CTRSV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTRSV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTRSV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  100 INFOT = 1
+      CALL CTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  110 INFOT = 1
+      CALL CTPSV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTPSV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTPSV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTPSV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CTPSV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  120 INFOT = 1
+      CALL CGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  130 INFOT = 1
+      CALL CGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  140 INFOT = 1
+      CALL CHER( '/', 0, RALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHER( 'U', -1, RALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CHER( 'U', 0, RALPHA, X, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHER( 'U', 2, RALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  150 INFOT = 1
+      CALL CHPR( '/', 0, RALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHPR( 'U', -1, RALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CHPR( 'U', 0, RALPHA, X, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  160 INFOT = 1
+      CALL CHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  170 INFOT = 1
+      CALL CHPR2( '/', 0, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+  180 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of CCHKE.
+*
+      END
+      SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL,
+     $                  KU, RESET, TRANSL )
+*
+*  Generates values for an M by N matrix A within the bandwidth
+*  defined by KL and KU.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+      COMPLEX            ROGUE
+      PARAMETER          ( ROGUE = ( -1.0E10, 1.0E10 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+      REAL               RROGUE
+      PARAMETER          ( RROGUE = -1.0E10 )
+*     .. Scalar Arguments ..
+      COMPLEX            TRANSL
+      INTEGER            KL, KU, LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK
+      LOGICAL            GEN, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      COMPLEX            CBEG
+      EXTERNAL           CBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, CONJG, MAX, MIN, REAL
+*     .. Executable Statements ..
+      GEN = TYPE( 1: 1 ).EQ.'G'
+      SYM = TYPE( 1: 1 ).EQ.'H'
+      TRI = TYPE( 1: 1 ).EQ.'T'
+      UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               IF( ( I.LE.J.AND.J - I.LE.KU ).OR.
+     $             ( I.GE.J.AND.I - J.LE.KL ) )THEN
+                  A( I, J ) = CBEG( RESET ) + TRANSL
+               ELSE
+                  A( I, J ) = ZERO
+               END IF
+               IF( I.NE.J )THEN
+                  IF( SYM )THEN
+                     A( J, I ) = CONJG( A( I, J ) )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( SYM )
+     $      A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO )
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'GB' )THEN
+         DO 90 J = 1, N
+            DO 60 I1 = 1, KU + 1 - J
+               AA( I1 + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J )
+               AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J )
+   70       CONTINUE
+            DO 80 I3 = I2, LDA
+               AA( I3 + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+   90    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN
+         DO 130 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 100 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  100       CONTINUE
+            DO 110 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+  110       CONTINUE
+            DO 120 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  120       CONTINUE
+            IF( SYM )THEN
+               JJ = J + ( J - 1 )*LDA
+               AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE )
+            END IF
+  130    CONTINUE
+      ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN
+         DO 170 J = 1, N
+            IF( UPPER )THEN
+               KK = KL + 1
+               IBEG = MAX( 1, KL + 2 - J )
+               IF( UNIT )THEN
+                  IEND = KL
+               ELSE
+                  IEND = KL + 1
+               END IF
+            ELSE
+               KK = 1
+               IF( UNIT )THEN
+                  IBEG = 2
+               ELSE
+                  IBEG = 1
+               END IF
+               IEND = MIN( KL + 1, 1 + M - J )
+            END IF
+            DO 140 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  140       CONTINUE
+            DO 150 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J )
+  150       CONTINUE
+            DO 160 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  160       CONTINUE
+            IF( SYM )THEN
+               JJ = KK + ( J - 1 )*LDA
+               AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE )
+            END IF
+  170    CONTINUE
+      ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN
+         IOFF = 0
+         DO 190 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 180 I = IBEG, IEND
+               IOFF = IOFF + 1
+               AA( IOFF ) = A( I, J )
+               IF( I.EQ.J )THEN
+                  IF( UNIT )
+     $               AA( IOFF ) = ROGUE
+                  IF( SYM )
+     $               AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE )
+               END IF
+  180       CONTINUE
+  190    CONTINUE
+      END IF
+      RETURN
+*
+*     End of CMAKE.
+*
+      END
+      SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y,
+     $                  INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ) )
+      REAL               RZERO, RONE
+      PARAMETER          ( RZERO = 0.0, RONE = 1.0 )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      REAL               EPS, ERR
+      INTEGER            INCX, INCY, M, N, NMAX, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * )
+      REAL               G( * )
+*     .. Local Scalars ..
+      COMPLEX            C
+      REAL               ERRI
+      INTEGER            I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL
+      LOGICAL            CTRAN, TRAN
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, AIMAG, CONJG, MAX, REAL, SQRT
+*     .. Statement Functions ..
+      REAL               ABS1
+*     .. Statement Function definitions ..
+      ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) )
+*     .. Executable Statements ..
+      TRAN = TRANS.EQ.'T'
+      CTRAN = TRANS.EQ.'C'
+      IF( TRAN.OR.CTRAN )THEN
+         ML = N
+         NL = M
+      ELSE
+         ML = M
+         NL = N
+      END IF
+      IF( INCX.LT.0 )THEN
+         KX = NL
+         INCXL = -1
+      ELSE
+         KX = 1
+         INCXL = 1
+      END IF
+      IF( INCY.LT.0 )THEN
+         KY = ML
+         INCYL = -1
+      ELSE
+         KY = 1
+         INCYL = 1
+      END IF
+*
+*     Compute expected result in YT using data in A, X and Y.
+*     Compute gauges in G.
+*
+      IY = KY
+      DO 40 I = 1, ML
+         YT( IY ) = ZERO
+         G( IY ) = RZERO
+         JX = KX
+         IF( TRAN )THEN
+            DO 10 J = 1, NL
+               YT( IY ) = YT( IY ) + A( J, I )*X( JX )
+               G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) )
+               JX = JX + INCXL
+   10       CONTINUE
+         ELSE IF( CTRAN )THEN
+            DO 20 J = 1, NL
+               YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX )
+               G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) )
+               JX = JX + INCXL
+   20       CONTINUE
+         ELSE
+            DO 30 J = 1, NL
+               YT( IY ) = YT( IY ) + A( I, J )*X( JX )
+               G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) )
+               JX = JX + INCXL
+   30       CONTINUE
+         END IF
+         YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY )
+         G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) )
+         IY = IY + INCYL
+   40 CONTINUE
+*
+*     Compute the error ratio for this result.
+*
+      ERR = ZERO
+      DO 50 I = 1, ML
+         ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS
+         IF( G( I ).NE.RZERO )
+     $      ERRI = ERRI/G( I )
+         ERR = MAX( ERR, ERRI )
+         IF( ERR*SQRT( EPS ).GE.RONE )
+     $      GO TO 60
+   50 CONTINUE
+*     If the loop completes, all results are at least half accurate.
+      GO TO 80
+*
+*     Report fatal error.
+*
+   60 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 70 I = 1, ML
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, YT( I ),
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I,
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I )
+         END IF
+   70 CONTINUE
+*
+   80 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'                       EXPECTED RE',
+     $      'SULT                    COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( '  (', G15.6, ',', G15.6, ')' ) )
+*
+*     End of CMVCH.
+*
+      END
+      LOGICAL FUNCTION LCE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      COMPLEX            RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LCE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LCE = .FALSE.
+   30 RETURN
+*
+*     End of LCE.
+*
+      END
+      LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE', 'HE' or 'HP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX            AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'HE' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LCERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LCERES = .FALSE.
+   80 RETURN
+*
+*     End of LCERES.
+*
+      END
+      COMPLEX FUNCTION CBEG( RESET )
+*
+*  Generates complex numbers as pairs of random numbers uniformly
+*  distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, J, MI, MJ
+*     .. Save statement ..
+      SAVE               I, IC, J, MI, MJ
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         MJ = 457
+         I = 7
+         J = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I or J is bounded between 1 and 999.
+*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I or J = 4 or 8, the period will be 25.
+*     If initial I or J = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I or J
+*     in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      J = J*MJ
+      I = I - 1000*( I/1000 )
+      J = J - 1000*( J/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 )
+      RETURN
+*
+*     End of CBEG.
+*
+      END
+      REAL FUNCTION SDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y
+*     .. Executable Statements ..
+      SDIFF = X - Y
+      RETURN
+*
+*     End of SDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 2 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 2 BLAS routines.
+*
+*  It is called by the Level 2 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/cblat3.dat b/blas/testing/cblat3.dat
new file mode 100644
index 0000000..59881ea
--- /dev/null
+++ b/blas/testing/cblat3.dat

@@ -0,0 +1,23 @@
+'cblat3.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'cblat3.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+F        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+3                 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+CGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+CHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+CSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+CTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+CTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+CHERK  T PUT F FOR NO TEST. SAME COLUMNS.
+CSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+CHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+CSYR2K T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/cblat3.f b/blas/testing/cblat3.f
new file mode 100644
index 0000000..09f2cb9
--- /dev/null
+++ b/blas/testing/cblat3.f

@@ -0,0 +1,3492 @@
+*> \brief \b CBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM CBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX          Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 9 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 23 lines:
+*> 'cblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> CGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHERK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex_blas_testing
+*
+*  =====================================================================
+      PROGRAM CBLAT3
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 9 )
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 65 )
+      INTEGER            NIDMAX, NALMAX, NBEMAX
+      PARAMETER          ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      REAL               EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANSA, TRANSB
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      COMPLEX            AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBEMAX ),
+     $                   BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      REAL               SDIFF
+      LOGICAL            LCE
+      EXTERNAL           SDIFF, LCE
+*     .. External Subroutines ..
+      EXTERNAL           CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'CGEMM ', 'CHEMM ', 'CSYMM ', 'CTRMM ',
+     $                   'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K',
+     $                   'CSYR2K'/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 220
+         END IF
+   10 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9995 )
+      WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9984 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 20 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   20 CONTINUE
+   30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+      DO 40 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 50
+   40 CONTINUE
+      WRITE( NOUT, FMT = 9990 )SNAMET
+      STOP
+   50 LTEST( I ) = LTESTT
+      GO TO 30
+*
+   60 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(RZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of CMMCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 100 J = 1, N
+         DO 90 I = 1, N
+            AB( I, J ) = MAX( I - J + 1, 0 )
+   90    CONTINUE
+         AB( J, NMAX + 1 ) = J
+         AB( 1, NMAX + J ) = J
+         C( J, 1 ) = ZERO
+  100 CONTINUE
+      DO 110 J = 1, N
+         CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  110 CONTINUE
+*     CC holds the exact result. On exit from CMMCH CT holds
+*     the result computed by CMMCH.
+      TRANSA = 'N'
+      TRANSB = 'N'
+      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LCE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'C'
+      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LCE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      DO 120 J = 1, N
+         AB( J, NMAX + 1 ) = N - J + 1
+         AB( 1, NMAX + J ) = N - J + 1
+  120 CONTINUE
+      DO 130 J = 1, N
+         CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+     $                     ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+      TRANSA = 'C'
+      TRANSB = 'N'
+      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LCE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'C'
+      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LCE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 200 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 150, 150, 160, 160, 170, 170,
+     $              180, 180 )ISNUM
+*           Test CGEMM, 01.
+  140       CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test CHEMM, 02, CSYMM, 03.
+  150       CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test CTRMM, 04, CTRSM, 05.
+  160       CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+     $                  AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C )
+            GO TO 190
+*           Test CHERK, 06, CSYRK, 07.
+  170       CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test CHER2K, 08, CSYR2K, 09.
+  180       CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+            GO TO 190
+*
+  190       IF( FATAL.AND.SFATAL )
+     $         GO TO 210
+         END IF
+  200 CONTINUE
+      WRITE( NOUT, FMT = 9986 )
+      GO TO 230
+*
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9985 )
+      GO TO 230
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9991 )
+*
+  230 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' TESTS OF THE COMPLEX          LEVEL 3 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( '   FOR N              ', 9I6 )
+ 9993 FORMAT( '   FOR ALPHA          ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9992 FORMAT( '   FOR BETA           ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9989 FORMAT( ' ERROR IN CMMCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1,
+     $      ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+     $      'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+     $      'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+     $      '*******' )
+ 9988 FORMAT( A6, L2 )
+ 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of CBLAT3.
+*
+      END
+      SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests CGEMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, BETA, BLS
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+     $                   LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+     $                   MA, MB, MS, N, NA, NARGS, NB, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRANA, TRANB
+      CHARACTER*1        TRANAS, TRANBS, TRANSA, TRANSB
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CMAKE, CMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+*
+      NARGS = 13
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 110 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 100 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 100
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 90 IK = 1, NIDIM
+               K = IDIM( IK )
+*
+               DO 80 ICA = 1, 3
+                  TRANSA = ICH( ICA: ICA )
+                  TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+                  IF( TRANA )THEN
+                     MA = K
+                     NA = M
+                  ELSE
+                     MA = M
+                     NA = K
+                  END IF
+*                 Set LDA to 1 more than minimum value if room.
+                  LDA = MA
+                  IF( LDA.LT.NMAX )
+     $               LDA = LDA + 1
+*                 Skip tests if not enough room.
+                  IF( LDA.GT.NMAX )
+     $               GO TO 80
+                  LAA = LDA*NA
+*
+*                 Generate the matrix A.
+*
+                  CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 70 ICB = 1, 3
+                     TRANSB = ICH( ICB: ICB )
+                     TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+                     IF( TRANB )THEN
+                        MB = N
+                        NB = K
+                     ELSE
+                        MB = K
+                        NB = N
+                     END IF
+*                    Set LDB to 1 more than minimum value if room.
+                     LDB = MB
+                     IF( LDB.LT.NMAX )
+     $                  LDB = LDB + 1
+*                    Skip tests if not enough room.
+                     IF( LDB.GT.NMAX )
+     $                  GO TO 70
+                     LBB = LDB*NB
+*
+*                    Generate the matrix B.
+*
+                     CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB,
+     $                           LDB, RESET, ZERO )
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the matrix C.
+*
+                           CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX,
+     $                                 CC, LDC, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           TRANAS = TRANSA
+                           TRANBS = TRANSB
+                           MS = M
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LBB
+                              BS( I ) = BB( I )
+   20                      CONTINUE
+                           LDBS = LDB
+                           BLS = BETA
+                           DO 30 I = 1, LCC
+                              CS( I ) = CC( I )
+   30                      CONTINUE
+                           LDCS = LDC
+*
+*                          Call the subroutine.
+*
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                        TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB,
+     $                        BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CGEMM( TRANSA, TRANSB, M, N, K, ALPHA,
+     $                                 AA, LDA, BB, LDB, BETA, CC, LDC )
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = TRANSA.EQ.TRANAS
+                           ISAME( 2 ) = TRANSB.EQ.TRANBS
+                           ISAME( 3 ) = MS.EQ.M
+                           ISAME( 4 ) = NS.EQ.N
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = ALS.EQ.ALPHA
+                           ISAME( 7 ) = LCE( AS, AA, LAA )
+                           ISAME( 8 ) = LDAS.EQ.LDA
+                           ISAME( 9 ) = LCE( BS, BB, LBB )
+                           ISAME( 10 ) = LDBS.EQ.LDB
+                           ISAME( 11 ) = BLS.EQ.BETA
+                           IF( NULL )THEN
+                              ISAME( 12 ) = LCE( CS, CC, LCC )
+                           ELSE
+                              ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS,
+     $                                      CC, LDC )
+                           END IF
+                           ISAME( 13 ) = LDCS.EQ.LDC
+*
+*                          If data was incorrectly changed, report
+*                          and return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL CMMCH( TRANSA, TRANSB, M, N, K,
+     $                                    ALPHA, A, NMAX, B, NMAX, BETA,
+     $                                    C, NMAX, CT, G, CC, LDC, EPS,
+     $                                    ERR, FATAL, NOUT, .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K,
+     $   ALPHA, LDA, LDB, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',',
+     $      3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3,
+     $      ',(', F4.1, ',', F4.1, '), C,', I3, ').' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK1.
+*
+      END
+      SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests CHEMM and CSYMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, BETA, BLS
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+     $                   LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, LEFT, NULL, RESET, SAME
+      CHARACTER*1        SIDE, SIDES, UPLO, UPLOS
+      CHARACTER*2        ICHS, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CHEMM, CMAKE, CMMCH, CSYMM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHS/'LR'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 90 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 90
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 90
+            LBB = LDB*N
+*
+*           Generate the matrix B.
+*
+            CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+     $                  ZERO )
+*
+            DO 80 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+*
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+*                 Generate the hermitian or symmetric matrix A.
+*
+                  CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX,
+     $                        AA, LDA, RESET, ZERO )
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the
+*                       subroutine.
+*
+                        SIDES = SIDE
+                        UPLOS = UPLO
+                        MS = M
+                        NS = N
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BLS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE,
+     $                     UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        IF( CONJ )THEN
+                           CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                                 BB, LDB, BETA, CC, LDC )
+                        ELSE
+                           CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                                 BB, LDB, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9994 )
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = SIDES.EQ.SIDE
+                        ISAME( 2 ) = UPLOS.EQ.UPLO
+                        ISAME( 3 ) = MS.EQ.M
+                        ISAME( 4 ) = NS.EQ.N
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LCE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LCE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BLS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LCE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result.
+*
+                           IF( LEFT )THEN
+                              CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A,
+     $                                    NMAX, B, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           ELSE
+                              CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B,
+     $                                    NMAX, A, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and
+*                          return.
+                           IF( FATAL )
+     $                        GO TO 110
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 120
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA,
+     $   LDB, BETA, LDC
+*
+  120 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $      ',', F4.1, '), C,', I3, ')    .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK2.
+*
+      END
+      SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+     $                  B, BB, BS, CT, G, C )
+*
+*  Tests CTRMM and CTRSM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CT( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS
+      REAL               ERR, ERRMAX
+      INTEGER            I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+     $                   LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+     $                   NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+     $                   UPLOS
+      CHARACTER*2        ICHD, ICHS, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CMAKE, CMMCH, CTRMM, CTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+*     .. Executable Statements ..
+*
+      NARGS = 11
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*     Set up zero matrix for CMMCH.
+      DO 20 J = 1, NMAX
+         DO 10 I = 1, NMAX
+            C( I, J ) = ZERO
+   10    CONTINUE
+   20 CONTINUE
+*
+      DO 140 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 130 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 130
+            LBB = LDB*N
+            NULL = M.LE.0.OR.N.LE.0
+*
+            DO 120 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 130
+               LAA = LDA*NA
+*
+               DO 110 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+                  DO 100 ICT = 1, 3
+                     TRANSA = ICHT( ICT: ICT )
+*
+                     DO 90 ICD = 1, 2
+                        DIAG = ICHD( ICD: ICD )
+*
+                        DO 80 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+*                          Generate the matrix A.
+*
+                           CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A,
+     $                                 NMAX, AA, LDA, RESET, ZERO )
+*
+*                          Generate the matrix B.
+*
+                           CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX,
+     $                                 BB, LDB, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           SIDES = SIDE
+                           UPLOS = UPLO
+                           TRANAS = TRANSA
+                           DIAGS = DIAG
+                           MS = M
+                           NS = N
+                           ALS = ALPHA
+                           DO 30 I = 1, LAA
+                              AS( I ) = AA( I )
+   30                      CONTINUE
+                           LDAS = LDA
+                           DO 40 I = 1, LBB
+                              BS( I ) = BB( I )
+   40                      CONTINUE
+                           LDBS = LDB
+*
+*                          Call the subroutine.
+*
+                           IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = SIDES.EQ.SIDE
+                           ISAME( 2 ) = UPLOS.EQ.UPLO
+                           ISAME( 3 ) = TRANAS.EQ.TRANSA
+                           ISAME( 4 ) = DIAGS.EQ.DIAG
+                           ISAME( 5 ) = MS.EQ.M
+                           ISAME( 6 ) = NS.EQ.N
+                           ISAME( 7 ) = ALS.EQ.ALPHA
+                           ISAME( 8 ) = LCE( AS, AA, LAA )
+                           ISAME( 9 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 10 ) = LCE( BS, BB, LBB )
+                           ELSE
+                              ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS,
+     $                                      BB, LDB )
+                           END IF
+                           ISAME( 11 ) = LDBS.EQ.LDB
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 50 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   50                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+                              IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+*
+*                                Check the result.
+*
+                                 IF( LEFT )THEN
+                                    CALL CMMCH( TRANSA, 'N', M, N, M,
+     $                                          ALPHA, A, NMAX, B, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 ELSE
+                                    CALL CMMCH( 'N', TRANSA, M, N, N,
+     $                                          ALPHA, B, NMAX, A, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 END IF
+                              ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+*
+*                                Compute approximation to original
+*                                matrix.
+*
+                                 DO 70 J = 1, N
+                                    DO 60 I = 1, M
+                                       C( I, J ) = BB( I + ( J - 1 )*
+     $                                             LDB )
+                                       BB( I + ( J - 1 )*LDB ) = ALPHA*
+     $                                    B( I, J )
+   60                               CONTINUE
+   70                            CONTINUE
+*
+                                 IF( LEFT )THEN
+                                    CALL CMMCH( TRANSA, 'N', M, N, M,
+     $                                          ONE, A, NMAX, C, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 ELSE
+                                    CALL CMMCH( 'N', TRANSA, M, N, N,
+     $                                          ONE, C, NMAX, A, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 END IF
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 150
+                           END IF
+*
+   80                   CONTINUE
+*
+   90                CONTINUE
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M,
+     $   N, ALPHA, LDA, LDB
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ')         ',
+     $      '      .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK3.
+*
+      END
+      SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests CHERK and CSYRK.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ) )
+      REAL               RONE, RZERO
+      PARAMETER          ( RONE = 1.0, RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, BETA, BETS
+      REAL               ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+     $                   LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CHERK, CMAKE, CMMCH, CSYRK
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, MAX, REAL
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 10
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 100
+         LCC = LDC*N
+*
+         DO 90 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 80 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                     RESET, ZERO )
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     IF( CONJ )THEN
+                        RALPHA = REAL( ALPHA )
+                        ALPHA = CMPLX( RALPHA, RZERO )
+                     END IF
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = REAL( BETA )
+                           BETA = CMPLX( RBETA, RZERO )
+                        END IF
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ.
+     $                            RZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        IF( CONJ )THEN
+                           RALS = RALPHA
+                        ELSE
+                           ALS = ALPHA
+                        END IF
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 20 I = 1, LCC
+                           CS( I ) = CC( I )
+   20                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, RALPHA, LDA, RBETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA,
+     $                                 LDA, RBETA, CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                 LDA, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        IF( CONJ )THEN
+                           ISAME( 5 ) = RALS.EQ.RALPHA
+                        ELSE
+                           ISAME( 5 ) = ALS.EQ.ALPHA
+                        END IF
+                        ISAME( 6 ) = LCE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        IF( CONJ )THEN
+                           ISAME( 8 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 8 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 9 ) = LCE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N,
+     $                                  N, CS, CC, LDC )
+                        END IF
+                        ISAME( 10 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 30 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   30                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JC = 1
+                           DO 40 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 CALL CMMCH( TRANST, 'N', LJ, 1, K,
+     $                                       ALPHA, A( 1, JJ ), NMAX,
+     $                                       A( 1, J ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 CALL CMMCH( 'N', TRANST, LJ, 1, K,
+     $                                       ALPHA, A( JJ, 1 ), NMAX,
+     $                                       A( J, 1 ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 110
+   40                      CONTINUE
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA,
+     $      LDA, RBETA, LDC
+      ELSE
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, BETA, LDC
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ',
+     $      '          .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+     $      '), C,', I3, ')          .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK4.
+*
+      END
+      SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+*
+*  Tests CHER2K and CSYR2K.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+      REAL               RONE, RZERO
+      PARAMETER          ( RONE = 1.0, RZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX            AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+     $                   ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+     $                   BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      REAL               G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, ALS, BETA, BETS
+      REAL               ERR, ERRMAX, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+     $                   K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+     $                   LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LCE, LCERES
+      EXTERNAL           LCE, LCERES
+*     .. External Subroutines ..
+      EXTERNAL           CHER2K, CMAKE, CMMCH, CSYR2K
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, CONJG, MAX, REAL
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 130 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 130
+         LCC = LDC*N
+*
+         DO 120 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 110 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 110
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               IF( TRAN )THEN
+                  CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+     $                        LDA, RESET, ZERO )
+               ELSE
+                  CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+               END IF
+*
+*              Generate the matrix B.
+*
+               LDB = LDA
+               LBB = LAA
+               IF( TRAN )THEN
+                  CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ),
+     $                        2*NMAX, BB, LDB, RESET, ZERO )
+               ELSE
+                  CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+     $                        NMAX, BB, LDB, RESET, ZERO )
+               END IF
+*
+               DO 100 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 90 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 80 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = REAL( BETA )
+                           BETA = CMPLX( RBETA, RZERO )
+                        END IF
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ.
+     $                            ZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                  LDA, BB, LDB, RBETA, CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                  LDA, BB, LDB, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LCE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LCE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        IF( CONJ )THEN
+                           ISAME( 10 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 10 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LCE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JJAB = 1
+                           JC = 1
+                           DO 70 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 DO 50 I = 1, K
+                                    W( I ) = ALPHA*AB( ( J - 1 )*2*
+     $                                       NMAX + K + I )
+                                    IF( CONJ )THEN
+                                       W( K + I ) = CONJG( ALPHA )*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    ELSE
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    END IF
+   50                            CONTINUE
+                                 CALL CMMCH( TRANST, 'N', LJ, 1, 2*K,
+     $                                       ONE, AB( JJAB ), 2*NMAX, W,
+     $                                       2*NMAX, BETA, C( JJ, J ),
+     $                                       NMAX, CT, G, CC( JC ), LDC,
+     $                                       EPS, ERR, FATAL, NOUT,
+     $                                       .TRUE. )
+                              ELSE
+                                 DO 60 I = 1, K
+                                    IF( CONJ )THEN
+                                       W( I ) = ALPHA*CONJG( AB( ( K +
+     $                                          I - 1 )*NMAX + J ) )
+                                       W( K + I ) = CONJG( ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J ) )
+                                    ELSE
+                                       W( I ) = ALPHA*AB( ( K + I - 1 )*
+     $                                          NMAX + J )
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J )
+                                    END IF
+   60                            CONTINUE
+                                 CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE,
+     $                                       AB( JJ ), NMAX, W, 2*NMAX,
+     $                                       BETA, C( JJ, J ), NMAX, CT,
+     $                                       G, CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                                 IF( TRAN )
+     $                              JJAB = JJAB + 2*NMAX
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 140
+   70                      CONTINUE
+                        END IF
+*
+   80                CONTINUE
+*
+   90             CONTINUE
+*
+  100          CONTINUE
+*
+  110       CONTINUE
+*
+  120    CONTINUE
+*
+  130 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, LDB, RBETA, LDC
+      ELSE
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, LDB, BETA, LDC
+      END IF
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1,
+     $      ', C,', I3, ')           .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $      ',', F4.1, '), C,', I3, ')    .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of CCHK5.
+*
+      END
+      SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 3 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  A, B and C should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  3-19-92:  Initialize ALPHA, BETA, RALPHA, and RBETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to CSYMM and CHEMM
+*            with INFOT = 9  (eca)
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0E0, TWO = 2.0E0 )
+*     .. Local Scalars ..
+      COMPLEX            ALPHA, BETA
+      REAL               RALPHA, RBETA
+*     .. Local Arrays ..
+      COMPLEX            A( 2, 1 ), B( 2, 1 ), C( 2, 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CHEMM, CHER2K, CHERK, CHKXER, CSYMM,
+     $                   CSYR2K, CSYRK, CTRMM, CTRSM
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+*
+*     Initialize ALPHA, BETA, RALPHA, and RBETA.
+*
+      ALPHA = CMPLX( ONE, -ONE )
+      BETA = CMPLX( TWO, -TWO )
+      RALPHA = ONE
+      RBETA = TWO
+*
+      GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+     $        90 )ISNUM
+   10 INFOT = 1
+      CALL CGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 1
+      CALL CGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 1
+      CALL CGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL CGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL CGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   20 INFOT = 1
+      CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   30 INFOT = 1
+      CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   40 INFOT = 1
+      CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   50 INFOT = 1
+      CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   60 INFOT = 1
+      CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   70 INFOT = 1
+      CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   80 INFOT = 1
+      CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   90 INFOT = 1
+      CALL CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL CSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL CSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+  100 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of CCHKE.
+*
+      END
+      SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+     $                  TRANSL )
+*
+*  Generates values for an M by N matrix A.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'HE', 'SY' or 'TR'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+      COMPLEX            ROGUE
+      PARAMETER          ( ROGUE = ( -1.0E10, 1.0E10 ) )
+      REAL               RZERO
+      PARAMETER          ( RZERO = 0.0 )
+      REAL               RROGUE
+      PARAMETER          ( RROGUE = -1.0E10 )
+*     .. Scalar Arguments ..
+      COMPLEX            TRANSL
+      INTEGER            LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX            A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J, JJ
+      LOGICAL            GEN, HER, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      COMPLEX            CBEG
+      EXTERNAL           CBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, CONJG, REAL
+*     .. Executable Statements ..
+      GEN = TYPE.EQ.'GE'
+      HER = TYPE.EQ.'HE'
+      SYM = TYPE.EQ.'SY'
+      TRI = TYPE.EQ.'TR'
+      UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               A( I, J ) = CBEG( RESET ) + TRANSL
+               IF( I.NE.J )THEN
+*                 Set some elements to zero
+                  IF( N.GT.3.AND.J.EQ.N/2 )
+     $               A( I, J ) = ZERO
+                  IF( HER )THEN
+                     A( J, I ) = CONJG( A( I, J ) )
+                  ELSE IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( HER )
+     $      A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO )
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+         DO 90 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 60 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   70       CONTINUE
+            DO 80 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+            IF( HER )THEN
+               JJ = J + ( J - 1 )*LDA
+               AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE )
+            END IF
+   90    CONTINUE
+      END IF
+      RETURN
+*
+*     End of CMAKE.
+*
+      END
+      SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+     $                  BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+     $                  NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER          ( ZERO = ( 0.0, 0.0 ) )
+      REAL               RZERO, RONE
+      PARAMETER          ( RZERO = 0.0, RONE = 1.0 )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      REAL               EPS, ERR
+      INTEGER            KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANSA, TRANSB
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), C( LDC, * ),
+     $                   CC( LDCC, * ), CT( * )
+      REAL               G( * )
+*     .. Local Scalars ..
+      COMPLEX            CL
+      REAL               ERRI
+      INTEGER            I, J, K
+      LOGICAL            CTRANA, CTRANB, TRANA, TRANB
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, AIMAG, CONJG, MAX, REAL, SQRT
+*     .. Statement Functions ..
+      REAL               ABS1
+*     .. Statement Function definitions ..
+      ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) )
+*     .. Executable Statements ..
+      TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+      TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+      CTRANA = TRANSA.EQ.'C'
+      CTRANB = TRANSB.EQ.'C'
+*
+*     Compute expected result, one column at a time, in CT using data
+*     in A, B and C.
+*     Compute gauges in G.
+*
+      DO 220 J = 1, N
+*
+         DO 10 I = 1, M
+            CT( I ) = ZERO
+            G( I ) = RZERO
+   10    CONTINUE
+         IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+            DO 30 K = 1, KK
+               DO 20 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( K, J )
+                  G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) )
+   20          CONTINUE
+   30       CONTINUE
+         ELSE IF( TRANA.AND..NOT.TRANB )THEN
+            IF( CTRANA )THEN
+               DO 50 K = 1, KK
+                  DO 40 I = 1, M
+                     CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J )
+                     G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                        ABS1( B( K, J ) )
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 70 K = 1, KK
+                  DO 60 I = 1, M
+                     CT( I ) = CT( I ) + A( K, I )*B( K, J )
+                     G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                        ABS1( B( K, J ) )
+   60             CONTINUE
+   70          CONTINUE
+            END IF
+         ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+            IF( CTRANB )THEN
+               DO 90 K = 1, KK
+                  DO 80 I = 1, M
+                     CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) )
+                     G( I ) = G( I ) + ABS1( A( I, K ) )*
+     $                        ABS1( B( J, K ) )
+   80             CONTINUE
+   90          CONTINUE
+            ELSE
+               DO 110 K = 1, KK
+                  DO 100 I = 1, M
+                     CT( I ) = CT( I ) + A( I, K )*B( J, K )
+                     G( I ) = G( I ) + ABS1( A( I, K ) )*
+     $                        ABS1( B( J, K ) )
+  100             CONTINUE
+  110          CONTINUE
+            END IF
+         ELSE IF( TRANA.AND.TRANB )THEN
+            IF( CTRANA )THEN
+               IF( CTRANB )THEN
+                  DO 130 K = 1, KK
+                     DO 120 I = 1, M
+                        CT( I ) = CT( I ) + CONJG( A( K, I ) )*
+     $                            CONJG( B( J, K ) )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  120                CONTINUE
+  130             CONTINUE
+               ELSE
+                  DO 150 K = 1, KK
+                     DO 140 I = 1, M
+                        CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  140                CONTINUE
+  150             CONTINUE
+               END IF
+            ELSE
+               IF( CTRANB )THEN
+                  DO 170 K = 1, KK
+                     DO 160 I = 1, M
+                        CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  160                CONTINUE
+  170             CONTINUE
+               ELSE
+                  DO 190 K = 1, KK
+                     DO 180 I = 1, M
+                        CT( I ) = CT( I ) + A( K, I )*B( J, K )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  180                CONTINUE
+  190             CONTINUE
+               END IF
+            END IF
+         END IF
+         DO 200 I = 1, M
+            CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+            G( I ) = ABS1( ALPHA )*G( I ) +
+     $               ABS1( BETA )*ABS1( C( I, J ) )
+  200    CONTINUE
+*
+*        Compute the error ratio for this result.
+*
+         ERR = ZERO
+         DO 210 I = 1, M
+            ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS
+            IF( G( I ).NE.RZERO )
+     $         ERRI = ERRI/G( I )
+            ERR = MAX( ERR, ERRI )
+            IF( ERR*SQRT( EPS ).GE.RONE )
+     $         GO TO 230
+  210    CONTINUE
+*
+  220 CONTINUE
+*
+*     If the loop completes, all results are at least half accurate.
+      GO TO 250
+*
+*     Report fatal error.
+*
+  230 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 240 I = 1, M
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+         END IF
+  240 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9997 )J
+*
+  250 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'                       EXPECTED RE',
+     $      'SULT                    COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( '  (', G15.6, ',', G15.6, ')' ) )
+ 9997 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+*     End of CMMCH.
+*
+      END
+      LOGICAL FUNCTION LCE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      COMPLEX            RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LCE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LCE = .FALSE.
+   30 RETURN
+*
+*     End of LCE.
+*
+      END
+      LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE' or 'HE' or 'SY'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX            AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LCERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LCERES = .FALSE.
+   80 RETURN
+*
+*     End of LCERES.
+*
+      END
+      COMPLEX FUNCTION CBEG( RESET )
+*
+*  Generates complex numbers as pairs of random numbers uniformly
+*  distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, J, MI, MJ
+*     .. Save statement ..
+      SAVE               I, IC, J, MI, MJ
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         MJ = 457
+         I = 7
+         J = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I or J is bounded between 1 and 999.
+*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I or J = 4 or 8, the period will be 25.
+*     If initial I or J = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I or J
+*     in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      J = J*MJ
+      I = I - 1000*( I/1000 )
+      J = J - 1000*( J/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 )
+      RETURN
+*
+*     End of CBEG.
+*
+      END
+      REAL FUNCTION SDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y
+*     .. Executable Statements ..
+      SDIFF = X - Y
+      RETURN
+*
+*     End of SDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 3 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 3 BLAS routines.
+*
+*  It is called by the Level 3 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/dblat1.f b/blas/testing/dblat1.f
new file mode 100644
index 0000000..03d9f13
--- /dev/null
+++ b/blas/testing/dblat1.f

@@ -0,0 +1,1065 @@
+*> \brief \b DBLAT1
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM DBLAT1
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>    Test program for the DOUBLE PRECISION Level 1 BLAS.
+*>
+*>    Based upon the original BLAS test routine together with:
+*>    F06EAF Example Program Text
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup double_blas_testing
+*
+*  =====================================================================
+      PROGRAM DBLAT1
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION SFAC
+      INTEGER          IC
+*     .. External Subroutines ..
+      EXTERNAL         CHECK0, CHECK1, CHECK2, CHECK3, HEADER
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA             SFAC/9.765625D-4/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999)
+      DO 20 IC = 1, 13
+         ICASE = IC
+         CALL HEADER
+*
+*        .. Initialize  PASS,  INCX,  and INCY for a new case. ..
+*        .. the value 9999 for INCX or INCY will appear in the ..
+*        .. detailed  output, if any, for cases  that do not involve ..
+*        .. these parameters ..
+*
+         PASS = .TRUE.
+         INCX = 9999
+         INCY = 9999
+         IF (ICASE.EQ.3 .OR. ICASE.EQ.11) THEN
+            CALL CHECK0(SFAC)
+         ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR.
+     +            ICASE.EQ.10) THEN
+            CALL CHECK1(SFAC)
+         ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR.
+     +            ICASE.EQ.6 .OR. ICASE.EQ.12 .OR. ICASE.EQ.13) THEN
+            CALL CHECK2(SFAC)
+         ELSE IF (ICASE.EQ.4) THEN
+            CALL CHECK3(SFAC)
+         END IF
+*        -- Print
+         IF (PASS) WRITE (NOUT,99998)
+   20 CONTINUE
+      STOP
+*
+99999 FORMAT (' Real BLAS Test Program Results',/1X)
+99998 FORMAT ('                                    ----- PASS -----')
+      END
+      SUBROUTINE HEADER
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Arrays ..
+      CHARACTER*6      L(13)
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA             L(1)/' DDOT '/
+      DATA             L(2)/'DAXPY '/
+      DATA             L(3)/'DROTG '/
+      DATA             L(4)/' DROT '/
+      DATA             L(5)/'DCOPY '/
+      DATA             L(6)/'DSWAP '/
+      DATA             L(7)/'DNRM2 '/
+      DATA             L(8)/'DASUM '/
+      DATA             L(9)/'DSCAL '/
+      DATA             L(10)/'IDAMAX'/
+      DATA             L(11)/'DROTMG'/
+      DATA             L(12)/'DROTM '/
+      DATA             L(13)/'DSDOT '/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999) ICASE, L(ICASE)
+      RETURN
+*
+99999 FORMAT (/' Test of subprogram number',I3,12X,A6)
+      END
+      SUBROUTINE CHECK0(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION  SA, SB, SC, SS, D12
+      INTEGER           I, K
+*     .. Local Arrays ..
+      DOUBLE PRECISION  DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8),
+     $                  DS1(8), DAB(4,9), DTEMP(9), DTRUE(9,9)
+*     .. External Subroutines ..
+      EXTERNAL          DROTG, DROTMG, STEST1
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA              DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0,
+     +                  0.0D0, 1.0D0/
+      DATA              DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0,
+     +                  1.0D0, 0.0D0/
+      DATA              DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0,
+     +                  0.0D0, 1.0D0/
+      DATA              DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0,
+     +                  1.0D0, 0.0D0/
+      DATA              DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0,
+     +                  0.0D0, 1.0D0, 1.0D0/
+      DATA              DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0,
+     +                  0.0D0, 1.0D0, 0.0D0/
+*     INPUT FOR MODIFIED GIVENS
+      DATA DAB/ .1D0,.3D0,1.2D0,.2D0,
+     A          .7D0, .2D0, .6D0, 4.2D0,
+     B          0.D0,0.D0,0.D0,0.D0,
+     C          4.D0, -1.D0, 2.D0, 4.D0,
+     D          6.D-10, 2.D-2, 1.D5, 10.D0,
+     E          4.D10, 2.D-2, 1.D-5, 10.D0,
+     F          2.D-10, 4.D-2, 1.D5, 10.D0,
+     G          2.D10, 4.D-2, 1.D-5, 10.D0,
+     H          4.D0, -2.D0, 8.D0, 4.D0    /
+*    TRUE RESULTS FOR MODIFIED GIVENS
+      DATA DTRUE/0.D0,0.D0, 1.3D0, .2D0, 0.D0,0.D0,0.D0, .5D0, 0.D0,
+     A           0.D0,0.D0, 4.5D0, 4.2D0, 1.D0, .5D0, 0.D0,0.D0,0.D0,
+     B           0.D0,0.D0,0.D0,0.D0, -2.D0, 0.D0,0.D0,0.D0,0.D0,
+     C           0.D0,0.D0,0.D0, 4.D0, -1.D0, 0.D0,0.D0,0.D0,0.D0,
+     D           0.D0, 15.D-3, 0.D0, 10.D0, -1.D0, 0.D0, -1.D-4,
+     E           0.D0, 1.D0,
+     F           0.D0,0.D0, 6144.D-5, 10.D0, -1.D0, 4096.D0, -1.D6,
+     G           0.D0, 1.D0,
+     H           0.D0,0.D0,15.D0,10.D0,-1.D0, 5.D-5, 0.D0,1.D0,0.D0,
+     I           0.D0,0.D0, 15.D0, 10.D0, -1. D0, 5.D5, -4096.D0,
+     J           1.D0, 4096.D-6,
+     K           0.D0,0.D0, 7.D0, 4.D0, 0.D0,0.D0, -.5D0, -.25D0, 0.D0/
+*                   4096 = 2 ** 12
+      DATA D12  /4096.D0/
+      DTRUE(1,1) = 12.D0 / 130.D0
+      DTRUE(2,1) = 36.D0 / 130.D0
+      DTRUE(7,1) = -1.D0 / 6.D0
+      DTRUE(1,2) = 14.D0 / 75.D0
+      DTRUE(2,2) = 49.D0 / 75.D0
+      DTRUE(9,2) = 1.D0 / 7.D0
+      DTRUE(1,5) = 45.D-11 * (D12 * D12)
+      DTRUE(3,5) = 4.D5 / (3.D0 * D12)
+      DTRUE(6,5) = 1.D0 / D12
+      DTRUE(8,5) = 1.D4 / (3.D0 * D12)
+      DTRUE(1,6) = 4.D10 / (1.5D0 * D12 * D12)
+      DTRUE(2,6) = 2.D-2 / 1.5D0
+      DTRUE(8,6) = 5.D-7 * D12
+      DTRUE(1,7) = 4.D0 / 150.D0
+      DTRUE(2,7) = (2.D-10 / 1.5D0) * (D12 * D12)
+      DTRUE(7,7) = -DTRUE(6,5)
+      DTRUE(9,7) = 1.D4 / D12
+      DTRUE(1,8) = DTRUE(1,7)
+      DTRUE(2,8) = 2.D10 / (1.5D0 * D12 * D12)
+      DTRUE(1,9) = 32.D0 / 7.D0
+      DTRUE(2,9) = -16.D0 / 7.D0
+*     .. Executable Statements ..
+*
+*     Compute true values which cannot be prestored
+*     in decimal notation
+*
+      DBTRUE(1) = 1.0D0/0.6D0
+      DBTRUE(3) = -1.0D0/0.6D0
+      DBTRUE(5) = 1.0D0/0.6D0
+*
+      DO 20 K = 1, 8
+*        .. Set N=K for identification in output if any ..
+         N = K
+         IF (ICASE.EQ.3) THEN
+*           .. DROTG ..
+            IF (K.GT.8) GO TO 40
+            SA = DA1(K)
+            SB = DB1(K)
+            CALL DROTG(SA,SB,SC,SS)
+            CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC)
+            CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC)
+            CALL STEST1(SC,DC1(K),DC1(K),SFAC)
+            CALL STEST1(SS,DS1(K),DS1(K),SFAC)
+         ELSEIF (ICASE.EQ.11) THEN
+*           .. DROTMG ..
+            DO I=1,4
+               DTEMP(I)= DAB(I,K)
+               DTEMP(I+4) = 0.0
+            END DO
+            DTEMP(9) = 0.0
+            CALL DROTMG(DTEMP(1),DTEMP(2),DTEMP(3),DTEMP(4),DTEMP(5))
+            CALL STEST(9,DTEMP,DTRUE(1,K),DTRUE(1,K),SFAC)
+         ELSE
+            WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
+            STOP
+         END IF
+   20 CONTINUE
+   40 RETURN
+      END
+      SUBROUTINE CHECK1(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      INTEGER           I, LEN, NP1
+*     .. Local Arrays ..
+      DOUBLE PRECISION  DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2),
+     +                  SA(10), STEMP(1), STRUE(8), SX(8)
+      INTEGER           ITRUE2(5)
+*     .. External Functions ..
+      DOUBLE PRECISION  DASUM, DNRM2
+      INTEGER           IDAMAX
+      EXTERNAL          DASUM, DNRM2, IDAMAX
+*     .. External Subroutines ..
+      EXTERNAL          ITEST1, DSCAL, STEST, STEST1
+*     .. Intrinsic Functions ..
+      INTRINSIC         MAX
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA              SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0,
+     +                  0.3D0, 0.3D0, 0.3D0, 0.3D0/
+      DATA              DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0,
+     +                  2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0,
+     +                  3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0,
+     +                  4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0,
+     +                  -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0,
+     +                  5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0,
+     +                  6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0,
+     +                  8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0,
+     +                  9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0,
+     +                  -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0,
+     +                  0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0,
+     +                  2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0,
+     +                  -0.5D0, 7.0D0, -0.1D0, 3.0D0/
+      DATA              DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/
+      DATA              DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/
+      DATA              DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0,
+     +                  2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0,
+     +                  3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0,
+     +                  4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0,
+     +                  0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0,
+     +                  5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0,
+     +                  6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0,
+     +                  8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0,
+     +                  0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0,
+     +                  9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0,
+     +                  2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0,
+     +                  -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0,
+     +                  0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0,
+     +                  -0.03D0, 3.0D0/
+      DATA              ITRUE2/0, 1, 2, 2, 3/
+*     .. Executable Statements ..
+      DO 80 INCX = 1, 2
+         DO 60 NP1 = 1, 5
+            N = NP1 - 1
+            LEN = 2*MAX(N,1)
+*           .. Set vector arguments ..
+            DO 20 I = 1, LEN
+               SX(I) = DV(I,NP1,INCX)
+   20       CONTINUE
+*
+            IF (ICASE.EQ.7) THEN
+*              .. DNRM2 ..
+               STEMP(1) = DTRUE1(NP1)
+               CALL STEST1(DNRM2(N,SX,INCX),STEMP(1),STEMP,SFAC)
+            ELSE IF (ICASE.EQ.8) THEN
+*              .. DASUM ..
+               STEMP(1) = DTRUE3(NP1)
+               CALL STEST1(DASUM(N,SX,INCX),STEMP(1),STEMP,SFAC)
+            ELSE IF (ICASE.EQ.9) THEN
+*              .. DSCAL ..
+               CALL DSCAL(N,SA((INCX-1)*5+NP1),SX,INCX)
+               DO 40 I = 1, LEN
+                  STRUE(I) = DTRUE5(I,NP1,INCX)
+   40          CONTINUE
+               CALL STEST(LEN,SX,STRUE,STRUE,SFAC)
+            ELSE IF (ICASE.EQ.10) THEN
+*              .. IDAMAX ..
+               CALL ITEST1(IDAMAX(N,SX,INCX),ITRUE2(NP1))
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
+               STOP
+            END IF
+   60    CONTINUE
+   80 CONTINUE
+      RETURN
+      END
+      SUBROUTINE CHECK2(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION  SA
+      INTEGER           I, J, KI, KN, KNI, KPAR, KSIZE, LENX, LENY,
+     $                  MX, MY 
+*     .. Local Arrays ..
+      DOUBLE PRECISION  DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4),
+     $                  DT8(7,4,4), DX1(7),
+     $                  DY1(7), SSIZE1(4), SSIZE2(14,2), SSIZE(7),
+     $                  STX(7), STY(7), SX(7), SY(7),
+     $                  DPAR(5,4), DT19X(7,4,16),DT19XA(7,4,4),
+     $                  DT19XB(7,4,4), DT19XC(7,4,4),DT19XD(7,4,4),
+     $                  DT19Y(7,4,16), DT19YA(7,4,4),DT19YB(7,4,4),
+     $                  DT19YC(7,4,4), DT19YD(7,4,4), DTEMP(5)
+      INTEGER           INCXS(4), INCYS(4), LENS(4,2), NS(4)
+*     .. External Functions ..
+      DOUBLE PRECISION  DDOT, DSDOT
+      EXTERNAL          DDOT, DSDOT
+*     .. External Subroutines ..
+      EXTERNAL          DAXPY, DCOPY, DROTM, DSWAP, STEST, STEST1
+*     .. Intrinsic Functions ..
+      INTRINSIC         ABS, MIN
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      EQUIVALENCE (DT19X(1,1,1),DT19XA(1,1,1)),(DT19X(1,1,5),
+     A   DT19XB(1,1,1)),(DT19X(1,1,9),DT19XC(1,1,1)),
+     B   (DT19X(1,1,13),DT19XD(1,1,1))
+      EQUIVALENCE (DT19Y(1,1,1),DT19YA(1,1,1)),(DT19Y(1,1,5),
+     A   DT19YB(1,1,1)),(DT19Y(1,1,9),DT19YC(1,1,1)),
+     B   (DT19Y(1,1,13),DT19YD(1,1,1))
+
+      DATA              SA/0.3D0/
+      DATA              INCXS/1, 2, -2, -1/
+      DATA              INCYS/1, -2, 1, -2/
+      DATA              LENS/1, 1, 2, 4, 1, 1, 3, 7/
+      DATA              NS/0, 1, 2, 4/
+      DATA              DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0,
+     +                  -0.4D0/
+      DATA              DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0,
+     +                  0.8D0/
+      DATA              DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0,
+     +                  0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0,
+     +                  -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/
+      DATA              DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0,
+     +                  0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0,
+     +                  0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0,
+     +                  -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0,
+     +                  -0.75D0, 0.2D0, 1.04D0/
+      DATA              DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0,
+     +                  0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0,
+     +                  0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0,
+     +                  0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0,
+     +                  0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0,
+     +                  0.0D0/
+      DATA              DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0,
+     +                  0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0,
+     +                  0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0,
+     +                  -0.5D0, 0.2D0, 0.8D0/
+      DATA              SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/
+      DATA              SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0,
+     +                  1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0,
+     +                  1.17D0, 1.17D0, 1.17D0/
+*
+*                         FOR DROTM
+*
+      DATA DPAR/-2.D0,  0.D0,0.D0,0.D0,0.D0,
+     A          -1.D0,  2.D0, -3.D0, -4.D0,  5.D0,
+     B           0.D0,  0.D0,  2.D0, -3.D0,  0.D0,
+     C           1.D0,  5.D0,  2.D0,  0.D0, -4.D0/
+*                        TRUE X RESULTS F0R ROTATIONS DROTM
+      DATA DT19XA/.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E           -.8D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           -.9D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G           3.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .6D0,   .1D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     I           -.8D0,  3.8D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     J           -.9D0,  2.8D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     K           3.5D0,  -.4D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     L            .6D0,   .1D0,  -.5D0,   .8D0,          0.D0,0.D0,0.D0,
+     M           -.8D0,  3.8D0, -2.2D0, -1.2D0,          0.D0,0.D0,0.D0,
+     N           -.9D0,  2.8D0, -1.4D0, -1.3D0,          0.D0,0.D0,0.D0,
+     O           3.5D0,  -.4D0, -2.2D0,  4.7D0,          0.D0,0.D0,0.D0/
+*
+      DATA DT19XB/.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E           -.8D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           -.9D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G           3.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .6D0,   .1D0,  -.5D0,             0.D0,0.D0,0.D0,0.D0,
+     I           0.D0,    .1D0, -3.0D0,             0.D0,0.D0,0.D0,0.D0,
+     J           -.3D0,   .1D0, -2.0D0,             0.D0,0.D0,0.D0,0.D0,
+     K           3.3D0,   .1D0, -2.0D0,             0.D0,0.D0,0.D0,0.D0,
+     L            .6D0,   .1D0,  -.5D0,   .8D0,   .9D0,  -.3D0,  -.4D0,
+     M          -2.0D0,   .1D0,  1.4D0,   .8D0,   .6D0,  -.3D0, -2.8D0,
+     N          -1.8D0,   .1D0,  1.3D0,   .8D0,  0.D0,   -.3D0, -1.9D0,
+     O           3.8D0,   .1D0, -3.1D0,   .8D0,  4.8D0,  -.3D0, -1.5D0 /
+*
+      DATA DT19XC/.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E           -.8D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           -.9D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G           3.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .6D0,   .1D0,  -.5D0,             0.D0,0.D0,0.D0,0.D0,
+     I           4.8D0,   .1D0, -3.0D0,             0.D0,0.D0,0.D0,0.D0,
+     J           3.3D0,   .1D0, -2.0D0,             0.D0,0.D0,0.D0,0.D0,
+     K           2.1D0,   .1D0, -2.0D0,             0.D0,0.D0,0.D0,0.D0,
+     L            .6D0,   .1D0,  -.5D0,   .8D0,   .9D0,  -.3D0,  -.4D0,
+     M          -1.6D0,   .1D0, -2.2D0,   .8D0,  5.4D0,  -.3D0, -2.8D0,
+     N          -1.5D0,   .1D0, -1.4D0,   .8D0,  3.6D0,  -.3D0, -1.9D0,
+     O           3.7D0,   .1D0, -2.2D0,   .8D0,  3.6D0,  -.3D0, -1.5D0 /
+*
+      DATA DT19XD/.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E           -.8D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           -.9D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G           3.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .6D0,   .1D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     I           -.8D0, -1.0D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     J           -.9D0,  -.8D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     K           3.5D0,   .8D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     L            .6D0,   .1D0,  -.5D0,   .8D0,          0.D0,0.D0,0.D0,
+     M           -.8D0, -1.0D0,  1.4D0, -1.6D0,          0.D0,0.D0,0.D0,
+     N           -.9D0,  -.8D0,  1.3D0, -1.6D0,          0.D0,0.D0,0.D0,
+     O           3.5D0,   .8D0, -3.1D0,  4.8D0,          0.D0,0.D0,0.D0/
+*                        TRUE Y RESULTS FOR ROTATIONS DROTM
+      DATA DT19YA/.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E            .7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           1.7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G          -2.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .5D0,  -.9D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     I            .7D0, -4.8D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     J           1.7D0,  -.7D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     K          -2.6D0,  3.5D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     L            .5D0,  -.9D0,   .3D0,   .7D0,          0.D0,0.D0,0.D0,
+     M            .7D0, -4.8D0,  3.0D0,  1.1D0,          0.D0,0.D0,0.D0,
+     N           1.7D0,  -.7D0,  -.7D0,  2.3D0,          0.D0,0.D0,0.D0,
+     O          -2.6D0,  3.5D0,  -.7D0, -3.6D0,          0.D0,0.D0,0.D0/
+*
+      DATA DT19YB/.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E            .7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           1.7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G          -2.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .5D0,  -.9D0,   .3D0,             0.D0,0.D0,0.D0,0.D0,
+     I           4.0D0,  -.9D0,  -.3D0,             0.D0,0.D0,0.D0,0.D0,
+     J           -.5D0,  -.9D0,  1.5D0,             0.D0,0.D0,0.D0,0.D0,
+     K          -1.5D0,  -.9D0, -1.8D0,             0.D0,0.D0,0.D0,0.D0,
+     L            .5D0,  -.9D0,   .3D0,   .7D0,  -.6D0,   .2D0,   .8D0,
+     M           3.7D0,  -.9D0, -1.2D0,   .7D0, -1.5D0,   .2D0,  2.2D0,
+     N           -.3D0,  -.9D0,  2.1D0,   .7D0, -1.6D0,   .2D0,  2.0D0,
+     O          -1.6D0,  -.9D0, -2.1D0,   .7D0,  2.9D0,   .2D0, -3.8D0 /
+*
+      DATA DT19YC/.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E            .7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           1.7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G          -2.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .5D0,  -.9D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     I           4.0D0, -6.3D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     J           -.5D0,   .3D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     K          -1.5D0,  3.0D0,             0.D0,0.D0,0.D0,0.D0,0.D0,
+     L            .5D0,  -.9D0,   .3D0,   .7D0,          0.D0,0.D0,0.D0,
+     M           3.7D0, -7.2D0,  3.0D0,  1.7D0,          0.D0,0.D0,0.D0,
+     N           -.3D0,   .9D0,  -.7D0,  1.9D0,          0.D0,0.D0,0.D0,
+     O          -1.6D0,  2.7D0,  -.7D0, -3.4D0,          0.D0,0.D0,0.D0/
+*
+      DATA DT19YD/.5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     A            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     B            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     C            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     D            .5D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     E            .7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     F           1.7D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     G          -2.6D0,                  0.D0,0.D0,0.D0,0.D0,0.D0,0.D0,
+     H            .5D0,  -.9D0,   .3D0,             0.D0,0.D0,0.D0,0.D0,
+     I            .7D0,  -.9D0,  1.2D0,             0.D0,0.D0,0.D0,0.D0,
+     J           1.7D0,  -.9D0,   .5D0,             0.D0,0.D0,0.D0,0.D0,
+     K          -2.6D0,  -.9D0, -1.3D0,             0.D0,0.D0,0.D0,0.D0,
+     L            .5D0,  -.9D0,   .3D0,   .7D0,  -.6D0,   .2D0,   .8D0,
+     M            .7D0,  -.9D0,  1.2D0,   .7D0, -1.5D0,   .2D0,  1.6D0,
+     N           1.7D0,  -.9D0,   .5D0,   .7D0, -1.6D0,   .2D0,  2.4D0,
+     O          -2.6D0,  -.9D0, -1.3D0,   .7D0,  2.9D0,   .2D0, -4.0D0 /
+*    
+*     .. Executable Statements ..
+*
+      DO 120 KI = 1, 4
+         INCX = INCXS(KI)
+         INCY = INCYS(KI)
+         MX = ABS(INCX)
+         MY = ABS(INCY)
+*
+         DO 100 KN = 1, 4
+            N = NS(KN)
+            KSIZE = MIN(2,KN)
+            LENX = LENS(KN,MX)
+            LENY = LENS(KN,MY)
+*           .. Initialize all argument arrays ..
+            DO 20 I = 1, 7
+               SX(I) = DX1(I)
+               SY(I) = DY1(I)
+   20       CONTINUE
+*
+            IF (ICASE.EQ.1) THEN
+*              .. DDOT ..
+               CALL STEST1(DDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN)
+     +                     ,SFAC)
+            ELSE IF (ICASE.EQ.2) THEN
+*              .. DAXPY ..
+               CALL DAXPY(N,SA,SX,INCX,SY,INCY)
+               DO 40 J = 1, LENY
+                  STY(J) = DT8(J,KN,KI)
+   40          CONTINUE
+               CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC)
+            ELSE IF (ICASE.EQ.5) THEN
+*              .. DCOPY ..
+               DO 60 I = 1, 7
+                  STY(I) = DT10Y(I,KN,KI)
+   60          CONTINUE
+               CALL DCOPY(N,SX,INCX,SY,INCY)
+               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
+            ELSE IF (ICASE.EQ.6) THEN
+*              .. DSWAP ..
+               CALL DSWAP(N,SX,INCX,SY,INCY)
+               DO 80 I = 1, 7
+                  STX(I) = DT10X(I,KN,KI)
+                  STY(I) = DT10Y(I,KN,KI)
+   80          CONTINUE
+               CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0)
+               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
+            ELSE IF (ICASE.EQ.12) THEN
+*              .. DROTM ..
+               KNI=KN+4*(KI-1)
+               DO KPAR=1,4
+                  DO I=1,7
+                     SX(I) = DX1(I)
+                     SY(I) = DY1(I)
+                     STX(I)= DT19X(I,KPAR,KNI)
+                     STY(I)= DT19Y(I,KPAR,KNI)
+                  END DO
+*
+                  DO I=1,5
+                     DTEMP(I) = DPAR(I,KPAR)
+                  END DO
+*
+                  DO  I=1,LENX
+                     SSIZE(I)=STX(I)
+                  END DO
+*                   SEE REMARK ABOVE ABOUT DT11X(1,2,7)
+*                       AND DT11X(5,3,8).
+                  IF ((KPAR .EQ. 2) .AND. (KNI .EQ. 7))
+     $               SSIZE(1) = 2.4D0
+                  IF ((KPAR .EQ. 3) .AND. (KNI .EQ. 8))
+     $               SSIZE(5) = 1.8D0
+*
+                  CALL   DROTM(N,SX,INCX,SY,INCY,DTEMP)
+                  CALL   STEST(LENX,SX,STX,SSIZE,SFAC)
+                  CALL   STEST(LENY,SY,STY,STY,SFAC)
+               END DO
+            ELSE IF (ICASE.EQ.13) THEN
+*              .. DSDOT ..
+            CALL TESTDSDOT(REAL(DSDOT(N,REAL(SX),INCX,REAL(SY),INCY)),
+     $                 REAL(DT7(KN,KI)),REAL(SSIZE1(KN)), .3125E-1)
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
+               STOP
+            END IF
+  100    CONTINUE
+  120 CONTINUE
+      RETURN
+      END
+      SUBROUTINE CHECK3(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION  SC, SS
+      INTEGER           I, K, KI, KN, KSIZE, LENX, LENY, MX, MY
+*     .. Local Arrays ..
+      DOUBLE PRECISION  COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4),
+     +                  DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5),
+     +                  MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5),
+     +                  MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7),
+     +                  SY(7)
+      INTEGER           INCXS(4), INCYS(4), LENS(4,2), MWPINX(11),
+     +                  MWPINY(11), MWPN(11), NS(4)
+*     .. External Subroutines ..
+      EXTERNAL          DROT, STEST
+*     .. Intrinsic Functions ..
+      INTRINSIC         ABS, MIN
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA              INCXS/1, 2, -2, -1/
+      DATA              INCYS/1, -2, 1, -2/
+      DATA              LENS/1, 1, 2, 4, 1, 1, 3, 7/
+      DATA              NS/0, 1, 2, 4/
+      DATA              DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0,
+     +                  -0.4D0/
+      DATA              DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0,
+     +                  0.8D0/
+      DATA              SC, SS/0.8D0, 0.6D0/
+      DATA              DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0,
+     +                  1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0,
+     +                  -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0,
+     +                  -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0,
+     +                  0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0,
+     +                  0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0,
+     +                  0.0D0, 0.0D0, 0.0D0/
+      DATA              DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0,
+     +                  0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0,
+     +                  -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0,
+     +                  0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0,
+     +                  0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0,
+     +                  -0.18D0, 0.2D0, 0.16D0/
+      DATA              SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0,
+     +                  0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0,
+     +                  1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0,
+     +                  1.17D0, 1.17D0, 1.17D0/
+*     .. Executable Statements ..
+*
+      DO 60 KI = 1, 4
+         INCX = INCXS(KI)
+         INCY = INCYS(KI)
+         MX = ABS(INCX)
+         MY = ABS(INCY)
+*
+         DO 40 KN = 1, 4
+            N = NS(KN)
+            KSIZE = MIN(2,KN)
+            LENX = LENS(KN,MX)
+            LENY = LENS(KN,MY)
+*
+            IF (ICASE.EQ.4) THEN
+*              .. DROT ..
+               DO 20 I = 1, 7
+                  SX(I) = DX1(I)
+                  SY(I) = DY1(I)
+                  STX(I) = DT9X(I,KN,KI)
+                  STY(I) = DT9Y(I,KN,KI)
+   20          CONTINUE
+               CALL DROT(N,SX,INCX,SY,INCY,SC,SS)
+               CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC)
+               CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC)
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
+               STOP
+            END IF
+   40    CONTINUE
+   60 CONTINUE
+*
+      MWPC(1) = 1
+      DO 80 I = 2, 11
+         MWPC(I) = 0
+   80 CONTINUE
+      MWPS(1) = 0
+      DO 100 I = 2, 6
+         MWPS(I) = 1
+  100 CONTINUE
+      DO 120 I = 7, 11
+         MWPS(I) = -1
+  120 CONTINUE
+      MWPINX(1) = 1
+      MWPINX(2) = 1
+      MWPINX(3) = 1
+      MWPINX(4) = -1
+      MWPINX(5) = 1
+      MWPINX(6) = -1
+      MWPINX(7) = 1
+      MWPINX(8) = 1
+      MWPINX(9) = -1
+      MWPINX(10) = 1
+      MWPINX(11) = -1
+      MWPINY(1) = 1
+      MWPINY(2) = 1
+      MWPINY(3) = -1
+      MWPINY(4) = -1
+      MWPINY(5) = 2
+      MWPINY(6) = 1
+      MWPINY(7) = 1
+      MWPINY(8) = -1
+      MWPINY(9) = -1
+      MWPINY(10) = 2
+      MWPINY(11) = 1
+      DO 140 I = 1, 11
+         MWPN(I) = 5
+  140 CONTINUE
+      MWPN(5) = 3
+      MWPN(10) = 3
+      DO 160 I = 1, 5
+         MWPX(I) = I
+         MWPY(I) = I
+         MWPTX(1,I) = I
+         MWPTY(1,I) = I
+         MWPTX(2,I) = I
+         MWPTY(2,I) = -I
+         MWPTX(3,I) = 6 - I
+         MWPTY(3,I) = I - 6
+         MWPTX(4,I) = I
+         MWPTY(4,I) = -I
+         MWPTX(6,I) = 6 - I
+         MWPTY(6,I) = I - 6
+         MWPTX(7,I) = -I
+         MWPTY(7,I) = I
+         MWPTX(8,I) = I - 6
+         MWPTY(8,I) = 6 - I
+         MWPTX(9,I) = -I
+         MWPTY(9,I) = I
+         MWPTX(11,I) = I - 6
+         MWPTY(11,I) = 6 - I
+  160 CONTINUE
+      MWPTX(5,1) = 1
+      MWPTX(5,2) = 3
+      MWPTX(5,3) = 5
+      MWPTX(5,4) = 4
+      MWPTX(5,5) = 5
+      MWPTY(5,1) = -1
+      MWPTY(5,2) = 2
+      MWPTY(5,3) = -2
+      MWPTY(5,4) = 4
+      MWPTY(5,5) = -3
+      MWPTX(10,1) = -1
+      MWPTX(10,2) = -3
+      MWPTX(10,3) = -5
+      MWPTX(10,4) = 4
+      MWPTX(10,5) = 5
+      MWPTY(10,1) = 1
+      MWPTY(10,2) = 2
+      MWPTY(10,3) = 2
+      MWPTY(10,4) = 4
+      MWPTY(10,5) = 3
+      DO 200 I = 1, 11
+         INCX = MWPINX(I)
+         INCY = MWPINY(I)
+         DO 180 K = 1, 5
+            COPYX(K) = MWPX(K)
+            COPYY(K) = MWPY(K)
+            MWPSTX(K) = MWPTX(I,K)
+            MWPSTY(K) = MWPTY(I,K)
+  180    CONTINUE
+         CALL DROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I))
+         CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC)
+         CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC)
+  200 CONTINUE
+      RETURN
+      END
+      SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC)
+*     ********************************* STEST **************************
+*
+*     THIS SUBR COMPARES ARRAYS  SCOMP() AND STRUE() OF LENGTH LEN TO
+*     SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE
+*     NEGLIGIBLE.
+*
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      DOUBLE PRECISION ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0D0)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION SFAC
+      INTEGER          LEN
+*     .. Array Arguments ..
+      DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION SD
+      INTEGER          I
+*     .. External Functions ..
+      DOUBLE PRECISION SDIFF
+      EXTERNAL         SDIFF
+*     .. Intrinsic Functions ..
+      INTRINSIC        ABS
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Executable Statements ..
+*
+      DO 40 I = 1, LEN
+         SD = SCOMP(I) - STRUE(I)
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO))
+     +       GO TO 40
+*
+*                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
+*
+         IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+         PASS = .FALSE.
+         WRITE (NOUT,99999)
+         WRITE (NOUT,99998)
+   20    WRITE (NOUT,99997) ICASE, N, INCX, INCY, I, SCOMP(I),
+     +     STRUE(I), SD, SSIZE(I)
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY  I                            ',
+     +       ' COMP(I)                             TRUE(I)  DIFFERENCE',
+     +       '     SIZE(I)',/1X)
+99997 FORMAT (1X,I4,I3,2I5,I3,2D36.8,2D12.4)
+      END
+      SUBROUTINE TESTDSDOT(SCOMP,STRUE,SSIZE,SFAC)
+*     ********************************* STEST **************************
+*
+*     THIS SUBR COMPARES ARRAYS  SCOMP() AND STRUE() OF LENGTH LEN TO
+*     SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE
+*     NEGLIGIBLE.
+*
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      REAL             ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0E0)
+*     .. Scalar Arguments ..
+      REAL             SFAC, SCOMP, SSIZE, STRUE
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      REAL             SD
+*     .. Intrinsic Functions ..
+      INTRINSIC        ABS
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Executable Statements ..
+*
+         SD = SCOMP - STRUE
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE) * EPSILON(ZERO))
+     +       GO TO 40
+*
+*                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
+*
+         IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+         PASS = .FALSE.
+         WRITE (NOUT,99999)
+         WRITE (NOUT,99998)
+   20    WRITE (NOUT,99997) ICASE, N, INCX, INCY, SCOMP,
+     +     STRUE, SD, SSIZE
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY                           ',
+     +       ' COMP(I)                             TRUE(I)  DIFFERENCE',
+     +       '     SIZE(I)',/1X)
+99997 FORMAT (1X,I4,I3,1I5,I3,2E36.8,2E12.4)
+      END
+      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
+*     ************************* STEST1 *****************************
+*
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
+*     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
+*     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
+*
+*     C.L. LAWSON, JPL, 1978 DEC 6
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SCOMP1, SFAC, STRUE1
+*     .. Array Arguments ..
+      DOUBLE PRECISION  SSIZE(*)
+*     .. Local Arrays ..
+      DOUBLE PRECISION  SCOMP(1), STRUE(1)
+*     .. External Subroutines ..
+      EXTERNAL          STEST
+*     .. Executable Statements ..
+*
+      SCOMP(1) = SCOMP1
+      STRUE(1) = STRUE1
+      CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC)
+*
+      RETURN
+      END
+      DOUBLE PRECISION FUNCTION SDIFF(SA,SB)
+*     ********************************* SDIFF **************************
+*     COMPUTES DIFFERENCE OF TWO NUMBERS.  C. L. LAWSON, JPL 1974 FEB 15
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION                SA, SB
+*     .. Executable Statements ..
+      SDIFF = SA - SB
+      RETURN
+      END
+      SUBROUTINE ITEST1(ICOMP,ITRUE)
+*     ********************************* ITEST1 *************************
+*
+*     THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR
+*     EQUALITY.
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      INTEGER           ICOMP, ITRUE
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      INTEGER           ID
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Executable Statements ..
+*
+      IF (ICOMP.EQ.ITRUE) GO TO 40
+*
+*                            HERE ICOMP IS NOT EQUAL TO ITRUE.
+*
+      IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+      PASS = .FALSE.
+      WRITE (NOUT,99999)
+      WRITE (NOUT,99998)
+   20 ID = ICOMP - ITRUE
+      WRITE (NOUT,99997) ICASE, N, INCX, INCY, ICOMP, ITRUE, ID
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY                               ',
+     +       ' COMP                                TRUE     DIFFERENCE',
+     +       /1X)
+99997 FORMAT (1X,I4,I3,2I5,2I36,I12)
+      END

diff --git a/blas/testing/dblat2.dat b/blas/testing/dblat2.dat
new file mode 100644
index 0000000..3755b83
--- /dev/null
+++ b/blas/testing/dblat2.dat

@@ -0,0 +1,34 @@
+'dblat2.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'dblat2.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+4                 NUMBER OF VALUES OF K
+0 1 2 4           VALUES OF K
+4                 NUMBER OF VALUES OF INCX AND INCY
+1 2 -1 -2         VALUES OF INCX AND INCY
+3                 NUMBER OF VALUES OF ALPHA
+0.0 1.0 0.7       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+0.0 1.0 0.9       VALUES OF BETA
+DGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+DTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+DTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+DTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+DGER   T PUT F FOR NO TEST. SAME COLUMNS.
+DSYR   T PUT F FOR NO TEST. SAME COLUMNS.
+DSPR   T PUT F FOR NO TEST. SAME COLUMNS.
+DSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
+DSPR2  T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/dblat2.f b/blas/testing/dblat2.f
new file mode 100644
index 0000000..0fa80af
--- /dev/null
+++ b/blas/testing/dblat2.f

@@ -0,0 +1,3176 @@
+*> \brief \b DBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM DBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the DOUBLE PRECISION Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 16 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 34 lines:
+*> 'dblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'DBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 0.9       VALUES OF BETAC
+*> DGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DGER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup double_blas_testing
+*
+*  =====================================================================
+      PROGRAM DBLAT2
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 16 )
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      INTEGER            NMAX, INCMAX
+      PARAMETER          ( NMAX = 65, INCMAX = 2 )
+      INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
+      PARAMETER          ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
+     $                   NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANS
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ),
+     $                   G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( 2*NMAX )
+      INTEGER            IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      DOUBLE PRECISION   DDIFF
+      LOGICAL            LDE
+      EXTERNAL           DDIFF, LDE
+*     .. External Subroutines ..
+      EXTERNAL           DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6,
+     $                   DCHKE, DMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'DGEMV ', 'DGBMV ', 'DSYMV ', 'DSBMV ',
+     $                   'DSPMV ', 'DTRMV ', 'DTBMV ', 'DTPMV ',
+     $                   'DTRSV ', 'DTBSV ', 'DTPSV ', 'DGER  ',
+     $                   'DSYR  ', 'DSPR  ', 'DSYR2 ', 'DSPR2 '/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 230
+         END IF
+   10 CONTINUE
+*     Values of K
+      READ( NIN, FMT = * )NKB
+      IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'K', NKBMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( KB( I ), I = 1, NKB )
+      DO 20 I = 1, NKB
+         IF( KB( I ).LT.0 )THEN
+            WRITE( NOUT, FMT = 9995 )
+            GO TO 230
+         END IF
+   20 CONTINUE
+*     Values of INCX and INCY
+      READ( NIN, FMT = * )NINC
+      IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( INC( I ), I = 1, NINC )
+      DO 30 I = 1, NINC
+         IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN
+            WRITE( NOUT, FMT = 9994 )INCMAX
+            GO TO 230
+         END IF
+   30 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9993 )
+      WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB )
+      WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC )
+      WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9980 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 40 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   40 CONTINUE
+   50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT
+      DO 60 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 70
+   60 CONTINUE
+      WRITE( NOUT, FMT = 9986 )SNAMET
+      STOP
+   70 LTEST( I ) = LTESTT
+      GO TO 50
+*
+   80 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(ZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of DMVCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 120 J = 1, N
+         DO 110 I = 1, N
+            A( I, J ) = MAX( I - J + 1, 0 )
+  110    CONTINUE
+         X( J ) = J
+         Y( J ) = ZERO
+  120 CONTINUE
+      DO 130 J = 1, N
+         YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+*     YY holds the exact result. On exit from DMVCH YT holds
+*     the result computed by DMVCH.
+      TRANS = 'N'
+      CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LDE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+      TRANS = 'T'
+      CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LDE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 210 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 140, 150, 150, 150, 160, 160,
+     $              160, 160, 160, 160, 170, 180, 180,
+     $              190, 190 )ISNUM
+*           Test DGEMV, 01, and DGBMV, 02.
+  140       CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test DSYMV, 03, DSBMV, 04, and DSPMV, 05.
+  150       CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test DTRMV, 06, DTBMV, 07, DTPMV, 08,
+*           DTRSV, 09, DTBSV, 10, and DTPSV, 11.
+  160       CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z )
+            GO TO 200
+*           Test DGER, 12.
+  170       CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test DSYR, 13, and DSPR, 14.
+  180       CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test DSYR2, 15, and DSPR2, 16.
+  190       CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+*
+  200       IF( FATAL.AND.SFATAL )
+     $         GO TO 220
+         END IF
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9982 )
+      GO TO 240
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9981 )
+      GO TO 240
+*
+  230 CONTINUE
+      WRITE( NOUT, FMT = 9987 )
+*
+  240 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' )
+ 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ',
+     $      I2 )
+ 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9992 FORMAT( '   FOR N              ', 9I6 )
+ 9991 FORMAT( '   FOR K              ', 7I6 )
+ 9990 FORMAT( '   FOR INCX AND INCY  ', 7I6 )
+ 9989 FORMAT( '   FOR ALPHA          ', 7F6.1 )
+ 9988 FORMAT( '   FOR BETA           ', 7F6.1 )
+ 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9985 FORMAT( ' ERROR IN DMVCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1,
+     $      ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', /
+     $   ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.'
+     $      , /' ******* TESTS ABANDONED *******' )
+ 9984 FORMAT( A6, L2 )
+ 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9982 FORMAT( /' END OF TESTS' )
+ 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of DBLAT2.
+*
+      END
+      SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests DGEMV and DGBMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, HALF
+      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), G( NMAX ),
+     $                   X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA,
+     $                   LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK,
+     $                   NL, NS
+      LOGICAL            BANDED, FULL, NULL, RESET, SAME, TRAN
+      CHARACTER*1        TRANS, TRANSS
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DGBMV, DGEMV, DMAKE, DMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 11
+      ELSE IF( BANDED )THEN
+         NARGS = 13
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+            IF( BANDED )THEN
+               NK = NKB
+            ELSE
+               NK = 1
+            END IF
+            DO 100 IKU = 1, NK
+               IF( BANDED )THEN
+                  KU = KB( IKU )
+                  KL = MAX( KU - 1, 0 )
+               ELSE
+                  KU = N - 1
+                  KL = M - 1
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               IF( BANDED )THEN
+                  LDA = KL + KU + 1
+               ELSE
+                  LDA = M
+               END IF
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 100
+               LAA = LDA*N
+               NULL = N.LE.0.OR.M.LE.0
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA,
+     $                     LDA, KL, KU, RESET, TRANSL )
+*
+               DO 90 IC = 1, 3
+                  TRANS = ICH( IC: IC )
+                  TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+*
+                  IF( TRAN )THEN
+                     ML = N
+                     NL = M
+                  ELSE
+                     ML = M
+                     NL = N
+                  END IF
+*
+                  DO 80 IX = 1, NINC
+                     INCX = INC( IX )
+                     LX = ABS( INCX )*NL
+*
+*                    Generate the vector X.
+*
+                     TRANSL = HALF
+                     CALL DMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX,
+     $                           ABS( INCX ), 0, NL - 1, RESET, TRANSL )
+                     IF( NL.GT.1 )THEN
+                        X( NL/2 ) = ZERO
+                        XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO
+                     END IF
+*
+                     DO 70 IY = 1, NINC
+                        INCY = INC( IY )
+                        LY = ABS( INCY )*ML
+*
+                        DO 60 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+                           DO 50 IB = 1, NBET
+                              BETA = BET( IB )
+*
+*                             Generate the vector Y.
+*
+                              TRANSL = ZERO
+                              CALL DMAKE( 'GE', ' ', ' ', 1, ML, Y, 1,
+     $                                    YY, ABS( INCY ), 0, ML - 1,
+     $                                    RESET, TRANSL )
+*
+                              NC = NC + 1
+*
+*                             Save every datum before calling the
+*                             subroutine.
+*
+                              TRANSS = TRANS
+                              MS = M
+                              NS = N
+                              KLS = KL
+                              KUS = KU
+                              ALS = ALPHA
+                              DO 10 I = 1, LAA
+                                 AS( I ) = AA( I )
+   10                         CONTINUE
+                              LDAS = LDA
+                              DO 20 I = 1, LX
+                                 XS( I ) = XX( I )
+   20                         CONTINUE
+                              INCXS = INCX
+                              BLS = BETA
+                              DO 30 I = 1, LY
+                                 YS( I ) = YY( I )
+   30                         CONTINUE
+                              INCYS = INCY
+*
+*                             Call the subroutine.
+*
+                              IF( FULL )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                              TRANS, M, N, ALPHA, LDA, INCX, BETA,
+     $                              INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL DGEMV( TRANS, M, N, ALPHA, AA,
+     $                                       LDA, XX, INCX, BETA, YY,
+     $                                       INCY )
+                              ELSE IF( BANDED )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                              TRANS, M, N, KL, KU, ALPHA, LDA,
+     $                              INCX, BETA, INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL DGBMV( TRANS, M, N, KL, KU, ALPHA,
+     $                                       AA, LDA, XX, INCX, BETA,
+     $                                       YY, INCY )
+                              END IF
+*
+*                             Check if error-exit was taken incorrectly.
+*
+                              IF( .NOT.OK )THEN
+                                 WRITE( NOUT, FMT = 9993 )
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+*                             See what data changed inside subroutines.
+*
+                              ISAME( 1 ) = TRANS.EQ.TRANSS
+                              ISAME( 2 ) = MS.EQ.M
+                              ISAME( 3 ) = NS.EQ.N
+                              IF( FULL )THEN
+                                 ISAME( 4 ) = ALS.EQ.ALPHA
+                                 ISAME( 5 ) = LDE( AS, AA, LAA )
+                                 ISAME( 6 ) = LDAS.EQ.LDA
+                                 ISAME( 7 ) = LDE( XS, XX, LX )
+                                 ISAME( 8 ) = INCXS.EQ.INCX
+                                 ISAME( 9 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 10 ) = LDE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 10 ) = LDERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 11 ) = INCYS.EQ.INCY
+                              ELSE IF( BANDED )THEN
+                                 ISAME( 4 ) = KLS.EQ.KL
+                                 ISAME( 5 ) = KUS.EQ.KU
+                                 ISAME( 6 ) = ALS.EQ.ALPHA
+                                 ISAME( 7 ) = LDE( AS, AA, LAA )
+                                 ISAME( 8 ) = LDAS.EQ.LDA
+                                 ISAME( 9 ) = LDE( XS, XX, LX )
+                                 ISAME( 10 ) = INCXS.EQ.INCX
+                                 ISAME( 11 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 12 ) = LDE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 12 ) = LDERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 13 ) = INCYS.EQ.INCY
+                              END IF
+*
+*                             If data was incorrectly changed, report
+*                             and return.
+*
+                              SAME = .TRUE.
+                              DO 40 I = 1, NARGS
+                                 SAME = SAME.AND.ISAME( I )
+                                 IF( .NOT.ISAME( I ) )
+     $                              WRITE( NOUT, FMT = 9998 )I
+   40                         CONTINUE
+                              IF( .NOT.SAME )THEN
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+                              IF( .NOT.NULL )THEN
+*
+*                                Check the result.
+*
+                                 CALL DMVCH( TRANS, M, N, ALPHA, A,
+     $                                       NMAX, X, INCX, BETA, Y,
+     $                                       INCY, YT, G, YY, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                                 ERRMAX = MAX( ERRMAX, ERR )
+*                                If got really bad answer, report and
+*                                return.
+                                 IF( FATAL )
+     $                              GO TO 130
+                              ELSE
+*                                Avoid repeating tests with M.le.0 or
+*                                N.le.0.
+                                 GO TO 110
+                              END IF
+*
+   50                      CONTINUE
+*
+   60                   CONTINUE
+*
+   70                CONTINUE
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 140
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU,
+     $      ALPHA, LDA, INCX, BETA, INCY
+      END IF
+*
+  140 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1,
+     $      ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1,
+     $      ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2,
+     $      ')         .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK1.
+*
+      END
+      SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests DSYMV, DSBMV and DSPMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, HALF
+      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), G( NMAX ),
+     $                   X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IB, IC, IK, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY,
+     $                   N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMVCH, DSBMV, DSPMV, DSYMV
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'Y'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 10
+      ELSE IF( BANDED )THEN
+         NARGS = 11
+      ELSE IF( PACKED )THEN
+         NARGS = 9
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 IC = 1, 2
+               UPLO = ICH( IC: IC )
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA,
+     $                     LDA, K, K, RESET, TRANSL )
+*
+               DO 80 IX = 1, NINC
+                  INCX = INC( IX )
+                  LX = ABS( INCX )*N
+*
+*                 Generate the vector X.
+*
+                  TRANSL = HALF
+                  CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                        ABS( INCX ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     X( N/2 ) = ZERO
+                     XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 70 IY = 1, NINC
+                     INCY = INC( IY )
+                     LY = ABS( INCY )*N
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the vector Y.
+*
+                           TRANSL = ZERO
+                           CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                                 ABS( INCY ), 0, N - 1, RESET,
+     $                                 TRANSL )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           UPLOS = UPLO
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LX
+                              XS( I ) = XX( I )
+   20                      CONTINUE
+                           INCXS = INCX
+                           BLS = BETA
+                           DO 30 I = 1, LY
+                              YS( I ) = YY( I )
+   30                      CONTINUE
+                           INCYS = INCY
+*
+*                          Call the subroutine.
+*
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, N, ALPHA, LDA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DSYMV( UPLO, N, ALPHA, AA, LDA, XX,
+     $                                    INCX, BETA, YY, INCY )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, N, K, ALPHA, LDA, INCX, BETA,
+     $                           INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DSBMV( UPLO, N, K, ALPHA, AA, LDA,
+     $                                    XX, INCX, BETA, YY, INCY )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, N, ALPHA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DSPMV( UPLO, N, ALPHA, AA, XX, INCX,
+     $                                    BETA, YY, INCY )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9992 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = UPLO.EQ.UPLOS
+                           ISAME( 2 ) = NS.EQ.N
+                           IF( FULL )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LDE( AS, AA, LAA )
+                              ISAME( 5 ) = LDAS.EQ.LDA
+                              ISAME( 6 ) = LDE( XS, XX, LX )
+                              ISAME( 7 ) = INCXS.EQ.INCX
+                              ISAME( 8 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 9 ) = LDE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 9 ) = LDERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 10 ) = INCYS.EQ.INCY
+                           ELSE IF( BANDED )THEN
+                              ISAME( 3 ) = KS.EQ.K
+                              ISAME( 4 ) = ALS.EQ.ALPHA
+                              ISAME( 5 ) = LDE( AS, AA, LAA )
+                              ISAME( 6 ) = LDAS.EQ.LDA
+                              ISAME( 7 ) = LDE( XS, XX, LX )
+                              ISAME( 8 ) = INCXS.EQ.INCX
+                              ISAME( 9 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 10 ) = LDE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 10 ) = LDERES( 'GE', ' ', 1, N,
+     $                                         YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 11 ) = INCYS.EQ.INCY
+                           ELSE IF( PACKED )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LDE( AS, AA, LAA )
+                              ISAME( 5 ) = LDE( XS, XX, LX )
+                              ISAME( 6 ) = INCXS.EQ.INCX
+                              ISAME( 7 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 8 ) = LDE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 8 ) = LDERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 9 ) = INCYS.EQ.INCY
+                           END IF
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X,
+     $                                    INCX, BETA, Y, INCY, YT, G,
+     $                                    YY, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           ELSE
+*                             Avoid repeating tests with N.le.0
+                              GO TO 110
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX,
+     $      BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      BETA, INCY
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP',
+     $      ', X,', I2, ',', F4.1, ', Y,', I2, ')                .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1,
+     $      ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2,
+     $      ')         .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,',
+     $      I3, ', X,', I2, ',', F4.1, ', Y,', I2, ')             .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK2.
+*
+      END
+      SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, XT, G, Z )
+*
+*  Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XT( NMAX ),
+     $                   XX( NMAX*INCMAX ), Z( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ERR, ERRMAX, TRANSL
+      INTEGER            I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K,
+     $                   KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHD, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMVCH, DTBMV, DTBSV, DTPMV, DTPSV,
+     $                   DTRMV, DTRSV
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'R'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 8
+      ELSE IF( BANDED )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 7
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*     Set up zero vector for DMVCH.
+      DO 10 I = 1, NMAX
+         Z( I ) = ZERO
+   10 CONTINUE
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 ICU = 1, 2
+               UPLO = ICHU( ICU: ICU )
+*
+               DO 80 ICT = 1, 3
+                  TRANS = ICHT( ICT: ICT )
+*
+                  DO 70 ICD = 1, 2
+                     DIAG = ICHD( ICD: ICD )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL DMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A,
+     $                           NMAX, AA, LDA, K, K, RESET, TRANSL )
+*
+                     DO 60 IX = 1, NINC
+                        INCX = INC( IX )
+                        LX = ABS( INCX )*N
+*
+*                       Generate the vector X.
+*
+                        TRANSL = HALF
+                        CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                              ABS( INCX ), 0, N - 1, RESET,
+     $                              TRANSL )
+                        IF( N.GT.1 )THEN
+                           X( N/2 ) = ZERO
+                           XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                        END IF
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        DIAGS = DIAG
+                        NS = N
+                        KS = K
+                        DO 20 I = 1, LAA
+                           AS( I ) = AA( I )
+   20                   CONTINUE
+                        LDAS = LDA
+                        DO 30 I = 1, LX
+                           XS( I ) = XX( I )
+   30                   CONTINUE
+                        INCXS = INCX
+*
+*                       Call the subroutine.
+*
+                        IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTRMV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTBMV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTPMV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTRSV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTBSV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTPSV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLO.EQ.UPLOS
+                        ISAME( 2 ) = TRANS.EQ.TRANSS
+                        ISAME( 3 ) = DIAG.EQ.DIAGS
+                        ISAME( 4 ) = NS.EQ.N
+                        IF( FULL )THEN
+                           ISAME( 5 ) = LDE( AS, AA, LAA )
+                           ISAME( 6 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 7 ) = LDE( XS, XX, LX )
+                           ELSE
+                              ISAME( 7 ) = LDERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 8 ) = INCXS.EQ.INCX
+                        ELSE IF( BANDED )THEN
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = LDE( AS, AA, LAA )
+                           ISAME( 7 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 8 ) = LDE( XS, XX, LX )
+                           ELSE
+                              ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 9 ) = INCXS.EQ.INCX
+                        ELSE IF( PACKED )THEN
+                           ISAME( 5 ) = LDE( AS, AA, LAA )
+                           IF( NULL )THEN
+                              ISAME( 6 ) = LDE( XS, XX, LX )
+                           ELSE
+                              ISAME( 6 ) = LDERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 7 ) = INCXS.EQ.INCX
+                        END IF
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+                           IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+*
+*                             Check the result.
+*
+                              CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X,
+     $                                    INCX, ZERO, Z, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+*
+*                             Compute approximation to original vector.
+*
+                              DO 50 I = 1, N
+                                 Z( I ) = XX( 1 + ( I - 1 )*
+     $                                    ABS( INCX ) )
+                                 XX( 1 + ( I - 1 )*ABS( INCX ) )
+     $                              = X( I )
+   50                         CONTINUE
+                              CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z,
+     $                                    INCX, ZERO, X, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .FALSE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 120
+                        ELSE
+*                          Avoid repeating tests with N.le.0.
+                           GO TO 110
+                        END IF
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA,
+     $      INCX
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K,
+     $      LDA, INCX
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ',
+     $      'X,', I2, ')                        .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ),
+     $      ' A,', I3, ', X,', I2, ')                 .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,',
+     $      I3, ', X,', I2, ')                     .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK3.
+*
+      END
+      SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests DGER.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS,
+     $                   NC, ND, NS
+      LOGICAL            NULL, RESET, SAME
+*     .. Local Arrays ..
+      DOUBLE PRECISION   W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DGER, DMAKE, DMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     Define the number of arguments.
+      NARGS = 9
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+*           Set LDA to 1 more than minimum value if room.
+            LDA = M
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 110
+            LAA = LDA*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 100 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*M
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL DMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ),
+     $                     0, M - 1, RESET, TRANSL )
+               IF( M.GT.1 )THEN
+                  X( M/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 90 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 80 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX,
+     $                           AA, LDA, M - 1, N - 1, RESET, TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     MS = M
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N,
+     $                  ALPHA, INCX, INCY, LDA
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL DGER( M, N, ALPHA, XX, INCX, YY, INCY, AA,
+     $                          LDA )
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9993 )
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+*                    See what data changed inside subroutine.
+*
+                     ISAME( 1 ) = MS.EQ.M
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LDE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LDE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LDE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LDERES( 'GE', ' ', M, N, AS, AA,
+     $                               LDA )
+                     END IF
+                     ISAME( 9 ) = LDAS.EQ.LDA
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, M
+                              Z( I ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, M
+                              Z( I ) = X( M - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        DO 70 J = 1, N
+                           IF( INCY.GT.0 )THEN
+                              W( 1 ) = Y( J )
+                           ELSE
+                              W( 1 ) = Y( N - J + 1 )
+                           END IF
+                           CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1,
+     $                                 ONE, A( 1, J ), 1, YT, G,
+     $                                 AA( 1 + ( J - 1 )*LDA ), EPS,
+     $                                 ERR, FATAL, NOUT, .TRUE. )
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 130
+   70                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with M.le.0 or N.le.0.
+                        GO TO 110
+                     END IF
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 150
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  140 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA
+*
+  150 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2,
+     $      ', Y,', I2, ', A,', I3, ')                  .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK4.
+*
+      END
+      SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests DSYR and DSPR.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA,
+     $                   LDA, LDAS, LJ, LX, N, NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      DOUBLE PRECISION   W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMVCH, DSPR, DSYR
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'Y'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 7
+      ELSE IF( PACKED )THEN
+         NARGS = 6
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 100
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 90 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 80 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 70 IA = 1, NALF
+                  ALPHA = ALF( IA )
+                  NULL = N.LE.0.OR.ALPHA.EQ.ZERO
+*
+*                 Generate the matrix A.
+*
+                  TRANSL = ZERO
+                  CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX,
+     $                        AA, LDA, N - 1, N - 1, RESET, TRANSL )
+*
+                  NC = NC + 1
+*
+*                 Save every datum before calling the subroutine.
+*
+                  UPLOS = UPLO
+                  NS = N
+                  ALS = ALPHA
+                  DO 10 I = 1, LAA
+                     AS( I ) = AA( I )
+   10             CONTINUE
+                  LDAS = LDA
+                  DO 20 I = 1, LX
+                     XS( I ) = XX( I )
+   20             CONTINUE
+                  INCXS = INCX
+*
+*                 Call the subroutine.
+*
+                  IF( FULL )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                  ALPHA, INCX, LDA
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL DSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA )
+                  ELSE IF( PACKED )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                  ALPHA, INCX
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL DSPR( UPLO, N, ALPHA, XX, INCX, AA )
+                  END IF
+*
+*                 Check if error-exit was taken incorrectly.
+*
+                  IF( .NOT.OK )THEN
+                     WRITE( NOUT, FMT = 9992 )
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+*                 See what data changed inside subroutines.
+*
+                  ISAME( 1 ) = UPLO.EQ.UPLOS
+                  ISAME( 2 ) = NS.EQ.N
+                  ISAME( 3 ) = ALS.EQ.ALPHA
+                  ISAME( 4 ) = LDE( XS, XX, LX )
+                  ISAME( 5 ) = INCXS.EQ.INCX
+                  IF( NULL )THEN
+                     ISAME( 6 ) = LDE( AS, AA, LAA )
+                  ELSE
+                     ISAME( 6 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, AS,
+     $                            AA, LDA )
+                  END IF
+                  IF( .NOT.PACKED )THEN
+                     ISAME( 7 ) = LDAS.EQ.LDA
+                  END IF
+*
+*                 If data was incorrectly changed, report and return.
+*
+                  SAME = .TRUE.
+                  DO 30 I = 1, NARGS
+                     SAME = SAME.AND.ISAME( I )
+                     IF( .NOT.ISAME( I ) )
+     $                  WRITE( NOUT, FMT = 9998 )I
+   30             CONTINUE
+                  IF( .NOT.SAME )THEN
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+                  IF( .NOT.NULL )THEN
+*
+*                    Check the result column by column.
+*
+                     IF( INCX.GT.0 )THEN
+                        DO 40 I = 1, N
+                           Z( I ) = X( I )
+   40                   CONTINUE
+                     ELSE
+                        DO 50 I = 1, N
+                           Z( I ) = X( N - I + 1 )
+   50                   CONTINUE
+                     END IF
+                     JA = 1
+                     DO 60 J = 1, N
+                        W( 1 ) = Z( J )
+                        IF( UPPER )THEN
+                           JJ = 1
+                           LJ = J
+                        ELSE
+                           JJ = J
+                           LJ = N - J + 1
+                        END IF
+                        CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W,
+     $                              1, ONE, A( JJ, J ), 1, YT, G,
+     $                              AA( JA ), EPS, ERR, FATAL, NOUT,
+     $                              .TRUE. )
+                        IF( FULL )THEN
+                           IF( UPPER )THEN
+                              JA = JA + LDA
+                           ELSE
+                              JA = JA + LDA + 1
+                           END IF
+                        ELSE
+                           JA = JA + LJ
+                        END IF
+                        ERRMAX = MAX( ERRMAX, ERR )
+*                       If got really bad answer, report and return.
+                        IF( FATAL )
+     $                     GO TO 110
+   60                CONTINUE
+                  ELSE
+*                    Avoid repeating tests if N.le.0.
+                     IF( N.LE.0 )
+     $                  GO TO 100
+                  END IF
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', AP)                           .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', A,', I3, ')                        .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK5.
+*
+      END
+      SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests DSYR2 and DSPR2.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX, 2 )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N,
+     $                   NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      DOUBLE PRECISION   W( 2 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMVCH, DSPR2, DSYR2
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'Y'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 8
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 140 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 140
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 130 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 120 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 110 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 100 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     NULL = N.LE.0.OR.ALPHA.EQ.ZERO
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A,
+     $                           NMAX, AA, LDA, N - 1, N - 1, RESET,
+     $                           TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     UPLOS = UPLO
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( FULL )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY, LDA
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL DSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA, LDA )
+                     ELSE IF( PACKED )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL DSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA )
+                     END IF
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9992 )
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+*                    See what data changed inside subroutines.
+*
+                     ISAME( 1 ) = UPLO.EQ.UPLOS
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LDE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LDE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LDE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N,
+     $                               AS, AA, LDA )
+                     END IF
+                     IF( .NOT.PACKED )THEN
+                        ISAME( 9 ) = LDAS.EQ.LDA
+                     END IF
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, N
+                              Z( I, 1 ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, N
+                              Z( I, 1 ) = X( N - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        IF( INCY.GT.0 )THEN
+                           DO 70 I = 1, N
+                              Z( I, 2 ) = Y( I )
+   70                      CONTINUE
+                        ELSE
+                           DO 80 I = 1, N
+                              Z( I, 2 ) = Y( N - I + 1 )
+   80                      CONTINUE
+                        END IF
+                        JA = 1
+                        DO 90 J = 1, N
+                           W( 1 ) = Z( J, 2 )
+                           W( 2 ) = Z( J, 1 )
+                           IF( UPPER )THEN
+                              JJ = 1
+                              LJ = J
+                           ELSE
+                              JJ = J
+                              LJ = N - J + 1
+                           END IF
+                           CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ),
+     $                                 NMAX, W, 1, ONE, A( JJ, J ), 1,
+     $                                 YT, G, AA( JA ), EPS, ERR, FATAL,
+     $                                 NOUT, .TRUE. )
+                           IF( FULL )THEN
+                              IF( UPPER )THEN
+                                 JA = JA + LDA
+                              ELSE
+                                 JA = JA + LDA + 1
+                              END IF
+                           ELSE
+                              JA = JA + LJ
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 150
+   90                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with N.le.0.
+                        IF( N.LE.0 )
+     $                     GO TO 140
+                     END IF
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 170
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  160 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      INCY, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY
+      END IF
+*
+  170 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', Y,', I2, ', AP)                     .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', Y,', I2, ', A,', I3, ')                  .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK6.
+*
+      END
+      SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 2 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  ALPHA, BETA, A, X and Y should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, BETA
+*     .. Local Arrays ..
+      DOUBLE PRECISION   A( 1, 1 ), X( 1 ), Y( 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CHKXER, DGBMV, DGEMV, DGER, DSBMV, DSPMV, DSPR,
+     $                   DSPR2, DSYMV, DSYR, DSYR2, DTBMV, DTBSV, DTPMV,
+     $                   DTPSV, DTRMV, DTRSV
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+      GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+     $        90, 100, 110, 120, 130, 140, 150,
+     $        160 )ISNUM
+   10 INFOT = 1
+      CALL DGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   20 INFOT = 1
+      CALL DGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   30 INFOT = 1
+      CALL DSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   40 INFOT = 1
+      CALL DSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   50 INFOT = 1
+      CALL DSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   60 INFOT = 1
+      CALL DTRMV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTRMV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTRMV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   70 INFOT = 1
+      CALL DTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   80 INFOT = 1
+      CALL DTPMV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTPMV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTPMV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTPMV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DTPMV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   90 INFOT = 1
+      CALL DTRSV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTRSV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTRSV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  100 INFOT = 1
+      CALL DTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  110 INFOT = 1
+      CALL DTPSV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTPSV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTPSV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTPSV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DTPSV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  120 INFOT = 1
+      CALL DGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  130 INFOT = 1
+      CALL DSYR( '/', 0, ALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSYR( 'U', -1, ALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DSYR( 'U', 0, ALPHA, X, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYR( 'U', 2, ALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  140 INFOT = 1
+      CALL DSPR( '/', 0, ALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSPR( 'U', -1, ALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DSPR( 'U', 0, ALPHA, X, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  150 INFOT = 1
+      CALL DSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  160 INFOT = 1
+      CALL DSPR2( '/', 0, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+  170 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of DCHKE.
+*
+      END
+      SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL,
+     $                  KU, RESET, TRANSL )
+*
+*  Generates values for an M by N matrix A within the bandwidth
+*  defined by KL and KU.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      DOUBLE PRECISION   ROGUE
+      PARAMETER          ( ROGUE = -1.0D10 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   TRANSL
+      INTEGER            KL, KU, LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, I1, I2, I3, IBEG, IEND, IOFF, J, KK
+      LOGICAL            GEN, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      DOUBLE PRECISION   DBEG
+      EXTERNAL           DBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Executable Statements ..
+      GEN = TYPE( 1: 1 ).EQ.'G'
+      SYM = TYPE( 1: 1 ).EQ.'S'
+      TRI = TYPE( 1: 1 ).EQ.'T'
+      UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               IF( ( I.LE.J.AND.J - I.LE.KU ).OR.
+     $             ( I.GE.J.AND.I - J.LE.KL ) )THEN
+                  A( I, J ) = DBEG( RESET ) + TRANSL
+               ELSE
+                  A( I, J ) = ZERO
+               END IF
+               IF( I.NE.J )THEN
+                  IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'GB' )THEN
+         DO 90 J = 1, N
+            DO 60 I1 = 1, KU + 1 - J
+               AA( I1 + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J )
+               AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J )
+   70       CONTINUE
+            DO 80 I3 = I2, LDA
+               AA( I3 + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+   90    CONTINUE
+      ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+         DO 130 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 100 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  100       CONTINUE
+            DO 110 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+  110       CONTINUE
+            DO 120 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  120       CONTINUE
+  130    CONTINUE
+      ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN
+         DO 170 J = 1, N
+            IF( UPPER )THEN
+               KK = KL + 1
+               IBEG = MAX( 1, KL + 2 - J )
+               IF( UNIT )THEN
+                  IEND = KL
+               ELSE
+                  IEND = KL + 1
+               END IF
+            ELSE
+               KK = 1
+               IF( UNIT )THEN
+                  IBEG = 2
+               ELSE
+                  IBEG = 1
+               END IF
+               IEND = MIN( KL + 1, 1 + M - J )
+            END IF
+            DO 140 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  140       CONTINUE
+            DO 150 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J )
+  150       CONTINUE
+            DO 160 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  160       CONTINUE
+  170    CONTINUE
+      ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN
+         IOFF = 0
+         DO 190 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 180 I = IBEG, IEND
+               IOFF = IOFF + 1
+               AA( IOFF ) = A( I, J )
+               IF( I.EQ.J )THEN
+                  IF( UNIT )
+     $               AA( IOFF ) = ROGUE
+               END IF
+  180       CONTINUE
+  190    CONTINUE
+      END IF
+      RETURN
+*
+*     End of DMAKE.
+*
+      END
+      SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y,
+     $                  INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA, EPS, ERR
+      INTEGER            INCX, INCY, M, N, NMAX, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ),
+     $                   YY( * )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ERRI
+      INTEGER            I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL
+      LOGICAL            TRAN
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, SQRT
+*     .. Executable Statements ..
+      TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+      IF( TRAN )THEN
+         ML = N
+         NL = M
+      ELSE
+         ML = M
+         NL = N
+      END IF
+      IF( INCX.LT.0 )THEN
+         KX = NL
+         INCXL = -1
+      ELSE
+         KX = 1
+         INCXL = 1
+      END IF
+      IF( INCY.LT.0 )THEN
+         KY = ML
+         INCYL = -1
+      ELSE
+         KY = 1
+         INCYL = 1
+      END IF
+*
+*     Compute expected result in YT using data in A, X and Y.
+*     Compute gauges in G.
+*
+      IY = KY
+      DO 30 I = 1, ML
+         YT( IY ) = ZERO
+         G( IY ) = ZERO
+         JX = KX
+         IF( TRAN )THEN
+            DO 10 J = 1, NL
+               YT( IY ) = YT( IY ) + A( J, I )*X( JX )
+               G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) )
+               JX = JX + INCXL
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, NL
+               YT( IY ) = YT( IY ) + A( I, J )*X( JX )
+               G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) )
+               JX = JX + INCXL
+   20       CONTINUE
+         END IF
+         YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY )
+         G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) )
+         IY = IY + INCYL
+   30 CONTINUE
+*
+*     Compute the error ratio for this result.
+*
+      ERR = ZERO
+      DO 40 I = 1, ML
+         ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS
+         IF( G( I ).NE.ZERO )
+     $      ERRI = ERRI/G( I )
+         ERR = MAX( ERR, ERRI )
+         IF( ERR*SQRT( EPS ).GE.ONE )
+     $      GO TO 50
+   40 CONTINUE
+*     If the loop completes, all results are at least half accurate.
+      GO TO 70
+*
+*     Report fatal error.
+*
+   50 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 60 I = 1, ML
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, YT( I ),
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I,
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I )
+         END IF
+   60 CONTINUE
+*
+   70 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'           EXPECTED RESULT   COMPU',
+     $      'TED RESULT' )
+ 9998 FORMAT( 1X, I7, 2G18.6 )
+*
+*     End of DMVCH.
+*
+      END
+      LOGICAL FUNCTION LDE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      DOUBLE PRECISION   RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LDE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LDE = .FALSE.
+   30 RETURN
+*
+*     End of LDE.
+*
+      END
+      LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE', 'SY' or 'SP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LDERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LDERES = .FALSE.
+   80 RETURN
+*
+*     End of LDERES.
+*
+      END
+      DOUBLE PRECISION FUNCTION DBEG( RESET )
+*
+*  Generates random numbers uniformly distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, MI
+*     .. Save statement ..
+      SAVE               I, IC, MI
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         I = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I is bounded between 1 and 999.
+*     If initial I = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I = 4 or 8, the period will be 25.
+*     If initial I = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      I = I - 1000*( I/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      DBEG = DBLE( I - 500 )/1001.0D0
+      RETURN
+*
+*     End of DBEG.
+*
+      END
+      DOUBLE PRECISION FUNCTION DDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   X, Y
+*     .. Executable Statements ..
+      DDIFF = X - Y
+      RETURN
+*
+*     End of DDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 2 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 2 BLAS routines.
+*
+*  It is called by the Level 2 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/dblat3.dat b/blas/testing/dblat3.dat
new file mode 100644
index 0000000..5cbc2e6
--- /dev/null
+++ b/blas/testing/dblat3.dat

@@ -0,0 +1,20 @@
+'dblat3.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'dblat3.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+3                 NUMBER OF VALUES OF ALPHA
+0.0 1.0 0.7       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+0.0 1.0 1.3       VALUES OF BETA
+DGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+DSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+DTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+DTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+DSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+DSYR2K T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/dblat3.f b/blas/testing/dblat3.f
new file mode 100644
index 0000000..8d37c74
--- /dev/null
+++ b/blas/testing/dblat3.f

@@ -0,0 +1,2873 @@
+*> \brief \b DBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM DBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the DOUBLE PRECISION Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 6 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 20 lines:
+*> 'dblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'DBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 1.3       VALUES OF BETA
+*> DGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup double_blas_testing
+*
+*  =====================================================================
+      PROGRAM DBLAT3
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 6 )
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 65 )
+      INTEGER            NIDMAX, NALMAX, NBEMAX
+      PARAMETER          ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANSA, TRANSB
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      DOUBLE PRECISION   AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBEMAX ),
+     $                   BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   G( NMAX ), W( 2*NMAX )
+      INTEGER            IDIM( NIDMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      DOUBLE PRECISION   DDIFF
+      LOGICAL            LDE
+      EXTERNAL           DDIFF, LDE
+*     .. External Subroutines ..
+      EXTERNAL           DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHKE, DMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'DGEMM ', 'DSYMM ', 'DTRMM ', 'DTRSM ',
+     $                   'DSYRK ', 'DSYR2K'/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 220
+         END IF
+   10 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9995 )
+      WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9984 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 20 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   20 CONTINUE
+   30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+      DO 40 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 50
+   40 CONTINUE
+      WRITE( NOUT, FMT = 9990 )SNAMET
+      STOP
+   50 LTEST( I ) = LTESTT
+      GO TO 30
+*
+   60 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(ZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of DMMCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 100 J = 1, N
+         DO 90 I = 1, N
+            AB( I, J ) = MAX( I - J + 1, 0 )
+   90    CONTINUE
+         AB( J, NMAX + 1 ) = J
+         AB( 1, NMAX + J ) = J
+         C( J, 1 ) = ZERO
+  100 CONTINUE
+      DO 110 J = 1, N
+         CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  110 CONTINUE
+*     CC holds the exact result. On exit from DMMCH CT holds
+*     the result computed by DMMCH.
+      TRANSA = 'N'
+      TRANSB = 'N'
+      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LDE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'T'
+      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LDE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      DO 120 J = 1, N
+         AB( J, NMAX + 1 ) = N - J + 1
+         AB( 1, NMAX + J ) = N - J + 1
+  120 CONTINUE
+      DO 130 J = 1, N
+         CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+     $                     ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+      TRANSA = 'T'
+      TRANSB = 'N'
+      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LDE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'T'
+      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LDE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 200 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM
+*           Test DGEMM, 01.
+  140       CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test DSYMM, 02.
+  150       CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test DTRMM, 03, DTRSM, 04.
+  160       CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+     $                  AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C )
+            GO TO 190
+*           Test DSYRK, 05.
+  170       CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test DSYR2K, 06.
+  180       CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+            GO TO 190
+*
+  190       IF( FATAL.AND.SFATAL )
+     $         GO TO 210
+         END IF
+  200 CONTINUE
+      WRITE( NOUT, FMT = 9986 )
+      GO TO 230
+*
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9985 )
+      GO TO 230
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9991 )
+*
+  230 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( '   FOR N              ', 9I6 )
+ 9993 FORMAT( '   FOR ALPHA          ', 7F6.1 )
+ 9992 FORMAT( '   FOR BETA           ', 7F6.1 )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9989 FORMAT( ' ERROR IN DMMCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1,
+     $      ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+     $      'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+     $      'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+     $      '*******' )
+ 9988 FORMAT( A6, L2 )
+ 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of DBLAT3.
+*
+      END
+      SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests DGEMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, BETA, BLS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+     $                   LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+     $                   MA, MB, MS, N, NA, NARGS, NB, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRANA, TRANB
+      CHARACTER*1        TRANAS, TRANBS, TRANSA, TRANSB
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DGEMM, DMAKE, DMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+*
+      NARGS = 13
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 110 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 100 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 100
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 90 IK = 1, NIDIM
+               K = IDIM( IK )
+*
+               DO 80 ICA = 1, 3
+                  TRANSA = ICH( ICA: ICA )
+                  TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+                  IF( TRANA )THEN
+                     MA = K
+                     NA = M
+                  ELSE
+                     MA = M
+                     NA = K
+                  END IF
+*                 Set LDA to 1 more than minimum value if room.
+                  LDA = MA
+                  IF( LDA.LT.NMAX )
+     $               LDA = LDA + 1
+*                 Skip tests if not enough room.
+                  IF( LDA.GT.NMAX )
+     $               GO TO 80
+                  LAA = LDA*NA
+*
+*                 Generate the matrix A.
+*
+                  CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 70 ICB = 1, 3
+                     TRANSB = ICH( ICB: ICB )
+                     TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+                     IF( TRANB )THEN
+                        MB = N
+                        NB = K
+                     ELSE
+                        MB = K
+                        NB = N
+                     END IF
+*                    Set LDB to 1 more than minimum value if room.
+                     LDB = MB
+                     IF( LDB.LT.NMAX )
+     $                  LDB = LDB + 1
+*                    Skip tests if not enough room.
+                     IF( LDB.GT.NMAX )
+     $                  GO TO 70
+                     LBB = LDB*NB
+*
+*                    Generate the matrix B.
+*
+                     CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB,
+     $                           LDB, RESET, ZERO )
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the matrix C.
+*
+                           CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX,
+     $                                 CC, LDC, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           TRANAS = TRANSA
+                           TRANBS = TRANSB
+                           MS = M
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LBB
+                              BS( I ) = BB( I )
+   20                      CONTINUE
+                           LDBS = LDB
+                           BLS = BETA
+                           DO 30 I = 1, LCC
+                              CS( I ) = CC( I )
+   30                      CONTINUE
+                           LDCS = LDC
+*
+*                          Call the subroutine.
+*
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                        TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB,
+     $                        BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL DGEMM( TRANSA, TRANSB, M, N, K, ALPHA,
+     $                                 AA, LDA, BB, LDB, BETA, CC, LDC )
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = TRANSA.EQ.TRANAS
+                           ISAME( 2 ) = TRANSB.EQ.TRANBS
+                           ISAME( 3 ) = MS.EQ.M
+                           ISAME( 4 ) = NS.EQ.N
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = ALS.EQ.ALPHA
+                           ISAME( 7 ) = LDE( AS, AA, LAA )
+                           ISAME( 8 ) = LDAS.EQ.LDA
+                           ISAME( 9 ) = LDE( BS, BB, LBB )
+                           ISAME( 10 ) = LDBS.EQ.LDB
+                           ISAME( 11 ) = BLS.EQ.BETA
+                           IF( NULL )THEN
+                              ISAME( 12 ) = LDE( CS, CC, LCC )
+                           ELSE
+                              ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS,
+     $                                      CC, LDC )
+                           END IF
+                           ISAME( 13 ) = LDCS.EQ.LDC
+*
+*                          If data was incorrectly changed, report
+*                          and return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL DMMCH( TRANSA, TRANSB, M, N, K,
+     $                                    ALPHA, A, NMAX, B, NMAX, BETA,
+     $                                    C, NMAX, CT, G, CC, LDC, EPS,
+     $                                    ERR, FATAL, NOUT, .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K,
+     $   ALPHA, LDA, LDB, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',',
+     $      3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ',
+     $      'C,', I3, ').' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK1.
+*
+      END
+      SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests DSYMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, BETA, BLS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+     $                   LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        SIDE, SIDES, UPLO, UPLOS
+      CHARACTER*2        ICHS, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMMCH, DSYMM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHS/'LR'/, ICHU/'UL'/
+*     .. Executable Statements ..
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 100 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 90 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 90
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 90
+            LBB = LDB*N
+*
+*           Generate the matrix B.
+*
+            CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+     $                  ZERO )
+*
+            DO 80 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+*
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+*                 Generate the symmetric matrix A.
+*
+                  CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the
+*                       subroutine.
+*
+                        SIDES = SIDE
+                        UPLOS = UPLO
+                        MS = M
+                        NS = N
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BLS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE,
+     $                     UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL DSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                              BB, LDB, BETA, CC, LDC )
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9994 )
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = SIDES.EQ.SIDE
+                        ISAME( 2 ) = UPLOS.EQ.UPLO
+                        ISAME( 3 ) = MS.EQ.M
+                        ISAME( 4 ) = NS.EQ.N
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LDE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LDE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BLS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LDE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result.
+*
+                           IF( LEFT )THEN
+                              CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A,
+     $                                    NMAX, B, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           ELSE
+                              CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B,
+     $                                    NMAX, A, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and
+*                          return.
+                           IF( FATAL )
+     $                        GO TO 110
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 120
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA,
+     $   LDB, BETA, LDC
+*
+  120 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ')   ',
+     $      ' .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK2.
+*
+      END
+      SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+     $                  B, BB, BS, CT, G, C )
+*
+*  Tests DTRMM and DTRSM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, ERR, ERRMAX
+      INTEGER            I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+     $                   LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+     $                   NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+     $                   UPLOS
+      CHARACTER*2        ICHD, ICHS, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMMCH, DTRMM, DTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+*     .. Executable Statements ..
+*
+      NARGS = 11
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*     Set up zero matrix for DMMCH.
+      DO 20 J = 1, NMAX
+         DO 10 I = 1, NMAX
+            C( I, J ) = ZERO
+   10    CONTINUE
+   20 CONTINUE
+*
+      DO 140 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 130 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 130
+            LBB = LDB*N
+            NULL = M.LE.0.OR.N.LE.0
+*
+            DO 120 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 130
+               LAA = LDA*NA
+*
+               DO 110 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+                  DO 100 ICT = 1, 3
+                     TRANSA = ICHT( ICT: ICT )
+*
+                     DO 90 ICD = 1, 2
+                        DIAG = ICHD( ICD: ICD )
+*
+                        DO 80 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+*                          Generate the matrix A.
+*
+                           CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A,
+     $                                 NMAX, AA, LDA, RESET, ZERO )
+*
+*                          Generate the matrix B.
+*
+                           CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX,
+     $                                 BB, LDB, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           SIDES = SIDE
+                           UPLOS = UPLO
+                           TRANAS = TRANSA
+                           DIAGS = DIAG
+                           MS = M
+                           NS = N
+                           ALS = ALPHA
+                           DO 30 I = 1, LAA
+                              AS( I ) = AA( I )
+   30                      CONTINUE
+                           LDAS = LDA
+                           DO 40 I = 1, LBB
+                              BS( I ) = BB( I )
+   40                      CONTINUE
+                           LDBS = LDB
+*
+*                          Call the subroutine.
+*
+                           IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTRMM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL DTRSM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = SIDES.EQ.SIDE
+                           ISAME( 2 ) = UPLOS.EQ.UPLO
+                           ISAME( 3 ) = TRANAS.EQ.TRANSA
+                           ISAME( 4 ) = DIAGS.EQ.DIAG
+                           ISAME( 5 ) = MS.EQ.M
+                           ISAME( 6 ) = NS.EQ.N
+                           ISAME( 7 ) = ALS.EQ.ALPHA
+                           ISAME( 8 ) = LDE( AS, AA, LAA )
+                           ISAME( 9 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 10 ) = LDE( BS, BB, LBB )
+                           ELSE
+                              ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS,
+     $                                      BB, LDB )
+                           END IF
+                           ISAME( 11 ) = LDBS.EQ.LDB
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 50 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   50                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+                              IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+*
+*                                Check the result.
+*
+                                 IF( LEFT )THEN
+                                    CALL DMMCH( TRANSA, 'N', M, N, M,
+     $                                          ALPHA, A, NMAX, B, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 ELSE
+                                    CALL DMMCH( 'N', TRANSA, M, N, N,
+     $                                          ALPHA, B, NMAX, A, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 END IF
+                              ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+*
+*                                Compute approximation to original
+*                                matrix.
+*
+                                 DO 70 J = 1, N
+                                    DO 60 I = 1, M
+                                       C( I, J ) = BB( I + ( J - 1 )*
+     $                                             LDB )
+                                       BB( I + ( J - 1 )*LDB ) = ALPHA*
+     $                                    B( I, J )
+   60                               CONTINUE
+   70                            CONTINUE
+*
+                                 IF( LEFT )THEN
+                                    CALL DMMCH( TRANSA, 'N', M, N, M,
+     $                                          ONE, A, NMAX, C, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 ELSE
+                                    CALL DMMCH( 'N', TRANSA, M, N, N,
+     $                                          ONE, C, NMAX, A, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 END IF
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 150
+                           END IF
+*
+   80                   CONTINUE
+*
+   90                CONTINUE
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M,
+     $   N, ALPHA, LDA, LDB
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ', B,', I3, ')        .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK3.
+*
+      END
+      SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests DSYRK.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, BETA, BETS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+     $                   LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMMCH, DSYRK
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NTC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+*
+      NARGS = 10
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 100
+         LCC = LDC*N
+         NULL = N.LE.0
+*
+         DO 90 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 80 ICT = 1, 3
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                     RESET, ZERO )
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        BETS = BETA
+                        DO 20 I = 1, LCC
+                           CS( I ) = CC( I )
+   20                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                     TRANS, N, K, ALPHA, LDA, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL DSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA,
+     $                              BETA, CC, LDC )
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9993 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LDE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = BETS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 9 ) = LDE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS,
+     $                                  CC, LDC )
+                        END IF
+                        ISAME( 10 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 30 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   30                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           JC = 1
+                           DO 40 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA,
+     $                                       A( 1, JJ ), NMAX,
+     $                                       A( 1, J ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA,
+     $                                       A( JJ, 1 ), NMAX,
+     $                                       A( J, 1 ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 110
+   40                      CONTINUE
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $   LDA, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')           .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK4.
+*
+      END
+      SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+*
+*  Tests DSYR2K.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+     $                   ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+     $                   BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   G( NMAX ), W( 2*NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, ALS, BETA, BETS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+     $                   K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+     $                   LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LDE, LDERES
+      EXTERNAL           LDE, LDERES
+*     .. External Subroutines ..
+      EXTERNAL           DMAKE, DMMCH, DSYR2K
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NTC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 130 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 130
+         LCC = LDC*N
+         NULL = N.LE.0
+*
+         DO 120 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 110 ICT = 1, 3
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 110
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               IF( TRAN )THEN
+                  CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+     $                        LDA, RESET, ZERO )
+               ELSE
+                  CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+               END IF
+*
+*              Generate the matrix B.
+*
+               LDB = LDA
+               LBB = LAA
+               IF( TRAN )THEN
+                  CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ),
+     $                        2*NMAX, BB, LDB, RESET, ZERO )
+               ELSE
+                  CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+     $                        NMAX, BB, LDB, RESET, ZERO )
+               END IF
+*
+               DO 100 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 90 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 80 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BETS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                     TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL DSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA,
+     $                               BB, LDB, BETA, CC, LDC )
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9993 )
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LDE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LDE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BETS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LDE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           JJAB = 1
+                           JC = 1
+                           DO 70 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 DO 50 I = 1, K
+                                    W( I ) = AB( ( J - 1 )*2*NMAX + K +
+     $                                       I )
+                                    W( K + I ) = AB( ( J - 1 )*2*NMAX +
+     $                                           I )
+   50                            CONTINUE
+                                 CALL DMMCH( 'T', 'N', LJ, 1, 2*K,
+     $                                       ALPHA, AB( JJAB ), 2*NMAX,
+     $                                       W, 2*NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 DO 60 I = 1, K
+                                    W( I ) = AB( ( K + I - 1 )*NMAX +
+     $                                       J )
+                                    W( K + I ) = AB( ( I - 1 )*NMAX +
+     $                                           J )
+   60                            CONTINUE
+                                 CALL DMMCH( 'N', 'N', LJ, 1, 2*K,
+     $                                       ALPHA, AB( JJ ), NMAX, W,
+     $                                       2*NMAX, BETA, C( JJ, J ),
+     $                                       NMAX, CT, G, CC( JC ), LDC,
+     $                                       EPS, ERR, FATAL, NOUT,
+     $                                       .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                                 IF( TRAN )
+     $                              JJAB = JJAB + 2*NMAX
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 140
+   70                      CONTINUE
+                        END IF
+*
+   80                CONTINUE
+*
+   90             CONTINUE
+*
+  100          CONTINUE
+*
+  110       CONTINUE
+*
+  120    CONTINUE
+*
+  130 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $   LDA, LDB, BETA, LDC
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ')   ',
+     $      ' .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of DCHK5.
+*
+      END
+      SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 3 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  A, B and C should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  3-19-92:  Initialize ALPHA and BETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to SSYMM with INFOT = 9  (eca)
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, TWO
+      PARAMETER          ( ONE = 1.0D0, TWO = 2.0D0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ALPHA, BETA
+*     .. Local Arrays ..
+      DOUBLE PRECISION   A( 2, 1 ), B( 2, 1 ), C( 2, 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CHKXER, DGEMM, DSYMM, DSYR2K, DSYRK, DTRMM,
+     $                   DTRSM
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+*
+*     Initialize ALPHA and BETA.
+*
+      ALPHA = ONE
+      BETA = TWO
+*
+      GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM
+   10 INFOT = 1
+      CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 1
+      CALL DGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL DGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL DGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL DGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   20 INFOT = 1
+      CALL DSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   30 INFOT = 1
+      CALL DTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   40 INFOT = 1
+      CALL DTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL DTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL DTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL DTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   50 INFOT = 1
+      CALL DSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL DSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   60 INFOT = 1
+      CALL DSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL DSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL DSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL DSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL DSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+   70 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of DCHKE.
+*
+      END
+      SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+     $                  TRANSL )
+*
+*  Generates values for an M by N matrix A.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'SY' or 'TR'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+      DOUBLE PRECISION   ROGUE
+      PARAMETER          ( ROGUE = -1.0D10 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   TRANSL
+      INTEGER            LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            GEN, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      DOUBLE PRECISION   DBEG
+      EXTERNAL           DBEG
+*     .. Executable Statements ..
+      GEN = TYPE.EQ.'GE'
+      SYM = TYPE.EQ.'SY'
+      TRI = TYPE.EQ.'TR'
+      UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               A( I, J ) = DBEG( RESET ) + TRANSL
+               IF( I.NE.J )THEN
+*                 Set some elements to zero
+                  IF( N.GT.3.AND.J.EQ.N/2 )
+     $               A( I, J ) = ZERO
+                  IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+         DO 90 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 60 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   70       CONTINUE
+            DO 80 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+   90    CONTINUE
+      END IF
+      RETURN
+*
+*     End of DMAKE.
+*
+      END
+      SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+     $                  BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+     $                  NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA, EPS, ERR
+      INTEGER            KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANSA, TRANSB
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), C( LDC, * ),
+     $                   CC( LDCC, * ), CT( * ), G( * )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ERRI
+      INTEGER            I, J, K
+      LOGICAL            TRANA, TRANB
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, SQRT
+*     .. Executable Statements ..
+      TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+      TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+*     Compute expected result, one column at a time, in CT using data
+*     in A, B and C.
+*     Compute gauges in G.
+*
+      DO 120 J = 1, N
+*
+         DO 10 I = 1, M
+            CT( I ) = ZERO
+            G( I ) = ZERO
+   10    CONTINUE
+         IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+            DO 30 K = 1, KK
+               DO 20 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( K, J )
+                  G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) )
+   20          CONTINUE
+   30       CONTINUE
+         ELSE IF( TRANA.AND..NOT.TRANB )THEN
+            DO 50 K = 1, KK
+               DO 40 I = 1, M
+                  CT( I ) = CT( I ) + A( K, I )*B( K, J )
+                  G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) )
+   40          CONTINUE
+   50       CONTINUE
+         ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+            DO 70 K = 1, KK
+               DO 60 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( J, K )
+                  G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) )
+   60          CONTINUE
+   70       CONTINUE
+         ELSE IF( TRANA.AND.TRANB )THEN
+            DO 90 K = 1, KK
+               DO 80 I = 1, M
+                  CT( I ) = CT( I ) + A( K, I )*B( J, K )
+                  G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) )
+   80          CONTINUE
+   90       CONTINUE
+         END IF
+         DO 100 I = 1, M
+            CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+            G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) )
+  100    CONTINUE
+*
+*        Compute the error ratio for this result.
+*
+         ERR = ZERO
+         DO 110 I = 1, M
+            ERRI = ABS( CT( I ) - CC( I, J ) )/EPS
+            IF( G( I ).NE.ZERO )
+     $         ERRI = ERRI/G( I )
+            ERR = MAX( ERR, ERRI )
+            IF( ERR*SQRT( EPS ).GE.ONE )
+     $         GO TO 130
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     If the loop completes, all results are at least half accurate.
+      GO TO 150
+*
+*     Report fatal error.
+*
+  130 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 140 I = 1, M
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+         END IF
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9997 )J
+*
+  150 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'           EXPECTED RESULT   COMPU',
+     $      'TED RESULT' )
+ 9998 FORMAT( 1X, I7, 2G18.6 )
+ 9997 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+*     End of DMMCH.
+*
+      END
+      LOGICAL FUNCTION LDE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      DOUBLE PRECISION   RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LDE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LDE = .FALSE.
+   30 RETURN
+*
+*     End of LDE.
+*
+      END
+      LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE' or 'SY'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LDERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LDERES = .FALSE.
+   80 RETURN
+*
+*     End of LDERES.
+*
+      END
+      DOUBLE PRECISION FUNCTION DBEG( RESET )
+*
+*  Generates random numbers uniformly distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, MI
+*     .. Save statement ..
+      SAVE               I, IC, MI
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         I = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I is bounded between 1 and 999.
+*     If initial I = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I = 4 or 8, the period will be 25.
+*     If initial I = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      I = I - 1000*( I/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      DBEG = ( I - 500 )/1001.0D0
+      RETURN
+*
+*     End of DBEG.
+*
+      END
+      DOUBLE PRECISION FUNCTION DDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   X, Y
+*     .. Executable Statements ..
+      DDIFF = X - Y
+      RETURN
+*
+*     End of DDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 3 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 3 BLAS routines.
+*
+*  It is called by the Level 3 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/runblastest.sh b/blas/testing/runblastest.sh
new file mode 100755
index 0000000..4ffaf01
--- /dev/null
+++ b/blas/testing/runblastest.sh

@@ -0,0 +1,45 @@
+#!/bin/bash
+
+black='\E[30m'
+red='\E[31m'
+green='\E[32m'
+yellow='\E[33m'
+blue='\E[34m'
+magenta='\E[35m'
+cyan='\E[36m'
+white='\E[37m'
+
+if [ -f $2 ]; then
+  data=$2
+  if [ -f $1.summ ]; then rm $1.summ; fi
+  if [ -f $1.snap ]; then rm $1.snap; fi
+else
+  data=$1
+fi
+
+if ! ./$1 < $data > /dev/null 2> .runtest.log ; then
+  echo -e  $red Test $1 failed: $black
+  echo -e $blue
+  cat .runtest.log
+  echo -e $black
+  exit 1
+else
+  if [ -f $1.summ ]; then
+    if [ `grep "FATAL ERROR" $1.summ | wc -l` -gt 0 ]; then
+      echo -e  $red "Test $1 failed (FATAL ERROR, read the file $1.summ for details)" $black
+      echo -e $blue
+      cat .runtest.log
+      echo -e $black
+      exit 1;
+    fi
+
+    if [ `grep "FAILED THE TESTS OF ERROR-EXITS" $1.summ | wc -l` -gt 0 ]; then
+      echo -e  $red "Test $1 failed (FAILED THE TESTS OF ERROR-EXITS, read the file $1.summ for details)" $black
+      echo -e $blue
+      cat .runtest.log
+      echo -e $black
+      exit 1;
+    fi      
+  fi
+  echo -e $green Test $1 passed$black
+fi

diff --git a/blas/testing/sblat1.f b/blas/testing/sblat1.f
new file mode 100644
index 0000000..4d43d9b
--- /dev/null
+++ b/blas/testing/sblat1.f

@@ -0,0 +1,1021 @@
+*> \brief \b SBLAT1
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM SBLAT1
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>    Test program for the REAL Level 1 BLAS.
+*>
+*>    Based upon the original BLAS test routine together with:
+*>    F06EAF Example Program Text
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup single_blas_testing
+*
+*  =====================================================================
+      PROGRAM SBLAT1
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      REAL             SFAC
+      INTEGER          IC
+*     .. External Subroutines ..
+      EXTERNAL         CHECK0, CHECK1, CHECK2, CHECK3, HEADER
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA             SFAC/9.765625E-4/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999)
+      DO 20 IC = 1, 13
+         ICASE = IC
+         CALL HEADER
+*
+*        .. Initialize  PASS,  INCX,  and INCY for a new case. ..
+*        .. the value 9999 for INCX or INCY will appear in the ..
+*        .. detailed  output, if any, for cases  that do not involve ..
+*        .. these parameters ..
+*
+         PASS = .TRUE.
+         INCX = 9999
+         INCY = 9999
+         IF (ICASE.EQ.3 .OR. ICASE.EQ.11) THEN
+            CALL CHECK0(SFAC)
+         ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR.
+     +            ICASE.EQ.10) THEN
+            CALL CHECK1(SFAC)
+         ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR.
+     +            ICASE.EQ.6 .OR. ICASE.EQ.12 .OR. ICASE.EQ.13) THEN
+            CALL CHECK2(SFAC)
+         ELSE IF (ICASE.EQ.4) THEN
+            CALL CHECK3(SFAC)
+         END IF
+*        -- Print
+         IF (PASS) WRITE (NOUT,99998)
+   20 CONTINUE
+      STOP
+*
+99999 FORMAT (' Real BLAS Test Program Results',/1X)
+99998 FORMAT ('                                    ----- PASS -----')
+      END
+      SUBROUTINE HEADER
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Arrays ..
+      CHARACTER*6      L(13)
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA             L(1)/' SDOT '/
+      DATA             L(2)/'SAXPY '/
+      DATA             L(3)/'SROTG '/
+      DATA             L(4)/' SROT '/
+      DATA             L(5)/'SCOPY '/
+      DATA             L(6)/'SSWAP '/
+      DATA             L(7)/'SNRM2 '/
+      DATA             L(8)/'SASUM '/
+      DATA             L(9)/'SSCAL '/
+      DATA             L(10)/'ISAMAX'/
+      DATA             L(11)/'SROTMG'/
+      DATA             L(12)/'SROTM '/
+      DATA             L(13)/'SDSDOT'/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999) ICASE, L(ICASE)
+      RETURN
+*
+99999 FORMAT (/' Test of subprogram number',I3,12X,A6)
+      END
+      SUBROUTINE CHECK0(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      REAL              SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      REAL              D12, SA, SB, SC, SS
+      INTEGER           I, K
+*     .. Local Arrays ..
+      REAL              DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8),
+     +                  DS1(8), DAB(4,9), DTEMP(9), DTRUE(9,9)
+*     .. External Subroutines ..
+      EXTERNAL          SROTG, SROTMG, STEST1
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA              DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0,
+     +                  0.0E0, 1.0E0/
+      DATA              DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0,
+     +                  1.0E0, 0.0E0/
+      DATA              DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0,
+     +                  0.0E0, 1.0E0/
+      DATA              DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0,
+     +                  1.0E0, 0.0E0/
+      DATA              DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0,
+     +                  0.0E0, 1.0E0, 1.0E0/
+      DATA              DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0,
+     +                  0.0E0, 1.0E0, 0.0E0/
+*     INPUT FOR MODIFIED GIVENS
+      DATA DAB/ .1E0,.3E0,1.2E0,.2E0,
+     A          .7E0, .2E0, .6E0, 4.2E0,
+     B          0.E0,0.E0,0.E0,0.E0,
+     C          4.E0, -1.E0, 2.E0, 4.E0,
+     D          6.E-10, 2.E-2, 1.E5, 10.E0,
+     E          4.E10, 2.E-2, 1.E-5, 10.E0,
+     F          2.E-10, 4.E-2, 1.E5, 10.E0,
+     G          2.E10, 4.E-2, 1.E-5, 10.E0,
+     H          4.E0, -2.E0, 8.E0, 4.E0    /
+*    TRUE RESULTS FOR MODIFIED GIVENS
+      DATA DTRUE/0.E0,0.E0, 1.3E0, .2E0, 0.E0,0.E0,0.E0, .5E0, 0.E0,
+     A           0.E0,0.E0, 4.5E0, 4.2E0, 1.E0, .5E0, 0.E0,0.E0,0.E0,
+     B           0.E0,0.E0,0.E0,0.E0, -2.E0, 0.E0,0.E0,0.E0,0.E0,
+     C           0.E0,0.E0,0.E0, 4.E0, -1.E0, 0.E0,0.E0,0.E0,0.E0,
+     D           0.E0, 15.E-3, 0.E0, 10.E0, -1.E0, 0.E0, -1.E-4,
+     E           0.E0, 1.E0,
+     F           0.E0,0.E0, 6144.E-5, 10.E0, -1.E0, 4096.E0, -1.E6,
+     G           0.E0, 1.E0,
+     H           0.E0,0.E0,15.E0,10.E0,-1.E0, 5.E-5, 0.E0,1.E0,0.E0,
+     I           0.E0,0.E0, 15.E0, 10.E0, -1. E0, 5.E5, -4096.E0,
+     J           1.E0, 4096.E-6,
+     K           0.E0,0.E0, 7.E0, 4.E0, 0.E0,0.E0, -.5E0, -.25E0, 0.E0/
+*                   4096 = 2 ** 12
+      DATA D12  /4096.E0/
+      DTRUE(1,1) = 12.E0 / 130.E0
+      DTRUE(2,1) = 36.E0 / 130.E0
+      DTRUE(7,1) = -1.E0 / 6.E0
+      DTRUE(1,2) = 14.E0 / 75.E0
+      DTRUE(2,2) = 49.E0 / 75.E0
+      DTRUE(9,2) = 1.E0 / 7.E0
+      DTRUE(1,5) = 45.E-11 * (D12 * D12)
+      DTRUE(3,5) = 4.E5 / (3.E0 * D12)
+      DTRUE(6,5) = 1.E0 / D12
+      DTRUE(8,5) = 1.E4 / (3.E0 * D12)
+      DTRUE(1,6) = 4.E10 / (1.5E0 * D12 * D12)
+      DTRUE(2,6) = 2.E-2 / 1.5E0
+      DTRUE(8,6) = 5.E-7 * D12
+      DTRUE(1,7) = 4.E0 / 150.E0
+      DTRUE(2,7) = (2.E-10 / 1.5E0) * (D12 * D12)
+      DTRUE(7,7) = -DTRUE(6,5)
+      DTRUE(9,7) = 1.E4 / D12
+      DTRUE(1,8) = DTRUE(1,7)
+      DTRUE(2,8) = 2.E10 / (1.5E0 * D12 * D12)
+      DTRUE(1,9) = 32.E0 / 7.E0
+      DTRUE(2,9) = -16.E0 / 7.E0
+*     .. Executable Statements ..
+*
+*     Compute true values which cannot be prestored
+*     in decimal notation
+*
+      DBTRUE(1) = 1.0E0/0.6E0
+      DBTRUE(3) = -1.0E0/0.6E0
+      DBTRUE(5) = 1.0E0/0.6E0
+*
+      DO 20 K = 1, 8
+*        .. Set N=K for identification in output if any ..
+         N = K
+         IF (ICASE.EQ.3) THEN
+*           .. SROTG ..
+            IF (K.GT.8) GO TO 40
+            SA = DA1(K)
+            SB = DB1(K)
+            CALL SROTG(SA,SB,SC,SS)
+            CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC)
+            CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC)
+            CALL STEST1(SC,DC1(K),DC1(K),SFAC)
+            CALL STEST1(SS,DS1(K),DS1(K),SFAC)
+         ELSEIF (ICASE.EQ.11) THEN
+*           .. SROTMG ..
+            DO I=1,4
+               DTEMP(I)= DAB(I,K)
+               DTEMP(I+4) = 0.0
+            END DO
+            DTEMP(9) = 0.0
+            CALL SROTMG(DTEMP(1),DTEMP(2),DTEMP(3),DTEMP(4),DTEMP(5))
+            CALL STEST(9,DTEMP,DTRUE(1,K),DTRUE(1,K),SFAC)
+         ELSE
+            WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
+            STOP
+         END IF
+   20 CONTINUE
+   40 RETURN
+      END
+      SUBROUTINE CHECK1(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      REAL              SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      INTEGER           I, LEN, NP1
+*     .. Local Arrays ..
+      REAL              DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2),
+     +                  SA(10), STEMP(1), STRUE(8), SX(8)
+      INTEGER           ITRUE2(5)
+*     .. External Functions ..
+      REAL              SASUM, SNRM2
+      INTEGER           ISAMAX
+      EXTERNAL          SASUM, SNRM2, ISAMAX
+*     .. External Subroutines ..
+      EXTERNAL          ITEST1, SSCAL, STEST, STEST1
+*     .. Intrinsic Functions ..
+      INTRINSIC         MAX
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA              SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0,
+     +                  0.3E0, 0.3E0, 0.3E0, 0.3E0/
+      DATA              DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0,
+     +                  2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0,
+     +                  3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0,
+     +                  4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0,
+     +                  -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0,
+     +                  5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0,
+     +                  6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0,
+     +                  8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0,
+     +                  9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0,
+     +                  -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0,
+     +                  0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0,
+     +                  2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0,
+     +                  -0.5E0, 7.0E0, -0.1E0, 3.0E0/
+      DATA              DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/
+      DATA              DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/
+      DATA              DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0,
+     +                  2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0,
+     +                  3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0,
+     +                  4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0,
+     +                  0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0,
+     +                  5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0,
+     +                  6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0,
+     +                  8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0,
+     +                  0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0,
+     +                  9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0,
+     +                  2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0,
+     +                  -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0,
+     +                  0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0,
+     +                  -0.03E0, 3.0E0/
+      DATA              ITRUE2/0, 1, 2, 2, 3/
+*     .. Executable Statements ..
+      DO 80 INCX = 1, 2
+         DO 60 NP1 = 1, 5
+            N = NP1 - 1
+            LEN = 2*MAX(N,1)
+*           .. Set vector arguments ..
+            DO 20 I = 1, LEN
+               SX(I) = DV(I,NP1,INCX)
+   20       CONTINUE
+*
+            IF (ICASE.EQ.7) THEN
+*              .. SNRM2 ..
+               STEMP(1) = DTRUE1(NP1)
+               CALL STEST1(SNRM2(N,SX,INCX),STEMP(1),STEMP,SFAC)
+            ELSE IF (ICASE.EQ.8) THEN
+*              .. SASUM ..
+               STEMP(1) = DTRUE3(NP1)
+               CALL STEST1(SASUM(N,SX,INCX),STEMP(1),STEMP,SFAC)
+            ELSE IF (ICASE.EQ.9) THEN
+*              .. SSCAL ..
+               CALL SSCAL(N,SA((INCX-1)*5+NP1),SX,INCX)
+               DO 40 I = 1, LEN
+                  STRUE(I) = DTRUE5(I,NP1,INCX)
+   40          CONTINUE
+               CALL STEST(LEN,SX,STRUE,STRUE,SFAC)
+            ELSE IF (ICASE.EQ.10) THEN
+*              .. ISAMAX ..
+               CALL ITEST1(ISAMAX(N,SX,INCX),ITRUE2(NP1))
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
+               STOP
+            END IF
+   60    CONTINUE
+   80 CONTINUE
+      RETURN
+      END
+      SUBROUTINE CHECK2(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      REAL              SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      REAL              SA
+      INTEGER           I, J, KI, KN, KNI, KPAR, KSIZE, LENX, LENY,
+     $                  MX, MY 
+*     .. Local Arrays ..
+      REAL              DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4),
+     $                  DT8(7,4,4), DX1(7),
+     $                  DY1(7), SSIZE1(4), SSIZE2(14,2), SSIZE3(4),
+     $                  SSIZE(7), STX(7), STY(7), SX(7), SY(7),
+     $                  DPAR(5,4), DT19X(7,4,16),DT19XA(7,4,4),
+     $                  DT19XB(7,4,4), DT19XC(7,4,4),DT19XD(7,4,4),
+     $                  DT19Y(7,4,16), DT19YA(7,4,4),DT19YB(7,4,4),
+     $                  DT19YC(7,4,4), DT19YD(7,4,4), DTEMP(5),
+     $                  ST7B(4,4)
+      INTEGER           INCXS(4), INCYS(4), LENS(4,2), NS(4)
+*     .. External Functions ..
+      REAL              SDOT, SDSDOT
+      EXTERNAL          SDOT, SDSDOT
+*     .. External Subroutines ..
+      EXTERNAL          SAXPY, SCOPY, SROTM, SSWAP, STEST, STEST1
+*     .. Intrinsic Functions ..
+      INTRINSIC         ABS, MIN
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      EQUIVALENCE (DT19X(1,1,1),DT19XA(1,1,1)),(DT19X(1,1,5),
+     A   DT19XB(1,1,1)),(DT19X(1,1,9),DT19XC(1,1,1)),
+     B   (DT19X(1,1,13),DT19XD(1,1,1))
+      EQUIVALENCE (DT19Y(1,1,1),DT19YA(1,1,1)),(DT19Y(1,1,5),
+     A   DT19YB(1,1,1)),(DT19Y(1,1,9),DT19YC(1,1,1)),
+     B   (DT19Y(1,1,13),DT19YD(1,1,1))
+
+      DATA              SA/0.3E0/
+      DATA              INCXS/1, 2, -2, -1/
+      DATA              INCYS/1, -2, 1, -2/
+      DATA              LENS/1, 1, 2, 4, 1, 1, 3, 7/
+      DATA              NS/0, 1, 2, 4/
+      DATA              DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0,
+     +                  -0.4E0/
+      DATA              DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0,
+     +                  0.8E0/
+      DATA              DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0,
+     +                  0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0,
+     +                  -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/
+      DATA              ST7B/ .1, .4, .31, .72,     .1, .4, .03, .95,
+     +                  .1, .4, -.69, -.64,   .1, .4, .43, 1.37/
+      DATA              DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0,
+     +                  0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0,
+     +                  0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0,
+     +                  -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0,
+     +                  -0.75E0, 0.2E0, 1.04E0/
+      DATA              DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0,
+     +                  0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0,
+     +                  0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0,
+     +                  0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0,
+     +                  0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0,
+     +                  0.0E0/
+      DATA              DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0,
+     +                  0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0,
+     +                  0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0,
+     +                  -0.5E0, 0.2E0, 0.8E0/
+      DATA              SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/
+      DATA              SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0,
+     +                  1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0,
+     +                  1.17E0, 1.17E0, 1.17E0/
+      DATA              SSIZE3/ .1, .4, 1.7, 3.3 /
+*
+*                         FOR DROTM
+*
+      DATA DPAR/-2.E0,  0.E0,0.E0,0.E0,0.E0,
+     A          -1.E0,  2.E0, -3.E0, -4.E0,  5.E0,
+     B           0.E0,  0.E0,  2.E0, -3.E0,  0.E0,
+     C           1.E0,  5.E0,  2.E0,  0.E0, -4.E0/
+*                        TRUE X RESULTS F0R ROTATIONS DROTM
+      DATA DT19XA/.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E           -.8E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           -.9E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G           3.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .6E0,   .1E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     I           -.8E0,  3.8E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     J           -.9E0,  2.8E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     K           3.5E0,  -.4E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     L            .6E0,   .1E0,  -.5E0,   .8E0,          0.E0,0.E0,0.E0,
+     M           -.8E0,  3.8E0, -2.2E0, -1.2E0,          0.E0,0.E0,0.E0,
+     N           -.9E0,  2.8E0, -1.4E0, -1.3E0,          0.E0,0.E0,0.E0,
+     O           3.5E0,  -.4E0, -2.2E0,  4.7E0,          0.E0,0.E0,0.E0/
+*
+      DATA DT19XB/.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E           -.8E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           -.9E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G           3.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .6E0,   .1E0,  -.5E0,             0.E0,0.E0,0.E0,0.E0,
+     I           0.E0,    .1E0, -3.0E0,             0.E0,0.E0,0.E0,0.E0,
+     J           -.3E0,   .1E0, -2.0E0,             0.E0,0.E0,0.E0,0.E0,
+     K           3.3E0,   .1E0, -2.0E0,             0.E0,0.E0,0.E0,0.E0,
+     L            .6E0,   .1E0,  -.5E0,   .8E0,   .9E0,  -.3E0,  -.4E0,
+     M          -2.0E0,   .1E0,  1.4E0,   .8E0,   .6E0,  -.3E0, -2.8E0,
+     N          -1.8E0,   .1E0,  1.3E0,   .8E0,  0.E0,   -.3E0, -1.9E0,
+     O           3.8E0,   .1E0, -3.1E0,   .8E0,  4.8E0,  -.3E0, -1.5E0 /
+*
+      DATA DT19XC/.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E           -.8E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           -.9E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G           3.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .6E0,   .1E0,  -.5E0,             0.E0,0.E0,0.E0,0.E0,
+     I           4.8E0,   .1E0, -3.0E0,             0.E0,0.E0,0.E0,0.E0,
+     J           3.3E0,   .1E0, -2.0E0,             0.E0,0.E0,0.E0,0.E0,
+     K           2.1E0,   .1E0, -2.0E0,             0.E0,0.E0,0.E0,0.E0,
+     L            .6E0,   .1E0,  -.5E0,   .8E0,   .9E0,  -.3E0,  -.4E0,
+     M          -1.6E0,   .1E0, -2.2E0,   .8E0,  5.4E0,  -.3E0, -2.8E0,
+     N          -1.5E0,   .1E0, -1.4E0,   .8E0,  3.6E0,  -.3E0, -1.9E0,
+     O           3.7E0,   .1E0, -2.2E0,   .8E0,  3.6E0,  -.3E0, -1.5E0 /
+*
+      DATA DT19XD/.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E           -.8E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           -.9E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G           3.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .6E0,   .1E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     I           -.8E0, -1.0E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     J           -.9E0,  -.8E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     K           3.5E0,   .8E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     L            .6E0,   .1E0,  -.5E0,   .8E0,          0.E0,0.E0,0.E0,
+     M           -.8E0, -1.0E0,  1.4E0, -1.6E0,          0.E0,0.E0,0.E0,
+     N           -.9E0,  -.8E0,  1.3E0, -1.6E0,          0.E0,0.E0,0.E0,
+     O           3.5E0,   .8E0, -3.1E0,  4.8E0,          0.E0,0.E0,0.E0/
+*                        TRUE Y RESULTS FOR ROTATIONS DROTM
+      DATA DT19YA/.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E            .7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           1.7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G          -2.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .5E0,  -.9E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     I            .7E0, -4.8E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     J           1.7E0,  -.7E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     K          -2.6E0,  3.5E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     L            .5E0,  -.9E0,   .3E0,   .7E0,          0.E0,0.E0,0.E0,
+     M            .7E0, -4.8E0,  3.0E0,  1.1E0,          0.E0,0.E0,0.E0,
+     N           1.7E0,  -.7E0,  -.7E0,  2.3E0,          0.E0,0.E0,0.E0,
+     O          -2.6E0,  3.5E0,  -.7E0, -3.6E0,          0.E0,0.E0,0.E0/
+*
+      DATA DT19YB/.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E            .7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           1.7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G          -2.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .5E0,  -.9E0,   .3E0,             0.E0,0.E0,0.E0,0.E0,
+     I           4.0E0,  -.9E0,  -.3E0,             0.E0,0.E0,0.E0,0.E0,
+     J           -.5E0,  -.9E0,  1.5E0,             0.E0,0.E0,0.E0,0.E0,
+     K          -1.5E0,  -.9E0, -1.8E0,             0.E0,0.E0,0.E0,0.E0,
+     L            .5E0,  -.9E0,   .3E0,   .7E0,  -.6E0,   .2E0,   .8E0,
+     M           3.7E0,  -.9E0, -1.2E0,   .7E0, -1.5E0,   .2E0,  2.2E0,
+     N           -.3E0,  -.9E0,  2.1E0,   .7E0, -1.6E0,   .2E0,  2.0E0,
+     O          -1.6E0,  -.9E0, -2.1E0,   .7E0,  2.9E0,   .2E0, -3.8E0 /
+*
+      DATA DT19YC/.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E            .7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           1.7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G          -2.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .5E0,  -.9E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     I           4.0E0, -6.3E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     J           -.5E0,   .3E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     K          -1.5E0,  3.0E0,             0.E0,0.E0,0.E0,0.E0,0.E0,
+     L            .5E0,  -.9E0,   .3E0,   .7E0,          0.E0,0.E0,0.E0,
+     M           3.7E0, -7.2E0,  3.0E0,  1.7E0,          0.E0,0.E0,0.E0,
+     N           -.3E0,   .9E0,  -.7E0,  1.9E0,          0.E0,0.E0,0.E0,
+     O          -1.6E0,  2.7E0,  -.7E0, -3.4E0,          0.E0,0.E0,0.E0/
+*
+      DATA DT19YD/.5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     A            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     B            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     C            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     D            .5E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     E            .7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     F           1.7E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     G          -2.6E0,                  0.E0,0.E0,0.E0,0.E0,0.E0,0.E0,
+     H            .5E0,  -.9E0,   .3E0,             0.E0,0.E0,0.E0,0.E0,
+     I            .7E0,  -.9E0,  1.2E0,             0.E0,0.E0,0.E0,0.E0,
+     J           1.7E0,  -.9E0,   .5E0,             0.E0,0.E0,0.E0,0.E0,
+     K          -2.6E0,  -.9E0, -1.3E0,             0.E0,0.E0,0.E0,0.E0,
+     L            .5E0,  -.9E0,   .3E0,   .7E0,  -.6E0,   .2E0,   .8E0,
+     M            .7E0,  -.9E0,  1.2E0,   .7E0, -1.5E0,   .2E0,  1.6E0,
+     N           1.7E0,  -.9E0,   .5E0,   .7E0, -1.6E0,   .2E0,  2.4E0,
+     O          -2.6E0,  -.9E0, -1.3E0,   .7E0,  2.9E0,   .2E0, -4.0E0 /
+*
+*     .. Executable Statements ..
+*
+      DO 120 KI = 1, 4
+         INCX = INCXS(KI)
+         INCY = INCYS(KI)
+         MX = ABS(INCX)
+         MY = ABS(INCY)
+*
+         DO 100 KN = 1, 4
+            N = NS(KN)
+            KSIZE = MIN(2,KN)
+            LENX = LENS(KN,MX)
+            LENY = LENS(KN,MY)
+*           .. Initialize all argument arrays ..
+            DO 20 I = 1, 7
+               SX(I) = DX1(I)
+               SY(I) = DY1(I)
+   20       CONTINUE
+*
+            IF (ICASE.EQ.1) THEN
+*              .. SDOT ..
+               CALL STEST1(SDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN)
+     +                     ,SFAC)
+            ELSE IF (ICASE.EQ.2) THEN
+*              .. SAXPY ..
+               CALL SAXPY(N,SA,SX,INCX,SY,INCY)
+               DO 40 J = 1, LENY
+                  STY(J) = DT8(J,KN,KI)
+   40          CONTINUE
+               CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC)
+            ELSE IF (ICASE.EQ.5) THEN
+*              .. SCOPY ..
+               DO 60 I = 1, 7
+                  STY(I) = DT10Y(I,KN,KI)
+   60          CONTINUE
+               CALL SCOPY(N,SX,INCX,SY,INCY)
+               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
+            ELSE IF (ICASE.EQ.6) THEN
+*              .. SSWAP ..
+               CALL SSWAP(N,SX,INCX,SY,INCY)
+               DO 80 I = 1, 7
+                  STX(I) = DT10X(I,KN,KI)
+                  STY(I) = DT10Y(I,KN,KI)
+   80          CONTINUE
+               CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0)
+               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
+            ELSEIF (ICASE.EQ.12) THEN
+*              .. SROTM ..
+               KNI=KN+4*(KI-1)
+               DO KPAR=1,4
+                  DO I=1,7
+                     SX(I) = DX1(I)
+                     SY(I) = DY1(I)
+                     STX(I)= DT19X(I,KPAR,KNI)
+                     STY(I)= DT19Y(I,KPAR,KNI)
+                  END DO
+*
+                  DO I=1,5
+                     DTEMP(I) = DPAR(I,KPAR)
+                  END DO
+*
+                  DO  I=1,LENX
+                     SSIZE(I)=STX(I)
+                  END DO
+*                   SEE REMARK ABOVE ABOUT DT11X(1,2,7)
+*                       AND DT11X(5,3,8).
+                  IF ((KPAR .EQ. 2) .AND. (KNI .EQ. 7))
+     $               SSIZE(1) = 2.4E0
+                  IF ((KPAR .EQ. 3) .AND. (KNI .EQ. 8))
+     $               SSIZE(5) = 1.8E0
+*
+                  CALL   SROTM(N,SX,INCX,SY,INCY,DTEMP)
+                  CALL   STEST(LENX,SX,STX,SSIZE,SFAC)
+                  CALL   STEST(LENY,SY,STY,STY,SFAC)
+               END DO
+            ELSEIF (ICASE.EQ.13) THEN
+*              .. SDSROT ..
+               CALL STEST1 (SDSDOT(N,.1,SX,INCX,SY,INCY),
+     $                 ST7B(KN,KI),SSIZE3(KN),SFAC)
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
+               STOP
+            END IF
+  100    CONTINUE
+  120 CONTINUE
+      RETURN
+      END
+      SUBROUTINE CHECK3(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      REAL              SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      REAL              SC, SS
+      INTEGER           I, K, KI, KN, KSIZE, LENX, LENY, MX, MY
+*     .. Local Arrays ..
+      REAL              COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4),
+     +                  DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5),
+     +                  MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5),
+     +                  MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7),
+     +                  SY(7)
+      INTEGER           INCXS(4), INCYS(4), LENS(4,2), MWPINX(11),
+     +                  MWPINY(11), MWPN(11), NS(4)
+*     .. External Subroutines ..
+      EXTERNAL          SROT, STEST
+*     .. Intrinsic Functions ..
+      INTRINSIC         ABS, MIN
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Data statements ..
+      DATA              INCXS/1, 2, -2, -1/
+      DATA              INCYS/1, -2, 1, -2/
+      DATA              LENS/1, 1, 2, 4, 1, 1, 3, 7/
+      DATA              NS/0, 1, 2, 4/
+      DATA              DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0,
+     +                  -0.4E0/
+      DATA              DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0,
+     +                  0.8E0/
+      DATA              SC, SS/0.8E0, 0.6E0/
+      DATA              DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0,
+     +                  1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0,
+     +                  -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0,
+     +                  -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0,
+     +                  0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0,
+     +                  0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0,
+     +                  0.0E0, 0.0E0, 0.0E0/
+      DATA              DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0,
+     +                  0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0,
+     +                  -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0,
+     +                  0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0,
+     +                  0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0,
+     +                  -0.18E0, 0.2E0, 0.16E0/
+      DATA              SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0,
+     +                  0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0,
+     +                  1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0,
+     +                  1.17E0, 1.17E0, 1.17E0/
+*     .. Executable Statements ..
+*
+      DO 60 KI = 1, 4
+         INCX = INCXS(KI)
+         INCY = INCYS(KI)
+         MX = ABS(INCX)
+         MY = ABS(INCY)
+*
+         DO 40 KN = 1, 4
+            N = NS(KN)
+            KSIZE = MIN(2,KN)
+            LENX = LENS(KN,MX)
+            LENY = LENS(KN,MY)
+*
+            IF (ICASE.EQ.4) THEN
+*              .. SROT ..
+               DO 20 I = 1, 7
+                  SX(I) = DX1(I)
+                  SY(I) = DY1(I)
+                  STX(I) = DT9X(I,KN,KI)
+                  STY(I) = DT9Y(I,KN,KI)
+   20          CONTINUE
+               CALL SROT(N,SX,INCX,SY,INCY,SC,SS)
+               CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC)
+               CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC)
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
+               STOP
+            END IF
+   40    CONTINUE
+   60 CONTINUE
+*
+      MWPC(1) = 1
+      DO 80 I = 2, 11
+         MWPC(I) = 0
+   80 CONTINUE
+      MWPS(1) = 0
+      DO 100 I = 2, 6
+         MWPS(I) = 1
+  100 CONTINUE
+      DO 120 I = 7, 11
+         MWPS(I) = -1
+  120 CONTINUE
+      MWPINX(1) = 1
+      MWPINX(2) = 1
+      MWPINX(3) = 1
+      MWPINX(4) = -1
+      MWPINX(5) = 1
+      MWPINX(6) = -1
+      MWPINX(7) = 1
+      MWPINX(8) = 1
+      MWPINX(9) = -1
+      MWPINX(10) = 1
+      MWPINX(11) = -1
+      MWPINY(1) = 1
+      MWPINY(2) = 1
+      MWPINY(3) = -1
+      MWPINY(4) = -1
+      MWPINY(5) = 2
+      MWPINY(6) = 1
+      MWPINY(7) = 1
+      MWPINY(8) = -1
+      MWPINY(9) = -1
+      MWPINY(10) = 2
+      MWPINY(11) = 1
+      DO 140 I = 1, 11
+         MWPN(I) = 5
+  140 CONTINUE
+      MWPN(5) = 3
+      MWPN(10) = 3
+      DO 160 I = 1, 5
+         MWPX(I) = I
+         MWPY(I) = I
+         MWPTX(1,I) = I
+         MWPTY(1,I) = I
+         MWPTX(2,I) = I
+         MWPTY(2,I) = -I
+         MWPTX(3,I) = 6 - I
+         MWPTY(3,I) = I - 6
+         MWPTX(4,I) = I
+         MWPTY(4,I) = -I
+         MWPTX(6,I) = 6 - I
+         MWPTY(6,I) = I - 6
+         MWPTX(7,I) = -I
+         MWPTY(7,I) = I
+         MWPTX(8,I) = I - 6
+         MWPTY(8,I) = 6 - I
+         MWPTX(9,I) = -I
+         MWPTY(9,I) = I
+         MWPTX(11,I) = I - 6
+         MWPTY(11,I) = 6 - I
+  160 CONTINUE
+      MWPTX(5,1) = 1
+      MWPTX(5,2) = 3
+      MWPTX(5,3) = 5
+      MWPTX(5,4) = 4
+      MWPTX(5,5) = 5
+      MWPTY(5,1) = -1
+      MWPTY(5,2) = 2
+      MWPTY(5,3) = -2
+      MWPTY(5,4) = 4
+      MWPTY(5,5) = -3
+      MWPTX(10,1) = -1
+      MWPTX(10,2) = -3
+      MWPTX(10,3) = -5
+      MWPTX(10,4) = 4
+      MWPTX(10,5) = 5
+      MWPTY(10,1) = 1
+      MWPTY(10,2) = 2
+      MWPTY(10,3) = 2
+      MWPTY(10,4) = 4
+      MWPTY(10,5) = 3
+      DO 200 I = 1, 11
+         INCX = MWPINX(I)
+         INCY = MWPINY(I)
+         DO 180 K = 1, 5
+            COPYX(K) = MWPX(K)
+            COPYY(K) = MWPY(K)
+            MWPSTX(K) = MWPTX(I,K)
+            MWPSTY(K) = MWPTY(I,K)
+  180    CONTINUE
+         CALL SROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I))
+         CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC)
+         CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC)
+  200 CONTINUE
+      RETURN
+      END
+      SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC)
+*     ********************************* STEST **************************
+*
+*     THIS SUBR COMPARES ARRAYS  SCOMP() AND STRUE() OF LENGTH LEN TO
+*     SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE
+*     NEGLIGIBLE.
+*
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      REAL             ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0E0)
+*     .. Scalar Arguments ..
+      REAL             SFAC
+      INTEGER          LEN
+*     .. Array Arguments ..
+      REAL             SCOMP(LEN), SSIZE(LEN), STRUE(LEN)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      REAL             SD
+      INTEGER          I
+*     .. External Functions ..
+      REAL             SDIFF
+      EXTERNAL         SDIFF
+*     .. Intrinsic Functions ..
+      INTRINSIC        ABS
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Executable Statements ..
+*
+      DO 40 I = 1, LEN
+         SD = SCOMP(I) - STRUE(I)
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO))
+     +       GO TO 40
+*
+*                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
+*
+         IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+         PASS = .FALSE.
+         WRITE (NOUT,99999)
+         WRITE (NOUT,99998)
+   20    WRITE (NOUT,99997) ICASE, N, INCX, INCY, I, SCOMP(I),
+     +     STRUE(I), SD, SSIZE(I)
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY  I                            ',
+     +       ' COMP(I)                             TRUE(I)  DIFFERENCE',
+     +       '     SIZE(I)',/1X)
+99997 FORMAT (1X,I4,I3,2I5,I3,2E36.8,2E12.4)
+      END
+      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
+*     ************************* STEST1 *****************************
+*
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
+*     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
+*     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
+*
+*     C.L. LAWSON, JPL, 1978 DEC 6
+*
+*     .. Scalar Arguments ..
+      REAL              SCOMP1, SFAC, STRUE1
+*     .. Array Arguments ..
+      REAL              SSIZE(*)
+*     .. Local Arrays ..
+      REAL              SCOMP(1), STRUE(1)
+*     .. External Subroutines ..
+      EXTERNAL          STEST
+*     .. Executable Statements ..
+*
+      SCOMP(1) = SCOMP1
+      STRUE(1) = STRUE1
+      CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC)
+*
+      RETURN
+      END
+      REAL             FUNCTION SDIFF(SA,SB)
+*     ********************************* SDIFF **************************
+*     COMPUTES DIFFERENCE OF TWO NUMBERS.  C. L. LAWSON, JPL 1974 FEB 15
+*
+*     .. Scalar Arguments ..
+      REAL                            SA, SB
+*     .. Executable Statements ..
+      SDIFF = SA - SB
+      RETURN
+      END
+      SUBROUTINE ITEST1(ICOMP,ITRUE)
+*     ********************************* ITEST1 *************************
+*
+*     THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR
+*     EQUALITY.
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      INTEGER           ICOMP, ITRUE
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      INTEGER           ID
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, PASS
+*     .. Executable Statements ..
+*
+      IF (ICOMP.EQ.ITRUE) GO TO 40
+*
+*                            HERE ICOMP IS NOT EQUAL TO ITRUE.
+*
+      IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+      PASS = .FALSE.
+      WRITE (NOUT,99999)
+      WRITE (NOUT,99998)
+   20 ID = ICOMP - ITRUE
+      WRITE (NOUT,99997) ICASE, N, INCX, INCY, ICOMP, ITRUE, ID
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY                               ',
+     +       ' COMP                                TRUE     DIFFERENCE',
+     +       /1X)
+99997 FORMAT (1X,I4,I3,2I5,2I36,I12)
+      END

diff --git a/blas/testing/sblat2.dat b/blas/testing/sblat2.dat
new file mode 100644
index 0000000..f537d30
--- /dev/null
+++ b/blas/testing/sblat2.dat

@@ -0,0 +1,34 @@
+'sblat2.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'sblat2.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+4                 NUMBER OF VALUES OF K
+0 1 2 4           VALUES OF K
+4                 NUMBER OF VALUES OF INCX AND INCY
+1 2 -1 -2         VALUES OF INCX AND INCY
+3                 NUMBER OF VALUES OF ALPHA
+0.0 1.0 0.7       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+0.0 1.0 0.9       VALUES OF BETA
+SGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+SGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+SSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
+SSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+SSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+STRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+STBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+STPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+STRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+STBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+STPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+SGER   T PUT F FOR NO TEST. SAME COLUMNS.
+SSYR   T PUT F FOR NO TEST. SAME COLUMNS.
+SSPR   T PUT F FOR NO TEST. SAME COLUMNS.
+SSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
+SSPR2  T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/sblat2.f b/blas/testing/sblat2.f
new file mode 100644
index 0000000..71605ed
--- /dev/null
+++ b/blas/testing/sblat2.f

@@ -0,0 +1,3176 @@
+*> \brief \b SBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM SBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the REAL Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 16 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 34 lines:
+*> 'sblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'SBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 0.9       VALUES OF BETA
+*> SGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SGER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup single_blas_testing
+*
+*  =====================================================================
+      PROGRAM SBLAT2
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 16 )
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+      INTEGER            NMAX, INCMAX
+      PARAMETER          ( NMAX = 65, INCMAX = 2 )
+      INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
+      PARAMETER          ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
+     $                   NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      REAL               EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANS
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ),
+     $                   G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( 2*NMAX )
+      INTEGER            IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      REAL               SDIFF
+      LOGICAL            LSE
+      EXTERNAL           SDIFF, LSE
+*     .. External Subroutines ..
+      EXTERNAL           SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6,
+     $                   SCHKE, SMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'SGEMV ', 'SGBMV ', 'SSYMV ', 'SSBMV ',
+     $                   'SSPMV ', 'STRMV ', 'STBMV ', 'STPMV ',
+     $                   'STRSV ', 'STBSV ', 'STPSV ', 'SGER  ',
+     $                   'SSYR  ', 'SSPR  ', 'SSYR2 ', 'SSPR2 '/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 230
+         END IF
+   10 CONTINUE
+*     Values of K
+      READ( NIN, FMT = * )NKB
+      IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'K', NKBMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( KB( I ), I = 1, NKB )
+      DO 20 I = 1, NKB
+         IF( KB( I ).LT.0 )THEN
+            WRITE( NOUT, FMT = 9995 )
+            GO TO 230
+         END IF
+   20 CONTINUE
+*     Values of INCX and INCY
+      READ( NIN, FMT = * )NINC
+      IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( INC( I ), I = 1, NINC )
+      DO 30 I = 1, NINC
+         IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN
+            WRITE( NOUT, FMT = 9994 )INCMAX
+            GO TO 230
+         END IF
+   30 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9993 )
+      WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB )
+      WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC )
+      WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9980 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 40 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   40 CONTINUE
+   50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT
+      DO 60 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 70
+   60 CONTINUE
+      WRITE( NOUT, FMT = 9986 )SNAMET
+      STOP
+   70 LTEST( I ) = LTESTT
+      GO TO 50
+*
+   80 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(ZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of SMVCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 120 J = 1, N
+         DO 110 I = 1, N
+            A( I, J ) = MAX( I - J + 1, 0 )
+  110    CONTINUE
+         X( J ) = J
+         Y( J ) = ZERO
+  120 CONTINUE
+      DO 130 J = 1, N
+         YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+*     YY holds the exact result. On exit from SMVCH YT holds
+*     the result computed by SMVCH.
+      TRANS = 'N'
+      CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LSE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+      TRANS = 'T'
+      CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LSE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 210 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 140, 150, 150, 150, 160, 160,
+     $              160, 160, 160, 160, 170, 180, 180,
+     $              190, 190 )ISNUM
+*           Test SGEMV, 01, and SGBMV, 02.
+  140       CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test SSYMV, 03, SSBMV, 04, and SSPMV, 05.
+  150       CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test STRMV, 06, STBMV, 07, STPMV, 08,
+*           STRSV, 09, STBSV, 10, and STPSV, 11.
+  160       CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z )
+            GO TO 200
+*           Test SGER, 12.
+  170       CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test SSYR, 13, and SSPR, 14.
+  180       CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test SSYR2, 15, and SSPR2, 16.
+  190       CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+*
+  200       IF( FATAL.AND.SFATAL )
+     $         GO TO 220
+         END IF
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9982 )
+      GO TO 240
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9981 )
+      GO TO 240
+*
+  230 CONTINUE
+      WRITE( NOUT, FMT = 9987 )
+*
+  240 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' )
+ 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ',
+     $      I2 )
+ 9993 FORMAT( ' TESTS OF THE REAL             LEVEL 2 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9992 FORMAT( '   FOR N              ', 9I6 )
+ 9991 FORMAT( '   FOR K              ', 7I6 )
+ 9990 FORMAT( '   FOR INCX AND INCY  ', 7I6 )
+ 9989 FORMAT( '   FOR ALPHA          ', 7F6.1 )
+ 9988 FORMAT( '   FOR BETA           ', 7F6.1 )
+ 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9985 FORMAT( ' ERROR IN SMVCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1,
+     $      ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', /
+     $   ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.'
+     $      , /' ******* TESTS ABANDONED *******' )
+ 9984 FORMAT( A6, L2 )
+ 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9982 FORMAT( /' END OF TESTS' )
+ 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of SBLAT2.
+*
+      END
+      SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests SGEMV and SGBMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, HALF
+      PARAMETER          ( ZERO = 0.0, HALF = 0.5 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), G( NMAX ),
+     $                   X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA,
+     $                   LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK,
+     $                   NL, NS
+      LOGICAL            BANDED, FULL, NULL, RESET, SAME, TRAN
+      CHARACTER*1        TRANS, TRANSS
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SGBMV, SGEMV, SMAKE, SMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 11
+      ELSE IF( BANDED )THEN
+         NARGS = 13
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+            IF( BANDED )THEN
+               NK = NKB
+            ELSE
+               NK = 1
+            END IF
+            DO 100 IKU = 1, NK
+               IF( BANDED )THEN
+                  KU = KB( IKU )
+                  KL = MAX( KU - 1, 0 )
+               ELSE
+                  KU = N - 1
+                  KL = M - 1
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               IF( BANDED )THEN
+                  LDA = KL + KU + 1
+               ELSE
+                  LDA = M
+               END IF
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 100
+               LAA = LDA*N
+               NULL = N.LE.0.OR.M.LE.0
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA,
+     $                     LDA, KL, KU, RESET, TRANSL )
+*
+               DO 90 IC = 1, 3
+                  TRANS = ICH( IC: IC )
+                  TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+*
+                  IF( TRAN )THEN
+                     ML = N
+                     NL = M
+                  ELSE
+                     ML = M
+                     NL = N
+                  END IF
+*
+                  DO 80 IX = 1, NINC
+                     INCX = INC( IX )
+                     LX = ABS( INCX )*NL
+*
+*                    Generate the vector X.
+*
+                     TRANSL = HALF
+                     CALL SMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX,
+     $                           ABS( INCX ), 0, NL - 1, RESET, TRANSL )
+                     IF( NL.GT.1 )THEN
+                        X( NL/2 ) = ZERO
+                        XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO
+                     END IF
+*
+                     DO 70 IY = 1, NINC
+                        INCY = INC( IY )
+                        LY = ABS( INCY )*ML
+*
+                        DO 60 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+                           DO 50 IB = 1, NBET
+                              BETA = BET( IB )
+*
+*                             Generate the vector Y.
+*
+                              TRANSL = ZERO
+                              CALL SMAKE( 'GE', ' ', ' ', 1, ML, Y, 1,
+     $                                    YY, ABS( INCY ), 0, ML - 1,
+     $                                    RESET, TRANSL )
+*
+                              NC = NC + 1
+*
+*                             Save every datum before calling the
+*                             subroutine.
+*
+                              TRANSS = TRANS
+                              MS = M
+                              NS = N
+                              KLS = KL
+                              KUS = KU
+                              ALS = ALPHA
+                              DO 10 I = 1, LAA
+                                 AS( I ) = AA( I )
+   10                         CONTINUE
+                              LDAS = LDA
+                              DO 20 I = 1, LX
+                                 XS( I ) = XX( I )
+   20                         CONTINUE
+                              INCXS = INCX
+                              BLS = BETA
+                              DO 30 I = 1, LY
+                                 YS( I ) = YY( I )
+   30                         CONTINUE
+                              INCYS = INCY
+*
+*                             Call the subroutine.
+*
+                              IF( FULL )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                              TRANS, M, N, ALPHA, LDA, INCX, BETA,
+     $                              INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL SGEMV( TRANS, M, N, ALPHA, AA,
+     $                                       LDA, XX, INCX, BETA, YY,
+     $                                       INCY )
+                              ELSE IF( BANDED )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                              TRANS, M, N, KL, KU, ALPHA, LDA,
+     $                              INCX, BETA, INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL SGBMV( TRANS, M, N, KL, KU, ALPHA,
+     $                                       AA, LDA, XX, INCX, BETA,
+     $                                       YY, INCY )
+                              END IF
+*
+*                             Check if error-exit was taken incorrectly.
+*
+                              IF( .NOT.OK )THEN
+                                 WRITE( NOUT, FMT = 9993 )
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+*                             See what data changed inside subroutines.
+*
+                              ISAME( 1 ) = TRANS.EQ.TRANSS
+                              ISAME( 2 ) = MS.EQ.M
+                              ISAME( 3 ) = NS.EQ.N
+                              IF( FULL )THEN
+                                 ISAME( 4 ) = ALS.EQ.ALPHA
+                                 ISAME( 5 ) = LSE( AS, AA, LAA )
+                                 ISAME( 6 ) = LDAS.EQ.LDA
+                                 ISAME( 7 ) = LSE( XS, XX, LX )
+                                 ISAME( 8 ) = INCXS.EQ.INCX
+                                 ISAME( 9 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 10 ) = LSE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 10 ) = LSERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 11 ) = INCYS.EQ.INCY
+                              ELSE IF( BANDED )THEN
+                                 ISAME( 4 ) = KLS.EQ.KL
+                                 ISAME( 5 ) = KUS.EQ.KU
+                                 ISAME( 6 ) = ALS.EQ.ALPHA
+                                 ISAME( 7 ) = LSE( AS, AA, LAA )
+                                 ISAME( 8 ) = LDAS.EQ.LDA
+                                 ISAME( 9 ) = LSE( XS, XX, LX )
+                                 ISAME( 10 ) = INCXS.EQ.INCX
+                                 ISAME( 11 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 12 ) = LSE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 12 ) = LSERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 13 ) = INCYS.EQ.INCY
+                              END IF
+*
+*                             If data was incorrectly changed, report
+*                             and return.
+*
+                              SAME = .TRUE.
+                              DO 40 I = 1, NARGS
+                                 SAME = SAME.AND.ISAME( I )
+                                 IF( .NOT.ISAME( I ) )
+     $                              WRITE( NOUT, FMT = 9998 )I
+   40                         CONTINUE
+                              IF( .NOT.SAME )THEN
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+                              IF( .NOT.NULL )THEN
+*
+*                                Check the result.
+*
+                                 CALL SMVCH( TRANS, M, N, ALPHA, A,
+     $                                       NMAX, X, INCX, BETA, Y,
+     $                                       INCY, YT, G, YY, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                                 ERRMAX = MAX( ERRMAX, ERR )
+*                                If got really bad answer, report and
+*                                return.
+                                 IF( FATAL )
+     $                              GO TO 130
+                              ELSE
+*                                Avoid repeating tests with M.le.0 or
+*                                N.le.0.
+                                 GO TO 110
+                              END IF
+*
+   50                      CONTINUE
+*
+   60                   CONTINUE
+*
+   70                CONTINUE
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 140
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU,
+     $      ALPHA, LDA, INCX, BETA, INCY
+      END IF
+*
+  140 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1,
+     $      ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1,
+     $      ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2,
+     $      ')         .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK1.
+*
+      END
+      SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests SSYMV, SSBMV and SSPMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, HALF
+      PARAMETER          ( ZERO = 0.0, HALF = 0.5 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), G( NMAX ),
+     $                   X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IB, IC, IK, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY,
+     $                   N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMVCH, SSBMV, SSPMV, SSYMV
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'Y'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 10
+      ELSE IF( BANDED )THEN
+         NARGS = 11
+      ELSE IF( PACKED )THEN
+         NARGS = 9
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 IC = 1, 2
+               UPLO = ICH( IC: IC )
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA,
+     $                     LDA, K, K, RESET, TRANSL )
+*
+               DO 80 IX = 1, NINC
+                  INCX = INC( IX )
+                  LX = ABS( INCX )*N
+*
+*                 Generate the vector X.
+*
+                  TRANSL = HALF
+                  CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                        ABS( INCX ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     X( N/2 ) = ZERO
+                     XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 70 IY = 1, NINC
+                     INCY = INC( IY )
+                     LY = ABS( INCY )*N
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the vector Y.
+*
+                           TRANSL = ZERO
+                           CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                                 ABS( INCY ), 0, N - 1, RESET,
+     $                                 TRANSL )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           UPLOS = UPLO
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LX
+                              XS( I ) = XX( I )
+   20                      CONTINUE
+                           INCXS = INCX
+                           BLS = BETA
+                           DO 30 I = 1, LY
+                              YS( I ) = YY( I )
+   30                      CONTINUE
+                           INCYS = INCY
+*
+*                          Call the subroutine.
+*
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, N, ALPHA, LDA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL SSYMV( UPLO, N, ALPHA, AA, LDA, XX,
+     $                                    INCX, BETA, YY, INCY )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, N, K, ALPHA, LDA, INCX, BETA,
+     $                           INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL SSBMV( UPLO, N, K, ALPHA, AA, LDA,
+     $                                    XX, INCX, BETA, YY, INCY )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, N, ALPHA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL SSPMV( UPLO, N, ALPHA, AA, XX, INCX,
+     $                                    BETA, YY, INCY )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9992 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = UPLO.EQ.UPLOS
+                           ISAME( 2 ) = NS.EQ.N
+                           IF( FULL )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LSE( AS, AA, LAA )
+                              ISAME( 5 ) = LDAS.EQ.LDA
+                              ISAME( 6 ) = LSE( XS, XX, LX )
+                              ISAME( 7 ) = INCXS.EQ.INCX
+                              ISAME( 8 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 9 ) = LSE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 9 ) = LSERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 10 ) = INCYS.EQ.INCY
+                           ELSE IF( BANDED )THEN
+                              ISAME( 3 ) = KS.EQ.K
+                              ISAME( 4 ) = ALS.EQ.ALPHA
+                              ISAME( 5 ) = LSE( AS, AA, LAA )
+                              ISAME( 6 ) = LDAS.EQ.LDA
+                              ISAME( 7 ) = LSE( XS, XX, LX )
+                              ISAME( 8 ) = INCXS.EQ.INCX
+                              ISAME( 9 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 10 ) = LSE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 10 ) = LSERES( 'GE', ' ', 1, N,
+     $                                         YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 11 ) = INCYS.EQ.INCY
+                           ELSE IF( PACKED )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LSE( AS, AA, LAA )
+                              ISAME( 5 ) = LSE( XS, XX, LX )
+                              ISAME( 6 ) = INCXS.EQ.INCX
+                              ISAME( 7 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 8 ) = LSE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 8 ) = LSERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 9 ) = INCYS.EQ.INCY
+                           END IF
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X,
+     $                                    INCX, BETA, Y, INCY, YT, G,
+     $                                    YY, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           ELSE
+*                             Avoid repeating tests with N.le.0
+                              GO TO 110
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX,
+     $      BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      BETA, INCY
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP',
+     $      ', X,', I2, ',', F4.1, ', Y,', I2, ')                .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1,
+     $      ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2,
+     $      ')         .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,',
+     $      I3, ', X,', I2, ',', F4.1, ', Y,', I2, ')             .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK2.
+*
+      END
+      SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, XT, G, Z )
+*
+*  Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XT( NMAX ),
+     $                   XX( NMAX*INCMAX ), Z( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      REAL               ERR, ERRMAX, TRANSL
+      INTEGER            I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K,
+     $                   KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHD, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMVCH, STBMV, STBSV, STPMV, STPSV,
+     $                   STRMV, STRSV
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'R'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 8
+      ELSE IF( BANDED )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 7
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*     Set up zero vector for SMVCH.
+      DO 10 I = 1, NMAX
+         Z( I ) = ZERO
+   10 CONTINUE
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 ICU = 1, 2
+               UPLO = ICHU( ICU: ICU )
+*
+               DO 80 ICT = 1, 3
+                  TRANS = ICHT( ICT: ICT )
+*
+                  DO 70 ICD = 1, 2
+                     DIAG = ICHD( ICD: ICD )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL SMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A,
+     $                           NMAX, AA, LDA, K, K, RESET, TRANSL )
+*
+                     DO 60 IX = 1, NINC
+                        INCX = INC( IX )
+                        LX = ABS( INCX )*N
+*
+*                       Generate the vector X.
+*
+                        TRANSL = HALF
+                        CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                              ABS( INCX ), 0, N - 1, RESET,
+     $                              TRANSL )
+                        IF( N.GT.1 )THEN
+                           X( N/2 ) = ZERO
+                           XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                        END IF
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        DIAGS = DIAG
+                        NS = N
+                        KS = K
+                        DO 20 I = 1, LAA
+                           AS( I ) = AA( I )
+   20                   CONTINUE
+                        LDAS = LDA
+                        DO 30 I = 1, LX
+                           XS( I ) = XX( I )
+   30                   CONTINUE
+                        INCXS = INCX
+*
+*                       Call the subroutine.
+*
+                        IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STRMV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STBMV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STPMV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STRSV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STBSV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STPSV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLO.EQ.UPLOS
+                        ISAME( 2 ) = TRANS.EQ.TRANSS
+                        ISAME( 3 ) = DIAG.EQ.DIAGS
+                        ISAME( 4 ) = NS.EQ.N
+                        IF( FULL )THEN
+                           ISAME( 5 ) = LSE( AS, AA, LAA )
+                           ISAME( 6 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 7 ) = LSE( XS, XX, LX )
+                           ELSE
+                              ISAME( 7 ) = LSERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 8 ) = INCXS.EQ.INCX
+                        ELSE IF( BANDED )THEN
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = LSE( AS, AA, LAA )
+                           ISAME( 7 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 8 ) = LSE( XS, XX, LX )
+                           ELSE
+                              ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 9 ) = INCXS.EQ.INCX
+                        ELSE IF( PACKED )THEN
+                           ISAME( 5 ) = LSE( AS, AA, LAA )
+                           IF( NULL )THEN
+                              ISAME( 6 ) = LSE( XS, XX, LX )
+                           ELSE
+                              ISAME( 6 ) = LSERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 7 ) = INCXS.EQ.INCX
+                        END IF
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+                           IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+*
+*                             Check the result.
+*
+                              CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X,
+     $                                    INCX, ZERO, Z, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+*
+*                             Compute approximation to original vector.
+*
+                              DO 50 I = 1, N
+                                 Z( I ) = XX( 1 + ( I - 1 )*
+     $                                    ABS( INCX ) )
+                                 XX( 1 + ( I - 1 )*ABS( INCX ) )
+     $                              = X( I )
+   50                         CONTINUE
+                              CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z,
+     $                                    INCX, ZERO, X, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .FALSE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 120
+                        ELSE
+*                          Avoid repeating tests with N.le.0.
+                           GO TO 110
+                        END IF
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA,
+     $      INCX
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K,
+     $      LDA, INCX
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ',
+     $      'X,', I2, ')                        .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ),
+     $      ' A,', I3, ', X,', I2, ')                 .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,',
+     $      I3, ', X,', I2, ')                     .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK3.
+*
+      END
+      SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests SGER.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS,
+     $                   NC, ND, NS
+      LOGICAL            NULL, RESET, SAME
+*     .. Local Arrays ..
+      REAL               W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SGER, SMAKE, SMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     Define the number of arguments.
+      NARGS = 9
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+*           Set LDA to 1 more than minimum value if room.
+            LDA = M
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 110
+            LAA = LDA*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 100 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*M
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL SMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ),
+     $                     0, M - 1, RESET, TRANSL )
+               IF( M.GT.1 )THEN
+                  X( M/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 90 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 80 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX,
+     $                           AA, LDA, M - 1, N - 1, RESET, TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     MS = M
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N,
+     $                  ALPHA, INCX, INCY, LDA
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL SGER( M, N, ALPHA, XX, INCX, YY, INCY, AA,
+     $                          LDA )
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9993 )
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+*                    See what data changed inside subroutine.
+*
+                     ISAME( 1 ) = MS.EQ.M
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LSE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LSE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LSE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LSERES( 'GE', ' ', M, N, AS, AA,
+     $                               LDA )
+                     END IF
+                     ISAME( 9 ) = LDAS.EQ.LDA
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, M
+                              Z( I ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, M
+                              Z( I ) = X( M - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        DO 70 J = 1, N
+                           IF( INCY.GT.0 )THEN
+                              W( 1 ) = Y( J )
+                           ELSE
+                              W( 1 ) = Y( N - J + 1 )
+                           END IF
+                           CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1,
+     $                                 ONE, A( 1, J ), 1, YT, G,
+     $                                 AA( 1 + ( J - 1 )*LDA ), EPS,
+     $                                 ERR, FATAL, NOUT, .TRUE. )
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 130
+   70                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with M.le.0 or N.le.0.
+                        GO TO 110
+                     END IF
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 150
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  140 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA
+*
+  150 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2,
+     $      ', Y,', I2, ', A,', I3, ')                  .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK4.
+*
+      END
+      SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests SSYR and SSPR.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA,
+     $                   LDA, LDAS, LJ, LX, N, NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      REAL               W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMVCH, SSPR, SSYR
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'Y'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 7
+      ELSE IF( PACKED )THEN
+         NARGS = 6
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 100
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 90 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 80 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 70 IA = 1, NALF
+                  ALPHA = ALF( IA )
+                  NULL = N.LE.0.OR.ALPHA.EQ.ZERO
+*
+*                 Generate the matrix A.
+*
+                  TRANSL = ZERO
+                  CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX,
+     $                        AA, LDA, N - 1, N - 1, RESET, TRANSL )
+*
+                  NC = NC + 1
+*
+*                 Save every datum before calling the subroutine.
+*
+                  UPLOS = UPLO
+                  NS = N
+                  ALS = ALPHA
+                  DO 10 I = 1, LAA
+                     AS( I ) = AA( I )
+   10             CONTINUE
+                  LDAS = LDA
+                  DO 20 I = 1, LX
+                     XS( I ) = XX( I )
+   20             CONTINUE
+                  INCXS = INCX
+*
+*                 Call the subroutine.
+*
+                  IF( FULL )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                  ALPHA, INCX, LDA
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL SSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA )
+                  ELSE IF( PACKED )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                  ALPHA, INCX
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL SSPR( UPLO, N, ALPHA, XX, INCX, AA )
+                  END IF
+*
+*                 Check if error-exit was taken incorrectly.
+*
+                  IF( .NOT.OK )THEN
+                     WRITE( NOUT, FMT = 9992 )
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+*                 See what data changed inside subroutines.
+*
+                  ISAME( 1 ) = UPLO.EQ.UPLOS
+                  ISAME( 2 ) = NS.EQ.N
+                  ISAME( 3 ) = ALS.EQ.ALPHA
+                  ISAME( 4 ) = LSE( XS, XX, LX )
+                  ISAME( 5 ) = INCXS.EQ.INCX
+                  IF( NULL )THEN
+                     ISAME( 6 ) = LSE( AS, AA, LAA )
+                  ELSE
+                     ISAME( 6 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, AS,
+     $                            AA, LDA )
+                  END IF
+                  IF( .NOT.PACKED )THEN
+                     ISAME( 7 ) = LDAS.EQ.LDA
+                  END IF
+*
+*                 If data was incorrectly changed, report and return.
+*
+                  SAME = .TRUE.
+                  DO 30 I = 1, NARGS
+                     SAME = SAME.AND.ISAME( I )
+                     IF( .NOT.ISAME( I ) )
+     $                  WRITE( NOUT, FMT = 9998 )I
+   30             CONTINUE
+                  IF( .NOT.SAME )THEN
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+                  IF( .NOT.NULL )THEN
+*
+*                    Check the result column by column.
+*
+                     IF( INCX.GT.0 )THEN
+                        DO 40 I = 1, N
+                           Z( I ) = X( I )
+   40                   CONTINUE
+                     ELSE
+                        DO 50 I = 1, N
+                           Z( I ) = X( N - I + 1 )
+   50                   CONTINUE
+                     END IF
+                     JA = 1
+                     DO 60 J = 1, N
+                        W( 1 ) = Z( J )
+                        IF( UPPER )THEN
+                           JJ = 1
+                           LJ = J
+                        ELSE
+                           JJ = J
+                           LJ = N - J + 1
+                        END IF
+                        CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W,
+     $                              1, ONE, A( JJ, J ), 1, YT, G,
+     $                              AA( JA ), EPS, ERR, FATAL, NOUT,
+     $                              .TRUE. )
+                        IF( FULL )THEN
+                           IF( UPPER )THEN
+                              JA = JA + LDA
+                           ELSE
+                              JA = JA + LDA + 1
+                           END IF
+                        ELSE
+                           JA = JA + LJ
+                        END IF
+                        ERRMAX = MAX( ERRMAX, ERR )
+*                       If got really bad answer, report and return.
+                        IF( FATAL )
+     $                     GO TO 110
+   60                CONTINUE
+                  ELSE
+*                    Avoid repeating tests if N.le.0.
+                     IF( N.LE.0 )
+     $                  GO TO 100
+                  END IF
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', AP)                           .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', A,', I3, ')                        .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK5.
+*
+      END
+      SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests SSYR2 and SSPR2.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, HALF, ONE
+      PARAMETER          ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), G( NMAX ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX, 2 )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, ERR, ERRMAX, TRANSL
+      INTEGER            I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N,
+     $                   NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      REAL               W( 2 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMVCH, SSPR2, SSYR2
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'Y'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 8
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 140 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 140
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 130 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 120 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 110 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 100 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     NULL = N.LE.0.OR.ALPHA.EQ.ZERO
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A,
+     $                           NMAX, AA, LDA, N - 1, N - 1, RESET,
+     $                           TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     UPLOS = UPLO
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( FULL )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY, LDA
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL SSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA, LDA )
+                     ELSE IF( PACKED )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL SSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA )
+                     END IF
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9992 )
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+*                    See what data changed inside subroutines.
+*
+                     ISAME( 1 ) = UPLO.EQ.UPLOS
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LSE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LSE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LSE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N,
+     $                               AS, AA, LDA )
+                     END IF
+                     IF( .NOT.PACKED )THEN
+                        ISAME( 9 ) = LDAS.EQ.LDA
+                     END IF
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, N
+                              Z( I, 1 ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, N
+                              Z( I, 1 ) = X( N - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        IF( INCY.GT.0 )THEN
+                           DO 70 I = 1, N
+                              Z( I, 2 ) = Y( I )
+   70                      CONTINUE
+                        ELSE
+                           DO 80 I = 1, N
+                              Z( I, 2 ) = Y( N - I + 1 )
+   80                      CONTINUE
+                        END IF
+                        JA = 1
+                        DO 90 J = 1, N
+                           W( 1 ) = Z( J, 2 )
+                           W( 2 ) = Z( J, 1 )
+                           IF( UPPER )THEN
+                              JJ = 1
+                              LJ = J
+                           ELSE
+                              JJ = J
+                              LJ = N - J + 1
+                           END IF
+                           CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ),
+     $                                 NMAX, W, 1, ONE, A( JJ, J ), 1,
+     $                                 YT, G, AA( JA ), EPS, ERR, FATAL,
+     $                                 NOUT, .TRUE. )
+                           IF( FULL )THEN
+                              IF( UPPER )THEN
+                                 JA = JA + LDA
+                              ELSE
+                                 JA = JA + LDA + 1
+                              END IF
+                           ELSE
+                              JA = JA + LJ
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 150
+   90                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with N.le.0.
+                        IF( N.LE.0 )
+     $                     GO TO 140
+                     END IF
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 170
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  160 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      INCY, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY
+      END IF
+*
+  170 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', Y,', I2, ', AP)                     .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', Y,', I2, ', A,', I3, ')                  .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK6.
+*
+      END
+      SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 2 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  ALPHA, BETA, A, X and Y should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Local Scalars ..
+      REAL               ALPHA, BETA
+*     .. Local Arrays ..
+      REAL               A( 1, 1 ), X( 1 ), Y( 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CHKXER, SGBMV, SGEMV, SGER, SSBMV, SSPMV, SSPR,
+     $                   SSPR2, SSYMV, SSYR, SSYR2, STBMV, STBSV, STPMV,
+     $                   STPSV, STRMV, STRSV
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+      GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+     $        90, 100, 110, 120, 130, 140, 150,
+     $        160 )ISNUM
+   10 INFOT = 1
+      CALL SGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL SGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   20 INFOT = 1
+      CALL SGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   30 INFOT = 1
+      CALL SSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   40 INFOT = 1
+      CALL SSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL SSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   50 INFOT = 1
+      CALL SSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL SSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   60 INFOT = 1
+      CALL STRMV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STRMV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STRMV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STRMV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL STRMV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   70 INFOT = 1
+      CALL STBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL STBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   80 INFOT = 1
+      CALL STPMV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STPMV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STPMV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STPMV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL STPMV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+   90 INFOT = 1
+      CALL STRSV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STRSV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STRSV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STRSV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL STRSV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  100 INFOT = 1
+      CALL STBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL STBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  110 INFOT = 1
+      CALL STPSV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STPSV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STPSV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STPSV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL STPSV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  120 INFOT = 1
+      CALL SGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  130 INFOT = 1
+      CALL SSYR( '/', 0, ALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSYR( 'U', -1, ALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SSYR( 'U', 0, ALPHA, X, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYR( 'U', 2, ALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  140 INFOT = 1
+      CALL SSPR( '/', 0, ALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSPR( 'U', -1, ALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SSPR( 'U', 0, ALPHA, X, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  150 INFOT = 1
+      CALL SSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 170
+  160 INFOT = 1
+      CALL SSPR2( '/', 0, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+  170 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of SCHKE.
+*
+      END
+      SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL,
+     $                  KU, RESET, TRANSL )
+*
+*  Generates values for an M by N matrix A within the bandwidth
+*  defined by KL and KU.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+      REAL               ROGUE
+      PARAMETER          ( ROGUE = -1.0E10 )
+*     .. Scalar Arguments ..
+      REAL               TRANSL
+      INTEGER            KL, KU, LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      REAL               A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, I1, I2, I3, IBEG, IEND, IOFF, J, KK
+      LOGICAL            GEN, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      REAL               SBEG
+      EXTERNAL           SBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Executable Statements ..
+      GEN = TYPE( 1: 1 ).EQ.'G'
+      SYM = TYPE( 1: 1 ).EQ.'S'
+      TRI = TYPE( 1: 1 ).EQ.'T'
+      UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               IF( ( I.LE.J.AND.J - I.LE.KU ).OR.
+     $             ( I.GE.J.AND.I - J.LE.KL ) )THEN
+                  A( I, J ) = SBEG( RESET ) + TRANSL
+               ELSE
+                  A( I, J ) = ZERO
+               END IF
+               IF( I.NE.J )THEN
+                  IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'GB' )THEN
+         DO 90 J = 1, N
+            DO 60 I1 = 1, KU + 1 - J
+               AA( I1 + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J )
+               AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J )
+   70       CONTINUE
+            DO 80 I3 = I2, LDA
+               AA( I3 + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+   90    CONTINUE
+      ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+         DO 130 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 100 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  100       CONTINUE
+            DO 110 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+  110       CONTINUE
+            DO 120 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  120       CONTINUE
+  130    CONTINUE
+      ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN
+         DO 170 J = 1, N
+            IF( UPPER )THEN
+               KK = KL + 1
+               IBEG = MAX( 1, KL + 2 - J )
+               IF( UNIT )THEN
+                  IEND = KL
+               ELSE
+                  IEND = KL + 1
+               END IF
+            ELSE
+               KK = 1
+               IF( UNIT )THEN
+                  IBEG = 2
+               ELSE
+                  IBEG = 1
+               END IF
+               IEND = MIN( KL + 1, 1 + M - J )
+            END IF
+            DO 140 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  140       CONTINUE
+            DO 150 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J )
+  150       CONTINUE
+            DO 160 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  160       CONTINUE
+  170    CONTINUE
+      ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN
+         IOFF = 0
+         DO 190 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 180 I = IBEG, IEND
+               IOFF = IOFF + 1
+               AA( IOFF ) = A( I, J )
+               IF( I.EQ.J )THEN
+                  IF( UNIT )
+     $               AA( IOFF ) = ROGUE
+               END IF
+  180       CONTINUE
+  190    CONTINUE
+      END IF
+      RETURN
+*
+*     End of SMAKE.
+*
+      END
+      SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y,
+     $                  INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA, EPS, ERR
+      INTEGER            INCX, INCY, M, N, NMAX, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      REAL               A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ),
+     $                   YY( * )
+*     .. Local Scalars ..
+      REAL               ERRI
+      INTEGER            I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL
+      LOGICAL            TRAN
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, SQRT
+*     .. Executable Statements ..
+      TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+      IF( TRAN )THEN
+         ML = N
+         NL = M
+      ELSE
+         ML = M
+         NL = N
+      END IF
+      IF( INCX.LT.0 )THEN
+         KX = NL
+         INCXL = -1
+      ELSE
+         KX = 1
+         INCXL = 1
+      END IF
+      IF( INCY.LT.0 )THEN
+         KY = ML
+         INCYL = -1
+      ELSE
+         KY = 1
+         INCYL = 1
+      END IF
+*
+*     Compute expected result in YT using data in A, X and Y.
+*     Compute gauges in G.
+*
+      IY = KY
+      DO 30 I = 1, ML
+         YT( IY ) = ZERO
+         G( IY ) = ZERO
+         JX = KX
+         IF( TRAN )THEN
+            DO 10 J = 1, NL
+               YT( IY ) = YT( IY ) + A( J, I )*X( JX )
+               G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) )
+               JX = JX + INCXL
+   10       CONTINUE
+         ELSE
+            DO 20 J = 1, NL
+               YT( IY ) = YT( IY ) + A( I, J )*X( JX )
+               G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) )
+               JX = JX + INCXL
+   20       CONTINUE
+         END IF
+         YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY )
+         G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) )
+         IY = IY + INCYL
+   30 CONTINUE
+*
+*     Compute the error ratio for this result.
+*
+      ERR = ZERO
+      DO 40 I = 1, ML
+         ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS
+         IF( G( I ).NE.ZERO )
+     $      ERRI = ERRI/G( I )
+         ERR = MAX( ERR, ERRI )
+         IF( ERR*SQRT( EPS ).GE.ONE )
+     $      GO TO 50
+   40 CONTINUE
+*     If the loop completes, all results are at least half accurate.
+      GO TO 70
+*
+*     Report fatal error.
+*
+   50 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 60 I = 1, ML
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, YT( I ),
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I, 
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I)
+         END IF
+   60 CONTINUE
+*
+   70 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'           EXPECTED RESULT   COMPU',
+     $      'TED RESULT' )
+ 9998 FORMAT( 1X, I7, 2G18.6 )
+*
+*     End of SMVCH.
+*
+      END
+      LOGICAL FUNCTION LSE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      REAL               RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LSE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LSE = .FALSE.
+   30 RETURN
+*
+*     End of LSE.
+*
+      END
+      LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE', 'SY' or 'SP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      REAL               AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LSERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LSERES = .FALSE.
+   80 RETURN
+*
+*     End of LSERES.
+*
+      END
+      REAL FUNCTION SBEG( RESET )
+*
+*  Generates random numbers uniformly distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, MI
+*     .. Save statement ..
+      SAVE               I, IC, MI
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         I = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I is bounded between 1 and 999.
+*     If initial I = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I = 4 or 8, the period will be 25.
+*     If initial I = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      I = I - 1000*( I/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      SBEG = REAL( I - 500 )/1001.0
+      RETURN
+*
+*     End of SBEG.
+*
+      END
+      REAL FUNCTION SDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y
+*     .. Executable Statements ..
+      SDIFF = X - Y
+      RETURN
+*
+*     End of SDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 2 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 2 BLAS routines.
+*
+*  It is called by the Level 2 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/sblat3.dat b/blas/testing/sblat3.dat
new file mode 100644
index 0000000..680e736
--- /dev/null
+++ b/blas/testing/sblat3.dat

@@ -0,0 +1,20 @@
+'sblat3.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'sblat3.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+3                 NUMBER OF VALUES OF ALPHA
+0.0 1.0 0.7       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+0.0 1.0 1.3       VALUES OF BETA
+SGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+SSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+STRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+STRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+SSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+SSYR2K T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/sblat3.f b/blas/testing/sblat3.f
new file mode 100644
index 0000000..8792696
--- /dev/null
+++ b/blas/testing/sblat3.f

@@ -0,0 +1,2873 @@
+*> \brief \b SBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM SBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the REAL             Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 6 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 20 lines:
+*> 'sblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'SBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> 0.0 1.0 0.7       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> 0.0 1.0 1.3       VALUES OF BETA
+*> SGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> STRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup single_blas_testing
+*
+*  =====================================================================
+      PROGRAM SBLAT3
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 6 )
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 65 )
+      INTEGER            NIDMAX, NALMAX, NBEMAX
+      PARAMETER          ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      REAL               EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANSA, TRANSB
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      REAL               AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBEMAX ),
+     $                   BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   G( NMAX ), W( 2*NMAX )
+      INTEGER            IDIM( NIDMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      REAL               SDIFF
+      LOGICAL            LSE
+      EXTERNAL           SDIFF, LSE
+*     .. External Subroutines ..
+      EXTERNAL           SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHKE, SMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'SGEMM ', 'SSYMM ', 'STRMM ', 'STRSM ',
+     $                   'SSYRK ', 'SSYR2K'/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 220
+         END IF
+   10 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9995 )
+      WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9984 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 20 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   20 CONTINUE
+   30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+      DO 40 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 50
+   40 CONTINUE
+      WRITE( NOUT, FMT = 9990 )SNAMET
+      STOP
+   50 LTEST( I ) = LTESTT
+      GO TO 30
+*
+   60 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(ZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of SMMCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 100 J = 1, N
+         DO 90 I = 1, N
+            AB( I, J ) = MAX( I - J + 1, 0 )
+   90    CONTINUE
+         AB( J, NMAX + 1 ) = J
+         AB( 1, NMAX + J ) = J
+         C( J, 1 ) = ZERO
+  100 CONTINUE
+      DO 110 J = 1, N
+         CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  110 CONTINUE
+*     CC holds the exact result. On exit from SMMCH CT holds
+*     the result computed by SMMCH.
+      TRANSA = 'N'
+      TRANSB = 'N'
+      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LSE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'T'
+      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LSE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      DO 120 J = 1, N
+         AB( J, NMAX + 1 ) = N - J + 1
+         AB( 1, NMAX + J ) = N - J + 1
+  120 CONTINUE
+      DO 130 J = 1, N
+         CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+     $                     ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+      TRANSA = 'T'
+      TRANSB = 'N'
+      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LSE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'T'
+      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LSE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 200 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM
+*           Test SGEMM, 01.
+  140       CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test SSYMM, 02.
+  150       CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test STRMM, 03, STRSM, 04.
+  160       CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+     $                  AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C )
+            GO TO 190
+*           Test SSYRK, 05.
+  170       CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test SSYR2K, 06.
+  180       CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+            GO TO 190
+*
+  190       IF( FATAL.AND.SFATAL )
+     $         GO TO 210
+         END IF
+  200 CONTINUE
+      WRITE( NOUT, FMT = 9986 )
+      GO TO 230
+*
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9985 )
+      GO TO 230
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9991 )
+*
+  230 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' TESTS OF THE REAL             LEVEL 3 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( '   FOR N              ', 9I6 )
+ 9993 FORMAT( '   FOR ALPHA          ', 7F6.1 )
+ 9992 FORMAT( '   FOR BETA           ', 7F6.1 )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9989 FORMAT( ' ERROR IN SMMCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1,
+     $      ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+     $      'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+     $      'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+     $      '*******' )
+ 9988 FORMAT( A6, L2 )
+ 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of SBLAT3.
+*
+      END
+      SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests SGEMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, BETA, BLS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+     $                   LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+     $                   MA, MB, MS, N, NA, NARGS, NB, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRANA, TRANB
+      CHARACTER*1        TRANAS, TRANBS, TRANSA, TRANSB
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SMAKE, SMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+*
+      NARGS = 13
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 110 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 100 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 100
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 90 IK = 1, NIDIM
+               K = IDIM( IK )
+*
+               DO 80 ICA = 1, 3
+                  TRANSA = ICH( ICA: ICA )
+                  TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+                  IF( TRANA )THEN
+                     MA = K
+                     NA = M
+                  ELSE
+                     MA = M
+                     NA = K
+                  END IF
+*                 Set LDA to 1 more than minimum value if room.
+                  LDA = MA
+                  IF( LDA.LT.NMAX )
+     $               LDA = LDA + 1
+*                 Skip tests if not enough room.
+                  IF( LDA.GT.NMAX )
+     $               GO TO 80
+                  LAA = LDA*NA
+*
+*                 Generate the matrix A.
+*
+                  CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 70 ICB = 1, 3
+                     TRANSB = ICH( ICB: ICB )
+                     TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+                     IF( TRANB )THEN
+                        MB = N
+                        NB = K
+                     ELSE
+                        MB = K
+                        NB = N
+                     END IF
+*                    Set LDB to 1 more than minimum value if room.
+                     LDB = MB
+                     IF( LDB.LT.NMAX )
+     $                  LDB = LDB + 1
+*                    Skip tests if not enough room.
+                     IF( LDB.GT.NMAX )
+     $                  GO TO 70
+                     LBB = LDB*NB
+*
+*                    Generate the matrix B.
+*
+                     CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB,
+     $                           LDB, RESET, ZERO )
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the matrix C.
+*
+                           CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX,
+     $                                 CC, LDC, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           TRANAS = TRANSA
+                           TRANBS = TRANSB
+                           MS = M
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LBB
+                              BS( I ) = BB( I )
+   20                      CONTINUE
+                           LDBS = LDB
+                           BLS = BETA
+                           DO 30 I = 1, LCC
+                              CS( I ) = CC( I )
+   30                      CONTINUE
+                           LDCS = LDC
+*
+*                          Call the subroutine.
+*
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                        TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB,
+     $                        BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL SGEMM( TRANSA, TRANSB, M, N, K, ALPHA,
+     $                                 AA, LDA, BB, LDB, BETA, CC, LDC )
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = TRANSA.EQ.TRANAS
+                           ISAME( 2 ) = TRANSB.EQ.TRANBS
+                           ISAME( 3 ) = MS.EQ.M
+                           ISAME( 4 ) = NS.EQ.N
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = ALS.EQ.ALPHA
+                           ISAME( 7 ) = LSE( AS, AA, LAA )
+                           ISAME( 8 ) = LDAS.EQ.LDA
+                           ISAME( 9 ) = LSE( BS, BB, LBB )
+                           ISAME( 10 ) = LDBS.EQ.LDB
+                           ISAME( 11 ) = BLS.EQ.BETA
+                           IF( NULL )THEN
+                              ISAME( 12 ) = LSE( CS, CC, LCC )
+                           ELSE
+                              ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS,
+     $                                      CC, LDC )
+                           END IF
+                           ISAME( 13 ) = LDCS.EQ.LDC
+*
+*                          If data was incorrectly changed, report
+*                          and return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL SMMCH( TRANSA, TRANSB, M, N, K,
+     $                                    ALPHA, A, NMAX, B, NMAX, BETA,
+     $                                    C, NMAX, CT, G, CC, LDC, EPS,
+     $                                    ERR, FATAL, NOUT, .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K,
+     $   ALPHA, LDA, LDB, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',',
+     $      3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ',
+     $      'C,', I3, ').' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK1.
+*
+      END
+      SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests SSYMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, BETA, BLS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+     $                   LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        SIDE, SIDES, UPLO, UPLOS
+      CHARACTER*2        ICHS, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMMCH, SSYMM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHS/'LR'/, ICHU/'UL'/
+*     .. Executable Statements ..
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 100 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 90 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 90
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 90
+            LBB = LDB*N
+*
+*           Generate the matrix B.
+*
+            CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+     $                  ZERO )
+*
+            DO 80 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+*
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+*                 Generate the symmetric matrix A.
+*
+                  CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the
+*                       subroutine.
+*
+                        SIDES = SIDE
+                        UPLOS = UPLO
+                        MS = M
+                        NS = N
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BLS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE,
+     $                     UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL SSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                              BB, LDB, BETA, CC, LDC )
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9994 )
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = SIDES.EQ.SIDE
+                        ISAME( 2 ) = UPLOS.EQ.UPLO
+                        ISAME( 3 ) = MS.EQ.M
+                        ISAME( 4 ) = NS.EQ.N
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LSE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LSE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BLS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LSE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result.
+*
+                           IF( LEFT )THEN
+                              CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A,
+     $                                    NMAX, B, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           ELSE
+                              CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B,
+     $                                    NMAX, A, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and
+*                          return.
+                           IF( FATAL )
+     $                        GO TO 110
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 120
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA,
+     $   LDB, BETA, LDC
+*
+  120 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ')   ',
+     $      ' .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK2.
+*
+      END
+      SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+     $                  B, BB, BS, CT, G, C )
+*
+*  Tests STRMM and STRSM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, ERR, ERRMAX
+      INTEGER            I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+     $                   LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+     $                   NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+     $                   UPLOS
+      CHARACTER*2        ICHD, ICHS, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMMCH, STRMM, STRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+*     .. Executable Statements ..
+*
+      NARGS = 11
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*     Set up zero matrix for SMMCH.
+      DO 20 J = 1, NMAX
+         DO 10 I = 1, NMAX
+            C( I, J ) = ZERO
+   10    CONTINUE
+   20 CONTINUE
+*
+      DO 140 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 130 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 130
+            LBB = LDB*N
+            NULL = M.LE.0.OR.N.LE.0
+*
+            DO 120 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 130
+               LAA = LDA*NA
+*
+               DO 110 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+                  DO 100 ICT = 1, 3
+                     TRANSA = ICHT( ICT: ICT )
+*
+                     DO 90 ICD = 1, 2
+                        DIAG = ICHD( ICD: ICD )
+*
+                        DO 80 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+*                          Generate the matrix A.
+*
+                           CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A,
+     $                                 NMAX, AA, LDA, RESET, ZERO )
+*
+*                          Generate the matrix B.
+*
+                           CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX,
+     $                                 BB, LDB, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           SIDES = SIDE
+                           UPLOS = UPLO
+                           TRANAS = TRANSA
+                           DIAGS = DIAG
+                           MS = M
+                           NS = N
+                           ALS = ALPHA
+                           DO 30 I = 1, LAA
+                              AS( I ) = AA( I )
+   30                      CONTINUE
+                           LDAS = LDA
+                           DO 40 I = 1, LBB
+                              BS( I ) = BB( I )
+   40                      CONTINUE
+                           LDBS = LDB
+*
+*                          Call the subroutine.
+*
+                           IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STRMM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL STRSM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = SIDES.EQ.SIDE
+                           ISAME( 2 ) = UPLOS.EQ.UPLO
+                           ISAME( 3 ) = TRANAS.EQ.TRANSA
+                           ISAME( 4 ) = DIAGS.EQ.DIAG
+                           ISAME( 5 ) = MS.EQ.M
+                           ISAME( 6 ) = NS.EQ.N
+                           ISAME( 7 ) = ALS.EQ.ALPHA
+                           ISAME( 8 ) = LSE( AS, AA, LAA )
+                           ISAME( 9 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 10 ) = LSE( BS, BB, LBB )
+                           ELSE
+                              ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS,
+     $                                      BB, LDB )
+                           END IF
+                           ISAME( 11 ) = LDBS.EQ.LDB
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 50 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   50                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+                              IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+*
+*                                Check the result.
+*
+                                 IF( LEFT )THEN
+                                    CALL SMMCH( TRANSA, 'N', M, N, M,
+     $                                          ALPHA, A, NMAX, B, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 ELSE
+                                    CALL SMMCH( 'N', TRANSA, M, N, N,
+     $                                          ALPHA, B, NMAX, A, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 END IF
+                              ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+*
+*                                Compute approximation to original
+*                                matrix.
+*
+                                 DO 70 J = 1, N
+                                    DO 60 I = 1, M
+                                       C( I, J ) = BB( I + ( J - 1 )*
+     $                                             LDB )
+                                       BB( I + ( J - 1 )*LDB ) = ALPHA*
+     $                                    B( I, J )
+   60                               CONTINUE
+   70                            CONTINUE
+*
+                                 IF( LEFT )THEN
+                                    CALL SMMCH( TRANSA, 'N', M, N, M,
+     $                                          ONE, A, NMAX, C, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 ELSE
+                                    CALL SMMCH( 'N', TRANSA, M, N, N,
+     $                                          ONE, C, NMAX, A, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 END IF
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 150
+                           END IF
+*
+   80                   CONTINUE
+*
+   90                CONTINUE
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M,
+     $   N, ALPHA, LDA, LDB
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ', B,', I3, ')        .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK3.
+*
+      END
+      SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests SSYRK.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX ), G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, BETA, BETS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+     $                   LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMMCH, SSYRK
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NTC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+*
+      NARGS = 10
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 100
+         LCC = LDC*N
+         NULL = N.LE.0
+*
+         DO 90 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 80 ICT = 1, 3
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                     RESET, ZERO )
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        BETS = BETA
+                        DO 20 I = 1, LCC
+                           CS( I ) = CC( I )
+   20                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                     TRANS, N, K, ALPHA, LDA, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL SSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA,
+     $                              BETA, CC, LDC )
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9993 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LSE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = BETS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 9 ) = LSE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS,
+     $                                  CC, LDC )
+                        END IF
+                        ISAME( 10 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 30 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   30                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           JC = 1
+                           DO 40 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA,
+     $                                       A( 1, JJ ), NMAX,
+     $                                       A( 1, J ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA,
+     $                                       A( JJ, 1 ), NMAX,
+     $                                       A( J, 1 ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 110
+   40                      CONTINUE
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $   LDA, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')           .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK4.
+*
+      END
+      SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+*
+*  Tests SSYR2K.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0 )
+*     .. Scalar Arguments ..
+      REAL               EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      REAL               AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+     $                   ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+     $                   BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   G( NMAX ), W( 2*NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      REAL               ALPHA, ALS, BETA, BETS, ERR, ERRMAX
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+     $                   K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+     $                   LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LSE, LSERES
+      EXTERNAL           LSE, LSERES
+*     .. External Subroutines ..
+      EXTERNAL           SMAKE, SMMCH, SSYR2K
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NTC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = ZERO
+*
+      DO 130 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 130
+         LCC = LDC*N
+         NULL = N.LE.0
+*
+         DO 120 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 110 ICT = 1, 3
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 110
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               IF( TRAN )THEN
+                  CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+     $                        LDA, RESET, ZERO )
+               ELSE
+                  CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+               END IF
+*
+*              Generate the matrix B.
+*
+               LDB = LDA
+               LBB = LAA
+               IF( TRAN )THEN
+                  CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ),
+     $                        2*NMAX, BB, LDB, RESET, ZERO )
+               ELSE
+                  CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+     $                        NMAX, BB, LDB, RESET, ZERO )
+               END IF
+*
+               DO 100 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 90 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 80 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BETS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                     TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL SSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA,
+     $                               BB, LDB, BETA, CC, LDC )
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9993 )
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LSE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LSE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BETS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LSE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           JJAB = 1
+                           JC = 1
+                           DO 70 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 DO 50 I = 1, K
+                                    W( I ) = AB( ( J - 1 )*2*NMAX + K +
+     $                                       I )
+                                    W( K + I ) = AB( ( J - 1 )*2*NMAX +
+     $                                           I )
+   50                            CONTINUE
+                                 CALL SMMCH( 'T', 'N', LJ, 1, 2*K,
+     $                                       ALPHA, AB( JJAB ), 2*NMAX,
+     $                                       W, 2*NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 DO 60 I = 1, K
+                                    W( I ) = AB( ( K + I - 1 )*NMAX +
+     $                                       J )
+                                    W( K + I ) = AB( ( I - 1 )*NMAX +
+     $                                           J )
+   60                            CONTINUE
+                                 CALL SMMCH( 'N', 'N', LJ, 1, 2*K,
+     $                                       ALPHA, AB( JJ ), NMAX, W,
+     $                                       2*NMAX, BETA, C( JJ, J ),
+     $                                       NMAX, CT, G, CC( JC ), LDC,
+     $                                       EPS, ERR, FATAL, NOUT,
+     $                                       .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                                 IF( TRAN )
+     $                              JJAB = JJAB + 2*NMAX
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 140
+   70                      CONTINUE
+                        END IF
+*
+   80                CONTINUE
+*
+   90             CONTINUE
+*
+  100          CONTINUE
+*
+  110       CONTINUE
+*
+  120    CONTINUE
+*
+  130 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $   LDA, LDB, BETA, LDC
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ')   ',
+     $      ' .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of SCHK5.
+*
+      END
+      SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 3 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  A, B and C should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  3-19-92:  Initialize ALPHA and BETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to SSYMM with INFOT = 9  (eca)
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0E0, TWO = 2.0E0 )
+*     .. Local Scalars ..
+      REAL               ALPHA, BETA
+*     .. Local Arrays ..
+      REAL               A( 2, 1 ), B( 2, 1 ), C( 2, 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CHKXER, SGEMM, SSYMM, SSYR2K, SSYRK, STRMM,
+     $                   STRSM
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+*
+*     Initialize ALPHA and BETA.
+*
+      ALPHA = ONE
+      BETA = TWO
+*
+      GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM
+   10 INFOT = 1
+      CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 1
+      CALL SGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL SGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL SGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL SGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL SGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   20 INFOT = 1
+      CALL SSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   30 INFOT = 1
+      CALL STRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   40 INFOT = 1
+      CALL STRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL STRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL STRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL STRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL STRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL STRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL STRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL STRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   50 INFOT = 1
+      CALL SSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL SSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 70
+   60 INFOT = 1
+      CALL SSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL SSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL SSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL SSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL SSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+   70 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of SCHKE.
+*
+      END
+      SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+     $                  TRANSL )
+*
+*  Generates values for an M by N matrix A.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'SY' or 'TR'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+      REAL               ROGUE
+      PARAMETER          ( ROGUE = -1.0E10 )
+*     .. Scalar Arguments ..
+      REAL               TRANSL
+      INTEGER            LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      REAL               A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            GEN, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      REAL               SBEG
+      EXTERNAL           SBEG
+*     .. Executable Statements ..
+      GEN = TYPE.EQ.'GE'
+      SYM = TYPE.EQ.'SY'
+      TRI = TYPE.EQ.'TR'
+      UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               A( I, J ) = SBEG( RESET ) + TRANSL
+               IF( I.NE.J )THEN
+*                 Set some elements to zero
+                  IF( N.GT.3.AND.J.EQ.N/2 )
+     $               A( I, J ) = ZERO
+                  IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+         DO 90 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 60 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   70       CONTINUE
+            DO 80 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+   90    CONTINUE
+      END IF
+      RETURN
+*
+*     End of SMAKE.
+*
+      END
+      SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+     $                  BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+     $                  NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0, ONE = 1.0 )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA, EPS, ERR
+      INTEGER            KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANSA, TRANSB
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * ), C( LDC, * ),
+     $                   CC( LDCC, * ), CT( * ), G( * )
+*     .. Local Scalars ..
+      REAL               ERRI
+      INTEGER            I, J, K
+      LOGICAL            TRANA, TRANB
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, SQRT
+*     .. Executable Statements ..
+      TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+      TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+*     Compute expected result, one column at a time, in CT using data
+*     in A, B and C.
+*     Compute gauges in G.
+*
+      DO 120 J = 1, N
+*
+         DO 10 I = 1, M
+            CT( I ) = ZERO
+            G( I ) = ZERO
+   10    CONTINUE
+         IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+            DO 30 K = 1, KK
+               DO 20 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( K, J )
+                  G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) )
+   20          CONTINUE
+   30       CONTINUE
+         ELSE IF( TRANA.AND..NOT.TRANB )THEN
+            DO 50 K = 1, KK
+               DO 40 I = 1, M
+                  CT( I ) = CT( I ) + A( K, I )*B( K, J )
+                  G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) )
+   40          CONTINUE
+   50       CONTINUE
+         ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+            DO 70 K = 1, KK
+               DO 60 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( J, K )
+                  G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) )
+   60          CONTINUE
+   70       CONTINUE
+         ELSE IF( TRANA.AND.TRANB )THEN
+            DO 90 K = 1, KK
+               DO 80 I = 1, M
+                  CT( I ) = CT( I ) + A( K, I )*B( J, K )
+                  G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) )
+   80          CONTINUE
+   90       CONTINUE
+         END IF
+         DO 100 I = 1, M
+            CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+            G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) )
+  100    CONTINUE
+*
+*        Compute the error ratio for this result.
+*
+         ERR = ZERO
+         DO 110 I = 1, M
+            ERRI = ABS( CT( I ) - CC( I, J ) )/EPS
+            IF( G( I ).NE.ZERO )
+     $         ERRI = ERRI/G( I )
+            ERR = MAX( ERR, ERRI )
+            IF( ERR*SQRT( EPS ).GE.ONE )
+     $         GO TO 130
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     If the loop completes, all results are at least half accurate.
+      GO TO 150
+*
+*     Report fatal error.
+*
+  130 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 140 I = 1, M
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+         END IF
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9997 )J
+*
+  150 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'           EXPECTED RESULT   COMPU',
+     $      'TED RESULT' )
+ 9998 FORMAT( 1X, I7, 2G18.6 )
+ 9997 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+*     End of SMMCH.
+*
+      END
+      LOGICAL FUNCTION LSE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      REAL               RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LSE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LSE = .FALSE.
+   30 RETURN
+*
+*     End of LSE.
+*
+      END
+      LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE' or 'SY'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      REAL               AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LSERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LSERES = .FALSE.
+   80 RETURN
+*
+*     End of LSERES.
+*
+      END
+      REAL FUNCTION SBEG( RESET )
+*
+*  Generates random numbers uniformly distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, MI
+*     .. Save statement ..
+      SAVE               I, IC, MI
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         I = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I is bounded between 1 and 999.
+*     If initial I = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I = 4 or 8, the period will be 25.
+*     If initial I = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      I = I - 1000*( I/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      SBEG = ( I - 500 )/1001.0
+      RETURN
+*
+*     End of SBEG.
+*
+      END
+      REAL FUNCTION SDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y
+*     .. Executable Statements ..
+      SDIFF = X - Y
+      RETURN
+*
+*     End of SDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 3 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 3 BLAS routines.
+*
+*  It is called by the Level 3 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f
new file mode 100644
index 0000000..c00b67d
--- /dev/null
+++ b/blas/testing/zblat1.f

@@ -0,0 +1,724 @@
+*> \brief \b ZBLAT1
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM ZBLAT1
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>    Test program for the COMPLEX*16 Level 1 BLAS.
+*>
+*>    Based upon the original BLAS test routine together with:
+*>    F06GAF Example Program Text
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
+      PROGRAM ZBLAT1
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, MODE, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION SFAC
+      INTEGER          IC
+*     .. External Subroutines ..
+      EXTERNAL         CHECK1, CHECK2, HEADER
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA             SFAC/9.765625D-4/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999)
+      DO 20 IC = 1, 10
+         ICASE = IC
+         CALL HEADER
+*
+*        Initialize PASS, INCX, INCY, and MODE for a new case.
+*        The value 9999 for INCX, INCY or MODE will appear in the
+*        detailed  output, if any, for cases that do not involve
+*        these parameters.
+*
+         PASS = .TRUE.
+         INCX = 9999
+         INCY = 9999
+         MODE = 9999
+         IF (ICASE.LE.5) THEN
+            CALL CHECK2(SFAC)
+         ELSE IF (ICASE.GE.6) THEN
+            CALL CHECK1(SFAC)
+         END IF
+*        -- Print
+         IF (PASS) WRITE (NOUT,99998)
+   20 CONTINUE
+      STOP
+*
+99999 FORMAT (' Complex BLAS Test Program Results',/1X)
+99998 FORMAT ('                                    ----- PASS -----')
+      END
+      SUBROUTINE HEADER
+*     .. Parameters ..
+      INTEGER          NOUT
+      PARAMETER        (NOUT=6)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, MODE, N
+      LOGICAL          PASS
+*     .. Local Arrays ..
+      CHARACTER*6      L(10)
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA             L(1)/'ZDOTC '/
+      DATA             L(2)/'ZDOTU '/
+      DATA             L(3)/'ZAXPY '/
+      DATA             L(4)/'ZCOPY '/
+      DATA             L(5)/'ZSWAP '/
+      DATA             L(6)/'DZNRM2'/
+      DATA             L(7)/'DZASUM'/
+      DATA             L(8)/'ZSCAL '/
+      DATA             L(9)/'ZDSCAL'/
+      DATA             L(10)/'IZAMAX'/
+*     .. Executable Statements ..
+      WRITE (NOUT,99999) ICASE, L(ICASE)
+      RETURN
+*
+99999 FORMAT (/' Test of subprogram number',I3,12X,A6)
+      END
+      SUBROUTINE CHECK1(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, MODE, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      COMPLEX*16        CA
+      DOUBLE PRECISION  SA
+      INTEGER           I, J, LEN, NP1
+*     .. Local Arrays ..
+      COMPLEX*16        CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8),
+     +                  MWPCS(5), MWPCT(5)
+      DOUBLE PRECISION  STRUE2(5), STRUE4(5)
+      INTEGER           ITRUE3(5)
+*     .. External Functions ..
+      DOUBLE PRECISION  DZASUM, DZNRM2
+      INTEGER           IZAMAX
+      EXTERNAL          DZASUM, DZNRM2, IZAMAX
+*     .. External Subroutines ..
+      EXTERNAL          ZSCAL, ZDSCAL, CTEST, ITEST1, STEST1
+*     .. Intrinsic Functions ..
+      INTRINSIC         MAX
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA              SA, CA/0.3D0, (0.4D0,-0.7D0)/
+      DATA              ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0),
+     +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
+     +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
+     +                  (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0),
+     +                  (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0),
+     +                  (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0),
+     +                  (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0),
+     +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0),
+     +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0),
+     +                  (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0),
+     +                  (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
+     +                  (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0),
+     +                  (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0),
+     +                  (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/
+      DATA              ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0),
+     +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
+     +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
+     +                  (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0),
+     +                  (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0),
+     +                  (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0),
+     +                  (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0),
+     +                  (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0),
+     +                  (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0),
+     +                  (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0),
+     +                  (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0),
+     +                  (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0),
+     +                  (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0),
+     +                  (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/
+      DATA              STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/
+      DATA              STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/
+      DATA              ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0),
+     +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
+     +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
+     +                  (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0),
+     +                  (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0),
+     +                  (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0),
+     +                  (-0.17D0,-0.19D0), (0.13D0,-0.39D0),
+     +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0),
+     +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0),
+     +                  (0.11D0,-0.03D0), (-0.17D0,0.46D0),
+     +                  (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
+     +                  (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
+     +                  (0.19D0,-0.17D0), (0.20D0,-0.35D0),
+     +                  (0.35D0,0.20D0), (0.14D0,0.08D0),
+     +                  (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0),
+     +                  (2.0D0,3.0D0)/
+      DATA              ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0),
+     +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
+     +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
+     +                  (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0),
+     +                  (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0),
+     +                  (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0),
+     +                  (-0.17D0,-0.19D0), (8.0D0,9.0D0),
+     +                  (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0),
+     +                  (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0),
+     +                  (0.11D0,-0.03D0), (3.0D0,6.0D0),
+     +                  (-0.17D0,0.46D0), (4.0D0,7.0D0),
+     +                  (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0),
+     +                  (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0),
+     +                  (0.20D0,-0.35D0), (6.0D0,9.0D0),
+     +                  (0.35D0,0.20D0), (8.0D0,3.0D0),
+     +                  (0.14D0,0.08D0), (9.0D0,4.0D0)/
+      DATA              ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0),
+     +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
+     +                  (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0),
+     +                  (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0),
+     +                  (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0),
+     +                  (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0),
+     +                  (0.03D0,-0.09D0), (0.15D0,-0.03D0),
+     +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0),
+     +                  (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0),
+     +                  (0.03D0,0.03D0), (-0.18D0,0.03D0),
+     +                  (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
+     +                  (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0),
+     +                  (0.09D0,0.03D0), (0.15D0,0.00D0),
+     +                  (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0),
+     +                  (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/
+      DATA              ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0),
+     +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
+     +                  (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0),
+     +                  (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0),
+     +                  (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0),
+     +                  (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0),
+     +                  (0.03D0,-0.09D0), (8.0D0,9.0D0),
+     +                  (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0),
+     +                  (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0),
+     +                  (0.03D0,0.03D0), (3.0D0,6.0D0),
+     +                  (-0.18D0,0.03D0), (4.0D0,7.0D0),
+     +                  (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0),
+     +                  (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0),
+     +                  (0.15D0,0.00D0), (6.0D0,9.0D0), (0.00D0,0.15D0),
+     +                  (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/
+      DATA              ITRUE3/0, 1, 2, 2, 2/
+*     .. Executable Statements ..
+      DO 60 INCX = 1, 2
+         DO 40 NP1 = 1, 5
+            N = NP1 - 1
+            LEN = 2*MAX(N,1)
+*           .. Set vector arguments ..
+            DO 20 I = 1, LEN
+               CX(I) = CV(I,NP1,INCX)
+   20       CONTINUE
+            IF (ICASE.EQ.6) THEN
+*              .. DZNRM2 ..
+               CALL STEST1(DZNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1),
+     +                     SFAC)
+            ELSE IF (ICASE.EQ.7) THEN
+*              .. DZASUM ..
+               CALL STEST1(DZASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1),
+     +                     SFAC)
+            ELSE IF (ICASE.EQ.8) THEN
+*              .. ZSCAL ..
+               CALL ZSCAL(N,CA,CX,INCX)
+               CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
+     +                    SFAC)
+            ELSE IF (ICASE.EQ.9) THEN
+*              .. ZDSCAL ..
+               CALL ZDSCAL(N,SA,CX,INCX)
+               CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX),
+     +                    SFAC)
+            ELSE IF (ICASE.EQ.10) THEN
+*              .. IZAMAX ..
+               CALL ITEST1(IZAMAX(N,CX,INCX),ITRUE3(NP1))
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
+               STOP
+            END IF
+*
+   40    CONTINUE
+   60 CONTINUE
+*
+      INCX = 1
+      IF (ICASE.EQ.8) THEN
+*        ZSCAL
+*        Add a test for alpha equal to zero.
+         CA = (0.0D0,0.0D0)
+         DO 80 I = 1, 5
+            MWPCT(I) = (0.0D0,0.0D0)
+            MWPCS(I) = (1.0D0,1.0D0)
+   80    CONTINUE
+         CALL ZSCAL(5,CA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+      ELSE IF (ICASE.EQ.9) THEN
+*        ZDSCAL
+*        Add a test for alpha equal to zero.
+         SA = 0.0D0
+         DO 100 I = 1, 5
+            MWPCT(I) = (0.0D0,0.0D0)
+            MWPCS(I) = (1.0D0,1.0D0)
+  100    CONTINUE
+         CALL ZDSCAL(5,SA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+*        Add a test for alpha equal to one.
+         SA = 1.0D0
+         DO 120 I = 1, 5
+            MWPCT(I) = CX(I)
+            MWPCS(I) = CX(I)
+  120    CONTINUE
+         CALL ZDSCAL(5,SA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+*        Add a test for alpha equal to minus one.
+         SA = -1.0D0
+         DO 140 I = 1, 5
+            MWPCT(I) = -CX(I)
+            MWPCS(I) = -CX(I)
+  140    CONTINUE
+         CALL ZDSCAL(5,SA,CX,INCX)
+         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
+      END IF
+      RETURN
+      END
+      SUBROUTINE CHECK2(SFAC)
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SFAC
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, MODE, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      COMPLEX*16        CA
+      INTEGER           I, J, KI, KN, KSIZE, LENX, LENY, MX, MY
+*     .. Local Arrays ..
+      COMPLEX*16        CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14),
+     +                  CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4),
+     +                  CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7)
+      INTEGER           INCXS(4), INCYS(4), LENS(4,2), NS(4)
+*     .. External Functions ..
+      COMPLEX*16        ZDOTC, ZDOTU
+      EXTERNAL          ZDOTC, ZDOTU
+*     .. External Subroutines ..
+      EXTERNAL          ZAXPY, ZCOPY, ZSWAP, CTEST
+*     .. Intrinsic Functions ..
+      INTRINSIC         ABS, MIN
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Data statements ..
+      DATA              CA/(0.4D0,-0.7D0)/
+      DATA              INCXS/1, 2, -2, -1/
+      DATA              INCYS/1, -2, 1, -2/
+      DATA              LENS/1, 1, 2, 4, 1, 1, 3, 7/
+      DATA              NS/0, 1, 2, 4/
+      DATA              CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0),
+     +                  (-0.1D0,-0.9D0), (0.2D0,-0.8D0),
+     +                  (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/
+      DATA              CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0),
+     +                  (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0),
+     +                  (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/
+      DATA              ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.32D0,-1.41D0),
+     +                  (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.32D0,-1.41D0), (-1.55D0,0.5D0),
+     +                  (0.03D0,-0.89D0), (-0.38D0,-0.96D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/
+      DATA              ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (-0.07D0,-0.89D0),
+     +                  (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.78D0,0.06D0), (-0.9D0,0.5D0),
+     +                  (0.06D0,-0.13D0), (0.1D0,-0.5D0),
+     +                  (-0.77D0,-0.49D0), (-0.5D0,-0.3D0),
+     +                  (0.52D0,-1.51D0)/
+      DATA              ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (-0.07D0,-0.89D0),
+     +                  (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.78D0,0.06D0), (-1.54D0,0.97D0),
+     +                  (0.03D0,-0.89D0), (-0.18D0,-1.31D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/
+      DATA              ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0),
+     +                  (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0),
+     +                  (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0),
+     +                  (-0.77D0,-0.49D0), (-0.5D0,-0.3D0),
+     +                  (0.32D0,-1.16D0)/
+      DATA              CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0),
+     +                  (0.65D0,-0.47D0), (-0.34D0,-1.22D0),
+     +                  (0.0D0,0.0D0), (-0.06D0,-0.90D0),
+     +                  (-0.59D0,-1.46D0), (-1.04D0,-0.04D0),
+     +                  (0.0D0,0.0D0), (-0.06D0,-0.90D0),
+     +                  (-0.83D0,0.59D0), (0.07D0,-0.37D0),
+     +                  (0.0D0,0.0D0), (-0.06D0,-0.90D0),
+     +                  (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/
+      DATA              CT6/(0.0D0,0.0D0), (0.90D0,0.06D0),
+     +                  (0.91D0,-0.77D0), (1.80D0,-0.10D0),
+     +                  (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0),
+     +                  (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0),
+     +                  (-0.55D0,0.23D0), (0.83D0,-0.39D0),
+     +                  (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0),
+     +                  (1.95D0,1.22D0)/
+      DATA              ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0),
+     +                  (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/
+      DATA              ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0),
+     +                  (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0),
+     +                  (-0.4D0,-0.7D0), (-0.1D0,-0.2D0),
+     +                  (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0),
+     +                  (0.6D0,-0.6D0)/
+      DATA              ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0),
+     +                  (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0),
+     +                  (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0),
+     +                  (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/
+      DATA              ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0),
+     +                  (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/
+      DATA              ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0),
+     +                  (-0.4D0,-0.7D0), (-0.1D0,-0.9D0),
+     +                  (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0)/
+      DATA              ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0),
+     +                  (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0),
+     +                  (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0),
+     +                  (-0.1D0,-0.9D0), (-0.5D0,-0.3D0),
+     +                  (0.7D0,-0.8D0)/
+      DATA              ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0),
+     +                  (-0.9D0,-0.4D0), (-0.1D0,-0.9D0),
+     +                  (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0)/
+      DATA              ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0),
+     +                  (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0),
+     +                  (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0),
+     +                  (-0.1D0,-0.9D0), (-0.5D0,-0.3D0),
+     +                  (0.2D0,-0.8D0)/
+      DATA              CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0),
+     +                  (1.63D0,1.73D0), (2.90D0,2.78D0)/
+      DATA              CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0),
+     +                  (1.17D0,1.17D0), (1.17D0,1.17D0),
+     +                  (1.17D0,1.17D0), (1.17D0,1.17D0),
+     +                  (1.17D0,1.17D0), (1.17D0,1.17D0)/
+      DATA              CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0),
+     +                  (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0),
+     +                  (1.54D0,1.54D0), (1.54D0,1.54D0),
+     +                  (1.54D0,1.54D0), (1.54D0,1.54D0),
+     +                  (1.54D0,1.54D0), (1.54D0,1.54D0)/
+*     .. Executable Statements ..
+      DO 60 KI = 1, 4
+         INCX = INCXS(KI)
+         INCY = INCYS(KI)
+         MX = ABS(INCX)
+         MY = ABS(INCY)
+*
+         DO 40 KN = 1, 4
+            N = NS(KN)
+            KSIZE = MIN(2,KN)
+            LENX = LENS(KN,MX)
+            LENY = LENS(KN,MY)
+*           .. initialize all argument arrays ..
+            DO 20 I = 1, 7
+               CX(I) = CX1(I)
+               CY(I) = CY1(I)
+   20       CONTINUE
+            IF (ICASE.EQ.1) THEN
+*              .. ZDOTC ..
+               CDOT(1) = ZDOTC(N,CX,INCX,CY,INCY)
+               CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC)
+            ELSE IF (ICASE.EQ.2) THEN
+*              .. ZDOTU ..
+               CDOT(1) = ZDOTU(N,CX,INCX,CY,INCY)
+               CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC)
+            ELSE IF (ICASE.EQ.3) THEN
+*              .. ZAXPY ..
+               CALL ZAXPY(N,CA,CX,INCX,CY,INCY)
+               CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC)
+            ELSE IF (ICASE.EQ.4) THEN
+*              .. ZCOPY ..
+               CALL ZCOPY(N,CX,INCX,CY,INCY)
+               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
+            ELSE IF (ICASE.EQ.5) THEN
+*              .. ZSWAP ..
+               CALL ZSWAP(N,CX,INCX,CY,INCY)
+               CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0)
+               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
+            ELSE
+               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
+               STOP
+            END IF
+*
+   40    CONTINUE
+   60 CONTINUE
+      RETURN
+      END
+      SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC)
+*     ********************************* STEST **************************
+*
+*     THIS SUBR COMPARES ARRAYS  SCOMP() AND STRUE() OF LENGTH LEN TO
+*     SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE
+*     NEGLIGIBLE.
+*
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER          NOUT
+      DOUBLE PRECISION ZERO
+      PARAMETER        (NOUT=6, ZERO=0.0D0)
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION SFAC
+      INTEGER          LEN
+*     .. Array Arguments ..
+      DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN)
+*     .. Scalars in Common ..
+      INTEGER          ICASE, INCX, INCY, MODE, N
+      LOGICAL          PASS
+*     .. Local Scalars ..
+      DOUBLE PRECISION SD
+      INTEGER          I
+*     .. External Functions ..
+      DOUBLE PRECISION SDIFF
+      EXTERNAL         SDIFF
+*     .. Intrinsic Functions ..
+      INTRINSIC        ABS
+*     .. Common blocks ..
+      COMMON           /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Executable Statements ..
+*
+      DO 40 I = 1, LEN
+         SD = SCOMP(I) - STRUE(I)
+         IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO))
+     +       GO TO 40
+*
+*                             HERE    SCOMP(I) IS NOT CLOSE TO STRUE(I).
+*
+         IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+         PASS = .FALSE.
+         WRITE (NOUT,99999)
+         WRITE (NOUT,99998)
+   20    WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I),
+     +     STRUE(I), SD, SSIZE(I)
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY MODE  I                            ',
+     +       ' COMP(I)                             TRUE(I)  DIFFERENCE',
+     +       '     SIZE(I)',/1X)
+99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4)
+      END
+      SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
+*     ************************* STEST1 *****************************
+*
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
+*     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
+*     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
+*
+*     C.L. LAWSON, JPL, 1978 DEC 6
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION  SCOMP1, SFAC, STRUE1
+*     .. Array Arguments ..
+      DOUBLE PRECISION  SSIZE(*)
+*     .. Local Arrays ..
+      DOUBLE PRECISION  SCOMP(1), STRUE(1)
+*     .. External Subroutines ..
+      EXTERNAL          STEST
+*     .. Executable Statements ..
+*
+      SCOMP(1) = SCOMP1
+      STRUE(1) = STRUE1
+      CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC)
+*
+      RETURN
+      END
+      DOUBLE PRECISION FUNCTION SDIFF(SA,SB)
+*     ********************************* SDIFF **************************
+*     COMPUTES DIFFERENCE OF TWO NUMBERS.  C. L. LAWSON, JPL 1974 FEB 15
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION                SA, SB
+*     .. Executable Statements ..
+      SDIFF = SA - SB
+      RETURN
+      END
+      SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC)
+*     **************************** CTEST *****************************
+*
+*     C.L. LAWSON, JPL, 1978 DEC 6
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION SFAC
+      INTEGER          LEN
+*     .. Array Arguments ..
+      COMPLEX*16       CCOMP(LEN), CSIZE(LEN), CTRUE(LEN)
+*     .. Local Scalars ..
+      INTEGER          I
+*     .. Local Arrays ..
+      DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20)
+*     .. External Subroutines ..
+      EXTERNAL         STEST
+*     .. Intrinsic Functions ..
+      INTRINSIC        DIMAG, DBLE
+*     .. Executable Statements ..
+      DO 20 I = 1, LEN
+         SCOMP(2*I-1) = DBLE(CCOMP(I))
+         SCOMP(2*I) = DIMAG(CCOMP(I))
+         STRUE(2*I-1) = DBLE(CTRUE(I))
+         STRUE(2*I) = DIMAG(CTRUE(I))
+         SSIZE(2*I-1) = DBLE(CSIZE(I))
+         SSIZE(2*I) = DIMAG(CSIZE(I))
+   20 CONTINUE
+*
+      CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC)
+      RETURN
+      END
+      SUBROUTINE ITEST1(ICOMP,ITRUE)
+*     ********************************* ITEST1 *************************
+*
+*     THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR
+*     EQUALITY.
+*     C. L. LAWSON, JPL, 1974 DEC 10
+*
+*     .. Parameters ..
+      INTEGER           NOUT
+      PARAMETER         (NOUT=6)
+*     .. Scalar Arguments ..
+      INTEGER           ICOMP, ITRUE
+*     .. Scalars in Common ..
+      INTEGER           ICASE, INCX, INCY, MODE, N
+      LOGICAL           PASS
+*     .. Local Scalars ..
+      INTEGER           ID
+*     .. Common blocks ..
+      COMMON            /COMBLA/ICASE, N, INCX, INCY, MODE, PASS
+*     .. Executable Statements ..
+      IF (ICOMP.EQ.ITRUE) GO TO 40
+*
+*                            HERE ICOMP IS NOT EQUAL TO ITRUE.
+*
+      IF ( .NOT. PASS) GO TO 20
+*                             PRINT FAIL MESSAGE AND HEADER.
+      PASS = .FALSE.
+      WRITE (NOUT,99999)
+      WRITE (NOUT,99998)
+   20 ID = ICOMP - ITRUE
+      WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID
+   40 CONTINUE
+      RETURN
+*
+99999 FORMAT ('                                       FAIL')
+99998 FORMAT (/' CASE  N INCX INCY MODE                               ',
+     +       ' COMP                                TRUE     DIFFERENCE',
+     +       /1X)
+99997 FORMAT (1X,I4,I3,3I5,2I36,I12)
+      END

diff --git a/blas/testing/zblat2.dat b/blas/testing/zblat2.dat
new file mode 100644
index 0000000..c922440
--- /dev/null
+++ b/blas/testing/zblat2.dat

@@ -0,0 +1,35 @@
+'zblat2.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'cbla2t.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+4                 NUMBER OF VALUES OF K
+0 1 2 4           VALUES OF K
+4                 NUMBER OF VALUES OF INCX AND INCY
+1 2 -1 -2         VALUES OF INCX AND INCY
+3                 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+ZGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+ZGERC  T PUT F FOR NO TEST. SAME COLUMNS.
+ZGERU  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHER   T PUT F FOR NO TEST. SAME COLUMNS.
+ZHPR   T PUT F FOR NO TEST. SAME COLUMNS.
+ZHER2  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHPR2  T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/zblat2.f b/blas/testing/zblat2.f
new file mode 100644
index 0000000..53129a1
--- /dev/null
+++ b/blas/testing/zblat2.f

@@ -0,0 +1,3287 @@
+*> \brief \b ZBLAT2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM ZBLAT2
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX*16       Level 2 Blas.
+*>
+*> The program must be driven by a short data file. The first 18 records
+*> of the file are read using list-directed input, the last 17 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 35 lines:
+*> 'zblat2.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'CBLA2T.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 4                 NUMBER OF VALUES OF K
+*> 0 1 2 4           VALUES OF K
+*> 4                 NUMBER OF VALUES OF INCX AND INCY
+*> 1 2 -1 -2         VALUES OF INCX AND INCY
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> ZGEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZGBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHEMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTBMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTPMV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTBSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTPSV  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZGERC  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZGERU  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHER   T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHPR   T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHER2  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHPR2  T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> Further Details
+*> ===============
+*>
+*>    See:
+*>
+*>       Dongarra J. J., Du Croz J. J., Hammarling S.  and Hanson R. J..
+*>       An  extended  set of Fortran  Basic Linear Algebra Subprograms.
+*>
+*>       Technical  Memoranda  Nos. 41 (revision 3) and 81,  Mathematics
+*>       and  Computer Science  Division,  Argonne  National Laboratory,
+*>       9700 South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*>       Or
+*>
+*>       NAG  Technical Reports TR3/87 and TR4/87,  Numerical Algorithms
+*>       Group  Ltd.,  NAG  Central  Office,  256  Banbury  Road, Oxford
+*>       OX2 7DE, UK,  and  Numerical Algorithms Group Inc.,  1101  31st
+*>       Street,  Suite 100,  Downers Grove,  Illinois 60515-1263,  USA.
+*>
+*>
+*> -- Written on 10-August-1987.
+*>    Richard Hanson, Sandia National Labs.
+*>    Jeremy Du Croz, NAG Central Office.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
+      PROGRAM ZBLAT2
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 17 )
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+      INTEGER            NMAX, INCMAX
+      PARAMETER          ( NMAX = 65, INCMAX = 2 )
+      INTEGER            NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
+      PARAMETER          ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
+     $                   NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANS
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ),
+     $                   X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( 2*NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      DOUBLE PRECISION   DDIFF
+      LOGICAL            LZE
+      EXTERNAL           DDIFF, LZE
+*     .. External Subroutines ..
+      EXTERNAL           ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6,
+     $                   ZCHKE, ZMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'ZGEMV ', 'ZGBMV ', 'ZHEMV ', 'ZHBMV ',
+     $                   'ZHPMV ', 'ZTRMV ', 'ZTBMV ', 'ZTPMV ',
+     $                   'ZTRSV ', 'ZTBSV ', 'ZTPSV ', 'ZGERC ',
+     $                   'ZGERU ', 'ZHER  ', 'ZHPR  ', 'ZHER2 ',
+     $                   'ZHPR2 '/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 230
+         END IF
+   10 CONTINUE
+*     Values of K
+      READ( NIN, FMT = * )NKB
+      IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'K', NKBMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( KB( I ), I = 1, NKB )
+      DO 20 I = 1, NKB
+         IF( KB( I ).LT.0 )THEN
+            WRITE( NOUT, FMT = 9995 )
+            GO TO 230
+         END IF
+   20 CONTINUE
+*     Values of INCX and INCY
+      READ( NIN, FMT = * )NINC
+      IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( INC( I ), I = 1, NINC )
+      DO 30 I = 1, NINC
+         IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN
+            WRITE( NOUT, FMT = 9994 )INCMAX
+            GO TO 230
+         END IF
+   30 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 230
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9993 )
+      WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB )
+      WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC )
+      WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9980 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 40 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   40 CONTINUE
+   50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT
+      DO 60 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 70
+   60 CONTINUE
+      WRITE( NOUT, FMT = 9986 )SNAMET
+      STOP
+   70 LTEST( I ) = LTESTT
+      GO TO 50
+*
+   80 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(RZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of ZMVCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 120 J = 1, N
+         DO 110 I = 1, N
+            A( I, J ) = MAX( I - J + 1, 0 )
+  110    CONTINUE
+         X( J ) = J
+         Y( J ) = ZERO
+  120 CONTINUE
+      DO 130 J = 1, N
+         YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+*     YY holds the exact result. On exit from ZMVCH YT holds
+*     the result computed by ZMVCH.
+      TRANS = 'N'
+      CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+      TRANS = 'T'
+      CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
+     $            YY, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( YY, YT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 210 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 140, 150, 150, 150, 160, 160,
+     $              160, 160, 160, 160, 170, 170, 180,
+     $              180, 190, 190 )ISNUM
+*           Test ZGEMV, 01, and ZGBMV, 02.
+  140       CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05.
+  150       CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF,
+     $                  NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS,
+     $                  X, XX, XS, Y, YY, YS, YT, G )
+            GO TO 200
+*           Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08,
+*           ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11.
+  160       CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z )
+            GO TO 200
+*           Test ZGERC, 12, ZGERU, 13.
+  170       CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test ZHER, 14, and ZHPR, 15.
+  180       CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+            GO TO 200
+*           Test ZHER2, 16, and ZHPR2, 17.
+  190       CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC,
+     $                  NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS,
+     $                  YT, G, Z )
+*
+  200       IF( FATAL.AND.SFATAL )
+     $         GO TO 220
+         END IF
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9982 )
+      GO TO 240
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9981 )
+      GO TO 240
+*
+  230 CONTINUE
+      WRITE( NOUT, FMT = 9987 )
+*
+  240 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' )
+ 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ',
+     $      I2 )
+ 9993 FORMAT( ' TESTS OF THE COMPLEX*16       LEVEL 2 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9992 FORMAT( '   FOR N              ', 9I6 )
+ 9991 FORMAT( '   FOR K              ', 7I6 )
+ 9990 FORMAT( '   FOR INCX AND INCY  ', 7I6 )
+ 9989 FORMAT( '   FOR ALPHA          ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9988 FORMAT( '   FOR BETA           ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9985 FORMAT( ' ERROR IN ZMVCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' ZMVCH WAS CALLED WITH TRANS = ', A1,
+     $      ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', /
+     $   ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.'
+     $      , /' ******* TESTS ABANDONED *******' )
+ 9984 FORMAT( A6, L2 )
+ 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9982 FORMAT( /' END OF TESTS' )
+ 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of ZBLAT2.
+*
+      END
+      SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests ZGEMV and ZGBMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BLS, TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA,
+     $                   LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK,
+     $                   NL, NS
+      LOGICAL            BANDED, FULL, NULL, RESET, SAME, TRAN
+      CHARACTER*1        TRANS, TRANSS
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZGBMV, ZGEMV, ZMAKE, ZMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 11
+      ELSE IF( BANDED )THEN
+         NARGS = 13
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+            IF( BANDED )THEN
+               NK = NKB
+            ELSE
+               NK = 1
+            END IF
+            DO 100 IKU = 1, NK
+               IF( BANDED )THEN
+                  KU = KB( IKU )
+                  KL = MAX( KU - 1, 0 )
+               ELSE
+                  KU = N - 1
+                  KL = M - 1
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               IF( BANDED )THEN
+                  LDA = KL + KU + 1
+               ELSE
+                  LDA = M
+               END IF
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 100
+               LAA = LDA*N
+               NULL = N.LE.0.OR.M.LE.0
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA,
+     $                     LDA, KL, KU, RESET, TRANSL )
+*
+               DO 90 IC = 1, 3
+                  TRANS = ICH( IC: IC )
+                  TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
+*
+                  IF( TRAN )THEN
+                     ML = N
+                     NL = M
+                  ELSE
+                     ML = M
+                     NL = N
+                  END IF
+*
+                  DO 80 IX = 1, NINC
+                     INCX = INC( IX )
+                     LX = ABS( INCX )*NL
+*
+*                    Generate the vector X.
+*
+                     TRANSL = HALF
+                     CALL ZMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX,
+     $                           ABS( INCX ), 0, NL - 1, RESET, TRANSL )
+                     IF( NL.GT.1 )THEN
+                        X( NL/2 ) = ZERO
+                        XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO
+                     END IF
+*
+                     DO 70 IY = 1, NINC
+                        INCY = INC( IY )
+                        LY = ABS( INCY )*ML
+*
+                        DO 60 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+                           DO 50 IB = 1, NBET
+                              BETA = BET( IB )
+*
+*                             Generate the vector Y.
+*
+                              TRANSL = ZERO
+                              CALL ZMAKE( 'GE', ' ', ' ', 1, ML, Y, 1,
+     $                                    YY, ABS( INCY ), 0, ML - 1,
+     $                                    RESET, TRANSL )
+*
+                              NC = NC + 1
+*
+*                             Save every datum before calling the
+*                             subroutine.
+*
+                              TRANSS = TRANS
+                              MS = M
+                              NS = N
+                              KLS = KL
+                              KUS = KU
+                              ALS = ALPHA
+                              DO 10 I = 1, LAA
+                                 AS( I ) = AA( I )
+   10                         CONTINUE
+                              LDAS = LDA
+                              DO 20 I = 1, LX
+                                 XS( I ) = XX( I )
+   20                         CONTINUE
+                              INCXS = INCX
+                              BLS = BETA
+                              DO 30 I = 1, LY
+                                 YS( I ) = YY( I )
+   30                         CONTINUE
+                              INCYS = INCY
+*
+*                             Call the subroutine.
+*
+                              IF( FULL )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                              TRANS, M, N, ALPHA, LDA, INCX, BETA,
+     $                              INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL ZGEMV( TRANS, M, N, ALPHA, AA,
+     $                                       LDA, XX, INCX, BETA, YY,
+     $                                       INCY )
+                              ELSE IF( BANDED )THEN
+                                 IF( TRACE )
+     $                              WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                              TRANS, M, N, KL, KU, ALPHA, LDA,
+     $                              INCX, BETA, INCY
+                                 IF( REWI )
+     $                              REWIND NTRA
+                                 CALL ZGBMV( TRANS, M, N, KL, KU, ALPHA,
+     $                                       AA, LDA, XX, INCX, BETA,
+     $                                       YY, INCY )
+                              END IF
+*
+*                             Check if error-exit was taken incorrectly.
+*
+                              IF( .NOT.OK )THEN
+                                 WRITE( NOUT, FMT = 9993 )
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+*                             See what data changed inside subroutines.
+*
+                              ISAME( 1 ) = TRANS.EQ.TRANSS
+                              ISAME( 2 ) = MS.EQ.M
+                              ISAME( 3 ) = NS.EQ.N
+                              IF( FULL )THEN
+                                 ISAME( 4 ) = ALS.EQ.ALPHA
+                                 ISAME( 5 ) = LZE( AS, AA, LAA )
+                                 ISAME( 6 ) = LDAS.EQ.LDA
+                                 ISAME( 7 ) = LZE( XS, XX, LX )
+                                 ISAME( 8 ) = INCXS.EQ.INCX
+                                 ISAME( 9 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 10 ) = LZE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 10 ) = LZERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 11 ) = INCYS.EQ.INCY
+                              ELSE IF( BANDED )THEN
+                                 ISAME( 4 ) = KLS.EQ.KL
+                                 ISAME( 5 ) = KUS.EQ.KU
+                                 ISAME( 6 ) = ALS.EQ.ALPHA
+                                 ISAME( 7 ) = LZE( AS, AA, LAA )
+                                 ISAME( 8 ) = LDAS.EQ.LDA
+                                 ISAME( 9 ) = LZE( XS, XX, LX )
+                                 ISAME( 10 ) = INCXS.EQ.INCX
+                                 ISAME( 11 ) = BLS.EQ.BETA
+                                 IF( NULL )THEN
+                                    ISAME( 12 ) = LZE( YS, YY, LY )
+                                 ELSE
+                                    ISAME( 12 ) = LZERES( 'GE', ' ', 1,
+     $                                            ML, YS, YY,
+     $                                            ABS( INCY ) )
+                                 END IF
+                                 ISAME( 13 ) = INCYS.EQ.INCY
+                              END IF
+*
+*                             If data was incorrectly changed, report
+*                             and return.
+*
+                              SAME = .TRUE.
+                              DO 40 I = 1, NARGS
+                                 SAME = SAME.AND.ISAME( I )
+                                 IF( .NOT.ISAME( I ) )
+     $                              WRITE( NOUT, FMT = 9998 )I
+   40                         CONTINUE
+                              IF( .NOT.SAME )THEN
+                                 FATAL = .TRUE.
+                                 GO TO 130
+                              END IF
+*
+                              IF( .NOT.NULL )THEN
+*
+*                                Check the result.
+*
+                                 CALL ZMVCH( TRANS, M, N, ALPHA, A,
+     $                                       NMAX, X, INCX, BETA, Y,
+     $                                       INCY, YT, G, YY, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                                 ERRMAX = MAX( ERRMAX, ERR )
+*                                If got really bad answer, report and
+*                                return.
+                                 IF( FATAL )
+     $                              GO TO 130
+                              ELSE
+*                                Avoid repeating tests with M.le.0 or
+*                                N.le.0.
+                                 GO TO 110
+                              END IF
+*
+   50                      CONTINUE
+*
+   60                   CONTINUE
+*
+   70                CONTINUE
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 140
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU,
+     $      ALPHA, LDA, INCX, BETA, INCY
+      END IF
+*
+  140 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',',
+     $      F4.1, '), Y,', I2, ') .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',',
+     $      F4.1, '), Y,', I2, ')         .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK1.
+*
+      END
+      SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G )
+*
+*  Tests ZHEMV, ZHBMV and ZHPMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX,
+     $                   NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), BET( NBET ), X( NMAX ),
+     $                   XS( NMAX*INCMAX ), XX( NMAX*INCMAX ),
+     $                   Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BLS, TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IB, IC, IK, IN, INCX, INCXS, INCY,
+     $                   INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY,
+     $                   N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHBMV, ZHEMV, ZHPMV, ZMAKE, ZMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 10
+      ELSE IF( BANDED )THEN
+         NARGS = 11
+      ELSE IF( PACKED )THEN
+         NARGS = 9
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 IC = 1, 2
+               UPLO = ICH( IC: IC )
+*
+*              Generate the matrix A.
+*
+               TRANSL = ZERO
+               CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA,
+     $                     LDA, K, K, RESET, TRANSL )
+*
+               DO 80 IX = 1, NINC
+                  INCX = INC( IX )
+                  LX = ABS( INCX )*N
+*
+*                 Generate the vector X.
+*
+                  TRANSL = HALF
+                  CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                        ABS( INCX ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     X( N/2 ) = ZERO
+                     XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 70 IY = 1, NINC
+                     INCY = INC( IY )
+                     LY = ABS( INCY )*N
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the vector Y.
+*
+                           TRANSL = ZERO
+                           CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                                 ABS( INCY ), 0, N - 1, RESET,
+     $                                 TRANSL )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           UPLOS = UPLO
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LX
+                              XS( I ) = XX( I )
+   20                      CONTINUE
+                           INCXS = INCX
+                           BLS = BETA
+                           DO 30 I = 1, LY
+                              YS( I ) = YY( I )
+   30                      CONTINUE
+                           INCYS = INCY
+*
+*                          Call the subroutine.
+*
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, N, ALPHA, LDA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZHEMV( UPLO, N, ALPHA, AA, LDA, XX,
+     $                                    INCX, BETA, YY, INCY )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, N, K, ALPHA, LDA, INCX, BETA,
+     $                           INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZHBMV( UPLO, N, K, ALPHA, AA, LDA,
+     $                                    XX, INCX, BETA, YY, INCY )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, N, ALPHA, INCX, BETA, INCY
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZHPMV( UPLO, N, ALPHA, AA, XX, INCX,
+     $                                    BETA, YY, INCY )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9992 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = UPLO.EQ.UPLOS
+                           ISAME( 2 ) = NS.EQ.N
+                           IF( FULL )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LZE( AS, AA, LAA )
+                              ISAME( 5 ) = LDAS.EQ.LDA
+                              ISAME( 6 ) = LZE( XS, XX, LX )
+                              ISAME( 7 ) = INCXS.EQ.INCX
+                              ISAME( 8 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 9 ) = LZE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 9 ) = LZERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 10 ) = INCYS.EQ.INCY
+                           ELSE IF( BANDED )THEN
+                              ISAME( 3 ) = KS.EQ.K
+                              ISAME( 4 ) = ALS.EQ.ALPHA
+                              ISAME( 5 ) = LZE( AS, AA, LAA )
+                              ISAME( 6 ) = LDAS.EQ.LDA
+                              ISAME( 7 ) = LZE( XS, XX, LX )
+                              ISAME( 8 ) = INCXS.EQ.INCX
+                              ISAME( 9 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 10 ) = LZE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 10 ) = LZERES( 'GE', ' ', 1, N,
+     $                                         YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 11 ) = INCYS.EQ.INCY
+                           ELSE IF( PACKED )THEN
+                              ISAME( 3 ) = ALS.EQ.ALPHA
+                              ISAME( 4 ) = LZE( AS, AA, LAA )
+                              ISAME( 5 ) = LZE( XS, XX, LX )
+                              ISAME( 6 ) = INCXS.EQ.INCX
+                              ISAME( 7 ) = BLS.EQ.BETA
+                              IF( NULL )THEN
+                                 ISAME( 8 ) = LZE( YS, YY, LY )
+                              ELSE
+                                 ISAME( 8 ) = LZERES( 'GE', ' ', 1, N,
+     $                                        YS, YY, ABS( INCY ) )
+                              END IF
+                              ISAME( 9 ) = INCYS.EQ.INCY
+                           END IF
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X,
+     $                                    INCX, BETA, Y, INCY, YT, G,
+     $                                    YY, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           ELSE
+*                             Avoid repeating tests with N.le.0
+                              GO TO 110
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX,
+     $      BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      BETA, INCY
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2,
+     $      ')                .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',',
+     $      F4.1, '), Y,', I2, ')         .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ',
+     $      'Y,', I2, ')             .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK2.
+*
+      END
+      SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, XT, G, Z )
+*
+*  Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX*16         TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K,
+     $                   KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*2        ICHD, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZMAKE, ZMVCH, ZTBMV, ZTBSV, ZTPMV, ZTPSV,
+     $                   ZTRMV, ZTRSV
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'R'
+      BANDED = SNAME( 3: 3 ).EQ.'B'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 8
+      ELSE IF( BANDED )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 7
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*     Set up zero vector for ZMVCH.
+      DO 10 I = 1, NMAX
+         Z( I ) = ZERO
+   10 CONTINUE
+*
+      DO 110 IN = 1, NIDIM
+         N = IDIM( IN )
+*
+         IF( BANDED )THEN
+            NK = NKB
+         ELSE
+            NK = 1
+         END IF
+         DO 100 IK = 1, NK
+            IF( BANDED )THEN
+               K = KB( IK )
+            ELSE
+               K = N - 1
+            END IF
+*           Set LDA to 1 more than minimum value if room.
+            IF( BANDED )THEN
+               LDA = K + 1
+            ELSE
+               LDA = N
+            END IF
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 100
+            IF( PACKED )THEN
+               LAA = ( N*( N + 1 ) )/2
+            ELSE
+               LAA = LDA*N
+            END IF
+            NULL = N.LE.0
+*
+            DO 90 ICU = 1, 2
+               UPLO = ICHU( ICU: ICU )
+*
+               DO 80 ICT = 1, 3
+                  TRANS = ICHT( ICT: ICT )
+*
+                  DO 70 ICD = 1, 2
+                     DIAG = ICHD( ICD: ICD )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL ZMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A,
+     $                           NMAX, AA, LDA, K, K, RESET, TRANSL )
+*
+                     DO 60 IX = 1, NINC
+                        INCX = INC( IX )
+                        LX = ABS( INCX )*N
+*
+*                       Generate the vector X.
+*
+                        TRANSL = HALF
+                        CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX,
+     $                              ABS( INCX ), 0, N - 1, RESET,
+     $                              TRANSL )
+                        IF( N.GT.1 )THEN
+                           X( N/2 ) = ZERO
+                           XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+                        END IF
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        DIAGS = DIAG
+                        NS = N
+                        KS = K
+                        DO 20 I = 1, LAA
+                           AS( I ) = AA( I )
+   20                   CONTINUE
+                        LDAS = LDA
+                        DO 30 I = 1, LX
+                           XS( I ) = XX( I )
+   30                   CONTINUE
+                        INCXS = INCX
+*
+*                       Call the subroutine.
+*
+                        IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTRMV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTBMV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTPMV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+                           IF( FULL )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9993 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTRSV( UPLO, TRANS, DIAG, N, AA, LDA,
+     $                                    XX, INCX )
+                           ELSE IF( BANDED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9994 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, K, LDA, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTBSV( UPLO, TRANS, DIAG, N, K, AA,
+     $                                    LDA, XX, INCX )
+                           ELSE IF( PACKED )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           UPLO, TRANS, DIAG, N, INCX
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTPSV( UPLO, TRANS, DIAG, N, AA, XX,
+     $                                    INCX )
+                           END IF
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLO.EQ.UPLOS
+                        ISAME( 2 ) = TRANS.EQ.TRANSS
+                        ISAME( 3 ) = DIAG.EQ.DIAGS
+                        ISAME( 4 ) = NS.EQ.N
+                        IF( FULL )THEN
+                           ISAME( 5 ) = LZE( AS, AA, LAA )
+                           ISAME( 6 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 7 ) = LZE( XS, XX, LX )
+                           ELSE
+                              ISAME( 7 ) = LZERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 8 ) = INCXS.EQ.INCX
+                        ELSE IF( BANDED )THEN
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = LZE( AS, AA, LAA )
+                           ISAME( 7 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 8 ) = LZE( XS, XX, LX )
+                           ELSE
+                              ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 9 ) = INCXS.EQ.INCX
+                        ELSE IF( PACKED )THEN
+                           ISAME( 5 ) = LZE( AS, AA, LAA )
+                           IF( NULL )THEN
+                              ISAME( 6 ) = LZE( XS, XX, LX )
+                           ELSE
+                              ISAME( 6 ) = LZERES( 'GE', ' ', 1, N, XS,
+     $                                     XX, ABS( INCX ) )
+                           END IF
+                           ISAME( 7 ) = INCXS.EQ.INCX
+                        END IF
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+                           IF( SNAME( 4: 5 ).EQ.'MV' )THEN
+*
+*                             Check the result.
+*
+                              CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X,
+     $                                    INCX, ZERO, Z, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .TRUE. )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN
+*
+*                             Compute approximation to original vector.
+*
+                              DO 50 I = 1, N
+                                 Z( I ) = XX( 1 + ( I - 1 )*
+     $                                    ABS( INCX ) )
+                                 XX( 1 + ( I - 1 )*ABS( INCX ) )
+     $                              = X( I )
+   50                         CONTINUE
+                              CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z,
+     $                                    INCX, ZERO, X, INCX, XT, G,
+     $                                    XX, EPS, ERR, FATAL, NOUT,
+     $                                    .FALSE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 120
+                        ELSE
+*                          Avoid repeating tests with N.le.0.
+                           GO TO 110
+                        END IF
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA,
+     $      INCX
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K,
+     $      LDA, INCX
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ',
+     $      'X,', I2, ')                                      .' )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ),
+     $      ' A,', I3, ', X,', I2, ')                               .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,',
+     $      I3, ', X,', I2, ')                                   .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK3.
+*
+      END
+      SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests ZGERC and ZGERU.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS,
+     $                   NC, ND, NS
+      LOGICAL            CONJ, NULL, RESET, SAME
+*     .. Local Arrays ..
+      COMPLEX*16         W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZGERC, ZGERU, ZMAKE, ZMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DCONJG, MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+      CONJ = SNAME( 5: 5 ).EQ.'C'
+*     Define the number of arguments.
+      NARGS = 9
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 120 IN = 1, NIDIM
+         N = IDIM( IN )
+         ND = N/2 + 1
+*
+         DO 110 IM = 1, 2
+            IF( IM.EQ.1 )
+     $         M = MAX( N - ND, 0 )
+            IF( IM.EQ.2 )
+     $         M = MIN( N + ND, NMAX )
+*
+*           Set LDA to 1 more than minimum value if room.
+            LDA = M
+            IF( LDA.LT.NMAX )
+     $         LDA = LDA + 1
+*           Skip tests if not enough room.
+            IF( LDA.GT.NMAX )
+     $         GO TO 110
+            LAA = LDA*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 100 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*M
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL ZMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ),
+     $                     0, M - 1, RESET, TRANSL )
+               IF( M.GT.1 )THEN
+                  X( M/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 90 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 80 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX,
+     $                           AA, LDA, M - 1, N - 1, RESET, TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     MS = M
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N,
+     $                  ALPHA, INCX, INCY, LDA
+                     IF( CONJ )THEN
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL ZGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA,
+     $                              LDA )
+                     ELSE
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL ZGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA,
+     $                              LDA )
+                     END IF
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9993 )
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+*                    See what data changed inside subroutine.
+*
+                     ISAME( 1 ) = MS.EQ.M
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LZE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LZE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LZE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LZERES( 'GE', ' ', M, N, AS, AA,
+     $                               LDA )
+                     END IF
+                     ISAME( 9 ) = LDAS.EQ.LDA
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 140
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, M
+                              Z( I ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, M
+                              Z( I ) = X( M - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        DO 70 J = 1, N
+                           IF( INCY.GT.0 )THEN
+                              W( 1 ) = Y( J )
+                           ELSE
+                              W( 1 ) = Y( N - J + 1 )
+                           END IF
+                           IF( CONJ )
+     $                        W( 1 ) = DCONJG( W( 1 ) )
+                           CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1,
+     $                                 ONE, A( 1, J ), 1, YT, G,
+     $                                 AA( 1 + ( J - 1 )*LDA ), EPS,
+     $                                 ERR, FATAL, NOUT, .TRUE. )
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 130
+   70                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with M.le.0 or N.le.0.
+                        GO TO 110
+                     END IF
+*
+   80             CONTINUE
+*
+   90          CONTINUE
+*
+  100       CONTINUE
+*
+  110    CONTINUE
+*
+  120 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 150
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  140 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA
+*
+  150 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1,
+     $      '), X,', I2, ', Y,', I2, ', A,', I3, ')                   ',
+     $      '      .' )
+ 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK4.
+*
+      END
+      SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests ZHER and ZHPR.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX, RALPHA, RALS
+      INTEGER            I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA,
+     $                   LDA, LDAS, LJ, LX, N, NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      COMPLEX*16         W( 1 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHER, ZHPR, ZMAKE, ZMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCMPLX, DCONJG, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 7
+      ELSE IF( PACKED )THEN
+         NARGS = 6
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 100
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 90 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 80 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 70 IA = 1, NALF
+                  RALPHA = DBLE( ALF( IA ) )
+                  ALPHA = DCMPLX( RALPHA, RZERO )
+                  NULL = N.LE.0.OR.RALPHA.EQ.RZERO
+*
+*                 Generate the matrix A.
+*
+                  TRANSL = ZERO
+                  CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX,
+     $                        AA, LDA, N - 1, N - 1, RESET, TRANSL )
+*
+                  NC = NC + 1
+*
+*                 Save every datum before calling the subroutine.
+*
+                  UPLOS = UPLO
+                  NS = N
+                  RALS = RALPHA
+                  DO 10 I = 1, LAA
+                     AS( I ) = AA( I )
+   10             CONTINUE
+                  LDAS = LDA
+                  DO 20 I = 1, LX
+                     XS( I ) = XX( I )
+   20             CONTINUE
+                  INCXS = INCX
+*
+*                 Call the subroutine.
+*
+                  IF( FULL )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                  RALPHA, INCX, LDA
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL ZHER( UPLO, N, RALPHA, XX, INCX, AA, LDA )
+                  ELSE IF( PACKED )THEN
+                     IF( TRACE )
+     $                  WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                  RALPHA, INCX
+                     IF( REWI )
+     $                  REWIND NTRA
+                     CALL ZHPR( UPLO, N, RALPHA, XX, INCX, AA )
+                  END IF
+*
+*                 Check if error-exit was taken incorrectly.
+*
+                  IF( .NOT.OK )THEN
+                     WRITE( NOUT, FMT = 9992 )
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+*                 See what data changed inside subroutines.
+*
+                  ISAME( 1 ) = UPLO.EQ.UPLOS
+                  ISAME( 2 ) = NS.EQ.N
+                  ISAME( 3 ) = RALS.EQ.RALPHA
+                  ISAME( 4 ) = LZE( XS, XX, LX )
+                  ISAME( 5 ) = INCXS.EQ.INCX
+                  IF( NULL )THEN
+                     ISAME( 6 ) = LZE( AS, AA, LAA )
+                  ELSE
+                     ISAME( 6 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, AS,
+     $                            AA, LDA )
+                  END IF
+                  IF( .NOT.PACKED )THEN
+                     ISAME( 7 ) = LDAS.EQ.LDA
+                  END IF
+*
+*                 If data was incorrectly changed, report and return.
+*
+                  SAME = .TRUE.
+                  DO 30 I = 1, NARGS
+                     SAME = SAME.AND.ISAME( I )
+                     IF( .NOT.ISAME( I ) )
+     $                  WRITE( NOUT, FMT = 9998 )I
+   30             CONTINUE
+                  IF( .NOT.SAME )THEN
+                     FATAL = .TRUE.
+                     GO TO 120
+                  END IF
+*
+                  IF( .NOT.NULL )THEN
+*
+*                    Check the result column by column.
+*
+                     IF( INCX.GT.0 )THEN
+                        DO 40 I = 1, N
+                           Z( I ) = X( I )
+   40                   CONTINUE
+                     ELSE
+                        DO 50 I = 1, N
+                           Z( I ) = X( N - I + 1 )
+   50                   CONTINUE
+                     END IF
+                     JA = 1
+                     DO 60 J = 1, N
+                        W( 1 ) = DCONJG( Z( J ) )
+                        IF( UPPER )THEN
+                           JJ = 1
+                           LJ = J
+                        ELSE
+                           JJ = J
+                           LJ = N - J + 1
+                        END IF
+                        CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W,
+     $                              1, ONE, A( JJ, J ), 1, YT, G,
+     $                              AA( JA ), EPS, ERR, FATAL, NOUT,
+     $                              .TRUE. )
+                        IF( FULL )THEN
+                           IF( UPPER )THEN
+                              JA = JA + LDA
+                           ELSE
+                              JA = JA + LDA + 1
+                           END IF
+                        ELSE
+                           JA = JA + LJ
+                        END IF
+                        ERRMAX = MAX( ERRMAX, ERR )
+*                       If got really bad answer, report and return.
+                        IF( FATAL )
+     $                     GO TO 110
+   60                CONTINUE
+                  ELSE
+*                    Avoid repeating tests if N.le.0.
+                     IF( N.LE.0 )
+     $                  GO TO 100
+                  END IF
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', AP)                                         .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,',
+     $      I2, ', A,', I3, ')                                      .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK5.
+*
+      END
+      SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G,
+     $                  Z )
+*
+*  Tests ZHER2 and ZHPR2.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XX( NMAX*INCMAX ), Y( NMAX ),
+     $                   YS( NMAX*INCMAX ), YT( NMAX ),
+     $                   YY( NMAX*INCMAX ), Z( NMAX, 2 )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX,
+     $                   IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N,
+     $                   NARGS, NC, NS
+      LOGICAL            FULL, NULL, PACKED, RESET, SAME, UPPER
+      CHARACTER*1        UPLO, UPLOS
+      CHARACTER*2        ICH
+*     .. Local Arrays ..
+      COMPLEX*16         W( 2 )
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHER2, ZHPR2, ZMAKE, ZMVCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DCONJG, MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'UL'/
+*     .. Executable Statements ..
+      FULL = SNAME( 3: 3 ).EQ.'E'
+      PACKED = SNAME( 3: 3 ).EQ.'P'
+*     Define the number of arguments.
+      IF( FULL )THEN
+         NARGS = 9
+      ELSE IF( PACKED )THEN
+         NARGS = 8
+      END IF
+*
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 140 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDA to 1 more than minimum value if room.
+         LDA = N
+         IF( LDA.LT.NMAX )
+     $      LDA = LDA + 1
+*        Skip tests if not enough room.
+         IF( LDA.GT.NMAX )
+     $      GO TO 140
+         IF( PACKED )THEN
+            LAA = ( N*( N + 1 ) )/2
+         ELSE
+            LAA = LDA*N
+         END IF
+*
+         DO 130 IC = 1, 2
+            UPLO = ICH( IC: IC )
+            UPPER = UPLO.EQ.'U'
+*
+            DO 120 IX = 1, NINC
+               INCX = INC( IX )
+               LX = ABS( INCX )*N
+*
+*              Generate the vector X.
+*
+               TRANSL = HALF
+               CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ),
+     $                     0, N - 1, RESET, TRANSL )
+               IF( N.GT.1 )THEN
+                  X( N/2 ) = ZERO
+                  XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO
+               END IF
+*
+               DO 110 IY = 1, NINC
+                  INCY = INC( IY )
+                  LY = ABS( INCY )*N
+*
+*                 Generate the vector Y.
+*
+                  TRANSL = ZERO
+                  CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY,
+     $                        ABS( INCY ), 0, N - 1, RESET, TRANSL )
+                  IF( N.GT.1 )THEN
+                     Y( N/2 ) = ZERO
+                     YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO
+                  END IF
+*
+                  DO 100 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     NULL = N.LE.0.OR.ALPHA.EQ.ZERO
+*
+*                    Generate the matrix A.
+*
+                     TRANSL = ZERO
+                     CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A,
+     $                           NMAX, AA, LDA, N - 1, N - 1, RESET,
+     $                           TRANSL )
+*
+                     NC = NC + 1
+*
+*                    Save every datum before calling the subroutine.
+*
+                     UPLOS = UPLO
+                     NS = N
+                     ALS = ALPHA
+                     DO 10 I = 1, LAA
+                        AS( I ) = AA( I )
+   10                CONTINUE
+                     LDAS = LDA
+                     DO 20 I = 1, LX
+                        XS( I ) = XX( I )
+   20                CONTINUE
+                     INCXS = INCX
+                     DO 30 I = 1, LY
+                        YS( I ) = YY( I )
+   30                CONTINUE
+                     INCYS = INCY
+*
+*                    Call the subroutine.
+*
+                     IF( FULL )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY, LDA
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL ZHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA, LDA )
+                     ELSE IF( PACKED )THEN
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N,
+     $                     ALPHA, INCX, INCY
+                        IF( REWI )
+     $                     REWIND NTRA
+                        CALL ZHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY,
+     $                              AA )
+                     END IF
+*
+*                    Check if error-exit was taken incorrectly.
+*
+                     IF( .NOT.OK )THEN
+                        WRITE( NOUT, FMT = 9992 )
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+*                    See what data changed inside subroutines.
+*
+                     ISAME( 1 ) = UPLO.EQ.UPLOS
+                     ISAME( 2 ) = NS.EQ.N
+                     ISAME( 3 ) = ALS.EQ.ALPHA
+                     ISAME( 4 ) = LZE( XS, XX, LX )
+                     ISAME( 5 ) = INCXS.EQ.INCX
+                     ISAME( 6 ) = LZE( YS, YY, LY )
+                     ISAME( 7 ) = INCYS.EQ.INCY
+                     IF( NULL )THEN
+                        ISAME( 8 ) = LZE( AS, AA, LAA )
+                     ELSE
+                        ISAME( 8 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N,
+     $                               AS, AA, LDA )
+                     END IF
+                     IF( .NOT.PACKED )THEN
+                        ISAME( 9 ) = LDAS.EQ.LDA
+                     END IF
+*
+*                    If data was incorrectly changed, report and return.
+*
+                     SAME = .TRUE.
+                     DO 40 I = 1, NARGS
+                        SAME = SAME.AND.ISAME( I )
+                        IF( .NOT.ISAME( I ) )
+     $                     WRITE( NOUT, FMT = 9998 )I
+   40                CONTINUE
+                     IF( .NOT.SAME )THEN
+                        FATAL = .TRUE.
+                        GO TO 160
+                     END IF
+*
+                     IF( .NOT.NULL )THEN
+*
+*                       Check the result column by column.
+*
+                        IF( INCX.GT.0 )THEN
+                           DO 50 I = 1, N
+                              Z( I, 1 ) = X( I )
+   50                      CONTINUE
+                        ELSE
+                           DO 60 I = 1, N
+                              Z( I, 1 ) = X( N - I + 1 )
+   60                      CONTINUE
+                        END IF
+                        IF( INCY.GT.0 )THEN
+                           DO 70 I = 1, N
+                              Z( I, 2 ) = Y( I )
+   70                      CONTINUE
+                        ELSE
+                           DO 80 I = 1, N
+                              Z( I, 2 ) = Y( N - I + 1 )
+   80                      CONTINUE
+                        END IF
+                        JA = 1
+                        DO 90 J = 1, N
+                           W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) )
+                           W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) )
+                           IF( UPPER )THEN
+                              JJ = 1
+                              LJ = J
+                           ELSE
+                              JJ = J
+                              LJ = N - J + 1
+                           END IF
+                           CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ),
+     $                                 NMAX, W, 1, ONE, A( JJ, J ), 1,
+     $                                 YT, G, AA( JA ), EPS, ERR, FATAL,
+     $                                 NOUT, .TRUE. )
+                           IF( FULL )THEN
+                              IF( UPPER )THEN
+                                 JA = JA + LDA
+                              ELSE
+                                 JA = JA + LDA + 1
+                              END IF
+                           ELSE
+                              JA = JA + LJ
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and return.
+                           IF( FATAL )
+     $                        GO TO 150
+   90                   CONTINUE
+                     ELSE
+*                       Avoid repeating tests with N.le.0.
+                        IF( N.LE.0 )
+     $                     GO TO 140
+                     END IF
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 170
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9995 )J
+*
+  160 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX,
+     $      INCY, LDA
+      ELSE IF( PACKED )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY
+      END IF
+*
+  170 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), X,', I2, ', Y,', I2, ', AP)                     ',
+     $      '       .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',',
+     $      F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ')             ',
+     $      '            .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK6.
+*
+      END
+      SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 2 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  ALPHA, RALPHA, BETA, A, X and Y should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, BETA
+      DOUBLE PRECISION   RALPHA
+*     .. Local Arrays ..
+      COMPLEX*16         A( 1, 1 ), X( 1 ), Y( 1 )
+*     .. External Subroutines ..
+      EXTERNAL           CHKXER, ZGBMV, ZGEMV, ZGERC, ZGERU, ZHBMV,
+     $                   ZHEMV, ZHER, ZHER2, ZHPMV, ZHPR, ZHPR2, ZTBMV,
+     $                   ZTBSV, ZTPMV, ZTPSV, ZTRMV, ZTRSV
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+      GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+     $        90, 100, 110, 120, 130, 140, 150, 160,
+     $        170 )ISNUM
+   10 INFOT = 1
+      CALL ZGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   20 INFOT = 1
+      CALL ZGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   30 INFOT = 1
+      CALL ZHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   40 INFOT = 1
+      CALL ZHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   50 INFOT = 1
+      CALL ZHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   60 INFOT = 1
+      CALL ZTRMV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTRMV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTRMV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   70 INFOT = 1
+      CALL ZTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   80 INFOT = 1
+      CALL ZTPMV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTPMV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTPMV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTPMV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZTPMV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+   90 INFOT = 1
+      CALL ZTRSV( '/', 'N', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTRSV( 'U', '/', 'N', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTRSV( 'U', 'N', '/', 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  100 INFOT = 1
+      CALL ZTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  110 INFOT = 1
+      CALL ZTPSV( '/', 'N', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTPSV( 'U', '/', 'N', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTPSV( 'U', 'N', '/', 0, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTPSV( 'U', 'N', 'N', -1, A, X, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZTPSV( 'U', 'N', 'N', 0, A, X, 0 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  120 INFOT = 1
+      CALL ZGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  130 INFOT = 1
+      CALL ZGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  140 INFOT = 1
+      CALL ZHER( '/', 0, RALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHER( 'U', -1, RALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZHER( 'U', 0, RALPHA, X, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHER( 'U', 2, RALPHA, X, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  150 INFOT = 1
+      CALL ZHPR( '/', 0, RALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHPR( 'U', -1, RALPHA, X, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZHPR( 'U', 0, RALPHA, X, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  160 INFOT = 1
+      CALL ZHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 180
+  170 INFOT = 1
+      CALL ZHPR2( '/', 0, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+  180 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of ZCHKE.
+*
+      END
+      SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL,
+     $                  KU, RESET, TRANSL )
+*
+*  Generates values for an M by N matrix A within the bandwidth
+*  defined by KL and KU.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      COMPLEX*16         ROGUE
+      PARAMETER          ( ROGUE = ( -1.0D10, 1.0D10 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+      DOUBLE PRECISION   RROGUE
+      PARAMETER          ( RROGUE = -1.0D10 )
+*     .. Scalar Arguments ..
+      COMPLEX*16         TRANSL
+      INTEGER            KL, KU, LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK
+      LOGICAL            GEN, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      COMPLEX*16         ZBEG
+      EXTERNAL           ZBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, DCMPLX, DCONJG, MAX, MIN
+*     .. Executable Statements ..
+      GEN = TYPE( 1: 1 ).EQ.'G'
+      SYM = TYPE( 1: 1 ).EQ.'H'
+      TRI = TYPE( 1: 1 ).EQ.'T'
+      UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               IF( ( I.LE.J.AND.J - I.LE.KU ).OR.
+     $             ( I.GE.J.AND.I - J.LE.KL ) )THEN
+                  A( I, J ) = ZBEG( RESET ) + TRANSL
+               ELSE
+                  A( I, J ) = ZERO
+               END IF
+               IF( I.NE.J )THEN
+                  IF( SYM )THEN
+                     A( J, I ) = DCONJG( A( I, J ) )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( SYM )
+     $      A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO )
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'GB' )THEN
+         DO 90 J = 1, N
+            DO 60 I1 = 1, KU + 1 - J
+               AA( I1 + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J )
+               AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J )
+   70       CONTINUE
+            DO 80 I3 = I2, LDA
+               AA( I3 + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+   90    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN
+         DO 130 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 100 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  100       CONTINUE
+            DO 110 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+  110       CONTINUE
+            DO 120 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  120       CONTINUE
+            IF( SYM )THEN
+               JJ = J + ( J - 1 )*LDA
+               AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE )
+            END IF
+  130    CONTINUE
+      ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN
+         DO 170 J = 1, N
+            IF( UPPER )THEN
+               KK = KL + 1
+               IBEG = MAX( 1, KL + 2 - J )
+               IF( UNIT )THEN
+                  IEND = KL
+               ELSE
+                  IEND = KL + 1
+               END IF
+            ELSE
+               KK = 1
+               IF( UNIT )THEN
+                  IBEG = 2
+               ELSE
+                  IBEG = 1
+               END IF
+               IEND = MIN( KL + 1, 1 + M - J )
+            END IF
+            DO 140 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  140       CONTINUE
+            DO 150 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J )
+  150       CONTINUE
+            DO 160 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+  160       CONTINUE
+            IF( SYM )THEN
+               JJ = KK + ( J - 1 )*LDA
+               AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE )
+            END IF
+  170    CONTINUE
+      ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN
+         IOFF = 0
+         DO 190 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 180 I = IBEG, IEND
+               IOFF = IOFF + 1
+               AA( IOFF ) = A( I, J )
+               IF( I.EQ.J )THEN
+                  IF( UNIT )
+     $               AA( IOFF ) = ROGUE
+                  IF( SYM )
+     $               AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE )
+               END IF
+  180       CONTINUE
+  190    CONTINUE
+      END IF
+      RETURN
+*
+*     End of ZMAKE.
+*
+      END
+      SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y,
+     $                  INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO, RONE
+      PARAMETER          ( RZERO = 0.0D0, RONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      DOUBLE PRECISION   EPS, ERR
+      INTEGER            INCX, INCY, M, N, NMAX, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * )
+      DOUBLE PRECISION   G( * )
+*     .. Local Scalars ..
+      COMPLEX*16         C
+      DOUBLE PRECISION   ERRI
+      INTEGER            I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL
+      LOGICAL            CTRAN, TRAN
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, SQRT
+*     .. Statement Functions ..
+      DOUBLE PRECISION   ABS1
+*     .. Statement Function definitions ..
+      ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) )
+*     .. Executable Statements ..
+      TRAN = TRANS.EQ.'T'
+      CTRAN = TRANS.EQ.'C'
+      IF( TRAN.OR.CTRAN )THEN
+         ML = N
+         NL = M
+      ELSE
+         ML = M
+         NL = N
+      END IF
+      IF( INCX.LT.0 )THEN
+         KX = NL
+         INCXL = -1
+      ELSE
+         KX = 1
+         INCXL = 1
+      END IF
+      IF( INCY.LT.0 )THEN
+         KY = ML
+         INCYL = -1
+      ELSE
+         KY = 1
+         INCYL = 1
+      END IF
+*
+*     Compute expected result in YT using data in A, X and Y.
+*     Compute gauges in G.
+*
+      IY = KY
+      DO 40 I = 1, ML
+         YT( IY ) = ZERO
+         G( IY ) = RZERO
+         JX = KX
+         IF( TRAN )THEN
+            DO 10 J = 1, NL
+               YT( IY ) = YT( IY ) + A( J, I )*X( JX )
+               G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) )
+               JX = JX + INCXL
+   10       CONTINUE
+         ELSE IF( CTRAN )THEN
+            DO 20 J = 1, NL
+               YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX )
+               G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) )
+               JX = JX + INCXL
+   20       CONTINUE
+         ELSE
+            DO 30 J = 1, NL
+               YT( IY ) = YT( IY ) + A( I, J )*X( JX )
+               G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) )
+               JX = JX + INCXL
+   30       CONTINUE
+         END IF
+         YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY )
+         G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) )
+         IY = IY + INCYL
+   40 CONTINUE
+*
+*     Compute the error ratio for this result.
+*
+      ERR = ZERO
+      DO 50 I = 1, ML
+         ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS
+         IF( G( I ).NE.RZERO )
+     $      ERRI = ERRI/G( I )
+         ERR = MAX( ERR, ERRI )
+         IF( ERR*SQRT( EPS ).GE.RONE )
+     $      GO TO 60
+   50 CONTINUE
+*     If the loop completes, all results are at least half accurate.
+      GO TO 80
+*
+*     Report fatal error.
+*
+   60 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 70 I = 1, ML
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, YT( I ),
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I,
+     $         YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I )
+         END IF
+   70 CONTINUE
+*
+   80 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'                       EXPECTED RE',
+     $      'SULT                    COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( '  (', G15.6, ',', G15.6, ')' ) )
+*
+*     End of ZMVCH.
+*
+      END
+      LOGICAL FUNCTION LZE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      COMPLEX*16         RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LZE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LZE = .FALSE.
+   30 RETURN
+*
+*     End of LZE.
+*
+      END
+      LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE', 'HE' or 'HP'.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX*16         AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'HE' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LZERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LZERES = .FALSE.
+   80 RETURN
+*
+*     End of LZERES.
+*
+      END
+      COMPLEX*16 FUNCTION ZBEG( RESET )
+*
+*  Generates complex numbers as pairs of random numbers uniformly
+*  distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, J, MI, MJ
+*     .. Save statement ..
+      SAVE               I, IC, J, MI, MJ
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         MJ = 457
+         I = 7
+         J = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I or J is bounded between 1 and 999.
+*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I or J = 4 or 8, the period will be 25.
+*     If initial I or J = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I or J
+*     in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      J = J*MJ
+      I = I - 1000*( I/1000 )
+      J = J - 1000*( J/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 )
+      RETURN
+*
+*     End of ZBEG.
+*
+      END
+      DOUBLE PRECISION FUNCTION DDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   X, Y
+*     .. Executable Statements ..
+      DDIFF = X - Y
+      RETURN
+*
+*     End of DDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 2 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 2 BLAS routines.
+*
+*  It is called by the Level 2 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/testing/zblat3.dat b/blas/testing/zblat3.dat
new file mode 100644
index 0000000..ede516f
--- /dev/null
+++ b/blas/testing/zblat3.dat

@@ -0,0 +1,23 @@
+'zblat3.summ'     NAME OF SUMMARY OUTPUT FILE
+6                 UNIT NUMBER OF SUMMARY FILE
+'zblat3.snap'     NAME OF SNAPSHOT OUTPUT FILE
+-1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F        LOGICAL FLAG, T TO STOP ON FAILURES.
+F        LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0     THRESHOLD VALUE OF TEST RATIO
+6                 NUMBER OF VALUES OF N
+0 1 2 3 5 9       VALUES OF N
+3                 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+3                 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHERK  T PUT F FOR NO TEST. SAME COLUMNS.
+ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.

diff --git a/blas/testing/zblat3.f b/blas/testing/zblat3.f
new file mode 100644
index 0000000..59ca241
--- /dev/null
+++ b/blas/testing/zblat3.f

@@ -0,0 +1,3502 @@
+*> \brief \b ZBLAT3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM ZBLAT3
+* 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> Test program for the COMPLEX*16       Level 3 Blas.
+*>
+*> The program must be driven by a short data file. The first 14 records
+*> of the file are read using list-directed input, the last 9 records
+*> are read using the format ( A6, L2 ). An annotated example of a data
+*> file can be obtained by deleting the first 3 characters from the
+*> following 23 lines:
+*> 'zblat3.out'      NAME OF SUMMARY OUTPUT FILE
+*> 6                 UNIT NUMBER OF SUMMARY FILE
+*> 'ZBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
+*> -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*> F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*> F        LOGICAL FLAG, T TO STOP ON FAILURES.
+*> T        LOGICAL FLAG, T TO TEST ERROR EXITS.
+*> 16.0     THRESHOLD VALUE OF TEST RATIO
+*> 6                 NUMBER OF VALUES OF N
+*> 0 1 2 3 5 9       VALUES OF N
+*> 3                 NUMBER OF VALUES OF ALPHA
+*> (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
+*> 3                 NUMBER OF VALUES OF BETA
+*> (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
+*> ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHERK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*>
+*> 
+*> Further Details
+*> ===============
+*>
+*> See:
+*>
+*>    Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*>    A Set of Level 3 Basic Linear Algebra Subprograms.
+*>
+*>    Technical Memorandum No.88 (Revision 1), Mathematics and
+*>    Computer Science Division, Argonne National Laboratory, 9700
+*>    South Cass Avenue, Argonne, Illinois 60439, US.
+*>
+*> -- Written on 8-February-1989.
+*>    Jack Dongarra, Argonne National Laboratory.
+*>    Iain Duff, AERE Harwell.
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*>    10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>              can be run multiple times without deleting generated
+*>              output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
+      PROGRAM ZBLAT3
+*
+*  -- Reference BLAS test routine (version 3.4.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 9 )
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 65 )
+      INTEGER            NIDMAX, NALMAX, NBEMAX
+      PARAMETER          ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANSA, TRANSB
+      CHARACTER*6        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      COMPLEX*16         AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBEMAX ),
+     $                   BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*6        SNAMES( NSUBS )
+*     .. External Functions ..
+      DOUBLE PRECISION   DDIFF
+      LOGICAL            LZE
+      EXTERNAL           DDIFF, LZE
+*     .. External Subroutines ..
+      EXTERNAL           ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'ZGEMM ', 'ZHEMM ', 'ZSYMM ', 'ZTRMM ',
+     $                   'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K',
+     $                   'ZSYR2K'/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 220
+         END IF
+   10 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9995 )
+      WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9984 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 20 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   20 CONTINUE
+   30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+      DO 40 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 50
+   40 CONTINUE
+      WRITE( NOUT, FMT = 9990 )SNAMET
+      STOP
+   50 LTEST( I ) = LTESTT
+      GO TO 30
+*
+   60 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = EPSILON(RZERO)
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of ZMMCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 100 J = 1, N
+         DO 90 I = 1, N
+            AB( I, J ) = MAX( I - J + 1, 0 )
+   90    CONTINUE
+         AB( J, NMAX + 1 ) = J
+         AB( 1, NMAX + J ) = J
+         C( J, 1 ) = ZERO
+  100 CONTINUE
+      DO 110 J = 1, N
+         CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  110 CONTINUE
+*     CC holds the exact result. On exit from ZMMCH CT holds
+*     the result computed by ZMMCH.
+      TRANSA = 'N'
+      TRANSB = 'N'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'C'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      DO 120 J = 1, N
+         AB( J, NMAX + 1 ) = N - J + 1
+         AB( 1, NMAX + J ) = N - J + 1
+  120 CONTINUE
+      DO 130 J = 1, N
+         CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+     $                     ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+      TRANSA = 'C'
+      TRANSB = 'N'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'C'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 200 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 150, 150, 160, 160, 170, 170,
+     $              180, 180 )ISNUM
+*           Test ZGEMM, 01.
+  140       CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test ZHEMM, 02, ZSYMM, 03.
+  150       CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test ZTRMM, 04, ZTRSM, 05.
+  160       CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+     $                  AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C )
+            GO TO 190
+*           Test ZHERK, 06, ZSYRK, 07.
+  170       CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test ZHER2K, 08, ZSYR2K, 09.
+  180       CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+            GO TO 190
+*
+  190       IF( FATAL.AND.SFATAL )
+     $         GO TO 210
+         END IF
+  200 CONTINUE
+      WRITE( NOUT, FMT = 9986 )
+      GO TO 230
+*
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9985 )
+      GO TO 230
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9991 )
+*
+  230 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' TESTS OF THE COMPLEX*16       LEVEL 3 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( '   FOR N              ', 9I6 )
+ 9993 FORMAT( '   FOR ALPHA          ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9992 FORMAT( '   FOR BETA           ',
+     $      7( '(', F4.1, ',', F4.1, ')  ', : ) )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9989 FORMAT( ' ERROR IN ZMMCH -  IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1,
+     $      ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+     $      'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+     $      'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+     $      '*******' )
+ 9988 FORMAT( A6, L2 )
+ 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*     End of ZBLAT3.
+*
+      END
+      SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests ZGEMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BLS
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+     $                   LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+     $                   MA, MB, MS, N, NA, NARGS, NB, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRANA, TRANB
+      CHARACTER*1        TRANAS, TRANBS, TRANSA, TRANSB
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZMAKE, ZMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+*
+      NARGS = 13
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 110 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 100 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 100
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 90 IK = 1, NIDIM
+               K = IDIM( IK )
+*
+               DO 80 ICA = 1, 3
+                  TRANSA = ICH( ICA: ICA )
+                  TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+                  IF( TRANA )THEN
+                     MA = K
+                     NA = M
+                  ELSE
+                     MA = M
+                     NA = K
+                  END IF
+*                 Set LDA to 1 more than minimum value if room.
+                  LDA = MA
+                  IF( LDA.LT.NMAX )
+     $               LDA = LDA + 1
+*                 Skip tests if not enough room.
+                  IF( LDA.GT.NMAX )
+     $               GO TO 80
+                  LAA = LDA*NA
+*
+*                 Generate the matrix A.
+*
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 70 ICB = 1, 3
+                     TRANSB = ICH( ICB: ICB )
+                     TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+                     IF( TRANB )THEN
+                        MB = N
+                        NB = K
+                     ELSE
+                        MB = K
+                        NB = N
+                     END IF
+*                    Set LDB to 1 more than minimum value if room.
+                     LDB = MB
+                     IF( LDB.LT.NMAX )
+     $                  LDB = LDB + 1
+*                    Skip tests if not enough room.
+                     IF( LDB.GT.NMAX )
+     $                  GO TO 70
+                     LBB = LDB*NB
+*
+*                    Generate the matrix B.
+*
+                     CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB,
+     $                           LDB, RESET, ZERO )
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the matrix C.
+*
+                           CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX,
+     $                                 CC, LDC, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           TRANAS = TRANSA
+                           TRANBS = TRANSB
+                           MS = M
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LBB
+                              BS( I ) = BB( I )
+   20                      CONTINUE
+                           LDBS = LDB
+                           BLS = BETA
+                           DO 30 I = 1, LCC
+                              CS( I ) = CC( I )
+   30                      CONTINUE
+                           LDCS = LDC
+*
+*                          Call the subroutine.
+*
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                        TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB,
+     $                        BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZGEMM( TRANSA, TRANSB, M, N, K, ALPHA,
+     $                                 AA, LDA, BB, LDB, BETA, CC, LDC )
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = TRANSA.EQ.TRANAS
+                           ISAME( 2 ) = TRANSB.EQ.TRANBS
+                           ISAME( 3 ) = MS.EQ.M
+                           ISAME( 4 ) = NS.EQ.N
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = ALS.EQ.ALPHA
+                           ISAME( 7 ) = LZE( AS, AA, LAA )
+                           ISAME( 8 ) = LDAS.EQ.LDA
+                           ISAME( 9 ) = LZE( BS, BB, LBB )
+                           ISAME( 10 ) = LDBS.EQ.LDB
+                           ISAME( 11 ) = BLS.EQ.BETA
+                           IF( NULL )THEN
+                              ISAME( 12 ) = LZE( CS, CC, LCC )
+                           ELSE
+                              ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS,
+     $                                      CC, LDC )
+                           END IF
+                           ISAME( 13 ) = LDCS.EQ.LDC
+*
+*                          If data was incorrectly changed, report
+*                          and return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL ZMMCH( TRANSA, TRANSB, M, N, K,
+     $                                    ALPHA, A, NMAX, B, NMAX, BETA,
+     $                                    C, NMAX, CT, G, CC, LDC, EPS,
+     $                                    ERR, FATAL, NOUT, .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K,
+     $   ALPHA, LDA, LDB, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',',
+     $      3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3,
+     $      ',(', F4.1, ',', F4.1, '), C,', I3, ').' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK1.
+*
+      END
+      SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests ZHEMM and ZSYMM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BLS
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+     $                   LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, LEFT, NULL, RESET, SAME
+      CHARACTER*1        SIDE, SIDES, UPLO, UPLOS
+      CHARACTER*2        ICHS, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHEMM, ZMAKE, ZMMCH, ZSYMM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHS/'LR'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 90 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 90
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 90
+            LBB = LDB*N
+*
+*           Generate the matrix B.
+*
+            CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+     $                  ZERO )
+*
+            DO 80 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+*
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+*                 Generate the hermitian or symmetric matrix A.
+*
+                  CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX,
+     $                        AA, LDA, RESET, ZERO )
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the
+*                       subroutine.
+*
+                        SIDES = SIDE
+                        UPLOS = UPLO
+                        MS = M
+                        NS = N
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BLS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE,
+     $                     UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        IF( CONJ )THEN
+                           CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                                 BB, LDB, BETA, CC, LDC )
+                        ELSE
+                           CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                                 BB, LDB, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9994 )
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = SIDES.EQ.SIDE
+                        ISAME( 2 ) = UPLOS.EQ.UPLO
+                        ISAME( 3 ) = MS.EQ.M
+                        ISAME( 4 ) = NS.EQ.N
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LZE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BLS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result.
+*
+                           IF( LEFT )THEN
+                              CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A,
+     $                                    NMAX, B, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           ELSE
+                              CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B,
+     $                                    NMAX, A, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and
+*                          return.
+                           IF( FATAL )
+     $                        GO TO 110
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 120
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA,
+     $   LDB, BETA, LDC
+*
+  120 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $      ',', F4.1, '), C,', I3, ')    .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK2.
+*
+      END
+      SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+     $                  B, BB, BS, CT, G, C )
+*
+*  Tests ZTRMM and ZTRSM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+     $                   LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+     $                   NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+     $                   UPLOS
+      CHARACTER*2        ICHD, ICHS, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZMAKE, ZMMCH, ZTRMM, ZTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+*     .. Executable Statements ..
+*
+      NARGS = 11
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*     Set up zero matrix for ZMMCH.
+      DO 20 J = 1, NMAX
+         DO 10 I = 1, NMAX
+            C( I, J ) = ZERO
+   10    CONTINUE
+   20 CONTINUE
+*
+      DO 140 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 130 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 130
+            LBB = LDB*N
+            NULL = M.LE.0.OR.N.LE.0
+*
+            DO 120 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 130
+               LAA = LDA*NA
+*
+               DO 110 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+                  DO 100 ICT = 1, 3
+                     TRANSA = ICHT( ICT: ICT )
+*
+                     DO 90 ICD = 1, 2
+                        DIAG = ICHD( ICD: ICD )
+*
+                        DO 80 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+*                          Generate the matrix A.
+*
+                           CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A,
+     $                                 NMAX, AA, LDA, RESET, ZERO )
+*
+*                          Generate the matrix B.
+*
+                           CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX,
+     $                                 BB, LDB, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           SIDES = SIDE
+                           UPLOS = UPLO
+                           TRANAS = TRANSA
+                           DIAGS = DIAG
+                           MS = M
+                           NS = N
+                           ALS = ALPHA
+                           DO 30 I = 1, LAA
+                              AS( I ) = AA( I )
+   30                      CONTINUE
+                           LDAS = LDA
+                           DO 40 I = 1, LBB
+                              BS( I ) = BB( I )
+   40                      CONTINUE
+                           LDBS = LDB
+*
+*                          Call the subroutine.
+*
+                           IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = SIDES.EQ.SIDE
+                           ISAME( 2 ) = UPLOS.EQ.UPLO
+                           ISAME( 3 ) = TRANAS.EQ.TRANSA
+                           ISAME( 4 ) = DIAGS.EQ.DIAG
+                           ISAME( 5 ) = MS.EQ.M
+                           ISAME( 6 ) = NS.EQ.N
+                           ISAME( 7 ) = ALS.EQ.ALPHA
+                           ISAME( 8 ) = LZE( AS, AA, LAA )
+                           ISAME( 9 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 10 ) = LZE( BS, BB, LBB )
+                           ELSE
+                              ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS,
+     $                                      BB, LDB )
+                           END IF
+                           ISAME( 11 ) = LDBS.EQ.LDB
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 50 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   50                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+                              IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+*
+*                                Check the result.
+*
+                                 IF( LEFT )THEN
+                                    CALL ZMMCH( TRANSA, 'N', M, N, M,
+     $                                          ALPHA, A, NMAX, B, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 ELSE
+                                    CALL ZMMCH( 'N', TRANSA, M, N, N,
+     $                                          ALPHA, B, NMAX, A, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 END IF
+                              ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+*
+*                                Compute approximation to original
+*                                matrix.
+*
+                                 DO 70 J = 1, N
+                                    DO 60 I = 1, M
+                                       C( I, J ) = BB( I + ( J - 1 )*
+     $                                             LDB )
+                                       BB( I + ( J - 1 )*LDB ) = ALPHA*
+     $                                    B( I, J )
+   60                               CONTINUE
+   70                            CONTINUE
+*
+                                 IF( LEFT )THEN
+                                    CALL ZMMCH( TRANSA, 'N', M, N, M,
+     $                                          ONE, A, NMAX, C, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 ELSE
+                                    CALL ZMMCH( 'N', TRANSA, M, N, N,
+     $                                          ONE, C, NMAX, A, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 END IF
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 150
+                           END IF
+*
+   80                   CONTINUE
+*
+   90                CONTINUE
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M,
+     $   N, ALPHA, LDA, LDB
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ')         ',
+     $      '      .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK3.
+*
+      END
+      SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests ZHERK and ZSYRK.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RONE, RZERO
+      PARAMETER          ( RONE = 1.0D0, RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BETS
+      DOUBLE PRECISION   ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+     $                   LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHERK, ZMAKE, ZMMCH, ZSYRK
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, MAX, DBLE
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 10
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 100
+         LCC = LDC*N
+*
+         DO 90 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 80 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                     RESET, ZERO )
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     IF( CONJ )THEN
+                        RALPHA = DBLE( ALPHA )
+                        ALPHA = DCMPLX( RALPHA, RZERO )
+                     END IF
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = DBLE( BETA )
+                           BETA = DCMPLX( RBETA, RZERO )
+                        END IF
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ.
+     $                            RZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        IF( CONJ )THEN
+                           RALS = RALPHA
+                        ELSE
+                           ALS = ALPHA
+                        END IF
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 20 I = 1, LCC
+                           CS( I ) = CC( I )
+   20                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, RALPHA, LDA, RBETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA,
+     $                                 LDA, RBETA, CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                 LDA, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        IF( CONJ )THEN
+                           ISAME( 5 ) = RALS.EQ.RALPHA
+                        ELSE
+                           ISAME( 5 ) = ALS.EQ.ALPHA
+                        END IF
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        IF( CONJ )THEN
+                           ISAME( 8 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 8 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 9 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N,
+     $                                  N, CS, CC, LDC )
+                        END IF
+                        ISAME( 10 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 30 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   30                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JC = 1
+                           DO 40 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 CALL ZMMCH( TRANST, 'N', LJ, 1, K,
+     $                                       ALPHA, A( 1, JJ ), NMAX,
+     $                                       A( 1, J ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 CALL ZMMCH( 'N', TRANST, LJ, 1, K,
+     $                                       ALPHA, A( JJ, 1 ), NMAX,
+     $                                       A( J, 1 ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 110
+   40                      CONTINUE
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA,
+     $      LDA, RBETA, LDC
+      ELSE
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, BETA, LDC
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ',
+     $      '          .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+     $      '), C,', I3, ')          .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK4.
+*
+      END
+      SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+*
+*  Tests ZHER2K and ZSYR2K.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RONE, RZERO
+      PARAMETER          ( RONE = 1.0D0, RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*6        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+     $                   ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+     $                   BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BETS
+      DOUBLE PRECISION   ERR, ERRMAX, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+     $                   K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+     $                   LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHER2K, ZMAKE, ZMMCH, ZSYR2K
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, DCONJG, MAX, DBLE
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 130 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 130
+         LCC = LDC*N
+*
+         DO 120 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 110 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 110
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               IF( TRAN )THEN
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+     $                        LDA, RESET, ZERO )
+               ELSE
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+               END IF
+*
+*              Generate the matrix B.
+*
+               LDB = LDA
+               LBB = LAA
+               IF( TRAN )THEN
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ),
+     $                        2*NMAX, BB, LDB, RESET, ZERO )
+               ELSE
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+     $                        NMAX, BB, LDB, RESET, ZERO )
+               END IF
+*
+               DO 100 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 90 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 80 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = DBLE( BETA )
+                           BETA = DCMPLX( RBETA, RZERO )
+                        END IF
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ.
+     $                            ZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                  LDA, BB, LDB, RBETA, CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                  LDA, BB, LDB, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LZE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        IF( CONJ )THEN
+                           ISAME( 10 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 10 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JJAB = 1
+                           JC = 1
+                           DO 70 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 DO 50 I = 1, K
+                                    W( I ) = ALPHA*AB( ( J - 1 )*2*
+     $                                       NMAX + K + I )
+                                    IF( CONJ )THEN
+                                       W( K + I ) = DCONJG( ALPHA )*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    ELSE
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    END IF
+   50                            CONTINUE
+                                 CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K,
+     $                                       ONE, AB( JJAB ), 2*NMAX, W,
+     $                                       2*NMAX, BETA, C( JJ, J ),
+     $                                       NMAX, CT, G, CC( JC ), LDC,
+     $                                       EPS, ERR, FATAL, NOUT,
+     $                                       .TRUE. )
+                              ELSE
+                                 DO 60 I = 1, K
+                                    IF( CONJ )THEN
+                                       W( I ) = ALPHA*DCONJG( AB( ( K +
+     $                                          I - 1 )*NMAX + J ) )
+                                       W( K + I ) = DCONJG( ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J ) )
+                                    ELSE
+                                       W( I ) = ALPHA*AB( ( K + I - 1 )*
+     $                                          NMAX + J )
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J )
+                                    END IF
+   60                            CONTINUE
+                                 CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE,
+     $                                       AB( JJ ), NMAX, W, 2*NMAX,
+     $                                       BETA, C( JJ, J ), NMAX, CT,
+     $                                       G, CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                                 IF( TRAN )
+     $                              JJAB = JJAB + 2*NMAX
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 140
+   70                      CONTINUE
+                        END IF
+*
+   80                CONTINUE
+*
+   90             CONTINUE
+*
+  100          CONTINUE
+*
+  110       CONTINUE
+*
+  120    CONTINUE
+*
+  130 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, LDB, RBETA, LDC
+      ELSE
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, LDB, BETA, LDC
+      END IF
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1,
+     $      ', C,', I3, ')           .' )
+ 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $      ',', F4.1, '), C,', I3, ')    .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK5.
+*
+      END
+      SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT )
+*
+*  Tests the error exits from the Level 3 Blas.
+*  Requires a special version of the error-handling routine XERBLA.
+*  A, B and C should not need to be defined.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  3-19-92:  Initialize ALPHA, BETA, RALPHA, and RBETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to ZSYMM and ZHEMM
+*            with INFOT = 9  (eca)
+*  10-9-00:  Declared INTRINSIC DCMPLX (susan)
+*
+*     .. Scalar Arguments ..
+      INTEGER            ISNUM, NOUT
+      CHARACTER*6        SRNAMT
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0D0, TWO = 2.0D0 )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, BETA
+      DOUBLE PRECISION   RALPHA, RBETA
+*     .. Local Arrays ..
+      COMPLEX*16         A( 2, 1 ), B( 2, 1 ), C( 2, 1 )
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM,
+     $                   ZSYR2K, ZSYRK, ZTRMM, ZTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Executable Statements ..
+*     OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+*     if anything is wrong.
+      OK = .TRUE.
+*     LERR is set to .TRUE. by the special version of XERBLA each time
+*     it is called, and is then tested and re-set by CHKXER.
+      LERR = .FALSE.
+*
+*     Initialize ALPHA, BETA, RALPHA, and RBETA.
+*
+      ALPHA = DCMPLX( ONE, -ONE )
+      BETA = DCMPLX( TWO, -TWO )
+      RALPHA = ONE
+      RBETA = TWO
+*
+      GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+     $        90 )ISNUM
+   10 INFOT = 1
+      CALL ZGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 1
+      CALL ZGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 1
+      CALL ZGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 8
+      CALL ZGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 13
+      CALL ZGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   20 INFOT = 1
+      CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   30 INFOT = 1
+      CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   40 INFOT = 1
+      CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   50 INFOT = 1
+      CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 5
+      CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 6
+      CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 11
+      CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   60 INFOT = 1
+      CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   70 INFOT = 1
+      CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 10
+      CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   80 INFOT = 1
+      CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      GO TO 100
+   90 INFOT = 1
+      CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 2
+      CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 3
+      CALL ZSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 4
+      CALL ZSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 7
+      CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 9
+      CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+      INFOT = 12
+      CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+  100 IF( OK )THEN
+         WRITE( NOUT, FMT = 9999 )SRNAMT
+      ELSE
+         WRITE( NOUT, FMT = 9998 )SRNAMT
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****',
+     $      '**' )
+*
+*     End of ZCHKE.
+*
+      END
+      SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+     $                  TRANSL )
+*
+*  Generates values for an M by N matrix A.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'GE', 'HE', 'SY' or 'TR'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      COMPLEX*16         ROGUE
+      PARAMETER          ( ROGUE = ( -1.0D10, 1.0D10 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+      DOUBLE PRECISION   RROGUE
+      PARAMETER          ( RROGUE = -1.0D10 )
+*     .. Scalar Arguments ..
+      COMPLEX*16         TRANSL
+      INTEGER            LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J, JJ
+      LOGICAL            GEN, HER, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      COMPLEX*16         ZBEG
+      EXTERNAL           ZBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, DCONJG, DBLE
+*     .. Executable Statements ..
+      GEN = TYPE.EQ.'GE'
+      HER = TYPE.EQ.'HE'
+      SYM = TYPE.EQ.'SY'
+      TRI = TYPE.EQ.'TR'
+      UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               A( I, J ) = ZBEG( RESET ) + TRANSL
+               IF( I.NE.J )THEN
+*                 Set some elements to zero
+                  IF( N.GT.3.AND.J.EQ.N/2 )
+     $               A( I, J ) = ZERO
+                  IF( HER )THEN
+                     A( J, I ) = DCONJG( A( I, J ) )
+                  ELSE IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( HER )
+     $      A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO )
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'GE' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+         DO 90 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 60 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   70       CONTINUE
+            DO 80 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+            IF( HER )THEN
+               JJ = J + ( J - 1 )*LDA
+               AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE )
+            END IF
+   90    CONTINUE
+      END IF
+      RETURN
+*
+*     End of ZMAKE.
+*
+      END
+      SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+     $                  BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+     $                  NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO, RONE
+      PARAMETER          ( RZERO = 0.0D0, RONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      DOUBLE PRECISION   EPS, ERR
+      INTEGER            KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANSA, TRANSB
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * ),
+     $                   CC( LDCC, * ), CT( * )
+      DOUBLE PRECISION   G( * )
+*     .. Local Scalars ..
+      COMPLEX*16         CL
+      DOUBLE PRECISION   ERRI
+      INTEGER            I, J, K
+      LOGICAL            CTRANA, CTRANB, TRANA, TRANB
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DIMAG, DCONJG, MAX, DBLE, SQRT
+*     .. Statement Functions ..
+      DOUBLE PRECISION   ABS1
+*     .. Statement Function definitions ..
+      ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) )
+*     .. Executable Statements ..
+      TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+      TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+      CTRANA = TRANSA.EQ.'C'
+      CTRANB = TRANSB.EQ.'C'
+*
+*     Compute expected result, one column at a time, in CT using data
+*     in A, B and C.
+*     Compute gauges in G.
+*
+      DO 220 J = 1, N
+*
+         DO 10 I = 1, M
+            CT( I ) = ZERO
+            G( I ) = RZERO
+   10    CONTINUE
+         IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+            DO 30 K = 1, KK
+               DO 20 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( K, J )
+                  G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) )
+   20          CONTINUE
+   30       CONTINUE
+         ELSE IF( TRANA.AND..NOT.TRANB )THEN
+            IF( CTRANA )THEN
+               DO 50 K = 1, KK
+                  DO 40 I = 1, M
+                     CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J )
+                     G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                        ABS1( B( K, J ) )
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 70 K = 1, KK
+                  DO 60 I = 1, M
+                     CT( I ) = CT( I ) + A( K, I )*B( K, J )
+                     G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                        ABS1( B( K, J ) )
+   60             CONTINUE
+   70          CONTINUE
+            END IF
+         ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+            IF( CTRANB )THEN
+               DO 90 K = 1, KK
+                  DO 80 I = 1, M
+                     CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) )
+                     G( I ) = G( I ) + ABS1( A( I, K ) )*
+     $                        ABS1( B( J, K ) )
+   80             CONTINUE
+   90          CONTINUE
+            ELSE
+               DO 110 K = 1, KK
+                  DO 100 I = 1, M
+                     CT( I ) = CT( I ) + A( I, K )*B( J, K )
+                     G( I ) = G( I ) + ABS1( A( I, K ) )*
+     $                        ABS1( B( J, K ) )
+  100             CONTINUE
+  110          CONTINUE
+            END IF
+         ELSE IF( TRANA.AND.TRANB )THEN
+            IF( CTRANA )THEN
+               IF( CTRANB )THEN
+                  DO 130 K = 1, KK
+                     DO 120 I = 1, M
+                        CT( I ) = CT( I ) + DCONJG( A( K, I ) )*
+     $                            DCONJG( B( J, K ) )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  120                CONTINUE
+  130             CONTINUE
+               ELSE
+                  DO 150 K = 1, KK
+                     DO 140 I = 1, M
+                        CT( I ) = CT( I ) + DCONJG( A( K, I ) )*
+     $                            B( J, K )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  140                CONTINUE
+  150             CONTINUE
+               END IF
+            ELSE
+               IF( CTRANB )THEN
+                  DO 170 K = 1, KK
+                     DO 160 I = 1, M
+                        CT( I ) = CT( I ) + A( K, I )*
+     $                            DCONJG( B( J, K ) )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  160                CONTINUE
+  170             CONTINUE
+               ELSE
+                  DO 190 K = 1, KK
+                     DO 180 I = 1, M
+                        CT( I ) = CT( I ) + A( K, I )*B( J, K )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  180                CONTINUE
+  190             CONTINUE
+               END IF
+            END IF
+         END IF
+         DO 200 I = 1, M
+            CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+            G( I ) = ABS1( ALPHA )*G( I ) +
+     $               ABS1( BETA )*ABS1( C( I, J ) )
+  200    CONTINUE
+*
+*        Compute the error ratio for this result.
+*
+         ERR = ZERO
+         DO 210 I = 1, M
+            ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS
+            IF( G( I ).NE.RZERO )
+     $         ERRI = ERRI/G( I )
+            ERR = MAX( ERR, ERRI )
+            IF( ERR*SQRT( EPS ).GE.RONE )
+     $         GO TO 230
+  210    CONTINUE
+*
+  220 CONTINUE
+*
+*     If the loop completes, all results are at least half accurate.
+      GO TO 250
+*
+*     Report fatal error.
+*
+  230 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 240 I = 1, M
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+         END IF
+  240 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9997 )J
+*
+  250 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'                       EXPECTED RE',
+     $      'SULT                    COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( '  (', G15.6, ',', G15.6, ')' ) )
+ 9997 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+*     End of ZMMCH.
+*
+      END
+      LOGICAL FUNCTION LZE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      COMPLEX*16         RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LZE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LZE = .FALSE.
+   30 RETURN
+*
+*     End of LZE.
+*
+      END
+      LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE' or 'HE' or 'SY'.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX*16         AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+      LZERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LZERES = .FALSE.
+   80 RETURN
+*
+*     End of LZERES.
+*
+      END
+      COMPLEX*16     FUNCTION ZBEG( RESET )
+*
+*  Generates complex numbers as pairs of random numbers uniformly
+*  distributed between -0.5 and 0.5.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, J, MI, MJ
+*     .. Save statement ..
+      SAVE               I, IC, J, MI, MJ
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         MJ = 457
+         I = 7
+         J = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I or J is bounded between 1 and 999.
+*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I or J = 4 or 8, the period will be 25.
+*     If initial I or J = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I or J
+*     in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      J = J*MJ
+      I = I - 1000*( I/1000 )
+      J = J - 1000*( J/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 )
+      RETURN
+*
+*     End of ZBEG.
+*
+      END
+      DOUBLE PRECISION FUNCTION DDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   X, Y
+*     .. Executable Statements ..
+      DDIFF = X - Y
+      RETURN
+*
+*     End of DDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A6, ' *****' )
+*
+*     End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 3 BLAS
+*  routines.
+*
+*  XERBLA  is an error handler for the Level 3 BLAS routines.
+*
+*  It is called by the Level 3 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*6        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*6        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
+     $      'AD OF ', A6, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*     End of XERBLA
+*
+      END
+

diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp
new file mode 100644
index 0000000..c373e86
--- /dev/null
+++ b/blas/xerbla.cpp

@@ -0,0 +1,23 @@
+
+#include <stdio.h>
+
+#if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__)
+#define EIGEN_WEAK_LINKING __attribute__ ((weak))
+#else
+#define EIGEN_WEAK_LINKING
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+EIGEN_WEAK_LINKING int xerbla_(const char * msg, int *info, int)
+{
+  printf("Eigen BLAS ERROR #%i: %s\n", *info, msg );
+  return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif

diff --git a/ci/CTest2JUnit.xsl b/ci/CTest2JUnit.xsl
new file mode 100644
index 0000000..8ba21f4
--- /dev/null
+++ b/ci/CTest2JUnit.xsl

@@ -0,0 +1,120 @@
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="xml" indent="yes"/>
+    <xsl:template match="/Site">
+	<xsl:variable name="Name"><xsl:value-of select="@Name"/></xsl:variable>
+	<xsl:variable name="Hostname"><xsl:value-of select="@Hostname"/></xsl:variable>
+	<xsl:variable name="TestCount"><xsl:value-of select="count(//TestList/Test)"/> </xsl:variable>
+	<xsl:variable name="ErrorCount"><xsl:value-of select="count(//TestList/Test[@Status='error'])"/> </xsl:variable>
+	<xsl:variable name="FailureCount"><xsl:value-of select="count(//Testing/Test[@Status='failed'])"/> </xsl:variable>
+	<testsuite name="{$Name}" hostname="{$Hostname}" errors="0" failures="{$FailureCount}" tests="{$TestCount}">
+	    <xsl:variable name="BuildName"><xsl:value-of select="@BuildName"/></xsl:variable>
+	    <xsl:variable name="BuildStamp"><xsl:value-of select="@BuildStamp"/></xsl:variable>
+	    <xsl:variable name="Generator"><xsl:value-of select="@Generator"/></xsl:variable>
+	    <xsl:variable name="CompilerName"><xsl:value-of select="@CompilerName"/></xsl:variable>
+	    <xsl:variable name="OSName"><xsl:value-of select="@OSName"/></xsl:variable>
+	    <xsl:variable name="OSRelease"><xsl:value-of select="@OSRelease"/></xsl:variable>
+	    <xsl:variable name="OSVersion"><xsl:value-of select="@OSVersion"/></xsl:variable>
+	    <xsl:variable name="OSPlatform"><xsl:value-of select="@OSPlatform"/></xsl:variable>
+	    <xsl:variable name="Is64Bits"><xsl:value-of select="@Is64Bits"/></xsl:variable>
+	    <xsl:variable name="VendorString"><xsl:value-of select="@VendorString"/></xsl:variable>
+	    <xsl:variable name="VendorID"><xsl:value-of select="@VendorID"/></xsl:variable>
+	    <xsl:variable name="FamilyID"><xsl:value-of select="@FamilyID"/></xsl:variable>
+	    <xsl:variable name="ModelID"><xsl:value-of select="@ModelID"/></xsl:variable>
+	    <xsl:variable name="ProcessorCacheSize"><xsl:value-of select="@ProcessorCacheSize"/></xsl:variable>
+	    <xsl:variable name="NumberOfLogicalCPU"><xsl:value-of select="@NumberOfLogicalCPU"/></xsl:variable>
+	    <xsl:variable name="NumberOfPhysicalCPU"><xsl:value-of select="@NumberOfPhysicalCPU"/></xsl:variable>
+	    <xsl:variable name="TotalVirtualMemory"><xsl:value-of select="@TotalVirtualMemory"/></xsl:variable>
+	    <xsl:variable name="TotalPhysicalMemory"><xsl:value-of select="@TotalPhysicalMemory"/></xsl:variable>
+	    <xsl:variable name="LogicalProcessorsPerPhysical"><xsl:value-of select="@LogicalProcessorsPerPhysical"/></xsl:variable>
+	    <xsl:variable name="ProcessorClockFrequency"><xsl:value-of select="@ProcessorClockFrequency"/></xsl:variable>
+	    <properties>
+		<property name="BuildName" value="{$BuildName}" />
+		<property name="BuildStamp" value="{$BuildStamp}" />
+		<property name="Name" value="{$Name}" />
+		<property name="Generator" value="{$Generator}" />
+		<property name="CompilerName" value="{$CompilerName}" />
+		<property name="OSName" value="{$OSName}" />
+		<property name="Hostname" value="{$Hostname}" />
+		<property name="OSRelease" value="{$OSRelease}" />
+		<property name="OSVersion" value="{$OSVersion}" />
+		<property name="OSPlatform" value="{$OSPlatform}" />
+		<property name="Is64Bits" value="{$Is64Bits}" />
+		<property name="VendorString" value="{$VendorString}" />
+		<property name="VendorID" value="{$VendorID}" />
+		<property name="FamilyID" value="{$FamilyID}" />
+		<property name="ModelID" value="{$ModelID}" />
+		<property name="ProcessorCacheSize" value="{$ProcessorCacheSize}" />
+		<property name="NumberOfLogicalCPU" value="{$NumberOfLogicalCPU}" />
+		<property name="NumberOfPhysicalCPU" value="{$NumberOfPhysicalCPU}" />
+		<property name="TotalVirtualMemory" value="{$TotalVirtualMemory}" />
+		<property name="TotalPhysicalMemory" value="{$TotalPhysicalMemory}" />
+		<property name="LogicalProcessorsPerPhysical" value="{$LogicalProcessorsPerPhysical}" />
+		<property name="ProcessorClockFrequency" value="{$ProcessorClockFrequency}" />
+	    </properties>
+	    <xsl:apply-templates select="Testing/Test"/>
+
+	    <system-out>
+		BuildName: <xsl:value-of select="$BuildName" />
+		BuildStamp: <xsl:value-of select="$BuildStamp" />
+		Name: <xsl:value-of select="$Name" />
+		Generator: <xsl:value-of select="$Generator" />
+		CompilerName: <xsl:value-of select="$CompilerName" />
+		OSName: <xsl:value-of select="$OSName" />
+		Hostname: <xsl:value-of select="$Hostname" />
+		OSRelease: <xsl:value-of select="$OSRelease" />
+		OSVersion: <xsl:value-of select="$OSVersion" />
+		OSPlatform: <xsl:value-of select="$OSPlatform" />
+		Is64Bits: <xsl:value-of select="$Is64Bits" />
+		VendorString: <xsl:value-of select="$VendorString" />
+		VendorID: <xsl:value-of select="$VendorID" />
+		FamilyID: <xsl:value-of select="$FamilyID" />
+		ModelID: <xsl:value-of select="$ModelID" />
+		ProcessorCacheSize: <xsl:value-of select="$ProcessorCacheSize" />
+		NumberOfLogicalCPU: <xsl:value-of select="$NumberOfLogicalCPU" />
+		NumberOfPhysicalCPU: <xsl:value-of select="$NumberOfPhysicalCPU" />
+		TotalVirtualMemory: <xsl:value-of select="$TotalVirtualMemory" />
+		TotalPhysicalMemory: <xsl:value-of select="$TotalPhysicalMemory" />
+		LogicalProcessorsPerPhysical: <xsl:value-of select="$LogicalProcessorsPerPhysical" />
+		ProcessorClockFrequency: <xsl:value-of select="$ProcessorClockFrequency" />
+	    </system-out>
+	</testsuite>
+    </xsl:template>
+
+    <xsl:template match="Testing/Test">
+	<xsl:variable name="testcasename"><xsl:value-of select= "Name"/></xsl:variable>
+	<xsl:variable name="testclassname"><xsl:value-of select= " concat('this', substring(Path,2))"/></xsl:variable>
+	<xsl:variable name="exectime">
+	    <xsl:for-each select="Results/NamedMeasurement">
+		<xsl:if test="@name = 'Execution Time'">
+		    <xsl:value-of select="."/>
+		</xsl:if>
+	    </xsl:for-each>
+	</xsl:variable>
+
+	<testcase name="{$testcasename}" classname="{$testclassname}" time="{$exectime}">
+	    <xsl:if test="@Status = 'passed'">
+	    </xsl:if>
+	    <xsl:if test="@Status = 'failed'">
+		<xsl:variable name="failtype">
+		    <xsl:for-each select="Results/NamedMeasurement">
+			<xsl:if test="@name = 'Exit Code'">
+			    <xsl:value-of select="."/>
+			</xsl:if>
+		    </xsl:for-each>
+		</xsl:variable>
+		<xsl:variable name="failcode">
+		    <xsl:for-each select="Results/NamedMeasurement">
+			<xsl:if test="@name = 'Exit Value'">
+			    <xsl:value-of select="."/>
+			</xsl:if>
+		    </xsl:for-each>
+		</xsl:variable>
+		<failure message="{$failtype} ({$failcode})"><xsl:value-of select="Results/Measurement/Value/text()" /></failure>
+	    </xsl:if>
+	    <xsl:if test="@Status = 'notrun'">
+		<skipped><xsl:value-of select="Results/Measurement/Value/text()" /></skipped>
+	    </xsl:if>
+	</testcase>
+    </xsl:template>
+
+</xsl:stylesheet>

diff --git a/ci/README.md b/ci/README.md
new file mode 100644
index 0000000..8395b16
--- /dev/null
+++ b/ci/README.md

@@ -0,0 +1,56 @@
+## Eigen CI infrastructure
+
+Eigen's CI infrastructure uses two stages: A `build` stage to build the unit-test
+suite and a `test` stage to run the unit-tests.
+
+### Build Stage
+
+The build stage consists of the following jobs:
+
+| Job Name                                 | Arch      | OS             | Compiler   | C++11   |
+|------------------------------------------|-----------|----------------|------------|---------|
+| `build:x86-64:linux:gcc-4.8:cxx11-off`   | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `Off`   |
+| `build:x86-64:linux:gcc-4.8:cxx11-on`    | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `On`    |
+| `build:x86-64:linux:gcc-9:cxx11-off`     | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `Off`   |
+| `build:x86-64:linux:gcc-9:cxx11-on`      | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `On`    |
+| `build:x86-64:linux:gcc-10:cxx11-off`    | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `Off`   |
+| `build:x86-64:linux:gcc-10:cxx11-on`     | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `On`    |
+| `build:x86-64:linux:clang-10:cxx11-off`  | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `Off`   |
+| `build:x86-64:linux:clang-10:cxx11-on`   | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `On`    |
+| `build:aarch64:linux:gcc-10:cxx11-off`   | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `Off`   |
+| `build:aarch64:linux:gcc-10:cxx11-on`    | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `On`    |
+| `build:aarch64:linux:clang-10:cxx11-off` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off`   |
+| `build:aarch64:linux:clang-10:cxx11-on`  | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On`    |
+
+### Test stage
+
+In principle every build-job has a corresponding test-job, however testing supported and unsupported modules is divided into separate jobs. The test jobs in detail:
+
+### Job dependecies
+
+| Job Name                                            | Arch      | OS             | Compiler   | C++11   | Module
+|-----------------------------------------------------|-----------|----------------|------------|---------|--------
+| `test:x86-64:linux:gcc-4.8:cxx11-off:official`      | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `Off`   | `Official`
+| `test:x86-64:linux:gcc-4.8:cxx11-off:unsupported`   | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `Off`   | `Unsupported`
+| `test:x86-64:linux:gcc-4.8:cxx11-on:official`       | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `On`    | `Official`
+| `test:x86-64:linux:gcc-4.8:cxx11-on:unsupported`    | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `On`    | `Unsupported`
+| `test:x86-64:linux:gcc-9:cxx11-off:official`        | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `Off`   | `Official`
+| `test:x86-64:linux:gcc-9:cxx11-off:unsupported`     | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `Off`   | `Unsupported`
+| `test:x86-64:linux:gcc-9:cxx11-on:official`         | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `On`    | `Official`
+| `test:x86-64:linux:gcc-9:cxx11-on:unsupported`      | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `On`    | `Unsupported`
+| `test:x86-64:linux:gcc-10:cxx11-off:official`       | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Official`
+| `test:x86-64:linux:gcc-10:cxx11-off:unsupported`    | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Unsupported`
+| `test:x86-64:linux:gcc-10:cxx11-on:official`        | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Official`
+| `test:x86-64:linux:gcc-10:cxx11-on:unsupported`     | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Unsupported`
+| `test:x86-64:linux:clang-10:cxx11-off:official`     | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Official`
+| `test:x86-64:linux:clang-10:cxx11-off:unsupported`  | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Unsupported`
+| `test:x86-64:linux:clang-10:cxx11-on:official`      | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `On`    | `Official`
+| `test:x86-64:linux:clang-10:cxx11-on:unsupported`   | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `On`    | `Unsupported`
+| `test:aarch64:linux:gcc-10:cxx11-off:official`      | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Official`
+| `test:aarch64:linux:gcc-10:cxx11-off:unsupported`   | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Unsupported`
+| `test:aarch64:linux:gcc-10:cxx11-on:official`       | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Official`
+| `test:aarch64:linux:gcc-10:cxx11-on:unsupported`    | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Unsupported`
+| `test:aarch64:linux:clang-10:cxx11-off:official`    | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Official`
+| `test:aarch64:linux:clang-10:cxx11-off:unsupported` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Unsupported`
+| `test:aarch64:linux:clang-10:cxx11-on:official`     | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On`    | `Official`
+| `test:aarch64:linux:clang-10:cxx11-on:unsupported`  | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On`    | `Unsupported`

diff --git a/ci/build.gitlab-ci.yml b/ci/build.gitlab-ci.yml
new file mode 100644
index 0000000..6b9f415
--- /dev/null
+++ b/ci/build.gitlab-ci.yml

@@ -0,0 +1,216 @@
+.build:linux:base:
+  stage: build
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y  ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build
+  script:
+    - mkdir -p ${BUILDDIR} && cd ${BUILDDIR}
+    - CXX=${EIGEN_CI_CXX_COMPILER} CC=${EIGEN_CI_CC_COMPILER} cmake -G
+      ${EIGEN_CI_CMAKE_GENEATOR} -DEIGEN_TEST_CXX11=${EIGEN_TEST_CXX11}
+      ${EIGEN_CI_ADDITIONAL_ARGS} ..
+    - cmake --build . --target buildtests
+  artifacts:
+    name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+    paths:
+      - ${BUILDDIR}/
+    expire_in: 5 days
+  only:
+    - schedules
+
+######## x86-64 ################################################################
+# GCC-4.8 (the oldest compiler we support)
+build:x86-64:linux:gcc-4.8:cxx11-off:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-4.8"
+    EIGEN_CI_CC_COMPILER: "gcc-4.8"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+build:x86-64:linux:gcc-4.8:cxx11-on:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-4.8"
+    EIGEN_CI_CC_COMPILER: "gcc-4.8"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+# GCC-9
+build:x86-64:linux:gcc-9:cxx11-off:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-9"
+    EIGEN_CI_CC_COMPILER: "gcc-9"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+build:x86-64:linux:gcc-9:cxx11-on:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-9"
+    EIGEN_CI_CC_COMPILER: "gcc-9"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+# GCC-10
+build:x86-64:linux:gcc-10:cxx11-off:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+build:x86-64:linux:gcc-10:cxx11-on:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+# Clang-10
+build:x86-64:linux:clang-10:cxx11-off:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+build:x86-64:linux:clang-10:cxx11-on:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - x86-64
+
+######## AArch64 ###############################################################
+# GCC-10
+build:aarch64:linux:gcc-10:cxx11-off:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - aarch64
+
+build:aarch64:linux:gcc-10:cxx11-on:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - aarch64
+
+# Clang-10
+build:aarch64:linux:clang-10:cxx11-off:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - aarch64
+
+build:aarch64:linux:clang-10:cxx11-on:
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - aarch64
+
+######## ppc64le ###############################################################
+# Currently all ppc64le jobs are allowed to fail
+
+# GCC-10
+build:ppc64le:linux:gcc-10:cxx11-off:
+  allow_failure: true
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - ppc64le
+
+build:ppc64le:linux:gcc-10:cxx11-on:
+  allow_failure: true
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - ppc64le
+
+# # Clang-10
+build:ppc64le:linux:clang-10:cxx11-off:
+  allow_failure: true
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "off"
+  tags:
+    - eigen-runner
+    - linux
+    - ppc64le
+
+build:ppc64le:linux:clang-10:cxx11-on:
+  allow_failure: true
+  extends: .build:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "on"
+  tags:
+    - eigen-runner
+    - linux
+    - ppc64le

diff --git a/ci/smoketests.gitlab-ci.yml b/ci/smoketests.gitlab-ci.yml
new file mode 100644
index 0000000..6384f10
--- /dev/null
+++ b/ci/smoketests.gitlab-ci.yml

@@ -0,0 +1,107 @@
+.buildsmoketests:linux:base:
+  stage: buildsmoketests
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y  ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build
+  script:
+    - mkdir -p ${BUILDDIR} && cd ${BUILDDIR}
+    - CXX=${EIGEN_CI_CXX_COMPILER} CC=${EIGEN_CI_CC_COMPILER} cmake -G
+      ${EIGEN_CI_CMAKE_GENEATOR} -DEIGEN_TEST_CXX11=${EIGEN_TEST_CXX11}
+      ${EIGEN_CI_ADDITIONAL_ARGS} ..
+    - cmake --build . --target buildsmoketests
+  artifacts:
+    name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+    paths:
+      - ${BUILDDIR}/
+    expire_in: 5 days
+  only:
+    - merge_requests
+
+buildsmoketests:x86-64:linux:gcc-10:cxx11-off:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "off"
+
+buildsmoketests:x86-64:linux:gcc-10:cxx11-on:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "on"
+
+buildsmoketests:x86-64:linux:clang-10:cxx11-off:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "off"
+
+buildsmoketests:x86-64:linux:clang-10:cxx11-on:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "on"
+
+.smoketests:linux:base:
+  stage: smoketests
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc
+  script:
+    - export CXX=${EIGEN_CI_CXX_COMPILER}
+    - export CC=${EIGEN_CI_CC_COMPILER}
+    - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output
+      --build-no-clean -T test -L smoketest
+  after_script:
+    - apt-get update -y
+    - apt-get install --no-install-recommends -y xsltproc
+    - cd ${BUILDDIR}
+    - xsltproc ../ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
+  artifacts:
+    reports:
+      junit:
+        - ${BUILDDIR}/JUnitTestResults_$CI_JOB_ID.xml
+    expire_in: 5 days
+  only:
+    - merge_requests
+
+smoketests:x86-64:linux:gcc-10:cxx11-off:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-off" ]
+
+smoketests:x86-64:linux:gcc-10:cxx11-on:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-on" ]
+
+smoketests:x86-64:linux:clang-10:cxx11-off:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-off" ]
+
+smoketests:x86-64:linux:clang-10:cxx11-on:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-on" ]

diff --git a/ci/test.gitlab-ci.yml b/ci/test.gitlab-ci.yml
new file mode 100644
index 0000000..2a0f5dd
--- /dev/null
+++ b/ci/test.gitlab-ci.yml

@@ -0,0 +1,388 @@
+.test:linux:base:
+  stage: test
+  image: ubuntu:18.04
+  retry: 2
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc
+  script:
+    - export CXX=${EIGEN_CI_CXX_COMPILER}
+    - export CC=${EIGEN_CI_CC_COMPILER}
+    - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output
+      --build-no-clean -T test -L ${EIGEN_CI_TEST_LABEL}
+  after_script:
+    - apt-get update -y
+    - apt-get install --no-install-recommends -y xsltproc
+    - cd ${BUILDDIR}
+    - xsltproc ../ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
+  artifacts:
+    reports:
+      junit:
+        - ${BUILDDIR}/JUnitTestResults_$CI_JOB_ID.xml
+    expire_in: 5 days
+  only:
+    - schedules
+
+##### x86-64 ###################################################################
+# GCC-4.8
+.test:x86-64:linux:gcc-4.8:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-4.8
+    EIGEN_CI_CC_COMPILER: gcc-4.8
+  needs: [ "build:x86-64:linux:gcc-4.8:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-4.8:cxx11-off:official:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-4.8:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:gcc-4.8:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-4.8
+    EIGEN_CI_CC_COMPILER: gcc-4.8
+  needs: [ "build:x86-64:linux:gcc-4.8:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-4.8:cxx11-on:official:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-4.8:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# GCC-9
+.test:x86-64:linux:gcc-9:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-9
+    EIGEN_CI_CC_COMPILER: gcc-9
+  needs: [ "build:x86-64:linux:gcc-9:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-9:cxx11-off:official:
+  extends: .test:x86-64:linux:gcc-9:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-9:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:gcc-9:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:gcc-9:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-9
+    EIGEN_CI_CC_COMPILER: gcc-9
+  needs: [ "build:x86-64:linux:gcc-9:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-9:cxx11-on:official:
+  extends: .test:x86-64:linux:gcc-9:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-9:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:gcc-9:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# GCC-10
+.test:x86-64:linux:gcc-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:x86-64:linux:gcc-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-10:cxx11-off:official:
+  extends: .test:x86-64:linux:gcc-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-10:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:gcc-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:gcc-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:x86-64:linux:gcc-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-10:cxx11-on:official:
+  extends: .test:x86-64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-10:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# Clang 10
+.test:x86-64:linux:clang-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:x86-64:linux:clang-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:clang-10:cxx11-off:official:
+  extends: .test:x86-64:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:clang-10:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:clang-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:x86-64:linux:clang-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:clang-10:cxx11-on:official:
+  extends: .test:x86-64:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:clang-10:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+##### AArch64 ##################################################################
+# GCC-10
+.test:aarch64:linux:gcc-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:aarch64:linux:gcc-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:gcc-10:cxx11-off:official:
+  extends: .test:aarch64:linux:gcc-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:gcc-10:cxx11-off:unsupported:
+  extends: .test:aarch64:linux:gcc-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:aarch64:linux:gcc-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:aarch64:linux:gcc-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:gcc-10:cxx11-on:official:
+  extends: .test:aarch64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:gcc-10:cxx11-on:unsupported:
+  extends: .test:aarch64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# Clang 10
+.test:aarch64:linux:clang-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:aarch64:linux:clang-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:clang-10:cxx11-off:official:
+  extends: .test:aarch64:linux:clang-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:clang-10:cxx11-off:unsupported:
+  extends: .test:aarch64:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:aarch64:linux:clang-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:aarch64:linux:clang-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:clang-10:cxx11-on:official:
+  extends: .test:aarch64:linux:clang-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:clang-10:cxx11-on:unsupported:
+  extends: .test:aarch64:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+##### ppc64le ##################################################################
+# GCC-10
+.test:ppc64le:linux:gcc-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:ppc64le:linux:gcc-10:cxx11-off" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:gcc-10:cxx11-off:official:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:gcc-10:cxx11-off:unsupported:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:ppc64le:linux:gcc-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:ppc64le:linux:gcc-10:cxx11-on" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:gcc-10:cxx11-on:official:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:gcc-10:cxx11-on:unsupported:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# # Clang 10
+.test:ppc64le:linux:clang-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:ppc64le:linux:clang-10:cxx11-off" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:clang-10:cxx11-off:official:
+  extends: .test:ppc64le:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:clang-10:cxx11-off:unsupported:
+  extends: .test:ppc64le:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:ppc64le:linux:clang-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:ppc64le:linux:clang-10:cxx11-on" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:clang-10:cxx11-on:official:
+  extends: .test:ppc64le:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:clang-10:cxx11-on:unsupported:
+  extends: .test:ppc64le:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"

diff --git a/cmake/ComputeCppCompilerChecks.cmake b/cmake/ComputeCppCompilerChecks.cmake
new file mode 100644
index 0000000..1807485
--- /dev/null
+++ b/cmake/ComputeCppCompilerChecks.cmake

@@ -0,0 +1,50 @@
+cmake_minimum_required(VERSION 3.4.3)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
+    message(FATAL_ERROR "host compiler - gcc version must be > 4.8")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
+    message(FATAL_ERROR "host compiler - clang version must be > 3.6")
+  endif()
+endif()
+
+if(MSVC)
+  set(ComputeCpp_STL_CHECK_SRC __STL_check)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
+    "#include <ios>\n"
+    "int main() { return 0; }\n")
+  execute_process(
+    COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
+            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+            -isystem ${ComputeCpp_INCLUDE_DIRS}
+            -o ${ComputeCpp_STL_CHECK_SRC}.sycl
+            -c ${ComputeCpp_STL_CHECK_SRC}.cpp
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
+    ERROR_QUIET
+    OUTPUT_QUIET)
+  if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
+    # Try disabling compiler version checks
+    execute_process(
+      COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
+              ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+              -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
+              -isystem ${ComputeCpp_INCLUDE_DIRS}
+              -o ${ComputeCpp_STL_CHECK_SRC}.cpp.sycl
+              -c ${ComputeCpp_STL_CHECK_SRC}.cpp
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
+      ERROR_QUIET
+      OUTPUT_QUIET)
+    if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
+      message(STATUS "Device compiler cannot consume hosted STL headers. Using any parts of the STL will likely result in device compiler errors.")
+    else()
+    message(STATUS "Device compiler does not meet certain STL version requirements. Disabling version checks and hoping for the best.")
+      list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH)
+    endif()
+  endif()
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
+              ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp.sycl)
+endif(MSVC)

diff --git a/cmake/ComputeCppIRMap.cmake b/cmake/ComputeCppIRMap.cmake
new file mode 100644
index 0000000..942d91d
--- /dev/null
+++ b/cmake/ComputeCppIRMap.cmake

@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.4.3)
+
+# These should match the types of IR output by compute++
+set(IR_MAP_spir bc)
+set(IR_MAP_spir64 bc)
+set(IR_MAP_spir32 bc)
+set(IR_MAP_spirv spv)
+set(IR_MAP_spirv64 spv)
+set(IR_MAP_spirv32 spv)
+set(IR_MAP_aorta-x86_64 o)
+set(IR_MAP_aorta-aarch64 o)
+set(IR_MAP_aorta-rcar-cve o)
+set(IR_MAP_custom-spir64 bc)
+set(IR_MAP_custom-spir32 bc)
+set(IR_MAP_custom-spirv64 spv)
+set(IR_MAP_custom-spirv32 spv)
+set(IR_MAP_ptx64 s)
+set(IR_MAP_amdgcn s)

diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in
new file mode 100644
index 0000000..0a1ac61
--- /dev/null
+++ b/cmake/Eigen3Config.cmake.in

@@ -0,0 +1,23 @@
+# This file exports the Eigen3::Eigen CMake target which should be passed to the
+# target_link_libraries command.
+
+@PACKAGE_INIT@
+
+if (NOT TARGET eigen)
+  include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+endif ()
+
+# Legacy variables, do *not* use. May be removed in the future.
+
+set (EIGEN3_FOUND 1)
+set (EIGEN3_USE_FILE    "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake")
+
+set (EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@")
+set (EIGEN3_INCLUDE_DIR  "@PACKAGE_EIGEN_INCLUDE_DIR@")
+set (EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@")
+set (EIGEN3_ROOT_DIR     "@PACKAGE_EIGEN_ROOT_DIR@")
+
+set (EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@")
+set (EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@")
+set (EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@")
+set (EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@")

diff --git a/cmake/Eigen3ConfigLegacy.cmake.in b/cmake/Eigen3ConfigLegacy.cmake.in
new file mode 100644
index 0000000..62d7224
--- /dev/null
+++ b/cmake/Eigen3ConfigLegacy.cmake.in

@@ -0,0 +1,30 @@
+#                                               -*- cmake -*-
+#
+#  Eigen3Config.cmake(.in)
+
+# Use the following variables to compile and link against Eigen:
+#  EIGEN3_FOUND              - True if Eigen was found on your system
+#  EIGEN3_USE_FILE           - The file making Eigen usable
+#  EIGEN3_DEFINITIONS        - Definitions needed to build with Eigen
+#  EIGEN3_INCLUDE_DIR        - Directory where signature_of_eigen3_matrix_library can be found
+#  EIGEN3_INCLUDE_DIRS       - List of directories of Eigen and it's dependencies
+#  EIGEN3_ROOT_DIR           - The base directory of Eigen
+#  EIGEN3_VERSION_STRING     - A human-readable string containing the version
+#  EIGEN3_VERSION_MAJOR      - The major version of Eigen
+#  EIGEN3_VERSION_MINOR      - The minor version of Eigen
+#  EIGEN3_VERSION_PATCH      - The patch version of Eigen
+
+@PACKAGE_INIT@
+
+set ( EIGEN3_FOUND 1 )
+set ( EIGEN3_USE_FILE     "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" )
+
+set ( EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@" )
+set ( EIGEN3_INCLUDE_DIR  "@PACKAGE_EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_ROOT_DIR     "@PACKAGE_EIGEN_ROOT_DIR@" )
+
+set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
+set ( EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@" )
+set ( EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@" )
+set ( EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@" )

diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
new file mode 100644
index 0000000..9cb3bb2
--- /dev/null
+++ b/cmake/EigenConfigureTesting.cmake

@@ -0,0 +1,58 @@
+include(EigenTesting)
+include(CheckCXXSourceCompiles)
+
+# configure the "site" and "buildname" 
+ei_set_sitename()
+
+# retrieve and store the build string
+ei_set_build_string()
+
+add_custom_target(buildtests)
+add_custom_target(check COMMAND "ctest")
+add_dependencies(check buildtests)
+
+# check whether /bin/bash exists (disabled as not used anymore)
+# find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH)
+
+# This call activates testing and generates the DartConfiguration.tcl
+include(CTest)
+
+set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests")
+set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built in dashboard mode, default is buildtests")
+set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out")
+
+# Overwrite default DartConfiguration.tcl such that ctest can build our unit tests.
+# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target.
+# At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable.
+file(READ  "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE)
+# try to grab the default flags
+string(REGEX MATCH "MakeCommand:.*-- (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE})
+if(NOT CMAKE_MATCH_1)
+string(REGEX MATCH "MakeCommand:.*[^c]make (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE})
+endif()
+string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . --target ${EIGEN_DASHBOARD_BUILD_TARGET} --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType"
+       EIGEN_DART_CONFIG_FILE2 ${EIGEN_DART_CONFIG_FILE})
+file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" ${EIGEN_DART_CONFIG_FILE2})
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake)
+
+# some documentation of this function would be nice
+ei_init_testing()
+
+# configure Eigen related testing options
+option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF)
+option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF)
+  if(EIGEN_COVERAGE_TESTING)
+    set(COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage")
+    set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
+  endif()
+  
+elseif(MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
+endif()
+
+

diff --git a/cmake/EigenDetermineOSVersion.cmake b/cmake/EigenDetermineOSVersion.cmake
new file mode 100644
index 0000000..9246fa6
--- /dev/null
+++ b/cmake/EigenDetermineOSVersion.cmake

@@ -0,0 +1,46 @@
+# The utility function DetermineOSVersion aims at providing an
+# improved version of the CMake variable ${CMAKE_SYSTEM} on Windows
+# machines.
+#
+# Usage:
+#  include(EigenDetermineOSVersion)
+#  DetermineOSVersion(OS_VERSION)
+#  message("OS: ${OS_VERSION}")
+
+# - A little helper variable which should not be directly called
+function(DetermineShortWindowsName WIN_VERSION win_num_version)
+   if    (${win_num_version} VERSION_EQUAL "6.1")
+       set(_version "win7")
+   elseif(${win_num_version} VERSION_EQUAL "6.0")
+       set(_version "winVista")
+   elseif(${win_num_version} VERSION_EQUAL "5.2")
+       set(_version "winXpProf")
+   elseif(${win_num_version} VERSION_EQUAL "5.1")
+       set(_version "winXp")
+   elseif(${win_num_version} VERSION_EQUAL "5.0")
+       set(_version "win2000Prof")
+   else()
+       set(_version "unknownWin")
+   endif()
+   set(${WIN_VERSION} ${_version} PARENT_SCOPE)
+endfunction()
+
+function(DetermineOSVersion OS_VERSION)
+  if (WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
+    file (TO_NATIVE_PATH "$ENV{COMSPEC}" SHELL)
+    exec_program( ${SHELL} ARGS "/c" "ver" OUTPUT_VARIABLE ver_output)
+				
+      string(REGEX MATCHALL "[0-9]+"
+           ver_list "${ver_output}")
+      list(GET ver_list 0 _major)		   
+      list(GET ver_list 1 _minor)
+				
+    set(win_num_version ${_major}.${_minor})
+    DetermineShortWindowsName(win_version "${win_num_version}")
+    if(win_version)
+      set(${OS_VERSION} ${win_version} PARENT_SCOPE)
+    endif()
+  else()
+    set(${OS_VERSION} ${CMAKE_SYSTEM} PARENT_SCOPE)
+  endif()
+endfunction()

diff --git a/cmake/EigenDetermineVSServicePack.cmake b/cmake/EigenDetermineVSServicePack.cmake
new file mode 100644
index 0000000..fed7819
--- /dev/null
+++ b/cmake/EigenDetermineVSServicePack.cmake

@@ -0,0 +1,41 @@
+include(CMakeDetermineVSServicePack)
+
+# The code is almost identical to the CMake version. The only difference is that we remove
+# _DetermineVSServicePack_FastCheckVersionWithCompiler which lead to errors on some systems.
+function(EigenDetermineVSServicePack _pack)
+    if(NOT DETERMINED_VS_SERVICE_PACK OR NOT ${_pack})
+        if(NOT DETERMINED_VS_SERVICE_PACK)
+            _DetermineVSServicePack_CheckVersionWithTryCompile(DETERMINED_VS_SERVICE_PACK _cl_version)
+            if(NOT DETERMINED_VS_SERVICE_PACK)
+                _DetermineVSServicePack_CheckVersionWithTryRun(DETERMINED_VS_SERVICE_PACK _cl_version)
+            endif()
+        endif()
+
+        if(DETERMINED_VS_SERVICE_PACK)
+            if(_cl_version)
+                # Call helper function to determine VS version
+                _DetermineVSServicePackFromCompiler(_sp "${_cl_version}")
+              
+                # temporary fix, until CMake catches up
+                if (NOT _sp)
+                    if(${_cl_version} VERSION_EQUAL "17.00.50727.1")
+                        set(_sp "vc110")
+                    elseif(${_cl_version} VERSION_EQUAL "17.00.51106.1")
+                        set(_sp "vc110sp1")
+                    elseif(${_cl_version} VERSION_EQUAL "17.00.60315.1")
+                        set(_sp "vc110sp2")
+                    elseif(${_cl_version} VERSION_EQUAL "17.00.60610.1")
+                        set(_sp "vc110sp3")
+                    else()
+                        set(_sp ${CMAKE_CXX_COMPILER_VERSION})
+                    endif()
+                endif()
+                
+                if(_sp)
+                    set(${_pack} ${_sp} CACHE INTERNAL
+                        "The Visual Studio Release with Service Pack")
+                endif()
+            endif()
+        endif()
+    endif()
+endfunction()

diff --git a/cmake/EigenSmokeTestList.cmake b/cmake/EigenSmokeTestList.cmake
new file mode 100644
index 0000000..6f0f724
--- /dev/null
+++ b/cmake/EigenSmokeTestList.cmake

@@ -0,0 +1,131 @@
+# List of tests that will be build and run during Eigen's smoke testing. If one
+# of these tests doesn't exists or cannot be build with the current configuration
+# it will just be skipped.
+set(ei_smoke_test_list
+  adjoint_1
+  alignedvector3
+  array_cwise_7
+  array_cwise_8
+  array_for_matrix_1
+  array_of_string
+  array_replicate_1
+  array_reverse_1
+  autodiff_1
+  autodiff_scalar_1
+  bandmatrix
+  bdcsvd_9
+  bessel_functions_1
+  bfloat16_float
+  blasutil_1
+  block_5
+  BVH
+  cholesky_1
+  cholmod_support_23
+  cholmod_support_24
+  conservative_resize_1
+  constructor_1
+  corners_1
+  ctorleakmiscmatrices_4
+  dense_storage
+  determinant_1
+  diagonal_1
+  diagonal_2
+  diagonalmatrices_1
+  dynalloc
+  eigensolver_complex_1
+  eigensolver_selfadjoint_8
+  EulerAngles_1
+  exceptions
+  fastmath
+  first_aligned
+  geo_alignedbox_2
+  geo_eulerangles_1
+  geo_homogeneous_1
+  geo_hyperplane_1
+  geo_orthomethods_1
+  geo_parametrizedline_1
+  geo_transformations_7
+  half_float
+  hessenberg_1
+  hessenberg_6qr_10
+  householder_8
+  indexed_view_1
+  inplace_decomposition_1
+  integer_types_1
+  inverse_1
+  is_same_dense
+  jacobi_1
+  jacobisvd_1
+  kronecker_product
+  linearstructure_1
+  mapped_matrix_1
+  mapstaticmethods_1
+  mapstride_1
+  matrix_square_root_1
+  meta
+  minres_2
+  miscmatrices_1
+  mixingtypes_7
+  nestbyvalue
+  nesting_ops_1
+  nomalloc_1
+  nullary_1
+  num_dimensions
+  NumericalDiff
+  numext
+  packetmath
+  permutationmatrices_1
+  polynomialsolver_1
+  prec_inverse_4x4_1
+  product_extra_5
+  product_selfadjoint_1
+  product_small_7
+  product_symm_1
+  product_syrk_1
+  product_trmm_1
+  product_trmv_1
+  product_trsolve_5
+  qr_1
+  qr_colpivoting_7
+  qr_fullpivoting_4
+  rand
+  real_qz_1
+  redux_1
+  ref_1
+  resize
+  rvalue_types_1
+  schur_complex_1
+  schur_real_1
+  selfadjoint_1
+  sizeof
+  sizeoverflow
+  smallvectors
+  sparse_basic_3
+  sparse_block_1
+  sparse_extra_4
+  sparse_permutations_2
+  sparse_product_4
+  sparse_ref_1
+  sparse_solvers_1
+  sparse_vector_1
+  special_functions_1
+  special_numbers_1
+  special_packetmath_1
+  spqr_support_2
+  stable_norm_1
+  stddeque_1
+  stddeque_overload_1
+  stdlist_1
+  stdlist_overload_1
+  stdvector_1
+  stdvector_overload_1
+  stl_iterators_1
+  swap_1
+  symbolic_index_1
+  triangular_1
+  type_aliaslu_9
+  umeyama_3
+  unalignedassert
+  unalignedcount
+  vectorwiseop_1
+  visitor_1)
\ No newline at end of file

diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
new file mode 100644
index 0000000..eb8457d
--- /dev/null
+++ b/cmake/EigenTesting.cmake

@@ -0,0 +1,782 @@
+
+macro(ei_add_property prop value)
+  get_property(previous GLOBAL PROPERTY ${prop})
+  if ((NOT previous) OR (previous STREQUAL ""))
+    set_property(GLOBAL PROPERTY ${prop} "${value}")
+  else()
+    set_property(GLOBAL PROPERTY ${prop} "${previous} ${value}")
+  endif()
+endmacro()
+
+#internal. See documentation of ei_add_test for details.
+macro(ei_add_test_internal testname testname_with_suffix)
+  set(targetname ${testname_with_suffix})
+
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+  else()
+    set(filename ${testname}.cpp)
+  endif()
+
+  # Add the current target to the list of subtest targets
+  get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
+  set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n")
+  set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
+    if(EIGEN_TEST_HIP)
+      hip_reset_flags()
+      hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}")
+    elseif(EIGEN_TEST_CUDA_CLANG)
+      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
+      
+      if(CUDA_64_BIT_DEVICE_CODE AND (EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64"))
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+      else()
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
+      endif()
+
+      if (${ARGC} GREATER 2)
+        add_executable(${targetname} ${filename})
+      else()
+        add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+      endif()
+      set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread")
+      if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+      set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt")
+      endif()
+      target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES})
+    else()
+      if (${ARGC} GREATER 2)
+        cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+      else()
+        cuda_add_executable(${targetname} ${filename})
+      endif()
+    endif()
+  else()
+    add_executable(${targetname} ${filename})
+  endif()
+
+  if (targetname MATCHES "^eigen2_")
+    add_dependencies(eigen2_buildtests ${targetname})
+  else()
+    add_dependencies(buildtests ${targetname})
+  endif()
+
+  if(EIGEN_NO_ASSERTION_CHECKING)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1")
+  else()
+    if(EIGEN_DEBUG_ASSERTS)
+      ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
+    endif()
+  endif()
+
+  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
+
+  if(MSVC)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
+  endif()
+
+  # let the user pass flags.
+  if(${ARGC} GREATER 2)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
+  endif()
+
+  if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
+    ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
+  endif()
+
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  if(EXTERNAL_LIBS)
+    target_link_libraries(${targetname} ${EXTERNAL_LIBS})
+  endif()
+  if(EIGEN_TEST_CUSTOM_LINKER_FLAGS)
+    target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS})
+  endif()
+
+  if(${ARGC} GREATER 3)
+    set(libs_to_link ${ARGV3})
+    # it could be that some cmake module provides a bad library string " "  (just spaces),
+    # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors).
+    # so we check for strings containing only spaces.
+    string(STRIP "${libs_to_link}" libs_to_link_stripped)
+    string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length)
+    if(${libs_to_link_stripped_length} GREATER 0)
+      # notice: no double quotes around ${libs_to_link} here. It may be a list.
+      target_link_libraries(${targetname} ${libs_to_link})
+    endif()
+  endif()
+
+  add_test(${testname_with_suffix} "${targetname}")
+
+  # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT
+  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
+  if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
+    set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
+    add_dependencies("Build${current_subproject}" ${targetname})
+    set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}")
+  endif()
+  if(EIGEN_SYCL)
+    # Force include of the SYCL file at the end to avoid errors.
+    set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1)
+    # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags
+    # to the device compiler.
+    get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS)
+    separate_arguments(target_compile_flags)
+    foreach(flag ${target_compile_flags})
+      if(${flag} MATCHES "^-D.*")
+        string(REPLACE "-D" "" definition_flag ${flag})
+        set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag})
+        list(REMOVE_ITEM target_compile_flags ${flag})
+      endif()
+    endforeach()
+    set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags})
+    # Link against pthread and add sycl to target
+    set(THREADS_PREFER_PTHREAD_FLAG ON)
+    find_package(Threads REQUIRED)
+    target_link_libraries(${targetname} Threads::Threads)
+    add_sycl_to_target(TARGET ${targetname} SOURCES ${filename})
+  endif(EIGEN_SYCL)
+endmacro(ei_add_test_internal)
+# Macro to add a test
+#
+# the unique mandatory parameter testname must correspond to a file
+# <testname>.cpp which follows this pattern:
+#
+# #include "main.h"
+# void test_<testname>() { ... }
+#
+# Depending on the contents of that file, this macro can have 2 behaviors,
+# see below.
+#
+# The optional 2nd parameter is libraries to link to.
+#
+# A. Default behavior
+#
+# this macro adds an executable <testname> as well as a ctest test
+# named <testname> too.
+#
+# On platforms with bash simply run:
+#   "ctest -V" or "ctest -V -R <testname>"
+# On other platform use ctest as usual
+#
+# B. Multi-part behavior
+#
+# If the source file matches the regexp
+#    CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+
+# then it is interpreted as a multi-part test. The behavior then depends on the
+# CMake option EIGEN_SPLIT_LARGE_TESTS, which is ON by default.
+#
+# If EIGEN_SPLIT_LARGE_TESTS is OFF, the behavior is the same as in A (the multi-part
+# aspect is ignored).
+#
+# If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables
+#   test_<testname>_<N>
+# where N runs from 1 to the greatest occurrence found in the source file. Each of these
+# executables is built passing -DEIGEN_TEST_PART_N. This allows to split large tests
+# into smaller executables.
+#
+# Moreover, targets <testname> are still generated, they
+# have the effect of building all the parts of the test.
+#
+# Again, ctest -R allows to run all matching tests.
+macro(ei_add_test testname)
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n")
+  set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+
+  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+  else()
+    set(filename ${testname}.cpp)
+  endif()
+
+  file(READ "${filename}" test_source)
+  string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
+         occurrences "${test_source}")
+  string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}")
+  list(REMOVE_DUPLICATES suffixes)
+  set(explicit_suffixes "")
+  if( (NOT EIGEN_SPLIT_LARGE_TESTS) AND suffixes)
+    # Check whether we have EIGEN_TEST_PART_* statements, in which case we likely must enforce splitting.
+    # For instance, indexed_view activate a different c++ version for each part.
+    string(REGEX MATCHALL "EIGEN_TEST_PART_[0-9]+" occurrences "${test_source}")
+    string(REGEX REPLACE "EIGEN_TEST_PART_" "" explicit_suffixes "${occurrences}")
+    list(REMOVE_DUPLICATES explicit_suffixes)
+  endif()
+  if( (EIGEN_SPLIT_LARGE_TESTS AND suffixes) OR explicit_suffixes)
+    add_custom_target(${testname})
+    foreach(suffix ${suffixes})
+      ei_add_test_internal(${testname} ${testname}_${suffix}
+        "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}")
+      add_dependencies(${testname} ${testname}_${suffix})
+    endforeach()
+  else()
+    ei_add_test_internal(${testname} ${testname} "${ARGV1} -DEIGEN_TEST_PART_ALL=1" "${ARGV2}")
+  endif()
+endmacro()
+
+# adds a failtest, i.e. a test that succeed if the program fails to compile
+# note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON
+# so here we're just running CMake commands immediately, we're not adding any targets.
+macro(ei_add_failtest testname)
+
+  set(test_target_ok ${testname}_ok)
+  set(test_target_ko ${testname}_ko)
+
+  # Add executables
+  add_executable(${test_target_ok} ${testname}.cpp)
+  add_executable(${test_target_ko} ${testname}.cpp)
+
+  # Remove them from the normal build process
+  set_target_properties(${test_target_ok} ${test_target_ko} PROPERTIES
+                        EXCLUDE_FROM_ALL TRUE
+                        EXCLUDE_FROM_DEFAULT_BUILD TRUE)
+
+  # Configure the failing test
+  target_compile_definitions(${test_target_ko} PRIVATE EIGEN_SHOULD_FAIL_TO_BUILD)
+
+  # Add the tests to ctest.
+  add_test(NAME ${test_target_ok}
+          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $<CONFIGURATION>
+          WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_test(NAME ${test_target_ko}
+          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ko} --config $<CONFIGURATION>
+          WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+
+  # Expect the second test to fail
+  set_tests_properties(${test_target_ko} PROPERTIES WILL_FAIL TRUE)
+endmacro()
+
+# print a summary of the different options
+macro(ei_testing_print_summary)
+  message(STATUS "************************************************************")
+  message(STATUS "***    Eigen's unit tests configuration summary          ***")
+  message(STATUS "************************************************************")
+  message(STATUS "")
+  message(STATUS "Build type:        ${CMAKE_BUILD_TYPE}")
+  message(STATUS "Build site:        ${SITE}")
+  message(STATUS "Build string:      ${BUILDNAME}")
+  get_property(EIGEN_TESTING_SUMMARY GLOBAL PROPERTY EIGEN_TESTING_SUMMARY)
+  get_property(EIGEN_TESTED_BACKENDS GLOBAL PROPERTY EIGEN_TESTED_BACKENDS)
+  get_property(EIGEN_MISSING_BACKENDS GLOBAL PROPERTY EIGEN_MISSING_BACKENDS)
+  message(STATUS "Enabled backends:  ${EIGEN_TESTED_BACKENDS}")
+  message(STATUS "Disabled backends: ${EIGEN_MISSING_BACKENDS}")
+
+  if(EIGEN_DEFAULT_TO_ROW_MAJOR)
+    message(STATUS "Default order:     Row-major")
+  else()
+    message(STATUS "Default order:     Column-major")
+  endif()
+
+  if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
+    message(STATUS "Explicit alignment (hence vectorization) disabled")
+  elseif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
+    message(STATUS "Explicit vectorization disabled (alignment kept enabled)")
+  else()
+
+  message(STATUS "Maximal matrix/vector size: ${EIGEN_TEST_MAX_SIZE}")
+
+    if(EIGEN_TEST_SSE2)
+      message(STATUS "SSE2:              ON")
+    else()
+      message(STATUS "SSE2:              Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_SSE3)
+      message(STATUS "SSE3:              ON")
+    else()
+      message(STATUS "SSE3:              Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_SSSE3)
+      message(STATUS "SSSE3:             ON")
+    else()
+      message(STATUS "SSSE3:             Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_SSE4_1)
+      message(STATUS "SSE4.1:            ON")
+    else()
+      message(STATUS "SSE4.1:            Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_SSE4_2)
+      message(STATUS "SSE4.2:            ON")
+    else()
+      message(STATUS "SSE4.2:            Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_AVX)
+      message(STATUS "AVX:               ON")
+    else()
+      message(STATUS "AVX:               Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_AVX2)
+      message(STATUS "AVX2:              ON")
+    else()
+      message(STATUS "AVX2:              Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_FMA)
+      message(STATUS "FMA:               ON")
+    else()
+      message(STATUS "FMA:               Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_AVX512)
+      message(STATUS "AVX512:            ON")
+    else()
+      message(STATUS "AVX512:            Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_AVX512DQ)
+      message(STATUS "AVX512DQ:          ON")
+    else()
+      message(STATUS "AVX512DQ:          Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_ALTIVEC)
+      message(STATUS "Altivec:           ON")
+    else()
+      message(STATUS "Altivec:           Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_VSX)
+      message(STATUS "VSX:               ON")
+    else()
+      message(STATUS "VSX:               Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_MSA)
+      message(STATUS "MIPS MSA:          ON")
+    else()
+      message(STATUS "MIPS MSA:          Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_NEON)
+      message(STATUS "ARM NEON:          ON")
+    else()
+      message(STATUS "ARM NEON:          Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_NEON64)
+      message(STATUS "ARMv8 NEON:        ON")
+    else()
+      message(STATUS "ARMv8 NEON:        Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_ZVECTOR)
+      message(STATUS "S390X ZVECTOR:     ON")
+    else()
+      message(STATUS "S390X ZVECTOR:     Using architecture defaults")
+    endif()
+
+    if(EIGEN_TEST_CXX11)
+      message(STATUS "C++11:             ON")
+    else()
+      message(STATUS "C++11:             OFF")
+    endif()
+
+    if(EIGEN_TEST_SYCL)
+      if(EIGEN_SYCL_TRISYCL)
+        message(STATUS "SYCL:              ON (using triSYCL)")
+      else()
+        message(STATUS "SYCL:              ON (using computeCPP)")
+      endif()
+    else()
+      message(STATUS "SYCL:              OFF")
+    endif()
+    if(EIGEN_TEST_CUDA)
+      if(EIGEN_TEST_CUDA_CLANG)
+        message(STATUS "CUDA:              ON (using clang)")
+      else()
+        message(STATUS "CUDA:              ON (using nvcc)")
+      endif()
+    else()
+      message(STATUS "CUDA:              OFF")
+    endif()
+    if(EIGEN_TEST_HIP)
+      message(STATUS "HIP:               ON (using hipcc)")
+    else()
+      message(STATUS "HIP:               OFF")
+    endif()
+
+  endif() # vectorization / alignment options
+
+  message(STATUS "\n${EIGEN_TESTING_SUMMARY}")
+
+  message(STATUS "************************************************************")
+endmacro()
+
+macro(ei_init_testing)
+  define_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_TESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
+
+  set_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS "")
+  set_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS "")
+  set_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY "")
+  set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "")
+  set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "")
+
+  define_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT BRIEF_DOCS " " FULL_DOCS " ")
+
+  set_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT "0")
+  set_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT "0")
+
+  # uncomment anytime you change the ei_get_compilerver_from_cxx_version_string macro
+  # ei_test_get_compilerver_from_cxx_version_string()
+endmacro()
+
+macro(ei_set_sitename)
+  # if the sitename is not yet set, try to set it
+  if(NOT ${SITE} OR ${SITE} STREQUAL "")
+    set(eigen_computername $ENV{COMPUTERNAME})
+    set(eigen_hostname $ENV{HOSTNAME})
+    if(eigen_hostname)
+      set(SITE ${eigen_hostname})
+    elseif(eigen_computername)
+      set(SITE ${eigen_computername})
+    endif()
+  endif()
+  # in case it is already set, enforce lower case
+  if(SITE)
+    string(TOLOWER ${SITE} SITE)
+  endif()
+endmacro()
+
+macro(ei_get_compilerver VAR)
+    if(MSVC)
+      # on windows system, we use a modified CMake script
+      include(EigenDetermineVSServicePack)
+      EigenDetermineVSServicePack( my_service_pack )
+
+      if( my_service_pack )
+        set(${VAR} ${my_service_pack})
+      else()
+        set(${VAR} "na")
+      endif()
+    elseif(${CMAKE_CXX_COMPILER_ID} MATCHES "PGI")
+      set(${VAR} "${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}")
+    else()
+    # on all other system we rely on ${CMAKE_CXX_COMPILER}
+    # supporting a "--version" or "/version" flag
+
+    if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} EQUAL "Intel")
+      set(EIGEN_CXX_FLAG_VERSION "/version")
+    else()
+      set(EIGEN_CXX_FLAG_VERSION "--version")
+    endif()
+
+    execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION}
+                    OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE)
+    string(REGEX REPLACE "^[ \n\r]+" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string})
+    string(REGEX REPLACE "[\n\r].*"  ""  eigen_cxx_compiler_version_string  ${eigen_cxx_compiler_version_string})
+
+    ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER)
+    set(${VAR} "${CNAME}-${CVER}")
+
+  endif()
+endmacro()
+
+# Extract compiler name and version from a raw version string
+# WARNING: if you edit this macro, then please test it by uncommenting
+# the testing macro call in ei_init_testing() of the EigenTesting.cmake file.
+# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end
+# of the file
+macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
+  # extract possible compiler names
+  string(REGEX MATCH "g\\+\\+"      ei_has_gpp    ${VERSTRING})
+  string(REGEX MATCH "llvm|LLVM"    ei_has_llvm   ${VERSTRING})
+  string(REGEX MATCH "gcc|GCC"      ei_has_gcc    ${VERSTRING})
+  string(REGEX MATCH "icpc|ICC"     ei_has_icpc   ${VERSTRING})
+  string(REGEX MATCH "clang|CLANG"  ei_has_clang  ${VERSTRING})
+  string(REGEX MATCH "mingw32"      ei_has_mingw  ${VERSTRING})
+
+  # combine them
+  if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc))
+    set(${CNAME} "llvm-g++")
+  elseif((ei_has_llvm) AND (ei_has_clang))
+    set(${CNAME} "llvm-clang++")
+  elseif(ei_has_clang)
+    set(${CNAME} "clang++")
+  elseif ((ei_has_mingw) AND (ei_has_gpp OR ei_has_gcc))
+    set(${CNAME} "mingw32-g++")
+  elseif(ei_has_icpc)
+    set(${CNAME} "icpc")
+  elseif(ei_has_gpp OR ei_has_gcc)
+    set(${CNAME} "g++")
+  else()
+    set(${CNAME} "_")
+  endif()
+
+  # extract possible version numbers
+  # first try to extract 3 isolated numbers:
+  string(REGEX MATCH " [0-9]+\\.[0-9]+\\.[0-9]+" eicver ${VERSTRING})
+  if(NOT eicver)
+    # try to extract 2 isolated ones:
+    string(REGEX MATCH " [0-9]+\\.[0-9]+" eicver ${VERSTRING})
+    if(NOT eicver)
+      # try to extract 3:
+      string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+\\.[0-9]+" eicver ${VERSTRING})
+      if(NOT eicver)
+        # try to extract 2:
+        string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+" eicver ${VERSTRING})
+        if (NOT eicver AND ei_has_mingw)
+          # try to extract 1 number plus suffix:
+          string(REGEX MATCH "[^0-9][0-9]+-win32" eicver ${VERSTRING})          
+        endif()
+      endif()
+    endif()
+  endif()
+  
+  if (NOT eicver)
+    set(eicver " _")
+  endif()
+
+  string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver})
+
+endmacro()
+
+macro(ei_get_cxxflags VAR)
+  set(${VAR} "")
+  ei_is_64bit_env(IS_64BIT_ENV)
+  if(EIGEN_TEST_NEON)
+    set(${VAR} NEON)
+  elseif(EIGEN_TEST_NEON64)
+    set(${VAR} NEON)
+  elseif(EIGEN_TEST_ZVECTOR)
+    set(${VAR} ZVECTOR)
+  elseif(EIGEN_TEST_VSX)
+    set(${VAR} VSX)
+  elseif(EIGEN_TEST_ALTIVEC)
+    set(${VAR} ALVEC)
+  elseif(EIGEN_TEST_FMA)
+    set(${VAR} FMA)
+  elseif(EIGEN_TEST_AVX)
+    set(${VAR} AVX)
+  elseif(EIGEN_TEST_SSE4_2)
+    set(${VAR} SSE42)
+  elseif(EIGEN_TEST_SSE4_1)
+    set(${VAR} SSE41)
+  elseif(EIGEN_TEST_SSSE3)
+    set(${VAR} SSSE3)
+  elseif(EIGEN_TEST_SSE3)
+    set(${VAR} SSE3)
+  elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV)
+    set(${VAR} SSE2)
+  elseif(EIGEN_TEST_MSA)
+    set(${VAR} MSA)
+  endif()
+
+  if(EIGEN_TEST_OPENMP)
+    if (${VAR} STREQUAL "")
+      set(${VAR} OMP)
+    else()
+      set(${VAR} ${${VAR}}-OMP)
+    endif()
+  endif()
+
+  if(EIGEN_DEFAULT_TO_ROW_MAJOR)
+    if (${VAR} STREQUAL "")
+      set(${VAR} ROW)
+    else()
+      set(${VAR} ${${VAR}}-ROWMAJ)
+    endif()
+  endif()
+endmacro()
+
+macro(ei_set_build_string)
+  ei_get_compilerver(LOCAL_COMPILER_VERSION)
+  ei_get_cxxflags(LOCAL_COMPILER_FLAGS)
+
+  include(EigenDetermineOSVersion)
+  DetermineOSVersion(OS_VERSION)
+
+  set(TMP_BUILD_STRING ${OS_VERSION}-${LOCAL_COMPILER_VERSION})
+
+  if (NOT ${LOCAL_COMPILER_FLAGS} STREQUAL  "")
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS})
+  endif()
+
+  if(EIGEN_TEST_EXTERNAL_BLAS)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas)
+  endif()
+
+  ei_is_64bit_env(IS_64BIT_ENV)
+  if(NOT IS_64BIT_ENV)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit)
+  else()
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit)
+  endif()
+
+  if(EIGEN_TEST_CXX11)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11)
+  endif()
+
+  if(EIGEN_BUILD_STRING_SUFFIX)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX})
+  endif()
+
+  string(TOLOWER ${TMP_BUILD_STRING} BUILDNAME)
+endmacro()
+
+macro(ei_is_64bit_env VAR)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    set(${VAR} 1)
+  elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    set(${VAR} 0)
+  else()
+    message(WARNING "Unsupported pointer size. Please contact the authors.")
+  endif()
+endmacro()
+
+
+# helper macro for testing ei_get_compilerver_from_cxx_version_string
+# STR: raw version string
+# REFNAME: expected compiler name
+# REFVER: expected compiler version
+macro(ei_test1_get_compilerver_from_cxx_version_string STR REFNAME REFVER)
+  ei_get_compilerver_from_cxx_version_string(${STR} CNAME CVER)
+  if((NOT ${REFNAME} STREQUAL ${CNAME}) OR (NOT ${REFVER} STREQUAL ${CVER}))
+    message("STATUS ei_get_compilerver_from_cxx_version_string error:")
+    message("Expected \"${REFNAME}-${REFVER}\", got \"${CNAME}-${CVER}\"")
+  endif()
+endmacro()
+
+# macro for testing ei_get_compilerver_from_cxx_version_string
+# feel free to add more version strings
+macro(ei_test_get_compilerver_from_cxx_version_string)
+  ei_test1_get_compilerver_from_cxx_version_string("g++ (SUSE Linux) 4.5.3 20110428 [gcc-4_5-branch revision 173117]" "g++" "4.5.3")
+  ei_test1_get_compilerver_from_cxx_version_string("c++ (GCC) 4.5.1 20100924 (Red Hat 4.5.1-4)" "g++" "4.5.1")
+  ei_test1_get_compilerver_from_cxx_version_string("icpc (ICC) 11.0 20081105" "icpc" "11.0")
+  ei_test1_get_compilerver_from_cxx_version_string("g++-3.4 (GCC) 3.4.6" "g++" "3.4.6")
+  ei_test1_get_compilerver_from_cxx_version_string("SUSE Linux clang version 3.0 (branches/release_30 145598) (based on LLVM 3.0)" "llvm-clang++" "3.0")
+  ei_test1_get_compilerver_from_cxx_version_string("icpc (ICC) 12.0.5 20110719" "icpc" "12.0.5")
+  ei_test1_get_compilerver_from_cxx_version_string("Apple clang version 2.1 (tags/Apple/clang-163.7.1) (based on LLVM 3.0svn)" "llvm-clang++" "2.1")
+  ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1")
+  ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6")
+  ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4")
+  ei_test1_get_compilerver_from_cxx_version_string("x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110" "mingw32-g++" "10-win32")
+endmacro()
+
+# Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets
+# named buildtestspartN with N = { 0, ..., num_splits-1}.
+#
+# The intention behind the existance of this macro is the size of Eigen's
+# testsuite. Together with the relativly big compile-times building all tests
+# can take a substantial amount of time depending on the available hardware.
+# 
+# The last buildtestspartN target will build possible remaining tests.
+#
+# An example:
+#
+#   EIGEN_TESTS_LIST= [ test1, test2, test3, test4, test5, test6, test7 ]
+#
+# A call to ei_split_testsuite(3) creates the following targets with dependencies
+#
+#   Target                      Dependencies
+#   ------                      ------------
+#   buildtestspart0             test1, test2
+#   buildtestspart1             test3, test4
+#   buildtestspart2             test5, test6, test7
+#
+macro(ei_split_testsuite num_splits)
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+
+  # Translate EIGEN_TESTS_LIST into a CMake list
+  string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  separate_arguments(EIGEN_TESTS_LIST)
+
+  set(eigen_test_count "0")
+  foreach(t IN ITEMS ${EIGEN_TESTS_LIST})
+    math(EXPR eigen_test_count "${eigen_test_count}+1")
+  endforeach()
+
+  # Get number of tests per target
+  math(EXPR num_tests_per_target "${eigen_test_count}/${num_splits} - ${eigen_test_count}/${num_splits} % 1")
+
+  set(test_idx "0")
+  math(EXPR target_bound "${num_splits}-1")
+  foreach(part RANGE "0" "${target_bound}")
+    # Create target
+    set(current_target "buildtestspart${part}")
+    add_custom_target("${current_target}")
+    math(EXPR upper_bound "${test_idx} + ${num_tests_per_target} - 1")
+    foreach(test_idx RANGE "${test_idx}" "${upper_bound}")
+      list(GET EIGEN_TESTS_LIST "${test_idx}" curr_test)
+      add_dependencies("${current_target}" "${curr_test}")
+    endforeach()
+    math(EXPR test_idx "${test_idx} + ${num_tests_per_target}")
+  endforeach()
+  
+  # Handle the possibly remaining tests
+  math(EXPR test_idx "${num_splits} * ${num_tests_per_target}")
+  math(EXPR target_bound "${eigen_test_count} - 1")
+  foreach(test_idx RANGE "${test_idx}" "${target_bound}")
+    list(GET EIGEN_TESTS_LIST "${test_idx}" curr_test)
+    add_dependencies("${current_target}" "${curr_test}")
+  endforeach()
+endmacro(ei_split_testsuite num_splits)
+
+# Defines the custom command buildsmoketests to build a number of tests
+# specified in smoke_test_list.
+# 
+# Test in smoke_test_list can be either test targets (e.g. packetmath) or
+# subtests targets (e.g. packetmath_2). If any of the test are not available
+# in the current configuration they are just skipped. 
+#
+# All tests added via this macro are labeled with the smoketest label. This
+# allows running smoketests only using ctest.
+#
+# Smoke tests are intended to be run before the whole test suite is invoked,
+# e.g., to smoke test patches.
+macro(ei_add_smoke_tests smoke_test_list)
+  # Set the build target to build smoketests
+  set(buildtarget "buildsmoketests")
+  add_custom_target("${buildtarget}")
+
+  # Get list of all tests and translate it into a CMake list
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+  string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  separate_arguments(EIGEN_TESTS_LIST)
+
+  # Check if the test in smoke_test_list is a currently valid test target
+  foreach(test IN ITEMS ${smoke_test_list})
+    # Add tests in smoke_test_list to our smoke test target but only if the test
+    # is currently available, i.e., is in EIGEN_SUBTESTS_LIST
+    if ("${test}" IN_LIST EIGEN_TESTS_LIST)
+      add_dependencies("${buildtarget}" "${test}")
+      # In the case of a test we match all subtests
+      set(ctest_regex "${ctest_regex}^${test}_[0-9]+$$|")
+    endif()
+  endforeach()
+
+  # Get list of all subtests and translate it into a CMake list
+  get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
+  string(REGEX REPLACE "\n" " " EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+  set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+  separate_arguments(EIGEN_SUBTESTS_LIST)
+
+  # Check if the test in smoke_test_list is a currently valid subtest target
+  foreach(test IN ITEMS ${smoke_test_list})
+    # Add tests in smoke_test_list to our smoke test target but only if the test
+    # is currently available, i.e., is in EIGEN_SUBTESTS_LIST
+    if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST)
+      add_dependencies("${buildtarget}" "${test}")
+      # Add label smoketest to be able to run smoketests using ctest
+      get_property(test_labels TEST ${test} PROPERTY LABELS)
+      set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest")
+    endif()
+  endforeach()
+endmacro(ei_add_smoke_tests)

diff --git a/cmake/EigenUninstall.cmake b/cmake/EigenUninstall.cmake
new file mode 100644
index 0000000..5e63c98
--- /dev/null
+++ b/cmake/EigenUninstall.cmake

@@ -0,0 +1,40 @@
+################ CMake Uninstall Template #######################
+# CMake Template file for uninstallation of files
+# mentioned in 'install_manifest.txt'
+#
+# Used by uinstall target
+#################################################################
+
+set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+
+if(EXISTS ${MANIFEST})
+  message(STATUS "============== Uninstalling Eigen  ===================")
+
+  file(STRINGS ${MANIFEST} files)
+  foreach(file ${files})
+    if(EXISTS ${file})
+      message(STATUS "Removing file: '${file}'")
+
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E remove ${file}
+        OUTPUT_VARIABLE rm_out
+        RESULT_VARIABLE rm_retval
+        )
+
+      if(NOT "${rm_retval}" STREQUAL 0)
+        message(FATAL_ERROR "Failed to remove file: '${file}'.")
+      endif()
+    else()
+      message(STATUS "File '${file}' does not exist.")
+    endif()
+  endforeach()
+
+  message(STATUS "========== Finished Uninstalling Eigen  ==============")
+else()
+  message(STATUS "Cannot find install manifest: '${MANIFEST}'")
+  message(STATUS "Probably make install has not been performed")
+  message(STATUS "   or install_manifest.txt has been deleted.")
+endif()
+
+
+

diff --git a/cmake/FindAdolc.cmake b/cmake/FindAdolc.cmake
new file mode 100644
index 0000000..13c59fc
--- /dev/null
+++ b/cmake/FindAdolc.cmake

@@ -0,0 +1,20 @@
+
+if (ADOLC_INCLUDES AND ADOLC_LIBRARIES)
+  set(ADOLC_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(ADOLC_INCLUDES
+  NAMES adolc/adtl.h
+  PATHS $ENV{ADOLCDIR} $ENV{ADOLCDIR}/include ${INCLUDE_INSTALL_DIR}
+)
+
+find_library(ADOLC_LIBRARIES 
+  adolc 
+  PATHS $ENV{ADOLCDIR} ${LIB_INSTALL_DIR} 
+  PATH_SUFFIXES lib lib64)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Adolc DEFAULT_MSG
+                                  ADOLC_INCLUDES ADOLC_LIBRARIES)
+
+mark_as_advanced(ADOLC_INCLUDES ADOLC_LIBRARIES)

diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake
new file mode 100644
index 0000000..1bb8f19
--- /dev/null
+++ b/cmake/FindBLAS.cmake

@@ -0,0 +1,1407 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2016 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find BLAS library
+# This module finds an installed fortran library that implements the BLAS
+# linear-algebra interface (see http://www.netlib.org/blas/).
+# The list of libraries searched for is taken
+# from the autoconf macro file, acx_blas.m4 (distributed at
+# http://ac-archive.sourceforge.net/ac-archive/acx_blas.html).
+#
+# This module sets the following variables:
+#  BLAS_FOUND - set to true if a library implementing the BLAS interface
+#    is found
+#  BLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l
+#    and -L).
+#  BLAS_COMPILER_FLAGS - uncached list of required compiler flags (including -I for mkl headers).
+#  BLAS_LIBRARIES - uncached list of libraries (using full path name) to
+#    link against to use BLAS
+#  BLAS95_LIBRARIES - uncached list of libraries (using full path name)
+#    to link against to use BLAS95 interface
+#  BLAS95_FOUND - set to true if a library implementing the BLAS f95 interface
+#    is found
+#  BLA_STATIC  if set on this determines what kind of linkage we do (static)
+#  BLA_VENDOR  if set checks only the specified vendor, if not set checks
+#     all the possibilities
+#  BLAS_VENDOR_FOUND stores the BLAS vendor found 
+#  BLA_F95     if set on tries to find the f95 interfaces for BLAS/LAPACK
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DBLAS_DIR=path/to/blas):
+#  BLAS_DIR            - Where to find the base directory of blas
+#  BLAS_INCDIR         - Where to find the header files
+#  BLAS_LIBDIR         - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: BLAS_DIR, BLAS_INCDIR, BLAS_LIBDIR
+# For MKL case and if no paths are given as hints, we will try to use the MKLROOT
+# environment variable
+#  BLAS_VERBOSE Print some additional information during BLAS libraries detection
+##########
+### List of vendors (BLA_VENDOR) valid in this module
+########## List of vendors (BLA_VENDOR) valid in this module
+##  Open (for OpenBlas), Eigen (for EigenBlas), Goto, ATLAS PhiPACK,
+##  CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT
+##  Intel10_32 (intel mkl v10 32 bit), Intel10_64lp (intel mkl v10 64 bit,lp thread model, lp64 model),
+##  Intel10_64lp_seq (intel mkl v10 64 bit,sequential code, lp64 model),
+##  Intel( older versions of mkl 32 and 64 bit),
+##  ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic
+# C/CXX should be enabled to use Intel mkl
+###
+# We handle different modes to find the dependency
+#
+# - Detection if already installed on the system
+#   - BLAS libraries can be detected from different ways
+#     Here is the order of precedence:
+#     1) we look in cmake variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined
+#     2) we look in environment variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined
+#     3) we look in common environnment variables depending on the system (INCLUDE, C_INCLUDE_PATH, CPATH - LIB, DYLD_LIBRARY_PATH, LD_LIBRARY_PATH)
+#     4) we look in common system paths depending on the system, see for example paths contained in the following cmake variables:
+#       - CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES
+#       - CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_C_IMPLICIT_LINK_DIRECTORIES
+#
+
+#=============================================================================
+# Copyright 2007-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+## Some macros to print status when search for headers and libs
+# This macro informs why the _lib_to_find file has not been found
+macro(Print_Find_Library_Blas_Status _libname _lib_to_find)
+
+  # save _libname upper/lower case
+  string(TOUPPER ${_libname} LIBNAME)
+  string(TOLOWER ${_libname} libname)
+
+  # print status
+  #message(" ")
+  if(${LIBNAME}_LIBDIR)
+    message("${Yellow}${LIBNAME}_LIBDIR is defined but ${_lib_to_find}"
+      "has not been found in ${ARGN}${ColourReset}")
+  else()
+    if(${LIBNAME}_DIR)
+      message("${Yellow}${LIBNAME}_DIR is defined but ${_lib_to_find}"
+	"has not been found in ${ARGN}${ColourReset}")
+    else()
+      message("${Yellow}${_lib_to_find} not found."
+	"Nor ${LIBNAME}_DIR neither ${LIBNAME}_LIBDIR"
+	"are defined so that we look for ${_lib_to_find} in"
+	"system paths (Linux: LD_LIBRARY_PATH, Windows: LIB,"
+	"Mac: DYLD_LIBRARY_PATH,"
+	"CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES,"
+	"CMAKE_C_IMPLICIT_LINK_DIRECTORIES)${ColourReset}")
+      if(_lib_env)
+	message("${Yellow}${_lib_to_find} has not been found in"
+	  "${_lib_env}${ColourReset}")
+      endif()
+    endif()
+  endif()
+  message("${BoldYellow}Please indicate where to find ${_lib_to_find}. You have three options:\n"
+    "- Option 1: Provide the Installation directory of BLAS library with cmake option: -D${LIBNAME}_DIR=your/path/to/${libname}/\n"
+    "- Option 2: Provide the directory where to find the library with cmake option: -D${LIBNAME}_LIBDIR=your/path/to/${libname}/lib/\n"
+    "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n"
+    "- Option 4: If your library provides a PkgConfig file, make sure pkg-config finds your library${ColourReset}")
+
+endmacro()
+
+# This macro informs why the _lib_to_find file has not been found
+macro(Print_Find_Library_Blas_CheckFunc_Status _name)
+
+  # save _libname upper/lower case
+  string(TOUPPER ${_name} FUNCNAME)
+  string(TOLOWER ${_name} funcname)
+
+  # print status
+  #message(" ")
+  message("${Red}Libs have been found but check of symbol ${_name} failed "
+    "with following libraries ${ARGN}${ColourReset}")
+  message("${BoldRed}Please open your error file CMakeFiles/CMakeError.log"
+    "to figure out why it fails${ColourReset}")
+  #message(" ")
+
+endmacro()
+
+if (NOT BLAS_FOUND)
+  set(BLAS_DIR "" CACHE PATH "Installation directory of BLAS library")
+  if (NOT BLAS_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely BLAS_DIR, has been set to specify the install directory of BLAS")
+  endif()
+endif()
+
+option(BLAS_VERBOSE "Print some additional information during BLAS libraries detection" OFF)
+mark_as_advanced(BLAS_VERBOSE)
+
+include(CheckFunctionExists)
+include(CheckFortranFunctionExists)
+include(CMakeFindDependencyMacro)
+
+set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+
+# Check the language being used
+get_property( _LANGUAGES_ GLOBAL PROPERTY ENABLED_LANGUAGES )
+if( _LANGUAGES_ MATCHES Fortran AND CMAKE_Fortran_COMPILER)
+  set( _CHECK_FORTRAN TRUE )
+elseif( (_LANGUAGES_ MATCHES C) OR (_LANGUAGES_ MATCHES CXX) )
+  set( _CHECK_FORTRAN FALSE )
+else()
+  if(BLAS_FIND_REQUIRED)
+    message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.")
+  else()
+    message(STATUS "Looking for BLAS... - NOT found (Unsupported languages)")
+    return()
+  endif()
+endif()
+
+macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread)
+  # This macro checks for the existence of the combination of fortran libraries
+  # given by _list.  If the combination is found, this macro checks (using the
+  # Check_Fortran_Function_Exists macro) whether can link against that library
+  # combination using the name of a routine given by _name using the linker
+  # flags given by _flags.  If the combination of libraries is found and passes
+  # the link test, LIBRARIES is set to the list of complete library paths that
+  # have been found.  Otherwise, LIBRARIES is set to FALSE.
+
+  # N.B. _prefix is the prefix applied to the names of all cached variables that
+  # are generated internally and marked advanced by this macro.
+
+  set(_libdir ${ARGN})
+
+  set(_libraries_work TRUE)
+  set(${LIBRARIES})
+  set(_combined_name)
+  set(ENV_MKLROOT "$ENV{MKLROOT}")
+  set(ENV_BLAS_DIR "$ENV{BLAS_DIR}")
+  set(ENV_BLAS_LIBDIR "$ENV{BLAS_LIBDIR}")
+  if (NOT _libdir)
+    if (BLAS_LIBDIR)
+      list(APPEND _libdir "${BLAS_LIBDIR}")
+    elseif (BLAS_DIR)
+      list(APPEND _libdir "${BLAS_DIR}")
+      list(APPEND _libdir "${BLAS_DIR}/lib")
+      if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+	list(APPEND _libdir "${BLAS_DIR}/lib64")
+	list(APPEND _libdir "${BLAS_DIR}/lib/intel64")
+      else()
+	list(APPEND _libdir "${BLAS_DIR}/lib32")
+	list(APPEND _libdir "${BLAS_DIR}/lib/ia32")
+      endif()
+    elseif(ENV_BLAS_LIBDIR)
+      list(APPEND _libdir "${ENV_BLAS_LIBDIR}")
+    elseif(ENV_BLAS_DIR)
+      list(APPEND _libdir "${ENV_BLAS_DIR}")
+      list(APPEND _libdir "${ENV_BLAS_DIR}/lib")
+      if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+	list(APPEND _libdir "${ENV_BLAS_DIR}/lib64")
+	list(APPEND _libdir "${ENV_BLAS_DIR}/lib/intel64")
+      else()
+	list(APPEND _libdir "${ENV_BLAS_DIR}/lib32")
+	list(APPEND _libdir "${ENV_BLAS_DIR}/lib/ia32")
+      endif()
+    else()
+      if (ENV_MKLROOT)
+	list(APPEND _libdir "${ENV_MKLROOT}/lib")
+	if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+	  list(APPEND _libdir "${ENV_MKLROOT}/lib64")
+	  list(APPEND _libdir "${ENV_MKLROOT}/lib/intel64")
+	else()
+	  list(APPEND _libdir "${ENV_MKLROOT}/lib32")
+	  list(APPEND _libdir "${ENV_MKLROOT}/lib/ia32")
+	endif()
+      endif()
+      if (WIN32)
+	string(REPLACE ":" ";" _libdir2 "$ENV{LIB}")
+      elseif (APPLE)
+	string(REPLACE ":" ";" _libdir2 "$ENV{DYLD_LIBRARY_PATH}")
+      else ()
+	string(REPLACE ":" ";" _libdir2 "$ENV{LD_LIBRARY_PATH}")
+      endif ()
+      list(APPEND _libdir "${_libdir2}")
+      list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+      list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+    endif()
+  endif ()
+
+  if (BLAS_VERBOSE)
+    message("${Cyan}Try to find BLAS libraries: ${_list}")
+  endif ()
+
+  foreach(_library ${_list})
+    set(_combined_name ${_combined_name}_${_library})
+
+    if(_libraries_work)
+      if (BLA_STATIC)
+	if (WIN32)
+	  set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES})
+	endif ()
+	if (APPLE)
+	  set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES})
+	else ()
+	  set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+	endif ()
+      else ()
+	if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+	  # for ubuntu's libblas3gf and liblapack3gf packages
+	  set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf)
+	endif ()
+      endif ()
+      find_library(${_prefix}_${_library}_LIBRARY
+	NAMES ${_library}
+	HINTS ${_libdir}
+	NO_DEFAULT_PATH
+	)
+      mark_as_advanced(${_prefix}_${_library}_LIBRARY)
+      # Print status if not found
+      # -------------------------
+      if (NOT ${_prefix}_${_library}_LIBRARY AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE)
+	Print_Find_Library_Blas_Status(blas ${_library} ${_libdir})
+      endif ()
+      set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
+      set(_libraries_work ${${_prefix}_${_library}_LIBRARY})
+    endif()
+  endforeach()
+
+  if(_libraries_work)
+    # Test this combination of libraries.
+    if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC)
+      list(INSERT ${LIBRARIES} 0 "-Wl,--start-group")
+      list(APPEND ${LIBRARIES} "-Wl,--end-group")
+    endif()
+    set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}")
+    set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}")
+    if (BLAS_VERBOSE)
+      message("${Cyan}BLAS libs found for BLA_VENDOR ${BLA_VENDOR}."
+	"Try to compile symbol ${_name} with following libraries:"
+	"${CMAKE_REQUIRED_LIBRARIES}")
+    endif ()
+    if(NOT BLAS_FOUND)
+      unset(${_prefix}${_combined_name}_WORKS CACHE)
+    endif()
+    if (_CHECK_FORTRAN)
+      if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+	string(REPLACE "mkl_intel_lp64" "mkl_gf_lp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+	string(REPLACE "mkl_intel_ilp64" "mkl_gf_ilp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+      endif()
+      check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS)
+    else()
+      check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS)
+    endif()
+    mark_as_advanced(${_prefix}${_combined_name}_WORKS)
+    set(_libraries_work ${${_prefix}${_combined_name}_WORKS})
+    # Print status if not found
+    # -------------------------
+    if (NOT _libraries_work AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE)
+      Print_Find_Library_Blas_CheckFunc_Status(${_name} ${CMAKE_REQUIRED_LIBRARIES})
+    endif ()
+    set(CMAKE_REQUIRED_LIBRARIES)
+  endif()
+
+  if(_libraries_work)
+    set(${LIBRARIES} ${${LIBRARIES}} ${_thread})
+  else()
+    set(${LIBRARIES} FALSE)
+  endif()
+
+endmacro()
+
+
+set(BLAS_LINKER_FLAGS)
+set(BLAS_LIBRARIES)
+set(BLAS95_LIBRARIES)
+if ($ENV{BLA_VENDOR} MATCHES ".+")
+  set(BLA_VENDOR $ENV{BLA_VENDOR})
+else ()
+  if(NOT BLA_VENDOR)
+    set(BLA_VENDOR "All")
+  endif()
+endif ()
+
+#BLAS in intel mkl 10 library? (em64t 64bit)
+if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*")
+    # Looking for include
+    # -------------------
+
+    # Add system include paths to search include
+    # ------------------------------------------
+    unset(_inc_env)
+    set(ENV_MKLROOT "$ENV{MKLROOT}")
+    set(ENV_BLAS_DIR "$ENV{BLAS_DIR}")
+    set(ENV_BLAS_INCDIR "$ENV{BLAS_INCDIR}")
+    if(ENV_BLAS_INCDIR)
+      list(APPEND _inc_env "${ENV_BLAS_INCDIR}")
+    elseif(ENV_BLAS_DIR)
+      list(APPEND _inc_env "${ENV_BLAS_DIR}")
+      list(APPEND _inc_env "${ENV_BLAS_DIR}/include")
+    else()
+      if (ENV_MKLROOT)
+	list(APPEND _inc_env "${ENV_MKLROOT}/include")
+      endif()
+      # system variables
+      if(WIN32)
+	string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+	list(APPEND _inc_env "${_path_env}")
+      else()
+	string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+	list(APPEND _inc_env "${_path_env}")
+	string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+	list(APPEND _inc_env "${_path_env}")
+	string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+	list(APPEND _inc_env "${_path_env}")
+	string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+	list(APPEND _inc_env "${_path_env}")
+      endif()
+    endif()
+    list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+    list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+    list(REMOVE_DUPLICATES _inc_env)
+
+    # set paths where to look for
+    set(PATH_TO_LOOK_FOR "${_inc_env}")
+
+    # Try to find the fftw header in the given paths
+    # -------------------------------------------------
+    # call cmake macro to find the header path
+    if(BLAS_INCDIR)
+      set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND")
+      find_path(BLAS_mkl.h_DIRS
+	NAMES mkl.h
+	HINTS ${BLAS_INCDIR})
+    else()
+      if(BLAS_DIR)
+	set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND")
+	find_path(BLAS_mkl.h_DIRS
+	  NAMES mkl.h
+	  HINTS ${BLAS_DIR}
+	  PATH_SUFFIXES "include")
+      else()
+	set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND")
+	find_path(BLAS_mkl.h_DIRS
+	  NAMES mkl.h
+	  HINTS ${PATH_TO_LOOK_FOR})
+      endif()
+    endif()
+    mark_as_advanced(BLAS_mkl.h_DIRS)
+
+    # If found, add path to cmake variable
+    # ------------------------------------
+    if (BLAS_mkl.h_DIRS)
+      set(BLAS_INCLUDE_DIRS "${BLAS_mkl.h_DIRS}")
+    else ()
+      set(BLAS_INCLUDE_DIRS "BLAS_INCLUDE_DIRS-NOTFOUND")
+      if(NOT BLAS_FIND_QUIETLY)
+	message(STATUS "Looking for BLAS -- mkl.h not found")
+      endif()
+    endif()
+
+    if (WIN32)
+      string(REPLACE ":" ";" _libdir "$ENV{LIB}")
+    elseif (APPLE)
+      string(REPLACE ":" ";" _libdir "$ENV{DYLD_LIBRARY_PATH}")
+    else ()
+      string(REPLACE ":" ";" _libdir "$ENV{LD_LIBRARY_PATH}")
+    endif ()
+    list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+    list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+    # libiomp5
+    # --------
+    set(OMP_iomp5_LIBRARY "OMP_iomp5_LIBRARY-NOTFOUND")
+    find_library(OMP_iomp5_LIBRARY
+      NAMES iomp5
+      HINTS ${_libdir}
+      )
+    mark_as_advanced(OMP_iomp5_LIBRARY)
+    set(OMP_LIB "")
+    # libgomp
+    # -------
+    set(OMP_gomp_LIBRARY "OMP_gomp_LIBRARY-NOTFOUND")
+    find_library(OMP_gomp_LIBRARY
+      NAMES gomp
+      HINTS ${_libdir}
+      )
+    mark_as_advanced(OMP_gomp_LIBRARY)
+    # choose one or another depending on the compilo
+    if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+      if (OMP_gomp_LIBRARY)
+	set(OMP_LIB "${OMP_gomp_LIBRARY}")
+      endif()
+    else()
+      if (OMP_iomp5_LIBRARY)
+	set(OMP_LIB "${OMP_iomp5_LIBRARY}")
+      endif()
+    endif()
+
+    if (UNIX AND NOT WIN32)
+      # m
+      find_library(M_LIBRARY
+	NAMES m
+	HINTS ${_libdir})
+      mark_as_advanced(M_LIBRARY)
+      if(M_LIBRARY)
+	set(LM "-lm")
+      else()
+	set(LM "")
+      endif()
+      # Fortran
+      set(LGFORTRAN "")
+      if (CMAKE_C_COMPILER_ID MATCHES "GNU")
+	find_library(
+	  FORTRAN_gfortran_LIBRARY
+	  NAMES gfortran
+	  HINTS ${_libdir}
+	  )
+	mark_as_advanced(FORTRAN_gfortran_LIBRARY)
+	if (FORTRAN_gfortran_LIBRARY)
+	  set(LGFORTRAN "${FORTRAN_gfortran_LIBRARY}")
+	endif()
+      elseif (CMAKE_C_COMPILER_ID MATCHES "Intel")
+	find_library(
+	  FORTRAN_ifcore_LIBRARY
+	  NAMES ifcore
+	  HINTS ${_libdir}
+	  )
+	mark_as_advanced(FORTRAN_ifcore_LIBRARY)
+	if (FORTRAN_ifcore_LIBRARY)
+	  set(LGFORTRAN "{FORTRAN_ifcore_LIBRARY}")
+	endif()
+      endif()
+      set(BLAS_COMPILER_FLAGS "")
+      if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq")
+	if (CMAKE_C_COMPILER_ID STREQUAL "Intel")
+	  list(APPEND BLAS_COMPILER_FLAGS "-openmp")
+	endif()
+	if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+	  list(APPEND BLAS_COMPILER_FLAGS "-fopenmp")
+	endif()
+      endif()
+      if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+	if (BLA_VENDOR STREQUAL "Intel10_32")
+	  list(APPEND BLAS_COMPILER_FLAGS "-m32")
+	else()
+	  list(APPEND BLAS_COMPILER_FLAGS "-m64")
+	endif()
+	if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq")
+	  list(APPEND OMP_LIB "-ldl")
+	endif()
+	if (ENV_MKLROOT)
+	  list(APPEND BLAS_COMPILER_FLAGS "-I${ENV_MKLROOT}/include")
+	endif()
+      endif()
+
+      set(additional_flags "")
+      if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+	set(additional_flags "-Wl,--no-as-needed")
+      endif()
+    endif ()
+
+    if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX)
+      if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED)
+	find_dependency(Threads)
+      else()
+	find_dependency(Threads REQUIRED)
+      endif()
+
+      set(BLAS_SEARCH_LIBS "")
+
+      if(BLA_F95)
+
+	set(BLAS_mkl_SEARCH_SYMBOL SGEMM)
+	set(_LIBRARIES BLAS95_LIBRARIES)
+	if (WIN32)
+	  if (BLA_STATIC)
+	    set(BLAS_mkl_DLL_SUFFIX "")
+	  else()
+	    set(BLAS_mkl_DLL_SUFFIX "_dll")
+	  endif()
+
+	  # Find the main file (32-bit or 64-bit)
+	  set(BLAS_SEARCH_LIBS_WIN_MAIN "")
+	  if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN
+	      "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}")
+	  endif()
+	  if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN
+	      "mkl_blas95_lp64${BLAS_mkl_DLL_SUFFIX} mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}")
+	  endif ()
+
+	  # Add threading/sequential libs
+	  set(BLAS_SEARCH_LIBS_WIN_THREAD "")
+	  if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD
+	      "mkl_sequential${BLAS_mkl_DLL_SUFFIX}")
+	  endif()
+	  if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All")
+	    # old version
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD
+	      "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}")
+	    # mkl >= 10.3
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD
+	      "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}")
+	  endif()
+
+	  # Cartesian product of the above
+	  foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN})
+	    foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD})
+	      list(APPEND BLAS_SEARCH_LIBS
+		"${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}")
+	    endforeach()
+	  endforeach()
+	else ()
+	  if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_blas95 mkl_intel mkl_intel_thread mkl_core guide")
+	  endif ()
+	  if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All")
+	    # old version
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_blas95 mkl_intel_lp64 mkl_intel_thread mkl_core guide")
+	    # mkl >= 10.3
+	    if (CMAKE_C_COMPILER_ID STREQUAL "Intel")
+	      list(APPEND BLAS_SEARCH_LIBS
+		"mkl_blas95_lp64 mkl_intel_lp64 mkl_intel_thread mkl_core")
+	    endif()
+	    if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+	      list(APPEND BLAS_SEARCH_LIBS
+		"mkl_blas95_lp64 mkl_intel_lp64 mkl_gnu_thread mkl_core")
+	    endif()
+	  endif ()
+	  if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_intel_lp64 mkl_sequential mkl_core")
+	    if (BLA_VENDOR STREQUAL "Intel10_64lp_seq")
+	      set(OMP_LIB "")
+	    endif()
+	  endif ()
+	endif ()
+
+      else ()
+
+	set(BLAS_mkl_SEARCH_SYMBOL sgemm)
+	set(_LIBRARIES BLAS_LIBRARIES)
+	if (WIN32)
+	  if (BLA_STATIC)
+	    set(BLAS_mkl_DLL_SUFFIX "")
+	  else()
+	    set(BLAS_mkl_DLL_SUFFIX "_dll")
+	  endif()
+
+	  # Find the main file (32-bit or 64-bit)
+	  set(BLAS_SEARCH_LIBS_WIN_MAIN "")
+	  if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN
+	      "mkl_intel_c${BLAS_mkl_DLL_SUFFIX}")
+	  endif()
+	  if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN
+	      "mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}")
+	  endif ()
+
+	  # Add threading/sequential libs
+	  set(BLAS_SEARCH_LIBS_WIN_THREAD "")
+	  if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All")
+	    # old version
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD
+	      "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}")
+	    # mkl >= 10.3
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD
+	      "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}")
+	  endif()
+	  if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD
+	      "mkl_sequential${BLAS_mkl_DLL_SUFFIX}")
+	  endif()
+
+	  # Cartesian product of the above
+	  foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN})
+	    foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD})
+	      list(APPEND BLAS_SEARCH_LIBS
+		"${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}")
+	    endforeach()
+	  endforeach()
+	else ()
+	  if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_intel mkl_intel_thread mkl_core guide")
+	  endif ()
+	  if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All")
+	    # old version
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_intel_lp64 mkl_intel_thread mkl_core guide")
+	    # mkl >= 10.3
+	    if (CMAKE_C_COMPILER_ID STREQUAL "Intel")
+	      list(APPEND BLAS_SEARCH_LIBS
+		"mkl_intel_lp64 mkl_intel_thread mkl_core")
+	    endif()
+	    if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+	      list(APPEND BLAS_SEARCH_LIBS
+		"mkl_intel_lp64 mkl_gnu_thread mkl_core")
+	    endif()
+	  endif ()
+	  if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_intel_lp64 mkl_sequential mkl_core")
+	    if (BLA_VENDOR STREQUAL "Intel10_64lp_seq")
+	      set(OMP_LIB "")
+	    endif()
+	  endif ()
+	  #older vesions of intel mkl libs
+	  if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_ia32")
+	    list(APPEND BLAS_SEARCH_LIBS
+	      "mkl_em64t")
+	  endif ()
+	endif ()
+
+      endif ()
+
+      foreach (IT ${BLAS_SEARCH_LIBS})
+	string(REPLACE " " ";" SEARCH_LIBS ${IT})
+	if (${_LIBRARIES})
+	else ()
+	  check_fortran_libraries(
+	    ${_LIBRARIES}
+	    BLAS
+	    ${BLAS_mkl_SEARCH_SYMBOL}
+	    "${additional_flags}"
+	    "${SEARCH_LIBS}"
+	    "${OMP_LIB};${CMAKE_THREAD_LIBS_INIT};${LM}"
+	    )
+	  if(_LIBRARIES)
+	    set(BLAS_LINKER_FLAGS "${additional_flags}")
+	  endif()
+	endif()
+      endforeach ()
+      if(NOT BLAS_FIND_QUIETLY)
+        if(${_LIBRARIES})
+          message(STATUS "Looking for MKL BLAS: found")
+        else()
+          message(STATUS "Looking for MKL BLAS: not found")
+        endif()
+      endif()
+      if (${_LIBRARIES} AND NOT BLAS_VENDOR_FOUND)
+          set (BLAS_VENDOR_FOUND "Intel MKL")
+      endif()
+    endif ()
+  endif()
+endif ()
+
+
+if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "goto2"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for Goto BLAS: found")
+      else()
+	message(STATUS "Looking for Goto BLAS: not found")
+      endif()
+    endif()
+  endif()
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "Goto")
+  endif()
+
+endif ()
+
+
+# OpenBlas
+if (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    # openblas (http://www.openblas.net/)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "openblas"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for Open BLAS: found")
+      else()
+	message(STATUS "Looking for Open BLAS: not found")
+      endif()
+    endif()
+  endif()
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "Openblas")
+  endif()
+
+endif ()
+
+
+# EigenBlas
+if (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "eigen_blas"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+	message(STATUS "Looking for Eigen BLAS: found")
+      else()
+	message(STATUS "Looking for Eigen BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if(NOT BLAS_LIBRARIES)
+    # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "eigen_blas_static"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for Eigen BLAS: found")
+      else()
+	message(STATUS "Looking for Eigen BLAS: not found")
+      endif()
+    endif()
+  endif()
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "Eigen")
+  endif()
+
+endif ()
+
+
+if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      dgemm
+      ""
+      "f77blas;atlas"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for Atlas BLAS: found")
+      else()
+	message(STATUS "Looking for Atlas BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "Atlas")
+  endif()
+
+endif ()
+
+
+# BLAS in PhiPACK libraries? (requires generic BLAS lib, too)
+if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "sgemm;dgemm;blas"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for PhiPACK BLAS: found")
+      else()
+	message(STATUS "Looking for PhiPACK BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "PhiPACK")
+  endif()
+
+endif ()
+
+
+# BLAS in Alpha CXML library?
+if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "cxml"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for CXML BLAS: found")
+      else()
+	message(STATUS "Looking for CXML BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "CXML")
+  endif()
+
+endif ()
+
+
+# BLAS in Alpha DXML library? (now called CXML, see above)
+if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "dxml"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for DXML BLAS: found")
+      else()
+	message(STATUS "Looking for DXML BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "DXML")
+  endif()
+  
+endif ()
+
+
+# BLAS in Sun Performance library?
+if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      "-xlic_lib=sunperf"
+      "sunperf;sunmath"
+      ""
+      )
+    if(BLAS_LIBRARIES)
+      set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf")
+    endif()
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for SunPerf BLAS: found")
+      else()
+	message(STATUS "Looking for SunPerf BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "SunPerf")
+  endif()
+
+endif ()
+
+
+# BLAS in SCSL library?  (SGI/Cray Scientific Library)
+if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "scsl"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for SCSL BLAS: found")
+      else()
+	message(STATUS "Looking for SCSL BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "SunPerf")
+  endif()
+
+endif ()
+
+
+# BLAS in SGIMATH library?
+if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "complib.sgimath"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for SGIMATH BLAS: found")
+      else()
+	message(STATUS "Looking for SGIMATH BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "SGIMATH")
+  endif()
+
+endif ()
+
+
+# BLAS in IBM ESSL library (requires generic BLAS lib, too)
+if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "essl;xlfmath;xlf90_r;blas"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for IBM ESSL BLAS: found")
+      else()
+	message(STATUS "Looking for IBM ESSL BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "IBM ESSL")
+  endif()
+
+endif ()
+
+# BLAS in IBM ESSL_MT library (requires generic BLAS lib, too)
+if (BLA_VENDOR STREQUAL "IBMESSLMT" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "esslsmp;xlsmp;xlfmath;xlf90_r;blas"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for IBM ESSL MT BLAS: found")
+      else()
+	message(STATUS "Looking for IBM ESSL MT BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "IBM ESSL MT")
+  endif()
+
+endif ()
+
+
+#BLAS in acml library?
+if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All")
+
+  if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR
+      ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR
+      ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS)))
+
+    # try to find acml in "standard" paths
+    if( WIN32 )
+      file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" )
+    else()
+      file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" )
+    endif()
+    if( WIN32 )
+      file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" )
+    else()
+      file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" )
+    endif()
+    list(GET _ACML_ROOT 0 _ACML_ROOT)
+    list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT)
+
+    if( _ACML_ROOT )
+
+      get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH )
+      if( SIZEOF_INTEGER EQUAL 8 )
+	set( _ACML_PATH_SUFFIX "_int64" )
+      else()
+	set( _ACML_PATH_SUFFIX "" )
+      endif()
+      if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" )
+	set( _ACML_COMPILER32 "ifort32" )
+	set( _ACML_COMPILER64 "ifort64" )
+      elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" )
+	set( _ACML_COMPILER32 "sun32" )
+	set( _ACML_COMPILER64 "sun64" )
+      elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" )
+	set( _ACML_COMPILER32 "pgi32" )
+	if( WIN32 )
+	  set( _ACML_COMPILER64 "win64" )
+	else()
+	  set( _ACML_COMPILER64 "pgi64" )
+	endif()
+      elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" )
+	# 32 bit builds not supported on Open64 but for code simplicity
+	# We'll just use the same directory twice
+	set( _ACML_COMPILER32 "open64_64" )
+	set( _ACML_COMPILER64 "open64_64" )
+      elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" )
+	set( _ACML_COMPILER32 "nag32" )
+	set( _ACML_COMPILER64 "nag64" )
+      else()
+	set( _ACML_COMPILER32 "gfortran32" )
+	set( _ACML_COMPILER64 "gfortran64" )
+      endif()
+
+      if( BLA_VENDOR STREQUAL "ACML_MP" )
+	set(_ACML_MP_LIB_DIRS
+	  "${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib"
+	  "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" )
+      else()
+	set(_ACML_LIB_DIRS
+	  "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib"
+	  "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" )
+      endif()
+
+    endif()
+
+  elseif(BLAS_${BLA_VENDOR}_LIB_DIRS)
+
+    set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS})
+
+  endif()
+
+  if( BLA_VENDOR STREQUAL "ACML_MP" )
+    foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS})
+      check_fortran_libraries (
+	BLAS_LIBRARIES
+	BLAS
+	sgemm
+	"" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS}
+	)
+      if( BLAS_LIBRARIES )
+	break()
+      endif()
+    endforeach()
+  elseif( BLA_VENDOR STREQUAL "ACML_GPU" )
+    foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS})
+      check_fortran_libraries (
+	BLAS_LIBRARIES
+	BLAS
+	sgemm
+	"" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS}
+	)
+      if( BLAS_LIBRARIES )
+	break()
+      endif()
+    endforeach()
+  else()
+    foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} )
+      check_fortran_libraries (
+	BLAS_LIBRARIES
+	BLAS
+	sgemm
+	"" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS}
+	)
+      if( BLAS_LIBRARIES )
+	break()
+      endif()
+    endforeach()
+  endif()
+
+  # Either acml or acml_mp should be in LD_LIBRARY_PATH but not both
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "acml;acml_mv"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for ACML BLAS: found")
+      else()
+	message(STATUS "Looking for ACML BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "acml_mp;acml_mv"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for ACML BLAS: found")
+      else()
+	message(STATUS "Looking for ACML BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      sgemm
+      ""
+      "acml;acml_mv;CALBLAS"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for ACML BLAS: found")
+      else()
+	message(STATUS "Looking for ACML BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "ACML")
+  endif()
+
+endif () # ACML
+
+
+# Apple BLAS library?
+if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All")
+
+  if(NOT BLAS_LIBRARIES)
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      dgemm
+      ""
+      "Accelerate"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for Apple BLAS: found")
+      else()
+	message(STATUS "Looking for Apple BLAS: not found")
+      endif()
+    endif()
+  endif()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "Apple Accelerate")
+  endif()
+
+endif ()
+
+
+if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All")
+
+  if ( NOT BLAS_LIBRARIES )
+    check_fortran_libraries(
+      BLAS_LIBRARIES
+      BLAS
+      dgemm
+      ""
+      "vecLib"
+      ""
+      )
+    if(NOT BLAS_FIND_QUIETLY)
+      if(BLAS_LIBRARIES)
+	message(STATUS "Looking for NAS BLAS: found")
+      else()
+	message(STATUS "Looking for NAS BLAS: not found")
+      endif()
+    endif()
+  endif ()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "NAS")
+  endif()
+
+endif ()
+
+
+# Generic BLAS library?
+if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All")
+
+  set(BLAS_SEARCH_LIBS "blas;blas_LINUX;blas_MAC;blas_WINDOWS;refblas")
+  foreach (SEARCH_LIB ${BLAS_SEARCH_LIBS})
+    if (BLAS_LIBRARIES)
+    else ()
+      check_fortran_libraries(
+	BLAS_LIBRARIES
+	BLAS
+	sgemm
+	""
+	"${SEARCH_LIB}"
+	"${LGFORTRAN}"
+	)
+      if(NOT BLAS_FIND_QUIETLY)
+	if(BLAS_LIBRARIES)
+	  message(STATUS "Looking for Generic BLAS: found")
+	else()
+	  message(STATUS "Looking for Generic BLAS: not found")
+	endif()
+      endif()
+    endif()
+  endforeach ()
+
+  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
+      set (BLAS_VENDOR_FOUND "Netlib or other Generic libblas")
+  endif()
+
+endif ()
+
+
+if(BLA_F95)
+
+  if(BLAS95_LIBRARIES)
+    set(BLAS95_FOUND TRUE)
+  else()
+    set(BLAS95_FOUND FALSE)
+  endif()
+
+  if(NOT BLAS_FIND_QUIETLY)
+    if(BLAS95_FOUND)
+      message(STATUS "A library with BLAS95 API found.")
+      message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}")
+    else()
+      message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas 95 libraries could not be found or check of symbols failed."
+	"\nPlease indicate where to find blas libraries. You have three options:\n"
+	"- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n"
+	"- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n"
+	"- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n"
+	"\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure."
+	"\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name."
+	"\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit),"
+	"Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model),"
+	"Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic")
+      if(BLAS_FIND_REQUIRED)
+	message(FATAL_ERROR
+	  "A required library with BLAS95 API not found. Please specify library location.")
+      else()
+	message(STATUS
+	  "A library with BLAS95 API not found. Please specify library location.")
+      endif()
+    endif()
+  endif()
+
+  set(BLAS_FOUND TRUE)
+  set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}")
+
+else()
+
+  if(BLAS_LIBRARIES)
+    set(BLAS_FOUND TRUE)
+  else()
+    set(BLAS_FOUND FALSE)
+  endif()
+
+  if(NOT BLAS_FIND_QUIETLY)
+    if(BLAS_FOUND)
+      message(STATUS "A library with BLAS API found.")
+      message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}")
+    else()
+      message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas libraries could not be found or check of symbols failed."
+	"\nPlease indicate where to find blas libraries. You have three options:\n"
+	"- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n"
+	"- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n"
+	"- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n"
+	"\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure."
+	"\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name."
+	"\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit),"
+	"Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model),"
+	"Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic")
+      if(BLAS_FIND_REQUIRED)
+	message(FATAL_ERROR
+	  "A required library with BLAS API not found. Please specify library location.")
+      else()
+	message(STATUS
+	  "A library with BLAS API not found. Please specify library location.")
+      endif()
+    endif()
+  endif()
+
+endif()
+
+set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
+
+if (BLAS_FOUND)
+  list(GET BLAS_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)")
+    string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}")
+    set(BLAS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of BLAS library" FORCE)
+  else()
+    set(BLAS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of BLAS library" FORCE)
+  endif()
+endif()
+mark_as_advanced(BLAS_DIR)
+mark_as_advanced(BLAS_DIR_FOUND)

diff --git a/cmake/FindBLASEXT.cmake b/cmake/FindBLASEXT.cmake
new file mode 100644
index 0000000..69a9418
--- /dev/null
+++ b/cmake/FindBLASEXT.cmake

@@ -0,0 +1,384 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2016 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find BLAS EXTENDED for MORSE projects: find include dirs and libraries
+#
+# This module allows to find BLAS libraries by calling the official FindBLAS module
+# and handles the creation of different library lists whether the user wishes to link
+# with a sequential BLAS or a multihreaded (BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES).
+# BLAS is detected with a FindBLAS call then if the BLAS vendor is Intel10_64lp, ACML
+# or IBMESSLMT then the module attempts to find the corresponding multithreaded libraries.
+#
+# The following variables have been added to manage links with sequential or multithreaded
+# versions:
+#  BLAS_INCLUDE_DIRS  - BLAS include directories
+#  BLAS_LIBRARY_DIRS  - Link directories for BLAS libraries
+#  BLAS_SEQ_LIBRARIES - BLAS component libraries to be linked (sequential)
+#  BLAS_PAR_LIBRARIES - BLAS component libraries to be linked (multithreaded)
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013-2016 Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
+
+# macro to factorize this call
+include(CMakeFindDependencyMacro)
+macro(find_package_blas)
+  if(BLASEXT_FIND_REQUIRED)
+    if(BLASEXT_FIND_QUIETLY)
+      find_dependency(BLAS REQUIRED QUIET)
+    else()
+      find_dependency(BLAS REQUIRED)
+    endif()
+  else()
+    if(BLASEXT_FIND_QUIETLY)
+      find_dependency(BLAS QUIET)
+    else()
+      find_dependency(BLAS)
+    endif()
+  endif()
+endmacro()
+
+# add a cache variable to let the user specify the BLAS vendor
+set(BLA_VENDOR "" CACHE STRING "list of possible BLAS vendor:
+    Open, Eigen, Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT,
+    Intel10_32 (intel mkl v10 32 bit),
+    Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model),
+    Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model),
+    Intel( older versions of mkl 32 and 64 bit),
+    ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic")
+
+if(NOT BLASEXT_FIND_QUIETLY)
+  message(STATUS "In FindBLASEXT")
+  message(STATUS "If you want to force the use of one specific library, "
+    "\n   please specify the BLAS vendor by setting -DBLA_VENDOR=blas_vendor_name"
+    "\n   at cmake configure.")
+  message(STATUS "List of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, "
+    "\n   DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, Intel10_32 (intel mkl v10 32 bit),"
+    "\n   Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model),"
+    "\n   Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model),"
+    "\n   Intel( older versions of mkl 32 and 64 bit),"
+    "\n   ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic")
+endif()
+
+if (NOT BLAS_FOUND)
+  # First try to detect two cases:
+  # 1: only SEQ libs are handled
+  # 2: both SEQ and PAR libs are handled
+  find_package_blas()
+endif ()
+
+# detect the cases where SEQ and PAR libs are handled
+if(BLA_VENDOR STREQUAL "All" AND
+    (BLAS_mkl_core_LIBRARY OR BLAS_mkl_core_dll_LIBRARY)
+    )
+  set(BLA_VENDOR "Intel")
+  if(BLAS_mkl_intel_LIBRARY)
+    set(BLA_VENDOR "Intel10_32")
+  endif()
+  if(BLAS_mkl_intel_lp64_LIBRARY)
+    set(BLA_VENDOR "Intel10_64lp")
+  endif()
+  if(NOT BLASEXT_FIND_QUIETLY)
+    message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we"
+      "\n   have also potentially detected some multithreaded BLAS libraries from the MKL."
+      "\n   We try to find both libraries lists (Sequential/Multithreaded).")
+  endif()
+  set(BLAS_FOUND "")
+elseif(BLA_VENDOR STREQUAL "All" AND BLAS_acml_LIBRARY)
+  set(BLA_VENDOR "ACML")
+  if(NOT BLASEXT_FIND_QUIETLY)
+    message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we"
+      "\n   have also potentially detected some multithreaded BLAS libraries from the ACML."
+      "\n   We try to find both libraries lists (Sequential/Multithreaded).")
+  endif()
+  set(BLAS_FOUND "")
+elseif(BLA_VENDOR STREQUAL "All" AND BLAS_essl_LIBRARY)
+  set(BLA_VENDOR "IBMESSL")
+  if(NOT BLASEXT_FIND_QUIETLY)
+    message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we"
+      "\n   have also potentially detected some multithreaded BLAS libraries from the ESSL."
+      "\n   We try to find both libraries lists (Sequential/Multithreaded).")
+  endif()
+  set(BLAS_FOUND "")
+endif()
+
+# Intel case
+if(BLA_VENDOR MATCHES "Intel*")
+
+  ###
+  # look for include path if the BLAS vendor is Intel
+  ###
+
+  # gather system include paths
+  unset(_inc_env)
+  if(WIN32)
+    string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+  else()
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+  endif()
+  list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+  list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+  set(ENV_MKLROOT "$ENV{MKLROOT}")
+  if (ENV_MKLROOT)
+    list(APPEND _inc_env "${ENV_MKLROOT}/include")
+  endif()
+  list(REMOVE_DUPLICATES _inc_env)
+
+  # find mkl.h inside known include paths
+  set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND")
+  if(BLAS_INCDIR)
+    set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND")
+    find_path(BLAS_mkl.h_INCLUDE_DIRS
+      NAMES mkl.h
+      HINTS ${BLAS_INCDIR})
+  else()
+    if(BLAS_DIR)
+      set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND")
+      find_path(BLAS_mkl.h_INCLUDE_DIRS
+	NAMES mkl.h
+	HINTS ${BLAS_DIR}
+	PATH_SUFFIXES include)
+    else()
+      set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND")
+      find_path(BLAS_mkl.h_INCLUDE_DIRS
+	NAMES mkl.h
+	HINTS ${_inc_env})
+    endif()
+  endif()
+  mark_as_advanced(BLAS_mkl.h_INCLUDE_DIRS)
+  ## Print status if not found
+  ## -------------------------
+  #if (NOT BLAS_mkl.h_INCLUDE_DIRS AND MORSE_VERBOSE)
+  #    Print_Find_Header_Status(blas mkl.h)
+  #endif ()
+  set(BLAS_INCLUDE_DIRS "")
+  if(BLAS_mkl.h_INCLUDE_DIRS)
+    list(APPEND BLAS_INCLUDE_DIRS "${BLAS_mkl.h_INCLUDE_DIRS}" )
+  endif()
+
+  ###
+  # look for libs
+  ###
+  # if Intel 10 64 bit -> look for sequential and multithreaded versions
+  if(BLA_VENDOR MATCHES "Intel10_64lp*")
+
+    ## look for the sequential version
+    set(BLA_VENDOR "Intel10_64lp_seq")
+    if(NOT BLASEXT_FIND_QUIETLY)
+      message(STATUS "Look for the sequential version Intel10_64lp_seq")
+    endif()
+    find_package_blas()
+    if(BLAS_FOUND)
+      set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}")
+    else()
+      set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}")
+    endif()
+
+    ## look for the multithreaded version
+    set(BLA_VENDOR "Intel10_64lp")
+    if(NOT BLASEXT_FIND_QUIETLY)
+      message(STATUS "Look for the multithreaded version Intel10_64lp")
+    endif()
+    find_package_blas()
+    if(BLAS_FOUND)
+      set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}")
+    else()
+      set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}")
+    endif()
+
+  else()
+
+    if(BLAS_FOUND)
+      set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}")
+    else()
+      set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}")
+    endif()
+
+  endif()
+
+  # ACML case
+elseif(BLA_VENDOR MATCHES "ACML*")
+
+  ## look for the sequential version
+  set(BLA_VENDOR "ACML")
+  find_package_blas()
+  if(BLAS_FOUND)
+    set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}")
+  else()
+    set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}")
+  endif()
+
+  ## look for the multithreaded version
+  set(BLA_VENDOR "ACML_MP")
+  find_package_blas()
+  if(BLAS_FOUND)
+    set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}")
+  else()
+    set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}")
+  endif()
+
+  # IBMESSL case
+elseif(BLA_VENDOR MATCHES "IBMESSL*")
+
+  ## look for the sequential version
+  set(BLA_VENDOR "IBMESSL")
+  find_package_blas()
+  if(BLAS_FOUND)
+    set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}")
+  else()
+    set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}")
+  endif()
+
+  ## look for the multithreaded version
+  set(BLA_VENDOR "IBMESSLMT")
+  find_package_blas()
+  if(BLAS_FOUND)
+    set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}")
+  else()
+    set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}")
+  endif()
+
+else()
+
+  if(BLAS_FOUND)
+    # define the SEQ libs as the BLAS_LIBRARIES
+    set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}")
+  else()
+    set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}")
+  endif()
+  set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}")
+
+endif()
+
+
+if(BLAS_SEQ_LIBRARIES)
+  set(BLAS_LIBRARIES "${BLAS_SEQ_LIBRARIES}")
+endif()
+
+# extract libs paths
+# remark: because it is not given by find_package(BLAS)
+set(BLAS_LIBRARY_DIRS "")
+string(REPLACE " " ";" BLAS_LIBRARIES "${BLAS_LIBRARIES}")
+foreach(blas_lib ${BLAS_LIBRARIES})
+  if (EXISTS "${blas_lib}")
+    get_filename_component(a_blas_lib_dir "${blas_lib}" PATH)
+    list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" )
+  else()
+    string(REPLACE "-L" "" blas_lib "${blas_lib}")
+    if (EXISTS "${blas_lib}")
+      list(APPEND BLAS_LIBRARY_DIRS "${blas_lib}" )
+    else()
+      get_filename_component(a_blas_lib_dir "${blas_lib}" PATH)
+      if (EXISTS "${a_blas_lib_dir}")
+	list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" )
+      endif()
+    endif()
+  endif()
+endforeach()
+if (BLAS_LIBRARY_DIRS)
+  list(REMOVE_DUPLICATES BLAS_LIBRARY_DIRS)
+endif ()
+
+# check that BLAS has been found
+# ---------------------------------
+include(FindPackageHandleStandardArgs)
+if(BLA_VENDOR MATCHES "Intel*")
+  if(BLA_VENDOR MATCHES "Intel10_64lp*")
+    if(NOT BLASEXT_FIND_QUIETLY)
+      message(STATUS "BLAS found is Intel MKL:"
+	"\n   we manage two lists of libs, one sequential and one parallel if found"
+	"\n   (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)")
+      message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
+    endif()
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+      BLAS_SEQ_LIBRARIES
+      BLAS_LIBRARY_DIRS
+      BLAS_INCLUDE_DIRS)
+    if(BLAS_PAR_LIBRARIES)
+      if(NOT BLASEXT_FIND_QUIETLY)
+	message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES")
+      endif()
+      find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+	BLAS_PAR_LIBRARIES)
+    endif()
+  else()
+    if(NOT BLASEXT_FIND_QUIETLY)
+      message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
+    endif()
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+      BLAS_SEQ_LIBRARIES
+      BLAS_LIBRARY_DIRS
+      BLAS_INCLUDE_DIRS)
+  endif()
+elseif(BLA_VENDOR MATCHES "ACML*")
+  if(NOT BLASEXT_FIND_QUIETLY)
+    message(STATUS "BLAS found is ACML:"
+      "\n   we manage two lists of libs, one sequential and one parallel if found"
+      "\n   (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)")
+    message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
+  endif()
+  find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+    BLAS_SEQ_LIBRARIES
+    BLAS_LIBRARY_DIRS)
+  if(BLAS_PAR_LIBRARIES)
+    if(NOT BLASEXT_FIND_QUIETLY)
+      message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES")
+    endif()
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+      BLAS_PAR_LIBRARIES)
+  endif()
+elseif(BLA_VENDOR MATCHES "IBMESSL*")
+  if(NOT BLASEXT_FIND_QUIETLY)
+    message(STATUS "BLAS found is ESSL:"
+      "\n   we manage two lists of libs, one sequential and one parallel if found"
+      "\n   (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)")
+    message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
+  endif()
+  find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+    BLAS_SEQ_LIBRARIES
+    BLAS_LIBRARY_DIRS)
+  if(BLAS_PAR_LIBRARIES)
+    if(NOT BLASEXT_FIND_QUIETLY)
+      message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES")
+    endif()
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+      BLAS_PAR_LIBRARIES)
+  endif()
+else()
+  if(NOT BLASEXT_FIND_QUIETLY)
+    message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
+  endif()
+  find_package_handle_standard_args(BLASEXT DEFAULT_MSG
+    BLAS_SEQ_LIBRARIES
+    BLAS_LIBRARY_DIRS)
+endif()
+
+# Callers expect BLAS_FOUND to be set as well.
+set(BLAS_FOUND BLASEXT_FOUND)

diff --git a/cmake/FindCHOLMOD.cmake b/cmake/FindCHOLMOD.cmake
new file mode 100644
index 0000000..e470cb2
--- /dev/null
+++ b/cmake/FindCHOLMOD.cmake

@@ -0,0 +1,89 @@
+# CHOLMOD lib usually requires linking to a blas and lapack library.
+# It is up to the user of this module to find a BLAS and link to it.
+
+if (CHOLMOD_INCLUDES AND CHOLMOD_LIBRARIES)
+  set(CHOLMOD_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(CHOLMOD_INCLUDES
+  NAMES
+  cholmod.h
+  PATHS
+  $ENV{CHOLMODDIR}
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  suitesparse
+  ufsparse
+)
+
+find_library(CHOLMOD_LIBRARIES cholmod PATHS $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+
+if(CHOLMOD_LIBRARIES)
+
+  get_filename_component(CHOLMOD_LIBDIR ${CHOLMOD_LIBRARIES} PATH)
+
+  find_library(AMD_LIBRARY amd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+  if (AMD_LIBRARY)
+    set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${AMD_LIBRARY})
+  else ()
+    set(CHOLMOD_LIBRARIES FALSE)
+  endif ()
+
+endif()
+
+if(CHOLMOD_LIBRARIES)
+
+  find_library(COLAMD_LIBRARY colamd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+  if (COLAMD_LIBRARY)
+    set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${COLAMD_LIBRARY})
+  else ()
+    set(CHOLMOD_LIBRARIES FALSE)
+  endif ()
+
+endif()
+
+if(CHOLMOD_LIBRARIES)
+
+  find_library(CAMD_LIBRARY camd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+  if (CAMD_LIBRARY)
+    set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CAMD_LIBRARY})
+  else ()
+    set(CHOLMOD_LIBRARIES FALSE)
+  endif ()
+
+endif()
+
+if(CHOLMOD_LIBRARIES)
+
+  find_library(CCOLAMD_LIBRARY ccolamd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+  if (CCOLAMD_LIBRARY)
+    set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CCOLAMD_LIBRARY})
+  else ()
+    set(CHOLMOD_LIBRARIES FALSE)
+  endif ()
+
+endif()
+
+if(CHOLMOD_LIBRARIES)
+
+  find_library(CHOLMOD_METIS_LIBRARY metis PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+  if (CHOLMOD_METIS_LIBRARY)
+    set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CHOLMOD_METIS_LIBRARY})
+  endif ()
+
+endif()
+
+if(CHOLMOD_LIBRARIES)
+
+  find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
+  if (SUITESPARSE_LIBRARY)
+    set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${SUITESPARSE_LIBRARY})
+  endif ()
+  
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CHOLMOD DEFAULT_MSG
+                                  CHOLMOD_INCLUDES CHOLMOD_LIBRARIES)
+
+mark_as_advanced(CHOLMOD_INCLUDES CHOLMOD_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY SUITESPARSE_LIBRARY CAMD_LIBRARY CCOLAMD_LIBRARY CHOLMOD_METIS_LIBRARY)

diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake
new file mode 100644
index 0000000..1c271f0
--- /dev/null
+++ b/cmake/FindComputeCpp.cmake

@@ -0,0 +1,455 @@
+#.rst:
+# FindComputeCpp
+#---------------
+#
+#   Copyright 2016-2018 Codeplay Software Ltd.
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use these files except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+#########################
+#  FindComputeCpp.cmake
+#########################
+#
+#  Tools for finding and building with ComputeCpp.
+#
+#  User must define ComputeCpp_DIR pointing to the ComputeCpp
+#  installation.
+#
+#  Latest version of this file can be found at:
+#    https://github.com/codeplaysoftware/computecpp-sdk
+
+cmake_minimum_required(VERSION 3.4.3)
+include(FindPackageHandleStandardArgs)
+include(ComputeCppIRMap)
+
+set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++")
+separate_arguments(COMPUTECPP_USER_FLAGS)
+mark_as_advanced(COMPUTECPP_USER_FLAGS)
+
+set(COMPUTECPP_BITCODE "spir64" CACHE STRING
+  "Bitcode type to use as SYCL target in compute++")
+mark_as_advanced(COMPUTECPP_BITCODE)
+
+include(CMakeFindDependencyMacro)
+find_dependency(OpenCL REQUIRED)
+
+# Find ComputeCpp package
+
+if(DEFINED ComputeCpp_DIR)
+  set(computecpp_find_hint ${ComputeCpp_DIR})
+elseif(DEFINED ENV{COMPUTECPP_DIR})
+  set(computecpp_find_hint $ENV{COMPUTECPP_DIR})
+endif()
+
+# Used for running executables on the host
+set(computecpp_host_find_hint ${computecpp_find_hint})
+
+if(CMAKE_CROSSCOMPILING)
+  # ComputeCpp_HOST_DIR is used to find executables that are run on the host
+  if(DEFINED ComputeCpp_HOST_DIR)
+    set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR})
+  elseif(DEFINED ENV{COMPUTECPP_HOST_DIR})
+    set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR})
+  endif()
+endif()
+
+find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++
+  HINTS ${computecpp_host_find_hint}
+  PATH_SUFFIXES bin
+  NO_SYSTEM_ENVIRONMENT_PATH)
+
+find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info
+  HINTS ${computecpp_host_find_hint}
+  PATH_SUFFIXES bin
+  NO_SYSTEM_ENVIRONMENT_PATH)
+
+find_library(COMPUTECPP_RUNTIME_LIBRARY
+  NAMES ComputeCpp ComputeCpp_vs2015
+  HINTS ${computecpp_find_hint}
+  PATH_SUFFIXES lib
+  DOC "ComputeCpp Runtime Library")
+
+find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG
+  NAMES ComputeCpp_d ComputeCpp ComputeCpp_vs2015_d
+  HINTS ${computecpp_find_hint}
+  PATH_SUFFIXES lib
+  DOC "ComputeCpp Debug Runtime Library")
+
+find_path(ComputeCpp_INCLUDE_DIRS
+  NAMES "CL/sycl.hpp"
+  HINTS ${computecpp_find_hint}/include
+  DOC "The ComputeCpp include directory")
+get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE)
+
+get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." ABSOLUTE)
+set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH
+    "The root of the ComputeCpp install")
+
+if(NOT ComputeCpp_INFO_EXECUTABLE)
+  message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR")
+else()
+  execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version"
+    OUTPUT_VARIABLE ComputeCpp_VERSION
+    RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
+    message(WARNING "Package version - Error obtaining version!")
+  endif()
+
+  execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported"
+    OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
+    RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
+    message(WARNING "platform - Error checking platform support!")
+  else()
+    mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
+    if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
+      message(STATUS "platform - your system can support ComputeCpp")
+    else()
+      message(STATUS "platform - your system is not officially supported")
+    endif()
+  endif()
+endif()
+
+find_package_handle_standard_args(ComputeCpp
+  REQUIRED_VARS ComputeCpp_ROOT_DIR
+                ComputeCpp_DEVICE_COMPILER_EXECUTABLE
+                ComputeCpp_INFO_EXECUTABLE
+                COMPUTECPP_RUNTIME_LIBRARY
+                COMPUTECPP_RUNTIME_LIBRARY_DEBUG
+                ComputeCpp_INCLUDE_DIRS
+  VERSION_VAR ComputeCpp_VERSION)
+mark_as_advanced(ComputeCpp_ROOT_DIR
+                 ComputeCpp_DEVICE_COMPILER_EXECUTABLE
+                 ComputeCpp_INFO_EXECUTABLE
+                 COMPUTECPP_RUNTIME_LIBRARY
+                 COMPUTECPP_RUNTIME_LIBRARY_DEBUG
+                 ComputeCpp_INCLUDE_DIRS
+                 ComputeCpp_VERSION)
+
+if(NOT ComputeCpp_FOUND)
+  return()
+endif()
+
+list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata)
+mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS)
+
+if(CMAKE_CROSSCOMPILING)
+  if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN)
+    list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR})
+  endif()
+  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR})
+  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE})
+endif()
+
+list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE})
+message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
+
+include(ComputeCppCompilerChecks)
+
+if(NOT TARGET OpenCL::OpenCL)
+  add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
+  set_target_properties(OpenCL::OpenCL PROPERTIES
+    IMPORTED_LOCATION             "${OpenCL_LIBRARIES}"
+    INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}"
+  )
+endif()
+
+if(NOT TARGET ComputeCpp::ComputeCpp)
+  add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED)
+  set_target_properties(ComputeCpp::ComputeCpp PROPERTIES
+    IMPORTED_LOCATION_DEBUG          "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}"
+    IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY}"
+    IMPORTED_LOCATION                "${COMPUTECPP_RUNTIME_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES    "${ComputeCpp_INCLUDE_DIRS}"
+    INTERFACE_LINK_LIBRARIES         "OpenCL::OpenCL"
+  )
+endif()
+
+# This property allows targets to specify that their sources should be
+# compiled with the integration header included after the user's
+# sources, not before (e.g. when an enum is used in a kernel name, this
+# is not technically valid SYCL code but can work with ComputeCpp)
+define_property(
+  TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER
+  BRIEF_DOCS "Include integration header after user source"
+  FULL_DOCS "Changes compiler arguments such that the source file is
+  actually the integration header, and the .cpp file is included on
+  the command line so that it is seen by the compiler first. Enables
+  non-standards-conformant SYCL code to compile with ComputeCpp."
+)
+define_property(
+  TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS
+  BRIEF_DOCS "Interface compile flags to provide compute++"
+  FULL_DOCS  "Set additional compile flags to pass to compute++ when compiling
+  any target which links to this one."
+)
+define_property(
+  SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS
+  BRIEF_DOCS "Source file compile flags for compute++"
+  FULL_DOCS  "Set additional compile flags for compiling the SYCL integration
+  header for the given source file."
+)
+
+####################
+#   __build_ir
+####################
+#
+#  Adds a custom target for running compute++ and adding a dependency for the
+#  resulting integration header and kernel binary.
+#
+#  TARGET : Name of the target.
+#  SOURCE : Source file to be compiled.
+#  COUNTER : Counter included in name of custom target. Different counter
+#       values prevent duplicated names of custom target when source files with
+#       the same name, but located in different directories, are used for the
+#       same target.
+#
+function(__build_ir)
+  set(options)
+  set(one_value_args
+    TARGET
+    SOURCE
+    COUNTER
+  )
+  set(multi_value_args)
+  cmake_parse_arguments(SDK_BUILD_IR
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+  get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME)
+
+  # Set the path to the integration header.
+  # The .sycl filename must depend on the target so that different targets
+  # using the same source file will be generated with a different rule.
+  set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName})
+  set(outputSyclFile ${baseSyclName}.sycl)
+  set(outputDeviceFile ${baseSyclName}.${IR_MAP_${COMPUTECPP_BITCODE}})
+  set(depFileName ${baseSyclName}.sycl.d)
+
+  set(include_directories "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},INCLUDE_DIRECTORIES>")
+  set(compile_definitions "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},COMPILE_DEFINITIONS>")
+  set(generated_include_directories
+    $<$<BOOL:${include_directories}>:-I\"$<JOIN:${include_directories},\"\t-I\">\">)
+  set(generated_compile_definitions
+    $<$<BOOL:${compile_definitions}>:-D$<JOIN:${compile_definitions},\t-D>>)
+
+  # Obtain language standard of the file
+  set(device_compiler_cxx_standard)
+  get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD)
+  if (targetCxxStandard MATCHES 17)
+    set(device_compiler_cxx_standard "-std=c++1z")
+  elseif (targetCxxStandard MATCHES 14)
+    set(device_compiler_cxx_standard "-std=c++14")
+  elseif (targetCxxStandard MATCHES 11)
+    set(device_compiler_cxx_standard "-std=c++11")
+  elseif (targetCxxStandard MATCHES 98)
+    message(FATAL_ERROR "SYCL applications cannot be compiled using C++98")
+  else ()
+    set(device_compiler_cxx_standard "")
+  endif()
+
+  get_property(source_compile_flags
+    SOURCE ${SDK_BUILD_IR_SOURCE}
+    PROPERTY COMPUTECPP_SOURCE_FLAGS
+  )
+  separate_arguments(source_compile_flags)
+  if(source_compile_flags)
+    list(APPEND computecpp_source_flags ${source_compile_flags})
+  endif()
+
+  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS
+    ${device_compiler_cxx_standard}
+    ${COMPUTECPP_USER_FLAGS}
+    ${computecpp_source_flags}
+  )
+
+  set(ir_dependencies ${SDK_BUILD_IR_SOURCE})
+  get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES)
+  if(target_libraries)
+    foreach(library ${target_libraries})
+      if(TARGET ${library})
+        list(APPEND ir_dependencies ${library})
+      endif()
+    endforeach()
+  endif()
+
+  # Depfile support was only added in CMake 3.7
+  # CMake throws an error if it is unsupported by the generator (i. e. not ninja)
+  if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND
+          CMAKE_GENERATOR MATCHES "Ninja")
+    file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputDeviceFile})
+    set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile})
+    set(enable_depfile DEPFILE ${depFileName})
+  endif()
+
+  # Add custom command for running compute++
+  add_custom_command(
+    OUTPUT ${outputDeviceFile} ${outputSyclFile}
+    COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
+            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+            ${generated_include_directories}
+            ${generated_compile_definitions}
+            -sycl-ih ${outputSyclFile}
+            -o ${outputDeviceFile}
+            -c ${SDK_BUILD_IR_SOURCE}
+            ${generate_depfile}
+    DEPENDS ${ir_dependencies}
+    IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE}
+    ${enable_depfile}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
+
+  # Name: (user-defined name)_(source file)_(counter)_ih
+  set(headerTargetName
+    ${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih)
+
+  if(NOT MSVC)
+    # Add a custom target for the generated integration header
+    add_custom_target(${headerTargetName} DEPENDS ${outputDeviceFile} ${outputSyclFile})
+    add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName})
+  endif()
+
+  # This property can be set on a per-target basis to indicate that the
+  # integration header should appear after the main source listing
+  get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
+
+  if(includeAfter)
+    # Change the source file to the integration header - e.g.
+    # g++ -c source_file_name.cpp.sycl
+    get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES)
+    # Remove absolute path to source file
+    list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE})
+    # Remove relative path to source file
+    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" ""
+      rel_source_file ${SDK_BUILD_IR_SOURCE}
+    )
+    list(REMOVE_ITEM current_sources ${rel_source_file})
+    # Add SYCL header to source list
+    list(APPEND current_sources ${outputSyclFile})
+    set_property(TARGET ${SDK_BUILD_IR_TARGET}
+      PROPERTY SOURCES ${current_sources})
+    # CMake/gcc don't know what language a .sycl file is, so tell them
+    set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX)
+    set(includedFile ${SDK_BUILD_IR_SOURCE})
+    set(cppFile ${outputSyclFile})
+  else()
+    set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON)
+    set(includedFile ${outputSyclFile})
+    set(cppFile ${SDK_BUILD_IR_SOURCE})
+  endif()
+
+  # Force inclusion of the integration header for the host compiler
+  if(MSVC)
+    # Group SYCL files inside Visual Studio
+    source_group("SYCL" FILES ${outputSyclFile})
+
+    if(includeAfter)
+      # Allow the source file to be edited using Visual Studio.
+      # It will be added as a header file so it won't be compiled.
+      set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true)
+    endif()
+
+    # Add both source and the sycl files to the VS solution.
+    target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile})
+
+    set(forceIncludeFlags "/FI${includedFile} /TP")
+  else()
+    set(forceIncludeFlags "-include ${includedFile} -x c++")
+  endif()
+
+  set_property(
+    SOURCE ${cppFile}
+    APPEND_STRING PROPERTY COMPILE_FLAGS "${forceIncludeFlags}"
+  )
+
+endfunction(__build_ir)
+
+#######################
+#  add_sycl_to_target
+#######################
+#
+#  Adds a SYCL compilation custom command associated with an existing
+#  target and sets a dependancy on that new command.
+#
+#  TARGET : Name of the target to add SYCL to.
+#  SOURCES : Source files to be compiled for SYCL.
+#
+function(add_sycl_to_target)
+  set(options)
+  set(one_value_args
+    TARGET
+  )
+  set(multi_value_args
+    SOURCES
+  )
+  cmake_parse_arguments(SDK_ADD_SYCL
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+
+  set_target_properties(${SDK_ADD_SYCL_TARGET} PROPERTIES LINKER_LANGUAGE CXX)
+
+  # If the CXX compiler is set to compute++ enable the driver.
+  get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME)
+  if("${cmakeCxxCompilerFileName}" STREQUAL "compute++")
+    if(MSVC)
+      message(FATAL_ERROR "The compiler driver is not supported by this system,
+                           revert the CXX compiler to your default host compiler.")
+    endif()
+
+    get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
+    if(includeAfter)
+      list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last)
+    endif()
+    list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver)
+    # Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS
+    foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS)
+      get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop})
+      if(NOT target_compile_options)
+        set(target_compile_options "")
+      endif()
+      set_property(
+        TARGET ${SDK_ADD_SYCL_TARGET}
+        PROPERTY ${prop}
+        ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+        ${target_compile_options}
+        ${COMPUTECPP_USER_FLAGS}
+      )
+    endforeach()
+  else()
+    set(fileCounter 0)
+    list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl)
+    # Add custom target to run compute++ and generate the integration header
+    foreach(sourceFile ${SDK_ADD_SYCL_SOURCES})
+      if(NOT IS_ABSOLUTE ${sourceFile})
+        set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}")
+      endif()
+      __build_ir(
+        TARGET     ${SDK_ADD_SYCL_TARGET}
+        SOURCE     ${sourceFile}
+        COUNTER    ${fileCounter}
+      )
+      MATH(EXPR fileCounter "${fileCounter} + 1")
+    endforeach()
+  endif()
+
+  set_property(TARGET ${SDK_ADD_SYCL_TARGET}
+    APPEND PROPERTY LINK_LIBRARIES ComputeCpp::ComputeCpp)
+  set_property(TARGET ${SDK_ADD_SYCL_TARGET}
+    APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp)
+endfunction(add_sycl_to_target)

diff --git a/cmake/FindEigen2.cmake b/cmake/FindEigen2.cmake
new file mode 100644
index 0000000..eb2709d
--- /dev/null
+++ b/cmake/FindEigen2.cmake

@@ -0,0 +1,80 @@
+# - Try to find Eigen2 lib
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(Eigen2 2.0.3)
+# to require version 2.0.3 to newer of Eigen2.
+#
+# Once done this will define
+#
+#  EIGEN2_FOUND - system has eigen lib with correct version
+#  EIGEN2_INCLUDE_DIR - the eigen include directory
+#  EIGEN2_VERSION - eigen version
+
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
+# Redistribution and use is allowed according to the terms of the BSD license.
+
+if(NOT Eigen2_FIND_VERSION)
+  if(NOT Eigen2_FIND_VERSION_MAJOR)
+    set(Eigen2_FIND_VERSION_MAJOR 2)
+  endif()
+  if(NOT Eigen2_FIND_VERSION_MINOR)
+    set(Eigen2_FIND_VERSION_MINOR 0)
+  endif()
+  if(NOT Eigen2_FIND_VERSION_PATCH)
+    set(Eigen2_FIND_VERSION_PATCH 0)
+  endif()
+
+  set(Eigen2_FIND_VERSION "${Eigen2_FIND_VERSION_MAJOR}.${Eigen2_FIND_VERSION_MINOR}.${Eigen2_FIND_VERSION_PATCH}")
+endif()
+
+macro(_eigen2_check_version)
+  file(READ "${EIGEN2_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen2_version_header)
+
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen2_world_version_match "${_eigen2_version_header}")
+  set(EIGEN2_WORLD_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen2_major_version_match "${_eigen2_version_header}")
+  set(EIGEN2_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen2_minor_version_match "${_eigen2_version_header}")
+  set(EIGEN2_MINOR_VERSION "${CMAKE_MATCH_1}")
+
+  set(EIGEN2_VERSION ${EIGEN2_WORLD_VERSION}.${EIGEN2_MAJOR_VERSION}.${EIGEN2_MINOR_VERSION})
+  if((${EIGEN2_WORLD_VERSION} NOTEQUAL 2) OR (${EIGEN2_MAJOR_VERSION} GREATER 10) OR (${EIGEN2_VERSION} VERSION_LESS ${Eigen2_FIND_VERSION}))
+    set(EIGEN2_VERSION_OK FALSE)
+  else()
+    set(EIGEN2_VERSION_OK TRUE)
+  endif()
+
+  if(NOT EIGEN2_VERSION_OK)
+
+    message(STATUS "Eigen2 version ${EIGEN2_VERSION} found in ${EIGEN2_INCLUDE_DIR}, "
+                   "but at least version ${Eigen2_FIND_VERSION} is required")
+  endif()
+endmacro()
+
+if (EIGEN2_INCLUDE_DIR)
+
+  # in cache already
+  _eigen2_check_version()
+  set(EIGEN2_FOUND ${EIGEN2_VERSION_OK})
+
+else ()
+
+find_path(EIGEN2_INCLUDE_DIR NAMES Eigen/Core
+     PATHS
+     ${INCLUDE_INSTALL_DIR}
+     ${KDE4_INCLUDE_DIR}
+     PATH_SUFFIXES eigen2
+   )
+
+if(EIGEN2_INCLUDE_DIR)
+  _eigen2_check_version()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Eigen2 DEFAULT_MSG EIGEN2_INCLUDE_DIR EIGEN2_VERSION_OK)
+
+mark_as_advanced(EIGEN2_INCLUDE_DIR)
+
+endif()
+

diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake
new file mode 100644
index 0000000..0b36805
--- /dev/null
+++ b/cmake/FindEigen3.cmake

@@ -0,0 +1,107 @@
+# - Try to find Eigen3 lib
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(Eigen3 3.1.2)
+# to require version 3.1.2 or newer of Eigen3.
+#
+# Once done this will define
+#
+#  EIGEN3_FOUND - system has eigen lib with correct version
+#  EIGEN3_INCLUDE_DIR - the eigen include directory
+#  EIGEN3_VERSION - eigen version
+#
+# and the following imported target:
+#
+#  Eigen3::Eigen - The header-only Eigen library
+#
+# This module reads hints about search locations from 
+# the following environment variables:
+#
+# EIGEN3_ROOT
+# EIGEN3_ROOT_DIR
+
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
+# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+
+if(NOT Eigen3_FIND_VERSION)
+  if(NOT Eigen3_FIND_VERSION_MAJOR)
+    set(Eigen3_FIND_VERSION_MAJOR 2)
+  endif()
+  if(NOT Eigen3_FIND_VERSION_MINOR)
+    set(Eigen3_FIND_VERSION_MINOR 91)
+  endif()
+  if(NOT Eigen3_FIND_VERSION_PATCH)
+    set(Eigen3_FIND_VERSION_PATCH 0)
+  endif()
+
+  set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
+endif()
+
+macro(_eigen3_check_version)
+  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
+
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
+  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
+
+  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
+  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK FALSE)
+  else()
+    set(EIGEN3_VERSION_OK TRUE)
+  endif()
+
+  if(NOT EIGEN3_VERSION_OK)
+
+    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
+                   "but at least version ${Eigen3_FIND_VERSION} is required")
+  endif()
+endmacro()
+
+if (EIGEN3_INCLUDE_DIR)
+
+  # in cache already
+  _eigen3_check_version()
+  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+  set(Eigen3_FOUND ${EIGEN3_VERSION_OK})
+
+else ()
+  
+  # search first if an Eigen3Config.cmake is available in the system,
+  # if successful this would set EIGEN3_INCLUDE_DIR and the rest of
+  # the script will work as usual
+  find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE QUIET)
+
+  if(NOT EIGEN3_INCLUDE_DIR)
+    find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
+        HINTS
+        ENV EIGEN3_ROOT 
+        ENV EIGEN3_ROOT_DIR
+        PATHS
+        ${CMAKE_INSTALL_PREFIX}/include
+        ${KDE4_INCLUDE_DIR}
+        PATH_SUFFIXES eigen3 eigen
+      )
+  endif()
+
+  if(EIGEN3_INCLUDE_DIR)
+    _eigen3_check_version()
+  endif()
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
+
+  mark_as_advanced(EIGEN3_INCLUDE_DIR)
+
+endif()
+
+if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen)
+  add_library(Eigen3::Eigen INTERFACE IMPORTED)
+  set_target_properties(Eigen3::Eigen PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}")
+endif()

diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake
new file mode 100644
index 0000000..ed55c5f
--- /dev/null
+++ b/cmake/FindFFTW.cmake

@@ -0,0 +1,120 @@
+# - Find the FFTW library
+#
+# Usage:
+#   find_package(FFTW [REQUIRED] [QUIET] )
+#     
+# It sets the following variables:
+#   FFTW_FOUND               ... true if fftw is found on the system
+#   FFTW_LIBRARIES           ... full path to fftw library
+#   FFTW_INCLUDES            ... fftw include directory
+#
+# The following variables will be checked by the function
+#   FFTW_USE_STATIC_LIBS    ... if true, only static libraries are found
+#   FFTW_ROOT               ... if set, the libraries are exclusively searched
+#                               under this path
+#   FFTW_LIBRARY            ... fftw library to use
+#   FFTW_INCLUDE_DIR        ... fftw include directory
+#
+
+#If environment variable FFTWDIR is specified, it has same effect as FFTW_ROOT
+if( NOT FFTW_ROOT AND ENV{FFTWDIR} )
+  set( FFTW_ROOT $ENV{FFTWDIR} )
+endif()
+
+# Check if we can use PkgConfig
+include(CMakeFindDependencyMacro)
+find_dependency(PkgConfig)
+
+#Determine from PKG
+if( PKG_CONFIG_FOUND AND NOT FFTW_ROOT )
+  pkg_check_modules( PKG_FFTW QUIET "fftw3" )
+endif()
+
+#Check whether to search static or dynamic libs
+set( CMAKE_FIND_LIBRARY_SUFFIXES_SAV ${CMAKE_FIND_LIBRARY_SUFFIXES} )
+
+if( ${FFTW_USE_STATIC_LIBS} )
+  set( CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX} )
+else()
+  set( CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_SHARED_LIBRARY_SUFFIX} )
+endif()
+
+if( FFTW_ROOT )
+
+  #find libs
+  find_library(
+    FFTW_LIB
+    NAMES "fftw3"
+    PATHS ${FFTW_ROOT}
+    PATH_SUFFIXES "lib" "lib64"
+    NO_DEFAULT_PATH
+  )
+
+  find_library(
+    FFTWF_LIB
+    NAMES "fftw3f"
+    PATHS ${FFTW_ROOT}
+    PATH_SUFFIXES "lib" "lib64"
+    NO_DEFAULT_PATH
+  )
+
+  find_library(
+    FFTWL_LIB
+    NAMES "fftw3l"
+    PATHS ${FFTW_ROOT}
+    PATH_SUFFIXES "lib" "lib64"
+    NO_DEFAULT_PATH
+  )
+
+  #find includes
+  find_path(
+    FFTW_INCLUDES
+    NAMES "fftw3.h"
+    PATHS ${FFTW_ROOT}
+    PATH_SUFFIXES "include"
+    NO_DEFAULT_PATH
+  )
+
+else()
+
+  find_library(
+    FFTW_LIB
+    NAMES "fftw3"
+    PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR}
+  )
+
+  find_library(
+    FFTWF_LIB
+    NAMES "fftw3f"
+    PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR}
+  )
+
+
+  find_library(
+    FFTWL_LIB
+    NAMES "fftw3l"
+    PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR}
+  )
+
+  find_path(
+    FFTW_INCLUDES
+    NAMES "fftw3.h"
+    PATHS ${PKG_FFTW_INCLUDE_DIRS} ${INCLUDE_INSTALL_DIR}
+  )
+
+endif()
+
+set(FFTW_LIBRARIES ${FFTW_LIB} ${FFTWF_LIB})
+
+if(FFTWL_LIB)
+  set(FFTW_LIBRARIES ${FFTW_LIBRARIES} ${FFTWL_LIB})
+endif()
+
+set( CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_SAV} )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW DEFAULT_MSG
+                                  FFTW_INCLUDES FFTW_LIBRARIES)
+
+mark_as_advanced(FFTW_INCLUDES FFTW_LIBRARIES FFTW_LIB FFTWF_LIB FFTWL_LIB)
+

diff --git a/cmake/FindGLEW.cmake b/cmake/FindGLEW.cmake
new file mode 100644
index 0000000..9d486d5
--- /dev/null
+++ b/cmake/FindGLEW.cmake

@@ -0,0 +1,105 @@
+# Copyright (c) 2009 Boudewijn Rempt <boud@valdyas.org>                                                                                          
+#                                                                                                                                                
+# Redistribution and use is allowed according to the terms of the BSD license.                                                                   
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+# 
+# - try to find glew library and include files
+#  GLEW_INCLUDE_DIR, where to find GL/glew.h, etc.
+#  GLEW_LIBRARIES, the libraries to link against
+#  GLEW_FOUND, If false, do not try to use GLEW.
+# Also defined, but not for general use are:
+#  GLEW_GLEW_LIBRARY = the full path to the glew library.
+
+if (WIN32)
+
+  if(CYGWIN)
+
+    find_path( GLEW_INCLUDE_DIR GL/glew.h)
+
+    find_library( GLEW_GLEW_LIBRARY glew32
+      ${OPENGL_LIBRARY_DIR}
+      /usr/lib/w32api
+      /usr/X11R6/lib
+    )
+
+
+  else(CYGWIN)
+  
+    find_path( GLEW_INCLUDE_DIR GL/glew.h
+      $ENV{GLEW_ROOT_PATH}/include
+    )
+
+    find_library( GLEW_GLEW_LIBRARY
+      NAMES glew glew32
+      PATHS
+      $ENV{GLEW_ROOT_PATH}/lib
+      ${OPENGL_LIBRARY_DIR}
+    )
+
+  endif(CYGWIN)
+
+else (WIN32)
+
+  if (APPLE)
+# These values for Apple could probably do with improvement.
+    find_path( GLEW_INCLUDE_DIR glew.h
+      /System/Library/Frameworks/GLEW.framework/Versions/A/Headers
+      ${OPENGL_LIBRARY_DIR}
+    )
+    set(GLEW_GLEW_LIBRARY "-framework GLEW" CACHE STRING "GLEW library for OSX")
+    set(GLEW_cocoa_LIBRARY "-framework Cocoa" CACHE STRING "Cocoa framework for OSX")
+  else (APPLE)
+
+    find_path( GLEW_INCLUDE_DIR GL/glew.h
+      /usr/include/GL
+      /usr/openwin/share/include
+      /usr/openwin/include
+      /usr/X11R6/include
+      /usr/include/X11
+      /opt/graphics/OpenGL/include
+      /opt/graphics/OpenGL/contrib/libglew
+    )
+
+    find_library( GLEW_GLEW_LIBRARY GLEW
+      /usr/openwin/lib
+      /usr/X11R6/lib
+    )
+
+  endif (APPLE)
+
+endif (WIN32)
+
+set( GLEW_FOUND "NO" )
+if(GLEW_INCLUDE_DIR)
+  if(GLEW_GLEW_LIBRARY)
+    # Is -lXi and -lXmu required on all platforms that have it?
+    # If not, we need some way to figure out what platform we are on.
+    set( GLEW_LIBRARIES
+      ${GLEW_GLEW_LIBRARY}
+      ${GLEW_cocoa_LIBRARY}
+    )
+    set( GLEW_FOUND "YES" )
+
+#The following deprecated settings are for backwards compatibility with CMake1.4
+    set (GLEW_LIBRARY ${GLEW_LIBRARIES})
+    set (GLEW_INCLUDE_PATH ${GLEW_INCLUDE_DIR})
+
+  endif(GLEW_GLEW_LIBRARY)
+endif(GLEW_INCLUDE_DIR)
+
+if(GLEW_FOUND)
+  if(NOT GLEW_FIND_QUIETLY)
+    message(STATUS "Found Glew: ${GLEW_LIBRARIES}")
+  endif(NOT GLEW_FIND_QUIETLY)
+else(GLEW_FOUND)
+  if(GLEW_FIND_REQUIRED)
+    message(FATAL_ERROR "Could not find Glew")
+  endif(GLEW_FIND_REQUIRED)
+endif(GLEW_FOUND)
+
+mark_as_advanced(
+  GLEW_INCLUDE_DIR
+  GLEW_GLEW_LIBRARY
+  GLEW_Xmu_LIBRARY
+  GLEW_Xi_LIBRARY
+)

diff --git a/cmake/FindGMP.cmake b/cmake/FindGMP.cmake
new file mode 100644
index 0000000..c41eedc
--- /dev/null
+++ b/cmake/FindGMP.cmake

@@ -0,0 +1,21 @@
+# Try to find the GNU Multiple Precision Arithmetic Library (GMP)
+# See http://gmplib.org/
+
+if (GMP_INCLUDES AND GMP_LIBRARIES)
+  set(GMP_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(GMP_INCLUDES
+  NAMES
+  gmp.h
+  PATHS
+  $ENV{GMPDIR}
+  ${INCLUDE_INSTALL_DIR}
+)
+
+find_library(GMP_LIBRARIES gmp PATHS $ENV{GMPDIR} ${LIB_INSTALL_DIR})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GMP DEFAULT_MSG
+                                  GMP_INCLUDES GMP_LIBRARIES)
+mark_as_advanced(GMP_INCLUDES GMP_LIBRARIES)

diff --git a/cmake/FindGSL.cmake b/cmake/FindGSL.cmake
new file mode 100644
index 0000000..8632232
--- /dev/null
+++ b/cmake/FindGSL.cmake

@@ -0,0 +1,170 @@
+# Try to find gnu scientific library GSL
+# See 
+# http://www.gnu.org/software/gsl/  and
+# http://gnuwin32.sourceforge.net/packages/gsl.htm
+#
+# Once run this will define: 
+# 
+# GSL_FOUND       = system has GSL lib
+#
+# GSL_LIBRARIES   = full path to the libraries
+#    on Unix/Linux with additional linker flags from "gsl-config --libs"
+# 
+# CMAKE_GSL_CXX_FLAGS  = Unix compiler flags for GSL, essentially "`gsl-config --cxxflags`"
+#
+# GSL_INCLUDE_DIR      = where to find headers 
+#
+# GSL_LINK_DIRECTORIES = link directories, useful for rpath on Unix
+# GSL_EXE_LINKER_FLAGS = rpath on Unix
+#
+# Felix Woelk 07/2004
+# Jan Woetzel
+#
+# www.mip.informatik.uni-kiel.de
+# --------------------------------
+
+if(WIN32)
+  # JW tested with gsl-1.8, Windows XP, MSVS 7.1
+  set(GSL_POSSIBLE_ROOT_DIRS
+    ${GSL_ROOT_DIR}
+    $ENV{GSL_ROOT_DIR}
+    ${GSL_DIR}
+    ${GSL_HOME}    
+    $ENV{GSL_DIR}
+    $ENV{GSL_HOME}
+    $ENV{EXTRA}
+    "C:/Program Files/GnuWin32"
+    )
+  find_path(GSL_INCLUDE_DIR
+    NAMES gsl/gsl_cdf.h gsl/gsl_randist.h
+    PATHS ${GSL_POSSIBLE_ROOT_DIRS}
+    PATH_SUFFIXES include
+    DOC "GSL header include dir"
+    )
+  
+  find_library(GSL_GSL_LIBRARY
+    NAMES libgsl.dll.a gsl libgsl
+    PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
+    PATH_SUFFIXES lib
+    DOC "GSL library" )
+  
+  if(NOT GSL_GSL_LIBRARY)
+	find_file(GSL_GSL_LIBRARY
+		NAMES libgsl.dll.a
+		PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
+		PATH_SUFFIXES lib
+		DOC "GSL library")
+  endif()
+  
+  find_library(GSL_GSLCBLAS_LIBRARY
+    NAMES libgslcblas.dll.a gslcblas libgslcblas
+    PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
+    PATH_SUFFIXES lib
+    DOC "GSL cblas library dir" )
+  
+  if(NOT GSL_GSLCBLAS_LIBRARY)
+	find_file(GSL_GSLCBLAS_LIBRARY
+		NAMES libgslcblas.dll.a
+		PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
+		PATH_SUFFIXES lib
+		DOC "GSL library")
+  endif()
+  
+  set(GSL_LIBRARIES ${GSL_GSL_LIBRARY})
+
+  #message("DBG\n"
+  #  "GSL_GSL_LIBRARY=${GSL_GSL_LIBRARY}\n"
+  #  "GSL_GSLCBLAS_LIBRARY=${GSL_GSLCBLAS_LIBRARY}\n"
+  #  "GSL_LIBRARIES=${GSL_LIBRARIES}")
+
+
+else(WIN32)
+  
+  if(UNIX) 
+    set(GSL_CONFIG_PREFER_PATH 
+      "$ENV{GSL_DIR}/bin"
+      "$ENV{GSL_DIR}"
+      "$ENV{GSL_HOME}/bin" 
+      "$ENV{GSL_HOME}" 
+      CACHE STRING "preferred path to GSL (gsl-config)")
+    find_program(GSL_CONFIG gsl-config
+      ${GSL_CONFIG_PREFER_PATH}
+      /usr/bin/
+      )
+    # message("DBG GSL_CONFIG ${GSL_CONFIG}")
+    
+    if (GSL_CONFIG) 
+      # set CXXFLAGS to be fed into CXX_FLAGS by the user:
+      set(GSL_CXX_FLAGS "`${GSL_CONFIG} --cflags`")
+      
+      # set INCLUDE_DIRS to prefix+include
+      exec_program(${GSL_CONFIG}
+        ARGS --prefix
+        OUTPUT_VARIABLE GSL_PREFIX)
+      set(GSL_INCLUDE_DIR ${GSL_PREFIX}/include CACHE STRING INTERNAL)
+
+      # set link libraries and link flags
+      #set(GSL_LIBRARIES "`${GSL_CONFIG} --libs`")
+      exec_program(${GSL_CONFIG}
+        ARGS --libs
+        OUTPUT_VARIABLE GSL_LIBRARIES )
+        
+      # extract link dirs for rpath  
+      exec_program(${GSL_CONFIG}
+        ARGS --libs
+        OUTPUT_VARIABLE GSL_CONFIG_LIBS )
+      
+      # extract version
+      exec_program(${GSL_CONFIG}
+        ARGS --version
+        OUTPUT_VARIABLE GSL_FULL_VERSION )
+      
+      # split version as major/minor
+      string(REGEX MATCH "(.)\\..*" GSL_VERSION_MAJOR_ "${GSL_FULL_VERSION}")
+      set(GSL_VERSION_MAJOR ${CMAKE_MATCH_1})
+      string(REGEX MATCH ".\\.(.*)" GSL_VERSION_MINOR_ "${GSL_FULL_VERSION}")
+      set(GSL_VERSION_MINOR ${CMAKE_MATCH_1})
+
+      # split off the link dirs (for rpath)
+      # use regular expression to match wildcard equivalent "-L*<endchar>"
+      # with <endchar> is a space or a semicolon
+      string(REGEX MATCHALL "[-][L]([^ ;])+" 
+        GSL_LINK_DIRECTORIES_WITH_PREFIX 
+        "${GSL_CONFIG_LIBS}" )
+      #      message("DBG  GSL_LINK_DIRECTORIES_WITH_PREFIX=${GSL_LINK_DIRECTORIES_WITH_PREFIX}")
+
+      # remove prefix -L because we need the pure directory for LINK_DIRECTORIES
+      
+      if (GSL_LINK_DIRECTORIES_WITH_PREFIX)
+        string(REGEX REPLACE "[-][L]" "" GSL_LINK_DIRECTORIES ${GSL_LINK_DIRECTORIES_WITH_PREFIX} )
+      endif (GSL_LINK_DIRECTORIES_WITH_PREFIX)
+      set(GSL_EXE_LINKER_FLAGS "-Wl,-rpath,${GSL_LINK_DIRECTORIES}" CACHE STRING INTERNAL)
+      #      message("DBG  GSL_LINK_DIRECTORIES=${GSL_LINK_DIRECTORIES}")
+      #      message("DBG  GSL_EXE_LINKER_FLAGS=${GSL_EXE_LINKER_FLAGS}")
+
+      #      add_definitions("-DHAVE_GSL")
+      #      set(GSL_DEFINITIONS "-DHAVE_GSL")
+      mark_as_advanced(
+        GSL_CXX_FLAGS
+        GSL_INCLUDE_DIR
+        GSL_LIBRARIES
+        GSL_LINK_DIRECTORIES
+        GSL_DEFINITIONS
+        )
+      message(STATUS "Using GSL from ${GSL_PREFIX}")
+      
+    else(GSL_CONFIG)
+      message("FindGSL.cmake: gsl-config not found. Please set it manually. GSL_CONFIG=${GSL_CONFIG}")
+    endif(GSL_CONFIG)
+
+  endif(UNIX)
+endif(WIN32)
+
+
+if(GSL_LIBRARIES)
+  if(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS)
+
+    set(GSL_FOUND 1)
+    
+  endif(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS)
+endif(GSL_LIBRARIES)

diff --git a/cmake/FindGoogleHash.cmake b/cmake/FindGoogleHash.cmake
new file mode 100644
index 0000000..481eb4d
--- /dev/null
+++ b/cmake/FindGoogleHash.cmake

@@ -0,0 +1,23 @@
+
+if (GOOGLEHASH_INCLUDES AND GOOGLEHASH_LIBRARIES)
+  set(GOOGLEHASH_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(GOOGLEHASH_INCLUDES
+  NAMES
+  google/dense_hash_map
+  PATHS
+  ${INCLUDE_INSTALL_DIR}
+)
+
+if(GOOGLEHASH_INCLUDES)
+  # let's make sure it compiles with the current compiler
+  file(WRITE ${CMAKE_BINARY_DIR}/googlehash_test.cpp
+  "#include <google/sparse_hash_map>\n#include <google/dense_hash_map>\nint main(int argc, char** argv) { google::dense_hash_map<int,float> a; google::sparse_hash_map<int,float> b; return 0;}\n")
+  try_compile(GOOGLEHASH_COMPILE ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}/googlehash_test.cpp OUTPUT_VARIABLE GOOGLEHASH_COMPILE_RESULT)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GoogleHash DEFAULT_MSG GOOGLEHASH_INCLUDES GOOGLEHASH_COMPILE)
+
+mark_as_advanced(GOOGLEHASH_INCLUDES)

diff --git a/cmake/FindHWLOC.cmake b/cmake/FindHWLOC.cmake
new file mode 100644
index 0000000..522f521
--- /dev/null
+++ b/cmake/FindHWLOC.cmake

@@ -0,0 +1,332 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find HWLOC include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(HWLOC
+#               [REQUIRED]) # Fail with error if hwloc is not found
+#
+# This module finds headers and hwloc library.
+# Results are reported in variables:
+#  HWLOC_FOUND           - True if headers and requested libraries were found
+#  HWLOC_INCLUDE_DIRS    - hwloc include directories
+#  HWLOC_LIBRARY_DIRS    - Link directories for hwloc libraries
+#  HWLOC_LIBRARIES       - hwloc component libraries to be linked
+#
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DHWLOC_DIR=path/to/hwloc):
+#  HWLOC_DIR             - Where to find the base directory of hwloc
+#  HWLOC_INCDIR          - Where to find the header files
+#  HWLOC_LIBDIR          - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: HWLOC_DIR, HWLOC_INCDIR, HWLOC_LIBDIR
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013      Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
+
+include(CheckStructHasMember)
+include(CheckCSourceCompiles)
+
+if (NOT HWLOC_FOUND)
+  set(HWLOC_DIR "" CACHE PATH "Installation directory of HWLOC library")
+  if (NOT HWLOC_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely HWLOC_DIR, has been set to specify the install directory of HWLOC")
+  endif()
+endif()
+
+set(ENV_HWLOC_DIR "$ENV{HWLOC_DIR}")
+set(ENV_HWLOC_INCDIR "$ENV{HWLOC_INCDIR}")
+set(ENV_HWLOC_LIBDIR "$ENV{HWLOC_LIBDIR}")
+set(HWLOC_GIVEN_BY_USER "FALSE")
+if ( HWLOC_DIR OR ( HWLOC_INCDIR AND HWLOC_LIBDIR) OR ENV_HWLOC_DIR OR (ENV_HWLOC_INCDIR AND ENV_HWLOC_LIBDIR) )
+  set(HWLOC_GIVEN_BY_USER "TRUE")
+endif()
+
+# Optionally use pkg-config to detect include/library dirs (if pkg-config is available)
+# -------------------------------------------------------------------------------------
+include(CMakeFindDependencyMacro)
+# include(FindPkgConfig)
+find_dependency(PkgConfig QUIET)
+if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER )
+
+  pkg_search_module(HWLOC hwloc)
+  if (NOT HWLOC_FIND_QUIETLY)
+    if (HWLOC_FOUND AND HWLOC_LIBRARIES)
+      message(STATUS "Looking for HWLOC - found using PkgConfig")
+      #if(NOT HWLOC_INCLUDE_DIRS)
+      #    message("${Magenta}HWLOC_INCLUDE_DIRS is empty using PkgConfig."
+      #        "Perhaps the path to hwloc headers is already present in your"
+      #        "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}")
+      #endif()
+    else()
+      message(STATUS "${Magenta}Looking for HWLOC - not found using PkgConfig."
+	"\n   Perhaps you should add the directory containing hwloc.pc to"
+	"\n   the PKG_CONFIG_PATH environment variable.${ColourReset}")
+    endif()
+  endif()
+
+endif()
+
+if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) )
+
+  if (NOT HWLOC_FIND_QUIETLY)
+    message(STATUS "Looking for HWLOC - PkgConfig not used")
+  endif()
+
+  # Looking for include
+  # -------------------
+
+  # Add system include paths to search include
+  # ------------------------------------------
+  unset(_inc_env)
+  if(ENV_HWLOC_INCDIR)
+    list(APPEND _inc_env "${ENV_HWLOC_INCDIR}")
+  elseif(ENV_HWLOC_DIR)
+    list(APPEND _inc_env "${ENV_HWLOC_DIR}")
+    list(APPEND _inc_env "${ENV_HWLOC_DIR}/include")
+    list(APPEND _inc_env "${ENV_HWLOC_DIR}/include/hwloc")
+  else()
+    if(WIN32)
+      string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+    else()
+      string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+      list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+      list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+      list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+      list(APPEND _inc_env "${_path_env}")
+    endif()
+  endif()
+  list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+  list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+  list(REMOVE_DUPLICATES _inc_env)
+
+  # set paths where to look for
+  set(PATH_TO_LOOK_FOR "${_inc_env}")
+
+  # Try to find the hwloc header in the given paths
+  # -------------------------------------------------
+  # call cmake macro to find the header path
+  if(HWLOC_INCDIR)
+    set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND")
+    find_path(HWLOC_hwloc.h_DIRS
+      NAMES hwloc.h
+      HINTS ${HWLOC_INCDIR})
+  else()
+    if(HWLOC_DIR)
+      set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND")
+      find_path(HWLOC_hwloc.h_DIRS
+	NAMES hwloc.h
+	HINTS ${HWLOC_DIR}
+	PATH_SUFFIXES "include" "include/hwloc")
+    else()
+      set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND")
+      find_path(HWLOC_hwloc.h_DIRS
+	NAMES hwloc.h
+	HINTS ${PATH_TO_LOOK_FOR}
+	PATH_SUFFIXES "hwloc")
+    endif()
+  endif()
+  mark_as_advanced(HWLOC_hwloc.h_DIRS)
+
+  # Add path to cmake variable
+  # ------------------------------------
+  if (HWLOC_hwloc.h_DIRS)
+    set(HWLOC_INCLUDE_DIRS "${HWLOC_hwloc.h_DIRS}")
+  else ()
+    set(HWLOC_INCLUDE_DIRS "HWLOC_INCLUDE_DIRS-NOTFOUND")
+    if(NOT HWLOC_FIND_QUIETLY)
+      message(STATUS "Looking for hwloc -- hwloc.h not found")
+    endif()
+  endif ()
+
+  if (HWLOC_INCLUDE_DIRS)
+    list(REMOVE_DUPLICATES HWLOC_INCLUDE_DIRS)
+  endif ()
+
+
+  # Looking for lib
+  # ---------------
+
+  # Add system library paths to search lib
+  # --------------------------------------
+  unset(_lib_env)
+  if(ENV_HWLOC_LIBDIR)
+    list(APPEND _lib_env "${ENV_HWLOC_LIBDIR}")
+  elseif(ENV_HWLOC_DIR)
+    list(APPEND _lib_env "${ENV_HWLOC_DIR}")
+    list(APPEND _lib_env "${ENV_HWLOC_DIR}/lib")
+  else()
+    if(WIN32)
+      string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
+    else()
+      if(APPLE)
+	string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
+      else()
+	string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
+      endif()
+      list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+      list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+    endif()
+  endif()
+  list(REMOVE_DUPLICATES _lib_env)
+
+  # set paths where to look for
+  set(PATH_TO_LOOK_FOR "${_lib_env}")
+
+  # Try to find the hwloc lib in the given paths
+  # ----------------------------------------------
+
+  # call cmake macro to find the lib path
+  if(HWLOC_LIBDIR)
+    set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND")
+    find_library(HWLOC_hwloc_LIBRARY
+      NAMES hwloc
+      HINTS ${HWLOC_LIBDIR})
+  else()
+    if(HWLOC_DIR)
+      set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND")
+      find_library(HWLOC_hwloc_LIBRARY
+	NAMES hwloc
+	HINTS ${HWLOC_DIR}
+	PATH_SUFFIXES lib lib32 lib64)
+    else()
+      set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND")
+      find_library(HWLOC_hwloc_LIBRARY
+	NAMES hwloc
+	HINTS ${PATH_TO_LOOK_FOR})
+    endif()
+  endif()
+  mark_as_advanced(HWLOC_hwloc_LIBRARY)
+
+  # If found, add path to cmake variable
+  # ------------------------------------
+  if (HWLOC_hwloc_LIBRARY)
+    get_filename_component(hwloc_lib_path ${HWLOC_hwloc_LIBRARY} PATH)
+    # set cmake variables (respects naming convention)
+    set(HWLOC_LIBRARIES    "${HWLOC_hwloc_LIBRARY}")
+    set(HWLOC_LIBRARY_DIRS "${hwloc_lib_path}")
+  else ()
+    set(HWLOC_LIBRARIES    "HWLOC_LIBRARIES-NOTFOUND")
+    set(HWLOC_LIBRARY_DIRS "HWLOC_LIBRARY_DIRS-NOTFOUND")
+    if(NOT HWLOC_FIND_QUIETLY)
+      message(STATUS "Looking for hwloc -- lib hwloc not found")
+    endif()
+  endif ()
+
+  if (HWLOC_LIBRARY_DIRS)
+    list(REMOVE_DUPLICATES HWLOC_LIBRARY_DIRS)
+  endif ()
+
+  # check a function to validate the find
+  if(HWLOC_LIBRARIES)
+
+    set(REQUIRED_INCDIRS)
+    set(REQUIRED_LIBDIRS)
+    set(REQUIRED_LIBS)
+
+    # HWLOC
+    if (HWLOC_INCLUDE_DIRS)
+      set(REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}")
+    endif()
+    if (HWLOC_LIBRARY_DIRS)
+      set(REQUIRED_LIBDIRS "${HWLOC_LIBRARY_DIRS}")
+    endif()
+    set(REQUIRED_LIBS "${HWLOC_LIBRARIES}")
+
+    # set required libraries for link
+    set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
+    set(CMAKE_REQUIRED_LIBRARIES)
+    foreach(lib_dir ${REQUIRED_LIBDIRS})
+      list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+    endforeach()
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+    string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+    # test link
+    unset(HWLOC_WORKS CACHE)
+    include(CheckFunctionExists)
+    check_function_exists(hwloc_topology_init HWLOC_WORKS)
+    mark_as_advanced(HWLOC_WORKS)
+
+    if(NOT HWLOC_WORKS)
+      if(NOT HWLOC_FIND_QUIETLY)
+	message(STATUS "Looking for hwloc : test of hwloc_topology_init with hwloc library fails")
+	message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+	message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+	message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+      endif()
+    endif()
+    set(CMAKE_REQUIRED_INCLUDES)
+    set(CMAKE_REQUIRED_FLAGS)
+    set(CMAKE_REQUIRED_LIBRARIES)
+  endif()
+
+endif()
+
+if (HWLOC_LIBRARIES)
+  if (HWLOC_LIBRARY_DIRS)
+    list(GET HWLOC_LIBRARY_DIRS 0 first_lib_path)
+  else()
+    list(GET HWLOC_LIBRARIES 0 first_lib)
+    get_filename_component(first_lib_path "${first_lib}" PATH)
+  endif()
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(HWLOC_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of HWLOC library" FORCE)
+  else()
+    set(HWLOC_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of HWLOC library" FORCE)
+  endif()
+endif()
+mark_as_advanced(HWLOC_DIR)
+mark_as_advanced(HWLOC_DIR_FOUND)
+
+# check that HWLOC has been found
+# -------------------------------
+include(FindPackageHandleStandardArgs)
+if (PKG_CONFIG_EXECUTABLE AND HWLOC_FOUND)
+  find_package_handle_standard_args(HWLOC DEFAULT_MSG
+    HWLOC_LIBRARIES)
+else()
+  find_package_handle_standard_args(HWLOC DEFAULT_MSG
+    HWLOC_LIBRARIES
+    HWLOC_WORKS)
+endif()
+
+if (HWLOC_FOUND)
+  set(HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
+  list(APPEND CMAKE_REQUIRED_INCLUDES ${HWLOC_INCLUDE_DIRS})
+
+  # test headers to guess the version
+  check_struct_has_member( "struct hwloc_obj" parent hwloc.h HAVE_HWLOC_PARENT_MEMBER )
+  check_struct_has_member( "struct hwloc_cache_attr_s" size hwloc.h HAVE_HWLOC_CACHE_ATTR )
+  check_c_source_compiles( "#include <hwloc.h>
+	    int main(void) { hwloc_obj_t o; o->type = HWLOC_OBJ_PU; return 0;}" HAVE_HWLOC_OBJ_PU)
+  include(CheckLibraryExists)
+  check_library_exists(${HWLOC_LIBRARIES} hwloc_bitmap_free "" HAVE_HWLOC_BITMAP)
+
+  set(CMAKE_REQUIRED_INCLUDES ${HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES})
+endif()

diff --git a/cmake/FindKLU.cmake b/cmake/FindKLU.cmake
new file mode 100644
index 0000000..6217d14
--- /dev/null
+++ b/cmake/FindKLU.cmake

@@ -0,0 +1,48 @@
+# KLU lib usually requires linking to a blas library.
+# It is up to the user of this module to find a BLAS and link to it.
+
+if (KLU_INCLUDES AND KLU_LIBRARIES)
+  set(KLU_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(KLU_INCLUDES
+  NAMES
+  klu.h
+  PATHS
+  $ENV{KLUDIR}
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  suitesparse
+  ufsparse
+)
+
+find_library(KLU_LIBRARIES klu PATHS $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+
+if(KLU_LIBRARIES)
+
+  if(NOT KLU_LIBDIR)
+    get_filename_component(KLU_LIBDIR ${KLU_LIBRARIES} PATH)
+  endif()
+
+  find_library(COLAMD_LIBRARY colamd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+  if(COLAMD_LIBRARY)
+    set(KLU_LIBRARIES ${KLU_LIBRARIES} ${COLAMD_LIBRARY})
+  endif ()
+  
+  find_library(AMD_LIBRARY amd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+  if(AMD_LIBRARY)
+    set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY})
+  endif ()
+
+  find_library(BTF_LIBRARY btf PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+  if(BTF_LIBRARY)
+    set(KLU_LIBRARIES ${KLU_LIBRARIES} ${BTF_LIBRARY})
+  endif()
+
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(KLU DEFAULT_MSG
+                                  KLU_INCLUDES KLU_LIBRARIES)
+
+mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY BTF_LIBRARY)

diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake
new file mode 100644
index 0000000..3fd7388
--- /dev/null
+++ b/cmake/FindLAPACK.cmake

@@ -0,0 +1,274 @@
+# Find LAPACK library
+#
+# This module finds an installed library that implements the LAPACK
+# linear-algebra interface (see http://www.netlib.org/lapack/).
+# The approach follows mostly that taken for the autoconf macro file, acx_lapack.m4
+# (distributed at http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html).
+#
+# This module sets the following variables:
+#  LAPACK_FOUND - set to true if a library implementing the LAPACK interface
+#    is found
+#  LAPACK_INCLUDE_DIR - Directories containing the LAPACK header files
+#  LAPACK_DEFINITIONS - Compilation options to use LAPACK
+#  LAPACK_LINKER_FLAGS - Linker flags to use LAPACK (excluding -l
+#    and -L).
+#  LAPACK_LIBRARIES_DIR - Directories containing the LAPACK libraries.
+#     May be null if LAPACK_LIBRARIES contains libraries name using full path.
+#  LAPACK_LIBRARIES - List of libraries to link against LAPACK interface.
+#     May be null if the compiler supports auto-link (e.g. VC++).
+#  LAPACK_USE_FILE - The name of the cmake module to include to compile
+#     applications or libraries using LAPACK.
+#
+# This module was modified by CGAL team:
+# - find libraries for a C++ compiler, instead of Fortran
+# - added LAPACK_INCLUDE_DIR, LAPACK_DEFINITIONS and LAPACK_LIBRARIES_DIR
+# - removed LAPACK95_LIBRARIES
+
+
+include(CheckFunctionExists)
+include(CMakeFindDependencyMacro)
+
+# This macro checks for the existence of the combination of fortran libraries
+# given by _list.  If the combination is found, this macro checks (using the
+# check_function_exists macro) whether can link against that library
+# combination using the name of a routine given by _name using the linker
+# flags given by _flags.  If the combination of libraries is found and passes
+# the link test, LIBRARIES is set to the list of complete library paths that
+# have been found and DEFINITIONS to the required definitions.
+# Otherwise, LIBRARIES is set to FALSE.
+# N.B. _prefix is the prefix applied to the names of all cached variables that
+# are generated internally and marked advanced by this macro.
+macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _blas _path)
+  #message("DEBUG: check_lapack_libraries(${_list} in ${_path} with ${_blas})")
+
+  # Check for the existence of the libraries given by _list
+  set(_libraries_found TRUE)
+  set(_libraries_work FALSE)
+  set(${DEFINITIONS} "")
+  set(${LIBRARIES} "")
+  set(_combined_name)
+  foreach(_library ${_list})
+    set(_combined_name ${_combined_name}_${_library})
+
+    if(_libraries_found)
+      # search first in ${_path}
+      find_library(${_prefix}_${_library}_LIBRARY
+                  NAMES ${_library}
+                  PATHS ${_path} NO_DEFAULT_PATH
+                  )
+      # if not found, search in environment variables and system
+      if ( WIN32 )
+        find_library(${_prefix}_${_library}_LIBRARY
+                    NAMES ${_library}
+                    PATHS ENV LIB
+                    )
+      elseif ( APPLE )
+        find_library(${_prefix}_${_library}_LIBRARY
+                    NAMES ${_library}
+                    PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV DYLD_LIBRARY_PATH
+                    )
+      else ()
+        find_library(${_prefix}_${_library}_LIBRARY
+                    NAMES ${_library}
+                    PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH
+                    )
+      endif()
+      mark_as_advanced(${_prefix}_${_library}_LIBRARY)
+      set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
+      set(_libraries_found ${${_prefix}_${_library}_LIBRARY})
+    endif()
+  endforeach()
+  if(_libraries_found)
+    set(_libraries_found ${${LIBRARIES}})
+  endif()
+
+  # Test this combination of libraries with the Fortran/f2c interface.
+  # We test the Fortran interface first as it is well standardized.
+  if(_libraries_found AND NOT _libraries_work)
+    set(${DEFINITIONS}  "-D${_prefix}_USE_F2C")
+    set(${LIBRARIES}    ${_libraries_found})
+    # Some C++ linkers require the f2c library to link with Fortran libraries.
+    # I do not know which ones, thus I just add the f2c library if it is available.
+    find_dependency( F2C QUIET )
+    if ( F2C_FOUND )
+      set(${DEFINITIONS}  ${${DEFINITIONS}} ${F2C_DEFINITIONS})
+      set(${LIBRARIES}    ${${LIBRARIES}} ${F2C_LIBRARIES})
+    endif()
+    set(CMAKE_REQUIRED_DEFINITIONS  ${${DEFINITIONS}})
+    set(CMAKE_REQUIRED_LIBRARIES    ${_flags} ${${LIBRARIES}} ${_blas})
+    #message("DEBUG: CMAKE_REQUIRED_DEFINITIONS = ${CMAKE_REQUIRED_DEFINITIONS}")
+    #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}")
+    # Check if function exists with f2c calling convention (ie a trailing underscore)
+    check_function_exists(${_name}_ ${_prefix}_${_name}_${_combined_name}_f2c_WORKS)
+    set(CMAKE_REQUIRED_DEFINITIONS} "")
+    set(CMAKE_REQUIRED_LIBRARIES    "")
+    mark_as_advanced(${_prefix}_${_name}_${_combined_name}_f2c_WORKS)
+    set(_libraries_work ${${_prefix}_${_name}_${_combined_name}_f2c_WORKS})
+  endif()
+
+  # If not found, test this combination of libraries with a C interface.
+  # A few implementations (ie ACML) provide a C interface. Unfortunately, there is no standard.
+  if(_libraries_found AND NOT _libraries_work)
+    set(${DEFINITIONS} "")
+    set(${LIBRARIES}   ${_libraries_found})
+    set(CMAKE_REQUIRED_DEFINITIONS "")
+    set(CMAKE_REQUIRED_LIBRARIES   ${_flags} ${${LIBRARIES}} ${_blas})
+    #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}")
+    check_function_exists(${_name} ${_prefix}_${_name}${_combined_name}_WORKS)
+    set(CMAKE_REQUIRED_LIBRARIES "")
+    mark_as_advanced(${_prefix}_${_name}${_combined_name}_WORKS)
+    set(_libraries_work ${${_prefix}_${_name}${_combined_name}_WORKS})
+  endif()
+
+  # on failure
+  if(NOT _libraries_work)
+    set(${DEFINITIONS} "")
+    set(${LIBRARIES}   FALSE)
+  endif()
+  #message("DEBUG: ${DEFINITIONS} = ${${DEFINITIONS}}")
+  #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}")
+endmacro()
+
+
+#
+# main
+#
+
+# LAPACK requires BLAS
+if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED)
+  find_dependency(BLAS)
+else()
+  find_dependency(BLAS REQUIRED)
+endif()
+
+if (NOT BLAS_FOUND)
+
+  message(STATUS "LAPACK requires BLAS.")
+  set(LAPACK_FOUND FALSE)
+
+# Is it already configured?
+elseif (LAPACK_LIBRARIES_DIR OR LAPACK_LIBRARIES)
+
+  set(LAPACK_FOUND TRUE)
+
+else()
+
+  # reset variables
+  set( LAPACK_INCLUDE_DIR "" )
+  set( LAPACK_DEFINITIONS "" )
+  set( LAPACK_LINKER_FLAGS "" ) # unused (yet)
+  set( LAPACK_LIBRARIES "" )
+  set( LAPACK_LIBRARIES_DIR "" )
+
+    #
+    # If Unix, search for LAPACK function in possible libraries
+    #
+
+    #intel mkl lapack?
+    if(NOT LAPACK_LIBRARIES)
+      check_lapack_libraries(
+      LAPACK_DEFINITIONS
+      LAPACK_LIBRARIES
+      LAPACK
+      cheev
+      ""
+      "mkl_lapack"
+      "${BLAS_LIBRARIES}"
+      "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR"
+      )
+    endif()
+
+    #acml lapack?
+    if(NOT LAPACK_LIBRARIES)
+      check_lapack_libraries(
+      LAPACK_DEFINITIONS
+      LAPACK_LIBRARIES
+      LAPACK
+      cheev
+      ""
+      "acml"
+      "${BLAS_LIBRARIES}"
+      "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR"
+      )
+    endif()
+
+    # Apple LAPACK library?
+    if(NOT LAPACK_LIBRARIES)
+      check_lapack_libraries(
+      LAPACK_DEFINITIONS
+      LAPACK_LIBRARIES
+      LAPACK
+      cheev
+      ""
+      "Accelerate"
+      "${BLAS_LIBRARIES}"
+      "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR"
+      )
+    endif()
+
+    if ( NOT LAPACK_LIBRARIES )
+      check_lapack_libraries(
+      LAPACK_DEFINITIONS
+      LAPACK_LIBRARIES
+      LAPACK
+      cheev
+      ""
+      "vecLib"
+      "${BLAS_LIBRARIES}"
+      "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR"
+      )
+    endif ()
+
+    # Generic LAPACK library?
+    # This configuration *must* be the last try as this library is notably slow.
+    if ( NOT LAPACK_LIBRARIES )
+      check_lapack_libraries(
+      LAPACK_DEFINITIONS
+      LAPACK_LIBRARIES
+      LAPACK
+      cheev
+      ""
+      "lapack"
+      "${BLAS_LIBRARIES}"
+      "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR"
+      )
+    endif()
+
+  if(LAPACK_LIBRARIES_DIR OR LAPACK_LIBRARIES)
+    set(LAPACK_FOUND TRUE)
+  else()
+    set(LAPACK_FOUND FALSE)
+  endif()
+
+  if(NOT LAPACK_FIND_QUIETLY)
+    if(LAPACK_FOUND)
+      message(STATUS "A library with LAPACK API found.")
+    else()
+      if(LAPACK_FIND_REQUIRED)
+        message(FATAL_ERROR "A required library with LAPACK API not found. Please specify library location.")
+      else()
+        message(STATUS "A library with LAPACK API not found. Please specify library location.")
+      endif()
+    endif()
+  endif()
+
+  # Add variables to cache
+  set( LAPACK_INCLUDE_DIR   "${LAPACK_INCLUDE_DIR}"
+                            CACHE PATH "Directories containing the LAPACK header files" FORCE )
+  set( LAPACK_DEFINITIONS   "${LAPACK_DEFINITIONS}"
+                            CACHE STRING "Compilation options to use LAPACK" FORCE )
+  set( LAPACK_LINKER_FLAGS  "${LAPACK_LINKER_FLAGS}"
+                            CACHE STRING "Linker flags to use LAPACK" FORCE )
+  set( LAPACK_LIBRARIES     "${LAPACK_LIBRARIES}"
+                            CACHE FILEPATH "LAPACK libraries name" FORCE )
+  set( LAPACK_LIBRARIES_DIR "${LAPACK_LIBRARIES_DIR}"
+                            CACHE PATH "Directories containing the LAPACK libraries" FORCE )
+
+  #message("DEBUG: LAPACK_INCLUDE_DIR = ${LAPACK_INCLUDE_DIR}")
+  #message("DEBUG: LAPACK_DEFINITIONS = ${LAPACK_DEFINITIONS}")
+  #message("DEBUG: LAPACK_LINKER_FLAGS = ${LAPACK_LINKER_FLAGS}")
+  #message("DEBUG: LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}")
+  #message("DEBUG: LAPACK_LIBRARIES_DIR = ${LAPACK_LIBRARIES_DIR}")
+  #message("DEBUG: LAPACK_FOUND = ${LAPACK_FOUND}")
+
+endif()

diff --git a/cmake/FindMPFR.cmake b/cmake/FindMPFR.cmake
new file mode 100644
index 0000000..d8da9d6
--- /dev/null
+++ b/cmake/FindMPFR.cmake

@@ -0,0 +1,83 @@
+# Try to find the MPFR library
+# See http://www.mpfr.org/
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(MPFR 2.3.0)
+# to require version 2.3.0 to newer of MPFR.
+#
+# Once done this will define
+#
+#  MPFR_FOUND - system has MPFR lib with correct version
+#  MPFR_INCLUDES - the MPFR include directory
+#  MPFR_LIBRARIES - the MPFR library
+#  MPFR_VERSION - MPFR version
+
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
+# Copyright (c) 2010 Jitse Niesen, <jitse@maths.leeds.ac.uk>
+# Redistribution and use is allowed according to the terms of the BSD license.
+
+# Set MPFR_INCLUDES
+
+find_path(MPFR_INCLUDES
+  NAMES
+  mpfr.h
+  PATHS
+  $ENV{GMPDIR}
+  ${INCLUDE_INSTALL_DIR}
+)
+
+# Set MPFR_FIND_VERSION to 1.0.0 if no minimum version is specified
+
+if(NOT MPFR_FIND_VERSION)
+  if(NOT MPFR_FIND_VERSION_MAJOR)
+    set(MPFR_FIND_VERSION_MAJOR 1)
+  endif()
+  if(NOT MPFR_FIND_VERSION_MINOR)
+    set(MPFR_FIND_VERSION_MINOR 0)
+  endif()
+  if(NOT MPFR_FIND_VERSION_PATCH)
+    set(MPFR_FIND_VERSION_PATCH 0)
+  endif()
+
+  set(MPFR_FIND_VERSION "${MPFR_FIND_VERSION_MAJOR}.${MPFR_FIND_VERSION_MINOR}.${MPFR_FIND_VERSION_PATCH}")
+endif()
+
+
+if(MPFR_INCLUDES)
+
+  # Set MPFR_VERSION
+  
+  file(READ "${MPFR_INCLUDES}/mpfr.h" _mpfr_version_header)
+  
+  string(REGEX MATCH "define[ \t]+MPFR_VERSION_MAJOR[ \t]+([0-9]+)" _mpfr_major_version_match "${_mpfr_version_header}")
+  set(MPFR_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+MPFR_VERSION_MINOR[ \t]+([0-9]+)" _mpfr_minor_version_match "${_mpfr_version_header}")
+  set(MPFR_MINOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+MPFR_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpfr_patchlevel_version_match "${_mpfr_version_header}")
+  set(MPFR_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}")
+  
+  set(MPFR_VERSION ${MPFR_MAJOR_VERSION}.${MPFR_MINOR_VERSION}.${MPFR_PATCHLEVEL_VERSION})
+  
+  # Check whether found version exceeds minimum version
+  
+  if(${MPFR_VERSION} VERSION_LESS ${MPFR_FIND_VERSION})
+    set(MPFR_VERSION_OK FALSE)
+    message(STATUS "MPFR version ${MPFR_VERSION} found in ${MPFR_INCLUDES}, "
+                   "but at least version ${MPFR_FIND_VERSION} is required")
+  else()
+    set(MPFR_VERSION_OK TRUE)
+  endif()
+
+endif()
+
+# Set MPFR_LIBRARIES
+
+find_library(MPFR_LIBRARIES mpfr PATHS $ENV{GMPDIR} ${LIB_INSTALL_DIR})
+
+# Epilogue
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MPFR DEFAULT_MSG
+                                  MPFR_INCLUDES MPFR_LIBRARIES MPFR_VERSION_OK)
+mark_as_advanced(MPFR_INCLUDES MPFR_LIBRARIES)

diff --git a/cmake/FindMPREAL.cmake b/cmake/FindMPREAL.cmake
new file mode 100644
index 0000000..947a1ce
--- /dev/null
+++ b/cmake/FindMPREAL.cmake

@@ -0,0 +1,103 @@
+# Try to find the MPFR C++ (MPREAL) library
+# See http://www.holoborodko.com/pavel/mpreal/
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(MPREAL 1.8.6)
+# to require version 1.8.6 or newer of MPREAL C++.
+#
+# Once done this will define
+#
+#  MPREAL_FOUND - system has MPREAL lib with correct version
+#  MPREAL_INCLUDES - MPREAL required include directories
+#  MPREAL_LIBRARIES - MPREAL required libraries
+#  MPREAL_VERSION - MPREAL version
+
+# Copyright (c) 2020 The Eigen Authors.
+# Redistribution and use is allowed according to the terms of the BSD license.
+
+include(CMakeFindDependencyMacro)
+find_dependency(MPFR)
+find_dependency(GMP)
+
+# Set MPREAL_INCLUDES
+find_path(MPREAL_INCLUDES
+  NAMES
+  mpreal.h
+  PATHS
+  $ENV{GMPDIR}
+  ${INCLUDE_INSTALL_DIR}
+)
+
+# Set MPREAL_FIND_VERSION to 1.0.0 if no minimum version is specified
+
+if(NOT MPREAL_FIND_VERSION)
+  if(NOT MPREAL_FIND_VERSION_MAJOR)
+    set(MPREAL_FIND_VERSION_MAJOR 1)
+  endif()
+  if(NOT MPREAL_FIND_VERSION_MINOR)
+    set(MPREAL_FIND_VERSION_MINOR 0)
+  endif()
+  if(NOT MPREAL_FIND_VERSION_PATCH)
+    set(MPREAL_FIND_VERSION_PATCH 0)
+  endif()
+
+  set(MPREAL_FIND_VERSION "${MPREAL_FIND_VERSION_MAJOR}.${MPREAL_FIND_VERSION_MINOR}.${MPREAL_FIND_VERSION_PATCH}")
+endif()
+
+# Check bugs
+# - https://github.com/advanpix/mpreal/issues/7
+# - https://github.com/advanpix/mpreal/issues/9
+set(MPREAL_TEST_PROGRAM "
+#include <mpreal.h>
+#include <algorithm>
+int main(int argc, char** argv) {
+  const mpfr::mpreal one  =    1.0;
+  const mpfr::mpreal zero =    0.0;
+  using namespace std;
+  const mpfr::mpreal smaller = min(one, zero);
+  return 0;
+}")
+
+if(MPREAL_INCLUDES)
+
+  # Set MPREAL_VERSION
+  
+  file(READ "${MPREAL_INCLUDES}/mpreal.h" _mpreal_version_header)
+  
+  string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MAJOR[ \t]+([0-9]+)" _mpreal_major_version_match "${_mpreal_version_header}")
+  set(MPREAL_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MINOR[ \t]+([0-9]+)" _mpreal_minor_version_match "${_mpreal_version_header}")
+  set(MPREAL_MINOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+MPREAL_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpreal_patchlevel_version_match "${_mpreal_version_header}")
+  set(MPREAL_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}")
+  
+  set(MPREAL_VERSION ${MPREAL_MAJOR_VERSION}.${MPREAL_MINOR_VERSION}.${MPREAL_PATCHLEVEL_VERSION})
+  
+  # Check whether found version exceeds minimum version
+  
+  if(${MPREAL_VERSION} VERSION_LESS ${MPREAL_FIND_VERSION})
+    set(MPREAL_VERSION_OK FALSE)
+    message(STATUS "MPREAL version ${MPREAL_VERSION} found in ${MPREAL_INCLUDES}, "
+                   "but at least version ${MPREAL_FIND_VERSION} is required")
+  else()
+    set(MPREAL_VERSION_OK TRUE)
+    
+    list(APPEND MPREAL_INCLUDES "${MPFR_INCLUDES}" "${GMP_INCLUDES}")
+    list(REMOVE_DUPLICATES MPREAL_INCLUDES)
+    
+    list(APPEND MPREAL_LIBRARIES "${MPFR_LIBRARIES}" "${GMP_LIBRARIES}")
+    list(REMOVE_DUPLICATES MPREAL_LIBRARIES)
+    
+    # Make sure it compiles with the current compiler.
+    unset(MPREAL_WORKS CACHE)
+    include(CheckCXXSourceCompiles)
+    set(CMAKE_REQUIRED_INCLUDES "${MPREAL_INCLUDES}")
+    set(CMAKE_REQUIRED_LIBRARIES "${MPREAL_LIBRARIES}")
+    check_cxx_source_compiles("${MPREAL_TEST_PROGRAM}" MPREAL_WORKS)
+  endif()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MPREAL DEFAULT_MSG
+                                  MPREAL_INCLUDES MPREAL_VERSION_OK MPREAL_WORKS)
+mark_as_advanced(MPREAL_INCLUDES)

diff --git a/cmake/FindMetis.cmake b/cmake/FindMetis.cmake
new file mode 100644
index 0000000..747f882
--- /dev/null
+++ b/cmake/FindMetis.cmake

@@ -0,0 +1,265 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find METIS include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(METIS
+#               [REQUIRED]             # Fail with error if metis is not found
+#              )
+#
+# This module finds headers and metis library.
+# Results are reported in variables:
+#  METIS_FOUND           - True if headers and requested libraries were found
+#  METIS_INCLUDE_DIRS    - metis include directories
+#  METIS_LIBRARY_DIRS    - Link directories for metis libraries
+#  METIS_LIBRARIES       - metis component libraries to be linked
+#
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DMETIS_DIR=path/to/metis):
+#  METIS_DIR             - Where to find the base directory of metis
+#  METIS_INCDIR          - Where to find the header files
+#  METIS_LIBDIR          - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: METIS_DIR, METIS_INCDIR, METIS_LIBDIR
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013      Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
+
+if (NOT METIS_FOUND)
+  set(METIS_DIR "" CACHE PATH "Installation directory of METIS library")
+  if (NOT METIS_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely METIS_DIR, has been set to specify the install directory of METIS")
+  endif()
+endif()
+
+# Looking for include
+# -------------------
+
+# Add system include paths to search include
+# ------------------------------------------
+unset(_inc_env)
+set(ENV_METIS_DIR "$ENV{METIS_DIR}")
+set(ENV_METIS_INCDIR "$ENV{METIS_INCDIR}")
+if(ENV_METIS_INCDIR)
+  list(APPEND _inc_env "${ENV_METIS_INCDIR}")
+elseif(ENV_METIS_DIR)
+  list(APPEND _inc_env "${ENV_METIS_DIR}")
+  list(APPEND _inc_env "${ENV_METIS_DIR}/include")
+  list(APPEND _inc_env "${ENV_METIS_DIR}/include/metis")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+  else()
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+  endif()
+endif()
+list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(REMOVE_DUPLICATES _inc_env)
+
+
+# Try to find the metis header in the given paths
+# -------------------------------------------------
+# call cmake macro to find the header path
+if(METIS_INCDIR)
+  set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
+  find_path(METIS_metis.h_DIRS
+    NAMES metis.h
+    HINTS ${METIS_INCDIR})
+else()
+  if(METIS_DIR)
+    set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
+    find_path(METIS_metis.h_DIRS
+      NAMES metis.h
+      HINTS ${METIS_DIR}
+      PATH_SUFFIXES "include" "include/metis")
+  else()
+    set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
+    find_path(METIS_metis.h_DIRS
+      NAMES metis.h
+      HINTS ${_inc_env})
+  endif()
+endif()
+mark_as_advanced(METIS_metis.h_DIRS)
+
+
+# If found, add path to cmake variable
+# ------------------------------------
+if (METIS_metis.h_DIRS)
+  set(METIS_INCLUDE_DIRS "${METIS_metis.h_DIRS}")
+else ()
+  set(METIS_INCLUDE_DIRS "METIS_INCLUDE_DIRS-NOTFOUND")
+  if(NOT METIS_FIND_QUIETLY)
+    message(STATUS "Looking for metis -- metis.h not found")
+  endif()
+endif()
+
+
+# Looking for lib
+# ---------------
+
+# Add system library paths to search lib
+# --------------------------------------
+unset(_lib_env)
+set(ENV_METIS_LIBDIR "$ENV{METIS_LIBDIR}")
+if(ENV_METIS_LIBDIR)
+  list(APPEND _lib_env "${ENV_METIS_LIBDIR}")
+elseif(ENV_METIS_DIR)
+  list(APPEND _lib_env "${ENV_METIS_DIR}")
+  list(APPEND _lib_env "${ENV_METIS_DIR}/lib")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
+  else()
+    if(APPLE)
+      string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
+    else()
+      string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
+    endif()
+    list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+    list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+  endif()
+endif()
+list(REMOVE_DUPLICATES _lib_env)
+
+# Try to find the metis lib in the given paths
+# ----------------------------------------------
+# call cmake macro to find the lib path
+if(METIS_LIBDIR)
+  set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
+  find_library(METIS_metis_LIBRARY
+    NAMES metis
+    HINTS ${METIS_LIBDIR})
+else()
+  if(METIS_DIR)
+    set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
+    find_library(METIS_metis_LIBRARY
+      NAMES metis
+      HINTS ${METIS_DIR}
+      PATH_SUFFIXES lib lib32 lib64)
+  else()
+    set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
+    find_library(METIS_metis_LIBRARY
+      NAMES metis
+      HINTS ${_lib_env})
+  endif()
+endif()
+mark_as_advanced(METIS_metis_LIBRARY)
+
+
+# If found, add path to cmake variable
+# ------------------------------------
+if (METIS_metis_LIBRARY)
+  get_filename_component(metis_lib_path "${METIS_metis_LIBRARY}" PATH)
+  # set cmake variables
+  set(METIS_LIBRARIES    "${METIS_metis_LIBRARY}")
+  set(METIS_LIBRARY_DIRS "${metis_lib_path}")
+else ()
+  set(METIS_LIBRARIES    "METIS_LIBRARIES-NOTFOUND")
+  set(METIS_LIBRARY_DIRS "METIS_LIBRARY_DIRS-NOTFOUND")
+  if(NOT METIS_FIND_QUIETLY)
+    message(STATUS "Looking for metis -- lib metis not found")
+  endif()
+endif ()
+
+# check a function to validate the find
+if(METIS_LIBRARIES)
+
+  set(REQUIRED_INCDIRS)
+  set(REQUIRED_LIBDIRS)
+  set(REQUIRED_LIBS)
+
+  # METIS
+  if (METIS_INCLUDE_DIRS)
+    set(REQUIRED_INCDIRS  "${METIS_INCLUDE_DIRS}")
+  endif()
+  if (METIS_LIBRARY_DIRS)
+    set(REQUIRED_LIBDIRS "${METIS_LIBRARY_DIRS}")
+  endif()
+  set(REQUIRED_LIBS "${METIS_LIBRARIES}")
+  # m
+  find_library(M_LIBRARY NAMES m)
+  mark_as_advanced(M_LIBRARY)
+  if(M_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lm")
+  endif()
+
+  # set required libraries for link
+  set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
+  set(CMAKE_REQUIRED_LIBRARIES)
+  foreach(lib_dir ${REQUIRED_LIBDIRS})
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+  endforeach()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+  string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+  # test link
+  unset(METIS_WORKS CACHE)
+  include(CheckFunctionExists)
+  check_function_exists(METIS_NodeND METIS_WORKS)
+  mark_as_advanced(METIS_WORKS)
+
+  if(NOT METIS_WORKS)
+    if(NOT METIS_FIND_QUIETLY)
+      message(STATUS "Looking for METIS : test of METIS_NodeND with METIS library fails")
+      message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+      message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+      message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+    endif()
+  endif()
+  set(CMAKE_REQUIRED_INCLUDES)
+  set(CMAKE_REQUIRED_FLAGS)
+  set(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+if (METIS_LIBRARIES)
+  list(GET METIS_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(METIS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of METIS library" FORCE)
+  else()
+    set(METIS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of METIS library" FORCE)
+  endif()
+endif()
+mark_as_advanced(METIS_DIR)
+mark_as_advanced(METIS_DIR_FOUND)
+
+# check that METIS has been found
+# ---------------------------------
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(METIS DEFAULT_MSG
+  METIS_LIBRARIES
+  METIS_WORKS
+  METIS_INCLUDE_DIRS)
+#
+# TODO: Add possibility to check for specific functions in the library
+#

diff --git a/cmake/FindPASTIX.cmake b/cmake/FindPASTIX.cmake
new file mode 100644
index 0000000..db1427b
--- /dev/null
+++ b/cmake/FindPASTIX.cmake

@@ -0,0 +1,704 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find PASTIX include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(PASTIX
+#               [REQUIRED] # Fail with error if pastix is not found
+#               [COMPONENTS <comp1> <comp2> ...] # dependencies
+#              )
+#
+#  PASTIX depends on the following libraries:
+#   - Threads, m, rt
+#   - MPI
+#   - HWLOC
+#   - BLAS
+#
+#  COMPONENTS are optional libraries PASTIX could be linked with,
+#  Use it to drive detection of a specific compilation chain
+#  COMPONENTS can be some of the following:
+#   - MPI: to activate detection of the parallel MPI version (default)
+#        it looks for Threads, HWLOC, BLAS, MPI and ScaLAPACK libraries
+#   - SEQ: to activate detection of the sequential version (exclude MPI version)
+#   - STARPU: to activate detection of StarPU version
+#   it looks for MPI version of StarPU (default behaviour)
+#   if SEQ and STARPU are given, it looks for a StarPU without MPI
+#   - STARPU_CUDA: to activate detection of StarPU with CUDA
+#   - STARPU_FXT: to activate detection of StarPU with FxT
+#   - SCOTCH: to activate detection of PASTIX linked with SCOTCH
+#   - PTSCOTCH: to activate detection of PASTIX linked with SCOTCH
+#   - METIS: to activate detection of PASTIX linked with SCOTCH
+#
+# This module finds headers and pastix library.
+# Results are reported in variables:
+#  PASTIX_FOUND            - True if headers and requested libraries were found
+#  PASTIX_LINKER_FLAGS     - list of required linker flags (excluding -l and -L)
+#  PASTIX_INCLUDE_DIRS     - pastix include directories
+#  PASTIX_LIBRARY_DIRS     - Link directories for pastix libraries
+#  PASTIX_LIBRARIES        - pastix libraries
+#  PASTIX_INCLUDE_DIRS_DEP - pastix + dependencies include directories
+#  PASTIX_LIBRARY_DIRS_DEP - pastix + dependencies link directories
+#  PASTIX_LIBRARIES_DEP    - pastix libraries + dependencies
+#
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DPASTIX_DIR=path/to/pastix):
+#  PASTIX_DIR              - Where to find the base directory of pastix
+#  PASTIX_INCDIR           - Where to find the header files
+#  PASTIX_LIBDIR           - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: PASTIX_DIR, PASTIX_INCDIR, PASTIX_LIBDIR
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013      Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
+
+
+if (NOT PASTIX_FOUND)
+  set(PASTIX_DIR "" CACHE PATH "Installation directory of PASTIX library")
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely PASTIX_DIR, has been set to specify the install directory of PASTIX")
+  endif()
+endif()
+
+# Set the version to find
+set(PASTIX_LOOK_FOR_MPI ON)
+set(PASTIX_LOOK_FOR_SEQ OFF)
+set(PASTIX_LOOK_FOR_STARPU OFF)
+set(PASTIX_LOOK_FOR_STARPU_CUDA OFF)
+set(PASTIX_LOOK_FOR_STARPU_FXT OFF)
+set(PASTIX_LOOK_FOR_SCOTCH ON)
+set(PASTIX_LOOK_FOR_PTSCOTCH OFF)
+set(PASTIX_LOOK_FOR_METIS OFF)
+
+if( PASTIX_FIND_COMPONENTS )
+  foreach( component ${PASTIX_FIND_COMPONENTS} )
+    if (${component} STREQUAL "SEQ")
+      # means we look for the sequential version of PaStiX (without MPI)
+      set(PASTIX_LOOK_FOR_SEQ ON)
+      set(PASTIX_LOOK_FOR_MPI OFF)
+    endif()
+    if (${component} STREQUAL "MPI")
+      # means we look for the MPI version of PaStiX (default)
+      set(PASTIX_LOOK_FOR_SEQ OFF)
+      set(PASTIX_LOOK_FOR_MPI ON)
+    endif()
+    if (${component} STREQUAL "STARPU")
+      # means we look for PaStiX with StarPU
+      set(PASTIX_LOOK_FOR_STARPU ON)
+    endif()
+    if (${component} STREQUAL "STARPU_CUDA")
+      # means we look for PaStiX with StarPU + CUDA
+      set(PASTIX_LOOK_FOR_STARPU ON)
+      set(PASTIX_LOOK_FOR_STARPU_CUDA ON)
+    endif()
+    if (${component} STREQUAL "STARPU_FXT")
+      # means we look for PaStiX with StarPU + FxT
+      set(PASTIX_LOOK_FOR_STARPU_FXT ON)
+    endif()
+    if (${component} STREQUAL "SCOTCH")
+      set(PASTIX_LOOK_FOR_SCOTCH ON)
+    endif()
+    if (${component} STREQUAL "PTSCOTCH")
+      set(PASTIX_LOOK_FOR_PTSCOTCH ON)
+    endif()
+    if (${component} STREQUAL "METIS")
+      set(PASTIX_LOOK_FOR_METIS ON)
+    endif()
+  endforeach()
+endif()
+
+# Dependencies detection
+# ----------------------
+
+
+# Required dependencies
+# ---------------------
+include(CMakeFindDependencyMacro)
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect pthread")
+endif()
+if (PASTIX_FIND_REQUIRED)
+  find_dependency(Threads REQUIRED QUIET)
+else()
+  find_dependency(Threads QUIET)
+endif()
+set(PASTIX_EXTRA_LIBRARIES "")
+if( THREADS_FOUND )
+  list(APPEND PASTIX_EXTRA_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+endif ()
+
+# Add math library to the list of extra
+# it normally exists on all common systems provided with a C compiler
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect libm")
+endif()
+set(PASTIX_M_LIBRARIES "")
+if(UNIX OR WIN32)
+  find_library(
+    PASTIX_M_m_LIBRARY
+    NAMES m
+    )
+  mark_as_advanced(PASTIX_M_m_LIBRARY)
+  if (PASTIX_M_m_LIBRARY)
+    list(APPEND PASTIX_M_LIBRARIES "${PASTIX_M_m_LIBRARY}")
+    list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_M_m_LIBRARY}")
+  else()
+    if (PASTIX_FIND_REQUIRED)
+      message(FATAL_ERROR "Could NOT find libm on your system."
+	"Are you sure to a have a C compiler installed?")
+    endif()
+  endif()
+endif()
+
+# Try to find librt (libposix4 - POSIX.1b Realtime Extensions library)
+# on Unix systems except Apple ones because it does not exist on it
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect librt")
+endif()
+set(PASTIX_RT_LIBRARIES "")
+if(UNIX AND NOT APPLE)
+  find_library(
+    PASTIX_RT_rt_LIBRARY
+    NAMES rt
+    )
+  mark_as_advanced(PASTIX_RT_rt_LIBRARY)
+  if (PASTIX_RT_rt_LIBRARY)
+    list(APPEND PASTIX_RT_LIBRARIES "${PASTIX_RT_rt_LIBRARY}")
+    list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_RT_rt_LIBRARY}")
+  else()
+    if (PASTIX_FIND_REQUIRED)
+      message(FATAL_ERROR "Could NOT find librt on your system")
+    endif()
+  endif()
+endif()
+
+# PASTIX depends on HWLOC
+#------------------------
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect HWLOC")
+endif()
+if (PASTIX_FIND_REQUIRED)
+  find_dependency(HWLOC REQUIRED QUIET)
+else()
+  find_dependency(HWLOC QUIET)
+endif()
+
+# PASTIX depends on BLAS
+#-----------------------
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect BLAS")
+endif()
+if (PASTIX_FIND_REQUIRED)
+  find_dependency(BLASEXT REQUIRED QUIET)
+else()
+  find_dependency(BLASEXT QUIET)
+endif()
+
+# Optional dependencies
+# ---------------------
+
+# PASTIX may depend on MPI
+#-------------------------
+if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect MPI")
+  endif()
+  # allows to use an external mpi compilation by setting compilers with
+  # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90
+  # at cmake configure
+  if(NOT MPI_C_COMPILER)
+    set(MPI_C_COMPILER mpicc)
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI)
+    find_dependency(MPI REQUIRED QUIET)
+  else()
+    find_dependency(MPI QUIET)
+  endif()
+  if (MPI_FOUND)
+    mark_as_advanced(MPI_LIBRARY)
+    mark_as_advanced(MPI_EXTRA_LIBRARY)
+  endif()
+endif ()
+
+# PASTIX may depend on STARPU
+#----------------------------
+if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU)
+
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect StarPU")
+  endif()
+
+  set(PASTIX_STARPU_VERSION "1.1" CACHE STRING "oldest STARPU version desired")
+
+  # create list of components in order to make a single call to find_package(starpu...)
+  # we explicitly need a StarPU version built with hwloc
+  set(STARPU_COMPONENT_LIST "HWLOC")
+
+  # StarPU may depend on MPI
+  # allows to use an external mpi compilation by setting compilers with
+  # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90
+  # at cmake configure
+  if (PASTIX_LOOK_FOR_MPI)
+    if(NOT MPI_C_COMPILER)
+      set(MPI_C_COMPILER mpicc)
+    endif()
+    list(APPEND STARPU_COMPONENT_LIST "MPI")
+  endif()
+  if (PASTIX_LOOK_FOR_STARPU_CUDA)
+    list(APPEND STARPU_COMPONENT_LIST "CUDA")
+  endif()
+  if (PASTIX_LOOK_FOR_STARPU_FXT)
+    list(APPEND STARPU_COMPONENT_LIST "FXT")
+  endif()
+  # set the list of optional dependencies we may discover
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU)
+    find_dependency(STARPU ${PASTIX_STARPU_VERSION} REQUIRED
+      COMPONENTS ${STARPU_COMPONENT_LIST})
+  else()
+    find_dependency(STARPU ${PASTIX_STARPU_VERSION}
+      COMPONENTS ${STARPU_COMPONENT_LIST})
+  endif()
+
+endif()
+
+# PASTIX may depends on SCOTCH
+#-----------------------------
+if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect SCOTCH")
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH)
+    find_dependency(SCOTCH REQUIRED QUIET)
+  else()
+    find_dependency(SCOTCH QUIET)
+  endif()
+endif()
+
+# PASTIX may depends on PTSCOTCH
+#-------------------------------
+if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH")
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH)
+    find_dependency(PTSCOTCH REQUIRED QUIET)
+  else()
+    find_dependency(PTSCOTCH QUIET)
+  endif()
+endif()
+
+# PASTIX may depends on METIS
+#----------------------------
+if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect METIS")
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS)
+    find_dependency(METIS REQUIRED QUIET)
+  else()
+    find_dependency(METIS QUIET)
+  endif()
+endif()
+
+# Error if pastix required and no partitioning lib found
+if (PASTIX_FIND_REQUIRED AND NOT SCOTCH_FOUND AND NOT PTSCOTCH_FOUND AND NOT METIS_FOUND)
+  message(FATAL_ERROR "Could NOT find any partitioning library on your system"
+    " (install scotch, ptscotch or metis)")
+endif()
+
+
+# Looking for PaStiX
+# ------------------
+
+# Looking for include
+# -------------------
+
+# Add system include paths to search include
+# ------------------------------------------
+unset(_inc_env)
+set(ENV_PASTIX_DIR "$ENV{PASTIX_DIR}")
+set(ENV_PASTIX_INCDIR "$ENV{PASTIX_INCDIR}")
+if(ENV_PASTIX_INCDIR)
+  list(APPEND _inc_env "${ENV_PASTIX_INCDIR}")
+elseif(ENV_PASTIX_DIR)
+  list(APPEND _inc_env "${ENV_PASTIX_DIR}")
+  list(APPEND _inc_env "${ENV_PASTIX_DIR}/include")
+  list(APPEND _inc_env "${ENV_PASTIX_DIR}/include/pastix")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+  else()
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+  endif()
+endif()
+list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(REMOVE_DUPLICATES _inc_env)
+
+
+# Try to find the pastix header in the given paths
+# ---------------------------------------------------
+# call cmake macro to find the header path
+if(PASTIX_INCDIR)
+  set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND")
+  find_path(PASTIX_pastix.h_DIRS
+    NAMES pastix.h
+    HINTS ${PASTIX_INCDIR})
+else()
+  if(PASTIX_DIR)
+    set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND")
+    find_path(PASTIX_pastix.h_DIRS
+      NAMES pastix.h
+      HINTS ${PASTIX_DIR}
+      PATH_SUFFIXES "include" "include/pastix")
+  else()
+    set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND")
+    find_path(PASTIX_pastix.h_DIRS
+      NAMES pastix.h
+      HINTS ${_inc_env}
+      PATH_SUFFIXES "pastix")
+  endif()
+endif()
+mark_as_advanced(PASTIX_pastix.h_DIRS)
+
+# If found, add path to cmake variable
+# ------------------------------------
+if (PASTIX_pastix.h_DIRS)
+  set(PASTIX_INCLUDE_DIRS "${PASTIX_pastix.h_DIRS}")
+else ()
+  set(PASTIX_INCLUDE_DIRS "PASTIX_INCLUDE_DIRS-NOTFOUND")
+  if(NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for pastix -- pastix.h not found")
+  endif()
+endif()
+
+
+# Looking for lib
+# ---------------
+
+# Add system library paths to search lib
+# --------------------------------------
+unset(_lib_env)
+set(ENV_PASTIX_LIBDIR "$ENV{PASTIX_LIBDIR}")
+if(ENV_PASTIX_LIBDIR)
+  list(APPEND _lib_env "${ENV_PASTIX_LIBDIR}")
+elseif(ENV_PASTIX_DIR)
+  list(APPEND _lib_env "${ENV_PASTIX_DIR}")
+  list(APPEND _lib_env "${ENV_PASTIX_DIR}/lib")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
+  else()
+    if(APPLE)
+      string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
+    else()
+      string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
+    endif()
+    list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+    list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+  endif()
+endif()
+list(REMOVE_DUPLICATES _lib_env)
+
+# Try to find the pastix lib in the given paths
+# ------------------------------------------------
+
+# create list of libs to find
+set(PASTIX_libs_to_find "pastix_murge;pastix")
+
+# call cmake macro to find the lib path
+if(PASTIX_LIBDIR)
+  foreach(pastix_lib ${PASTIX_libs_to_find})
+    set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND")
+    find_library(PASTIX_${pastix_lib}_LIBRARY
+      NAMES ${pastix_lib}
+      HINTS ${PASTIX_LIBDIR})
+  endforeach()
+else()
+  if(PASTIX_DIR)
+    foreach(pastix_lib ${PASTIX_libs_to_find})
+      set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND")
+      find_library(PASTIX_${pastix_lib}_LIBRARY
+	NAMES ${pastix_lib}
+	HINTS ${PASTIX_DIR}
+	PATH_SUFFIXES lib lib32 lib64)
+    endforeach()
+  else()
+    foreach(pastix_lib ${PASTIX_libs_to_find})
+      set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND")
+      find_library(PASTIX_${pastix_lib}_LIBRARY
+	NAMES ${pastix_lib}
+	HINTS ${_lib_env})
+    endforeach()
+  endif()
+endif()
+
+# If found, add path to cmake variable
+# ------------------------------------
+foreach(pastix_lib ${PASTIX_libs_to_find})
+
+  get_filename_component(${pastix_lib}_lib_path ${PASTIX_${pastix_lib}_LIBRARY} PATH)
+  # set cmake variables (respects naming convention)
+  if (PASTIX_LIBRARIES)
+    list(APPEND PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}")
+  else()
+    set(PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}")
+  endif()
+  if (PASTIX_LIBRARY_DIRS)
+    list(APPEND PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}")
+  else()
+    set(PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}")
+  endif()
+  mark_as_advanced(PASTIX_${pastix_lib}_LIBRARY)
+
+endforeach()
+
+# check a function to validate the find
+if(PASTIX_LIBRARIES)
+
+  set(REQUIRED_LDFLAGS)
+  set(REQUIRED_INCDIRS)
+  set(REQUIRED_LIBDIRS)
+  set(REQUIRED_LIBS)
+
+  # PASTIX
+  if (PASTIX_INCLUDE_DIRS)
+    set(REQUIRED_INCDIRS "${PASTIX_INCLUDE_DIRS}")
+  endif()
+  foreach(libdir ${PASTIX_LIBRARY_DIRS})
+    if (libdir)
+      list(APPEND REQUIRED_LIBDIRS "${libdir}")
+    endif()
+  endforeach()
+  set(REQUIRED_LIBS "${PASTIX_LIBRARIES}")
+  # STARPU
+  if (PASTIX_LOOK_FOR_STARPU AND STARPU_FOUND)
+    if (STARPU_INCLUDE_DIRS_DEP)
+      list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS_DEP}")
+    elseif (STARPU_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS}")
+    endif()
+    if(STARPU_LIBRARY_DIRS_DEP)
+      list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS_DEP}")
+    elseif(STARPU_LIBRARY_DIRS)
+      list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS}")
+    endif()
+    if (STARPU_LIBRARIES_DEP)
+      list(APPEND REQUIRED_LIBS "${STARPU_LIBRARIES_DEP}")
+    elseif (STARPU_LIBRARIES)
+      foreach(lib ${STARPU_LIBRARIES})
+	if (EXISTS ${lib} OR ${lib} MATCHES "^-")
+	  list(APPEND REQUIRED_LIBS "${lib}")
+	else()
+	  list(APPEND REQUIRED_LIBS "-l${lib}")
+	endif()
+      endforeach()
+    endif()
+  endif()
+  # CUDA
+  if (PASTIX_LOOK_FOR_STARPU_CUDA AND CUDA_FOUND)
+    if (CUDA_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${CUDA_INCLUDE_DIRS}")
+    endif()
+    foreach(libdir ${CUDA_LIBRARY_DIRS})
+      if (libdir)
+	list(APPEND REQUIRED_LIBDIRS "${libdir}")
+      endif()
+    endforeach()
+    list(APPEND REQUIRED_LIBS "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES}")
+  endif()
+  # MPI
+  if (PASTIX_LOOK_FOR_MPI AND MPI_FOUND)
+    if (MPI_C_INCLUDE_PATH)
+      list(APPEND REQUIRED_INCDIRS "${MPI_C_INCLUDE_PATH}")
+    endif()
+    if (MPI_C_LINK_FLAGS)
+      if (${MPI_C_LINK_FLAGS} MATCHES "  -")
+	string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS})
+      endif()
+      list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}")
+    endif()
+    list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}")
+  endif()
+  # HWLOC
+  if (HWLOC_FOUND)
+    if (HWLOC_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}")
+    endif()
+    foreach(libdir ${HWLOC_LIBRARY_DIRS})
+      if (libdir)
+	list(APPEND REQUIRED_LIBDIRS "${libdir}")
+      endif()
+    endforeach()
+    foreach(lib ${HWLOC_LIBRARIES})
+      if (EXISTS ${lib} OR ${lib} MATCHES "^-")
+	list(APPEND REQUIRED_LIBS "${lib}")
+      else()
+	list(APPEND REQUIRED_LIBS "-l${lib}")
+      endif()
+    endforeach()
+  endif()
+  # BLAS
+  if (BLAS_FOUND)
+    if (BLAS_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${BLAS_INCLUDE_DIRS}")
+    endif()
+    foreach(libdir ${BLAS_LIBRARY_DIRS})
+      if (libdir)
+	list(APPEND REQUIRED_LIBDIRS "${libdir}")
+      endif()
+    endforeach()
+    list(APPEND REQUIRED_LIBS "${BLAS_LIBRARIES}")
+    if (BLAS_LINKER_FLAGS)
+      list(APPEND REQUIRED_LDFLAGS "${BLAS_LINKER_FLAGS}")
+    endif()
+  endif()
+  # SCOTCH
+  if (PASTIX_LOOK_FOR_SCOTCH AND SCOTCH_FOUND)
+    if (SCOTCH_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}")
+    endif()
+    foreach(libdir ${SCOTCH_LIBRARY_DIRS})
+      if (libdir)
+	list(APPEND REQUIRED_LIBDIRS "${libdir}")
+      endif()
+    endforeach()
+    list(APPEND REQUIRED_LIBS "${SCOTCH_LIBRARIES}")
+  endif()
+  # PTSCOTCH
+  if (PASTIX_LOOK_FOR_PTSCOTCH AND PTSCOTCH_FOUND)
+    if (PTSCOTCH_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}")
+    endif()
+    foreach(libdir ${PTSCOTCH_LIBRARY_DIRS})
+      if (libdir)
+	list(APPEND REQUIRED_LIBDIRS "${libdir}")
+      endif()
+    endforeach()
+    list(APPEND REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}")
+  endif()
+  # METIS
+  if (PASTIX_LOOK_FOR_METIS AND METIS_FOUND)
+    if (METIS_INCLUDE_DIRS)
+      list(APPEND REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}")
+    endif()
+    foreach(libdir ${METIS_LIBRARY_DIRS})
+      if (libdir)
+	list(APPEND REQUIRED_LIBDIRS "${libdir}")
+      endif()
+    endforeach()
+    list(APPEND REQUIRED_LIBS "${METIS_LIBRARIES}")
+  endif()
+  # Fortran
+  if (CMAKE_C_COMPILER_ID MATCHES "GNU")
+    find_library(
+      FORTRAN_gfortran_LIBRARY
+      NAMES gfortran
+      HINTS ${_lib_env}
+      )
+    mark_as_advanced(FORTRAN_gfortran_LIBRARY)
+    if (FORTRAN_gfortran_LIBRARY)
+      list(APPEND REQUIRED_LIBS "${FORTRAN_gfortran_LIBRARY}")
+    endif()
+  elseif (CMAKE_C_COMPILER_ID MATCHES "Intel")
+    find_library(
+      FORTRAN_ifcore_LIBRARY
+      NAMES ifcore
+      HINTS ${_lib_env}
+      )
+    mark_as_advanced(FORTRAN_ifcore_LIBRARY)
+    if (FORTRAN_ifcore_LIBRARY)
+      list(APPEND REQUIRED_LIBS "${FORTRAN_ifcore_LIBRARY}")
+    endif()
+  endif()
+  # EXTRA LIBS such that pthread, m, rt
+  list(APPEND REQUIRED_LIBS ${PASTIX_EXTRA_LIBRARIES})
+
+  # set required libraries for link
+  set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
+  set(CMAKE_REQUIRED_LIBRARIES)
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}")
+  foreach(lib_dir ${REQUIRED_LIBDIRS})
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+  endforeach()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+  list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}")
+  string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+  # test link
+  unset(PASTIX_WORKS CACHE)
+  include(CheckFunctionExists)
+  check_function_exists(pastix PASTIX_WORKS)
+  mark_as_advanced(PASTIX_WORKS)
+
+  if(PASTIX_WORKS)
+    # save link with dependencies
+    set(PASTIX_LIBRARIES_DEP "${REQUIRED_LIBS}")
+    set(PASTIX_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}")
+    set(PASTIX_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}")
+    set(PASTIX_LINKER_FLAGS "${REQUIRED_LDFLAGS}")
+    list(REMOVE_DUPLICATES PASTIX_LIBRARY_DIRS_DEP)
+    list(REMOVE_DUPLICATES PASTIX_INCLUDE_DIRS_DEP)
+    list(REMOVE_DUPLICATES PASTIX_LINKER_FLAGS)
+  else()
+    if(NOT PASTIX_FIND_QUIETLY)
+      message(STATUS "Looking for PASTIX : test of pastix() fails")
+      message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+      message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+      message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+      message(STATUS "Maybe PASTIX is linked with specific libraries. "
+	"Have you tried with COMPONENTS (MPI/SEQ, STARPU, STARPU_CUDA, SCOTCH, PTSCOTCH, METIS)? "
+	"See the explanation in FindPASTIX.cmake.")
+    endif()
+  endif()
+  set(CMAKE_REQUIRED_INCLUDES)
+  set(CMAKE_REQUIRED_FLAGS)
+  set(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+if (PASTIX_LIBRARIES)
+  list(GET PASTIX_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(PASTIX_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PASTIX library" FORCE)
+  else()
+    set(PASTIX_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PASTIX library" FORCE)
+  endif()
+endif()
+mark_as_advanced(PASTIX_DIR)
+mark_as_advanced(PASTIX_DIR_FOUND)
+
+# check that PASTIX has been found
+# ---------------------------------
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PASTIX DEFAULT_MSG
+  PASTIX_LIBRARIES
+  PASTIX_WORKS)

diff --git a/cmake/FindPTSCOTCH.cmake b/cmake/FindPTSCOTCH.cmake
new file mode 100644
index 0000000..6ccc743
--- /dev/null
+++ b/cmake/FindPTSCOTCH.cmake

@@ -0,0 +1,422 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2016 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find PTSCOTCH include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(PTSCOTCH
+#               [REQUIRED]             # Fail with error if ptscotch is not found
+#               [COMPONENTS <comp1> <comp2> ...] # dependencies
+#              )
+#
+#  PTSCOTCH depends on the following libraries:
+#   - Threads
+#   - MPI
+#
+#  COMPONENTS can be some of the following:
+#   - ESMUMPS: to activate detection of PT-Scotch with the esmumps interface
+#
+# This module finds headers and ptscotch library.
+# Results are reported in variables:
+#  PTSCOTCH_FOUND            - True if headers and requested libraries were found
+#  PTSCOTCH_LINKER_FLAGS     - list of required linker flags (excluding -l and -L)
+#  PTSCOTCH_INCLUDE_DIRS     - ptscotch include directories
+#  PTSCOTCH_LIBRARY_DIRS     - Link directories for ptscotch libraries
+#  PTSCOTCH_LIBRARIES        - ptscotch component libraries to be linked
+#  PTSCOTCH_INCLUDE_DIRS_DEP - ptscotch + dependencies include directories
+#  PTSCOTCH_LIBRARY_DIRS_DEP - ptscotch + dependencies link directories
+#  PTSCOTCH_LIBRARIES_DEP    - ptscotch libraries + dependencies
+#  PTSCOTCH_INTSIZE          - Number of octets occupied by a SCOTCH_Num
+#
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DPTSCOTCH=path/to/ptscotch):
+#  PTSCOTCH_DIR              - Where to find the base directory of ptscotch
+#  PTSCOTCH_INCDIR           - Where to find the header files
+#  PTSCOTCH_LIBDIR           - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: PTSCOTCH_DIR, PTSCOTCH_INCDIR, PTSCOTCH_LIBDIR
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013-2016 Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
+
+if (NOT PTSCOTCH_FOUND)
+  set(PTSCOTCH_DIR "" CACHE PATH "Installation directory of PTSCOTCH library")
+  if (NOT PTSCOTCH_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely PTSCOTCH_DIR, has been set to specify the install directory of PTSCOTCH")
+  endif()
+endif()
+
+# Set the version to find
+set(PTSCOTCH_LOOK_FOR_ESMUMPS OFF)
+
+if( PTSCOTCH_FIND_COMPONENTS )
+  foreach( component ${PTSCOTCH_FIND_COMPONENTS} )
+    if (${component} STREQUAL "ESMUMPS")
+      # means we look for esmumps library
+      set(PTSCOTCH_LOOK_FOR_ESMUMPS ON)
+    endif()
+  endforeach()
+endif()
+
+# PTSCOTCH depends on Threads, try to find it
+include(CMakeFindDependencyMacro)
+if (NOT THREADS_FOUND)
+  if (PTSCOTCH_FIND_REQUIRED)
+    find_dependency(Threads REQUIRED)
+  else()
+    find_dependency(Threads)
+  endif()
+endif()
+
+# PTSCOTCH depends on MPI, try to find it
+if (NOT MPI_FOUND)
+  if (PTSCOTCH_FIND_REQUIRED)
+    find_dependency(MPI REQUIRED)
+  else()
+    find_dependency(MPI)
+  endif()
+endif()
+
+# Looking for include
+# -------------------
+
+# Add system include paths to search include
+# ------------------------------------------
+unset(_inc_env)
+set(ENV_PTSCOTCH_DIR "$ENV{PTSCOTCH_DIR}")
+set(ENV_PTSCOTCH_INCDIR "$ENV{PTSCOTCH_INCDIR}")
+if(ENV_PTSCOTCH_INCDIR)
+  list(APPEND _inc_env "${ENV_PTSCOTCH_INCDIR}")
+elseif(ENV_PTSCOTCH_DIR)
+  list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}")
+  list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include")
+  list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include/ptscotch")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+  else()
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+  endif()
+endif()
+list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(REMOVE_DUPLICATES _inc_env)
+
+
+# Try to find the ptscotch header in the given paths
+# -------------------------------------------------
+
+set(PTSCOTCH_hdrs_to_find "ptscotch.h;scotch.h")
+
+# call cmake macro to find the header path
+if(PTSCOTCH_INCDIR)
+  foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
+    set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND")
+    find_path(PTSCOTCH_${ptscotch_hdr}_DIRS
+      NAMES ${ptscotch_hdr}
+      HINTS ${PTSCOTCH_INCDIR})
+    mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS)
+  endforeach()
+else()
+  if(PTSCOTCH_DIR)
+    foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
+      set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND")
+      find_path(PTSCOTCH_${ptscotch_hdr}_DIRS
+        NAMES ${ptscotch_hdr}
+        HINTS ${PTSCOTCH_DIR}
+        PATH_SUFFIXES "include" "include/scotch")
+      mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS)
+    endforeach()
+  else()
+    foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
+      set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND")
+      find_path(PTSCOTCH_${ptscotch_hdr}_DIRS
+        NAMES ${ptscotch_hdr}
+        HINTS ${_inc_env}
+        PATH_SUFFIXES "scotch")
+      mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS)
+    endforeach()
+  endif()
+endif()
+
+# If found, add path to cmake variable
+# ------------------------------------
+foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
+  if (PTSCOTCH_${ptscotch_hdr}_DIRS)
+    list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}")
+  else ()
+    if (NOT PTSCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found")
+    endif()
+  endif()
+endforeach()
+list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS)
+
+# Looking for lib
+# ---------------
+
+# Add system library paths to search lib
+# --------------------------------------
+unset(_lib_env)
+set(ENV_PTSCOTCH_LIBDIR "$ENV{PTSCOTCH_LIBDIR}")
+if(ENV_PTSCOTCH_LIBDIR)
+  list(APPEND _lib_env "${ENV_PTSCOTCH_LIBDIR}")
+elseif(ENV_PTSCOTCH_DIR)
+  list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}")
+  list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}/lib")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
+  else()
+    if(APPLE)
+      string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
+    else()
+      string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
+    endif()
+    list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+    list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+  endif()
+endif()
+list(REMOVE_DUPLICATES _lib_env)
+
+# Try to find the ptscotch lib in the given paths
+# ----------------------------------------------
+
+set(PTSCOTCH_libs_to_find "ptscotch;ptscotcherr")
+if (PTSCOTCH_LOOK_FOR_ESMUMPS)
+  list(INSERT PTSCOTCH_libs_to_find 0 "ptesmumps")
+  list(APPEND PTSCOTCH_libs_to_find   "esmumps"  )
+endif()
+list(APPEND PTSCOTCH_libs_to_find "scotch;scotcherr")
+
+# call cmake macro to find the lib path
+if(PTSCOTCH_LIBDIR)
+  foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
+    set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND")
+    find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY
+      NAMES ${ptscotch_lib}
+      HINTS ${PTSCOTCH_LIBDIR})
+  endforeach()
+else()
+  if(PTSCOTCH_DIR)
+    foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
+      set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND")
+      find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY
+        NAMES ${ptscotch_lib}
+        HINTS ${PTSCOTCH_DIR}
+        PATH_SUFFIXES lib lib32 lib64)
+    endforeach()
+  else()
+    foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
+      set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND")
+      find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY
+        NAMES ${ptscotch_lib}
+        HINTS ${_lib_env})
+    endforeach()
+  endif()
+endif()
+
+set(PTSCOTCH_LIBRARIES "")
+set(PTSCOTCH_LIBRARY_DIRS "")
+# If found, add path to cmake variable
+# ------------------------------------
+foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
+
+  if (PTSCOTCH_${ptscotch_lib}_LIBRARY)
+    get_filename_component(${ptscotch_lib}_lib_path "${PTSCOTCH_${ptscotch_lib}_LIBRARY}" PATH)
+    # set cmake variables
+    list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}")
+    list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}")
+  else ()
+    if (NOT PTSCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found")
+    endif()
+  endif ()
+
+  mark_as_advanced(PTSCOTCH_${ptscotch_lib}_LIBRARY)
+
+endforeach()
+list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS)
+
+# check a function to validate the find
+if(PTSCOTCH_LIBRARIES)
+
+  set(REQUIRED_LDFLAGS)
+  set(REQUIRED_INCDIRS)
+  set(REQUIRED_LIBDIRS)
+  set(REQUIRED_LIBS)
+
+  # PTSCOTCH
+  if (PTSCOTCH_INCLUDE_DIRS)
+    set(REQUIRED_INCDIRS  "${PTSCOTCH_INCLUDE_DIRS}")
+  endif()
+  if (PTSCOTCH_LIBRARY_DIRS)
+    set(REQUIRED_LIBDIRS "${PTSCOTCH_LIBRARY_DIRS}")
+  endif()
+  set(REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}")
+  # MPI
+  if (MPI_FOUND)
+    if (MPI_C_INCLUDE_PATH)
+      list(APPEND CMAKE_REQUIRED_INCLUDES "${MPI_C_INCLUDE_PATH}")
+    endif()
+    if (MPI_C_LINK_FLAGS)
+      if (${MPI_C_LINK_FLAGS} MATCHES "  -")
+	string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS})
+      endif()
+      list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}")
+    endif()
+    list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}")
+  endif()
+  # THREADS
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
+  endif()
+  set(Z_LIBRARY "Z_LIBRARY-NOTFOUND")
+  find_library(Z_LIBRARY NAMES z)
+  mark_as_advanced(Z_LIBRARY)
+  if(Z_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lz")
+  endif()
+  set(M_LIBRARY "M_LIBRARY-NOTFOUND")
+  find_library(M_LIBRARY NAMES m)
+  mark_as_advanced(M_LIBRARY)
+  if(M_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lm")
+  endif()
+  set(RT_LIBRARY "RT_LIBRARY-NOTFOUND")
+  find_library(RT_LIBRARY NAMES rt)
+  mark_as_advanced(RT_LIBRARY)
+  if(RT_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lrt")
+  endif()
+
+  # set required libraries for link
+  set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
+  set(CMAKE_REQUIRED_LIBRARIES)
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}")
+  foreach(lib_dir ${REQUIRED_LIBDIRS})
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+  endforeach()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+  list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}")
+  string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+  # test link
+  unset(PTSCOTCH_WORKS CACHE)
+  include(CheckFunctionExists)
+  check_function_exists(SCOTCH_dgraphInit PTSCOTCH_WORKS)
+  mark_as_advanced(PTSCOTCH_WORKS)
+
+  if(PTSCOTCH_WORKS)
+    # save link with dependencies
+    set(PTSCOTCH_LIBRARIES_DEP "${REQUIRED_LIBS}")
+    set(PTSCOTCH_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}")
+    set(PTSCOTCH_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}")
+    set(PTSCOTCH_LINKER_FLAGS "${REQUIRED_LDFLAGS}")
+    list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS_DEP)
+    list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS_DEP)
+    list(REMOVE_DUPLICATES PTSCOTCH_LINKER_FLAGS)
+  else()
+    if(NOT PTSCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for PTSCOTCH : test of SCOTCH_dgraphInit with PTSCOTCH library fails")
+      message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+      message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+      message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+    endif()
+  endif()
+  set(CMAKE_REQUIRED_INCLUDES)
+  set(CMAKE_REQUIRED_FLAGS)
+  set(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+if (PTSCOTCH_LIBRARIES)
+  list(GET PTSCOTCH_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(PTSCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE)
+  else()
+    set(PTSCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE)
+  endif()
+endif()
+mark_as_advanced(PTSCOTCH_DIR)
+mark_as_advanced(PTSCOTCH_DIR_FOUND)
+
+# Check the size of SCOTCH_Num
+# ---------------------------------
+set(CMAKE_REQUIRED_INCLUDES ${PTSCOTCH_INCLUDE_DIRS})
+
+include(CheckCSourceRuns)
+#stdio.h and stdint.h should be included by scotch.h directly
+set(PTSCOTCH_C_TEST_SCOTCH_Num_4 "
+#include <stdio.h>
+#include <stdint.h>
+#include <ptscotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 4)
+    return 0;
+  else
+    return 1;
+}
+")
+
+set(PTSCOTCH_C_TEST_SCOTCH_Num_8 "
+#include <stdio.h>
+#include <stdint.h>
+#include <ptscotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 8)
+    return 0;
+  else
+    return 1;
+}
+")
+check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_4}" PTSCOTCH_Num_4)
+if(NOT PTSCOTCH_Num_4)
+  check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_8}" PTSCOTCH_Num_8)
+  if(NOT PTSCOTCH_Num_8)
+    set(PTSCOTCH_INTSIZE -1)
+  else()
+    set(PTSCOTCH_INTSIZE 8)
+  endif()
+else()
+  set(PTSCOTCH_INTSIZE 4)
+endif()
+set(CMAKE_REQUIRED_INCLUDES "")
+
+# check that PTSCOTCH has been found
+# ---------------------------------
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PTSCOTCH DEFAULT_MSG
+  PTSCOTCH_LIBRARIES
+  PTSCOTCH_WORKS)
+#
+# TODO: Add possibility to check for specific functions in the library
+#

diff --git a/cmake/FindSCOTCH.cmake b/cmake/FindSCOTCH.cmake
new file mode 100644
index 0000000..11b971a
--- /dev/null
+++ b/cmake/FindSCOTCH.cmake

@@ -0,0 +1,370 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find SCOTCH include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(SCOTCH
+#               [REQUIRED]             # Fail with error if scotch is not found
+#               [COMPONENTS <comp1> <comp2> ...] # dependencies
+#              )
+#
+#  COMPONENTS can be some of the following:
+#   - ESMUMPS: to activate detection of Scotch with the esmumps interface
+#
+# This module finds headers and scotch library.
+# Results are reported in variables:
+#  SCOTCH_FOUND           - True if headers and requested libraries were found
+#  SCOTCH_INCLUDE_DIRS    - scotch include directories
+#  SCOTCH_LIBRARY_DIRS    - Link directories for scotch libraries
+#  SCOTCH_LIBRARIES       - scotch component libraries to be linked
+#  SCOTCH_INTSIZE         - Number of octets occupied by a SCOTCH_Num
+#
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DSCOTCH=path/to/scotch):
+#  SCOTCH_DIR             - Where to find the base directory of scotch
+#  SCOTCH_INCDIR          - Where to find the header files
+#  SCOTCH_LIBDIR          - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: SCOTCH_DIR, SCOTCH_INCDIR, SCOTCH_LIBDIR
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013      Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
+
+if (NOT SCOTCH_FOUND)
+  set(SCOTCH_DIR "" CACHE PATH "Installation directory of SCOTCH library")
+  if (NOT SCOTCH_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely SCOTCH_DIR, has been set to specify the install directory of SCOTCH")
+  endif()
+endif()
+
+# Set the version to find
+set(SCOTCH_LOOK_FOR_ESMUMPS OFF)
+
+if( SCOTCH_FIND_COMPONENTS )
+  foreach( component ${SCOTCH_FIND_COMPONENTS} )
+    if (${component} STREQUAL "ESMUMPS")
+      # means we look for esmumps library
+      set(SCOTCH_LOOK_FOR_ESMUMPS ON)
+    endif()
+  endforeach()
+endif()
+
+# SCOTCH may depend on Threads, try to find it
+include(CMakeFindDependencyMacro)
+if (NOT THREADS_FOUND)
+  if (SCOTCH_FIND_REQUIRED)
+    find_dependency(Threads REQUIRED)
+  else()
+    find_dependency(Threads)
+  endif()
+endif()
+
+# Looking for include
+# -------------------
+
+# Add system include paths to search include
+# ------------------------------------------
+unset(_inc_env)
+set(ENV_SCOTCH_DIR "$ENV{SCOTCH_DIR}")
+set(ENV_SCOTCH_INCDIR "$ENV{SCOTCH_INCDIR}")
+if(ENV_SCOTCH_INCDIR)
+  list(APPEND _inc_env "${ENV_SCOTCH_INCDIR}")
+elseif(ENV_SCOTCH_DIR)
+  list(APPEND _inc_env "${ENV_SCOTCH_DIR}")
+  list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include")
+  list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include/scotch")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+  else()
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+  endif()
+endif()
+list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(REMOVE_DUPLICATES _inc_env)
+
+
+# Try to find the scotch header in the given paths
+# -------------------------------------------------
+# call cmake macro to find the header path
+if(SCOTCH_INCDIR)
+  set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND")
+  find_path(SCOTCH_scotch.h_DIRS
+    NAMES scotch.h
+    HINTS ${SCOTCH_INCDIR})
+else()
+  if(SCOTCH_DIR)
+    set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND")
+    find_path(SCOTCH_scotch.h_DIRS
+      NAMES scotch.h
+      HINTS ${SCOTCH_DIR}
+      PATH_SUFFIXES "include" "include/scotch")
+  else()
+    set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND")
+    find_path(SCOTCH_scotch.h_DIRS
+      NAMES scotch.h
+      HINTS ${_inc_env}
+      PATH_SUFFIXES "scotch")
+  endif()
+endif()
+mark_as_advanced(SCOTCH_scotch.h_DIRS)
+
+# If found, add path to cmake variable
+# ------------------------------------
+if (SCOTCH_scotch.h_DIRS)
+  set(SCOTCH_INCLUDE_DIRS "${SCOTCH_scotch.h_DIRS}")
+else ()
+  set(SCOTCH_INCLUDE_DIRS "SCOTCH_INCLUDE_DIRS-NOTFOUND")
+  if (NOT SCOTCH_FIND_QUIETLY)
+    message(STATUS "Looking for scotch -- scotch.h not found")
+  endif()
+endif()
+list(REMOVE_DUPLICATES SCOTCH_INCLUDE_DIRS)
+
+# Looking for lib
+# ---------------
+
+# Add system library paths to search lib
+# --------------------------------------
+unset(_lib_env)
+set(ENV_SCOTCH_LIBDIR "$ENV{SCOTCH_LIBDIR}")
+if(ENV_SCOTCH_LIBDIR)
+  list(APPEND _lib_env "${ENV_SCOTCH_LIBDIR}")
+elseif(ENV_SCOTCH_DIR)
+  list(APPEND _lib_env "${ENV_SCOTCH_DIR}")
+  list(APPEND _lib_env "${ENV_SCOTCH_DIR}/lib")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
+  else()
+    if(APPLE)
+      string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
+    else()
+      string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
+    endif()
+    list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+    list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+  endif()
+endif()
+list(REMOVE_DUPLICATES _lib_env)
+
+# Try to find the scotch lib in the given paths
+# ----------------------------------------------
+
+set(SCOTCH_libs_to_find "scotch;scotcherrexit")
+if (SCOTCH_LOOK_FOR_ESMUMPS)
+  list(INSERT SCOTCH_libs_to_find 0 "esmumps")
+endif()
+
+# call cmake macro to find the lib path
+if(SCOTCH_LIBDIR)
+  foreach(scotch_lib ${SCOTCH_libs_to_find})
+    set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND")
+    find_library(SCOTCH_${scotch_lib}_LIBRARY
+      NAMES ${scotch_lib}
+      HINTS ${SCOTCH_LIBDIR})
+  endforeach()
+else()
+  if(SCOTCH_DIR)
+    foreach(scotch_lib ${SCOTCH_libs_to_find})
+      set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND")
+      find_library(SCOTCH_${scotch_lib}_LIBRARY
+	NAMES ${scotch_lib}
+	HINTS ${SCOTCH_DIR}
+	PATH_SUFFIXES lib lib32 lib64)
+    endforeach()
+  else()
+    foreach(scotch_lib ${SCOTCH_libs_to_find})
+      set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND")
+      find_library(SCOTCH_${scotch_lib}_LIBRARY
+	NAMES ${scotch_lib}
+	HINTS ${_lib_env})
+    endforeach()
+  endif()
+endif()
+
+set(SCOTCH_LIBRARIES "")
+set(SCOTCH_LIBRARY_DIRS "")
+# If found, add path to cmake variable
+# ------------------------------------
+foreach(scotch_lib ${SCOTCH_libs_to_find})
+
+  if (SCOTCH_${scotch_lib}_LIBRARY)
+    get_filename_component(${scotch_lib}_lib_path "${SCOTCH_${scotch_lib}_LIBRARY}" PATH)
+    # set cmake variables
+    list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}")
+    list(APPEND SCOTCH_LIBRARY_DIRS "${${scotch_lib}_lib_path}")
+  else ()
+    list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}")
+    if (NOT SCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for scotch -- lib ${scotch_lib} not found")
+    endif()
+  endif ()
+
+  mark_as_advanced(SCOTCH_${scotch_lib}_LIBRARY)
+
+endforeach()
+list(REMOVE_DUPLICATES SCOTCH_LIBRARY_DIRS)
+
+# check a function to validate the find
+if(SCOTCH_LIBRARIES)
+
+  set(REQUIRED_INCDIRS)
+  set(REQUIRED_LIBDIRS)
+  set(REQUIRED_LIBS)
+
+  # SCOTCH
+  if (SCOTCH_INCLUDE_DIRS)
+    set(REQUIRED_INCDIRS  "${SCOTCH_INCLUDE_DIRS}")
+  endif()
+  if (SCOTCH_LIBRARY_DIRS)
+    set(REQUIRED_LIBDIRS "${SCOTCH_LIBRARY_DIRS}")
+  endif()
+  set(REQUIRED_LIBS "${SCOTCH_LIBRARIES}")
+  # THREADS
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
+  endif()
+  set(Z_LIBRARY "Z_LIBRARY-NOTFOUND")
+  find_library(Z_LIBRARY NAMES z)
+  mark_as_advanced(Z_LIBRARY)
+  if(Z_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lz")
+  endif()
+  set(M_LIBRARY "M_LIBRARY-NOTFOUND")
+  find_library(M_LIBRARY NAMES m)
+  mark_as_advanced(M_LIBRARY)
+  if(M_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lm")
+  endif()
+  set(RT_LIBRARY "RT_LIBRARY-NOTFOUND")
+  find_library(RT_LIBRARY NAMES rt)
+  mark_as_advanced(RT_LIBRARY)
+  if(RT_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lrt")
+  endif()
+
+  # set required libraries for link
+  set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
+  set(CMAKE_REQUIRED_LIBRARIES)
+  foreach(lib_dir ${REQUIRED_LIBDIRS})
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+  endforeach()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+  string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+  # test link
+  unset(SCOTCH_WORKS CACHE)
+  include(CheckFunctionExists)
+  check_function_exists(SCOTCH_graphInit SCOTCH_WORKS)
+  mark_as_advanced(SCOTCH_WORKS)
+
+  if(SCOTCH_WORKS)
+    # save link with dependencies
+    set(SCOTCH_LIBRARIES "${REQUIRED_LIBS}")
+  else()
+    if(NOT SCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for SCOTCH : test of SCOTCH_graphInit with SCOTCH library fails")
+      message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+      message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+      message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+    endif()
+  endif()
+  set(CMAKE_REQUIRED_INCLUDES)
+  set(CMAKE_REQUIRED_FLAGS)
+  set(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+if (SCOTCH_LIBRARIES)
+  list(GET SCOTCH_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(SCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of SCOTCH library" FORCE)
+  else()
+    set(SCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of SCOTCH library" FORCE)
+  endif()
+endif()
+mark_as_advanced(SCOTCH_DIR)
+mark_as_advanced(SCOTCH_DIR_FOUND)
+
+# Check the size of SCOTCH_Num
+# ---------------------------------
+set(CMAKE_REQUIRED_INCLUDES ${SCOTCH_INCLUDE_DIRS})
+
+include(CheckCSourceRuns)
+#stdio.h and stdint.h should be included by scotch.h directly
+set(SCOTCH_C_TEST_SCOTCH_Num_4 "
+#include <stdio.h>
+#include <stdint.h>
+#include <scotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 4)
+    return 0;
+  else
+    return 1;
+}
+")
+
+set(SCOTCH_C_TEST_SCOTCH_Num_8 "
+#include <stdio.h>
+#include <stdint.h>
+#include <scotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 8)
+    return 0;
+  else
+    return 1;
+}
+")
+check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_4}" SCOTCH_Num_4)
+if(NOT SCOTCH_Num_4)
+  check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_8}" SCOTCH_Num_8)
+  if(NOT SCOTCH_Num_8)
+    set(SCOTCH_INTSIZE -1)
+  else()
+    set(SCOTCH_INTSIZE 8)
+  endif()
+else()
+  set(SCOTCH_INTSIZE 4)
+endif()
+set(CMAKE_REQUIRED_INCLUDES "")
+
+# check that SCOTCH has been found
+# ---------------------------------
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(SCOTCH DEFAULT_MSG
+  SCOTCH_LIBRARIES
+  SCOTCH_WORKS)
+#
+# TODO: Add possibility to check for specific functions in the library
+#

diff --git a/cmake/FindSPQR.cmake b/cmake/FindSPQR.cmake
new file mode 100644
index 0000000..d6fb2e1
--- /dev/null
+++ b/cmake/FindSPQR.cmake

@@ -0,0 +1,41 @@
+# SPQR lib usually requires linking to a blas and lapack library.
+# It is up to the user of this module to find a BLAS and link to it.
+
+# SPQR lib requires Cholmod, colamd and amd as well. 
+# FindCholmod.cmake can be used to find those packages before finding spqr
+
+if (SPQR_INCLUDES AND SPQR_LIBRARIES)
+  set(SPQR_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(SPQR_INCLUDES
+  NAMES
+  SuiteSparseQR.hpp
+  PATHS
+  $ENV{SPQRDIR}
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  suitesparse
+  ufsparse
+)
+
+find_library(SPQR_LIBRARIES spqr $ENV{SPQRDIR} ${LIB_INSTALL_DIR})
+
+if(SPQR_LIBRARIES)
+
+  find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS $ENV{SPQRDIR} ${LIB_INSTALL_DIR})
+  if (SUITESPARSE_LIBRARY)
+    set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${SUITESPARSE_LIBRARY})
+  endif()
+
+  find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(CHOLMOD_LIBRARY)
+    set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARY})
+  endif()
+  
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(SPQR DEFAULT_MSG SPQR_INCLUDES SPQR_LIBRARIES)
+
+mark_as_advanced(SPQR_INCLUDES SPQR_LIBRARIES)
\ No newline at end of file

diff --git a/cmake/FindStandardMathLibrary.cmake b/cmake/FindStandardMathLibrary.cmake
new file mode 100644
index 0000000..1d1e5b3
--- /dev/null
+++ b/cmake/FindStandardMathLibrary.cmake

@@ -0,0 +1,70 @@
+# - Try to find how to link to the standard math library, if anything at all is needed to do.
+# On most platforms this is automatic, but for example it's not automatic on QNX.
+#
+# Once done this will define
+#
+#  STANDARD_MATH_LIBRARY_FOUND - we found how to successfully link to the standard math library
+#  STANDARD_MATH_LIBRARY - the name of the standard library that one has to link to.
+#                            -- this will be left empty if it's automatic (most platforms).
+#                            -- this will be set to "m" on platforms where one must explicitly
+#                               pass the "-lm" linker flag.
+#
+# Copyright (c) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+#               2020 Susi Lehtola <susi.lehtola@gmail.com>
+# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+
+
+include(CheckCXXSourceCompiles)
+
+# a little test program for c++ math functions.
+# notice the std:: is required on some platforms such as QNX
+# notice the (void) is required if -Wall (-Wunused-value) is added to CMAKE_CXX_FLAG
+
+# We read in the arguments from standard input to avoid the compiler optimizing away the calls
+set(find_standard_math_library_test_program
+"
+#include<cmath>
+int main(int argc, char **){
+  return int(std::sin(double(argc)) + std::log(double(argc)));
+}")
+
+# first try compiling/linking the test program without any linker flags
+
+set(CMAKE_REQUIRED_FLAGS "")
+set(CMAKE_REQUIRED_LIBRARIES "")
+CHECK_CXX_SOURCE_COMPILES(
+  "${find_standard_math_library_test_program}"
+  standard_math_library_linked_to_automatically
+)
+
+if(standard_math_library_linked_to_automatically)
+
+  # the test program linked successfully without any linker flag.
+  set(STANDARD_MATH_LIBRARY "")
+  set(STANDARD_MATH_LIBRARY_FOUND TRUE)
+
+else()
+
+  # the test program did not link successfully without any linker flag.
+  # This is a very uncommon case that so far we only saw on QNX. The next try is the
+  # standard name 'm' for the standard math library.
+
+  set(CMAKE_REQUIRED_LIBRARIES "m")
+  CHECK_CXX_SOURCE_COMPILES(
+    "${find_standard_math_library_test_program}"
+    standard_math_library_linked_to_as_m)
+
+  if(standard_math_library_linked_to_as_m)
+
+    # the test program linked successfully when linking to the 'm' library
+    set(STANDARD_MATH_LIBRARY "m")
+    set(STANDARD_MATH_LIBRARY_FOUND TRUE)
+
+  else()
+
+    # the test program still doesn't link successfully
+    set(STANDARD_MATH_LIBRARY_FOUND FALSE)
+
+  endif()
+
+endif()

diff --git a/cmake/FindSuperLU.cmake b/cmake/FindSuperLU.cmake
new file mode 100644
index 0000000..4b779f5
--- /dev/null
+++ b/cmake/FindSuperLU.cmake

@@ -0,0 +1,97 @@
+
+# Umfpack lib usually requires linking to a blas library.
+# It is up to the user of this module to find a BLAS and link to it.
+
+if (SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
+  set(SUPERLU_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(SUPERLU_INCLUDES
+  NAMES
+  supermatrix.h
+  PATHS
+  $ENV{SUPERLUDIR}
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  superlu
+  SRC
+)
+
+find_library(SUPERLU_LIBRARIES
+  NAMES "superlu_5.2.1" "superlu_5.2" "superlu_5.1.1" "superlu_5.1" "superlu_5.0" "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu"
+  PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR}
+  PATH_SUFFIXES lib)
+
+if(SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
+
+include(CheckCXXSourceCompiles)
+include(CMakePushCheckState)
+cmake_push_check_state()
+
+set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${SUPERLU_INCLUDES})
+
+# check whether struct mem_usage_t is globally defined
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main() {
+  mem_usage_t mem;
+  return 0;
+}"
+SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
+
+
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <superlu_enum_consts.h>
+int main() {
+  return SLU_SINGLE;
+}"
+SUPERLU_HAS_CLEAN_ENUMS)
+
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main(void)
+{
+  GlobalLU_t glu;
+  return 0;
+}"
+SUPERLU_HAS_GLOBALLU_T)
+
+if(SUPERLU_HAS_GLOBALLU_T)
+  # at least 5.0
+  set(SUPERLU_VERSION_VAR "5.0")
+elseif(SUPERLU_HAS_CLEAN_ENUMS)
+  # at least 4.3
+  set(SUPERLU_VERSION_VAR "4.3")
+elseif(SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
+  # at least 4.0
+  set(SUPERLU_VERSION_VAR "4.0")
+else()
+  set(SUPERLU_VERSION_VAR "3.0")
+endif()
+
+cmake_pop_check_state()
+
+if(SuperLU_FIND_VERSION)
+  if(${SUPERLU_VERSION_VAR} VERSION_LESS ${SuperLU_FIND_VERSION})
+    set(SUPERLU_VERSION_OK FALSE)
+  else()
+    set(SUPERLU_VERSION_OK TRUE)
+  endif()
+else()
+  set(SUPERLU_VERSION_OK TRUE)
+endif()
+
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(SuperLU
+                                  REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES SUPERLU_VERSION_OK
+                                  VERSION_VAR SUPERLU_VERSION_VAR)
+
+mark_as_advanced(SUPERLU_INCLUDES SUPERLU_LIBRARIES)

diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake
new file mode 100644
index 0000000..8104239
--- /dev/null
+++ b/cmake/FindTriSYCL.cmake

@@ -0,0 +1,173 @@
+#.rst:
+# FindTriSYCL
+#---------------
+#
+# TODO : insert Copyright and licence
+
+#########################
+#  FindTriSYCL.cmake
+#########################
+#
+#  Tools for finding and building with TriSYCL.
+#
+#  User must define TRISYCL_INCLUDE_DIR pointing to the triSYCL
+#  include directory.
+#
+#  Latest version of this file can be found at:
+#    https://github.com/triSYCL/triSYCL
+
+# Requite CMake version 3.5 or higher
+cmake_minimum_required (VERSION 3.5)
+
+# Check that a supported host compiler can be found
+if(CMAKE_COMPILER_IS_GNUCXX)
+  # Require at least gcc 5.4
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4)
+    message(FATAL_ERROR
+      "host compiler - Not found! (gcc version must be at least 5.4)")
+  else()
+    message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  # Require at least clang 3.9
+  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.9)
+    message(FATAL_ERROR
+      "host compiler - Not found! (clang version must be at least 3.9)")
+  else()
+    message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
+  endif()
+else()
+  message(WARNING
+    "host compiler - Not found! (triSYCL supports GCC and Clang)")
+endif()
+
+#triSYCL options
+option(TRISYCL_OPENMP "triSYCL multi-threading with OpenMP" ON)
+option(TRISYCL_OPENCL "triSYCL OpenCL interoperability mode" OFF)
+option(TRISYCL_NO_ASYNC "triSYCL use synchronous kernel execution" OFF)
+option(TRISYCL_DEBUG "triSCYL use debug mode" OFF)
+option(TRISYCL_DEBUG_STRUCTORS "triSYCL trace of object lifetimes" OFF)
+option(TRISYCL_TRACE_KERNEL "triSYCL trace of kernel execution" OFF)
+
+mark_as_advanced(TRISYCL_OPENMP)
+mark_as_advanced(TRISYCL_OPENCL)
+mark_as_advanced(TRISYCL_NO_ASYNC)
+mark_as_advanced(TRISYCL_DEBUG)
+mark_as_advanced(TRISYCL_DEBUG_STRUCTORS)
+mark_as_advanced(TRISYCL_TRACE_KERNEL)
+
+#triSYCL definitions
+set(CL_SYCL_LANGUAGE_VERSION 220 CACHE STRING
+  "Host language version to be used by trisYCL (default is: 220)")
+set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE STRING
+  "Device language version to be used by trisYCL (default is: 220)")
+# triSYCL now requires c++17
+set(CMAKE_CXX_STANDARD 17)
+set(CXX_STANDARD_REQUIRED ON)
+
+
+# Find OpenCL package
+include(CMakeFindDependencyMacro)
+if(TRISYCL_OPENCL)
+  find_dependency(OpenCL REQUIRED)
+  if(UNIX)
+    set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH
+      "Path to Boost.Compute headers (default is: /usr/include/compute)")
+  endif()
+endif()
+
+# Find OpenMP package
+if(TRISYCL_OPENMP)
+  find_dependency(OpenMP REQUIRED)
+endif()
+
+# Find Boost
+find_dependency(Boost 1.58 REQUIRED COMPONENTS chrono log)
+
+# If debug or trace we need boost log
+if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL)
+  set(LOG_NEEDED ON)
+else()
+  set(LOG_NEEDED OFF)
+endif()
+
+find_dependency(Threads REQUIRED)
+
+# Find triSYCL directory
+if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES)
+  set(TRISYCL_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(TRISYCL_INCLUDE_DIR
+  NAMES sycl.hpp
+  PATHS $ENV{TRISYCLDIR} $ENV{TRISYCLDIR}/include ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES triSYCL
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(TriSYCL DEFAULT_MSG
+                                  TRISYCL_INCLUDE_DIR)
+
+if(NOT TRISYCL_INCLUDE_DIR)
+  message(FATAL_ERROR
+    "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR")
+else()
+  message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}")
+endif()
+
+include(CMakeParseArguments)
+#######################
+#  add_sycl_to_target
+#######################
+function(add_sycl_to_target)
+  set(options)
+  set(one_value_args
+    TARGET
+  )
+  set(multi_value_args
+    SOURCES
+  )
+  cmake_parse_arguments(ADD_SYCL_ARGS
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+
+  # Add include directories to the "#include <>" paths
+  target_include_directories (${ADD_SYCL_ARGS_TARGET} PUBLIC
+    ${TRISYCL_INCLUDE_DIR}
+    ${Boost_INCLUDE_DIRS}
+    $<$<BOOL:${TRISYCL_OPENCL}>:${OpenCL_INCLUDE_DIRS}>
+    $<$<BOOL:${TRISYCL_OPENCL}>:${BOOST_COMPUTE_INCPATH}>)
+
+  # Link dependencies
+  target_link_libraries(${ADD_SYCL_ARGS_TARGET}
+    $<$<BOOL:${TRISYCL_OPENCL}>:${OpenCL_LIBRARIES}>
+    Threads::Threads
+    $<$<BOOL:${LOG_NEEDED}>:Boost::log>
+    Boost::chrono)
+
+  # Compile definitions
+  target_compile_definitions(${ADD_SYCL_ARGS_TARGET} PUBLIC
+    EIGEN_SYCL_TRISYCL
+    $<$<BOOL:${TRISYCL_NO_ASYNC}>:TRISYCL_NO_ASYNC>
+    $<$<BOOL:${TRISYCL_OPENCL}>:TRISYCL_OPENCL>
+    $<$<BOOL:${TRISYCL_DEBUG}>:TRISYCL_DEBUG>
+    $<$<BOOL:${TRISYCL_DEBUG_STRUCTORS}>:TRISYCL_DEBUG_STRUCTORS>
+    $<$<BOOL:${TRISYCL_TRACE_KERNEL}>:TRISYCL_TRACE_KERNEL>
+    $<$<BOOL:${LOG_NEEDED}>:BOOST_LOG_DYN_LINK>)
+
+  # C++ and OpenMP requirements
+  target_compile_options(${ADD_SYCL_ARGS_TARGET} PUBLIC
+    ${TRISYCL_COMPILE_OPTIONS}
+    $<$<BOOL:${TRISYCL_OPENMP}>:${OpenMP_CXX_FLAGS}>)
+
+  if(${TRISYCL_OPENMP} AND (NOT WIN32))
+    # Does not support generator expressions
+    set_target_properties(${ADD_SYCL_ARGS_TARGET}
+      PROPERTIES
+      LINK_FLAGS ${OpenMP_CXX_FLAGS})
+  endif()
+
+endfunction()

diff --git a/cmake/FindUMFPACK.cmake b/cmake/FindUMFPACK.cmake
new file mode 100644
index 0000000..91cf637
--- /dev/null
+++ b/cmake/FindUMFPACK.cmake

@@ -0,0 +1,53 @@
+# Umfpack lib usually requires linking to a blas library.
+# It is up to the user of this module to find a BLAS and link to it.
+
+if (UMFPACK_INCLUDES AND UMFPACK_LIBRARIES)
+  set(UMFPACK_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(UMFPACK_INCLUDES
+  NAMES
+  umfpack.h
+  PATHS
+  $ENV{UMFPACKDIR}
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  suitesparse
+  ufsparse
+)
+
+find_library(UMFPACK_LIBRARIES umfpack PATHS $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+
+if(UMFPACK_LIBRARIES)
+
+  if(NOT UMFPACK_LIBDIR)
+    get_filename_component(UMFPACK_LIBDIR ${UMFPACK_LIBRARIES} PATH)
+  endif()
+
+  find_library(COLAMD_LIBRARY colamd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(COLAMD_LIBRARY)
+    set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${COLAMD_LIBRARY})
+  endif ()
+  
+  find_library(AMD_LIBRARY amd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(AMD_LIBRARY)
+    set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${AMD_LIBRARY})
+  endif ()
+
+  find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(SUITESPARSE_LIBRARY)
+    set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${SUITESPARSE_LIBRARY})
+  endif ()
+
+  find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(CHOLMOD_LIBRARY)
+    set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${CHOLMOD_LIBRARY})
+  endif()
+
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(UMFPACK DEFAULT_MSG
+                                  UMFPACK_INCLUDES UMFPACK_LIBRARIES)
+
+mark_as_advanced(UMFPACK_INCLUDES UMFPACK_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY)

diff --git a/cmake/RegexUtils.cmake b/cmake/RegexUtils.cmake
new file mode 100644
index 0000000..f0a1524
--- /dev/null
+++ b/cmake/RegexUtils.cmake

@@ -0,0 +1,19 @@
+function(escape_string_as_regex _str_out _str_in)
+  string(REGEX REPLACE "\\\\" "\\\\\\\\" FILETEST2 "${_str_in}")
+  string(REGEX REPLACE "([.$+*?|-])" "\\\\\\1" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\^" "\\\\^" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\(" "\\\\(" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\)" "\\\\)" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\[" "\\\\[" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\]" "\\\\]" FILETEST2 "${FILETEST2}")
+  set(${_str_out} "${FILETEST2}" PARENT_SCOPE)
+endfunction()
+
+function(test_escape_string_as_regex)
+  set(test1 "\\.^$-+*()[]?|")
+  escape_string_as_regex(test2 "${test1}")
+  set(testRef "\\\\\\.\\^\\$\\-\\+\\*\\(\\)\\[\\]\\?\\|")
+  if(NOT test2 STREQUAL testRef)
+	message("Error in the escape_string_for_regex function : \n   ${test1} was escaped as ${test2}, should be ${testRef}")
+  endif()
+endfunction()
\ No newline at end of file

diff --git a/cmake/UseEigen3.cmake b/cmake/UseEigen3.cmake
new file mode 100644
index 0000000..a38bac8
--- /dev/null
+++ b/cmake/UseEigen3.cmake

@@ -0,0 +1,6 @@
+#                                               -*- cmake -*-
+#
+#  UseEigen3.cmake
+
+add_definitions     ( ${EIGEN3_DEFINITIONS} )
+include_directories ( ${EIGEN3_INCLUDE_DIRS} )

diff --git a/debug/gdb/__init__.py b/debug/gdb/__init__.py
new file mode 100644
index 0000000..bb7b160
--- /dev/null
+++ b/debug/gdb/__init__.py

@@ -0,0 +1 @@
+# Intentionally empty

diff --git a/debug/gdb/printers.py b/debug/gdb/printers.py
new file mode 100644
index 0000000..24961d1
--- /dev/null
+++ b/debug/gdb/printers.py

@@ -0,0 +1,314 @@
+# -*- coding: utf-8 -*-
+# This file is part of Eigen, a lightweight C++ template library
+# for linear algebra.
+#
+# Copyright (C) 2009 Benjamin Schindler <bschindler@inf.ethz.ch>
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Pretty printers for Eigen::Matrix
+# This is still pretty basic as the python extension to gdb is still pretty basic. 
+# It cannot handle complex eigen types and it doesn't support many of the other eigen types
+# This code supports fixed size as well as dynamic size matrices
+
+# To use it:
+#
+# * Create a directory and put the file as well as an empty __init__.py in 
+#   that directory.
+# * Create a ~/.gdbinit file, that contains the following:
+#      python
+#      import sys
+#      sys.path.insert(0, '/path/to/eigen/printer/directory')
+#      from printers import register_eigen_printers
+#      register_eigen_printers (None)
+#      end
+
+import gdb
+import re
+import itertools
+from bisect import bisect_left
+
+# Basic row/column iteration code for use with Sparse and Dense matrices
+class _MatrixEntryIterator(object):
+	
+	def __init__ (self, rows, cols, rowMajor):
+		self.rows = rows
+		self.cols = cols
+		self.currentRow = 0
+		self.currentCol = 0
+		self.rowMajor = rowMajor
+
+	def __iter__ (self):
+		return self
+
+	def next(self):
+                return self.__next__()  # Python 2.x compatibility
+
+	def __next__(self):
+		row = self.currentRow
+		col = self.currentCol
+		if self.rowMajor == 0:
+			if self.currentCol >= self.cols:
+				raise StopIteration
+				
+			self.currentRow = self.currentRow + 1
+			if self.currentRow >= self.rows:
+				self.currentRow = 0
+				self.currentCol = self.currentCol + 1
+		else:
+			if self.currentRow >= self.rows:
+				raise StopIteration
+				
+			self.currentCol = self.currentCol + 1
+			if self.currentCol >= self.cols:
+				self.currentCol = 0
+				self.currentRow = self.currentRow + 1
+
+		return (row, col)
+
+class EigenMatrixPrinter:
+	"Print Eigen Matrix or Array of some kind"
+
+	def __init__(self, variety, val):
+		"Extract all the necessary information"
+		
+		# Save the variety (presumably "Matrix" or "Array") for later usage
+		self.variety = variety
+		
+		# The gdb extension does not support value template arguments - need to extract them by hand
+		type = val.type
+		if type.code == gdb.TYPE_CODE_REF:
+			type = type.target()
+		self.type = type.unqualified().strip_typedefs()
+		tag = self.type.tag
+		regex = re.compile('\<.*\>')
+		m = regex.findall(tag)[0][1:-1]
+		template_params = m.split(',')
+		template_params = [x.replace(" ", "") for x in template_params]
+		
+		if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1':
+			self.rows = val['m_storage']['m_rows']
+		else:
+			self.rows = int(template_params[1])
+		
+		if template_params[2] == '-0x00000000000000001' or template_params[2] == '-0x000000001' or template_params[2] == '-1':
+			self.cols = val['m_storage']['m_cols']
+		else:
+			self.cols = int(template_params[2])
+		
+		self.options = 0 # default value
+		if len(template_params) > 3:
+			self.options = template_params[3];
+		
+		self.rowMajor = (int(self.options) & 0x1)
+		
+		self.innerType = self.type.template_argument(0)
+		
+		self.val = val
+		
+		# Fixed size matrices have a struct as their storage, so we need to walk through this
+		self.data = self.val['m_storage']['m_data']
+		if self.data.type.code == gdb.TYPE_CODE_STRUCT:
+			self.data = self.data['array']
+			self.data = self.data.cast(self.innerType.pointer())
+			
+	class _iterator(_MatrixEntryIterator):
+		def __init__ (self, rows, cols, dataPtr, rowMajor):
+			super(EigenMatrixPrinter._iterator, self).__init__(rows, cols, rowMajor)
+
+			self.dataPtr = dataPtr
+
+		def __next__(self):
+			
+			row, col = super(EigenMatrixPrinter._iterator, self).__next__()
+			
+			item = self.dataPtr.dereference()
+			self.dataPtr = self.dataPtr + 1
+			if (self.cols == 1): #if it's a column vector
+				return ('[%d]' % (row,), item)
+			elif (self.rows == 1): #if it's a row vector
+				return ('[%d]' % (col,), item)
+			return ('[%d,%d]' % (row, col), item)
+			
+	def children(self):
+		
+		return self._iterator(self.rows, self.cols, self.data, self.rowMajor)
+		
+	def to_string(self):
+		return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else  "ColMajor", self.data)
+
+class EigenSparseMatrixPrinter:
+	"Print an Eigen SparseMatrix"
+
+	def __init__(self, val):
+		"Extract all the necessary information"
+
+		type = val.type
+		if type.code == gdb.TYPE_CODE_REF:
+			type = type.target()
+		self.type = type.unqualified().strip_typedefs()
+		tag = self.type.tag
+		regex = re.compile('\<.*\>')
+		m = regex.findall(tag)[0][1:-1]
+		template_params = m.split(',')
+		template_params = [x.replace(" ", "") for x in template_params]
+
+		self.options = 0
+		if len(template_params) > 1:
+			self.options = template_params[1];
+		
+		self.rowMajor = (int(self.options) & 0x1)
+		
+		self.innerType = self.type.template_argument(0)
+		
+		self.val = val
+
+		self.data = self.val['m_data']
+		self.data = self.data.cast(self.innerType.pointer())
+
+	class _iterator(_MatrixEntryIterator):
+		def __init__ (self, rows, cols, val, rowMajor):
+			super(EigenSparseMatrixPrinter._iterator, self).__init__(rows, cols, rowMajor)
+
+			self.val = val
+			
+		def __next__(self):
+			
+			row, col = super(EigenSparseMatrixPrinter._iterator, self).__next__()
+				
+			# repeat calculations from SparseMatrix.h:
+			outer = row if self.rowMajor else col
+			inner = col if self.rowMajor else row
+			start = self.val['m_outerIndex'][outer]
+			end = ((start + self.val['m_innerNonZeros'][outer]) if self.val['m_innerNonZeros'] else
+			       self.val['m_outerIndex'][outer+1])
+
+			# and from CompressedStorage.h:
+			data = self.val['m_data']
+			if start >= end:
+				item = 0
+			elif (end > start) and (inner == data['m_indices'][end-1]):
+				item = data['m_values'][end-1]
+			else:
+				# create Python index list from the target range within m_indices
+				indices = [data['m_indices'][x] for x in range(int(start), int(end)-1)]
+				# find the index with binary search
+				idx = int(start) + bisect_left(indices, inner)
+				if ((idx < end) and (data['m_indices'][idx] == inner)):
+					item = data['m_values'][idx]
+				else:
+					item = 0
+
+			return ('[%d,%d]' % (row, col), item)
+
+	def children(self):
+		if self.data:
+			return self._iterator(self.rows(), self.cols(), self.val, self.rowMajor)
+
+		return iter([])   # empty matrix, for now
+
+
+	def rows(self):
+		return self.val['m_outerSize'] if self.rowMajor else self.val['m_innerSize']
+
+	def cols(self):
+		return self.val['m_innerSize'] if self.rowMajor else self.val['m_outerSize']
+
+	def to_string(self):
+
+		if self.data:
+			status = ("not compressed" if self.val['m_innerNonZeros'] else "compressed")
+		else:
+			status = "empty"
+		dimensions  = "%d x %d" % (self.rows(), self.cols())
+		layout      = "row" if self.rowMajor else "column"
+
+		return "Eigen::SparseMatrix<%s>, %s, %s major, %s" % (
+			self.innerType, dimensions, layout, status )
+
+class EigenQuaternionPrinter:
+	"Print an Eigen Quaternion"
+	
+	def __init__(self, val):
+		"Extract all the necessary information"
+		# The gdb extension does not support value template arguments - need to extract them by hand
+		type = val.type
+		if type.code == gdb.TYPE_CODE_REF:
+			type = type.target()
+		self.type = type.unqualified().strip_typedefs()
+		self.innerType = self.type.template_argument(0)
+		self.val = val
+		
+		# Quaternions have a struct as their storage, so we need to walk through this
+		self.data = self.val['m_coeffs']['m_storage']['m_data']['array']
+		self.data = self.data.cast(self.innerType.pointer())
+			
+	class _iterator:
+		def __init__ (self, dataPtr):
+			self.dataPtr = dataPtr
+			self.currentElement = 0
+			self.elementNames = ['x', 'y', 'z', 'w']
+			
+		def __iter__ (self):
+			return self
+	
+		def next(self):
+			return self.__next__()  # Python 2.x compatibility
+
+		def __next__(self):
+			element = self.currentElement
+			
+			if self.currentElement >= 4: #there are 4 elements in a quanternion
+				raise StopIteration
+			
+			self.currentElement = self.currentElement + 1
+			
+			item = self.dataPtr.dereference()
+			self.dataPtr = self.dataPtr + 1
+			return ('[%s]' % (self.elementNames[element],), item)
+			
+	def children(self):
+		
+		return self._iterator(self.data)
+	
+	def to_string(self):
+		return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data)
+
+def build_eigen_dictionary ():
+	pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val)
+	pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val)
+	pretty_printers_dict[re.compile('^Eigen::SparseMatrix<.*>$')] = lambda val: EigenSparseMatrixPrinter(val)
+	pretty_printers_dict[re.compile('^Eigen::Array<.*>$')]  = lambda val: EigenMatrixPrinter("Array",  val)
+
+def register_eigen_printers(obj):
+	"Register eigen pretty-printers with objfile Obj"
+
+	if obj == None:
+		obj = gdb
+	obj.pretty_printers.append(lookup_function)
+
+def lookup_function(val):
+	"Look-up and return a pretty-printer that can print va."
+	
+	type = val.type
+	
+	if type.code == gdb.TYPE_CODE_REF:
+		type = type.target()
+	
+	type = type.unqualified().strip_typedefs()
+	
+	typename = type.tag
+	if typename == None:
+		return None
+	
+	for function in pretty_printers_dict:
+		if function.search(typename):
+			return pretty_printers_dict[function](val)
+	
+	return None
+
+pretty_printers_dict = {}
+
+build_eigen_dictionary ()

diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
new file mode 100644
index 0000000..da89857
--- /dev/null
+++ b/debug/msvc/eigen.natvis

@@ -0,0 +1,235 @@
+<?xml version="1.0" encoding="utf-8"?>

+

+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">

+

+  <!-- Fixed x Fixed Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      

+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>

+      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>

+      <Expand>

+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

+          <Rank>2</Rank>

+          <Size>$i==0 ? $T2 : $T3</Size>

+          <ValuePointer>m_storage.m_data.array</ValuePointer>

+        </ArrayItems>

+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

+          <Direction>Backward</Direction>

+          <Rank>2</Rank>

+          <Size>$i==0 ? $T2 : $T3</Size>

+          <ValuePointer>m_storage.m_data.array</ValuePointer>

+        </ArrayItems>

+      </Expand>

+  </Type>

+  

+  <!-- 2 x 2 Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      

+      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>

+      <DisplayString>[2, 2] (fixed matrix)</DisplayString>

+      <Expand>

+        <Synthetic Name="[row 0]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 1]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>

+        </Synthetic>        

+      </Expand>

+  </Type>

+  

+  <!-- 3 x 3 Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      

+      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>

+      <DisplayString>[3, 3] (fixed matrix)</DisplayString>

+      <Expand>

+        <Synthetic Name="[row 0]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 1]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 2]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>

+        </Synthetic>        

+      </Expand>

+  </Type>

+  

+  <!-- 4 x 4 Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      

+      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>

+      <DisplayString>[4, 4] (fixed matrix)</DisplayString>

+      <Expand>

+        <Synthetic Name="[row 0]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 1]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 2]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 3]" Condition="Flags%2">

+          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>

+        </Synthetic>

+        <Synthetic Name="[row 3]" Condition="!(Flags%2)">

+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>

+        </Synthetic>        

+      </Expand>

+  </Type>  

+  

+  <!-- Dynamic x Dynamic Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      

+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>

+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>

+      <Expand>

+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

+          <Rank>2</Rank>

+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

+          <Direction>Backward</Direction>

+          <Rank>2</Rank>

+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+      </Expand>

+  </Type>

+  

+  <!-- Fixed x Dynamic Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>

+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

+      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>

+      <Expand>

+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

+          <Rank>2</Rank>

+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

+          <Direction>Backward</Direction>

+          <Rank>2</Rank>

+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+      </Expand>

+  </Type>

+  

+  <!-- Dynamic x Fixed Matrix -->

+  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>

+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>

+      <Expand>

+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

+          <Rank>2</Rank>

+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

+          <Direction>Backward</Direction>

+          <Rank>2</Rank>

+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+      </Expand>

+  </Type>

+  

+  <!-- Dynamic Column Vector -->

+  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>

+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>

+      <Expand>

+        <Item Name="[size]">m_storage.m_cols</Item>

+        <ArrayItems>

+          <Size>m_storage.m_cols</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+      </Expand>

+  </Type>

+  

+  <!-- Dynamic Row Vector -->

+  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>

+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>

+      <Expand>

+        <Item Name="[size]">m_storage.m_rows</Item>

+        <ArrayItems>

+          <Size>m_storage.m_rows</Size>

+          <ValuePointer>m_storage.m_data</ValuePointer>

+        </ArrayItems>

+      </Expand>

+  </Type>

+  

+  <!-- Fixed Vector -->

+  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>

+      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>

+      <Expand>

+        <Item Name="[x]">m_storage.m_data.array[0]</Item>

+      </Expand>

+  </Type>

+  

+  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>

+      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>

+      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>

+      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>

+      <Expand>

+        <Item Name="[x]">m_storage.m_data.array[0]</Item>

+        <Item Name="[y]">m_storage.m_data.array[1]</Item>

+      </Expand>

+  </Type>

+  

+  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>

+      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>

+      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>

+      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>

+      <Expand>

+        <Item Name="[x]">m_storage.m_data.array[0]</Item>

+        <Item Name="[y]">m_storage.m_data.array[1]</Item>

+        <Item Name="[z]">m_storage.m_data.array[2]</Item>

+      </Expand>

+  </Type>

+  

+    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">

+      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>

+      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>

+      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>

+      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

+      <Expand>

+        <Item Name="[x]">m_storage.m_data.array[0]</Item>

+        <Item Name="[y]">m_storage.m_data.array[1]</Item>

+        <Item Name="[z]">m_storage.m_data.array[2]</Item>

+        <Item Name="[w]">m_storage.m_data.array[3]</Item>

+      </Expand>

+  </Type>

+

+</AutoVisualizer>


diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
new file mode 100644
index 0000000..35ef580
--- /dev/null
+++ b/debug/msvc/eigen_autoexp_part.dat

@@ -0,0 +1,295 @@
+; ***************************************************************

+; * Eigen Visualizer

+; *

+; * Author: Hauke Heibel <hauke.heibel@gmail.com>

+; *

+; * Support the enhanced debugging of the following Eigen

+; * types (*: any, +:fixed dimension) :

+; *

+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>

+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>

+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>

+; * - Eigen::Matrix<*,-1,-1,*,*,*>

+; * - Eigen::Matrix<*,+,-1,*,*,*>

+; * - Eigen::Matrix<*,-1,+,*,*,*>

+; * - Eigen::Matrix<*,+,+,*,*,*>

+; *

+; * Matrices are displayed properly independently of the memory

+; * alignment (RowMajor vs. ColMajor).

+; *

+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure

+; * that your original autoexp.dat file is copied to a safe 

+; * place before proceeding with its modification.

+; ***************************************************************

+

+[Visualizer]

+

+; Fixed size 4-vectors

+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{

+   children

+   (

+      #(

+        [internals]: [$c,!],

+         x : ($c.m_storage.m_data.array)[0],

+         y : ($c.m_storage.m_data.array)[1],

+         z : ($c.m_storage.m_data.array)[2],

+         w : ($c.m_storage.m_data.array)[3]

+      )

+   )

+

+   preview

+   (

+      #(

+        "[",

+        4,

+        "](",

+        #array(expr: $e.m_storage.m_data.array[$i], size: 4),

+        ")"

+      )

+   )

+}

+

+; Fixed size 3-vectors

+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{

+   children

+   (

+      #(

+        [internals]: [$c,!],

+         x : ($c.m_storage.m_data.array)[0],

+         y : ($c.m_storage.m_data.array)[1],

+         z : ($c.m_storage.m_data.array)[2]

+      )

+   )

+

+   preview

+   (

+      #(

+        "[",

+        3,

+        "](",

+        #array(expr: $e.m_storage.m_data.array[$i], size: 3),

+        ")"

+      )

+   )

+}

+

+; Fixed size 2-vectors

+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{

+   children

+   (

+      #(

+        [internals]: [$c,!],

+         x : ($c.m_storage.m_data.array)[0],

+         y : ($c.m_storage.m_data.array)[1]

+      )

+   )

+

+   preview

+   (

+      #(

+        "[",

+        2,

+        "](",

+        #array(expr: $e.m_storage.m_data.array[$i], size: 2),

+        ")"

+      )

+   )

+}

+

+; Fixed size 1-vectors

+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{

+   children

+   (

+      #(

+        [internals]: [$c,!],

+         x : ($c.m_storage.m_data.array)[0]

+      )

+   )

+

+   preview

+   (

+      #(

+        "[",

+        1,

+        "](",

+        #array(expr: $e.m_storage.m_data.array[$i], size: 1),

+        ")"

+      )

+   )

+}

+

+; Dynamic matrices (ColMajor and RowMajor support)

+Eigen::Matrix<*,-1,-1,*,*,*>{

+  children

+   (

+      #(

+         [internals]: [$c,!],

+         rows: $c.m_storage.m_rows,

+         cols: $c.m_storage.m_cols,

+         ; Check for RowMajorBit

+         #if ($c.Flags & 0x1) (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 

+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols

+             )

+         ) #else (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data)[$i],

+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols

+             )

+         )

+      )

+   )

+

+   preview

+   (

+     #(

+         "[",

+           $c.m_storage.m_rows,

+         ",",

+           $c.m_storage.m_cols,

+         "](",

+           #array(

+            expr :    [($c.m_storage.m_data)[$i],g],

+            size :    $c.m_storage.m_rows*$c.m_storage.m_cols

+           ),

+         ")"

+      )

+   )

+}

+

+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)

+Eigen::Matrix<*,*,-1,*,*,*>{

+  children

+   (

+      #(

+         [internals]: [$c,!],

+         rows: $c.RowsAtCompileTime,

+         cols: $c.m_storage.m_cols,

+         ; Check for RowMajorBit

+         #if ($c.Flags & 0x1) (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],

+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols

+             )

+         ) #else (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data)[$i],

+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols

+             )

+         )

+      )

+   )

+

+   preview

+   (

+     #(

+         "[",

+           $c.RowsAtCompileTime,

+         ",",

+           $c.m_storage.m_cols,

+         "](",

+           #array(

+            expr :    [($c.m_storage.m_data)[$i],g],

+            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols

+           ),

+         ")"

+      )

+   )

+}

+

+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)

+Eigen::Matrix<*,-1,*,*,*,*>{

+  children

+   (

+      #(

+         [internals]: [$c,!],

+         rows: $c.m_storage.m_rows,

+         cols: $c.ColsAtCompileTime,

+         ; Check for RowMajorBit

+         #if ($c.Flags & 0x1) (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 

+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime

+             )

+         ) #else (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data)[$i],

+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime

+             )

+         )

+      )

+   )

+

+   preview

+   (

+     #(

+         "[",

+           $c.m_storage.m_rows,

+         ",",

+           $c.ColsAtCompileTime,

+         "](",

+           #array(

+            expr :    [($c.m_storage.m_data)[$i],g],

+            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime

+           ),

+         ")"

+      )

+   )

+}

+

+; Fixed size matrix (ColMajor and RowMajor support)

+Eigen::Matrix<*,*,*,*,*,*>{

+  children

+   (

+      #(

+         [internals]: [$c,!],

+         rows: $c.RowsAtCompileTime,

+         cols: $c.ColsAtCompileTime,

+         ; Check for RowMajorBit

+         #if ($c.Flags & 0x1) (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 

+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime

+             )

+         ) #else (

+             #array(

+                rank: 2,

+                base: 0,

+                expr: ($c.m_storage.m_data.array)[$i],

+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime

+             )

+         )

+      )

+   )

+

+   preview

+   (

+     #(

+         "[",

+           $c.RowsAtCompileTime,

+         ",",

+           $c.ColsAtCompileTime,

+         "](",

+           #array(

+            expr :    [($c.m_storage.m_data.array)[$i],g],

+            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime

+           ),

+         ")"

+      )

+   )

+}


diff --git a/demos/CMakeLists.txt b/demos/CMakeLists.txt
new file mode 100644
index 0000000..deb560f
--- /dev/null
+++ b/demos/CMakeLists.txt

@@ -0,0 +1,13 @@
+project(EigenDemos)
+
+add_custom_target(demos)
+
+if(NOT EIGEN_TEST_NOQT)
+  find_package(Qt4)
+  if(QT4_FOUND)
+    add_subdirectory(mandelbrot)
+    add_subdirectory(opengl)
+  else()
+    message(STATUS "Qt4 not found, so disabling the mandelbrot and opengl demos")
+  endif()
+endif()

diff --git a/demos/mandelbrot/CMakeLists.txt b/demos/mandelbrot/CMakeLists.txt
new file mode 100644
index 0000000..ae6001d
--- /dev/null
+++ b/demos/mandelbrot/CMakeLists.txt

@@ -0,0 +1,21 @@
+find_package(Qt4 REQUIRED)
+
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+   set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+   add_definitions ( "-DNDEBUG" )
+endif ()
+
+include_directories( ${QT_INCLUDE_DIR} )
+
+set(mandelbrot_SRCS
+    mandelbrot.cpp
+)
+
+qt4_automoc(${mandelbrot_SRCS})
+
+add_executable(mandelbrot ${mandelbrot_SRCS})
+add_dependencies(demos mandelbrot)
+
+target_link_libraries(mandelbrot ${QT_QTCORE_LIBRARY} ${QT_QTGUI_LIBRARY})

diff --git a/demos/mandelbrot/README b/demos/mandelbrot/README
new file mode 100644
index 0000000..a451d65
--- /dev/null
+++ b/demos/mandelbrot/README

@@ -0,0 +1,10 @@
+*** Mandelbrot demo ***
+
+Controls:
+* Left mouse button to center view at a point.
+* Drag vertically with left mouse button to zoom in and out.
+
+Be sure to enable SSE2 or AltiVec to improve performance.
+
+The number of iterations, and the choice between single and double precision, are
+determined at runtime depending on the zoom level.

diff --git a/demos/mandelbrot/mandelbrot.cpp b/demos/mandelbrot/mandelbrot.cpp
new file mode 100644
index 0000000..5d575d5
--- /dev/null
+++ b/demos/mandelbrot/mandelbrot.cpp

@@ -0,0 +1,213 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "mandelbrot.h"
+#include <iostream>
+#include<QtGui/QPainter>
+#include<QtGui/QImage>
+#include<QtGui/QMouseEvent>
+#include<QtCore/QTime>
+
+void MandelbrotWidget::resizeEvent(QResizeEvent *)
+{
+  if(size < width() * height())
+  {
+    std::cout << "reallocate buffer" << std::endl;
+    size = width() * height();
+    if(buffer) delete[]buffer;
+    buffer = new unsigned char[4*size];
+  }
+}
+
+template<typename T> struct iters_before_test { enum { ret = 8 }; };
+template<> struct iters_before_test<double> { enum { ret = 16 }; };
+
+template<typename Real> void MandelbrotThread::render(int img_width, int img_height)
+{
+  enum { packetSize = Eigen::internal::packet_traits<Real>::size }; // number of reals in a Packet
+  typedef Eigen::Array<Real, packetSize, 1> Packet; // wrap a Packet as a vector
+
+  enum { iters_before_test = iters_before_test<Real>::ret };
+  max_iter = (max_iter / iters_before_test) * iters_before_test;
+  const int alignedWidth = (img_width/packetSize)*packetSize;
+  unsigned char *const buffer = widget->buffer;
+  const double xradius = widget->xradius;
+  const double yradius = xradius * img_height / img_width;
+  const int threadcount = widget->threadcount;
+  typedef Eigen::Array<Real, 2, 1> Vector2;
+  Vector2 start(widget->center.x() - widget->xradius, widget->center.y() - yradius);
+  Vector2 step(2*widget->xradius/img_width, 2*yradius/img_height);
+  total_iter = 0;
+
+  for(int y = id; y < img_height; y += threadcount)
+  {
+    int pix = y * img_width;
+
+    // for each pixel, we're going to do the iteration z := z^2 + c where z and c are complex numbers, 
+    // starting with z = c = complex coord of the pixel. pzi and pzr denote the real and imaginary parts of z.
+    // pci and pcr denote the real and imaginary parts of c.
+
+    Packet pzi_start, pci_start;
+    for(int i = 0; i < packetSize; i++) pzi_start[i] = pci_start[i] = start.y() + y * step.y();
+
+    for(int x = 0; x < alignedWidth; x += packetSize, pix += packetSize)
+    {
+      Packet pcr, pci = pci_start, pzr, pzi = pzi_start, pzr_buf;
+      for(int i = 0; i < packetSize; i++) pzr[i] = pcr[i] = start.x() + (x+i) * step.x();
+
+      // do the iterations. Every iters_before_test iterations we check for divergence,
+      // in which case we can stop iterating.
+      int j = 0;
+      typedef Eigen::Matrix<int, packetSize, 1> Packeti;
+      Packeti pix_iter = Packeti::Zero(), // number of iteration per pixel in the packet
+              pix_dont_diverge; // whether or not each pixel has already diverged
+      do
+      {
+        for(int i = 0; i < iters_before_test/4; i++) // peel the inner loop by 4
+        {
+#         define ITERATE \
+            pzr_buf = pzr; \
+            pzr = pzr.square(); \
+            pzr -= pzi.square(); \
+            pzr += pcr; \
+            pzi = (2*pzr_buf)*pzi; \
+            pzi += pci;
+          ITERATE ITERATE ITERATE ITERATE
+        }
+        pix_dont_diverge = ((pzr.square() + pzi.square())
+                           .eval() // temporary fix as what follows is not yet vectorized by Eigen
+                           <= Packet::Constant(4))
+                                // the 4 here is not a magic value, it's a math fact that if
+                                // the square modulus is >4 then divergence is inevitable.
+                           .template cast<int>();
+        pix_iter += iters_before_test * pix_dont_diverge;
+        j++;
+        total_iter += iters_before_test * packetSize;
+      }
+      while(j < max_iter/iters_before_test && pix_dont_diverge.any()); // any() is not yet vectorized by Eigen
+
+      // compute pixel colors
+      for(int i = 0; i < packetSize; i++)
+      {
+        buffer[4*(pix+i)] = 255*pix_iter[i]/max_iter;
+        buffer[4*(pix+i)+1] = 0;
+        buffer[4*(pix+i)+2] = 0;
+      }
+    }
+
+    // if the width is not a multiple of packetSize, fill the remainder in black
+    for(int x = alignedWidth; x < img_width; x++, pix++)
+      buffer[4*pix] = buffer[4*pix+1] = buffer[4*pix+2] = 0;
+  }
+  return;
+}
+
+void MandelbrotThread::run()
+{
+  setTerminationEnabled(true);
+  double resolution = widget->xradius*2/widget->width();
+  max_iter = 128;
+  if(resolution < 1e-4f) max_iter += 128 * ( - 4 - std::log10(resolution));
+  int img_width = widget->width()/widget->draft;
+  int img_height = widget->height()/widget->draft;
+  single_precision = resolution > 1e-7f;
+
+  if(single_precision)
+    render<float>(img_width, img_height);
+  else
+    render<double>(img_width, img_height);
+}
+
+void MandelbrotWidget::paintEvent(QPaintEvent *)
+{
+  static float max_speed = 0;
+  long long total_iter = 0;
+
+  QTime time;
+  time.start();
+  for(int th = 0; th < threadcount; th++)
+    threads[th]->start(QThread::LowPriority);
+  for(int th = 0; th < threadcount; th++)
+  {
+    threads[th]->wait();
+    total_iter += threads[th]->total_iter;
+  }
+  int elapsed = time.elapsed();
+
+  if(draft == 1)
+  {
+    float speed = elapsed ? float(total_iter)*1000/elapsed : 0;
+    max_speed = std::max(max_speed, speed);
+    std::cout << threadcount << " threads, "
+              << elapsed << " ms, "
+              << speed << " iters/s (max " << max_speed << ")" << std::endl;
+    int packetSize = threads[0]->single_precision
+                   ? int(Eigen::internal::packet_traits<float>::size)
+                   : int(Eigen::internal::packet_traits<double>::size);
+    setWindowTitle(QString("resolution ")+QString::number(xradius*2/width(), 'e', 2)
+                  +QString(", %1 iterations per pixel, ").arg(threads[0]->max_iter)
+                  +(threads[0]->single_precision ? QString("single ") : QString("double "))
+                  +QString("precision, ")
+                  +(packetSize==1 ? QString("no vectorization")
+                                  : QString("vectorized (%1 per packet)").arg(packetSize)));
+  }
+  
+  QImage image(buffer, width()/draft, height()/draft, QImage::Format_RGB32);
+  QPainter painter(this);
+  painter.drawImage(QPoint(0, 0), image.scaled(width(), height()));
+
+  if(draft>1)
+  {
+    draft /= 2;
+    setWindowTitle(QString("recomputing at 1/%1 resolution...").arg(draft));
+    update();
+  }
+}
+
+void MandelbrotWidget::mousePressEvent(QMouseEvent *event)
+{
+  if( event->buttons() & Qt::LeftButton )
+  {
+    lastpos = event->pos();
+    double yradius = xradius * height() / width();
+    center = Eigen::Vector2d(center.x() + (event->pos().x() - width()/2) * xradius * 2 / width(),
+                             center.y() + (event->pos().y() - height()/2) * yradius * 2 / height());
+    draft = 16;
+    for(int th = 0; th < threadcount; th++)
+      threads[th]->terminate();
+    update();
+  }
+}
+
+void MandelbrotWidget::mouseMoveEvent(QMouseEvent *event)
+{
+  QPoint delta = event->pos() - lastpos;
+  lastpos = event->pos();
+  if( event->buttons() & Qt::LeftButton )
+  {
+    double t = 1 + 5 * double(delta.y()) / height();
+    if(t < 0.5) t = 0.5;
+    if(t > 2) t = 2;
+    xradius *= t;
+    draft = 16;
+    for(int th = 0; th < threadcount; th++)
+      threads[th]->terminate();
+    update();
+  }
+}
+
+int main(int argc, char *argv[])
+{
+  QApplication app(argc, argv);
+  MandelbrotWidget w;
+  w.show();
+  return app.exec();
+}
+
+#include "mandelbrot.moc"

diff --git a/demos/mandelbrot/mandelbrot.h b/demos/mandelbrot/mandelbrot.h
new file mode 100644
index 0000000..a687fd0
--- /dev/null
+++ b/demos/mandelbrot/mandelbrot.h

@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef MANDELBROT_H
+#define MANDELBROT_H
+
+#include <Eigen/Core>
+#include <QtGui/QApplication>
+#include <QtGui/QWidget>
+#include <QtCore/QThread>
+
+class MandelbrotWidget;
+
+class MandelbrotThread : public QThread
+{
+    friend class MandelbrotWidget;
+    MandelbrotWidget *widget;
+    long long total_iter;
+    int id, max_iter;
+    bool single_precision;
+
+  public:
+    MandelbrotThread(MandelbrotWidget *w, int i) : widget(w), id(i) {}
+    void run();
+    template<typename Real> void render(int img_width, int img_height);
+};
+
+class MandelbrotWidget : public QWidget
+{
+    Q_OBJECT
+
+    friend class MandelbrotThread;
+    Eigen::Vector2d center;
+    double xradius;
+    int size;
+    unsigned char *buffer;
+    QPoint lastpos;
+    int draft;
+    MandelbrotThread **threads;
+    int threadcount;
+
+  protected:
+    void resizeEvent(QResizeEvent *);
+    void paintEvent(QPaintEvent *);
+    void mousePressEvent(QMouseEvent *event);
+    void mouseMoveEvent(QMouseEvent *event);
+
+  public:
+    MandelbrotWidget() : QWidget(), center(0,0), xradius(2),
+                         size(0), buffer(0), draft(16)
+    {
+      setAutoFillBackground(false);
+      threadcount = QThread::idealThreadCount();
+      threads = new MandelbrotThread*[threadcount];
+      for(int th = 0; th < threadcount; th++) threads[th] = new MandelbrotThread(this, th);
+    }
+    ~MandelbrotWidget()
+    {
+      if(buffer) delete[]buffer;
+      for(int th = 0; th < threadcount; th++) delete threads[th];
+      delete[] threads;
+    }
+};
+
+#endif // MANDELBROT_H

diff --git a/demos/mix_eigen_and_c/README b/demos/mix_eigen_and_c/README
new file mode 100644
index 0000000..d9cc927
--- /dev/null
+++ b/demos/mix_eigen_and_c/README

@@ -0,0 +1,9 @@
+This is an example of how one can wrap some of Eigen into a C library.
+
+To try this with GCC, do:
+
+  g++ -c binary_library.cpp -O2 -msse2 -I ../..
+  gcc example.c binary_library.o -o example -lstdc++
+  ./example
+
+TODO: add CMakeLists, add more explanations here

diff --git a/demos/mix_eigen_and_c/binary_library.cpp b/demos/mix_eigen_and_c/binary_library.cpp
new file mode 100644
index 0000000..15a2d03
--- /dev/null
+++ b/demos/mix_eigen_and_c/binary_library.cpp

@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This C++ file compiles to binary code that can be linked to by your C program,
+// thanks to the extern "C" syntax used in the declarations in binary_library.h.
+
+#include "binary_library.h"
+
+#include <Eigen/Core>
+
+using namespace Eigen;
+
+/************************* pointer conversion methods **********************************************/
+
+////// class MatrixXd //////
+
+inline MatrixXd& c_to_eigen(C_MatrixXd* ptr)
+{
+  return *reinterpret_cast<MatrixXd*>(ptr);
+}
+
+inline const MatrixXd& c_to_eigen(const C_MatrixXd* ptr)
+{
+  return *reinterpret_cast<const MatrixXd*>(ptr);
+}
+
+inline C_MatrixXd* eigen_to_c(MatrixXd& ref)
+{
+  return reinterpret_cast<C_MatrixXd*>(&ref);
+}
+
+inline const C_MatrixXd* eigen_to_c(const MatrixXd& ref)
+{
+  return reinterpret_cast<const C_MatrixXd*>(&ref);
+}
+
+////// class Map<MatrixXd> //////
+
+inline Map<MatrixXd>& c_to_eigen(C_Map_MatrixXd* ptr)
+{
+  return *reinterpret_cast<Map<MatrixXd>*>(ptr);
+}
+
+inline const Map<MatrixXd>& c_to_eigen(const C_Map_MatrixXd* ptr)
+{
+  return *reinterpret_cast<const Map<MatrixXd>*>(ptr);
+}
+
+inline C_Map_MatrixXd* eigen_to_c(Map<MatrixXd>& ref)
+{
+  return reinterpret_cast<C_Map_MatrixXd*>(&ref);
+}
+
+inline const C_Map_MatrixXd* eigen_to_c(const Map<MatrixXd>& ref)
+{
+  return reinterpret_cast<const C_Map_MatrixXd*>(&ref);
+}
+
+
+/************************* implementation of classes **********************************************/
+
+
+////// class MatrixXd //////
+
+
+C_MatrixXd* MatrixXd_new(int rows, int cols)
+{
+  return eigen_to_c(*new MatrixXd(rows,cols));
+}
+
+void MatrixXd_delete(C_MatrixXd *m)
+{
+  delete &c_to_eigen(m);
+}
+
+double* MatrixXd_data(C_MatrixXd *m)
+{
+  return c_to_eigen(m).data();
+}
+
+void MatrixXd_set_zero(C_MatrixXd *m)
+{
+  c_to_eigen(m).setZero();
+}
+
+void MatrixXd_resize(C_MatrixXd *m, int rows, int cols)
+{
+  c_to_eigen(m).resize(rows,cols);
+}
+
+void MatrixXd_copy(C_MatrixXd *dst, const C_MatrixXd *src)
+{
+  c_to_eigen(dst) = c_to_eigen(src);
+}
+
+void MatrixXd_copy_map(C_MatrixXd *dst, const C_Map_MatrixXd *src)
+{
+  c_to_eigen(dst) = c_to_eigen(src);
+}
+
+void MatrixXd_set_coeff(C_MatrixXd *m, int i, int j, double coeff)
+{
+  c_to_eigen(m)(i,j) = coeff;
+}
+
+double MatrixXd_get_coeff(const C_MatrixXd *m, int i, int j)
+{
+  return c_to_eigen(m)(i,j);
+}
+
+void MatrixXd_print(const C_MatrixXd *m)
+{
+  std::cout << c_to_eigen(m) << std::endl;
+}
+
+void MatrixXd_multiply(const C_MatrixXd *m1, const C_MatrixXd *m2, C_MatrixXd *result)
+{
+  c_to_eigen(result) = c_to_eigen(m1) * c_to_eigen(m2);
+}
+
+void MatrixXd_add(const C_MatrixXd *m1, const C_MatrixXd *m2, C_MatrixXd *result)
+{
+  c_to_eigen(result) = c_to_eigen(m1) + c_to_eigen(m2);
+}
+
+
+
+////// class Map_MatrixXd //////
+
+
+C_Map_MatrixXd* Map_MatrixXd_new(double *array, int rows, int cols)
+{
+  return eigen_to_c(*new Map<MatrixXd>(array,rows,cols));
+}
+
+void Map_MatrixXd_delete(C_Map_MatrixXd *m)
+{
+  delete &c_to_eigen(m);
+}
+
+void Map_MatrixXd_set_zero(C_Map_MatrixXd *m)
+{
+  c_to_eigen(m).setZero();
+}
+
+void Map_MatrixXd_copy(C_Map_MatrixXd *dst, const C_Map_MatrixXd *src)
+{
+  c_to_eigen(dst) = c_to_eigen(src);
+}
+
+void Map_MatrixXd_copy_matrix(C_Map_MatrixXd *dst, const C_MatrixXd *src)
+{
+  c_to_eigen(dst) = c_to_eigen(src);
+}
+
+void Map_MatrixXd_set_coeff(C_Map_MatrixXd *m, int i, int j, double coeff)
+{
+  c_to_eigen(m)(i,j) = coeff;
+}
+
+double Map_MatrixXd_get_coeff(const C_Map_MatrixXd *m, int i, int j)
+{
+  return c_to_eigen(m)(i,j);
+}
+
+void Map_MatrixXd_print(const C_Map_MatrixXd *m)
+{
+  std::cout << c_to_eigen(m) << std::endl;
+}
+
+void Map_MatrixXd_multiply(const C_Map_MatrixXd *m1, const C_Map_MatrixXd *m2, C_Map_MatrixXd *result)
+{
+  c_to_eigen(result) = c_to_eigen(m1) * c_to_eigen(m2);
+}
+
+void Map_MatrixXd_add(const C_Map_MatrixXd *m1, const C_Map_MatrixXd *m2, C_Map_MatrixXd *result)
+{
+  c_to_eigen(result) = c_to_eigen(m1) + c_to_eigen(m2);
+}

diff --git a/demos/mix_eigen_and_c/binary_library.h b/demos/mix_eigen_and_c/binary_library.h
new file mode 100644
index 0000000..9b63fac
--- /dev/null
+++ b/demos/mix_eigen_and_c/binary_library.h

@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This is a pure C header, no C++ here.
+// The functions declared here will be implemented in C++ but
+// we don't have to know, because thanks to the extern "C" syntax,
+// they will be compiled to C object code.
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  // just dummy empty structs to give different pointer types,
+  // instead of using void* which would be type unsafe
+  struct C_MatrixXd {};
+  struct C_Map_MatrixXd {};
+
+  // the C_MatrixXd class, wraps some of the functionality
+  // of Eigen::MatrixXd.
+  struct C_MatrixXd* MatrixXd_new(int rows, int cols);
+  void    MatrixXd_delete     (struct C_MatrixXd *m);
+  double* MatrixXd_data       (struct C_MatrixXd *m);
+  void    MatrixXd_set_zero   (struct C_MatrixXd *m);
+  void    MatrixXd_resize     (struct C_MatrixXd *m, int rows, int cols);
+  void    MatrixXd_copy       (struct C_MatrixXd *dst,
+                               const struct C_MatrixXd *src);
+  void    MatrixXd_copy_map   (struct C_MatrixXd *dst,
+                               const struct C_Map_MatrixXd *src);  
+  void    MatrixXd_set_coeff  (struct C_MatrixXd *m,
+                               int i, int j, double coeff);
+  double  MatrixXd_get_coeff  (const struct C_MatrixXd *m,
+                               int i, int j);
+  void    MatrixXd_print      (const struct C_MatrixXd *m);
+  void    MatrixXd_add        (const struct C_MatrixXd *m1,
+                               const struct C_MatrixXd *m2,
+                               struct C_MatrixXd *result);  
+  void    MatrixXd_multiply   (const struct C_MatrixXd *m1,
+                               const struct C_MatrixXd *m2,
+                               struct C_MatrixXd *result);
+  
+  // the C_Map_MatrixXd class, wraps some of the functionality
+  // of Eigen::Map<MatrixXd>
+  struct C_Map_MatrixXd* Map_MatrixXd_new(double *array, int rows, int cols);
+  void   Map_MatrixXd_delete     (struct C_Map_MatrixXd *m);
+  void   Map_MatrixXd_set_zero   (struct C_Map_MatrixXd *m);
+  void   Map_MatrixXd_copy       (struct C_Map_MatrixXd *dst,
+                                  const struct C_Map_MatrixXd *src);
+  void   Map_MatrixXd_copy_matrix(struct C_Map_MatrixXd *dst,
+                                  const struct C_MatrixXd *src);  
+  void   Map_MatrixXd_set_coeff  (struct C_Map_MatrixXd *m,
+                                  int i, int j, double coeff);
+  double Map_MatrixXd_get_coeff  (const struct C_Map_MatrixXd *m,
+                                  int i, int j);
+  void   Map_MatrixXd_print      (const struct C_Map_MatrixXd *m);
+  void   Map_MatrixXd_add        (const struct C_Map_MatrixXd *m1,
+                                  const struct C_Map_MatrixXd *m2,
+                                  struct C_Map_MatrixXd *result);  
+  void   Map_MatrixXd_multiply   (const struct C_Map_MatrixXd *m1,
+                                  const struct C_Map_MatrixXd *m2,
+                                  struct C_Map_MatrixXd *result);
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif

diff --git a/demos/mix_eigen_and_c/example.c b/demos/mix_eigen_and_c/example.c
new file mode 100644
index 0000000..508eb54
--- /dev/null
+++ b/demos/mix_eigen_and_c/example.c

@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "binary_library.h"
+#include "stdio.h"
+
+void demo_MatrixXd()
+{
+  struct C_MatrixXd *matrix1, *matrix2, *result;
+  printf("*** demo_MatrixXd ***\n");
+  
+  matrix1 = MatrixXd_new(3, 3);
+  MatrixXd_set_zero(matrix1);
+  MatrixXd_set_coeff(matrix1, 0, 1, 2.5);
+  MatrixXd_set_coeff(matrix1, 1, 0, 1.4);
+  printf("Here is matrix1:\n");
+  MatrixXd_print(matrix1);
+
+  matrix2 = MatrixXd_new(3, 3);
+  MatrixXd_multiply(matrix1, matrix1, matrix2);
+  printf("Here is matrix1*matrix1:\n");
+  MatrixXd_print(matrix2);
+
+  MatrixXd_delete(matrix1);
+  MatrixXd_delete(matrix2);
+}
+
+// this helper function takes a plain C array and prints it in one line
+void print_array(double *array, int n)
+{
+  struct C_Map_MatrixXd *m = Map_MatrixXd_new(array, 1, n);
+  Map_MatrixXd_print(m);
+  Map_MatrixXd_delete(m);
+}
+
+void demo_Map_MatrixXd()
+{
+  struct C_Map_MatrixXd *map;
+  double array[5];
+  int i;
+  printf("*** demo_Map_MatrixXd ***\n");
+  
+  for(i = 0; i < 5; ++i) array[i] = i;
+  printf("Initially, the array is:\n");
+  print_array(array, 5);
+  
+  map = Map_MatrixXd_new(array, 5, 1);
+  Map_MatrixXd_add(map, map, map);
+  Map_MatrixXd_delete(map);
+
+  printf("Now the array is:\n");
+  print_array(array, 5);
+}
+
+int main()
+{
+  demo_MatrixXd();
+  demo_Map_MatrixXd();
+}

diff --git a/demos/opengl/CMakeLists.txt b/demos/opengl/CMakeLists.txt
new file mode 100644
index 0000000..299aa44
--- /dev/null
+++ b/demos/opengl/CMakeLists.txt

@@ -0,0 +1,28 @@
+find_package(Qt4)
+find_package(OpenGL)
+
+if(QT4_FOUND AND OPENGL_FOUND)
+
+  set(QT_USE_QTOPENGL TRUE)
+  include(${QT_USE_FILE})
+
+  set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+  include_directories( ${QT_INCLUDE_DIR} )
+
+  set(quaternion_demo_SRCS  gpuhelper.cpp icosphere.cpp camera.cpp trackball.cpp quaternion_demo.cpp)
+
+  qt4_automoc(${quaternion_demo_SRCS})
+
+  add_executable(quaternion_demo ${quaternion_demo_SRCS})
+  add_dependencies(demos quaternion_demo)
+
+  target_link_libraries(quaternion_demo
+    ${QT_QTCORE_LIBRARY}    ${QT_QTGUI_LIBRARY}
+    ${QT_QTOPENGL_LIBRARY}  ${OPENGL_LIBRARIES} )
+
+else()
+
+  message(STATUS "OpenGL demo disabled because Qt4 and/or OpenGL have not been found.")
+
+endif()
\ No newline at end of file

diff --git a/demos/opengl/README b/demos/opengl/README
new file mode 100644
index 0000000..8fb1649
--- /dev/null
+++ b/demos/opengl/README

@@ -0,0 +1,13 @@
+
+Navigation:
+ left button:           rotate around the target
+ middle button:         zoom
+ left button + ctrl     quake rotate (rotate around camera position)
+ middle button + ctrl   walk (progress along camera's z direction)
+ left button:           pan (translate in the XY camera's plane)
+
+R : move the camera to initial position
+A : start/stop animation
+C : clear the animation
+G : add a key frame
+

diff --git a/demos/opengl/camera.cpp b/demos/opengl/camera.cpp
new file mode 100644
index 0000000..8a2344c
--- /dev/null
+++ b/demos/opengl/camera.cpp

@@ -0,0 +1,264 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "camera.h"
+
+#include "gpuhelper.h"
+#include <GL/glu.h>
+
+#include "Eigen/LU"
+using namespace Eigen;
+
+Camera::Camera()
+    : mViewIsUptodate(false), mProjIsUptodate(false)
+{
+    mViewMatrix.setIdentity();
+    
+    mFovY = M_PI/3.;
+    mNearDist = 1.;
+    mFarDist = 50000.;
+    
+    mVpX = 0;
+    mVpY = 0;
+
+    setPosition(Vector3f::Constant(100.));
+    setTarget(Vector3f::Zero());
+}
+
+Camera& Camera::operator=(const Camera& other)
+{
+    mViewIsUptodate = false;
+    mProjIsUptodate = false;
+    
+    mVpX = other.mVpX;
+    mVpY = other.mVpY;
+    mVpWidth = other.mVpWidth;
+    mVpHeight = other.mVpHeight;
+
+    mTarget = other.mTarget;
+    mFovY = other.mFovY;
+    mNearDist = other.mNearDist;
+    mFarDist = other.mFarDist;
+    
+    mViewMatrix = other.mViewMatrix;
+    mProjectionMatrix = other.mProjectionMatrix;
+
+    return *this;
+}
+
+Camera::Camera(const Camera& other)
+{
+    *this = other;
+}
+
+Camera::~Camera()
+{
+}
+
+
+void Camera::setViewport(uint offsetx, uint offsety, uint width, uint height)
+{
+    mVpX = offsetx;
+    mVpY = offsety;
+    mVpWidth = width;
+    mVpHeight = height;
+    
+    mProjIsUptodate = false;
+}
+
+void Camera::setViewport(uint width, uint height)
+{
+    mVpWidth = width;
+    mVpHeight = height;
+    
+    mProjIsUptodate = false;
+}
+
+void Camera::setFovY(float value)
+{
+    mFovY = value;
+    mProjIsUptodate = false;
+}
+
+Vector3f Camera::direction(void) const
+{
+    return - (orientation() * Vector3f::UnitZ());
+}
+Vector3f Camera::up(void) const
+{
+    return orientation() * Vector3f::UnitY();
+}
+Vector3f Camera::right(void) const
+{
+    return orientation() * Vector3f::UnitX();
+}
+
+void Camera::setDirection(const Vector3f& newDirection)
+{
+    // TODO implement it computing the rotation between newDirection and current dir ?
+    Vector3f up = this->up();
+    
+    Matrix3f camAxes;
+
+    camAxes.col(2) = (-newDirection).normalized();
+    camAxes.col(0) = up.cross( camAxes.col(2) ).normalized();
+    camAxes.col(1) = camAxes.col(2).cross( camAxes.col(0) ).normalized();
+    setOrientation(Quaternionf(camAxes));
+    
+    mViewIsUptodate = false;
+}
+
+void Camera::setTarget(const Vector3f& target)
+{
+    mTarget = target;
+    if (!mTarget.isApprox(position()))
+    {
+        Vector3f newDirection = mTarget - position();
+        setDirection(newDirection.normalized());
+    }
+}
+
+void Camera::setPosition(const Vector3f& p)
+{
+    mFrame.position = p;
+    mViewIsUptodate = false;
+}
+
+void Camera::setOrientation(const Quaternionf& q)
+{
+    mFrame.orientation = q;
+    mViewIsUptodate = false;
+}
+
+void Camera::setFrame(const Frame& f)
+{
+  mFrame = f;
+  mViewIsUptodate = false;
+}
+
+void Camera::rotateAroundTarget(const Quaternionf& q)
+{
+    Matrix4f mrot, mt, mtm;
+    
+    // update the transform matrix
+    updateViewMatrix();
+    Vector3f t = mViewMatrix * mTarget;
+
+    mViewMatrix = Translation3f(t)
+                * q
+                * Translation3f(-t)
+                * mViewMatrix;
+    
+    Quaternionf qa(mViewMatrix.linear());
+    qa = qa.conjugate();
+    setOrientation(qa);
+    setPosition(- (qa * mViewMatrix.translation()) );
+
+    mViewIsUptodate = true;
+}
+
+void Camera::localRotate(const Quaternionf& q)
+{
+    float dist = (position() - mTarget).norm();
+    setOrientation(orientation() * q);
+    mTarget = position() + dist * direction();
+    mViewIsUptodate = false;
+}
+
+void Camera::zoom(float d)
+{
+    float dist = (position() - mTarget).norm();
+    if(dist > d)
+    {
+        setPosition(position() + direction() * d);
+        mViewIsUptodate = false;
+    }
+}
+
+void Camera::localTranslate(const Vector3f& t)
+{
+  Vector3f trans = orientation() * t;
+  setPosition( position() + trans );
+  setTarget( mTarget + trans );
+
+  mViewIsUptodate = false;
+}
+
+void Camera::updateViewMatrix(void) const
+{
+    if(!mViewIsUptodate)
+    {
+        Quaternionf q = orientation().conjugate();
+        mViewMatrix.linear() = q.toRotationMatrix();
+        mViewMatrix.translation() = - (mViewMatrix.linear() * position());
+
+        mViewIsUptodate = true;
+    }
+}
+
+const Affine3f& Camera::viewMatrix(void) const
+{
+  updateViewMatrix();
+  return mViewMatrix;
+}
+
+void Camera::updateProjectionMatrix(void) const
+{
+  if(!mProjIsUptodate)
+  {
+    mProjectionMatrix.setIdentity();
+    float aspect = float(mVpWidth)/float(mVpHeight);
+    float theta = mFovY*0.5;
+    float range = mFarDist - mNearDist;
+    float invtan = 1./tan(theta);
+
+    mProjectionMatrix(0,0) = invtan / aspect;
+    mProjectionMatrix(1,1) = invtan;
+    mProjectionMatrix(2,2) = -(mNearDist + mFarDist) / range;
+    mProjectionMatrix(3,2) = -1;
+    mProjectionMatrix(2,3) = -2 * mNearDist * mFarDist / range;
+    mProjectionMatrix(3,3) = 0;
+    
+    mProjIsUptodate = true;
+  }
+}
+
+const Matrix4f& Camera::projectionMatrix(void) const
+{
+  updateProjectionMatrix();
+  return mProjectionMatrix;
+}
+
+void Camera::activateGL(void)
+{
+  glViewport(vpX(), vpY(), vpWidth(), vpHeight());
+  gpu.loadMatrix(projectionMatrix(),GL_PROJECTION);
+  gpu.loadMatrix(viewMatrix().matrix(),GL_MODELVIEW);
+}
+
+
+Vector3f Camera::unProject(const Vector2f& uv, float depth) const
+{
+    Matrix4f inv = mViewMatrix.inverse().matrix();
+    return unProject(uv, depth, inv);
+}
+
+Vector3f Camera::unProject(const Vector2f& uv, float depth, const Matrix4f& invModelview) const
+{
+    updateViewMatrix();
+    updateProjectionMatrix();
+    
+    Vector3f a(2.*uv.x()/float(mVpWidth)-1., 2.*uv.y()/float(mVpHeight)-1., 1.);
+    a.x() *= depth/mProjectionMatrix(0,0);
+    a.y() *= depth/mProjectionMatrix(1,1);
+    a.z() = -depth;
+    // FIXME /\/|
+    Vector4f b = invModelview * Vector4f(a.x(), a.y(), a.z(), 1.);
+    return Vector3f(b.x(), b.y(), b.z());
+}

diff --git a/demos/opengl/camera.h b/demos/opengl/camera.h
new file mode 100644
index 0000000..15714d2
--- /dev/null
+++ b/demos/opengl/camera.h

@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CAMERA_H
+#define EIGEN_CAMERA_H
+
+#include <Eigen/Geometry>
+#include <QObject>
+// #include <frame.h>
+
+class Frame
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+    
+    inline Frame(const Eigen::Vector3f& pos = Eigen::Vector3f::Zero(),
+                 const Eigen::Quaternionf& o = Eigen::Quaternionf())
+      : orientation(o), position(pos)
+    {}
+    Frame lerp(float alpha, const Frame& other) const
+    {
+      return Frame((1.f-alpha)*position + alpha * other.position,
+                   orientation.slerp(alpha,other.orientation));
+    }
+
+    Eigen::Quaternionf orientation;
+    Eigen::Vector3f position;
+};
+
+class Camera
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+    Camera(void);
+    
+    Camera(const Camera& other);
+    
+    virtual ~Camera();
+    
+    Camera& operator=(const Camera& other);
+    
+    void setViewport(uint offsetx, uint offsety, uint width, uint height);
+    void setViewport(uint width, uint height);
+    
+    inline uint vpX(void) const { return mVpX; }
+    inline uint vpY(void) const { return mVpY; }
+    inline uint vpWidth(void) const { return mVpWidth; }
+    inline uint vpHeight(void) const { return mVpHeight; }
+
+    inline float fovY(void) const { return mFovY; }
+    void setFovY(float value);
+    
+    void setPosition(const Eigen::Vector3f& pos);
+    inline const Eigen::Vector3f& position(void) const { return mFrame.position; }
+
+    void setOrientation(const Eigen::Quaternionf& q);
+    inline const Eigen::Quaternionf& orientation(void) const { return mFrame.orientation; }
+
+    void setFrame(const Frame& f);
+    const Frame& frame(void) const { return mFrame; }
+    
+    void setDirection(const Eigen::Vector3f& newDirection);
+    Eigen::Vector3f direction(void) const;
+    void setUp(const Eigen::Vector3f& vectorUp);
+    Eigen::Vector3f up(void) const;
+    Eigen::Vector3f right(void) const;
+    
+    void setTarget(const Eigen::Vector3f& target);
+    inline const Eigen::Vector3f& target(void) { return mTarget; }
+    
+    const Eigen::Affine3f& viewMatrix(void) const;
+    const Eigen::Matrix4f& projectionMatrix(void) const;
+    
+    void rotateAroundTarget(const Eigen::Quaternionf& q);
+    void localRotate(const Eigen::Quaternionf& q);
+    void zoom(float d);
+    
+    void localTranslate(const Eigen::Vector3f& t);
+    
+    /** Setup OpenGL matrices and viewport */
+    void activateGL(void);
+    
+    Eigen::Vector3f unProject(const Eigen::Vector2f& uv, float depth, const Eigen::Matrix4f& invModelview) const;
+    Eigen::Vector3f unProject(const Eigen::Vector2f& uv, float depth) const;
+    
+  protected:
+    void updateViewMatrix(void) const;
+    void updateProjectionMatrix(void) const;
+
+  protected:
+
+    uint mVpX, mVpY;
+    uint mVpWidth, mVpHeight;
+
+    Frame mFrame;
+    
+    mutable Eigen::Affine3f mViewMatrix;
+    mutable Eigen::Matrix4f mProjectionMatrix;
+
+    mutable bool mViewIsUptodate;
+    mutable bool mProjIsUptodate;
+
+    // used by rotateAroundTarget
+    Eigen::Vector3f mTarget;
+    
+    float mFovY;
+    float mNearDist;
+    float mFarDist;
+};
+
+#endif // EIGEN_CAMERA_H

diff --git a/demos/opengl/gpuhelper.cpp b/demos/opengl/gpuhelper.cpp
new file mode 100644
index 0000000..fd236b1
--- /dev/null
+++ b/demos/opengl/gpuhelper.cpp

@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "gpuhelper.h"
+#include "icosphere.h"
+#include <GL/glu.h>
+// PLEASE don't look at this old code... ;)
+
+#include <fstream>
+#include <algorithm>
+
+GpuHelper gpu;
+
+GpuHelper::GpuHelper()
+{
+    mVpWidth = mVpHeight = 0;
+    mCurrentMatrixTarget = 0;
+    mInitialized = false;
+}
+
+GpuHelper::~GpuHelper()
+{
+}
+
+void GpuHelper::pushProjectionMode2D(ProjectionMode2D pm)
+{
+    // switch to 2D projection
+    pushMatrix(Matrix4f::Identity(),GL_PROJECTION);
+
+    if(pm==PM_Normalized)
+    {
+        //glOrtho(-1., 1., -1., 1., 0., 1.);
+    }
+    else if(pm==PM_Viewport)
+    {
+        GLint vp[4];
+        glGetIntegerv(GL_VIEWPORT, vp);
+        glOrtho(0., vp[2], 0., vp[3], -1., 1.);
+    }
+
+    pushMatrix(Matrix4f::Identity(),GL_MODELVIEW);
+}
+
+void GpuHelper::popProjectionMode2D(void)
+{
+    popMatrix(GL_PROJECTION);
+    popMatrix(GL_MODELVIEW);
+}
+
+void GpuHelper::drawVector(const Vector3f& position, const Vector3f& vec, const Color& color, float aspect /* = 50.*/)
+{
+    static GLUquadricObj *cylindre = gluNewQuadric();
+    glColor4fv(color.data());
+    float length = vec.norm();
+    pushMatrix(GL_MODELVIEW);
+    glTranslatef(position.x(), position.y(), position.z());
+    Vector3f ax = Matrix3f::Identity().col(2).cross(vec);
+    ax.normalize();
+    Vector3f tmp = vec;
+    tmp.normalize();
+    float angle = 180.f/M_PI * acos(tmp.z());
+    if (angle>1e-3)
+        glRotatef(angle, ax.x(), ax.y(), ax.z());
+    gluCylinder(cylindre, length/aspect, length/aspect, 0.8*length, 10, 10);
+    glTranslatef(0.0,0.0,0.8*length);
+    gluCylinder(cylindre, 2.0*length/aspect, 0.0, 0.2*length, 10, 10);
+
+    popMatrix(GL_MODELVIEW);
+}
+
+void GpuHelper::drawVectorBox(const Vector3f& position, const Vector3f& vec, const Color& color, float aspect)
+{
+    static GLUquadricObj *cylindre = gluNewQuadric();
+    glColor4fv(color.data());
+    float length = vec.norm();
+    pushMatrix(GL_MODELVIEW);
+    glTranslatef(position.x(), position.y(), position.z());
+    Vector3f ax = Matrix3f::Identity().col(2).cross(vec);
+    ax.normalize();
+    Vector3f tmp = vec;
+    tmp.normalize();
+    float angle = 180.f/M_PI * acos(tmp.z());
+    if (angle>1e-3)
+        glRotatef(angle, ax.x(), ax.y(), ax.z());
+    gluCylinder(cylindre, length/aspect, length/aspect, 0.8*length, 10, 10);
+    glTranslatef(0.0,0.0,0.8*length);
+    glScalef(4.0*length/aspect,4.0*length/aspect,4.0*length/aspect);
+    drawUnitCube();
+    popMatrix(GL_MODELVIEW);
+}
+
+void GpuHelper::drawUnitCube(void)
+{
+    static float vertices[][3] = {
+        {-0.5,-0.5,-0.5},
+        { 0.5,-0.5,-0.5},
+        {-0.5, 0.5,-0.5},
+        { 0.5, 0.5,-0.5},
+        {-0.5,-0.5, 0.5},
+        { 0.5,-0.5, 0.5},
+        {-0.5, 0.5, 0.5},
+        { 0.5, 0.5, 0.5}};
+
+    glBegin(GL_QUADS);
+    glNormal3f(0,0,-1); glVertex3fv(vertices[0]); glVertex3fv(vertices[2]); glVertex3fv(vertices[3]); glVertex3fv(vertices[1]);
+    glNormal3f(0,0, 1); glVertex3fv(vertices[4]); glVertex3fv(vertices[5]); glVertex3fv(vertices[7]); glVertex3fv(vertices[6]);
+    glNormal3f(0,-1,0); glVertex3fv(vertices[0]); glVertex3fv(vertices[1]); glVertex3fv(vertices[5]); glVertex3fv(vertices[4]);
+    glNormal3f(0, 1,0); glVertex3fv(vertices[2]); glVertex3fv(vertices[6]); glVertex3fv(vertices[7]); glVertex3fv(vertices[3]);
+    glNormal3f(-1,0,0); glVertex3fv(vertices[0]); glVertex3fv(vertices[4]); glVertex3fv(vertices[6]); glVertex3fv(vertices[2]);
+    glNormal3f( 1,0,0); glVertex3fv(vertices[1]); glVertex3fv(vertices[3]); glVertex3fv(vertices[7]); glVertex3fv(vertices[5]);
+    glEnd();
+}
+
+void GpuHelper::drawUnitSphere(int level)
+{
+  static IcoSphere sphere;
+  sphere.draw(level);
+}
+
+

diff --git a/demos/opengl/gpuhelper.h b/demos/opengl/gpuhelper.h
new file mode 100644
index 0000000..9ff98e9
--- /dev/null
+++ b/demos/opengl/gpuhelper.h

@@ -0,0 +1,207 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPUHELPER_H
+#define EIGEN_GPUHELPER_H
+
+#include <Eigen/Geometry>
+#include <GL/gl.h>
+#include <vector>
+
+using namespace Eigen;
+
+typedef Vector4f Color;
+
+class GpuHelper
+{
+  public:
+
+    GpuHelper();
+
+    ~GpuHelper();
+
+    enum ProjectionMode2D { PM_Normalized = 1, PM_Viewport = 2 };
+    void pushProjectionMode2D(ProjectionMode2D pm);
+    void popProjectionMode2D();
+
+    /** Multiply the OpenGL matrix \a matrixTarget by the matrix \a mat.
+        Essentially, this helper function automatically calls glMatrixMode(matrixTarget) if required
+        and does a proper call to the right glMultMatrix*() function according to the scalar type
+        and storage order.
+        \warning glMatrixMode() must never be called directly. If your're unsure, use forceMatrixMode().
+        \sa Matrix, loadMatrix(), forceMatrixMode()
+    */
+    template<typename Scalar, int _Flags>
+    void multMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget);
+
+    /** Load the matrix \a mat to the OpenGL matrix \a matrixTarget.
+        Essentially, this helper function automatically calls glMatrixMode(matrixTarget) if required
+        and does a proper call to the right glLoadMatrix*() or glLoadIdentity() function according to the scalar type
+        and storage order.
+        \warning glMatrixMode() must never be called directly. If your're unsure, use forceMatrixMode().
+        \sa Matrix, multMatrix(), forceMatrixMode()
+    */
+    template<typename Scalar, int _Flags>
+    void loadMatrix(const Eigen::Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget);
+
+    template<typename Scalar, typename Derived>
+    void loadMatrix(
+        const Eigen::CwiseNullaryOp<Eigen::internal::scalar_identity_op<Scalar>,Derived>&,
+        GLenum matrixTarget);
+
+    /** Make the matrix \a matrixTarget the current OpenGL matrix target.
+        Call this function before loadMatrix() or multMatrix() if you cannot guarantee that glMatrixMode()
+        has never been called after the last loadMatrix() or multMatrix() calls.
+        \todo provides a debug mode checking the sanity of the cached matrix mode.
+    */
+    inline void forceMatrixTarget(GLenum matrixTarget) {glMatrixMode(mCurrentMatrixTarget=matrixTarget);}
+
+    inline void setMatrixTarget(GLenum matrixTarget);
+
+    /** Push the OpenGL matrix \a matrixTarget and load \a mat.
+    */
+    template<typename Scalar, int _Flags>
+    inline void pushMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget);
+
+    template<typename Scalar, typename Derived>
+    void pushMatrix(
+        const Eigen::CwiseNullaryOp<Eigen::internal::scalar_identity_op<Scalar>,Derived>&,
+        GLenum matrixTarget);
+
+    /** Push and clone the OpenGL matrix \a matrixTarget
+    */
+    inline void pushMatrix(GLenum matrixTarget);
+
+    /** Pop the OpenGL matrix \a matrixTarget
+    */
+    inline void popMatrix(GLenum matrixTarget);
+
+    void drawVector(const Vector3f& position, const Vector3f& vec, const Color& color, float aspect = 50.);
+    void drawVectorBox(const Vector3f& position, const Vector3f& vec, const Color& color, float aspect = 50.);
+    void drawUnitCube(void);
+    void drawUnitSphere(int level=0);
+
+    /// draw the \a nofElement first elements
+    inline void draw(GLenum mode, uint nofElement);
+
+    /// draw a range of elements
+    inline void draw(GLenum mode, uint start, uint end);
+
+    /// draw an indexed subset
+    inline void draw(GLenum mode, const std::vector<uint>* pIndexes);
+
+protected:
+
+    void update(void);
+
+    GLuint mColorBufferId;
+    int mVpWidth, mVpHeight;
+    GLenum mCurrentMatrixTarget;
+    bool mInitialized;
+};
+
+/** Singleton shortcut
+*/
+extern GpuHelper gpu;
+
+
+/** \internal
+*/
+template<bool RowMajor, int _Flags> struct GlMatrixHelper;
+
+template<int _Flags> struct GlMatrixHelper<false,_Flags>
+{
+    static void loadMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glLoadMatrixf(mat.data()); }
+    static void loadMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glLoadMatrixd(mat.data()); }
+    static void multMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glMultMatrixf(mat.data()); }
+    static void multMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glMultMatrixd(mat.data()); }
+};
+
+template<int _Flags> struct GlMatrixHelper<true,_Flags>
+{
+    static void loadMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glLoadMatrixf(mat.transpose().eval().data()); }
+    static void loadMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glLoadMatrixd(mat.transpose().eval().data()); }
+    static void multMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glMultMatrixf(mat.transpose().eval().data()); }
+    static void multMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glMultMatrixd(mat.transpose().eval().data()); }
+};
+
+inline void GpuHelper::setMatrixTarget(GLenum matrixTarget)
+{
+    if (matrixTarget != mCurrentMatrixTarget)
+        glMatrixMode(mCurrentMatrixTarget=matrixTarget);
+}
+
+template<typename Scalar, int _Flags>
+void GpuHelper::multMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget)
+{
+    setMatrixTarget(matrixTarget);
+    GlMatrixHelper<_Flags&Eigen::RowMajorBit, _Flags>::multMatrix(mat);
+}
+
+template<typename Scalar, typename Derived>
+void GpuHelper::loadMatrix(
+    const Eigen::CwiseNullaryOp<Eigen::internal::scalar_identity_op<Scalar>,Derived>&,
+    GLenum matrixTarget)
+{
+    setMatrixTarget(matrixTarget);
+    glLoadIdentity();
+}
+
+template<typename Scalar, int _Flags>
+void GpuHelper::loadMatrix(const Eigen::Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget)
+{
+    setMatrixTarget(matrixTarget);
+    GlMatrixHelper<(_Flags&Eigen::RowMajorBit)!=0, _Flags>::loadMatrix(mat);
+}
+
+inline void GpuHelper::pushMatrix(GLenum matrixTarget)
+{
+    setMatrixTarget(matrixTarget);
+    glPushMatrix();
+}
+
+template<typename Scalar, int _Flags>
+inline void GpuHelper::pushMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget)
+{
+    pushMatrix(matrixTarget);
+    GlMatrixHelper<_Flags&Eigen::RowMajorBit,_Flags>::loadMatrix(mat);
+}
+
+template<typename Scalar, typename Derived>
+void GpuHelper::pushMatrix(
+    const Eigen::CwiseNullaryOp<Eigen::internal::scalar_identity_op<Scalar>,Derived>&,
+    GLenum matrixTarget)
+{
+    pushMatrix(matrixTarget);
+    glLoadIdentity();
+}
+
+inline void GpuHelper::popMatrix(GLenum matrixTarget)
+{
+    setMatrixTarget(matrixTarget);
+    glPopMatrix();
+}
+
+inline void GpuHelper::draw(GLenum mode, uint nofElement)
+{
+    glDrawArrays(mode, 0, nofElement);
+}
+
+
+inline void GpuHelper::draw(GLenum mode, const std::vector<uint>* pIndexes)
+{
+    glDrawElements(mode, pIndexes->size(), GL_UNSIGNED_INT, &(pIndexes->front()));
+}
+
+inline void GpuHelper::draw(GLenum mode, uint start, uint end)
+{
+    glDrawArrays(mode, start, end-start);
+}
+
+#endif // EIGEN_GPUHELPER_H

diff --git a/demos/opengl/icosphere.cpp b/demos/opengl/icosphere.cpp
new file mode 100644
index 0000000..39444cb
--- /dev/null
+++ b/demos/opengl/icosphere.cpp

@@ -0,0 +1,120 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "icosphere.h"
+
+#include <GL/gl.h>
+#include <map>
+
+using namespace Eigen;
+
+//--------------------------------------------------------------------------------
+// icosahedron data
+//--------------------------------------------------------------------------------
+#define X .525731112119133606
+#define Z .850650808352039932
+
+static GLfloat vdata[12][3] = {
+   {-X, 0.0, Z}, {X, 0.0, Z}, {-X, 0.0, -Z}, {X, 0.0, -Z},
+   {0.0, Z, X}, {0.0, Z, -X}, {0.0, -Z, X}, {0.0, -Z, -X},
+   {Z, X, 0.0}, {-Z, X, 0.0}, {Z, -X, 0.0}, {-Z, -X, 0.0}
+};
+
+static GLint tindices[20][3] = {
+   {0,4,1}, {0,9,4}, {9,5,4}, {4,5,8}, {4,8,1},
+   {8,10,1}, {8,3,10}, {5,3,8}, {5,2,3}, {2,7,3},
+   {7,10,3}, {7,6,10}, {7,11,6}, {11,0,6}, {0,1,6},
+   {6,1,10}, {9,0,11}, {9,11,2}, {9,2,5}, {7,2,11} };
+//--------------------------------------------------------------------------------
+
+IcoSphere::IcoSphere(unsigned int levels)
+{
+  // init with an icosahedron
+  for (int i = 0; i < 12; i++)
+    mVertices.push_back(Map<Vector3f>(vdata[i]));
+  mIndices.push_back(new std::vector<int>);
+  std::vector<int>& indices = *mIndices.back();
+  for (int i = 0; i < 20; i++)
+  {
+    for (int k = 0; k < 3; k++)
+      indices.push_back(tindices[i][k]);
+  }
+  mListIds.push_back(0);
+
+  while(mIndices.size()<levels)
+    _subdivide();
+}
+
+const std::vector<int>& IcoSphere::indices(int level) const
+{
+  while (level>=int(mIndices.size()))
+    const_cast<IcoSphere*>(this)->_subdivide();
+  return *mIndices[level];
+}
+
+void IcoSphere::_subdivide(void)
+{
+  typedef unsigned long long Key;
+  std::map<Key,int> edgeMap;
+  const std::vector<int>& indices = *mIndices.back();
+  mIndices.push_back(new std::vector<int>);
+  std::vector<int>& refinedIndices = *mIndices.back();
+  int end = indices.size();
+  for (int i=0; i<end; i+=3)
+  {
+    int ids0[3],  // indices of outer vertices
+        ids1[3];  // indices of edge vertices
+    for (int k=0; k<3; ++k)
+    {
+      int k1 = (k+1)%3;
+      int e0 = indices[i+k];
+      int e1 = indices[i+k1];
+      ids0[k] = e0;
+      if (e1>e0)
+        std::swap(e0,e1);
+      Key edgeKey = Key(e0) | (Key(e1)<<32);
+      std::map<Key,int>::iterator it = edgeMap.find(edgeKey);
+      if (it==edgeMap.end())
+      {
+        ids1[k] = mVertices.size();
+        edgeMap[edgeKey] = ids1[k];
+        mVertices.push_back( (mVertices[e0]+mVertices[e1]).normalized() );
+      }
+      else
+        ids1[k] = it->second;
+    }
+    refinedIndices.push_back(ids0[0]); refinedIndices.push_back(ids1[0]); refinedIndices.push_back(ids1[2]);
+    refinedIndices.push_back(ids0[1]); refinedIndices.push_back(ids1[1]); refinedIndices.push_back(ids1[0]);
+    refinedIndices.push_back(ids0[2]); refinedIndices.push_back(ids1[2]); refinedIndices.push_back(ids1[1]);
+    refinedIndices.push_back(ids1[0]); refinedIndices.push_back(ids1[1]); refinedIndices.push_back(ids1[2]);
+  }
+  mListIds.push_back(0);
+}
+
+void IcoSphere::draw(int level)
+{
+  while (level>=int(mIndices.size()))
+    const_cast<IcoSphere*>(this)->_subdivide();
+  if (mListIds[level]==0)
+  {
+    mListIds[level] = glGenLists(1);
+    glNewList(mListIds[level], GL_COMPILE);
+      glVertexPointer(3, GL_FLOAT, 0, mVertices[0].data());
+      glNormalPointer(GL_FLOAT, 0, mVertices[0].data());
+      glEnableClientState(GL_VERTEX_ARRAY);
+      glEnableClientState(GL_NORMAL_ARRAY);
+      glDrawElements(GL_TRIANGLES, mIndices[level]->size(), GL_UNSIGNED_INT, &(mIndices[level]->at(0)));
+      glDisableClientState(GL_VERTEX_ARRAY);
+      glDisableClientState(GL_NORMAL_ARRAY);
+    glEndList();
+  }
+  glCallList(mListIds[level]);
+}
+
+

diff --git a/demos/opengl/icosphere.h b/demos/opengl/icosphere.h
new file mode 100644
index 0000000..b0210ed
--- /dev/null
+++ b/demos/opengl/icosphere.h

@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ICOSPHERE_H
+#define EIGEN_ICOSPHERE_H
+
+#include <Eigen/Core>
+#include <vector>
+
+class IcoSphere
+{
+  public:
+    IcoSphere(unsigned int levels=1);
+    const std::vector<Eigen::Vector3f>& vertices() const { return mVertices; }
+    const std::vector<int>& indices(int level) const;
+    void draw(int level);
+  protected:
+    void _subdivide();
+    std::vector<Eigen::Vector3f> mVertices;
+    std::vector<std::vector<int>*> mIndices;
+    std::vector<int> mListIds;
+};
+
+#endif // EIGEN_ICOSPHERE_H

diff --git a/demos/opengl/quaternion_demo.cpp b/demos/opengl/quaternion_demo.cpp
new file mode 100644
index 0000000..dd323a4
--- /dev/null
+++ b/demos/opengl/quaternion_demo.cpp

@@ -0,0 +1,656 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "quaternion_demo.h"
+#include "icosphere.h"
+
+#include <Eigen/Geometry>
+#include <Eigen/QR>
+#include <Eigen/LU>
+
+#include <iostream>
+#include <QEvent>
+#include <QMouseEvent>
+#include <QInputDialog>
+#include <QGridLayout>
+#include <QButtonGroup>
+#include <QRadioButton>
+#include <QDockWidget>
+#include <QPushButton>
+#include <QGroupBox>
+
+using namespace Eigen;
+
+class FancySpheres
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+    
+    FancySpheres()
+    {
+      const int levels = 4;
+      const float scale = 0.33;
+      float radius = 100;
+      std::vector<int> parents;
+
+      // leval 0
+      mCenters.push_back(Vector3f::Zero());
+      parents.push_back(-1);
+      mRadii.push_back(radius);
+
+      // generate level 1 using icosphere vertices
+      radius *= 0.45;
+      {
+        float dist = mRadii[0]*0.9;
+        for (int i=0; i<12; ++i)
+        {
+          mCenters.push_back(mIcoSphere.vertices()[i] * dist);
+          mRadii.push_back(radius);
+          parents.push_back(0);
+        }
+      }
+
+      static const float angles [10] = {
+        0, 0,
+        M_PI, 0.*M_PI,
+        M_PI, 0.5*M_PI,
+        M_PI, 1.*M_PI,
+        M_PI, 1.5*M_PI
+      };
+
+      // generate other levels
+      int start = 1;
+      for (int l=1; l<levels; l++)
+      {
+        radius *= scale;
+        int end = mCenters.size();
+        for (int i=start; i<end; ++i)
+        {
+          Vector3f c = mCenters[i];
+          Vector3f ax0 = (c - mCenters[parents[i]]).normalized();
+          Vector3f ax1 = ax0.unitOrthogonal();
+          Quaternionf q;
+          q.setFromTwoVectors(Vector3f::UnitZ(), ax0);
+          Affine3f t = Translation3f(c) * q * Scaling(mRadii[i]+radius);
+          for (int j=0; j<5; ++j)
+          {
+            Vector3f newC = c + ( (AngleAxisf(angles[j*2+1], ax0)
+                                * AngleAxisf(angles[j*2+0] * (l==1 ? 0.35 : 0.5), ax1)) * ax0)
+                                * (mRadii[i] + radius*0.8);
+            mCenters.push_back(newC);
+            mRadii.push_back(radius);
+            parents.push_back(i);
+          }
+        }
+        start = end;
+      }
+    }
+
+    void draw()
+    {
+      int end = mCenters.size();
+      glEnable(GL_NORMALIZE);
+      for (int i=0; i<end; ++i)
+      {
+        Affine3f t = Translation3f(mCenters[i]) * Scaling(mRadii[i]);
+        gpu.pushMatrix(GL_MODELVIEW);
+        gpu.multMatrix(t.matrix(),GL_MODELVIEW);
+        mIcoSphere.draw(2);
+        gpu.popMatrix(GL_MODELVIEW);
+      }
+      glDisable(GL_NORMALIZE);
+    }
+  protected:
+    std::vector<Vector3f> mCenters;
+    std::vector<float> mRadii;
+    IcoSphere mIcoSphere;
+};
+
+
+// generic linear interpolation method
+template<typename T> T lerp(float t, const T& a, const T& b)
+{
+  return a*(1-t) + b*t;
+}
+
+// quaternion slerp
+template<> Quaternionf lerp(float t, const Quaternionf& a, const Quaternionf& b)
+{ return a.slerp(t,b); }
+
+// linear interpolation of a frame using the type OrientationType
+// to perform the interpolation of the orientations
+template<typename OrientationType>
+inline static Frame lerpFrame(float alpha, const Frame& a, const Frame& b)
+{
+  return Frame(lerp(alpha,a.position,b.position),
+               Quaternionf(lerp(alpha,OrientationType(a.orientation),OrientationType(b.orientation))));
+}
+
+template<typename _Scalar> class EulerAngles
+{
+public:
+  enum { Dim = 3 };
+  typedef _Scalar Scalar;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+
+protected:
+
+  Vector3 m_angles;
+
+public:
+
+  EulerAngles() {}
+  inline EulerAngles(Scalar a0, Scalar a1, Scalar a2) : m_angles(a0, a1, a2) {}
+  inline EulerAngles(const QuaternionType& q) { *this = q; }
+
+  const Vector3& coeffs() const { return m_angles; }
+  Vector3& coeffs() { return m_angles; }
+
+  EulerAngles& operator=(const QuaternionType& q)
+  {
+    Matrix3 m = q.toRotationMatrix();
+    return *this = m;
+  }
+
+  EulerAngles& operator=(const Matrix3& m)
+  {
+    // mat =  cy*cz          -cy*sz           sy
+    //        cz*sx*sy+cx*sz  cx*cz-sx*sy*sz -cy*sx
+    //       -cx*cz*sy+sx*sz  cz*sx+cx*sy*sz  cx*cy
+    m_angles.coeffRef(1) = std::asin(m.coeff(0,2));
+    m_angles.coeffRef(0) = std::atan2(-m.coeff(1,2),m.coeff(2,2));
+    m_angles.coeffRef(2) = std::atan2(-m.coeff(0,1),m.coeff(0,0));
+    return *this;
+  }
+
+  Matrix3 toRotationMatrix(void) const
+  {
+    Vector3 c = m_angles.array().cos();
+    Vector3 s = m_angles.array().sin();
+    Matrix3 res;
+    res <<  c.y()*c.z(),                    -c.y()*s.z(),                   s.y(),
+            c.z()*s.x()*s.y()+c.x()*s.z(),  c.x()*c.z()-s.x()*s.y()*s.z(),  -c.y()*s.x(),
+            -c.x()*c.z()*s.y()+s.x()*s.z(), c.z()*s.x()+c.x()*s.y()*s.z(),  c.x()*c.y();
+    return res;
+  }
+
+  operator QuaternionType() { return QuaternionType(toRotationMatrix()); }
+};
+
+// Euler angles slerp
+template<> EulerAngles<float> lerp(float t, const EulerAngles<float>& a, const EulerAngles<float>& b)
+{
+  EulerAngles<float> res;
+  res.coeffs() = lerp(t, a.coeffs(), b.coeffs());
+  return res;
+}
+
+
+RenderingWidget::RenderingWidget()
+{
+  mAnimate = false;
+  mCurrentTrackingMode = TM_NO_TRACK;
+  mNavMode = NavTurnAround;
+  mLerpMode = LerpQuaternion;
+  mRotationMode = RotationStable;
+  mTrackball.setCamera(&mCamera);
+
+  // required to capture key press events
+  setFocusPolicy(Qt::ClickFocus);
+}
+
+void RenderingWidget::grabFrame(void)
+{
+    // ask user for a time
+    bool ok = false;
+    double t = 0;
+    if (!m_timeline.empty())
+      t = (--m_timeline.end())->first + 1.;
+    t = QInputDialog::getDouble(this, "Eigen's RenderingWidget", "time value: ",
+      t, 0, 1e3, 1, &ok);
+    if (ok)
+    {
+      Frame aux;
+      aux.orientation = mCamera.viewMatrix().linear();
+      aux.position = mCamera.viewMatrix().translation();
+      m_timeline[t] = aux;
+    }
+}
+
+void RenderingWidget::drawScene()
+{
+  static FancySpheres sFancySpheres;
+  float length = 50;
+  gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitX(), Color(1,0,0,1));
+  gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitY(), Color(0,1,0,1));
+  gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitZ(), Color(0,0,1,1));
+
+  // draw the fractal object
+  float sqrt3 = std::sqrt(3.);
+  glLightfv(GL_LIGHT0, GL_AMBIENT, Vector4f(0.5,0.5,0.5,1).data());
+  glLightfv(GL_LIGHT0, GL_DIFFUSE, Vector4f(0.5,1,0.5,1).data());
+  glLightfv(GL_LIGHT0, GL_SPECULAR, Vector4f(1,1,1,1).data());
+  glLightfv(GL_LIGHT0, GL_POSITION, Vector4f(-sqrt3,-sqrt3,sqrt3,0).data());
+
+  glLightfv(GL_LIGHT1, GL_AMBIENT, Vector4f(0,0,0,1).data());
+  glLightfv(GL_LIGHT1, GL_DIFFUSE, Vector4f(1,0.5,0.5,1).data());
+  glLightfv(GL_LIGHT1, GL_SPECULAR, Vector4f(1,1,1,1).data());
+  glLightfv(GL_LIGHT1, GL_POSITION, Vector4f(-sqrt3,sqrt3,-sqrt3,0).data());
+
+  glMaterialfv(GL_FRONT_AND_BACK, GL_AMBIENT, Vector4f(0.7, 0.7, 0.7, 1).data());
+  glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, Vector4f(0.8, 0.75, 0.6, 1).data());
+  glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, Vector4f(1, 1, 1, 1).data());
+  glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 64);
+
+  glEnable(GL_LIGHTING);
+  glEnable(GL_LIGHT0);
+  glEnable(GL_LIGHT1);
+
+  sFancySpheres.draw();
+  glVertexPointer(3, GL_FLOAT, 0, mVertices[0].data());
+  glNormalPointer(GL_FLOAT, 0, mNormals[0].data());
+  glEnableClientState(GL_VERTEX_ARRAY);
+  glEnableClientState(GL_NORMAL_ARRAY);
+  glDrawArrays(GL_TRIANGLES, 0, mVertices.size());
+  glDisableClientState(GL_VERTEX_ARRAY);
+  glDisableClientState(GL_NORMAL_ARRAY);
+
+  glDisable(GL_LIGHTING);
+}
+
+void RenderingWidget::animate()
+{
+  m_alpha += double(m_timer.interval()) * 1e-3;
+
+  TimeLine::const_iterator hi = m_timeline.upper_bound(m_alpha);
+  TimeLine::const_iterator lo = hi;
+  --lo;
+
+  Frame currentFrame;
+
+  if(hi==m_timeline.end())
+  {
+    // end
+    currentFrame = lo->second;
+    stopAnimation();
+  }
+  else if(hi==m_timeline.begin())
+  {
+    // start
+    currentFrame = hi->second;
+  }
+  else
+  {
+    float s = (m_alpha - lo->first)/(hi->first - lo->first);
+    if (mLerpMode==LerpEulerAngles)
+      currentFrame = ::lerpFrame<EulerAngles<float> >(s, lo->second, hi->second);
+    else if (mLerpMode==LerpQuaternion)
+      currentFrame = ::lerpFrame<Eigen::Quaternionf>(s, lo->second, hi->second);
+    else
+    {
+      std::cerr << "Invalid rotation interpolation mode (abort)\n";
+      exit(2);
+    }
+    currentFrame.orientation.coeffs().normalize();
+  }
+
+  currentFrame.orientation = currentFrame.orientation.inverse();
+  currentFrame.position = - (currentFrame.orientation * currentFrame.position);
+  mCamera.setFrame(currentFrame);
+
+  updateGL();
+}
+
+void RenderingWidget::keyPressEvent(QKeyEvent * e)
+{
+    switch(e->key())
+    {
+      case Qt::Key_Up:
+        mCamera.zoom(2);
+        break;
+      case Qt::Key_Down:
+        mCamera.zoom(-2);
+        break;
+      // add a frame
+      case Qt::Key_G:
+        grabFrame();
+        break;
+      // clear the time line
+      case Qt::Key_C:
+        m_timeline.clear();
+        break;
+      // move the camera to initial pos
+      case Qt::Key_R:
+        resetCamera();
+        break;
+      // start/stop the animation
+      case Qt::Key_A:
+        if (mAnimate)
+        {
+          stopAnimation();
+        }
+        else
+        {
+          m_alpha = 0;
+          connect(&m_timer, SIGNAL(timeout()), this, SLOT(animate()));
+          m_timer.start(1000/30);
+          mAnimate = true;
+        }
+        break;
+      default:
+        break;
+    }
+
+    updateGL();
+}
+
+void RenderingWidget::stopAnimation()
+{
+  disconnect(&m_timer, SIGNAL(timeout()), this, SLOT(animate()));
+  m_timer.stop();
+  mAnimate = false;
+  m_alpha = 0;
+}
+
+void RenderingWidget::mousePressEvent(QMouseEvent* e)
+{
+  mMouseCoords = Vector2i(e->pos().x(), e->pos().y());
+  bool fly = (mNavMode==NavFly) || (e->modifiers()&Qt::ControlModifier);
+  switch(e->button())
+  {
+    case Qt::LeftButton:
+      if(fly)
+      {
+        mCurrentTrackingMode = TM_LOCAL_ROTATE;
+        mTrackball.start(Trackball::Local);
+      }
+      else
+      {
+        mCurrentTrackingMode = TM_ROTATE_AROUND;
+        mTrackball.start(Trackball::Around);
+      }
+      mTrackball.track(mMouseCoords);
+      break;
+    case Qt::MidButton:
+      if(fly)
+        mCurrentTrackingMode = TM_FLY_Z;
+      else
+        mCurrentTrackingMode = TM_ZOOM;
+      break;
+    case Qt::RightButton:
+        mCurrentTrackingMode = TM_FLY_PAN;
+      break;
+    default:
+      break;
+  }
+}
+void RenderingWidget::mouseReleaseEvent(QMouseEvent*)
+{
+    mCurrentTrackingMode = TM_NO_TRACK;
+    updateGL();
+}
+
+void RenderingWidget::mouseMoveEvent(QMouseEvent* e)
+{
+    // tracking
+    if(mCurrentTrackingMode != TM_NO_TRACK)
+    {
+        float dx =   float(e->x() - mMouseCoords.x()) / float(mCamera.vpWidth());
+        float dy = - float(e->y() - mMouseCoords.y()) / float(mCamera.vpHeight());
+
+        // speedup the transformations
+        if(e->modifiers() & Qt::ShiftModifier)
+        {
+          dx *= 10.;
+          dy *= 10.;
+        }
+
+        switch(mCurrentTrackingMode)
+        {
+          case TM_ROTATE_AROUND:
+          case TM_LOCAL_ROTATE:
+            if (mRotationMode==RotationStable)
+            {
+              // use the stable trackball implementation mapping
+              // the 2D coordinates to 3D points on a sphere.
+              mTrackball.track(Vector2i(e->pos().x(), e->pos().y()));
+            }
+            else
+            {
+              // standard approach mapping the x and y displacements as rotations
+              // around the camera's X and Y axes.
+              Quaternionf q = AngleAxisf( dx*M_PI, Vector3f::UnitY())
+                            * AngleAxisf(-dy*M_PI, Vector3f::UnitX());
+              if (mCurrentTrackingMode==TM_LOCAL_ROTATE)
+                mCamera.localRotate(q);
+              else
+                mCamera.rotateAroundTarget(q);
+            }
+            break;
+          case TM_ZOOM :
+            mCamera.zoom(dy*100);
+            break;
+          case TM_FLY_Z :
+            mCamera.localTranslate(Vector3f(0, 0, -dy*200));
+            break;
+          case TM_FLY_PAN :
+            mCamera.localTranslate(Vector3f(dx*200, dy*200, 0));
+            break;
+          default:
+            break;
+        }
+
+        updateGL();
+    }
+
+    mMouseCoords = Vector2i(e->pos().x(), e->pos().y());
+}
+
+void RenderingWidget::paintGL()
+{
+  glEnable(GL_DEPTH_TEST);
+  glDisable(GL_CULL_FACE);
+  glPolygonMode(GL_FRONT_AND_BACK,GL_FILL);
+  glDisable(GL_COLOR_MATERIAL);
+  glDisable(GL_BLEND);
+  glDisable(GL_ALPHA_TEST);
+  glDisable(GL_TEXTURE_1D);
+  glDisable(GL_TEXTURE_2D);
+  glDisable(GL_TEXTURE_3D);
+
+  // Clear buffers
+  glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+  mCamera.activateGL();
+
+  drawScene();
+}
+
+void RenderingWidget::initializeGL()
+{
+  glClearColor(1., 1., 1., 0.);
+  glLightModeli(GL_LIGHT_MODEL_LOCAL_VIEWER, 1);
+  glDepthMask(GL_TRUE);
+  glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+
+  mCamera.setPosition(Vector3f(-200, -200, -200));
+  mCamera.setTarget(Vector3f(0, 0, 0));
+  mInitFrame.orientation = mCamera.orientation().inverse();
+  mInitFrame.position = mCamera.viewMatrix().translation();
+}
+
+void RenderingWidget::resizeGL(int width, int height)
+{
+    mCamera.setViewport(width,height);
+}
+
+void RenderingWidget::setNavMode(int m)
+{
+  mNavMode = NavMode(m);
+}
+
+void RenderingWidget::setLerpMode(int m)
+{
+  mLerpMode = LerpMode(m);
+}
+
+void RenderingWidget::setRotationMode(int m)
+{
+  mRotationMode = RotationMode(m);
+}
+
+void RenderingWidget::resetCamera()
+{
+  if (mAnimate)
+    stopAnimation();
+  m_timeline.clear();
+  Frame aux0 = mCamera.frame();
+  aux0.orientation = aux0.orientation.inverse();
+  aux0.position = mCamera.viewMatrix().translation();
+  m_timeline[0] = aux0;
+
+  Vector3f currentTarget = mCamera.target();
+  mCamera.setTarget(Vector3f::Zero());
+
+  // compute the rotation duration to move the camera to the target
+  Frame aux1 = mCamera.frame();
+  aux1.orientation = aux1.orientation.inverse();
+  aux1.position = mCamera.viewMatrix().translation();
+  float duration = aux0.orientation.angularDistance(aux1.orientation) * 0.9;
+  if (duration<0.1) duration = 0.1;
+
+  // put the camera at that time step:
+  aux1 = aux0.lerp(duration/2,mInitFrame);
+  // and make it look at the target again
+  aux1.orientation = aux1.orientation.inverse();
+  aux1.position = - (aux1.orientation * aux1.position);
+  mCamera.setFrame(aux1);
+  mCamera.setTarget(Vector3f::Zero());
+
+  // add this camera keyframe
+  aux1.orientation = aux1.orientation.inverse();
+  aux1.position = mCamera.viewMatrix().translation();
+  m_timeline[duration] = aux1;
+
+  m_timeline[2] = mInitFrame;
+  m_alpha = 0;
+  animate();
+  connect(&m_timer, SIGNAL(timeout()), this, SLOT(animate()));
+  m_timer.start(1000/30);
+  mAnimate = true;
+}
+
+QWidget* RenderingWidget::createNavigationControlWidget()
+{
+  QWidget* panel = new QWidget();
+  QVBoxLayout* layout = new QVBoxLayout();
+
+  {
+    QPushButton* but = new QPushButton("reset");
+    but->setToolTip("move the camera to initial position (with animation)");
+    layout->addWidget(but);
+    connect(but, SIGNAL(clicked()), this, SLOT(resetCamera()));
+  }
+  {
+    // navigation mode
+    QGroupBox* box = new QGroupBox("navigation mode");
+    QVBoxLayout* boxLayout = new QVBoxLayout;
+    QButtonGroup* group = new QButtonGroup(panel);
+    QRadioButton* but;
+    but = new QRadioButton("turn around");
+    but->setToolTip("look around an object");
+    group->addButton(but, NavTurnAround);
+    boxLayout->addWidget(but);
+    but = new QRadioButton("fly");
+    but->setToolTip("free navigation like a spaceship\n(this mode can also be enabled pressing the \"shift\" key)");
+    group->addButton(but, NavFly);
+    boxLayout->addWidget(but);
+    group->button(mNavMode)->setChecked(true);
+    connect(group, SIGNAL(buttonClicked(int)), this, SLOT(setNavMode(int)));
+    box->setLayout(boxLayout);
+    layout->addWidget(box);
+  }
+  {
+    // track ball, rotation mode
+    QGroupBox* box = new QGroupBox("rotation mode");
+    QVBoxLayout* boxLayout = new QVBoxLayout;
+    QButtonGroup* group = new QButtonGroup(panel);
+    QRadioButton* but;
+    but = new QRadioButton("stable trackball");
+    group->addButton(but, RotationStable);
+    boxLayout->addWidget(but);
+    but->setToolTip("use the stable trackball implementation mapping\nthe 2D coordinates to 3D points on a sphere");
+    but = new QRadioButton("standard rotation");
+    group->addButton(but, RotationStandard);
+    boxLayout->addWidget(but);
+    but->setToolTip("standard approach mapping the x and y displacements\nas rotations around the camera's X and Y axes");
+    group->button(mRotationMode)->setChecked(true);
+    connect(group, SIGNAL(buttonClicked(int)), this, SLOT(setRotationMode(int)));
+    box->setLayout(boxLayout);
+    layout->addWidget(box);
+  }
+  {
+    // interpolation mode
+    QGroupBox* box = new QGroupBox("spherical interpolation");
+    QVBoxLayout* boxLayout = new QVBoxLayout;
+    QButtonGroup* group = new QButtonGroup(panel);
+    QRadioButton* but;
+    but = new QRadioButton("quaternion slerp");
+    group->addButton(but, LerpQuaternion);
+    boxLayout->addWidget(but);
+    but->setToolTip("use quaternion spherical interpolation\nto interpolate orientations");
+    but = new QRadioButton("euler angles");
+    group->addButton(but, LerpEulerAngles);
+    boxLayout->addWidget(but);
+    but->setToolTip("use Euler angles to interpolate orientations");
+    group->button(mNavMode)->setChecked(true);
+    connect(group, SIGNAL(buttonClicked(int)), this, SLOT(setLerpMode(int)));
+    box->setLayout(boxLayout);
+    layout->addWidget(box);
+  }
+  layout->addItem(new QSpacerItem(0,0,QSizePolicy::Minimum,QSizePolicy::Expanding));
+  panel->setLayout(layout);
+  return panel;
+}
+
+QuaternionDemo::QuaternionDemo()
+{
+  mRenderingWidget = new RenderingWidget();
+  setCentralWidget(mRenderingWidget);
+
+  QDockWidget* panel = new QDockWidget("navigation", this);
+  panel->setAllowedAreas((QFlags<Qt::DockWidgetArea>)(Qt::RightDockWidgetArea | Qt::LeftDockWidgetArea));
+  addDockWidget(Qt::RightDockWidgetArea, panel);
+  panel->setWidget(mRenderingWidget->createNavigationControlWidget());
+}
+
+int main(int argc, char *argv[])
+{
+  std::cout << "Navigation:\n";
+  std::cout << "  left button:           rotate around the target\n";
+  std::cout << "  middle button:         zoom\n";
+  std::cout << "  left button + ctrl     quake rotate (rotate around camera position)\n";
+  std::cout << "  middle button + ctrl   walk (progress along camera's z direction)\n";
+  std::cout << "  left button:           pan (translate in the XY camera's plane)\n\n";
+  std::cout << "R : move the camera to initial position\n";
+  std::cout << "A : start/stop animation\n";
+  std::cout << "C : clear the animation\n";
+  std::cout << "G : add a key frame\n";
+
+  QApplication app(argc, argv);
+  QuaternionDemo demo;
+  demo.resize(600,500);
+  demo.show();
+  return app.exec();
+}
+
+#include "quaternion_demo.moc"
+

diff --git a/demos/opengl/quaternion_demo.h b/demos/opengl/quaternion_demo.h
new file mode 100644
index 0000000..dbff46c
--- /dev/null
+++ b/demos/opengl/quaternion_demo.h

@@ -0,0 +1,114 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_QUATERNION_DEMO_H
+#define EIGEN_QUATERNION_DEMO_H
+
+#include "gpuhelper.h"
+#include "camera.h"
+#include "trackball.h"
+#include <map>
+#include <QTimer>
+#include <QtGui/QApplication>
+#include <QtOpenGL/QGLWidget>
+#include <QtGui/QMainWindow>
+
+class RenderingWidget : public QGLWidget
+{
+  Q_OBJECT
+
+    typedef std::map<float,Frame> TimeLine;
+    TimeLine m_timeline;
+    Frame lerpFrame(float t);
+
+    Frame mInitFrame;
+    bool mAnimate;
+    float m_alpha;
+
+    enum TrackMode {
+      TM_NO_TRACK=0, TM_ROTATE_AROUND, TM_ZOOM,
+      TM_LOCAL_ROTATE, TM_FLY_Z, TM_FLY_PAN
+    };
+
+    enum NavMode {
+      NavTurnAround,
+      NavFly
+    };
+
+    enum LerpMode {
+      LerpQuaternion,
+      LerpEulerAngles
+    };
+
+    enum RotationMode {
+      RotationStable,
+      RotationStandard
+    };
+
+    Camera mCamera;
+    TrackMode mCurrentTrackingMode;
+    NavMode mNavMode;
+    LerpMode mLerpMode;
+    RotationMode mRotationMode;
+    Vector2i mMouseCoords;
+    Trackball mTrackball;
+
+    QTimer m_timer;
+
+    void setupCamera();
+
+    std::vector<Vector3f> mVertices;
+    std::vector<Vector3f> mNormals;
+    std::vector<int> mIndices;
+
+  protected slots:
+
+    virtual void animate(void);
+    virtual void drawScene(void);
+
+    virtual void grabFrame(void);
+    virtual void stopAnimation();
+
+    virtual void setNavMode(int);
+    virtual void setLerpMode(int);
+    virtual void setRotationMode(int);
+    virtual void resetCamera();
+
+  protected:
+
+    virtual void initializeGL();
+    virtual void resizeGL(int width, int height);
+    virtual void paintGL();
+    
+    //--------------------------------------------------------------------------------
+    virtual void mousePressEvent(QMouseEvent * e);
+    virtual void mouseReleaseEvent(QMouseEvent * e);
+    virtual void mouseMoveEvent(QMouseEvent * e);
+    virtual void keyPressEvent(QKeyEvent * e);
+    //--------------------------------------------------------------------------------
+
+  public: 
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+    
+    RenderingWidget();
+    ~RenderingWidget() { }
+
+    QWidget* createNavigationControlWidget();
+};
+
+class QuaternionDemo : public QMainWindow
+{
+  Q_OBJECT
+  public:
+    QuaternionDemo();
+  protected:
+    RenderingWidget* mRenderingWidget;
+};
+
+#endif // EIGEN_QUATERNION_DEMO_H

diff --git a/demos/opengl/trackball.cpp b/demos/opengl/trackball.cpp
new file mode 100644
index 0000000..7c2da8e
--- /dev/null
+++ b/demos/opengl/trackball.cpp

@@ -0,0 +1,59 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "trackball.h"
+#include "camera.h"
+
+using namespace Eigen;
+
+void Trackball::track(const Vector2i& point2D)
+{
+  if (mpCamera==0)
+    return;
+  Vector3f newPoint3D;
+  bool newPointOk = mapToSphere(point2D, newPoint3D);
+
+  if (mLastPointOk && newPointOk)
+  {
+    Vector3f axis = mLastPoint3D.cross(newPoint3D).normalized();
+    float cos_angle = mLastPoint3D.dot(newPoint3D);
+    if ( std::abs(cos_angle) < 1.0 )
+    {
+      float angle = 2. * acos(cos_angle);
+      if (mMode==Around)
+        mpCamera->rotateAroundTarget(Quaternionf(AngleAxisf(angle, axis)));
+      else
+        mpCamera->localRotate(Quaternionf(AngleAxisf(-angle, axis)));
+    }
+  }
+
+  mLastPoint3D = newPoint3D;
+  mLastPointOk = newPointOk;
+}
+
+bool Trackball::mapToSphere(const Vector2i& p2, Vector3f& v3)
+{
+  if ((p2.x() >= 0) && (p2.x() <= int(mpCamera->vpWidth())) &&
+      (p2.y() >= 0) && (p2.y() <= int(mpCamera->vpHeight())) )
+  {
+    double x  = (double)(p2.x() - 0.5*mpCamera->vpWidth())  / (double)mpCamera->vpWidth();
+    double y  = (double)(0.5*mpCamera->vpHeight() - p2.y()) / (double)mpCamera->vpHeight();
+    double sinx         = sin(M_PI * x * 0.5);
+    double siny         = sin(M_PI * y * 0.5);
+    double sinx2siny2   = sinx * sinx + siny * siny;
+
+    v3.x() = sinx;
+    v3.y() = siny;
+    v3.z() = sinx2siny2 < 1.0 ? sqrt(1.0 - sinx2siny2) : 0.0;
+
+    return true;
+  }
+  else
+    return false;
+}

diff --git a/demos/opengl/trackball.h b/demos/opengl/trackball.h
new file mode 100644
index 0000000..1ea842f
--- /dev/null
+++ b/demos/opengl/trackball.h

@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRACKBALL_H
+#define EIGEN_TRACKBALL_H
+
+#include <Eigen/Geometry>
+
+class Camera;
+
+class Trackball
+{
+  public:
+
+    enum Mode {Around, Local};
+
+    Trackball() : mpCamera(0) {}
+
+    void start(Mode m = Around) { mMode = m; mLastPointOk = false; }
+
+    void setCamera(Camera* pCam) { mpCamera = pCam; }
+
+    void track(const Eigen::Vector2i& newPoint2D);
+
+  protected:
+
+    bool mapToSphere( const Eigen::Vector2i& p2, Eigen::Vector3f& v3);
+
+    Camera* mpCamera;
+    Eigen::Vector3f mLastPoint3D;
+    Mode mMode;
+    bool mLastPointOk;
+
+};
+
+#endif // EIGEN_TRACKBALL_H

diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
new file mode 100644
index 0000000..18b4446
--- /dev/null
+++ b/doc/AsciiQuickReference.txt

@@ -0,0 +1,226 @@
+// A simple quickref for Eigen. Add anything that's missing.
+// Main author: Keir Mierle
+
+#include <Eigen/Dense>
+
+Matrix<double, 3, 3> A;               // Fixed rows and cols. Same as Matrix3d.
+Matrix<double, 3, Dynamic> B;         // Fixed rows, dynamic cols.
+Matrix<double, Dynamic, Dynamic> C;   // Full dynamic. Same as MatrixXd.
+Matrix<double, 3, 3, RowMajor> E;     // Row major; default is column-major.
+Matrix3f P, Q, R;                     // 3x3 float matrix.
+Vector3f x, y, z;                     // 3x1 float matrix.
+RowVector3f a, b, c;                  // 1x3 float matrix.
+VectorXd v;                           // Dynamic column vector of doubles
+double s;                            
+
+// Basic usage
+// Eigen          // Matlab           // comments
+x.size()          // length(x)        // vector size
+C.rows()          // size(C,1)        // number of rows
+C.cols()          // size(C,2)        // number of columns
+x(i)              // x(i+1)           // Matlab is 1-based
+C(i,j)            // C(i+1,j+1)       //
+
+A.resize(4, 4);   // Runtime error if assertions are on.
+B.resize(4, 9);   // Runtime error if assertions are on.
+A.resize(3, 3);   // Ok; size didn't change.
+B.resize(3, 9);   // Ok; only dynamic cols changed.
+                  
+A << 1, 2, 3,     // Initialize A. The elements can also be
+     4, 5, 6,     // matrices, which are stacked along cols
+     7, 8, 9;     // and then the rows are stacked.
+B << A, A, A;     // B is three horizontally stacked A's.
+A.fill(10);       // Fill A with all 10's.
+
+// Eigen                                    // Matlab
+MatrixXd::Identity(rows,cols)               // eye(rows,cols)
+C.setIdentity(rows,cols)                    // C = eye(rows,cols)
+MatrixXd::Zero(rows,cols)                   // zeros(rows,cols)
+C.setZero(rows,cols)                        // C = zeros(rows,cols)
+MatrixXd::Ones(rows,cols)                   // ones(rows,cols)
+C.setOnes(rows,cols)                        // C = ones(rows,cols)
+MatrixXd::Random(rows,cols)                 // rand(rows,cols)*2-1            // MatrixXd::Random returns uniform random numbers in (-1, 1).
+C.setRandom(rows,cols)                      // C = rand(rows,cols)*2-1
+VectorXd::LinSpaced(size,low,high)          // linspace(low,high,size)'
+v.setLinSpaced(size,low,high)               // v = linspace(low,high,size)'
+VectorXi::LinSpaced(((hi-low)/step)+1,      // low:step:hi
+                    low,low+step*(size-1))  //
+
+
+// Matrix slicing and blocks. All expressions listed here are read/write.
+// Templated size versions are faster. Note that Matlab is 1-based (a size N
+// vector is x(1)...x(N)).
+/******************************************************************************/
+/*                  PLEASE HELP US IMPROVING THIS SECTION                     */
+/* Eigen 3.4 supports a much improved API for sub-matrices, including,        */
+/* slicing and indexing from arrays:                                          */
+/* http://eigen.tuxfamily.org/dox-devel/group__TutorialSlicingIndexing.html   */
+/******************************************************************************/
+// Eigen                           // Matlab
+x.head(n)                          // x(1:n)
+x.head<n>()                        // x(1:n)
+x.tail(n)                          // x(end - n + 1: end)
+x.tail<n>()                        // x(end - n + 1: end)
+x.segment(i, n)                    // x(i+1 : i+n)
+x.segment<n>(i)                    // x(i+1 : i+n)
+P.block(i, j, rows, cols)          // P(i+1 : i+rows, j+1 : j+cols)
+P.block<rows, cols>(i, j)          // P(i+1 : i+rows, j+1 : j+cols)
+P.row(i)                           // P(i+1, :)
+P.col(j)                           // P(:, j+1)
+P.leftCols<cols>()                 // P(:, 1:cols)
+P.leftCols(cols)                   // P(:, 1:cols)
+P.middleCols<cols>(j)              // P(:, j+1:j+cols)
+P.middleCols(j, cols)              // P(:, j+1:j+cols)
+P.rightCols<cols>()                // P(:, end-cols+1:end)
+P.rightCols(cols)                  // P(:, end-cols+1:end)
+P.topRows<rows>()                  // P(1:rows, :)
+P.topRows(rows)                    // P(1:rows, :)
+P.middleRows<rows>(i)              // P(i+1:i+rows, :)
+P.middleRows(i, rows)              // P(i+1:i+rows, :)
+P.bottomRows<rows>()               // P(end-rows+1:end, :)
+P.bottomRows(rows)                 // P(end-rows+1:end, :)
+P.topLeftCorner(rows, cols)        // P(1:rows, 1:cols)
+P.topRightCorner(rows, cols)       // P(1:rows, end-cols+1:end)
+P.bottomLeftCorner(rows, cols)     // P(end-rows+1:end, 1:cols)
+P.bottomRightCorner(rows, cols)    // P(end-rows+1:end, end-cols+1:end)
+P.topLeftCorner<rows,cols>()       // P(1:rows, 1:cols)
+P.topRightCorner<rows,cols>()      // P(1:rows, end-cols+1:end)
+P.bottomLeftCorner<rows,cols>()    // P(end-rows+1:end, 1:cols)
+P.bottomRightCorner<rows,cols>()   // P(end-rows+1:end, end-cols+1:end)
+
+// Of particular note is Eigen's swap function which is highly optimized.
+// Eigen                           // Matlab
+R.row(i) = P.col(j);               // R(i, :) = P(:, j)
+R.col(j1).swap(mat1.col(j2));      // R(:, [j1 j2]) = R(:, [j2, j1])
+
+// Views, transpose, etc;
+/******************************************************************************/
+/*                  PLEASE HELP US IMPROVING THIS SECTION                     */
+/* Eigen 3.4 supports a new API for reshaping:                                */
+/* http://eigen.tuxfamily.org/dox-devel/group__TutorialReshape.html           */
+/******************************************************************************/
+// Eigen                           // Matlab
+R.adjoint()                        // R'
+R.transpose()                      // R.' or conj(R')       // Read-write
+R.diagonal()                       // diag(R)               // Read-write
+x.asDiagonal()                     // diag(x)
+R.transpose().colwise().reverse()  // rot90(R)              // Read-write
+R.rowwise().reverse()              // fliplr(R)
+R.colwise().reverse()              // flipud(R)
+R.replicate(i,j)                   // repmat(P,i,j)
+
+
+// All the same as Matlab, but matlab doesn't have *= style operators.
+// Matrix-vector.  Matrix-matrix.   Matrix-scalar.
+y  = M*x;          R  = P*Q;        R  = P*s;
+a  = b*M;          R  = P - Q;      R  = s*P;
+a *= M;            R  = P + Q;      R  = P/s;
+                   R *= Q;          R  = s*P;
+                   R += Q;          R *= s;
+                   R -= Q;          R /= s;
+
+// Vectorized operations on each element independently
+// Eigen                       // Matlab
+R = P.cwiseProduct(Q);         // R = P .* Q
+R = P.array() * s.array();     // R = P .* s
+R = P.cwiseQuotient(Q);        // R = P ./ Q
+R = P.array() / Q.array();     // R = P ./ Q
+R = P.array() + s.array();     // R = P + s
+R = P.array() - s.array();     // R = P - s
+R.array() += s;                // R = R + s
+R.array() -= s;                // R = R - s
+R.array() < Q.array();         // R < Q
+R.array() <= Q.array();        // R <= Q
+R.cwiseInverse();              // 1 ./ P
+R.array().inverse();           // 1 ./ P
+R.array().sin()                // sin(P)
+R.array().cos()                // cos(P)
+R.array().pow(s)               // P .^ s
+R.array().square()             // P .^ 2
+R.array().cube()               // P .^ 3
+R.cwiseSqrt()                  // sqrt(P)
+R.array().sqrt()               // sqrt(P)
+R.array().exp()                // exp(P)
+R.array().log()                // log(P)
+R.cwiseMax(P)                  // max(R, P)
+R.array().max(P.array())       // max(R, P)
+R.cwiseMin(P)                  // min(R, P)
+R.array().min(P.array())       // min(R, P)
+R.cwiseAbs()                   // abs(P)
+R.array().abs()                // abs(P)
+R.cwiseAbs2()                  // abs(P.^2)
+R.array().abs2()               // abs(P.^2)
+(R.array() < s).select(P,Q );  // (R < s ? P : Q)
+R = (Q.array()==0).select(P,R) // R(Q==0) = P(Q==0)
+R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P)   // with: scalar func(const scalar &x);
+
+
+// Reductions.
+int r, c;
+// Eigen                  // Matlab
+R.minCoeff()              // min(R(:))
+R.maxCoeff()              // max(R(:))
+s = R.minCoeff(&r, &c)    // [s, i] = min(R(:)); [r, c] = ind2sub(size(R), i);
+s = R.maxCoeff(&r, &c)    // [s, i] = max(R(:)); [r, c] = ind2sub(size(R), i);
+R.sum()                   // sum(R(:))
+R.colwise().sum()         // sum(R)
+R.rowwise().sum()         // sum(R, 2) or sum(R')'
+R.prod()                  // prod(R(:))
+R.colwise().prod()        // prod(R)
+R.rowwise().prod()        // prod(R, 2) or prod(R')'
+R.trace()                 // trace(R)
+R.all()                   // all(R(:))
+R.colwise().all()         // all(R)
+R.rowwise().all()         // all(R, 2)
+R.any()                   // any(R(:))
+R.colwise().any()         // any(R)
+R.rowwise().any()         // any(R, 2)
+
+// Dot products, norms, etc.
+// Eigen                  // Matlab
+x.norm()                  // norm(x).    Note that norm(R) doesn't work in Eigen.
+x.squaredNorm()           // dot(x, x)   Note the equivalence is not true for complex
+x.dot(y)                  // dot(x, y)
+x.cross(y)                // cross(x, y) Requires #include <Eigen/Geometry>
+
+//// Type conversion
+// Eigen                  // Matlab
+A.cast<double>();         // double(A)
+A.cast<float>();          // single(A)
+A.cast<int>();            // int32(A)
+A.real();                 // real(A)
+A.imag();                 // imag(A)
+// if the original type equals destination type, no work is done
+
+// Note that for most operations Eigen requires all operands to have the same type:
+MatrixXf F = MatrixXf::Zero(3,3);
+A += F;                // illegal in Eigen. In Matlab A = A+F is allowed
+A += F.cast<double>(); // F converted to double and then added (generally, conversion happens on-the-fly)
+
+// Eigen can map existing memory into Eigen matrices.
+float array[3];
+Vector3f::Map(array).fill(10);            // create a temporary Map over array and sets entries to 10
+int data[4] = {1, 2, 3, 4};
+Matrix2i mat2x2(data);                    // copies data into mat2x2
+Matrix2i::Map(data) = 2*mat2x2;           // overwrite elements of data with 2*mat2x2
+MatrixXi::Map(data, 2, 2) += mat2x2;      // adds mat2x2 to elements of data (alternative syntax if size is not know at compile time)
+
+// Solve Ax = b. Result stored in x. Matlab: x = A \ b.
+x = A.ldlt().solve(b));  // A sym. p.s.d.    #include <Eigen/Cholesky>
+x = A.llt() .solve(b));  // A sym. p.d.      #include <Eigen/Cholesky>
+x = A.lu()  .solve(b));  // Stable and fast. #include <Eigen/LU>
+x = A.qr()  .solve(b));  // No pivoting.     #include <Eigen/QR>
+x = A.svd() .solve(b));  // Stable, slowest. #include <Eigen/SVD>
+// .ldlt() -> .matrixL() and .matrixD()
+// .llt()  -> .matrixL()
+// .lu()   -> .matrixL() and .matrixU()
+// .qr()   -> .matrixQ() and .matrixR()
+// .svd()  -> .matrixU(), .singularValues(), and .matrixV()
+
+// Eigenvalue problems
+// Eigen                          // Matlab
+A.eigenvalues();                  // eig(A);
+EigenSolver<Matrix3d> eig(A);     // [vec val] = eig(A)
+eig.eigenvalues();                // diag(val)
+eig.eigenvectors();               // vec
+// For self-adjoint matrices use SelfAdjointEigenSolver<>

diff --git a/doc/B01_Experimental.dox b/doc/B01_Experimental.dox
new file mode 100644
index 0000000..e1f031d
--- /dev/null
+++ b/doc/B01_Experimental.dox

@@ -0,0 +1,52 @@
+namespace Eigen {
+
+/** \page Experimental Experimental parts of Eigen
+
+\eigenAutoToc
+
+\section Experimental_summary Summary
+
+With the 2.0 release, Eigen's API is, to a large extent, stable. However, we wish to retain the freedom to make API incompatible changes. To that effect, we call many parts of Eigen "experimental" which means that they are not subject to API stability guarantee.
+
+Our goal is that for the 2.1 release (expected in July 2009) most of these parts become API-stable too.
+
+We are aware that API stability is a major concern for our users. That's why it's a priority for us to reach it, but at the same time we're being serious about not calling Eigen API-stable too early.
+
+Experimental features may at any time:
+\li be removed;
+\li be subject to an API incompatible change;
+\li introduce API or ABI incompatible changes in your own code if you let them affect your API or ABI.
+
+\section Experimental_modules Experimental modules
+
+The following modules are considered entirely experimental, and we make no firm API stability guarantee about them for the time being:
+\li SVD
+\li QR
+\li Cholesky
+\li Sparse
+\li Geometry (this one should be mostly stable, but it's a little too early to make a formal guarantee)
+
+\section Experimental_core Experimental parts of the Core module
+
+In the Core module, the only classes subject to ABI stability guarantee (meaning that you can use it for data members in your public ABI) is:
+\li Matrix
+\li Map
+
+All other classes offer no ABI guarantee, e.g. the layout of their data can be changed.
+
+The only classes subject to (even partial) API stability guarantee (meaning that you can safely construct and use objects) are:
+\li MatrixBase : partial API stability (see below)
+\li Matrix : full API stability (except for experimental stuff inherited from MatrixBase)
+\li Map : full API stability (except for experimental stuff inherited from MatrixBase)
+
+All other classes offer no direct API guarantee, e.g. their methods can be changed; however notice that most classes inherit MatrixBase and that this is where most of their API comes from -- so in practice most of the API is stable.
+
+A few MatrixBase methods are considered experimental, hence not part of any API stability guarantee:
+\li all methods documented as internal
+\li all methods hidden in the Doxygen documentation
+\li all methods marked as experimental
+\li all methods defined in experimental modules
+
+*/
+
+}

diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 0000000..0f9ef23
--- /dev/null
+++ b/doc/CMakeLists.txt

@@ -0,0 +1,122 @@
+project(EigenDoc)
+
+set_directory_properties(PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+project(EigenDoc)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  if(CMAKE_SYSTEM_NAME MATCHES Linux)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O1 -g1")
+  endif()
+endif()
+
+# some examples and snippets needs c++11, so let's check it once
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+
+option(EIGEN_INTERNAL_DOCUMENTATION "Build internal documentation" OFF)
+option(EIGEN_DOC_USE_MATHJAX "Use MathJax for rendering math in HTML docs" ON)
+
+# Set some Doxygen flags
+set(EIGEN_DOXY_PROJECT_NAME             "Eigen")
+set(EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX  "")
+set(EIGEN_DOXY_INPUT                    "\"${Eigen_SOURCE_DIR}/Eigen\" \"${Eigen_SOURCE_DIR}/doc\"")
+set(EIGEN_DOXY_HTML_COLORSTYLE_HUE      "220")
+set(EIGEN_DOXY_TAGFILES                 "")
+
+if(EIGEN_INTERNAL_DOCUMENTATION)
+  set(EIGEN_DOXY_INTERNAL                 "YES")
+else()
+  set(EIGEN_DOXY_INTERNAL                 "NO")
+endif()
+
+if (EIGEN_DOC_USE_MATHJAX)
+  set(EIGEN_DOXY_USE_MATHJAX "YES")
+else ()
+  set(EIGEN_DOXY_USE_MATHJAX "NO")
+endif()
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+)
+
+set(EIGEN_DOXY_PROJECT_NAME             "Eigen-unsupported")
+set(EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX  "/unsupported")
+set(EIGEN_DOXY_INPUT                    "\"${Eigen_SOURCE_DIR}/unsupported/Eigen\" \"${Eigen_SOURCE_DIR}/unsupported/doc\"")
+set(EIGEN_DOXY_HTML_COLORSTYLE_HUE      "0")
+set(EIGEN_DOXY_TAGFILES                 "\"${Eigen_BINARY_DIR}/doc/Eigen.doxytags=..\"")
+#set(EIGEN_DOXY_TAGFILES                 "")
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile-unsupported
+)
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/eigendoxy_header.html.in
+  ${CMAKE_CURRENT_BINARY_DIR}/eigendoxy_header.html
+)
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/eigendoxy_footer.html.in
+  ${CMAKE_CURRENT_BINARY_DIR}/eigendoxy_footer.html
+)
+
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/eigendoxy_layout.xml.in
+  ${CMAKE_CURRENT_BINARY_DIR}/eigendoxy_layout.xml
+)
+
+configure_file(
+  ${Eigen_SOURCE_DIR}/unsupported/doc/eigendoxy_layout.xml.in
+  ${Eigen_BINARY_DIR}/doc/unsupported/eigendoxy_layout.xml
+)
+
+set(examples_targets "")
+set(snippets_targets "")
+
+add_definitions("-DEIGEN_MAKING_DOCS")
+add_custom_target(all_examples)
+
+add_subdirectory(examples)
+add_subdirectory(special_examples)
+add_subdirectory(snippets)
+
+add_custom_target(
+  doc-eigen-prerequisites
+  ALL
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/AsciiQuickReference.txt          ${CMAKE_CURRENT_BINARY_DIR}/html/
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_custom_target(
+  doc-unsupported-prerequisites
+  ALL
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${Eigen_BINARY_DIR}/doc/html/unsupported
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc
+)
+
+add_dependencies(doc-eigen-prerequisites all_snippets all_examples)
+add_dependencies(doc-unsupported-prerequisites unsupported_snippets unsupported_examples)
+
+add_custom_target(doc ALL
+  COMMAND doxygen
+  COMMAND doxygen Doxyfile-unsupported
+  COMMAND ${CMAKE_COMMAND} -E copy ${Eigen_BINARY_DIR}/doc/html/group__TopicUnalignedArrayAssert.html ${Eigen_BINARY_DIR}/doc/html/TopicUnalignedArrayAssert.html
+  COMMAND ${CMAKE_COMMAND} -E rename html eigen-doc
+  COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz eigen-doc/unsupported/_formulas.log eigen-doc/_formulas.log
+  COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc.tgz eigen-doc
+  COMMAND ${CMAKE_COMMAND} -E rename eigen-doc.tgz eigen-doc/eigen-doc.tgz
+  COMMAND ${CMAKE_COMMAND} -E rename eigen-doc html
+  WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc)
+
+add_dependencies(doc doc-eigen-prerequisites doc-unsupported-prerequisites)

diff --git a/doc/ClassHierarchy.dox b/doc/ClassHierarchy.dox
new file mode 100644
index 0000000..468e60a
--- /dev/null
+++ b/doc/ClassHierarchy.dox

@@ -0,0 +1,129 @@
+namespace Eigen {
+
+/** \page TopicClassHierarchy The class hierarchy
+
+This page explains the design of the core classes in Eigen's class hierarchy and how they fit together. Casual
+users probably need not concern themselves with these details, but it may be useful for both advanced users
+and Eigen developers.
+
+\eigenAutoToc
+
+
+\section TopicClassHierarchyPrinciples Principles
+
+Eigen's class hierarchy is designed so that virtual functions are avoided where their overhead would
+significantly impair performance. Instead, Eigen achieves polymorphism with the Curiously Recurring Template
+Pattern (CRTP). In this pattern, the base class (for instance, \c MatrixBase) is in fact a template class, and
+the derived class (for instance, \c Matrix) inherits the base class with the derived class itself as a
+template argument (in this case, \c Matrix inherits from \c MatrixBase&lt;Matrix&gt;). This allows Eigen to
+resolve the polymorphic function calls at compile time.
+
+In addition, the design avoids multiple inheritance. One reason for this is that in our experience, some
+compilers (like MSVC) fail to perform empty base class optimization, which is crucial for our fixed-size
+types.
+
+
+\section TopicClassHierarchyCoreClasses The core classes
+
+These are the classes that you need to know about if you want to write functions that accept or return Eigen
+objects.
+
+  - Matrix means plain dense matrix. If \c m is a \c %Matrix, then, for instance, \c m+m is no longer a 
+    \c %Matrix, it is a "matrix expression".
+  - MatrixBase means dense matrix expression. This means that a \c %MatrixBase is something that can be
+    added, matrix-multiplied, LU-decomposed, QR-decomposed... All matrix expression classes, including 
+    \c %Matrix itself, inherit \c %MatrixBase.
+  - Array means plain dense array. If \c x is an \c %Array, then, for instance, \c x+x is no longer an 
+    \c %Array, it is an "array expression".
+  - ArrayBase means dense array expression. This means that an \c %ArrayBase is something that can be
+    added, array-multiplied, and on which you can perform all sorts of array operations... All array
+    expression classes, including \c %Array itself, inherit \c %ArrayBase.
+  - DenseBase means dense (matrix or array) expression. Both \c %ArrayBase and \c %MatrixBase inherit
+    \c %DenseBase. \c %DenseBase is where all the methods go that apply to dense expressions regardless of
+    whether they are matrix or array expressions. For example, the \link DenseBase::block() block(...) \endlink
+    methods are in \c %DenseBase.
+
+\section TopicClassHierarchyBaseClasses Base classes
+
+These classes serve as base classes for the five core classes mentioned above. They are more internal and so
+less interesting for users of the Eigen library.
+
+  - PlainObjectBase means dense (matrix or array) plain object, i.e. something that stores its own dense
+    array of coefficients. This is where, for instance, the \link PlainObjectBase::resize() resize() \endlink
+    methods go. \c %PlainObjectBase is inherited by \c %Matrix and by \c %Array. But above, we said that 
+    \c %Matrix inherits \c %MatrixBase and \c %Array inherits \c %ArrayBase. So does that mean multiple
+    inheritance? No, because \c %PlainObjectBase \e itself inherits \c %MatrixBase or \c %ArrayBase depending
+    on whether we are in the matrix or array case. When we said above that \c %Matrix inherited 
+    \c %MatrixBase, we omitted to say it does so indirectly via \c %PlainObjectBase. Same for \c %Array.
+  - DenseCoeffsBase means something that has dense coefficient accessors. It is a base class for
+    \c %DenseBase. The reason for \c %DenseCoeffsBase to exist is that the set of available coefficient
+    accessors is very different depending on whether a dense expression has direct memory access or not (the
+    \c DirectAccessBit flag). For example, if \c x is a plain matrix, then \c x has direct access, and 
+    \c x.transpose() and \c x.block(...) also have direct access, because their coefficients can be read right
+    off memory, but for example, \c x+x does not have direct memory access, because obtaining any of its
+    coefficients requires a computation (an addition), it can't be just read off memory.
+  - EigenBase means anything that can be evaluated into a plain dense matrix or array (even if that would
+    be a bad idea). \c %EigenBase is really the absolute base class for anything that remotely looks like a
+    matrix or array. It is a base class for \c %DenseCoeffsBase, so it sits below all our dense class
+    hierarchy, but it is not limited to dense expressions. For example, \c %EigenBase is also inherited by
+    diagonal matrices, sparse matrices, etc...
+
+
+\section TopicClassHierarchyInheritanceDiagrams Inheritance diagrams
+
+The inheritance diagram for Matrix looks as follows:
+
+<pre>
+EigenBase&lt;%Matrix&gt;
+  <-- DenseCoeffsBase&lt;%Matrix&gt;    (direct access case)
+    <-- DenseBase&lt;%Matrix&gt;
+      <-- MatrixBase&lt;%Matrix&gt;
+        <-- PlainObjectBase&lt;%Matrix&gt;    (matrix case)
+          <-- Matrix
+</pre>
+
+The inheritance diagram for Array looks as follows:
+
+<pre>
+EigenBase&lt;%Array&gt;
+  <-- DenseCoeffsBase&lt;%Array&gt;    (direct access case)
+    <-- DenseBase&lt;%Array&gt;
+      <-- ArrayBase&lt;%Array&gt;
+        <-- PlainObjectBase&lt;%Array&gt;    (array case)
+          <-- Array
+</pre>
+
+The inheritance diagram for some other matrix expression class, here denoted by \c SomeMatrixXpr, looks as
+follows:
+
+<pre>
+EigenBase&lt;SomeMatrixXpr&gt;
+  <-- DenseCoeffsBase&lt;SomeMatrixXpr&gt;    (direct access or no direct access case)
+    <-- DenseBase&lt;SomeMatrixXpr&gt;
+      <-- MatrixBase&lt;SomeMatrixXpr&gt;
+        <-- SomeMatrixXpr
+</pre>
+
+The inheritance diagram for some other array expression class, here denoted by \c SomeArrayXpr, looks as
+follows:
+
+<pre>
+EigenBase&lt;SomeArrayXpr&gt;
+  <-- DenseCoeffsBase&lt;SomeArrayXpr&gt;    (direct access or no direct access case)
+    <-- DenseBase&lt;SomeArrayXpr&gt;
+      <-- ArrayBase&lt;SomeArrayXpr&gt;
+        <-- SomeArrayXpr
+</pre>
+
+Finally, consider an example of something that is not a dense expression, for instance a diagonal matrix. The
+corresponding inheritance diagram is:
+
+<pre>
+EigenBase&lt;%DiagonalMatrix&gt;
+  <-- DiagonalBase&lt;%DiagonalMatrix&gt;
+    <-- DiagonalMatrix
+</pre>
+
+
+*/
+}

diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox
new file mode 100644
index 0000000..3f5c564
--- /dev/null
+++ b/doc/CoeffwiseMathFunctionsTable.dox

@@ -0,0 +1,600 @@
+namespace Eigen {
+
+/** \eigenManualPage CoeffwiseMathFunctions Catalog of coefficient-wise math functions
+
+
+<!-- <span style="font-size:300%; color:red; font-weight: 900;">!WORK IN PROGRESS!</span> -->
+
+This table presents a catalog of the coefficient-wise math functions supported by %Eigen.
+In this table, \c a, \c b, refer to Array objects or expressions, and \c m refers to a linear algebra Matrix/Vector object. Standard scalar types are abbreviated as follows:
+  - \c int: \c i32
+  - \c float: \c f
+  - \c double: \c d
+  - \c std::complex<float>: \c cf
+  - \c std::complex<double>: \c cd
+
+For each row, the first column list the equivalent calls for arrays, and matrices when supported. Of course, all functions are available for matrices by first casting it as an array: \c m.array().
+
+The third column gives some hints in the underlying scalar implementation. In most cases, %Eigen does not implement itself the math function but relies on the STL for standard scalar types, or user-provided functions for custom scalar types.
+For instance, some simply calls the respective function of the STL while preserving <a href="http://en.cppreference.com/w/cpp/language/adl">argument-dependent lookup</a> for custom types.
+The following:
+\code
+using std::foo;
+foo(a[i]);
+\endcode
+means that the STL's function \c std::foo will be potentially called if it is compatible with the underlying scalar type. If not, then the user must ensure that an overload of the function foo is available for the given scalar type (usually defined in the same namespace as the given scalar type).
+This also means that, unless specified, if the function \c std::foo is available only in some recent c++ versions (e.g., c++11), then the respective %Eigen's function/method will be usable on standard types only if the compiler support the required c++ version.
+
+<table class="manual-hl">
+<tr>
+<th>API</th><th>Description</th><th>Default scalar implementation</th><th>SIMD</th>
+</tr>
+<tr><td colspan="4"></td></tr>
+<tr><th colspan="4">Basic operations</th></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs
+  a.\link ArrayBase::abs abs\endlink(); \n
+  \link Eigen::abs abs\endlink(a); \n
+  m.\link MatrixBase::cwiseAbs cwiseAbs\endlink();
+  </td>
+  <td>absolute value (\f$ |a_i| \f$) </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/fabs">std::abs</a>; \n
+  abs(a[i]);
+  </td>
+  <td>SSE2, AVX (i32,f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_inverse
+  a.\link ArrayBase::inverse inverse\endlink(); \n
+  \link Eigen::inverse inverse\endlink(a); \n
+  m.\link MatrixBase::cwiseInverse cwiseInverse\endlink();
+  </td>
+  <td>inverse value (\f$ 1/a_i \f$) </td>
+  <td class="code">
+  1/a[i];
+  </td>
+  <td>All engines (f,d,fc,fd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_conj
+  a.\link ArrayBase::conjugate conjugate\endlink(); \n
+  \link Eigen::conj conj\endlink(a); \n
+  m.\link MatrixBase::conjugate conjugate\endlink();
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Complex_conjugate">complex conjugate</a> (\f$ \bar{a_i} \f$),\n
+  no-op for real </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/complex/conj">std::conj</a>; \n
+  conj(a[i]);
+  </td>
+  <td>All engines (fc,fd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_arg
+  a.\link ArrayBase::arg arg\endlink(); \n
+  \link Eigen::arg arg\endlink(a); \n
+  m.\link MatrixBase::cwiseArg cwiseArg\endlink();
+  </td>
+  <td>phase angle of complex number</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/complex/arg">std::arg</a>; \n
+  arg(a[i]);
+  </td>
+  <td>All engines (fc,fd)</td>
+</tr>
+<tr>
+<th colspan="4">Exponential functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_exp
+  a.\link ArrayBase::exp exp\endlink(); \n
+  \link Eigen::exp exp\endlink(a);
+  </td>
+  <td>\f$ e \f$ raised to the given power (\f$ e^{a_i} \f$) </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/exp">std::exp</a>; \n
+  exp(a[i]);
+  </td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log
+  a.\link ArrayBase::log log\endlink(); \n
+  \link Eigen::log log\endlink(a);
+  </td>
+  <td>natural (base \f$ e \f$) logarithm (\f$ \ln({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log">std::log</a>; \n
+  log(a[i]);
+  </td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log1p
+  a.\link ArrayBase::log1p log1p\endlink(); \n
+  \link Eigen::log1p log1p\endlink(a);
+  </td>
+  <td>natural (base \f$ e \f$) logarithm of 1 plus \n the given number (\f$ \ln({1+a_i}) \f$)</td>
+  <td>built-in generic implementation based on \c log,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/log1p">\c std::log1p </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log10
+  a.\link ArrayBase::log10 log10\endlink(); \n
+  \link Eigen::log10 log10\endlink(a);
+  </td>
+  <td>base 10 logarithm (\f$ \log_{10}({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log10">std::log10</a>; \n
+  log10(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Power functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_pow
+  a.\link ArrayBase::pow pow\endlink(b); \n
+  \link ArrayBase::pow(const Eigen::ArrayBase< Derived > &x, const Eigen::ArrayBase< ExponentDerived > &exponents) pow\endlink(a,b);
+  </td>
+  <!-- For some reason Doxygen thinks that pow is in ArrayBase namespace -->
+  <td>raises a number to the given power (\f$ a_i ^ {b_i} \f$) \n \c a and \c b can be either an array or scalar.</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/pow">std::pow</a>; \n
+  pow(a[i],b[i]);\n
+  (plus builtin for integer types)</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sqrt
+  a.\link ArrayBase::sqrt sqrt\endlink(); \n
+  \link Eigen::sqrt sqrt\endlink(a);\n
+  m.\link MatrixBase::cwiseSqrt cwiseSqrt\endlink();
+  </td>
+  <td>computes square root (\f$ \sqrt a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  sqrt(a[i]);</td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_rsqrt
+  a.\link ArrayBase::rsqrt rsqrt\endlink(); \n
+  \link Eigen::rsqrt rsqrt\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Fast_inverse_square_root">reciprocal square root</a> (\f$ 1/{\sqrt a_i} \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  1/sqrt(a[i]); \n
+  </td>
+  <td>SSE2, AVX, AltiVec, ZVector (f,d)\n
+  (approx + 1 Newton iteration)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_square
+  a.\link ArrayBase::square square\endlink(); \n
+  \link Eigen::square square\endlink(a);
+  </td>
+  <td>computes square power (\f$ a_i^2 \f$)</td>
+  <td class="code">
+  a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cube
+  a.\link ArrayBase::cube cube\endlink(); \n
+  \link Eigen::cube cube\endlink(a);
+  </td>
+  <td>computes cubic power (\f$ a_i^3 \f$)</td>
+  <td class="code">
+  a[i]*a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs2
+  a.\link ArrayBase::abs2 abs2\endlink(); \n
+  \link Eigen::abs2 abs2\endlink(a);\n
+  m.\link MatrixBase::cwiseAbs2 cwiseAbs2\endlink();
+  </td>
+  <td>computes the squared absolute value (\f$ |a_i|^2 \f$)</td>
+  <td class="code">
+  real:    a[i]*a[i] \n
+  complex:  real(a[i])*real(a[i]) \n
+  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; + imag(a[i])*imag(a[i])</td>
+  <td>All (i32,f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Trigonometric functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sin
+  a.\link ArrayBase::sin sin\endlink(); \n
+  \link Eigen::sin sin\endlink(a);
+  </td>
+  <td>computes sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sin">std::sin</a>; \n
+  sin(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cos
+  a.\link ArrayBase::cos cos\endlink(); \n
+  \link Eigen::cos cos\endlink(a);
+  </td>
+  <td>computes cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cos">std::cos</a>; \n
+  cos(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tan
+  a.\link ArrayBase::tan tan\endlink(); \n
+  \link Eigen::tan tan\endlink(a);
+  </td>
+  <td>computes tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tan">std::tan</a>; \n
+  tan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_asin
+  a.\link ArrayBase::asin asin\endlink(); \n
+  \link Eigen::asin asin\endlink(a);
+  </td>
+  <td>computes arc sine (\f$ \sin^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/asin">std::asin</a>; \n
+  asin(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_acos
+  a.\link ArrayBase::acos acos\endlink(); \n
+  \link Eigen::acos acos\endlink(a);
+  </td>
+  <td>computes arc cosine  (\f$ \cos^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/acos">std::acos</a>; \n
+  acos(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_atan
+  a.\link ArrayBase::atan atan\endlink(); \n
+  \link Eigen::atan atan\endlink(a);
+  </td>
+  <td>computes arc tangent (\f$ \tan^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/atan">std::atan</a>; \n
+  atan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Hyperbolic functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sinh
+  a.\link ArrayBase::sinh sinh\endlink(); \n
+  \link Eigen::sinh sinh\endlink(a);
+  </td>
+  <td>computes hyperbolic sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sinh">std::sinh</a>; \n
+  sinh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cosh
+  a.\link ArrayBase::cosh cohs\endlink(); \n
+  \link Eigen::cosh cosh\endlink(a);
+  </td>
+  <td>computes hyperbolic cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cosh">std::cosh</a>; \n
+  cosh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tanh
+  a.\link ArrayBase::tanh tanh\endlink(); \n
+  \link Eigen::tanh tanh\endlink(a);
+  </td>
+  <td>computes hyperbolic tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tanh">std::tanh</a>; \n
+  tanh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_asinh
+  a.\link ArrayBase::asinh asinh\endlink(); \n
+  \link Eigen::asinh asinh\endlink(a);
+  </td>
+  <td>computes inverse hyperbolic sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/asinh">std::asinh</a>; \n
+  asinh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_acosh
+  a.\link ArrayBase::acosh cohs\endlink(); \n
+  \link Eigen::acosh acosh\endlink(a);
+  </td>
+  <td>computes hyperbolic cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/acosh">std::acosh</a>; \n
+  acosh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_atanh
+  a.\link ArrayBase::atanh atanh\endlink(); \n
+  \link Eigen::atanh atanh\endlink(a);
+  </td>
+  <td>computes hyperbolic tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/atanh">std::atanh</a>; \n
+  atanh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Nearest integer floating point operations</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ceil
+  a.\link ArrayBase::ceil ceil\endlink(); \n
+  \link Eigen::ceil ceil\endlink(a);
+  </td>
+  <td>nearest integer not less than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/ceil">std::ceil</a>; \n
+  ceil(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_floor
+  a.\link ArrayBase::floor floor\endlink(); \n
+  \link Eigen::floor floor\endlink(a);
+  </td>
+  <td>nearest integer not greater than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/floor">std::floor</a>; \n
+  floor(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_round
+  a.\link ArrayBase::round round\endlink(); \n
+  \link Eigen::round round\endlink(a);
+  </td>
+  <td>nearest integer, \n rounding away from zero in halfway cases</td>
+  <td>built-in generic implementation \n based on \c floor and \c ceil,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/round">\c std::round </a>; \cpp11</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_rint
+  a.\link ArrayBase::rint rint\endlink(); \n
+  \link Eigen::rint rint\endlink(a);
+  </td>
+  <td>nearest integer, \n rounding to nearest even in halfway cases</td>
+  <td>built-in generic implementation using <a href="http://en.cppreference.com/w/cpp/numeric/math/rint">\c std::rint </a>; \cpp11
+  or <a href="http://en.cppreference.com/w/c/numeric/math/rint">\c rintf </a>; </td>
+  <td>SSE4,AVX (f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Floating point manipulation functions</th>
+</tr>
+<tr>
+<th colspan="4">Classification and comparison</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isfinite
+  a.\link ArrayBase::isFinite isFinite\endlink(); \n
+  \link Eigen::isfinite isfinite\endlink(a);
+  </td>
+  <td>checks if the given number has finite value</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isfinite">\c std::isfinite </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isinf
+  a.\link ArrayBase::isInf isInf\endlink(); \n
+  \link Eigen::isinf isinf\endlink(a);
+  </td>
+  <td>checks if the given number is infinite</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isinf">\c std::isinf </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isnan
+  a.\link ArrayBase::isNaN isNaN\endlink(); \n
+  \link Eigen::isnan isnan\endlink(a);
+  </td>
+  <td>checks if the given number is not a number</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isnan">\c std::isnan </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Error and gamma functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c \#include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_erf
+  a.\link ArrayBase::erf erf\endlink(); \n
+  \link Eigen::erf erf\endlink(a);
+  </td>
+  <td>error function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/erf">std::erf</a>; \cpp11 \n
+  erf(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_erfc
+  a.\link ArrayBase::erfc erfc\endlink(); \n
+  \link Eigen::erfc erfc\endlink(a);
+  </td>
+  <td>complementary error function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/erfc">std::erfc</a>; \cpp11 \n
+  erfc(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_lgamma
+  a.\link ArrayBase::lgamma lgamma\endlink(); \n
+  \link Eigen::lgamma lgamma\endlink(a);
+  </td>
+  <td>natural logarithm of the gamma function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/lgamma">std::lgamma</a>; \cpp11 \n
+  lgamma(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_digamma
+  a.\link ArrayBase::digamma digamma\endlink(); \n
+  \link Eigen::digamma digamma\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Digamma_function">logarithmic derivative of the gamma function</a></td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_igamma
+  \link Eigen::igamma igamma\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">lower incomplete gamma integral</a>
+  \n \f$ \gamma(a_i,x_i)= \frac{1}{|a_i|} \int_{0}^{x_i}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_igammac
+  \link Eigen::igammac igammac\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">upper incomplete gamma integral</a>
+  \n \f$ \Gamma(a_i,x_i) = \frac{1}{|a_i|} \int_{x_i}^{\infty}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Special functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c \#include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_polygamma
+  \link Eigen::polygamma polygamma\endlink(n,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Polygamma_function">n-th derivative of digamma at x</a></td>
+  <td>
+  built-in generic based on\n <a href="#cwisetable_lgamma">\c lgamma </a>,
+  <a href="#cwisetable_digamma"> \c digamma </a>
+  and <a href="#cwisetable_zeta">\c zeta </a>.
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_betainc
+  \link Eigen::betainc betainc\endlink(a,b,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function">Incomplete beta function</a></td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_zeta
+  \link Eigen::zeta zeta\endlink(a,b); \n
+  a.\link ArrayBase::zeta zeta\endlink(b);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Hurwitz_zeta_function">Hurwitz zeta function</a>
+  \n \f$ \zeta(a_i,b_i)=\sum_{k=0}^{\infty}(b_i+k)^{\text{-}a_i} \f$</td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ndtri
+  a.\link ArrayBase::ndtri ndtri\endlink(); \n
+  \link Eigen::ndtri ndtri\endlink(a);
+  </td>
+  <td>Inverse of the CDF of the Normal distribution function</td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr><td colspan="4"></td></tr>
+</table>
+
+\n
+
+*/
+
+}

diff --git a/doc/CustomizingEigen_CustomScalar.dox b/doc/CustomizingEigen_CustomScalar.dox
new file mode 100644
index 0000000..24e5f56
--- /dev/null
+++ b/doc/CustomizingEigen_CustomScalar.dox

@@ -0,0 +1,120 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_CustomScalar Using custom scalar types
+\anchor user_defined_scalars
+
+By default, Eigen currently supports standard floating-point types (\c float, \c double, \c std::complex<float>, \c std::complex<double>, \c long \c double), as well as all native integer types (e.g., \c int, \c unsigned \c int, \c short, etc.), and \c bool.
+On x86-64 systems, \c long \c double permits to locally enforces the use of x87 registers with extended accuracy (in comparison to SSE).
+
+In order to add support for a custom type \c T you need:
+-# make sure the common operator (+,-,*,/,etc.) are supported by the type \c T
+-# add a specialization of struct Eigen::NumTraits<T> (see \ref NumTraits)
+-# define the math functions that makes sense for your type. This includes standard ones like sqrt, pow, sin, tan, conj, real, imag, etc, as well as abs2 which is Eigen specific.
+     (see the file Eigen/src/Core/MathFunctions.h)
+
+The math function should be defined in the same namespace than \c T, or in the \c std namespace though that second approach is not recommended.
+
+Here is a concrete example adding support for the Adolc's \c adouble type. <a href="https://projects.coin-or.org/ADOL-C">Adolc</a> is an automatic differentiation library. The type \c adouble is basically a real value tracking the values of any number of partial derivatives.
+
+\code
+#ifndef ADOLCSUPPORT_H
+#define ADOLCSUPPORT_H
+
+#define ADOLC_TAPELESS
+#include <adolc/adouble.h>
+#include <Eigen/Core>
+
+namespace Eigen {
+
+template<> struct NumTraits<adtl::adouble>
+ : NumTraits<double> // permits to get the epsilon, dummy_precision, lowest, highest functions
+{
+  typedef adtl::adouble Real;
+  typedef adtl::adouble NonInteger;
+  typedef adtl::adouble Nested;
+
+  enum {
+    IsComplex = 0,
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 1,
+    ReadCost = 1,
+    AddCost = 3,
+    MulCost = 3
+  };
+};
+
+}
+
+namespace adtl {
+
+inline const adouble& conj(const adouble& x)  { return x; }
+inline const adouble& real(const adouble& x)  { return x; }
+inline adouble imag(const adouble&)    { return 0.; }
+inline adouble abs(const adouble&  x)  { return fabs(x); }
+inline adouble abs2(const adouble& x)  { return x*x; }
+
+}
+
+#endif // ADOLCSUPPORT_H
+\endcode
+
+This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
+
+\code
+#include <gmpxx.h>
+#include <Eigen/Core>
+#include <boost/operators.hpp>
+
+namespace Eigen {
+  template<> struct NumTraits<mpq_class> : GenericNumTraits<mpq_class>
+  {
+    typedef mpq_class Real;
+    typedef mpq_class NonInteger;
+    typedef mpq_class Nested;
+
+    static inline Real epsilon() { return 0; }
+    static inline Real dummy_precision() { return 0; }
+    static inline int digits10() { return 0; }
+
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = 6,
+      AddCost = 150,
+      MulCost = 100
+    };
+  };
+
+  namespace internal {
+
+    template<> struct scalar_score_coeff_op<mpq_class> {
+      struct result_type : boost::totally_ordered1<result_type> {
+        std::size_t len;
+        result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
+        result_type(mpq_class const& q) :
+          len(mpz_size(q.get_num_mpz_t())+
+              mpz_size(q.get_den_mpz_t())-1) {}
+        friend bool operator<(result_type x, result_type y) {
+          // 0 is the worst possible pivot
+          if (x.len == 0) return y.len > 0;
+          if (y.len == 0) return false;
+          // Prefer a pivot with a small representation
+          return x.len > y.len;
+        }
+        friend bool operator==(result_type x, result_type y) {
+          // Only used to test if the score is 0
+          return x.len == y.len;
+        }
+      };
+      result_type operator()(mpq_class const& x) const { return x; }
+    };
+  }
+}
+\endcode
+
+*/
+
+}

diff --git a/doc/CustomizingEigen_InheritingMatrix.dox b/doc/CustomizingEigen_InheritingMatrix.dox
new file mode 100644
index 0000000..b21e554
--- /dev/null
+++ b/doc/CustomizingEigen_InheritingMatrix.dox

@@ -0,0 +1,34 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_InheritingMatrix Inheriting from Matrix
+
+Before inheriting from Matrix, be really, I mean REALLY, sure that using
+EIGEN_MATRIX_PLUGIN is not what you really want (see previous section).
+If you just need to add few members to Matrix, this is the way to go.
+
+An example of when you actually need to inherit Matrix, is when you
+have several layers of heritage such as 
+MyVerySpecificVector1, MyVerySpecificVector2 -> MyVector1 -> Matrix and
+MyVerySpecificVector3, MyVerySpecificVector4 -> MyVector2 -> Matrix.
+
+In order for your object to work within the %Eigen framework, you need to
+define a few members in your inherited class.
+
+Here is a minimalistic example:
+
+\include CustomizingEigen_Inheritance.cpp
+
+Output: \verbinclude CustomizingEigen_Inheritance.out
+
+This is the kind of error you can get if you don't provide those methods
+\verbatim
+error: no match for ‘operator=’ in ‘v = Eigen::operator*(
+const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1, 0, -0x000000001, 1> >::Scalar&, 
+const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
+(((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
+((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType*)(& v))))’
+\endverbatim
+
+*/
+
+}

diff --git a/doc/CustomizingEigen_NullaryExpr.dox b/doc/CustomizingEigen_NullaryExpr.dox
new file mode 100644
index 0000000..37c8dcd
--- /dev/null
+++ b/doc/CustomizingEigen_NullaryExpr.dox

@@ -0,0 +1,86 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_NullaryExpr Matrix manipulation via nullary-expressions
+
+
+The main purpose of the class CwiseNullaryOp is to define \em procedural matrices such as constant or random matrices as returned by the Ones(), Zero(), Constant(), Identity() and Random() methods.
+Nevertheless, with some imagination it is possible to accomplish very sophisticated matrix manipulation with minimal efforts such that \ref TopicNewExpressionType "implementing new expression" is rarely needed.
+
+\section NullaryExpr_Circulant Example 1: circulant matrix
+
+To explore these possibilities let us start with the  \em circulant example of the \ref TopicNewExpressionType "implementing new expression" topic.
+Let us recall that a circulant matrix is a matrix where each column is the same as the
+column to the left, except that it is cyclically shifted downwards.
+For example, here is a 4-by-4 circulant matrix:
+\f[ \begin{bmatrix}
+    1 & 8 & 4 & 2 \\
+    2 & 1 & 8 & 4 \\
+    4 & 2 & 1 & 8 \\
+    8 & 4 & 2 & 1
+\end{bmatrix} \f]
+A circulant matrix is uniquely determined by its first column. We wish
+to write a function \c makeCirculant which, given the first column,
+returns an expression representing the circulant matrix.
+
+For this exercise, the return type of \c makeCirculant will be a CwiseNullaryOp that we need to instantiate with:
+1 - a proper \c circulant_functor storing the input vector and implementing the adequate coefficient accessor \c operator(i,j)
+2 - a template instantiation of class Matrix conveying compile-time information such as the scalar type, sizes, and preferred storage layout.
+
+Calling \c ArgType the type of the input vector, we can construct the equivalent squared Matrix type as follows:
+
+\snippet make_circulant2.cpp square
+
+This little helper structure will help us to implement our \c makeCirculant function as follows:
+
+\snippet make_circulant2.cpp makeCirculant
+
+As usual, our function takes as argument a \c MatrixBase (see this \ref TopicFunctionTakingEigenTypes "page" for more details).
+Then, the CwiseNullaryOp object is constructed through the DenseBase::NullaryExpr static method with the adequate runtime sizes.
+
+Then, we need to implement our \c circulant_functor, which is a straightforward exercise:
+
+\snippet make_circulant2.cpp circulant_func
+
+We are now all set to try our new feature:
+
+\snippet make_circulant2.cpp main
+
+
+If all the fragments are combined, the following output is produced,
+showing that the program works as expected:
+
+\include make_circulant2.out
+
+This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
+
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MatLab's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
+*/
+
+}
+

diff --git a/doc/CustomizingEigen_Plugins.dox b/doc/CustomizingEigen_Plugins.dox
new file mode 100644
index 0000000..d88f240
--- /dev/null
+++ b/doc/CustomizingEigen_Plugins.dox

@@ -0,0 +1,69 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_Plugins Extending MatrixBase (and other classes)
+
+In this section we will see how to add custom methods to MatrixBase. Since all expressions and matrix types inherit MatrixBase, adding a method to MatrixBase make it immediately available to all expressions ! A typical use case is, for instance, to make Eigen compatible with another API.
+
+You certainly know that in C++ it is not possible to add methods to an existing class. So how that's possible ? Here the trick is to include in the declaration of MatrixBase a file defined by the preprocessor token \c EIGEN_MATRIXBASE_PLUGIN:
+\code
+class MatrixBase {
+  // ...
+  #ifdef EIGEN_MATRIXBASE_PLUGIN
+  #include EIGEN_MATRIXBASE_PLUGIN
+  #endif
+};
+\endcode
+Therefore to extend MatrixBase with your own methods you just have to create a file with your method declaration and define EIGEN_MATRIXBASE_PLUGIN before you include any Eigen's header file.
+
+You can extend many of the other classes used in Eigen by defining similarly named preprocessor symbols. For instance, define \c EIGEN_ARRAYBASE_PLUGIN if you want to extend the ArrayBase class. A full list of classes that can be extended in this way and the corresponding preprocessor symbols can be found on our page \ref TopicPreprocessorDirectives.
+
+Here is an example of an extension file for adding methods to MatrixBase: \n
+\b MatrixBaseAddons.h
+\code
+inline Scalar at(uint i, uint j) const { return this->operator()(i,j); }
+inline Scalar& at(uint i, uint j) { return this->operator()(i,j); }
+inline Scalar at(uint i) const { return this->operator[](i); }
+inline Scalar& at(uint i) { return this->operator[](i); }
+
+inline RealScalar squaredLength() const { return squaredNorm(); }
+inline RealScalar length() const { return norm(); }
+inline RealScalar invLength(void) const { return fast_inv_sqrt(squaredNorm()); }
+
+template<typename OtherDerived>
+inline Scalar squaredDistanceTo(const MatrixBase<OtherDerived>& other) const
+{ return (derived() - other.derived()).squaredNorm(); }
+
+template<typename OtherDerived>
+inline RealScalar distanceTo(const MatrixBase<OtherDerived>& other) const
+{ return internal::sqrt(derived().squaredDistanceTo(other)); }
+
+inline void scaleTo(RealScalar l) { RealScalar vl = norm(); if (vl>1e-9) derived() *= (l/vl); }
+
+inline Transpose<Derived> transposed() {return this->transpose();}
+inline const Transpose<Derived> transposed() const {return this->transpose();}
+
+inline uint minComponentId(void) const  { int i; this->minCoeff(&i); return i; }
+inline uint maxComponentId(void) const  { int i; this->maxCoeff(&i); return i; }
+
+template<typename OtherDerived>
+void makeFloor(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMin(other.derived()); }
+template<typename OtherDerived>
+void makeCeil(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMax(other.derived()); }
+
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const ConstantReturnType>
+operator+(const Scalar& scalar) const
+{ return CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const ConstantReturnType>(derived(), Constant(rows(),cols(),scalar)); }
+
+friend const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ConstantReturnType, Derived>
+operator+(const Scalar& scalar, const MatrixBase<Derived>& mat)
+{ return CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ConstantReturnType, Derived>(Constant(rows(),cols(),scalar), mat.derived()); }
+\endcode
+
+Then one can the following declaration in the config.h or whatever prerequisites header file of his project:
+\code
+#define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h"
+\endcode
+
+*/
+
+}

diff --git a/doc/DenseDecompositionBenchmark.dox b/doc/DenseDecompositionBenchmark.dox
new file mode 100644
index 0000000..8f9570b
--- /dev/null
+++ b/doc/DenseDecompositionBenchmark.dox

@@ -0,0 +1,42 @@
+namespace Eigen {
+
+/** \eigenManualPage DenseDecompositionBenchmark Benchmark of dense decompositions
+
+This page presents a speed comparison of the dense matrix decompositions offered by %Eigen for a wide range of square matrices and overconstrained problems.
+
+For a more general overview on the features and numerical robustness of linear solvers and decompositions, check this \link TopicLinearAlgebraDecompositions table \endlink.
+
+This benchmark has been run on a laptop equipped with an Intel core i7 \@ 2,6 GHz, and compiled with clang with \b AVX and \b FMA instruction sets enabled but without multi-threading.
+It uses \b single \b precision \b float numbers. For double, you can get a good estimate by multiplying the timings by a factor 2.
+
+The square matrices are symmetric, and for the overconstrained matrices, the reported timmings include the cost to compute the symmetric covariance matrix \f$ A^T A \f$ for the first four solvers based on Cholesky and LU, as denoted by the \b * symbol (top-right corner part of the table).
+Timings are in \b milliseconds, and factors are relative to the LLT decomposition which is the fastest but also the least general and robust.
+
+<table class="manual">
+<tr><th>solver/size</th>
+  <th>8x8</th>  <th>100x100</th>  <th>1000x1000</th>  <th>4000x4000</th>  <th>10000x8</th>  <th>10000x100</th>  <th>10000x1000</th>  <th>10000x4000</th></tr>
+<tr><td>LLT</td><td>0.05</td><td>0.42</td><td>5.83</td><td>374.55</td><td>6.79 <sup><a href="#note_ls">*</a></sup></td><td>30.15 <sup><a href="#note_ls">*</a></sup></td><td>236.34 <sup><a href="#note_ls">*</a></sup></td><td>3847.17 <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>LDLT</td><td>0.07 (x1.3)</td><td>0.65 (x1.5)</td><td>26.86 (x4.6)</td><td>2361.18 (x6.3)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.91 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>252.61 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>5807.66 (x1.5) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr><td>PartialPivLU</td><td>0.08 (x1.5)</td><td>0.69 (x1.6)</td><td>15.63 (x2.7)</td><td>709.32 (x1.9)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.32 (x1) <sup><a href="#note_ls">*</a></sup></td><td>241.68 (x1) <sup><a href="#note_ls">*</a></sup></td><td>4270.48 (x1.1) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>FullPivLU</td><td>0.1 (x1.9)</td><td>4.48 (x10.6)</td><td>281.33 (x48.2)</td><td>-</td><td>6.83 (x1) <sup><a href="#note_ls">*</a></sup></td><td>32.67 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>498.25 (x2.1) <sup><a href="#note_ls">*</a></sup></td><td>-</td></tr>
+<tr><td>HouseholderQR</td><td>0.19 (x3.5)</td><td>2.18 (x5.2)</td><td>23.42 (x4)</td><td>1337.52 (x3.6)</td><td>34.26 (x5)</td><td>129.01 (x4.3)</td><td>377.37 (x1.6)</td><td>4839.1 (x1.3)</td></tr>
+<tr class="alt"><td>ColPivHouseholderQR</td><td>0.23 (x4.3)</td><td>2.23 (x5.3)</td><td>103.34 (x17.7)</td><td>9987.16 (x26.7)</td><td>36.05 (x5.3)</td><td>163.18 (x5.4)</td><td>2354.08 (x10)</td><td>37860.5 (x9.8)</td></tr>
+<tr><td>CompleteOrthogonalDecomposition</td><td>0.23 (x4.3)</td><td>2.22 (x5.2)</td><td>99.44 (x17.1)</td><td>10555.3 (x28.2)</td><td>35.75 (x5.3)</td><td>169.39 (x5.6)</td><td>2150.56 (x9.1)</td><td>36981.8 (x9.6)</td></tr>
+<tr class="alt"><td>FullPivHouseholderQR</td><td>0.23 (x4.3)</td><td>4.64 (x11)</td><td>289.1 (x49.6)</td><td>-</td><td>69.38 (x10.2)</td><td>446.73 (x14.8)</td><td>4852.12 (x20.5)</td><td>-</td></tr>
+<tr><td>JacobiSVD</td><td>1.01 (x18.6)</td><td>71.43 (x168.4)</td><td>-</td><td>-</td><td>113.81 (x16.7)</td><td>1179.66 (x39.1)</td><td>-</td><td>-</td></tr>
+<tr class="alt"><td>BDCSVD</td><td>1.07 (x19.7)</td><td>21.83 (x51.5)</td><td>331.77 (x56.9)</td><td>18587.9 (x49.6)</td><td>110.53 (x16.3)</td><td>397.67 (x13.2)</td><td>2975 (x12.6)</td><td>48593.2 (x12.6)</td></tr>
+</table>
+
+<a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
+
+\b Observations:
+ + LLT is always the fastest solvers.
+ + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
+ + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+
+The above table has been generated by the <a href="https://gitlab.com/libeigen/eigen/raw/master/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel-free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
+
+*/
+
+}

diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
new file mode 100644
index 0000000..bc1e03c
--- /dev/null
+++ b/doc/Doxyfile.in

@@ -0,0 +1,1915 @@
+# Doxyfile 1.8.1.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = ${EIGEN_DOXY_PROJECT_NAME}
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+# EIGEN_VERSION is set in the root CMakeLists.txt
+
+PROJECT_NUMBER         = "${EIGEN_VERSION}"
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify an logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           = "${Eigen_SOURCE_DIR}/doc/Eigen_Silly_Professor_64x64.png"
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = "${Eigen_BINARY_DIR}/doc${EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX}"
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                = "only_for_vectors=This is only for vectors (either row-vectors or column-vectors), i.e. matrices which are known at compile-time to have either one row or one column." \
+                         "not_reentrant=\warning This function is not re-entrant." \
+                         "array_module=This is defined in the %Array module. \code #include <Eigen/Array> \endcode" \
+                         "cholesky_module=This is defined in the %Cholesky module. \code #include <Eigen/Cholesky> \endcode" \
+                         "eigenvalues_module=This is defined in the %Eigenvalues module. \code #include <Eigen/Eigenvalues> \endcode" \
+                         "geometry_module=This is defined in the %Geometry module. \code #include <Eigen/Geometry> \endcode" \
+                         "householder_module=This is defined in the %Householder module. \code #include <Eigen/Householder> \endcode" \
+                         "jacobi_module=This is defined in the %Jacobi module. \code #include <Eigen/Jacobi> \endcode" \
+                         "lu_module=This is defined in the %LU module. \code #include <Eigen/LU> \endcode" \
+                         "qr_module=This is defined in the %QR module. \code #include <Eigen/QR> \endcode" \
+                         "svd_module=This is defined in the %SVD module. \code #include <Eigen/SVD> \endcode" \
+                         "specialfunctions_module=This is defined in the \b unsupported SpecialFunctions module. \code #include <Eigen/SpecialFunctions> \endcode" \
+                         "label=\bug" \
+                         "matrixworld=<a href='#matrixonly' style='color:green;text-decoration: none;'>*</a>" \
+                         "arrayworld=<a href='#arrayonly' style='color:blue;text-decoration: none;'>*</a>" \
+                         "note_about_arbitrary_choice_of_solution=If there exists more than one solution, this method will arbitrarily choose one." \
+                         "note_about_using_kernel_to_study_multiple_solutions=If you need a complete analysis of the space of solutions, take the one solution obtained by this method and add to it elements of the kernel, as determined by kernel()." \
+                         "note_about_checking_solutions=This method just tries to find as good a solution as possible. If you want to check whether a solution exists or if it is accurate, just call this function to get a result and then compute the error of this result, or use MatrixBase::isApprox() directly, for instance like this: \code bool a_solution_exists = (A*result).isApprox(b, precision); \endcode This method avoids dividing by zero, so that the non-existence of a solution doesn't by itself mean that you'll get \c inf or \c nan values." \
+                         "note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \
+                         "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\"" \
+                         "implsparsesolverconcept=This class follows the \link TutorialSparseSolverConcept sparse solver concept \endlink." \
+                         "blank= " \
+                         "cpp11=<span class='cpp11'>[c++11]</span>" \
+                         "cpp14=<span class='cpp14'>[c++14]</span>" \
+                         "cpp17=<span class='cpp17'>[c++17]</span>" \
+                         "newin{1}=<span class='newin3x'>New in %Eigen \1.</span>"
+                         
+
+ALIASES += "eigenAutoToc=  "
+ALIASES += "eigenManualPage=\defgroup"
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      = .h=C++ no_extension=C++
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibilities issues.
+
+MARKDOWN_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = YES
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+# SYMBOL_CACHE_SIZE      = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appear multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal scope will be included in the documentation.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = YES
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = ${EIGEN_DOXY_INTERNAL}
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = ${EIGEN_DOXY_INTERNAL}
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = ${EIGEN_DOXY_INTERNAL}
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = NO
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = ${EIGEN_DOXY_INTERNAL}
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 0
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = NO
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            = "${Eigen_BINARY_DIR}/doc${EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX}/eigendoxy_layout.xml"
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = NO
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = ${EIGEN_DOXY_INPUT}
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *
+
+# The RECURSIVE tag can be used to turn specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = "${Eigen_SOURCE_DIR}/Eigen/src/Core/products" \
+                         "${Eigen_SOURCE_DIR}/Eigen/Eigen2Support" \
+                         "${Eigen_SOURCE_DIR}/Eigen/src/Eigen2Support" \
+                         "${Eigen_SOURCE_DIR}/doc/examples" \
+                         "${Eigen_SOURCE_DIR}/doc/special_examples" \
+                         "${Eigen_SOURCE_DIR}/doc/snippets" \
+                         "${Eigen_SOURCE_DIR}/unsupported/doc/examples" \
+                         "${Eigen_SOURCE_DIR}/unsupported/doc/snippets"
+
+# Forward declarations of class templates cause the title of the main page for
+# the class template to not contain the template signature.  This only happens
+# when the \class command is used to document the class.  Possibly caused
+# by https://github.com/doxygen/doxygen/issues/7698.  Confirmed fixed by
+# doxygen release 1.8.19.
+
+EXCLUDE += "${Eigen_SOURCE_DIR}/Eigen/src/Core/util/ForwardDeclarations.h"
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = CMake* \
+                         *.txt \
+                         *.sh \
+                         *.orig \
+                         *.diff \
+                         diff \
+                         *~ \
+                         *. \
+                         *.sln \
+                         *.sdf \
+                         *.tmp \
+                         *.vcxproj \
+                         *.filters \
+                         *.user \
+                         *.suo
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = internal::* \
+                         Flagged* \
+                         *InnerIterator* \
+                         DenseStorage<* \
+                         
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           = "${Eigen_SOURCE_DIR}/doc/snippets" \
+                         "${Eigen_BINARY_DIR}/doc/snippets" \
+                         "${Eigen_SOURCE_DIR}/doc/examples" \
+                         "${Eigen_BINARY_DIR}/doc/examples" \
+                         "${Eigen_SOURCE_DIR}/doc/special_examples" \
+                         "${Eigen_BINARY_DIR}/doc/special_examples" \
+                         "${Eigen_SOURCE_DIR}/unsupported/doc/snippets" \
+                         "${Eigen_BINARY_DIR}/unsupported/doc/snippets" \
+                         "${Eigen_SOURCE_DIR}/unsupported/doc/examples" \
+                         "${Eigen_BINARY_DIR}/unsupported/doc/examples"
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain image that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             = ${Eigen_BINARY_DIR}/doc/html
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# non of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C, C++ and Fortran comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = "${Eigen_BINARY_DIR}/doc/html${EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX}"
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+#  for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            = "${Eigen_BINARY_DIR}/doc/eigendoxy_header.html"
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            = "${Eigen_BINARY_DIR}/doc/eigendoxy_footer.html"
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# style sheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        = 
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       = "${Eigen_SOURCE_DIR}/doc/eigendoxy.css"
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+# The default is 220.
+
+HTML_COLORSTYLE_HUE    = ${EIGEN_DOXY_HTML_COLORSTYLE_HUE}
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
+# entries shown in the various tree structured indices initially; the user
+# can expand and collapse entries dynamically later on. Doxygen will expand
+# the tree to such a level that at most the specified number of entries are
+# visible (unless a fully collapsed tree already exceeds this amount).
+# So setting the number of entries 1 will produce a full collapsed tree by
+# default. 0 is a special value representing an infinite number of entries
+# and will result in a full expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+#  will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to NO if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX          = YES
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider to set DISABLE_INDEX to NO when enabling this option.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 1
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 12
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want to formulas look prettier in the HTML
+# output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = @EIGEN_DOXY_USE_MATHJAX@
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax.
+# However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH        = https://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = amssymb \
+                         amsmath
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = NO
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = NO
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+# XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+# XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           = "${Eigen_SOURCE_DIR}/Eigen/src/plugins"
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             = EIGEN_EMPTY_STRUCT \
+                         EIGEN_PARSED_BY_DOXYGEN \
+                         EIGEN_VECTORIZE \
+                         EIGEN_QT_SUPPORT \
+                         EIGEN_STRONG_INLINE=inline \
+                         EIGEN_DEVICE_FUNC= \
+                         EIGEN_HAS_CXX11=1 \
+                         EIGEN_HAS_CXX11_MATH=1 \
+                         "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template<typename OtherDerived> const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const;" \
+                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<LHS::Scalar,RHS::Scalar>, const LHS, const RHS>"\
+                         "EIGEN_CAT2(a,b)= a ## b"\
+                         "EIGEN_CAT(a,b)=EIGEN_CAT2(a,b)"\
+                         "EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME)=CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<LHS::Scalar, RHS::Scalar>, const LHS, const RHS>"\
+                         "EIGEN_ALIGN_TO_BOUNDARY(x)="\
+                         DOXCOMMA=,
+
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      = EIGEN_MAKE_TYPEDEFS \
+                         EIGEN_MAKE_FIXED_TYPEDEFS \
+                         EIGEN_MAKE_TYPEDEFS_ALL_SIZES \
+                         EIGEN_MAKE_ARRAY_TYPEDEFS \
+                         EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS \
+                         EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES \
+                         EIGEN_CWISE_UNOP_RETURN_TYPE \
+                         EIGEN_CWISE_BINOP_RETURN_TYPE \
+                         EIGEN_CURRENT_STORAGE_BASE_CLASS \
+                         EIGEN_MATHFUNC_IMPL \
+                         _EIGEN_GENERIC_PUBLIC_INTERFACE \
+                         EIGEN_ARRAY_DECLARE_GLOBAL_UNARY \
+                         EIGEN_EMPTY \
+                         EIGEN_EULER_ANGLES_TYPEDEFS \
+                         EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
+                         EIGEN_EULER_SYSTEM_TYPEDEF \
+                         EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY \
+                         EIGEN_MATRIX_FUNCTION \
+                         EIGEN_MATRIX_FUNCTION_1 \
+                         EIGEN_DOC_UNARY_ADDONS \
+                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
+                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = ${EIGEN_DOXY_TAGFILES}
+# "${Eigen_BINARY_DIR}/doc/eigen-unsupported.doxytags =unsupported"
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = "${Eigen_BINARY_DIR}/doc/${EIGEN_DOXY_PROJECT_NAME}.doxytags"
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = NO
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = 
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = NO
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = NO
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = YES
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside
+# the class node. If there are many fields or methods and many nodes the
+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
+# threshold limits the number of items for each type to make the size more
+# manageable. Set this to 0 for no limit. Note that the threshold may be
+# exceeded by 50% before the limit is enforced.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = NO
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = NO
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = NO
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that doxygen if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES

diff --git a/doc/Eigen_Silly_Professor_64x64.png b/doc/Eigen_Silly_Professor_64x64.png
new file mode 100644
index 0000000..079d45b
--- /dev/null
+++ b/doc/Eigen_Silly_Professor_64x64.png
Binary files differ

diff --git a/doc/FixedSizeVectorizable.dox b/doc/FixedSizeVectorizable.dox
new file mode 100644
index 0000000..0012465
--- /dev/null
+++ b/doc/FixedSizeVectorizable.dox

@@ -0,0 +1,38 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable %Eigen objects
+
+The goal of this page is to explain what we mean by "fixed-size vectorizable".
+
+\section FixedSizeVectorizable_summary Executive Summary
+
+An Eigen object is called "fixed-size vectorizable" if it has fixed size and that size is a multiple of 16 bytes.
+
+Examples include:
+\li Eigen::Vector2d
+\li Eigen::Vector4d
+\li Eigen::Vector4f
+\li Eigen::Matrix2d
+\li Eigen::Matrix2f
+\li Eigen::Matrix4d
+\li Eigen::Matrix4f
+\li Eigen::Affine3d
+\li Eigen::Affine3f
+\li Eigen::Quaterniond
+\li Eigen::Quaternionf
+
+\section FixedSizeVectorizable_explanation Explanation
+
+First, "fixed-size" should be clear: an %Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example \ref Matrix3f has fixed size, but \ref MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
+
+The array of coefficients of a fixed-size %Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a \ref Matrix4f is just a "float array[16]".
+
+Fixed-size objects are typically very small, which means that we want to handle them with zero runtime overhead -- both in terms of memory usage and of speed.
+
+Now, vectorization works with 128-bit packets (e.g., SSE, AltiVec, NEON), 256-bit packets (e.g., AVX), or 512-bit packets (e.g., AVX512). Moreover, for performance reasons, these packets are most efficiently read and written if they have the same alignment as the packet size, that is 16 bytes, 32 bytes, and 64 bytes respectively.
+
+So it turns out that the best way that fixed-size %Eigen objects can be vectorized, is if their size is a multiple of 16 bytes (or more). %Eigen will then request 16-byte alignment (or more) for these objects, and henceforth rely on these objects being aligned to achieve maximal efficiency.
+
+*/
+
+}

diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox
new file mode 100644
index 0000000..6b4e492
--- /dev/null
+++ b/doc/FunctionsTakingEigenTypes.dox

@@ -0,0 +1,217 @@
+namespace Eigen {
+
+/** \page TopicFunctionTakingEigenTypes Writing Functions Taking %Eigen Types as Parameters
+
+%Eigen's use of expression templates results in potentially every expression being of a different type. If you pass such an expression to a function taking a parameter of type Matrix, your expression will implicitly be evaluated into a temporary Matrix, which will then be passed to the function. This means that you lose the benefit of expression templates. Concretely, this has two drawbacks:
+ \li The evaluation into a temporary may be useless and inefficient;
+ \li This only allows the function to read from the expression, not to write to it.
+
+Fortunately, all this myriad of expression types have in common that they all inherit a few common, templated base classes. By letting your function take templated parameters of these base types, you can let them play nicely with %Eigen's expression templates.
+
+\eigenAutoToc
+
+\section TopicFirstExamples Some First Examples
+
+This section will provide simple examples for different types of objects %Eigen is offering. Before starting with the actual examples, we need to recapitulate which base objects we can work with (see also \ref TopicClassHierarchy).
+
+ \li MatrixBase: The common base class for all dense matrix expressions (as opposed to array expressions, as opposed to sparse and special matrix classes). Use it in functions that are meant to work only on dense matrices.
+ \li ArrayBase: The common base class for all dense array expressions (as opposed to matrix expressions, etc). Use it in functions that are meant to work only on arrays.
+ \li DenseBase: The common base class for all dense matrix expression, that is, the base class for both \c MatrixBase and \c ArrayBase. It can be used in functions that are meant to work on both matrices and arrays.
+ \li EigenBase: The base class unifying all types of objects that can be evaluated into dense matrices or arrays, for example special matrix classes such as diagonal matrices, permutation matrices, etc. It can be used in functions that are meant to work on any such general type.
+
+<b> %EigenBase Example </b><br/><br/>
+Prints the dimensions of the most generic object present in %Eigen. It could be any matrix expressions, any dense or sparse matrix and any array.
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include function_taking_eigenbase.cpp
+</td>
+<td>
+\verbinclude function_taking_eigenbase.out
+</td></tr></table>
+<b> %DenseBase Example </b><br/><br/>
+Prints a sub-block of the dense expression. Accepts any dense matrix or array expression, but no sparse objects and no special matrix classes such as DiagonalMatrix.
+\code
+template <typename Derived>
+void print_block(const DenseBase<Derived>& b, int x, int y, int r, int c)
+{
+  std::cout << "block: " << b.block(x,y,r,c) << std::endl;
+}
+\endcode
+<b> %ArrayBase Example </b><br/><br/>
+Prints the maximum coefficient of the array or array-expression.
+\code
+template <typename Derived>
+void print_max_coeff(const ArrayBase<Derived> &a)
+{
+  std::cout << "max: " << a.maxCoeff() << std::endl;
+}
+\endcode
+<b> %MatrixBase Example </b><br/><br/>
+Prints the inverse condition number of the given matrix or matrix-expression.
+\code
+template <typename Derived>
+void print_inv_cond(const MatrixBase<Derived>& a)
+{
+  const typename JacobiSVD<typename Derived::PlainObject>::SingularValuesType&
+    sing_vals = a.jacobiSvd().singularValues();
+  std::cout << "inv cond: " << sing_vals(sing_vals.size()-1) / sing_vals(0) << std::endl;
+}
+\endcode
+<b> Multiple templated arguments example </b><br/><br/>
+Calculate the Euclidean distance between two points.
+\code
+template <typename DerivedA,typename DerivedB>
+typename DerivedA::Scalar squaredist(const MatrixBase<DerivedA>& p1,const MatrixBase<DerivedB>& p2)
+{
+  return (p1-p2).squaredNorm();
+}
+\endcode
+Notice that we used two template parameters, one per argument. This permits the function to handle inputs of different types, e.g.,
+\code
+squaredist(v1,2*v2)
+\endcode
+where the first argument \c v1 is a vector and the second argument \c 2*v2 is an expression.
+<br/><br/>
+
+These examples are just intended to give the reader a first impression of how functions can be written which take a plain and constant Matrix or Array argument. They are also intended to give the reader an idea about the most common base classes being the optimal candidates for functions. In the next section we will look in more detail at an example and the different ways it can be implemented, while discussing each implementation's problems and advantages. For the discussion below, Matrix and Array as well as MatrixBase and ArrayBase can be exchanged and all arguments still hold.
+
+
+\section TopicUsingRefClass How to write generic, but non-templated function?
+
+In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated functions and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This is exactly the purpose of the Ref class. Here is a simple example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include function_taking_ref.cpp
+</td>
+<td>
+\verbinclude function_taking_ref.out
+</td></tr></table>
+In the first two calls to inv_cond, no copy occur because the memory layout of the arguments matches the memory layout accepted by Ref<MatrixXf>. However, in the last call, we have a generic expression that will be automatically evaluated into a temporary MatrixXf by the Ref<> object.
+
+A Ref object can also be writable. Here is an example of a function computing the covariance matrix of two input matrices where each row is an observation:
+\code
+void cov(const Ref<const MatrixXf> x, const Ref<const MatrixXf> y, Ref<MatrixXf> C)
+{
+  const float num_observations = static_cast<float>(x.rows());
+  const RowVectorXf x_mean = x.colwise().sum() / num_observations;
+  const RowVectorXf y_mean = y.colwise().sum() / num_observations;
+  C = (x.rowwise() - x_mean).transpose() * (y.rowwise() - y_mean) / num_observations;
+}
+\endcode
+and here are two examples calling cov without any copy:
+\code
+MatrixXf m1, m2, m3
+cov(m1, m2, m3);
+cov(m1.leftCols<3>(), m2.leftCols<3>(), m3.topLeftCorner<3,3>());
+\endcode
+The Ref<> class has two other optional template arguments allowing to control the kind of memory layout that can be accepted without any copy. See the class Ref documentation for the details.
+
+\section TopicPlainFunctionsWorking In which cases do functions taking plain Matrix or Array arguments work?
+
+Without using template functions, and without the Ref class, a naive implementation of the previous cov function might look like this
+\code
+MatrixXf cov(const MatrixXf& x, const MatrixXf& y)
+{
+  const float num_observations = static_cast<float>(x.rows());
+  const RowVectorXf x_mean = x.colwise().sum() / num_observations;
+  const RowVectorXf y_mean = y.colwise().sum() / num_observations;
+  return (x.rowwise() - x_mean).transpose() * (y.rowwise() - y_mean) / num_observations;
+}
+\endcode
+and contrary to what one might think at first, this implementation is fine unless you require a generic implementation that works with double matrices too and unless you do not care about temporary objects. Why is that the case? Where are temporaries involved? How can code as given below compile?
+\code
+MatrixXf x,y,z;
+MatrixXf C = cov(x,y+z);
+\endcode
+In this special case, the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression x+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C.
+
+\b Note: Functions taking \e const references to Matrix (or Array) can process expressions at the cost of temporaries.
+
+
+\section TopicPlainFunctionsFailing In which cases do functions taking a plain Matrix or Array argument fail?
+
+Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const parameter which allows us to store the result. A first naive implementation might look as follows.
+\code
+// Note: This code is flawed!
+void cov(const MatrixXf& x, const MatrixXf& y, MatrixXf& C)
+{
+  const float num_observations = static_cast<float>(x.rows());
+  const RowVectorXf x_mean = x.colwise().sum() / num_observations;
+  const RowVectorXf y_mean = y.colwise().sum() / num_observations;
+  C = (x.rowwise() - x_mean).transpose() * (y.rowwise() - y_mean) / num_observations;
+}
+\endcode
+When trying to execute the following code
+\code
+MatrixXf C = MatrixXf::Zero(3,6);
+cov(x,y, C.block(0,0,3,3));
+\endcode
+the compiler will fail, because it is not possible to convert the expression returned by \c MatrixXf::block() into a non-const \c MatrixXf&. This is the case because the compiler wants to protect you from writing your result to a temporary object. In this special case this protection is not intended -- we want to write to a temporary object. So how can we overcome this problem? 
+
+The solution which is preferred at the moment is based on a little \em hack. One needs to pass a const reference to the matrix and internally the constness needs to be cast away. The correct implementation for C98 compliant compilers would be
+\code
+template <typename Derived, typename OtherDerived>
+void cov(const MatrixBase<Derived>& x, const MatrixBase<Derived>& y, MatrixBase<OtherDerived> const & C)
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename internal::plain_row_type<Derived>::type RowVectorType;
+
+  const Scalar num_observations = static_cast<Scalar>(x.rows());
+
+  const RowVectorType x_mean = x.colwise().sum() / num_observations;
+  const RowVectorType y_mean = y.colwise().sum() / num_observations;
+
+  const_cast< MatrixBase<OtherDerived>& >(C) =
+    (x.rowwise() - x_mean).transpose() * (y.rowwise() - y_mean) / num_observations;
+}
+\endcode
+The implementation above does now not only work with temporary expressions but it also allows to use the function with matrices of arbitrary floating point scalar types.
+
+\b Note: The const cast hack will only work with templated functions. It will not work with the MatrixXf implementation because it is not possible to cast a Block expression to a Matrix reference!
+
+
+
+\section TopicResizingInGenericImplementations How to resize matrices in generic implementations?
+
+One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the following code to work
+\code
+MatrixXf x = MatrixXf::Random(100,3);
+MatrixXf y = MatrixXf::Random(100,3);
+MatrixXf C;
+cov(x, y, C);
+\endcode
+This is not the case anymore, when we are using an implementation taking MatrixBase as a parameter. In general, %Eigen supports automatic resizing but it is not possible to do so on expressions. Why should resizing of a matrix Block be allowed? It is a reference to a sub-matrix and we definitely don't want to resize that. So how can we incorporate resizing if we cannot resize on MatrixBase? The solution is to resize the derived object as in this implementation.
+\code
+template <typename Derived, typename OtherDerived>
+void cov(const MatrixBase<Derived>& x, const MatrixBase<Derived>& y, MatrixBase<OtherDerived> const & C_)
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename internal::plain_row_type<Derived>::type RowVectorType;
+
+  const Scalar num_observations = static_cast<Scalar>(x.rows());
+
+  const RowVectorType x_mean = x.colwise().sum() / num_observations;
+  const RowVectorType y_mean = y.colwise().sum() / num_observations;
+
+  MatrixBase<OtherDerived>& C = const_cast< MatrixBase<OtherDerived>& >(C_);
+  
+  C.derived().resize(x.cols(),x.cols()); // resize the derived object
+  C = (x.rowwise() - x_mean).transpose() * (y.rowwise() - y_mean) / num_observations;
+}
+\endcode
+This implementation is now working for parameters being expressions and for parameters being matrices and having the wrong size. Resizing the expressions does not do any harm in this case unless they actually require resizing. That means, passing an expression with the wrong dimensions will result in a run-time error (in debug mode only) while passing expressions of the correct size will just work fine.
+
+\b Note: In the above discussion the terms Matrix and Array and MatrixBase and ArrayBase can be exchanged and all arguments still hold.
+
+\section TopicSummary Summary
+
+  - To summarize, the implementation of functions taking non-writable (const referenced) objects is not a big issue and does not lead to problematic situations in terms of compiling and running your program. However, a naive implementation is likely to introduce unnecessary temporary objects in your code. In order to avoid evaluating parameters into temporaries, pass them as (const) references to MatrixBase or ArrayBase (so templatize your function).
+
+  - Functions taking writable (non-const) parameters must take const references and cast away constness within the function body.
+
+  - Functions that take as parameters MatrixBase (or ArrayBase) objects, and potentially need to resize them (in the case where they are resizable), must call resize() on the derived class, as returned by derived().
+*/
+}

diff --git a/doc/HiPerformance.dox b/doc/HiPerformance.dox
new file mode 100644
index 0000000..9cee335
--- /dev/null
+++ b/doc/HiPerformance.dox

@@ -0,0 +1,128 @@
+
+namespace Eigen {
+
+/** \page TopicWritingEfficientProductExpression Writing efficient matrix product expressions
+
+In general achieving good performance with Eigen does no require any special effort:
+simply write your expressions in the most high level way. This is especially true
+for small fixed size matrices. For large matrices, however, it might be useful to
+take some care when writing your expressions in order to minimize useless evaluations
+and optimize the performance.
+In this page we will give a brief overview of the Eigen's internal mechanism to simplify
+and evaluate complex product expressions, and discuss the current limitations.
+In particular we will focus on expressions matching level 2 and 3 BLAS routines, i.e,
+all kind of matrix products and triangular solvers.
+
+Indeed, in Eigen we have implemented a set of highly optimized routines which are very similar
+to BLAS's ones. Unlike BLAS, those routines are made available to user via a high level and
+natural API. Each of these routines can compute in a single evaluation a wide variety of expressions.
+Given an expression, the challenge is then to map it to a minimal set of routines.
+As explained latter, this mechanism has some limitations, and knowing them will allow
+you to write faster code by making your expressions more Eigen friendly.
+
+\section GEMM General Matrix-Matrix product (GEMM)
+
+Let's start with the most common primitive: the matrix product of general dense matrices.
+In the BLAS world this corresponds to the GEMM routine. Our equivalent primitive can
+perform the following operation:
+\f$ C.noalias() += \alpha op1(A) op2(B) \f$
+where A, B, and C are column and/or row major matrices (or sub-matrices),
+alpha is a scalar value, and op1, op2 can be transpose, adjoint, conjugate, or the identity.
+When Eigen detects a matrix product, it analyzes both sides of the product to extract a
+unique scalar factor alpha, and for each side, its effective storage order, shape, and conjugation states.
+More precisely each side is simplified by iteratively removing trivial expressions such as scalar multiple,
+negation and conjugation. Transpose and Block expressions are not evaluated and they only modify the storage order
+and shape. All other expressions are immediately evaluated.
+For instance, the following expression:
+\code m1.noalias() -= s4 * (s1 * m2.adjoint() * (-(s3*m3).conjugate()*s2))  \endcode
+is automatically simplified to:
+\code m1.noalias() += (s1*s2*conj(s3)*s4) * m2.adjoint() * m3.conjugate() \endcode
+which exactly matches our GEMM routine.
+
+\subsection GEMM_Limitations Limitations
+Unfortunately, this simplification mechanism is not perfect yet and not all expressions which could be
+handled by a single GEMM-like call are correctly detected.
+<table class="manual" style="width:100%">
+<tr>
+<th>Not optimal expression</th>
+<th>Evaluated as</th>
+<th>Optimal version (single evaluation)</th>
+<th>Comments</th>
+</tr>
+<tr>
+<td>\code
+m1 += m2 * m3; \endcode</td>
+<td>\code
+temp = m2 * m3;
+m1 += temp; \endcode</td>
+<td>\code
+m1.noalias() += m2 * m3; \endcode</td>
+<td>Use .noalias() to tell Eigen the result and right-hand-sides do not alias. 
+    Otherwise the product m2 * m3 is evaluated into a temporary.</td>
+</tr>
+<tr class="alt">
+<td></td>
+<td></td>
+<td>\code
+m1.noalias() += s1 * (m2 * m3); \endcode</td>
+<td>This is a special feature of Eigen. Here the product between a scalar
+    and a matrix product does not evaluate the matrix product but instead it
+    returns a matrix product expression tracking the scalar scaling factor. <br>
+    Without this optimization, the matrix product would be evaluated into a
+    temporary as in the next example.</td>
+</tr>
+<tr>
+<td>\code
+m1.noalias() += (m2 * m3).adjoint(); \endcode</td>
+<td>\code
+temp = m2 * m3;
+m1 += temp.adjoint(); \endcode</td>
+<td>\code
+m1.noalias() += m3.adjoint()
+*              * m2.adjoint(); \endcode</td>
+<td>This is because the product expression has the EvalBeforeNesting bit which
+    enforces the evaluation of the product by the Tranpose expression.</td>
+</tr>
+<tr class="alt">
+<td>\code
+m1 = m1 + m2 * m3; \endcode</td>
+<td>\code
+temp = m2 * m3;
+m1 = m1 + temp; \endcode</td>
+<td>\code m1.noalias() += m2 * m3; \endcode</td>
+<td>Here there is no way to detect at compile time that the two m1 are the same,
+    and so the matrix product will be immediately evaluated.</td>
+</tr>
+<tr>
+<td>\code
+m1.noalias() = m4 + m2 * m3; \endcode</td>
+<td>\code
+temp = m2 * m3;
+m1 = m4 + temp; \endcode</td>
+<td>\code
+m1 = m4;
+m1.noalias() += m2 * m3; \endcode</td>
+<td>First of all, here the .noalias() in the first expression is useless because
+    m2*m3 will be evaluated anyway. However, note how this expression can be rewritten
+    so that no temporary is required. (tip: for very small fixed size matrix
+    it is slightly better to rewrite it like this: m1.noalias() = m2 * m3; m1 += m4;</td>
+</tr>
+<tr class="alt">
+<td>\code
+m1.noalias() += (s1*m2).block(..) * m3; \endcode</td>
+<td>\code
+temp = (s1*m2).block(..);
+m1 += temp * m3; \endcode</td>
+<td>\code
+m1.noalias() += s1 * m2.block(..) * m3; \endcode</td>
+<td>This is because our expression analyzer is currently not able to extract trivial
+    expressions nested in a Block expression. Therefore the nested scalar
+    multiple cannot be properly extracted.</td>
+</tr>
+</table>
+
+Of course all these remarks hold for all other kind of products involving triangular or selfadjoint matrices.
+
+*/
+
+}

diff --git a/doc/InplaceDecomposition.dox b/doc/InplaceDecomposition.dox
new file mode 100644
index 0000000..cb1c6d4
--- /dev/null
+++ b/doc/InplaceDecomposition.dox

@@ -0,0 +1,115 @@
+namespace Eigen {
+
+/** \eigenManualPage InplaceDecomposition Inplace matrix decompositions
+
+Starting from %Eigen 3.3, the LU, Cholesky, and QR decompositions can operate \em inplace, that is, directly within the given input matrix.
+This feature is especially useful when dealing with huge matrices, and or when the available memory is very limited (embedded systems).
+
+To this end, the respective decomposition class must be instantiated with a Ref<> matrix type, and the decomposition object must be constructed with the input matrix as argument. As an example, let us consider an inplace LU decomposition with partial pivoting.
+
+Let's start with the basic inclusions, and declaration of a 2x2 matrix \c A:
+
+<table class="example">
+<tr><th>code</th><th>output</th></tr>
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp init
+  </td>
+  <td>\snippet TutorialInplaceLU.out init
+  </td>
+</tr>
+</table>
+
+No surprise here! Then, let's declare our inplace LU object \c lu, and check the content of the matrix \c A:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp declaration
+  </td>
+  <td>\snippet TutorialInplaceLU.out declaration
+  </td>
+</tr>
+</table>
+
+Here, the \c lu object computes and stores the \c L and \c U factors within the memory held by the matrix \c A.
+The coefficients of \c A have thus been destroyed during the factorization, and replaced by the L and U factors as one can verify:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp matrixLU
+  </td>
+  <td>\snippet TutorialInplaceLU.out matrixLU
+  </td>
+</tr>
+</table>
+
+Then, one can use the \c lu object as usual, for instance to solve the Ax=b problem:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp solve
+  </td>
+  <td>\snippet TutorialInplaceLU.out solve
+  </td>
+</tr>
+</table>
+
+Here, since the content of the original matrix \c A has been lost, we had to declared a new matrix \c A0 to verify the result.
+
+Since the memory is shared between \c A and \c lu, modifying the matrix \c A will make \c lu invalid.
+This can easily be verified by modifying the content of \c A and trying to solve the initial problem again:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp modifyA
+  </td>
+  <td>\snippet TutorialInplaceLU.out modifyA
+  </td>
+</tr>
+</table>
+
+Note that there is no shared pointer under the hood, it is the \b responsibility \b of \b the \b user to keep the input matrix \c A in life as long as \c lu is living.
+
+If one wants to update the factorization with the modified A, one has to call the compute method as usual:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute
+  </td>
+</tr>
+</table>
+
+Note that calling compute does not change the memory which is referenced by the \c lu object. Therefore, if the compute method is called with another matrix \c A1 different than \c A, then the content of \c A1 won't be modified. This is still the content of \c A that will be used to store the L and U factors of the matrix \c A1.
+This can easily be verified as follows:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis0
+ </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis0
+ </td>
+</tr>
+</table>
+The matrix \c A1 is unchanged, and one can thus solve A1*x=b, and directly check the residual without any copy of \c A1:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis1
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis1
+ </td>
+</tr>
+</table>
+
+
+Here is the list of matrix decompositions supporting this inplace mechanism:
+
+- class LLT
+- class LDLT
+- class PartialPivLU
+- class FullPivLU
+- class HouseholderQR
+- class ColPivHouseholderQR
+- class FullPivHouseholderQR
+- class CompleteOrthogonalDecomposition
+
+*/
+
+}
\ No newline at end of file

diff --git a/doc/InsideEigenExample.dox b/doc/InsideEigenExample.dox
new file mode 100644
index 0000000..ea2275b
--- /dev/null
+++ b/doc/InsideEigenExample.dox

@@ -0,0 +1,500 @@
+namespace Eigen {
+
+/** \page TopicInsideEigenExample What happens inside Eigen, on a simple example
+
+\eigenAutoToc
+
+<hr>
+
+
+Consider the following example program:
+
+\code
+#include<Eigen/Core>
+
+int main()
+{
+  int size = 50;
+  // VectorXf is a vector of floats, with dynamic size.
+  Eigen::VectorXf u(size), v(size), w(size);
+  u = v + w;
+}
+\endcode
+
+The goal of this page is to understand how Eigen compiles it, assuming that SSE2 vectorization is enabled (GCC option -msse2).
+
+\section WhyInteresting Why it's interesting
+
+Maybe you think, that the above example program is so simple, that compiling it shouldn't involve anything interesting. So before starting, let us explain what is nontrivial in compiling it correctly -- that is, producing optimized code -- so that the complexity of Eigen, that we'll explain here, is really useful.
+
+Look at the line of code
+\code
+  u = v + w;   //   (*)
+\endcode
+
+The first important thing about compiling it, is that the arrays should be traversed only once, like
+\code
+  for(int i = 0; i < size; i++) u[i] = v[i] + w[i];
+\endcode
+The problem is that if we make a naive C++ library where the VectorXf class has an operator+ returning a VectorXf, then the line of code (*) will amount to:
+\code
+  VectorXf tmp = v + w;
+  VectorXf u = tmp;
+\endcode
+Obviously, the introduction of the temporary \a tmp here is useless. It has a very bad effect on performance, first because the creation of \a tmp requires a dynamic memory allocation in this context, and second as there are now two for loops:
+\code
+  for(int i = 0; i < size; i++) tmp[i] = v[i] + w[i];
+  for(int i = 0; i < size; i++) u[i] = tmp[i];
+\endcode
+Traversing the arrays twice instead of once is terrible for performance, as it means that we do many redundant memory accesses.
+
+The second important thing about compiling the above program, is to make correct use of SSE2 instructions. Notice that Eigen also supports AltiVec and that all the discussion that we make here applies also to AltiVec.
+
+SSE2, like AltiVec, is a set of instructions allowing to perform computations on packets of 128 bits at once. Since a float is 32 bits, this means that SSE2 instructions can handle 4 floats at once. This means that, if correctly used, they can make our computation go up to 4x faster.
+
+However, in the above program, we have chosen size=50, so our vectors consist of 50 float's, and 50 is not a multiple of 4. This means that we cannot hope to do all of that computation using SSE2 instructions. The second best thing, to which we should aim, is to handle the 48 first coefficients with SSE2 instructions, since 48 is the biggest multiple of 4 below 50, and then handle separately, without SSE2, the 49th and 50th coefficients. Something like this:
+
+\code
+  for(int i = 0; i < 4*(size/4); i+=4) u.packet(i)  = v.packet(i) + w.packet(i);
+  for(int i = 4*(size/4); i < size; i++) u[i] = v[i] + w[i];
+\endcode
+
+So let us look line by line at our example program, and let's follow Eigen as it compiles it.
+
+\section ConstructingVectors Constructing vectors
+
+Let's analyze the first line:
+
+\code
+  Eigen::VectorXf u(size), v(size), w(size);
+\endcode
+
+First of all, VectorXf is the following typedef:
+\code
+  typedef Matrix<float, Dynamic, 1> VectorXf;
+\endcode
+
+The class template Matrix is declared in src/Core/util/ForwardDeclarations.h with 6 template parameters, but the last 3 are automatically determined by the first 3. So you don't need to worry about them for now. Here, Matrix\<float, Dynamic, 1\> means a matrix of floats, with a dynamic number of rows and 1 column.
+
+The Matrix class inherits a base class, MatrixBase. Don't worry about it, for now it suffices to say that MatrixBase is what unifies matrices/vectors and all the expressions types -- more on that below.
+
+When we do
+\code
+  Eigen::VectorXf u(size);
+\endcode
+the constructor that is called is Matrix::Matrix(int), in src/Core/Matrix.h. Besides some assertions, all it does is to construct the \a m_storage member, which is of type DenseStorage\<float, Dynamic, Dynamic, 1\>.
+
+You may wonder, isn't it overengineering to have the storage in a separate class? The reason is that the Matrix class template covers all kinds of matrices and vector: both fixed-size and dynamic-size. The storage method is not the same in these two cases. For fixed-size, the matrix coefficients are stored as a plain member array. For dynamic-size, the coefficients will be stored as a pointer to a dynamically-allocated array. Because of this, we need to abstract storage away from the Matrix class. That's DenseStorage.
+
+Let's look at this constructor, in src/Core/DenseStorage.h. You can see that there are many partial template specializations of DenseStorages here, treating separately the cases where dimensions are Dynamic or fixed at compile-time. The partial specialization that we are looking at is:
+\code
+template<typename T, int _Cols> class DenseStorage<T, Dynamic, Dynamic, _Cols>
+\endcode
+
+Here, the constructor called is DenseStorage::DenseStorage(int size, int rows, int columns)
+with size=50, rows=50, columns=1.
+
+Here is this constructor:
+\code
+inline DenseStorage(int size, int rows, int) : m_data(internal::aligned_new<T>(size)), m_rows(rows) {}
+\endcode
+
+Here, the \a m_data member is the actual array of coefficients of the matrix. As you see, it is dynamically allocated. Rather than calling new[] or malloc(), as you can see, we have our own internal::aligned_new defined in src/Core/util/Memory.h. What it does is that if vectorization is enabled, then it uses a platform-specific call to allocate a 128-bit-aligned array, as that is very useful for vectorization with both SSE2 and AltiVec. If vectorization is disabled, it amounts to the standard new[].
+
+As you can see, the constructor also sets the \a m_rows member to \a size. Notice that there is no \a m_columns member: indeed, in this partial specialization of DenseStorage, we know the number of columns at compile-time, since the _Cols template parameter is different from Dynamic. Namely, in our case, _Cols is 1, which is to say that our vector is just a matrix with 1 column. Hence, there is no need to store the number of columns as a runtime variable.
+
+When you call VectorXf::data() to get the pointer to the array of coefficients, it returns DenseStorage::data() which returns the \a m_data member.
+
+When you call VectorXf::size() to get the size of the vector, this is actually a method in the base class MatrixBase. It determines that the vector is a column-vector, since ColsAtCompileTime==1 (this comes from the template parameters in the typedef VectorXf). It deduces that the size is the number of rows, so it returns VectorXf::rows(), which returns DenseStorage::rows(), which returns the \a m_rows member, which was set to \a size by the constructor.
+
+\section ConstructionOfSumXpr Construction of the sum expression
+
+Now that our vectors are constructed, let's move on to the next line:
+
+\code
+u = v + w;
+\endcode
+
+The executive summary is that operator+ returns a "sum of vectors" expression, but doesn't actually perform the computation. It is the operator=, whose call occurs thereafter, that does the computation.
+
+Let us now see what Eigen does when it sees this:
+
+\code
+v + w
+\endcode
+
+Here, v and w are of type VectorXf, which is a typedef for a specialization of Matrix (as we explained above), which is a subclass of MatrixBase. So what is being called is
+
+\code
+MatrixBase::operator+(const MatrixBase&)
+\endcode
+
+The return type of this operator is
+\code
+CwiseBinaryOp<internal::scalar_sum_op<float>, VectorXf, VectorXf>
+\endcode
+The CwiseBinaryOp class is our first encounter with an expression template. As we said, the operator+ doesn't by itself perform any computation, it just returns an abstract "sum of vectors" expression. Since there are also "difference of vectors" and "coefficient-wise product of vectors" expressions, we unify them all as "coefficient-wise binary operations", which we abbreviate as "CwiseBinaryOp". "Coefficient-wise" means that the operations is performed coefficient by coefficient. "binary" means that there are two operands -- we are adding two vectors with one another.
+
+Now you might ask, what if we did something like
+
+\code
+v + w + u;
+\endcode
+
+The first v + w would return a CwiseBinaryOp as above, so in order for this to compile, we'd need to define an operator+ also in the class CwiseBinaryOp... at this point it starts looking like a nightmare: are we going to have to define all operators in each of the expression classes (as you guessed, CwiseBinaryOp is only one of many) ? This looks like a dead end!
+
+The solution is that CwiseBinaryOp itself, as well as Matrix and all the other expression types, is a subclass of MatrixBase. So it is enough to define once and for all the operators in class MatrixBase.
+
+Since MatrixBase is the common base class of different subclasses, the aspects that depend on the subclass must be abstracted from MatrixBase. This is called polymorphism.
+
+The classical approach to polymorphism in C++ is by means of virtual functions. This is dynamic polymorphism. Here we don't want dynamic polymorphism because the whole design of Eigen is based around the assumption that all the complexity, all the abstraction, gets resolved at compile-time. This is crucial: if the abstraction can't get resolved at compile-time, Eigen's compile-time optimization mechanisms become useless, not to mention that if that abstraction has to be resolved at runtime it'll incur an overhead by itself.
+
+Here, what we want is to have a single class MatrixBase as the base of many subclasses, in such a way that each MatrixBase object (be it a matrix, or vector, or any kind of expression) knows at compile-time (as opposed to run-time) of which particular subclass it is an object (i.e. whether it is a matrix, or an expression, and what kind of expression).
+
+The solution is the <a href="http://en.wikipedia.org/wiki/Curiously_Recurring_Template_Pattern">Curiously Recurring Template Pattern</a>. Let's do the break now. Hopefully you can read this wikipedia page during the break if needed, but it won't be allowed during the exam.
+
+In short, MatrixBase takes a template parameter \a Derived. Whenever we define a subclass Subclass, we actually make Subclass inherit MatrixBase\<Subclass\>. The point is that different subclasses inherit different MatrixBase types. Thanks to this, whenever we have an object of a subclass, and we call on it some MatrixBase method, we still remember even from inside the MatrixBase method which particular subclass we're talking about.
+
+This means that we can put almost all the methods and operators in the base class MatrixBase, and have only the bare minimum in the subclasses. If you look at the subclasses in Eigen, like for instance the CwiseBinaryOp class, they have very few methods. There are coeff() and sometimes coeffRef() methods for access to the coefficients, there are rows() and cols() methods returning the number of rows and columns, but there isn't much more than that. All the meat is in MatrixBase, so it only needs to be coded once for all kinds of expressions, matrices, and vectors.
+
+So let's end this digression and come back to the piece of code from our example program that we were currently analyzing,
+
+\code
+v + w
+\endcode
+
+Now that MatrixBase is a good friend, let's write fully the prototype of the operator+ that gets called here (this code is from src/Core/MatrixBase.h):
+
+\code
+template<typename Derived>
+class MatrixBase
+{
+  // ...
+
+  template<typename OtherDerived>
+  const CwiseBinaryOp<internal::scalar_sum_op<typename internal::traits<Derived>::Scalar>, Derived, OtherDerived>
+  operator+(const MatrixBase<OtherDerived> &other) const;
+
+  // ...
+};
+\endcode
+
+Here of course, \a Derived and \a OtherDerived are VectorXf.
+
+As we said, CwiseBinaryOp is also used for other operations such as substration, so it takes another template parameter determining the operation that will be applied to coefficients. This template parameter is a functor, that is, a class in which we have an operator() so it behaves like a function. Here, the functor used is internal::scalar_sum_op. It is defined in src/Core/Functors.h.
+
+Let us now explain the internal::traits here. The internal::scalar_sum_op class takes one template parameter: the type of the numbers to handle. Here of course we want to pass the scalar type (a.k.a. numeric type) of VectorXf, which is \c float. How do we determine which is the scalar type of \a Derived ? Throughout Eigen, all matrix and expression types define a typedef \a Scalar which gives its scalar type. For example, VectorXf::Scalar is a typedef for \c float. So here, if life was easy, we could find the numeric type of \a Derived as just
+\code
+typename Derived::Scalar
+\endcode
+Unfortunately, we can't do that here, as the compiler would complain that the type Derived hasn't yet been defined. So we use a workaround: in src/Core/util/ForwardDeclarations.h, we declared (not defined!) all our subclasses, like Matrix, and we also declared the following class template:
+\code
+template<typename T> struct internal::traits;
+\endcode
+In src/Core/Matrix.h, right \em before the definition of class Matrix, we define a partial specialization of internal::traits for T=Matrix\<any template parameters\>. In this specialization of internal::traits, we define the Scalar typedef. So when we actually define Matrix, it is legal to refer to "typename internal::traits\<Matrix\>::Scalar".
+
+Anyway, we have declared our operator+. In our case, where \a Derived and \a OtherDerived are VectorXf, the above declaration amounts to:
+\code
+class MatrixBase<VectorXf>
+{
+  // ...
+
+  const CwiseBinaryOp<internal::scalar_sum_op<float>, VectorXf, VectorXf>
+  operator+(const MatrixBase<VectorXf> &other) const;
+
+  // ...
+};
+\endcode
+
+Let's now jump to src/Core/CwiseBinaryOp.h to see how it is defined. As you can see there, all it does is to return a CwiseBinaryOp object, and this object is just storing references to the left-hand-side and right-hand-side expressions -- here, these are the vectors \a v and \a w. Well, the CwiseBinaryOp object is also storing an instance of the (empty) functor class, but you shouldn't worry about it as that is a minor implementation detail.
+
+Thus, the operator+ hasn't performed any actual computation. To summarize, the operation \a v + \a w just returned an object of type CwiseBinaryOp which did nothing else than just storing references to \a v and \a w.
+
+\section Assignment The assignment
+
+<div class="warningbox">
+<strong>PLEASE HELP US IMPROVING THIS SECTION.</strong>
+This page reflects how %Eigen worked until 3.2, but since %Eigen 3.3 the assignment is more sophisticated as it involves an Assignment expression, and the creation of so called evaluator which are responsible for the evaluation of each kind of expressions.
+</div>
+
+At this point, the expression \a v + \a w has finished evaluating, so, in the process of compiling the line of code
+\code
+u = v + w;
+\endcode
+we now enter the operator=.
+
+What operator= is being called here? The vector u is an object of class VectorXf, i.e. Matrix. In src/Core/Matrix.h, inside the definition of class Matrix, we see this:
+\code
+    template<typename OtherDerived>
+    inline Matrix& operator=(const MatrixBase<OtherDerived>& other)
+    {
+      eigen_assert(m_storage.data()!=0 && "you cannot use operator= with a non initialized matrix (instead use set()");
+      return Base::operator=(other.derived());
+    }
+\endcode
+Here, Base is a typedef for MatrixBase\<Matrix\>. So, what is being called is the operator= of MatrixBase. Let's see its prototype in src/Core/MatrixBase.h:
+\code
+    template<typename OtherDerived>
+    Derived& operator=(const MatrixBase<OtherDerived>& other);
+\endcode
+Here, \a Derived is VectorXf (since u is a VectorXf) and \a OtherDerived is CwiseBinaryOp. More specifically, as explained in the previous section, \a OtherDerived is:
+\code
+CwiseBinaryOp<internal::scalar_sum_op<float>, VectorXf, VectorXf>
+\endcode
+So the full prototype of the operator= being called is:
+\code
+VectorXf& MatrixBase<VectorXf>::operator=(const MatrixBase<CwiseBinaryOp<internal::scalar_sum_op<float>, VectorXf, VectorXf> > & other);
+\endcode
+This operator= literally reads "copying a sum of two VectorXf's into another VectorXf".
+
+Let's now look at the implementation of this operator=. It resides in the file src/Core/Assign.h.
+
+What we can see there is:
+\code
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived& MatrixBase<Derived>
+  ::operator=(const MatrixBase<OtherDerived>& other)
+{
+  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+}
+\endcode
+
+OK so our next task is to understand internal::assign_selector :)
+
+Here is its declaration (all that is still in the same file src/Core/Assign.h)
+\code
+template<typename Derived, typename OtherDerived,
+         bool EvalBeforeAssigning = int(OtherDerived::Flags) & EvalBeforeAssigningBit,
+         bool NeedToTranspose = Derived::IsVectorAtCompileTime
+                && OtherDerived::IsVectorAtCompileTime
+                && int(Derived::RowsAtCompileTime) == int(OtherDerived::ColsAtCompileTime)
+                && int(Derived::ColsAtCompileTime) == int(OtherDerived::RowsAtCompileTime)
+                && int(Derived::SizeAtCompileTime) != 1>
+struct internal::assign_selector;
+\endcode
+
+So internal::assign_selector takes 4 template parameters, but the 2 last ones are automatically determined by the 2 first ones.
+
+EvalBeforeAssigning is here to enforce the EvalBeforeAssigningBit. As explained <a href="TopicLazyEvaluation.html">here</a>, certain expressions have this flag which makes them automatically evaluate into temporaries before assigning them to another expression. This is the case of the Product expression, in order to avoid strange aliasing effects when doing "m = m * m;" However, of course here our CwiseBinaryOp expression doesn't have the EvalBeforeAssigningBit: we said since the beginning that we didn't want a temporary to be introduced here. So if you go to src/Core/CwiseBinaryOp.h, you'll see that the Flags in internal::traits\<CwiseBinaryOp\> don't include the EvalBeforeAssigningBit. The Flags member of CwiseBinaryOp is then imported from the internal::traits by the EIGEN_GENERIC_PUBLIC_INTERFACE macro. Anyway, here the template parameter EvalBeforeAssigning has the value \c false.
+
+NeedToTranspose is here for the case where the user wants to copy a row-vector into a column-vector. We allow this as a special exception to the general rule that in assignments we require the dimesions to match. Anyway, here both the left-hand and right-hand sides are column vectors, in the sense that ColsAtCompileTime is equal to 1. So NeedToTranspose is \c false too.
+
+So, here we are in the partial specialization:
+\code
+internal::assign_selector<Derived, OtherDerived, false, false>
+\endcode
+
+Here's how it is defined:
+\code
+template<typename Derived, typename OtherDerived>
+struct internal::assign_selector<Derived,OtherDerived,false,false> {
+  static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
+};
+\endcode
+
+OK so now our next job is to understand how lazyAssign works :)
+
+\code
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived& MatrixBase<Derived>
+  ::lazyAssign(const MatrixBase<OtherDerived>& other)
+{
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  eigen_assert(rows() == other.rows() && cols() == other.cols());
+  internal::assign_impl<Derived, OtherDerived>::run(derived(),other.derived());
+  return derived();
+}
+\endcode
+
+What do we see here? Some assertions, and then the only interesting line is:
+\code
+  internal::assign_impl<Derived, OtherDerived>::run(derived(),other.derived());
+\endcode
+
+OK so now we want to know what is inside internal::assign_impl.
+
+Here is its declaration:
+\code
+template<typename Derived1, typename Derived2,
+         int Vectorization = internal::assign_traits<Derived1, Derived2>::Vectorization,
+         int Unrolling = internal::assign_traits<Derived1, Derived2>::Unrolling>
+struct internal::assign_impl;
+\endcode
+Again, internal::assign_selector takes 4 template parameters, but the 2 last ones are automatically determined by the 2 first ones.
+
+These two parameters \a Vectorization and \a Unrolling are determined by a helper class internal::assign_traits. Its job is to determine which vectorization strategy to use (that is \a Vectorization) and which unrolling strategy to use (that is \a Unrolling).
+
+We'll not enter into the details of how these strategies are chosen (this is in the implementation of internal::assign_traits at the top of the same file). Let's just say that here \a Vectorization has the value \a LinearVectorization, and \a Unrolling has the value \a NoUnrolling (the latter is obvious since our vectors have dynamic size so there's no way to unroll the loop at compile-time).
+
+So the partial specialization of internal::assign_impl that we're looking at is:
+\code
+internal::assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
+\endcode
+
+Here is how it's defined:
+\code
+template<typename Derived1, typename Derived2>
+struct internal::assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
+{
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    const int size = dst.size();
+    const int packetSize = internal::packet_traits<typename Derived1::Scalar>::size;
+    const int alignedStart = internal::assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                           : internal::first_aligned(&dst.coeffRef(0), size);
+    const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+
+    for(int index = 0; index < alignedStart; index++)
+      dst.copyCoeff(index, src);
+
+    for(int index = alignedStart; index < alignedEnd; index += packetSize)
+    {
+      dst.template copyPacket<Derived2, Aligned, internal::assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
+    }
+
+    for(int index = alignedEnd; index < size; index++)
+      dst.copyCoeff(index, src);
+  }
+};
+\endcode
+
+Here's how it works. \a LinearVectorization means that the left-hand and right-hand side expression can be accessed linearly i.e. you can refer to their coefficients by one integer \a index, as opposed to having to refer to its coefficients by two integers \a row, \a column.
+
+As we said at the beginning, vectorization works with blocks of 4 floats. Here, \a PacketSize is 4.
+
+There are two potential problems that we need to deal with:
+\li first, vectorization works much better if the packets are 128-bit-aligned. This is especially important for write access. So when writing to the coefficients of \a dst, we want to group these coefficients by packets of 4 such that each of these packets is 128-bit-aligned. In general, this requires to skip a few coefficients at the beginning of \a dst. This is the purpose of \a alignedStart. We then copy these first few coefficients one by one, not by packets. However, in our case, the \a dst expression is a VectorXf and remember that in the construction of the vectors we allocated aligned arrays. Thanks to \a DstIsAligned, Eigen remembers that without having to do any runtime check, so \a alignedStart is zero and this part is avoided altogether.
+\li second, the number of coefficients to copy is not in general a multiple of \a packetSize. Here, there are 50 coefficients to copy and \a packetSize is 4. So we'll have to copy the last 2 coefficients one by one, not by packets. Here, \a alignedEnd is 48.
+
+Now come the actual loops.
+
+First, the vectorized part: the 48 first coefficients out of 50 will be copied by packets of 4:
+\code
+  for(int index = alignedStart; index < alignedEnd; index += packetSize)
+  {
+    dst.template copyPacket<Derived2, Aligned, internal::assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
+  }
+\endcode
+
+What is copyPacket? It is defined in src/Core/Coeffs.h:
+\code
+template<typename Derived>
+template<typename OtherDerived, int StoreMode, int LoadMode>
+inline void MatrixBase<Derived>::copyPacket(int index, const MatrixBase<OtherDerived>& other)
+{
+  eigen_internal_assert(index >= 0 && index < size());
+  derived().template writePacket<StoreMode>(index,
+    other.derived().template packet<LoadMode>(index));
+}
+\endcode
+
+OK, what are writePacket() and packet() here?
+
+First, writePacket() here is a method on the left-hand side VectorXf. So we go to src/Core/Matrix.h to look at its definition:
+\code
+template<int StoreMode>
+inline void writePacket(int index, const PacketScalar& x)
+{
+  internal::pstoret<Scalar, PacketScalar, StoreMode>(m_storage.data() + index, x);
+}
+\endcode
+Here, \a StoreMode is \a #Aligned, indicating that we are doing a 128-bit-aligned write access, \a PacketScalar is a type representing a "SSE packet of 4 floats" and internal::pstoret is a function writing such a packet in memory. Their definitions are architecture-specific, we find them in src/Core/arch/SSE/PacketMath.h:
+
+The line in src/Core/arch/SSE/PacketMath.h that determines the PacketScalar type (via a typedef in Matrix.h) is:
+\code
+template<> struct internal::packet_traits<float>  { typedef __m128  type; enum {size=4}; };
+\endcode
+Here, __m128 is a SSE-specific type. Notice that the enum \a size here is what was used to define \a packetSize above.
+
+And here is the implementation of internal::pstoret:
+\code
+template<> inline void internal::pstore(float*  to, const __m128&  from) { _mm_store_ps(to, from); }
+\endcode
+Here, __mm_store_ps is a SSE-specific intrinsic function, representing a single SSE instruction. The difference between internal::pstore and internal::pstoret is that internal::pstoret is a dispatcher handling both the aligned and unaligned cases, you find its definition in src/Core/GenericPacketMath.h:
+\code
+template<typename Scalar, typename Packet, int LoadMode>
+inline void internal::pstoret(Scalar* to, const Packet& from)
+{
+  if(LoadMode == Aligned)
+    internal::pstore(to, from);
+  else
+    internal::pstoreu(to, from);
+}
+\endcode
+
+OK, that explains how writePacket() works. Now let's look into the packet() call. Remember that we are analyzing this line of code inside copyPacket():
+\code
+derived().template writePacket<StoreMode>(index,
+    other.derived().template packet<LoadMode>(index));
+\endcode
+
+Here, \a other is our sum expression \a v + \a w. The .derived() is just casting from MatrixBase to the subclass which here is CwiseBinaryOp. So let's go to src/Core/CwiseBinaryOp.h:
+\code
+class CwiseBinaryOp
+{
+  // ...
+    template<int LoadMode>
+    inline PacketScalar packet(int index) const
+    {
+      return m_functor.packetOp(m_lhs.template packet<LoadMode>(index), m_rhs.template packet<LoadMode>(index));
+    }
+};
+\endcode
+Here, \a m_lhs is the vector \a v, and \a m_rhs is the vector \a w. So the packet() function here is Matrix::packet(). The template parameter \a LoadMode is \a #Aligned. So we're looking at
+\code
+class Matrix
+{
+  // ...
+    template<int LoadMode>
+    inline PacketScalar packet(int index) const
+    {
+      return internal::ploadt<Scalar, LoadMode>(m_storage.data() + index);
+    }
+};
+\endcode
+We let you look up the definition of internal::ploadt in GenericPacketMath.h and the internal::pload in src/Core/arch/SSE/PacketMath.h. It is very similar to the above for internal::pstore.
+
+Let's go back to CwiseBinaryOp::packet(). Once the packets from the vectors \a v and \a w have been returned, what does this function do? It calls m_functor.packetOp() on them. What is m_functor? Here we must remember what particular template specialization of CwiseBinaryOp we're dealing with:
+\code
+CwiseBinaryOp<internal::scalar_sum_op<float>, VectorXf, VectorXf>
+\endcode
+So m_functor is an object of the empty class internal::scalar_sum_op<float>. As we mentioned above, don't worry about why we constructed an object of this empty class at all -- it's an implementation detail, the point is that some other functors need to store member data.
+
+Anyway, internal::scalar_sum_op is defined in src/Core/Functors.h:
+\code
+template<typename Scalar> struct internal::scalar_sum_op EIGEN_EMPTY_STRUCT {
+  inline const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+  template<typename PacketScalar>
+  inline const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return internal::padd(a,b); }
+};
+\endcode
+As you can see, all what packetOp() does is to call internal::padd on the two packets. Here is the definition of internal::padd from src/Core/arch/SSE/PacketMath.h:
+\code
+template<> inline __m128  internal::padd(const __m128&  a, const __m128&  b) { return _mm_add_ps(a,b); }
+\endcode
+Here, _mm_add_ps is a SSE-specific intrinsic function, representing a single SSE instruction.
+
+To summarize, the loop
+\code
+  for(int index = alignedStart; index < alignedEnd; index += packetSize)
+  {
+    dst.template copyPacket<Derived2, Aligned, internal::assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
+  }
+\endcode
+has been compiled to the following code: for \a index going from 0 to the 11 ( = 48/4 - 1), read the i-th packet (of 4 floats) from the vector v and the i-th packet from the vector w using two __mm_load_ps SSE instructions, then add them together using a __mm_add_ps instruction, then store the result using a __mm_store_ps instruction.
+
+There remains the second loop handling the last few (here, the last 2) coefficients:
+\code
+  for(int index = alignedEnd; index < size; index++)
+    dst.copyCoeff(index, src);
+\endcode
+However, it works just like the one we just explained, it is just simpler because there is no SSE vectorization involved here. copyPacket() becomes copyCoeff(), packet() becomes coeff(), writePacket() becomes coeffRef(). If you followed us this far, you can probably understand this part by yourself.
+
+We see that all the C++ abstraction of Eigen goes away during compilation and that we indeed are precisely controlling which assembly instructions we emit. Such is the beauty of C++! Since we have such precise control over the emitted assembly instructions, but such complex logic to choose the right instructions, we can say that Eigen really behaves like an optimizing compiler. If you prefer, you could say that Eigen behaves like a script for the compiler. In a sense, C++ template metaprogramming is scripting the compiler -- and it's been shown that this scripting language is Turing-complete. See <a href="http://en.wikipedia.org/wiki/Template_metaprogramming"> Wikipedia</a>.
+
+*/
+
+}

diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox
new file mode 100644
index 0000000..ddbf38d
--- /dev/null
+++ b/doc/LeastSquares.dox

@@ -0,0 +1,75 @@
+namespace Eigen {
+
+/** \eigenManualPage LeastSquares Solving linear least squares systems
+
+This page describes how to solve linear least squares systems using %Eigen. An overdetermined system
+of equations, say \a Ax = \a b, has no solutions. In this case, it makes sense to search for the
+vector \a x which is closest to being a solution, in the sense that the difference \a Ax - \a b is
+as small as possible. This \a x is called the least square solution (if the Euclidean norm is used).
+
+The three methods discussed on this page are the SVD decomposition, the QR decomposition and normal
+equations. Of these, the SVD decomposition is generally the most accurate but the slowest, normal
+equations is the fastest but least accurate, and the QR decomposition is in between.
+
+\eigenAutoToc
+
+
+\section LeastSquaresSVD Using the SVD decomposition
+
+The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to
+solve linear squares systems. It is not enough to compute only the singular values (the default for
+this class); you also need the singular vectors but the thin SVD decomposition suffices for
+computing least squares solutions:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgSVDSolve.cpp </td>
+  <td>\verbinclude TutorialLinAlgSVDSolve.out </td>
+</tr>
+</table>
+
+This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
+If you just need to solve the least squares problem, but are not interested in the SVD per se, a
+faster alternative method is CompleteOrthogonalDecomposition. 
+
+
+\section LeastSquaresQR Using the QR decomposition
+
+The solve() method in QR decomposition classes also computes the least squares solution. There are
+three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is
+not rull rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and
+FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR).
+Here is an example with column pivoting:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include LeastSquaresQR.cpp </td>
+  <td>\verbinclude LeastSquaresQR.out </td>
+</tr>
+</table>
+
+
+\section LeastSquaresNormalEquations Using normal equations
+
+Finding the least squares solution of \a Ax = \a b is equivalent to solving the normal equation
+<i>A</i><sup>T</sup><i>Ax</i> = <i>A</i><sup>T</sup><i>b</i>. This leads to the following code
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include LeastSquaresNormalEquations.cpp </td>
+  <td>\verbinclude LeastSquaresNormalEquations.out </td>
+</tr>
+</table>
+
+This method is usually the fastest, especially when \a A is "tall and skinny". However, if the
+matrix \a A is even mildly ill-conditioned, this is not a good method, because the condition number
+of <i>A</i><sup>T</sup><i>A</i> is the square of the condition number of \a A. This means that you
+lose roughly twice as many digits of accuracy using the normal equation, compared to the more stable
+methods mentioned above.
+
+*/
+
+}
\ No newline at end of file

diff --git a/doc/Manual.dox b/doc/Manual.dox
new file mode 100644
index 0000000..84f0db6
--- /dev/null
+++ b/doc/Manual.dox

@@ -0,0 +1,192 @@
+
+// This file strutures pages and modules into a convenient hierarchical structure.
+
+namespace Eigen {
+
+/** \page UserManual_CustomizingEigen Extending/Customizing Eigen
+  %Eigen can be extended in several ways, for instance, by defining global methods, by inserting custom methods within main %Eigen's classes through the \ref TopicCustomizing_Plugins "plugin" mechanism, by adding support to \ref TopicCustomizing_CustomScalar "custom scalar types" etc. See below for the respective sub-topics.
+  - \subpage TopicCustomizing_Plugins
+  - \subpage TopicCustomizing_InheritingMatrix
+  - \subpage TopicCustomizing_CustomScalar
+  - \subpage TopicCustomizing_NullaryExpr
+  - \subpage TopicNewExpressionType
+  \sa \ref TopicPreprocessorDirectives
+*/
+
+
+/** \page UserManual_Generalities General topics
+  - \subpage TopicFunctionTakingEigenTypes
+  - \subpage TopicPreprocessorDirectives
+  - \subpage TopicAssertions
+  - \subpage TopicMultiThreading
+  - \subpage TopicUsingBlasLapack
+  - \subpage TopicUsingIntelMKL
+  - \subpage TopicCUDA
+  - \subpage TopicPitfalls
+  - \subpage TopicTemplateKeyword
+  - \subpage UserManual_UnderstandingEigen
+  - \subpage TopicCMakeGuide
+*/
+
+/** \page UserManual_UnderstandingEigen Understanding Eigen
+  - \subpage TopicInsideEigenExample
+  - \subpage TopicClassHierarchy
+  - \subpage TopicLazyEvaluation
+*/
+
+/** \page UnclassifiedPages Unclassified pages
+  - \subpage TopicResizing
+  - \subpage TopicVectorization
+  - \subpage TopicEigenExpressionTemplates
+  - \subpage TopicScalarTypes
+  - \subpage GettingStarted
+  - \subpage TutorialSparse_example_details
+  - \subpage TopicWritingEfficientProductExpression
+  - \subpage Experimental
+*/
+
+
+/** \defgroup Support_modules Support modules
+  * Category of modules which add support for external libraries.
+  */
+
+
+/** \defgroup DenseMatrixManipulation_chapter Dense matrix and array manipulation */
+/** \defgroup DenseMatrixManipulation_Alignement Alignment issues */
+/** \defgroup DenseMatrixManipulation_Reference Reference */
+
+/** \addtogroup TutorialMatrixClass
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialMatrixArithmetic
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialArrayClass
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialBlockOperations
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialSlicingIndexing
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialAdvancedInitialization
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialReductionsVisitorsBroadcasting
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialReshape
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialSTL
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialMapClass
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TopicAliasing
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TopicStorageOrders
+    \ingroup DenseMatrixManipulation_chapter */
+
+/** \addtogroup DenseMatrixManipulation_Alignement
+    \ingroup DenseMatrixManipulation_chapter        */
+/**     \addtogroup TopicUnalignedArrayAssert
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicFixedSizeVectorizable
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicStructHavingEigenMembers
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicStlContainers
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicPassingByValue
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicWrongStackAlignment
+        \ingroup DenseMatrixManipulation_Alignement */
+    
+/** \addtogroup DenseMatrixManipulation_Reference
+    \ingroup DenseMatrixManipulation_chapter       */
+/**     \addtogroup Core_Module
+        \ingroup DenseMatrixManipulation_Reference */  
+/**     \addtogroup Jacobi_Module
+        \ingroup DenseMatrixManipulation_Reference */ 
+/**     \addtogroup Householder_Module
+        \ingroup DenseMatrixManipulation_Reference */ 
+
+/** \addtogroup CoeffwiseMathFunctions
+    \ingroup DenseMatrixManipulation_chapter */
+
+/** \addtogroup QuickRefPage
+    \ingroup DenseMatrixManipulation_chapter */
+
+
+/** \defgroup DenseLinearSolvers_chapter Dense linear problems and decompositions */
+/** \defgroup DenseLinearSolvers_Reference Reference */
+
+/** \addtogroup TutorialLinearAlgebra
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup TopicLinearAlgebraDecompositions
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup LeastSquares 
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup InplaceDecomposition
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup DenseDecompositionBenchmark
+    \ingroup DenseLinearSolvers_chapter */
+
+/** \addtogroup DenseLinearSolvers_Reference
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup Cholesky_Module
+    \ingroup DenseLinearSolvers_Reference */
+/** \addtogroup LU_Module
+    \ingroup DenseLinearSolvers_Reference */
+/** \addtogroup QR_Module
+    \ingroup DenseLinearSolvers_Reference */
+/** \addtogroup SVD_Module
+    \ingroup DenseLinearSolvers_Reference*/
+/** \addtogroup Eigenvalues_Module
+    \ingroup DenseLinearSolvers_Reference */
+
+
+
+
+/** \defgroup Sparse_chapter Sparse linear algebra */
+/** \defgroup Sparse_Reference Reference */
+
+/** \addtogroup TutorialSparse
+    \ingroup Sparse_chapter */
+/** \addtogroup TopicSparseSystems
+    \ingroup Sparse_chapter */
+/** \addtogroup MatrixfreeSolverExample
+    \ingroup Sparse_chapter */
+
+/** \addtogroup Sparse_Reference
+    \ingroup Sparse_chapter */
+/** \addtogroup SparseCore_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup OrderingMethods_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup SparseCholesky_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup SparseLU_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup SparseQR_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup IterativeLinearSolvers_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup Sparse_Module
+    \ingroup Sparse_Reference */
+/** \addtogroup Support_modules
+    \ingroup Sparse_Reference */    
+
+/** \addtogroup SparseQuickRefPage
+    \ingroup Sparse_chapter */
+
+
+/** \defgroup Geometry_chapter Geometry */
+/** \defgroup Geometry_Reference Reference */
+
+/** \addtogroup TutorialGeometry
+    \ingroup Geometry_chapter */
+    
+/** \addtogroup Geometry_Reference
+    \ingroup Geometry_chapter */
+/** \addtogroup Geometry_Module
+    \ingroup Geometry_Reference */
+/** \addtogroup Splines_Module
+    \ingroup Geometry_Reference */
+
+/** \internal \brief Namespace containing low-level routines from the %Eigen library. */
+namespace internal {}
+}

diff --git a/doc/MatrixfreeSolverExample.dox b/doc/MatrixfreeSolverExample.dox
new file mode 100644
index 0000000..3efa292
--- /dev/null
+++ b/doc/MatrixfreeSolverExample.dox

@@ -0,0 +1,20 @@
+
+namespace Eigen {
+
+/**
+
+\eigenManualPage MatrixfreeSolverExample Matrix-free solvers
+
+Iterative solvers such as ConjugateGradient and BiCGSTAB can be used in a matrix free context. To this end, user must provide a wrapper class inheriting EigenBase<> and implementing the following methods:
+ - \c Index \c rows() and \c Index \c cols(): returns number of rows and columns respectively
+ - \c operator* with your type and an %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)
+
+\c Eigen::internal::traits<> must also be specialized for the wrapper type.
+
+Here is a complete example wrapping an Eigen::SparseMatrix:
+\include matrixfree_cg.cpp
+Output: \verbinclude matrixfree_cg.out
+
+*/
+
+}
\ No newline at end of file

diff --git a/doc/NewExpressionType.dox b/doc/NewExpressionType.dox
new file mode 100644
index 0000000..c2f2433
--- /dev/null
+++ b/doc/NewExpressionType.dox

@@ -0,0 +1,143 @@
+namespace Eigen {
+
+/** \page TopicNewExpressionType Adding a new expression type
+
+<!--<span style="font-size:130%; color:red; font-weight: 900;"></span>-->
+\warning
+Disclaimer: this page is tailored to very advanced users who are not afraid of dealing with some %Eigen's internal aspects.
+In most cases, a custom expression can be avoided by either using custom \ref MatrixBase::unaryExpr "unary" or \ref MatrixBase::binaryExpr "binary" functors,
+while extremely complex matrix manipulations can be achieved by a nullary functors as described in the \ref TopicCustomizing_NullaryExpr "previous page".
+
+This page describes with the help of an example how to implement a new
+light-weight expression type in %Eigen. This consists of three parts:
+the expression type itself, a traits class containing compile-time
+information about the expression, and the evaluator class which is
+used to evaluate the expression to a matrix.
+
+\b TO \b DO: Write a page explaining the design, with details on
+vectorization etc., and refer to that page here.
+
+
+\eigenAutoToc
+
+\section TopicSetting The setting
+
+A circulant matrix is a matrix where each column is the same as the
+column to the left, except that it is cyclically shifted downwards.
+For example, here is a 4-by-4 circulant matrix:
+\f[ \begin{bmatrix} 
+    1 & 8 & 4 & 2 \\ 
+    2 & 1 & 8 & 4 \\
+    4 & 2 & 1 & 8 \\
+    8 & 4 & 2 & 1
+\end{bmatrix} \f]
+A circulant matrix is uniquely determined by its first column. We wish
+to write a function \c makeCirculant which, given the first column,
+returns an expression representing the circulant matrix.
+
+For simplicity, we restrict the \c makeCirculant function to dense
+matrices. It may make sense to also allow arrays, or sparse matrices,
+but we will not do so here. We also do not want to support
+vectorization.
+
+
+\section TopicPreamble Getting started
+
+We will present the file implementing the \c makeCirculant function
+part by part. We start by including the appropriate header files and
+forward declaring the expression class, which we will call
+\c Circulant. The \c makeCirculant function will return an object of
+this type. The class \c Circulant is in fact a class template; the
+template argument \c ArgType refers to the type of the vector passed
+to the \c makeCirculant function.
+
+\include make_circulant.cpp.preamble
+
+
+\section TopicTraits The traits class
+
+For every expression class \c X, there should be a traits class 
+\c Traits<X> in the \c Eigen::internal namespace containing
+information about \c X known as compile time.
+
+As explained in \ref TopicSetting, we designed the \c Circulant
+expression class to refer to dense matrices. The entries of the
+circulant matrix have the same type as the entries of the vector
+passed to the \c makeCirculant function. The type used to index the
+entries is also the same. Again for simplicity, we will only return
+column-major matrices. Finally, the circulant matrix is a square
+matrix (number of rows equals number of columns), and the number of
+rows equals the number of rows of the column vector passed to the
+\c makeCirculant function. If this is a dynamic-size vector, then the
+size of the circulant matrix is not known at compile-time.
+
+This leads to the following code:
+
+\include make_circulant.cpp.traits
+
+
+\section TopicExpression The expression class
+
+The next step is to define the expression class itself. In our case,
+we want to inherit from \c MatrixBase in order to expose the interface
+for dense matrices. In the constructor, we check that we are passed a
+column vector (see \ref TopicAssertions) and we store the vector from
+which we are going to build the circulant matrix in the member
+variable \c m_arg. Finally, the expression class should compute the
+size of the corresponding circulant matrix. As explained above, this
+is a square matrix with as many columns as the vector used to
+construct the matrix.
+
+\b TO \b DO: What about the \c Nested typedef? It seems to be
+necessary; is this only temporary?
+
+\include make_circulant.cpp.expression
+
+
+\section TopicEvaluator The evaluator
+
+The last big fragment implements the evaluator for the \c Circulant
+expression. The evaluator computes the entries of the circulant
+matrix; this is done in the \c .coeff() member function. The entries
+are computed by finding the corresponding entry of the vector from
+which the circulant matrix is constructed. Getting this entry may
+actually be non-trivial when the circulant matrix is constructed from
+a vector which is given by a complicated expression, so we use the
+evaluator which corresponds to the vector.
+
+The \c CoeffReadCost constant records the cost of computing an entry
+of the circulant matrix; we ignore the index computation and say that
+this is the same as the cost of computing an entry of the vector from
+which the circulant matrix is constructed.
+
+In the constructor, we save the evaluator for the column vector which
+defined the circulant matrix. We also save the size of that vector;
+remember that we can query an expression object to find the size but
+not the evaluator. 
+
+\include make_circulant.cpp.evaluator
+
+
+\section TopicEntry The entry point
+
+After all this, the \c makeCirculant function is very simple. It
+simply creates an expression object and returns it.
+
+\include make_circulant.cpp.entry
+
+
+\section TopicMain A simple main function for testing
+
+Finally, a short \c main function that shows how the \c makeCirculant
+function can be called.
+
+\include make_circulant.cpp.main
+
+If all the fragments are combined, the following output is produced,
+showing that the program works as expected:
+
+\include make_circulant.out
+
+*/
+}
+

diff --git a/doc/Overview.dox b/doc/Overview.dox
new file mode 100644
index 0000000..43a1287
--- /dev/null
+++ b/doc/Overview.dox

@@ -0,0 +1,28 @@
+namespace Eigen {
+
+/** \mainpage notitle
+
+This is the API documentation for Eigen3. You can <a href="eigen-doc.tgz">download</a> it as a tgz archive for offline reading.
+
+For a first contact with Eigen, the best place is to have a look at the \link GettingStarted getting started \endlink page that show you how to write and compile your first program with Eigen.
+
+Then, the \b quick \b reference \b pages give you a quite complete description of the API in a very condensed format that is specially useful to recall the syntax of a particular feature, or to have a quick look at the API. They currently cover the two following feature sets, and more will come in the future:
+  - \link QuickRefPage [QuickRef] Dense matrix and array manipulations \endlink
+  - \link SparseQuickRefPage [QuickRef] Sparse linear algebra \endlink
+
+You're a MatLab user? There is also a <a href="AsciiQuickReference.txt">short ASCII reference</a> with Matlab translations.
+  
+The \b main \b documentation is organized into \em chapters covering different domains of features.
+They are themselves composed of \em user \em manual pages describing the different features in a comprehensive way, and \em reference pages that gives you access to the API documentation through the related Eigen's \em modules and \em classes.
+
+Under the \subpage UserManual_CustomizingEigen section, you will find discussions and examples on extending %Eigen's features and supporting custom scalar types.
+
+Under the \subpage UserManual_Generalities section, you will find documentation on more general topics such as preprocessor directives, controlling assertions, multi-threading, MKL support, some Eigen's internal insights, and much more...
+
+Finally, do not miss the search engine, useful to quickly get to the documentation of a given class or function.
+
+Want more? Checkout the <a href="unsupported/index.html">\em unsupported \em modules </a> documentation.
+
+*/
+
+}

diff --git a/doc/PassingByValue.dox b/doc/PassingByValue.dox
new file mode 100644
index 0000000..9254fe6
--- /dev/null
+++ b/doc/PassingByValue.dox

@@ -0,0 +1,40 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicPassingByValue Passing Eigen objects by value to functions
+
+Passing objects by value is almost always a very bad idea in C++, as this means useless copies, and one should pass them by reference instead.
+
+With %Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these %Eigen objects have alignment modifiers that aren't respected when they are passed by value.
+
+For example, a function like this, where \c v is passed by value:
+
+\code
+void my_function(Eigen::Vector2d v);
+\endcode
+
+needs to be rewritten as follows, passing \c v by const reference:
+
+\code
+void my_function(const Eigen::Vector2d& v);
+\endcode
+
+Likewise if you have a class having an %Eigen object as member:
+
+\code
+struct Foo
+{
+  Eigen::Vector2d v;
+};
+void my_function(Foo v);
+\endcode
+
+This function also needs to be rewritten like this:
+\code
+void my_function(const Foo& v);
+\endcode
+
+Note that on the other hand, there is no problem with functions that return objects by value.
+
+*/
+
+}

diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox
new file mode 100644
index 0000000..85282bd
--- /dev/null
+++ b/doc/Pitfalls.dox

@@ -0,0 +1,149 @@
+namespace Eigen {
+
+/** \page TopicPitfalls Common pitfalls
+
+
+\section TopicPitfalls_template_keyword Compilation error with template methods
+
+See this \link TopicTemplateKeyword page \endlink.
+
+
+\section TopicPitfalls_aliasing Aliasing
+
+Don't miss this \link TopicAliasing page \endlink on aliasing,
+especially if you got wrong results in statements where the destination appears on the right hand side of the expression.
+
+
+\section TopicPitfalls_alignment_issue Alignment Issues (runtime assertion)
+
+%Eigen does explicit vectorization, and while that is appreciated by many users, that also leads to some issues in special situations where data alignment is compromised.
+Indeed, prior to C++17,  C++ does not have quite good enough support for explicit data alignment.
+In that case your program hits an assertion failure (that is, a "controlled crash") with a message that tells you to consult this page:
+\code
+http://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html
+\endcode
+Have a look at \link TopicUnalignedArrayAssert it \endlink and see for yourself if that's something that you can cope with.
+It contains detailed information about how to deal with each known cause for that issue.
+
+Now what if you don't care about vectorization and so don't want to be annoyed with these alignment issues? Then read \link getrid how to get rid of them \endlink.
+
+
+\section TopicPitfalls_auto_keyword C++11 and the auto keyword
+
+In short: do not use the auto keywords with %Eigen's expressions, unless you are 100% sure about what you are doing. In particular, do not use the auto keyword as a replacement for a \c Matrix<> type. Here is an example:
+
+\code
+MatrixXd A, B;
+auto C = A*B;
+for(...) { ... w = C * v;  ...}
+\endcode
+
+In this example, the type of C is not a \c MatrixXd but an abstract expression representing a matrix product and storing references to \c A and \c B.
+Therefore, the product of \c A*B will be carried out multiple times, once per iteration of the for loop.
+Moreover, if the coefficients of `A` or `B` change during the iteration, then `C` will evaluate to different values as in the following example:
+
+\code
+MatrixXd A = ..., B = ...;
+auto C = A*B;
+MatrixXd R1 = C;
+A = ...;
+MatrixXd R2 = C;
+\endcode
+for which we end up with `R1` &ne; `R2`.
+
+
+Here is another example leading to a segfault:
+\code
+auto C = ((A+B).eval()).transpose();
+// do something with C
+\endcode
+The problem is that \c eval() returns a temporary object (in this case a \c MatrixXd) which is then referenced by the \c Transpose<> expression.
+However, this temporary is deleted right after the first line, and then the \c C expression references a dead object.
+One possible fix consists in applying \c eval() on the whole expression:
+\code
+auto C = (A+B).transpose().eval();
+\endcode
+
+The same issue might occur when sub expressions are automatically evaluated by %Eigen as in the following example:
+\code
+VectorXd u, v;
+auto C = u + (A*v).normalized();
+// do something with C
+\endcode
+Here the \c normalized() method has to evaluate the expensive product \c A*v to avoid evaluating it twice.
+Again, one possible fix is to call \c .eval() on the whole expression:
+\code
+auto C = (u + (A*v).normalized()).eval();
+\endcode
+In this case, \c C will be a regular \c VectorXd object.
+Note that DenseBase::eval() is smart enough to avoid copies when the underlying expression is already a plain \c Matrix<>.
+
+
+\section TopicPitfalls_header_issues Header Issues (failure to compile)
+
+With all libraries, one must check the documentation for which header to include.
+The same is true with %Eigen, but slightly worse: with %Eigen, a method in a class may require an additional \c \#include over what the class itself requires!
+For example, if you want to use the \c cross() method on a vector (it computes a cross-product) then you need to:
+\code
+#include<Eigen/Geometry>
+\endcode
+We try to always document this, but do tell us if we forgot an occurrence.
+
+
+\section TopicPitfalls_ternary_operator Ternary operator
+
+In short: avoid the use of the ternary operator <code>(COND ? THEN : ELSE)</code> with %Eigen's expressions for the \c THEN and \c ELSE statements.
+To see why, let's consider the following example:
+\code
+Vector3f A;
+A << 1, 2, 3;
+Vector3f B = ((1 < 0) ? (A.reverse()) : A);
+\endcode
+This example will return <code>B = 3, 2, 1</code>. Do you see why?
+The reason is that in c++ the type of the \c ELSE statement is inferred from the type of the \c THEN expression such that both match.
+Since \c THEN is a <code>Reverse<Vector3f></code>, the \c ELSE statement A is converted to a <code>Reverse<Vector3f></code>, and the compiler thus generates:
+\code
+Vector3f B = ((1 < 0) ? (A.reverse()) : Reverse<Vector3f>(A));
+\endcode
+In this very particular case, a workaround would be to call A.reverse().eval() for the \c THEN statement, but the safest and fastest is really to avoid this ternary operator with %Eigen's expressions and use a if/else construct.
+
+
+\section TopicPitfalls_pass_by_value Pass-by-value
+
+If you don't know why passing-by-value is wrong with %Eigen, read this \link TopicPassingByValue page \endlink first.
+
+While you may be extremely careful and use care to make sure that all of your code that explicitly uses %Eigen types is pass-by-reference you have to watch out for templates which define the argument types at compile time.
+
+If a template has a function that takes arguments pass-by-value, and the relevant template parameter ends up being an %Eigen type, then you will of course have the same alignment problems that you would in an explicitly defined function passing %Eigen types by reference.
+
+Using %Eigen types with other third party libraries or even the STL can present the same problem.
+<code>boost::bind</code> for example uses pass-by-value to store arguments in the returned functor.
+This will of course be a problem.
+
+There are at least two ways around this:
+  - If the value you are passing is guaranteed to be around for the life of the functor, you can use boost::ref() to wrap the value as you pass it to boost::bind. Generally this is not a solution for values on the stack as if the functor ever gets passed to a lower or independent scope, the object may be gone by the time it's attempted to be used.
+  - The other option is to make your functions take a reference counted pointer like boost::shared_ptr as the argument. This avoids needing to worry about managing the lifetime of the object being passed.
+
+
+\section TopicPitfalls_matrix_bool Matrices with boolean coefficients
+
+The current behaviour of using \c Matrix with boolean coefficients is inconsistent and likely to change in future versions of Eigen, so please use it carefully!
+
+A simple example for such an inconsistency is 
+
+\code
+template<int Size>
+void foo() {
+  Eigen::Matrix<bool, Size, Size> A, B, C;
+  A.setOnes();
+  B.setOnes();
+
+  C = A * B - A * B;
+  std::cout << C << "\n";
+}
+\endcode
+
+since calling \c foo<3>() prints the zero matrix while calling \c foo<10>() prints the identity matrix.
+
+*/
+}

diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
new file mode 100644
index 0000000..0f545b0
--- /dev/null
+++ b/doc/PreprocessorDirectives.dox

@@ -0,0 +1,179 @@
+namespace Eigen {
+
+/** \page TopicPreprocessorDirectives Preprocessor directives
+
+You can control some aspects of %Eigen by defining the preprocessor tokens using \c \#define. These macros
+should be defined before any %Eigen headers are included. Often they are best set in the project options.
+
+This page lists the preprocessor tokens recognized by %Eigen.
+
+\eigenAutoToc
+
+
+\section TopicPreprocessorDirectivesMajor Macros with major effects
+
+These macros have a major effect and typically break the API (Application Programming Interface) and/or the
+ABI (Application Binary Interface). This can be rather dangerous: if parts of your program are compiled with
+one option, and other parts (or libraries that you use) are compiled with another option, your program may
+fail to link or exhibit subtle bugs. Nevertheless, these options can be useful for people who know what they
+are doing.
+
+ - \b EIGEN2_SUPPORT and \b EIGEN2_SUPPORT_STAGEnn_xxx are disabled starting from the 3.3 release.
+   Defining one of these will raise a compile-error. If you need to compile Eigen2 code,
+   <a href="http://eigen.tuxfamily.org/index.php?title=Eigen2">check this site</a>.
+ - \b EIGEN_DEFAULT_DENSE_INDEX_TYPE - the type for column and row indices in matrices, vectors and array
+   (DenseBase::Index). Set to \c std::ptrdiff_t by default.
+ - \b EIGEN_DEFAULT_IO_FORMAT - the IOFormat to use when printing a matrix if no %IOFormat is specified.
+   Defaults to the %IOFormat constructed by the default constructor IOFormat::IOFormat().
+ - \b EIGEN_INITIALIZE_MATRICES_BY_ZERO - if defined, all entries of newly constructed matrices and arrays are
+   initialized to zero, as are new entries in matrices and arrays after resizing. Not defined by default.
+   \warning The unary (resp. binary) constructor of \c 1x1 (resp. \c 2x1 or \c 1x2) fixed size matrices is
+   always interpreted as an initialization constructor where the argument(s) are the coefficient values
+   and not the sizes. For instance, \code Vector2d v(2,1); \endcode will create a vector with coeficients [2,1],
+   and \b not a \c 2x1 vector initialized with zeros (i.e., [0,0]). If such cases might occur, then it is
+   recommended to use the default constructor with a explicit call to resize:
+   \code
+   Matrix<?,SizeAtCompileTime,1> v;
+   v.resize(size);
+   Matrix<?,RowsAtCompileTime,ColsAtCompileTime> m;
+   m.resize(rows,cols);
+   \endcode
+ - \b EIGEN_INITIALIZE_MATRICES_BY_NAN - if defined, all entries of newly constructed matrices and arrays are
+   initialized to NaN, as are new entries in matrices and arrays after resizing. This option is especially
+   useful for debugging purpose, though a memory tool like <a href="http://valgrind.org/">valgrind</a> is
+   preferable. Not defined by default.
+   \warning See the documentation of \c EIGEN_INITIALIZE_MATRICES_BY_ZERO for a discussion on a limitations
+   of these macros when applied to \c 1x1, \c 1x2, and \c 2x1 fixed-size matrices.
+ - \b EIGEN_NO_AUTOMATIC_RESIZING - if defined, the matrices (or arrays) on both sides of an assignment 
+   <tt>a = b</tt> have to be of the same size; otherwise, %Eigen automatically resizes \c a so that it is of
+   the correct size. Not defined by default.
+
+
+\section TopicPreprocessorDirectivesCppVersion C++ standard features
+
+By default, %Eigen strive to automatically detect and enable language features at compile-time based on
+the information provided by the compiler.
+
+ - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER.
+   Possible values are: 03, 11, 14, 17, etc. If not defined (the default), %Eigen enables all features supported
+   by the compiler.
+
+Individual features can be explicitly enabled or disabled by defining the following token to 0 or 1 respectively.
+For instance, one might limit the C++ version to C++03 by defining EIGEN_MAX_CPP_VER=03, but still enable C99 math
+functions by defining EIGEN_HAS_C99_MATH=1.
+
+ - \b EIGEN_HAS_C99_MATH - controls the usage of C99 math functions such as erf, erfc, lgamma, etc.
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CXX11_MATH - controls the implementation of some functions such as round, logp1, isinf, isnan, etc.
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_RVALUE_REFERENCES - defines whether rvalue references are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_STD_RESULT_OF - defines whether std::result_of is supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_VARIADIC_TEMPLATES - defines whether variadic templates are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CONSTEXPR - defines whether relaxed const expression are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<14.
+ - \b EIGEN_HAS_CXX11_CONTAINERS - defines whether STL's containers follows C++11 specifications
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CXX11_NOEXCEPT - defines whether noexcept is supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_NO_IO - Disables any usage and support for `<iostreams>`.
+
+\section TopicPreprocessorDirectivesAssertions Assertions
+
+The %Eigen library contains many assertions to guard against programming errors, both at compile time and at
+run time. However, these assertions do cost time and can thus be turned off.
+
+ - \b EIGEN_NO_DEBUG - disables %Eigen's assertions if defined. Not defined by default, unless the
+   \c NDEBUG macro is defined (this is a standard C++ macro which disables all asserts). 
+ - \b EIGEN_NO_STATIC_ASSERT - if defined, compile-time static assertions are replaced by runtime assertions; 
+   this saves compilation time. Not defined by default.
+ - \b eigen_assert - macro with one argument that is used inside %Eigen for assertions. By default, it is
+   basically defined to be \c assert, which aborts the program if the assertion is violated. Redefine this
+   macro if you want to do something else, like throwing an exception.
+ - \b EIGEN_MPL2_ONLY - disable non MPL2 compatible features, or in other words disable the features which
+   are still under the LGPL.
+
+
+\section TopicPreprocessorDirectivesPerformance Alignment, vectorization and performance tweaking
+
+ - \b \c EIGEN_MALLOC_ALREADY_ALIGNED - Can be set to 0 or 1 to tell whether default system \c malloc already
+   returns aligned buffers. In not defined, then this information is automatically deduced from the compiler
+   and system preprocessor tokens.
+ - \b \c EIGEN_MAX_ALIGN_BYTES - Must be a power of two, or 0. Defines an upper bound on the memory boundary in bytes on which dynamically and statically allocated data may be aligned by %Eigen. If not defined, a default value is automatically computed based on architecture, compiler, and OS.
+ This option is typically used to enforce binary compatibility between code/libraries compiled with different SIMD options. For instance, one may compile AVX code and enforce ABI compatibility with existing SSE code by defining \c EIGEN_MAX_ALIGN_BYTES=16. In the other way round, since by default AVX implies 32 bytes alignment for best performance, one can compile SSE code to be ABI compatible with AVX code by defining \c EIGEN_MAX_ALIGN_BYTES=32.
+ - \b \c EIGEN_MAX_STATIC_ALIGN_BYTES - Same as \c EIGEN_MAX_ALIGN_BYTES but for statically allocated data only. By default, if only  \c EIGEN_MAX_ALIGN_BYTES is defined, then \c EIGEN_MAX_STATIC_ALIGN_BYTES == \c EIGEN_MAX_ALIGN_BYTES, otherwise a default value is automatically computed based on architecture, compiler, and OS (can be smaller than the default value of EIGEN_MAX_ALIGN_BYTES on architectures that do not support stack alignment).
+ Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a diserable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted.
+ - \b \c EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
+   See \ref TopicMultiThreading for details.
+ - \b \c EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
+   alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
+ - \b \c EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
+   If set to 0 (disabled), then expression for which the destination cannot be aligned are not vectorized (e.g., unaligned
+   small fixed size vectors or matrices)
+ - \b \c EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
+   enables the SSE vectorization of sin() and cos(), and speedups sqrt() for single precision. Defined to 1 by default.
+   Define it to 0 to disable.
+ - \b \c EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
+   unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
+   correspond to the number of iterations or the number of instructions. The default is value 110.
+ - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
+   temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
+   this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
+ - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only,
+   and never called from device code.
+ - \b \c EIGEN_STRONG_INLINE - This macro is used to qualify critical functions and methods that we expect the compiler to inline.
+   By default it is defined to \c __forceinline for MSVC and ICC, and to \c inline for other compilers. A tipical usage is to
+   define it to \c inline for MSVC users wanting faster compilation times, at the risk of performance degradations in some rare
+   cases for which MSVC inliner fails to do a good job.
+ - \b \c EIGEN_DEFAULT_L1_CACHE_SIZE - Sets the default L1 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime.
+ - \b \c EIGEN_DEFAULT_L2_CACHE_SIZE - Sets the default L2 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime.
+ - \b \c EIGEN_DEFAULT_L3_CACHE_SIZE - Sets the default L3 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime.
+
+ - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b \c EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
+ - \c EIGEN_DONT_ALIGN_STATICALLY - Deprecated, it is a synonym for \c EIGEN_MAX_STATIC_ALIGN_BYTES=0. It disables alignment of arrays on the stack. Not defined by default, unless \c EIGEN_DONT_ALIGN is defined.
+
+
+\section TopicPreprocessorDirectivesPlugins Plugins
+
+It is possible to add new methods to many fundamental classes in %Eigen by writing a plugin. As explained in
+the section \ref TopicCustomizing_Plugins, the plugin is specified by defining a \c EIGEN_xxx_PLUGIN macro. The
+following macros are supported; none of them are defined by default.
+
+ - \b EIGEN_ARRAY_PLUGIN - filename of plugin for extending the Array class.
+ - \b EIGEN_ARRAYBASE_PLUGIN - filename of plugin for extending the ArrayBase class.
+ - \b EIGEN_CWISE_PLUGIN - filename of plugin for extending the Cwise class.
+ - \b EIGEN_DENSEBASE_PLUGIN - filename of plugin for extending the DenseBase class.
+ - \b EIGEN_DYNAMICSPARSEMATRIX_PLUGIN - filename of plugin for extending the DynamicSparseMatrix class.
+ - \b EIGEN_FUNCTORS_PLUGIN - filename of plugin for adding new functors and specializations of functor_traits.
+ - \b EIGEN_MAPBASE_PLUGIN - filename of plugin for extending the MapBase class.
+ - \b EIGEN_MATRIX_PLUGIN - filename of plugin for extending the Matrix class.
+ - \b EIGEN_MATRIXBASE_PLUGIN - filename of plugin for extending the MatrixBase class.
+ - \b EIGEN_PLAINOBJECTBASE_PLUGIN - filename of plugin for extending the PlainObjectBase class.
+ - \b EIGEN_QUATERNION_PLUGIN - filename of plugin for extending the Quaternion class.
+ - \b EIGEN_QUATERNIONBASE_PLUGIN - filename of plugin for extending the QuaternionBase class.
+ - \b EIGEN_SPARSEMATRIX_PLUGIN - filename of plugin for extending the SparseMatrix class.
+ - \b EIGEN_SPARSEMATRIXBASE_PLUGIN - filename of plugin for extending the SparseMatrixBase class.
+ - \b EIGEN_SPARSEVECTOR_PLUGIN - filename of plugin for extending the SparseVector class.
+ - \b EIGEN_TRANSFORM_PLUGIN - filename of plugin for extending the Transform class.
+ - \b EIGEN_VECTORWISEOP_PLUGIN - filename of plugin for extending the VectorwiseOp class.
+
+\section TopicPreprocessorDirectivesDevelopers Macros for Eigen developers
+
+These macros are mainly meant for people developing %Eigen and for testing purposes. Even though, they might be useful for power users and the curious for debugging and testing purpose, they \b should \b not \b be \b used by real-word code.
+
+ - \b EIGEN_DEFAULT_TO_ROW_MAJOR - when defined, the default storage order for matrices becomes row-major
+   instead of column-major. Not defined by default.
+ - \b EIGEN_INTERNAL_DEBUGGING - if defined, enables assertions in %Eigen's internal routines. This is useful
+   for debugging %Eigen itself. Not defined by default.
+ - \b EIGEN_NO_MALLOC - if defined, any request from inside the %Eigen to allocate memory from the heap
+   results in an assertion failure. This is useful to check that some routine does not allocate memory
+   dynamically. Not defined by default.
+ - \b EIGEN_RUNTIME_NO_MALLOC - if defined, a new switch is introduced which can be turned on and off by
+   calling <tt>set_is_malloc_allowed(bool)</tt>. If malloc is not allowed and %Eigen tries to allocate memory
+   dynamically anyway, an assertion failure results. Not defined by default.
+
+*/
+
+}

diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox
new file mode 100644
index 0000000..c5dfce4
--- /dev/null
+++ b/doc/QuickReference.dox

@@ -0,0 +1,805 @@
+namespace Eigen {
+
+/** \eigenManualPage QuickRefPage Quick reference guide
+
+\eigenAutoToc
+
+<hr>
+
+<a href="#" class="top">top</a>
+\section QuickRef_Headers Modules and Header files
+
+The Eigen library is divided in a Core module and several additional modules. Each module has a corresponding header file which has to be included in order to use the module. The \c %Dense and \c Eigen header files are provided to conveniently gain access to several modules at once.
+
+<table class="manual">
+<tr><th>Module</th><th>Header file</th><th>Contents</th></tr>
+<tr            ><td>\link Core_Module Core \endlink</td><td>\code#include <Eigen/Core>\endcode</td><td>Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation</td></tr>
+<tr class="alt"><td>\link Geometry_Module Geometry \endlink</td><td>\code#include <Eigen/Geometry>\endcode</td><td>Transform, Translation, Scaling, Rotation2D and 3D rotations (Quaternion, AngleAxis)</td></tr>
+<tr            ><td>\link LU_Module LU \endlink</td><td>\code#include <Eigen/LU>\endcode</td><td>Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)</td></tr>
+<tr class="alt"><td>\link Cholesky_Module Cholesky \endlink</td><td>\code#include <Eigen/Cholesky>\endcode</td><td>LLT and LDLT Cholesky factorization with solver</td></tr>
+<tr            ><td>\link Householder_Module Householder \endlink</td><td>\code#include <Eigen/Householder>\endcode</td><td>Householder transformations; this module is used by several linear algebra modules</td></tr>
+<tr class="alt"><td>\link SVD_Module SVD \endlink</td><td>\code#include <Eigen/SVD>\endcode</td><td>SVD decompositions with least-squares solver (JacobiSVD, BDCSVD)</td></tr>
+<tr            ><td>\link QR_Module QR \endlink</td><td>\code#include <Eigen/QR>\endcode</td><td>QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)</td></tr>
+<tr class="alt"><td>\link Eigenvalues_Module Eigenvalues \endlink</td><td>\code#include <Eigen/Eigenvalues>\endcode</td><td>Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)</td></tr>
+<tr            ><td>\link Sparse_Module Sparse \endlink</td><td>\code#include <Eigen/Sparse>\endcode</td><td>%Sparse matrix storage and related basic linear algebra (SparseMatrix, SparseVector) \n (see \ref SparseQuickRefPage for details on sparse modules)</td></tr>
+<tr class="alt"><td></td><td>\code#include <Eigen/Dense>\endcode</td><td>Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files</td></tr>
+<tr            ><td></td><td>\code#include <Eigen/Eigen>\endcode</td><td>Includes %Dense and %Sparse header files (the whole Eigen library)</td></tr>
+</table>
+
+<a href="#" class="top">top</a>
+\section QuickRef_Types Array, matrix and vector types
+
+
+\b Recall: Eigen provides two kinds of dense objects: mathematical matrices and vectors which are both represented by the template class Matrix, and general 1D and 2D arrays represented by the template class Array:
+\code
+typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, Options> MyMatrixType;
+typedef Array<Scalar, RowsAtCompileTime, ColsAtCompileTime, Options> MyArrayType;
+\endcode
+
+\li \c Scalar is the scalar type of the coefficients (e.g., \c float, \c double, \c bool, \c int, etc.).
+\li \c RowsAtCompileTime and \c ColsAtCompileTime are the number of rows and columns of the matrix as known at compile-time or \c Dynamic.
+\li \c Options can be \c ColMajor or \c RowMajor, default is \c ColMajor. (see class Matrix for more options)
+
+All combinations are allowed: you can have a matrix with a fixed number of rows and a dynamic number of columns, etc. The following are all valid:
+\code
+Matrix<double, 6, Dynamic>                  // Dynamic number of columns (heap allocation)
+Matrix<double, Dynamic, 2>                  // Dynamic number of rows (heap allocation)
+Matrix<double, Dynamic, Dynamic, RowMajor>  // Fully dynamic, row major (heap allocation)
+Matrix<double, 13, 3>                       // Fully fixed (usually allocated on stack)
+\endcode
+
+In most cases, you can simply use one of the convenience typedefs for \ref matrixtypedefs "matrices" and \ref arraytypedefs "arrays". Some examples:
+<table class="example">
+<tr><th>Matrices</th><th>Arrays</th></tr>
+<tr><td>\code
+Matrix<float,Dynamic,Dynamic>   <=>   MatrixXf
+Matrix<double,Dynamic,1>        <=>   VectorXd
+Matrix<int,1,Dynamic>           <=>   RowVectorXi
+Matrix<float,3,3>               <=>   Matrix3f
+Matrix<float,4,1>               <=>   Vector4f
+\endcode</td><td>\code
+Array<float,Dynamic,Dynamic>    <=>   ArrayXXf
+Array<double,Dynamic,1>         <=>   ArrayXd
+Array<int,1,Dynamic>            <=>   RowArrayXi
+Array<float,3,3>                <=>   Array33f
+Array<float,4,1>                <=>   Array4f
+\endcode</td></tr>
+</table>
+
+Conversion between the matrix and array worlds:
+\code
+Array44f a1, a2;
+Matrix4f m1, m2;
+m1 = a1 * a2;                     // coeffwise product, implicit conversion from array to matrix.
+a1 = m1 * m2;                     // matrix product, implicit conversion from matrix to array.
+a2 = a1 + m1.array();             // mixing array and matrix is forbidden
+m2 = a1.matrix() + m1;            // and explicit conversion is required.
+ArrayWrapper<Matrix4f> m1a(m1);   // m1a is an alias for m1.array(), they share the same coefficients
+MatrixWrapper<Array44f> a1m(a1);
+\endcode
+
+In the rest of this document we will use the following symbols to emphasize the features which are specifics to a given kind of object:
+\li <a name="matrixonly"></a>\matrixworld linear algebra matrix and vector only
+\li <a name="arrayonly"></a>\arrayworld array objects only
+
+\subsection QuickRef_Basics Basic matrix manipulation
+
+<table class="manual">
+<tr><th></th><th>1D objects</th><th>2D objects</th><th>Notes</th></tr>
+<tr><td>Constructors</td>
+<td>\code
+Vector4d  v4;
+Vector2f  v1(x, y);
+Array3i   v2(x, y, z);
+Vector4d  v3(x, y, z, w);
+
+VectorXf  v5; // empty object
+ArrayXf   v6(size);
+\endcode</td><td>\code
+Matrix4f  m1;
+
+
+
+
+MatrixXf  m5; // empty object
+MatrixXf  m6(nb_rows, nb_columns);
+\endcode</td><td class="note">
+By default, the coefficients \n are left uninitialized</td></tr>
+<tr class="alt"><td>Comma initializer</td>
+<td>\code
+Vector3f  v1;     v1 << x, y, z;
+ArrayXf   v2(4);  v2 << 1, 2, 3, 4;
+
+\endcode</td><td>\code
+Matrix3f  m1;   m1 << 1, 2, 3,
+                      4, 5, 6,
+                      7, 8, 9;
+\endcode</td><td></td></tr>
+
+<tr><td>Comma initializer (bis)</td>
+<td colspan="2">
+\include Tutorial_commainit_02.cpp
+</td>
+<td>
+output:
+\verbinclude Tutorial_commainit_02.out
+</td>
+</tr>
+
+<tr class="alt"><td>Runtime info</td>
+<td>\code
+vector.size();
+
+vector.innerStride();
+vector.data();
+\endcode</td><td>\code
+matrix.rows();          matrix.cols();
+matrix.innerSize();     matrix.outerSize();
+matrix.innerStride();   matrix.outerStride();
+matrix.data();
+\endcode</td><td class="note">Inner/Outer* are storage order dependent</td></tr>
+<tr><td>Compile-time info</td>
+<td colspan="2">\code
+ObjectType::Scalar              ObjectType::RowsAtCompileTime
+ObjectType::RealScalar          ObjectType::ColsAtCompileTime
+ObjectType::Index               ObjectType::SizeAtCompileTime
+\endcode</td><td></td></tr>
+<tr class="alt"><td>Resizing</td>
+<td>\code
+vector.resize(size);
+
+
+vector.resizeLike(other_vector);
+vector.conservativeResize(size);
+\endcode</td><td>\code
+matrix.resize(nb_rows, nb_cols);
+matrix.resize(Eigen::NoChange, nb_cols);
+matrix.resize(nb_rows, Eigen::NoChange);
+matrix.resizeLike(other_matrix);
+matrix.conservativeResize(nb_rows, nb_cols);
+\endcode</td><td class="note">no-op if the new sizes match,<br/>otherwise data are lost<br/><br/>resizing with data preservation</td></tr>
+
+<tr><td>Coeff access with \n range checking</td>
+<td>\code
+vector(i)     vector.x()
+vector[i]     vector.y()
+              vector.z()
+              vector.w()
+\endcode</td><td>\code
+matrix(i,j)
+\endcode</td><td class="note">Range checking is disabled if \n NDEBUG or EIGEN_NO_DEBUG is defined</td></tr>
+
+<tr class="alt"><td>Coeff access without \n range checking</td>
+<td>\code
+vector.coeff(i)
+vector.coeffRef(i)
+\endcode</td><td>\code
+matrix.coeff(i,j)
+matrix.coeffRef(i,j)
+\endcode</td><td></td></tr>
+
+<tr><td>Assignment/copy</td>
+<td colspan="2">\code
+object = expression;
+object_of_float = expression_of_double.cast<float>();
+\endcode</td><td class="note">the destination is automatically resized (if possible)</td></tr>
+
+</table>
+
+\subsection QuickRef_PredefMat Predefined Matrices
+
+<table class="manual">
+<tr>
+  <th>Fixed-size matrix or vector</th>
+  <th>Dynamic-size matrix</th>
+  <th>Dynamic-size vector</th>
+</tr>
+<tr style="border-bottom-style: none;">
+  <td>
+\code
+typedef {Matrix3f|Array33f} FixedXD;
+FixedXD x;
+
+x = FixedXD::Zero();
+x = FixedXD::Ones();
+x = FixedXD::Constant(value);
+x = FixedXD::Random();
+x = FixedXD::LinSpaced(size, low, high);
+
+x.setZero();
+x.setOnes();
+x.setConstant(value);
+x.setRandom();
+x.setLinSpaced(size, low, high);
+\endcode
+  </td>
+  <td>
+\code
+typedef {MatrixXf|ArrayXXf} Dynamic2D;
+Dynamic2D x;
+
+x = Dynamic2D::Zero(rows, cols);
+x = Dynamic2D::Ones(rows, cols);
+x = Dynamic2D::Constant(rows, cols, value);
+x = Dynamic2D::Random(rows, cols);
+N/A
+
+x.setZero(rows, cols);
+x.setOnes(rows, cols);
+x.setConstant(rows, cols, value);
+x.setRandom(rows, cols);
+N/A
+\endcode
+  </td>
+  <td>
+\code
+typedef {VectorXf|ArrayXf} Dynamic1D;
+Dynamic1D x;
+
+x = Dynamic1D::Zero(size);
+x = Dynamic1D::Ones(size);
+x = Dynamic1D::Constant(size, value);
+x = Dynamic1D::Random(size);
+x = Dynamic1D::LinSpaced(size, low, high);
+
+x.setZero(size);
+x.setOnes(size);
+x.setConstant(size, value);
+x.setRandom(size);
+x.setLinSpaced(size, low, high);
+\endcode
+  </td>
+</tr>
+
+<tr><td colspan="3">Identity and \link MatrixBase::Unit basis vectors \endlink \matrixworld</td></tr>
+<tr style="border-bottom-style: none;">
+  <td>
+\code
+x = FixedXD::Identity();
+x.setIdentity();
+
+Vector3f::UnitX() // 1 0 0
+Vector3f::UnitY() // 0 1 0
+Vector3f::UnitZ() // 0 0 1
+Vector4f::Unit(i)
+x.setUnit(i);
+\endcode
+  </td>
+  <td>
+\code
+x = Dynamic2D::Identity(rows, cols);
+x.setIdentity(rows, cols);
+
+
+
+N/A
+\endcode
+  </td>
+  <td>\code
+N/A
+
+
+VectorXf::Unit(size,i)
+x.setUnit(size,i);
+VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
+                    == Vector4f::UnitY()
+\endcode
+  </td>
+</tr>
+</table>
+
+Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes.
+For instance:
+\code
+MatrixXi M(3,3);
+M.setIdentity();
+\endcode
+
+\subsection QuickRef_Map Mapping external arrays
+
+<table class="manual">
+<tr>
+<td>Contiguous \n memory</td>
+<td>\code
+float data[] = {1,2,3,4};
+Map<Vector3f> v1(data);       // uses v1 as a Vector3f object
+Map<ArrayXf>  v2(data,3);     // uses v2 as a ArrayXf object
+Map<Array22f> m1(data);       // uses m1 as a Array22f object
+Map<MatrixXf> m2(data,2,2);   // uses m2 as a MatrixXf object
+\endcode</td>
+</tr>
+<tr>
+<td>Typical usage \n of strides</td>
+<td>\code
+float data[] = {1,2,3,4,5,6,7,8,9};
+Map<VectorXf,0,InnerStride<2> >  v1(data,3);                      // = [1,3,5]
+Map<VectorXf,0,InnerStride<> >   v2(data,3,InnerStride<>(3));     // = [1,4,7]
+Map<MatrixXf,0,OuterStride<3> >  m2(data,2,3);                    // both lines     |1,4,7|
+Map<MatrixXf,0,OuterStride<> >   m1(data,2,3,OuterStride<>(3));   // are equal to:  |2,5,8|
+\endcode</td>
+</tr>
+</table>
+
+
+<a href="#" class="top">top</a>
+\section QuickRef_ArithmeticOperators Arithmetic Operators
+
+<table class="manual">
+<tr><td>
+add \n subtract</td><td>\code
+mat3 = mat1 + mat2;           mat3 += mat1;
+mat3 = mat1 - mat2;           mat3 -= mat1;\endcode
+</td></tr>
+<tr class="alt"><td>
+scalar product</td><td>\code
+mat3 = mat1 * s1;             mat3 *= s1;           mat3 = s1 * mat1;
+mat3 = mat1 / s1;             mat3 /= s1;\endcode
+</td></tr>
+<tr><td>
+matrix/vector \n products \matrixworld</td><td>\code
+col2 = mat1 * col1;
+row2 = row1 * mat1;           row1 *= mat1;
+mat3 = mat1 * mat2;           mat3 *= mat1; \endcode
+</td></tr>
+<tr class="alt"><td>
+transposition \n adjoint \matrixworld</td><td>\code
+mat1 = mat2.transpose();      mat1.transposeInPlace();
+mat1 = mat2.adjoint();        mat1.adjointInPlace();
+\endcode
+</td></tr>
+<tr><td>
+\link MatrixBase::dot dot \endlink product \n inner product \matrixworld</td><td>\code
+scalar = vec1.dot(vec2);
+scalar = col1.adjoint() * col2;
+scalar = (col1.adjoint() * col2).value();\endcode
+</td></tr>
+<tr class="alt"><td>
+outer product \matrixworld</td><td>\code
+mat = col1 * col2.transpose();\endcode
+</td></tr>
+
+<tr><td>
+\link MatrixBase::norm() norm \endlink \n \link MatrixBase::normalized() normalization \endlink \matrixworld</td><td>\code
+scalar = vec1.norm();         scalar = vec1.squaredNorm()
+vec2 = vec1.normalized();     vec1.normalize(); // inplace \endcode
+</td></tr>
+
+<tr class="alt"><td>
+\link MatrixBase::cross() cross product \endlink \matrixworld</td><td>\code
+#include <Eigen/Geometry>
+vec3 = vec1.cross(vec2);\endcode</td></tr>
+</table>
+
+<a href="#" class="top">top</a>
+\section QuickRef_Coeffwise Coefficient-wise \& Array operators
+
+In addition to the aforementioned operators, Eigen supports numerous coefficient-wise operator and functions.
+Most of them unambiguously makes sense in array-world\arrayworld. The following operators are readily available for arrays,
+or available through .array() for vectors and matrices:
+
+<table class="manual">
+<tr><td>Arithmetic operators</td><td>\code
+array1 * array2     array1 / array2     array1 *= array2    array1 /= array2
+array1 + scalar     array1 - scalar     array1 += scalar    array1 -= scalar
+\endcode</td></tr>
+<tr><td>Comparisons</td><td>\code
+array1 < array2     array1 > array2     array1 < scalar     array1 > scalar
+array1 <= array2    array1 >= array2    array1 <= scalar    array1 >= scalar
+array1 == array2    array1 != array2    array1 == scalar    array1 != scalar
+array1.min(array2)  array1.max(array2)  array1.min(scalar)  array1.max(scalar)
+\endcode</td></tr>
+<tr><td>Trigo, power, and \n misc functions \n and the STL-like variants</td><td>\code
+array1.abs2()
+array1.abs()                  abs(array1)
+array1.sqrt()                 sqrt(array1)
+array1.log()                  log(array1)
+array1.log10()                log10(array1)
+array1.exp()                  exp(array1)
+array1.pow(array2)            pow(array1,array2)
+array1.pow(scalar)            pow(array1,scalar)
+                              pow(scalar,array2)
+array1.square()
+array1.cube()
+array1.inverse()
+
+array1.sin()                  sin(array1)
+array1.cos()                  cos(array1)
+array1.tan()                  tan(array1)
+array1.asin()                 asin(array1)
+array1.acos()                 acos(array1)
+array1.atan()                 atan(array1)
+array1.sinh()                 sinh(array1)
+array1.cosh()                 cosh(array1)
+array1.tanh()                 tanh(array1)
+array1.arg()                  arg(array1)
+
+array1.floor()                floor(array1)
+array1.ceil()                 ceil(array1)
+array1.round()                round(aray1)
+
+array1.isFinite()             isfinite(array1)
+array1.isInf()                isinf(array1)
+array1.isNaN()                isnan(array1)
+\endcode
+</td></tr>
+</table>
+
+
+The following coefficient-wise operators are available for all kind of expressions (matrices, vectors, and arrays), and for both real or complex scalar types:
+
+<table class="manual">
+<tr><th>Eigen's API</th><th>STL-like APIs\arrayworld </th><th>Comments</th></tr>
+<tr><td>\code
+mat1.real()
+mat1.imag()
+mat1.conjugate()
+\endcode
+</td><td>\code
+real(array1)
+imag(array1)
+conj(array1)
+\endcode
+</td><td>
+\code
+ // read-write, no-op for real expressions
+ // read-only for real, read-write for complexes
+ // no-op for real expressions
+\endcode
+</td></tr>
+</table>
+
+Some coefficient-wise operators are readily available for for matrices and vectors through the following cwise* methods:
+<table class="manual">
+<tr><th>Matrix API \matrixworld</th><th>Via Array conversions</th></tr>
+<tr><td>\code
+mat1.cwiseMin(mat2)         mat1.cwiseMin(scalar)
+mat1.cwiseMax(mat2)         mat1.cwiseMax(scalar)
+mat1.cwiseAbs2()
+mat1.cwiseAbs()
+mat1.cwiseSqrt()
+mat1.cwiseInverse()
+mat1.cwiseProduct(mat2)
+mat1.cwiseQuotient(mat2)
+mat1.cwiseEqual(mat2)       mat1.cwiseEqual(scalar)
+mat1.cwiseNotEqual(mat2)
+\endcode
+</td><td>\code
+mat1.array().min(mat2.array())    mat1.array().min(scalar)
+mat1.array().max(mat2.array())    mat1.array().max(scalar)
+mat1.array().abs2()
+mat1.array().abs()
+mat1.array().sqrt()
+mat1.array().inverse()
+mat1.array() * mat2.array()
+mat1.array() / mat2.array()
+mat1.array() == mat2.array()      mat1.array() == scalar
+mat1.array() != mat2.array()
+\endcode</td></tr>
+</table>
+The main difference between the two API is that the one based on cwise* methods returns an expression in the matrix world,
+while the second one (based on .array()) returns an array expression.
+Recall that .array() has no cost, it only changes the available API and interpretation of the data.
+
+It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with <a href="http://en.cppreference.com/w/cpp/utility/functional/ptr_fun">std::ptr_fun</a> (c++03, deprecated or removed in newer C++ versions), <a href="http://en.cppreference.com/w/cpp/utility/functional/ref">std::ref</a> (c++11), or <a href="http://en.cppreference.com/w/cpp/language/lambda">lambdas</a> (c++11):
+\code
+mat1.unaryExpr(std::ptr_fun(foo));
+mat1.unaryExpr(std::ref(foo));
+mat1.unaryExpr([](double x) { return foo(x); });
+\endcode
+
+Please note that it's not possible to pass a raw function pointer to \c unaryExpr, so please warp it as shown above.
+
+<a href="#" class="top">top</a>
+\section QuickRef_Reductions Reductions
+
+Eigen provides several reduction methods such as:
+\link DenseBase::minCoeff() minCoeff() \endlink, \link DenseBase::maxCoeff() maxCoeff() \endlink,
+\link DenseBase::sum() sum() \endlink, \link DenseBase::prod() prod() \endlink,
+\link MatrixBase::trace() trace() \endlink \matrixworld,
+\link MatrixBase::norm() norm() \endlink \matrixworld, \link MatrixBase::squaredNorm() squaredNorm() \endlink \matrixworld,
+\link DenseBase::all() all() \endlink, and \link DenseBase::any() any() \endlink.
+All reduction operations can be done matrix-wise,
+\link DenseBase::colwise() column-wise \endlink or
+\link DenseBase::rowwise() row-wise \endlink. Usage example:
+<table class="manual">
+<tr><td rowspan="3" style="border-right-style:dashed;vertical-align:middle">\code
+      5 3 1
+mat = 2 7 8
+      9 4 6 \endcode
+</td> <td>\code mat.minCoeff(); \endcode</td><td>\code 1 \endcode</td></tr>
+<tr class="alt"><td>\code mat.colwise().minCoeff(); \endcode</td><td>\code 2 3 1 \endcode</td></tr>
+<tr style="vertical-align:middle"><td>\code mat.rowwise().minCoeff(); \endcode</td><td>\code
+1
+2
+4
+\endcode</td></tr>
+</table>
+
+Special versions of \link DenseBase::minCoeff(IndexType*,IndexType*) const minCoeff \endlink and \link DenseBase::maxCoeff(IndexType*,IndexType*) const maxCoeff \endlink:
+\code
+int i, j;
+s = vector.minCoeff(&i);        // s == vector[i]
+s = matrix.maxCoeff(&i, &j);    // s == matrix(i,j)
+\endcode
+Typical use cases of all() and any():
+\code
+if((array1 > 0).all()) ...      // if all coefficients of array1 are greater than 0 ...
+if((array1 < array2).any()) ... // if there exist a pair i,j such that array1(i,j) < array2(i,j) ...
+\endcode
+
+
+<a href="#" class="top">top</a>\section QuickRef_Blocks Sub-matrices
+
+<div class="warningbox">
+<strong>PLEASE HELP US IMPROVING THIS SECTION.</strong>
+%Eigen 3.4 supports a much improved API for sub-matrices, including,
+slicing and indexing from arrays: \ref TutorialSlicingIndexing
+</div>
+
+Read-write access to a \link DenseBase::col(Index) column \endlink
+or a \link DenseBase::row(Index) row \endlink of a matrix (or array):
+\code
+mat1.row(i) = mat2.col(j);
+mat1.col(j1).swap(mat1.col(j2));
+\endcode
+
+Read-write access to sub-vectors:
+<table class="manual">
+<tr>
+<th>Default versions</th>
+<th>Optimized versions when the size \n is known at compile time</th></tr>
+<th></th>
+
+<tr><td>\code vec1.head(n)\endcode</td><td>\code vec1.head<n>()\endcode</td><td>the first \c n coeffs </td></tr>
+<tr><td>\code vec1.tail(n)\endcode</td><td>\code vec1.tail<n>()\endcode</td><td>the last \c n coeffs </td></tr>
+<tr><td>\code vec1.segment(pos,n)\endcode</td><td>\code vec1.segment<n>(pos)\endcode</td>
+    <td>the \c n coeffs in the \n range [\c pos : \c pos + \c n - 1]</td></tr>
+<tr class="alt"><td colspan="3">
+
+Read-write access to sub-matrices:</td></tr>
+<tr>
+  <td>\code mat1.block(i,j,rows,cols)\endcode
+      \link DenseBase::block(Index,Index,Index,Index) (more) \endlink</td>
+  <td>\code mat1.block<rows,cols>(i,j)\endcode
+      \link DenseBase::block(Index,Index) (more) \endlink</td>
+  <td>the \c rows x \c cols sub-matrix \n starting from position (\c i,\c j)</td></tr>
+<tr><td>\code
+ mat1.topLeftCorner(rows,cols)
+ mat1.topRightCorner(rows,cols)
+ mat1.bottomLeftCorner(rows,cols)
+ mat1.bottomRightCorner(rows,cols)\endcode
+ <td>\code
+ mat1.topLeftCorner<rows,cols>()
+ mat1.topRightCorner<rows,cols>()
+ mat1.bottomLeftCorner<rows,cols>()
+ mat1.bottomRightCorner<rows,cols>()\endcode
+ <td>the \c rows x \c cols sub-matrix \n taken in one of the four corners</td></tr>
+ <tr><td>\code
+ mat1.topRows(rows)
+ mat1.bottomRows(rows)
+ mat1.leftCols(cols)
+ mat1.rightCols(cols)\endcode
+ <td>\code
+ mat1.topRows<rows>()
+ mat1.bottomRows<rows>()
+ mat1.leftCols<cols>()
+ mat1.rightCols<cols>()\endcode
+ <td>specialized versions of block() \n when the block fit two corners</td></tr>
+</table>
+
+
+
+<a href="#" class="top">top</a>\section QuickRef_Misc Miscellaneous operations
+
+<div class="warningbox">
+<strong>PLEASE HELP US IMPROVING THIS SECTION.</strong>
+%Eigen 3.4 supports a new API for reshaping: \ref TutorialReshape
+</div>
+
+\subsection QuickRef_Reverse Reverse
+Vectors, rows, and/or columns of a matrix can be reversed (see DenseBase::reverse(), DenseBase::reverseInPlace(), VectorwiseOp::reverse()).
+\code
+vec.reverse()           mat.colwise().reverse()   mat.rowwise().reverse()
+vec.reverseInPlace()
+\endcode
+
+\subsection QuickRef_Replicate Replicate
+Vectors, matrices, rows, and/or columns can be replicated in any direction (see DenseBase::replicate(), VectorwiseOp::replicate())
+\code
+vec.replicate(times)                                          vec.replicate<Times>
+mat.replicate(vertical_times, horizontal_times)               mat.replicate<VerticalTimes, HorizontalTimes>()
+mat.colwise().replicate(vertical_times, horizontal_times)     mat.colwise().replicate<VerticalTimes, HorizontalTimes>()
+mat.rowwise().replicate(vertical_times, horizontal_times)     mat.rowwise().replicate<VerticalTimes, HorizontalTimes>()
+\endcode
+
+
+<a href="#" class="top">top</a>\section QuickRef_DiagTriSymm Diagonal, Triangular, and Self-adjoint matrices
+(matrix world \matrixworld)
+
+\subsection QuickRef_Diagonal Diagonal matrices
+
+<table class="example">
+<tr><th>Operation</th><th>Code</th></tr>
+<tr><td>
+view a vector \link MatrixBase::asDiagonal() as a diagonal matrix \endlink \n </td><td>\code
+mat1 = vec1.asDiagonal();\endcode
+</td></tr>
+<tr><td>
+Declare a diagonal matrix</td><td>\code
+DiagonalMatrix<Scalar,SizeAtCompileTime> diag1(size);
+diag1.diagonal() = vector;\endcode
+</td></tr>
+<tr><td>Access the \link MatrixBase::diagonal() diagonal \endlink and \link MatrixBase::diagonal(Index) super/sub diagonals \endlink of a matrix as a vector (read/write)</td>
+ <td>\code
+vec1 = mat1.diagonal();        mat1.diagonal() = vec1;      // main diagonal
+vec1 = mat1.diagonal(+n);      mat1.diagonal(+n) = vec1;    // n-th super diagonal
+vec1 = mat1.diagonal(-n);      mat1.diagonal(-n) = vec1;    // n-th sub diagonal
+vec1 = mat1.diagonal<1>();     mat1.diagonal<1>() = vec1;   // first super diagonal
+vec1 = mat1.diagonal<-2>();    mat1.diagonal<-2>() = vec1;  // second sub diagonal
+\endcode</td>
+</tr>
+
+<tr><td>Optimized products and inverse</td>
+ <td>\code
+mat3  = scalar * diag1 * mat1;
+mat3 += scalar * mat1 * vec1.asDiagonal();
+mat3 = vec1.asDiagonal().inverse() * mat1
+mat3 = mat1 * diag1.inverse()
+\endcode</td>
+</tr>
+
+</table>
+
+\subsection QuickRef_TriangularView Triangular views
+
+TriangularView gives a view on a triangular part of a dense matrix and allows to perform optimized operations on it. The opposite triangular part is never referenced and can be used to store other information.
+
+\note The .triangularView() template member function requires the \c template keyword if it is used on an
+object of a type that depends on a template parameter; see \ref TopicTemplateKeyword for details.
+
+<table class="example">
+<tr><th>Operation</th><th>Code</th></tr>
+<tr><td>
+Reference to a triangular with optional \n
+unit or null diagonal (read/write):
+</td><td>\code
+m.triangularView<Xxx>()
+\endcode \n
+\c Xxx = ::Upper, ::Lower, ::StrictlyUpper, ::StrictlyLower, ::UnitUpper, ::UnitLower
+</td></tr>
+<tr><td>
+Writing to a specific triangular part:\n (only the referenced triangular part is evaluated)
+</td><td>\code
+m1.triangularView<Eigen::Lower>() = m2 + m3 \endcode
+</td></tr>
+<tr><td>
+Conversion to a dense matrix setting the opposite triangular part to zero:
+</td><td>\code
+m2 = m1.triangularView<Eigen::UnitUpper>()\endcode
+</td></tr>
+<tr><td>
+Products:
+</td><td>\code
+m3 += s1 * m1.adjoint().triangularView<Eigen::UnitUpper>() * m2
+m3 -= s1 * m2.conjugate() * m1.adjoint().triangularView<Eigen::Lower>() \endcode
+</td></tr>
+<tr><td>
+Solving linear equations:\n
+\f$ M_2 := L_1^{-1} M_2 \f$ \n
+\f$ M_3 := {L_1^*}^{-1} M_3 \f$ \n
+\f$ M_4 := M_4 U_1^{-1} \f$
+</td><td>\n \code
+L1.triangularView<Eigen::UnitLower>().solveInPlace(M2)
+L1.triangularView<Eigen::Lower>().adjoint().solveInPlace(M3)
+U1.triangularView<Eigen::Upper>().solveInPlace<OnTheRight>(M4)\endcode
+</td></tr>
+</table>
+
+\subsection QuickRef_SelfadjointMatrix Symmetric/selfadjoint views
+
+Just as for triangular matrix, you can reference any triangular part of a square matrix to see it as a selfadjoint
+matrix and perform special and optimized operations. Again the opposite triangular part is never referenced and can be
+used to store other information.
+
+\note The .selfadjointView() template member function requires the \c template keyword if it is used on an
+object of a type that depends on a template parameter; see \ref TopicTemplateKeyword for details.
+
+<table class="example">
+<tr><th>Operation</th><th>Code</th></tr>
+<tr><td>
+Conversion to a dense matrix:
+</td><td>\code
+m2 = m.selfadjointView<Eigen::Lower>();\endcode
+</td></tr>
+<tr><td>
+Product with another general matrix or vector:
+</td><td>\code
+m3  = s1 * m1.conjugate().selfadjointView<Eigen::Upper>() * m3;
+m3 -= s1 * m3.adjoint() * m1.selfadjointView<Eigen::Lower>();\endcode
+</td></tr>
+<tr><td>
+Rank 1 and rank K update: \n
+\f$ upper(M_1) \mathrel{{+}{=}} s_1 M_2 M_2^* \f$ \n
+\f$ lower(M_1) \mathbin{{-}{=}} M_2^* M_2 \f$
+</td><td>\n \code
+M1.selfadjointView<Eigen::Upper>().rankUpdate(M2,s1);
+M1.selfadjointView<Eigen::Lower>().rankUpdate(M2.adjoint(),-1); \endcode
+</td></tr>
+<tr><td>
+Rank 2 update: (\f$ M \mathrel{{+}{=}} s u v^* + s v u^* \f$)
+</td><td>\code
+M.selfadjointView<Eigen::Upper>().rankUpdate(u,v,s);
+\endcode
+</td></tr>
+<tr><td>
+Solving linear equations:\n(\f$ M_2 := M_1^{-1} M_2 \f$)
+</td><td>\code
+// via a standard Cholesky factorization
+m2 = m1.selfadjointView<Eigen::Upper>().llt().solve(m2);
+// via a Cholesky factorization with pivoting
+m2 = m1.selfadjointView<Eigen::Lower>().ldlt().solve(m2);
+\endcode
+</td></tr>
+</table>
+
+*/
+
+/*
+<table class="tutorial_code">
+<tr><td>
+\link MatrixBase::asDiagonal() make a diagonal matrix \endlink \n from a vector </td><td>\code
+mat1 = vec1.asDiagonal();\endcode
+</td></tr>
+<tr><td>
+Declare a diagonal matrix</td><td>\code
+DiagonalMatrix<Scalar,SizeAtCompileTime> diag1(size);
+diag1.diagonal() = vector;\endcode
+</td></tr>
+<tr><td>Access \link MatrixBase::diagonal() the diagonal and super/sub diagonals of a matrix \endlink as a vector (read/write)</td>
+ <td>\code
+vec1 = mat1.diagonal();            mat1.diagonal() = vec1;      // main diagonal
+vec1 = mat1.diagonal(+n);          mat1.diagonal(+n) = vec1;    // n-th super diagonal
+vec1 = mat1.diagonal(-n);          mat1.diagonal(-n) = vec1;    // n-th sub diagonal
+vec1 = mat1.diagonal<1>();         mat1.diagonal<1>() = vec1;   // first super diagonal
+vec1 = mat1.diagonal<-2>();        mat1.diagonal<-2>() = vec1;  // second sub diagonal
+\endcode</td>
+</tr>
+
+<tr><td>View on a triangular part of a matrix (read/write)</td>
+ <td>\code
+mat2 = mat1.triangularView<Xxx>();
+// Xxx = Upper, Lower, StrictlyUpper, StrictlyLower, UnitUpper, UnitLower
+mat1.triangularView<Upper>() = mat2 + mat3; // only the upper part is evaluated and referenced
+\endcode</td></tr>
+
+<tr><td>View a triangular part as a symmetric/self-adjoint matrix (read/write)</td>
+ <td>\code
+mat2 = mat1.selfadjointView<Xxx>();     // Xxx = Upper or Lower
+mat1.selfadjointView<Upper>() = mat2 + mat2.adjoint();  // evaluated and write to the upper triangular part only
+\endcode</td></tr>
+
+</table>
+
+Optimized products:
+\code
+mat3 += scalar * vec1.asDiagonal() * mat1
+mat3 += scalar * mat1 * vec1.asDiagonal()
+mat3.noalias() += scalar * mat1.triangularView<Xxx>() * mat2
+mat3.noalias() += scalar * mat2 * mat1.triangularView<Xxx>()
+mat3.noalias() += scalar * mat1.selfadjointView<Upper or Lower>() * mat2
+mat3.noalias() += scalar * mat2 * mat1.selfadjointView<Upper or Lower>()
+mat1.selfadjointView<Upper or Lower>().rankUpdate(mat2);
+mat1.selfadjointView<Upper or Lower>().rankUpdate(mat2.adjoint(), scalar);
+\endcode
+
+Inverse products: (all are optimized)
+\code
+mat3 = vec1.asDiagonal().inverse() * mat1
+mat3 = mat1 * diag1.inverse()
+mat1.triangularView<Xxx>().solveInPlace(mat2)
+mat1.triangularView<Xxx>().solveInPlace<OnTheRight>(mat2)
+mat2 = mat1.selfadjointView<Upper or Lower>().llt().solve(mat2)
+\endcode
+
+*/
+}

diff --git a/doc/QuickStartGuide.dox b/doc/QuickStartGuide.dox
new file mode 100644
index 0000000..4192b28
--- /dev/null
+++ b/doc/QuickStartGuide.dox

@@ -0,0 +1,100 @@
+namespace Eigen {
+
+/** \page GettingStarted Getting started
+
+\eigenAutoToc
+
+This is a very short guide on how to get started with Eigen. It has a dual purpose. It serves as a minimal introduction to the Eigen library for people who want to start coding as soon as possible. You can also read this page as the first part of the Tutorial, which explains the library in more detail; in this case you will continue with \ref TutorialMatrixClass.
+
+\section GettingStartedInstallation How to "install" Eigen?
+
+In order to use Eigen, you just need to download and extract Eigen's source code (see <a href="http://eigen.tuxfamily.org/index.php?title=Main_Page#Download">the wiki</a> for download instructions). In fact, the header files in the \c Eigen subdirectory are the only files required to compile programs using Eigen. The header files are the same for all platforms. It is not necessary to use CMake or install anything.
+
+
+\section GettingStartedFirstProgram A simple first program
+
+Here is a rather simple program to get you started.
+
+\include QuickStart_example.cpp
+
+We will explain the program after telling you how to compile it.
+
+
+\section GettingStartedCompiling Compiling and running your first program
+
+There is no library to link to. The only thing that you need to keep in mind when compiling the above program is that the compiler must be able to find the Eigen header files. The directory in which you placed Eigen's source code must be in the include path. With GCC you use the -I option to achieve this, so you can compile the program with a command like this:
+
+\code g++ -I /path/to/eigen/ my_program.cpp -o my_program \endcode
+
+On Linux or Mac OS X, another option is to symlink or copy the Eigen folder into /usr/local/include/. This way, you can compile the program with:
+
+\code g++ my_program.cpp -o my_program \endcode
+
+When you run the program, it produces the following output:
+
+\include QuickStart_example.out
+
+
+\section GettingStartedExplanation Explanation of the first program
+
+The Eigen header files define many types, but for simple applications it may be enough to use only the \c MatrixXd type. This represents a matrix of arbitrary size (hence the \c X in \c MatrixXd), in which every entry is a \c double (hence the \c d in \c MatrixXd). See the \ref QuickRef_Types "quick reference guide" for an overview of the different types you can use to represent a matrix.
+
+The \c Eigen/Dense header file defines all member functions for the MatrixXd type and related types (see also the \ref QuickRef_Headers "table of header files"). All classes and functions defined in this header file (and other Eigen header files) are in the \c Eigen namespace. 
+
+The first line of the \c main function declares a variable of type \c MatrixXd and specifies that it is a matrix with 2 rows and 2 columns (the entries are not initialized). The statement <tt>m(0,0) = 3</tt> sets the entry in the top-left corner to 3. You need to use round parentheses to refer to entries in the matrix. As usual in computer science, the index of the first index is 0, as opposed to the convention in mathematics that the first index is 1.
+
+The following three statements sets the other three entries. The final line outputs the matrix \c m to the standard output stream.
+
+
+\section GettingStartedExample2 Example 2: Matrices and vectors
+
+Here is another example, which combines matrices with vectors. Concentrate on the left-hand program for now; we will talk about the right-hand program later.
+
+<table class="manual">
+<tr><th>Size set at run time:</th><th>Size set at compile time:</th></tr>
+<tr><td>
+\include QuickStart_example2_dynamic.cpp
+</td>
+<td>
+\include QuickStart_example2_fixed.cpp
+</td></tr></table>
+
+The output is as follows:
+
+\include QuickStart_example2_dynamic.out
+
+
+\section GettingStartedExplanation2 Explanation of the second example
+
+The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetic.
+
+The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left uninitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows:
+
+\f[
+v =
+\begin{bmatrix}
+  1 \\
+  2 \\
+  3
+\end{bmatrix}.
+\f]
+
+The final line of the program multiplies the matrix \c m with the vector \c v and outputs the result.
+
+Now look back at the second example program. We presented two versions of it. In the version in the left column, the matrix is of type \c MatrixXd which represents matrices of arbitrary size. The version in the right column is similar, except that the matrix is of type \c Matrix3d, which represents matrices of a fixed size (here 3-by-3). Because the type already encodes the size of the matrix, it is not necessary to specify the size in the constructor; compare <tt>MatrixXd m(3,3)</tt> with <tt>Matrix3d m</tt>. Similarly, we have \c VectorXd on the left (arbitrary size) versus \c Vector3d on the right (fixed size). Note that here the coefficients of vector \c v are directly set in the constructor, though the same syntax of the left example could be used too.
+
+The use of fixed-size matrices and vectors has two advantages. The compiler emits better (faster) code because it knows the size of the matrices and vectors. Specifying the size in the type also allows for more rigorous checking at compile-time. For instance, the compiler will complain if you try to multiply a \c Matrix4d (a 4-by-4 matrix) with a \c Vector3d (a vector of size 3). However, the use of many types increases compilation time and the size of the executable. The size of the matrix may also not be known at compile-time. A rule of thumb is to use fixed-size matrices for size 4-by-4 and smaller.
+
+
+\section GettingStartedConclusion Where to go from here?
+
+It's worth taking the time to read the  \ref TutorialMatrixClass "long tutorial".
+
+However if you think you don't need it, you can directly use the classes documentation and our \ref QuickRefPage.
+
+\li \b Next: \ref TutorialMatrixClass
+
+*/
+
+}
+

diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox
new file mode 100644
index 0000000..66d3bcd
--- /dev/null
+++ b/doc/SparseLinearSystems.dox

@@ -0,0 +1,225 @@
+namespace Eigen {
+/** \eigenManualPage TopicSparseSystems Solving Sparse Linear Systems
+In Eigen, there are several methods available to solve linear systems when the coefficient matrix is sparse. Because of the special representation of this class of matrices, special care should be taken in order to get a good performance. See \ref TutorialSparse for a detailed introduction about sparse matrices in Eigen. This page lists the sparse solvers available in Eigen. The main steps that are common to all these linear solvers are introduced as well. Depending on the properties of the matrix, the desired accuracy, the end-user is able to tune those steps in order to improve the performance of its code. Note that it is not required to know deeply what's hiding behind these steps: the last section presents a benchmark routine that can be easily used to get an insight on the performance of all the available solvers. 
+
+\eigenAutoToc
+
+\section TutorialSparseSolverList List of sparse solvers
+
+%Eigen currently provides a wide set of built-in solvers, as well as wrappers to external solver libraries.
+They are summarized in the following tables:
+
+\subsection TutorialSparseSolverList_Direct Built-in direct solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
+    <th class="width20em"><p>Notes</p></th></tr>
+
+<tr><td>SimplicialLLT \n <tt>\#include<Eigen/\link SparseCholesky_Module SparseCholesky\endlink></tt></td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
+    <td>SimplicialLDLT is often preferable</td></tr>
+
+<tr><td>SimplicialLDLT \n <tt>\#include<Eigen/\link SparseCholesky_Module SparseCholesky\endlink></tt></td><td>Direct LDLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
+    <td>Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)</td></tr>
+
+<tr><td>SparseLU \n <tt>\#include<Eigen/\link SparseLU_Module SparseLU\endlink></tt></td> <td>LU factorization </td>
+    <td>Square </td><td>Fill-in reducing, Leverage fast dense algebra</td>
+    <td>optimized for small and large problems with irregular patterns </td></tr>
+
+<tr><td>SparseQR \n <tt>\#include<Eigen/\link SparseQR_Module SparseQR\endlink></tt></td> <td> QR factorization</td>
+    <td>Any, rectangular</td><td> Fill-in reducing</td>
+    <td>recommended for least-square problems, has a basic rank-revealing feature</td></tr>
+ </table>
+
+\subsection TutorialSparseSolverList_Iterative Built-in iterative solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Solver kind</th><th>Matrix kind</th><th>Supported preconditioners, [default]</th>
+    <th class="width20em"><p>Notes</p></th></tr>
+
+<tr><td>ConjugateGradient \n <tt>\#include<Eigen/\link IterativeLinearSolvers_Module IterativeLinearSolvers\endlink></tt></td> <td>Classic iterative CG</td><td>SPD</td>
+    <td>IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholesky</td>
+    <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
+
+<tr><td>LeastSquaresConjugateGradient \n <tt>\#include<Eigen/\link IterativeLinearSolvers_Module IterativeLinearSolvers\endlink></tt></td><td>CG for rectangular least-square problem</td><td>Rectangular</td>
+    <td>IdentityPreconditioner, [LeastSquareDiagonalPreconditioner]</td>
+    <td>Solve for min |A'Ax-b|^2 without forming A'A</td></tr>
+
+<tr><td>BiCGSTAB \n <tt>\#include<Eigen/\link IterativeLinearSolvers_Module IterativeLinearSolvers\endlink></tt></td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td>
+    <td>IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUT</td>
+    <td>To speedup the convergence, try it with the \ref IncompleteLUT preconditioner.</td></tr>
+</table>
+
+\subsection TutorialSparseSolverList_Wrapper Wrappers to external solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Module</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
+    <th>Dependencies,License</th><th class="width20em"><p>Notes</p></th></tr>
+<tr><td>PastixLLT \n PastixLDLT \n PastixLU</td><td>\link PaStiXSupport_Module PaStiXSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
+    <td>Requires the <a href="http://pastix.gforge.inria.fr">PaStiX</a> package, \b CeCILL-C </td>
+    <td>optimized for tough problems and symmetric patterns</td></tr>
+<tr><td>CholmodSupernodalLLT</td><td>\link CholmodSupport_Module CholmodSupport \endlink</td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing, Leverage fast dense algebra</td>
+    <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
+    <td></td></tr>
+<tr><td>UmfPackLU</td><td>\link UmfPackSupport_Module UmfPackSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
+    <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
+    <td></td></tr>
+<tr><td>KLU</td><td>\link KLUSupport_Module KLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, suitted for circuit simulation</td>
+    <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
+    <td></td></tr>
+<tr><td>SuperLU</td><td>\link SuperLUSupport_Module SuperLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
+    <td>Requires the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library, (BSD-like)</td>
+    <td></td></tr>
+<tr><td>SPQR</td><td>\link SPQRSupport_Module SPQRSupport \endlink  </td> <td> QR factorization </td> 
+    <td> Any, rectangular</td><td>fill-in reducing, multithreaded, fast dense algebra</td>
+    <td> requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
+<tr><td>PardisoLLT \n PardisoLDLT \n PardisoLU</td><td>\link PardisoSupport_Module PardisoSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
+    <td>Requires the <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel MKL</a> package, \b Proprietary </td>
+    <td>optimized for tough problems patterns, see also \link TopicUsingIntelMKL using MKL with Eigen \endlink</td></tr>
+</table>
+
+Here \c SPD means symmetric positive definite.
+
+\section TutorialSparseSolverConcept Sparse solver concept
+
+All these solvers follow the same general concept.
+Here is a typical and general example:
+\code
+#include <Eigen/RequiredModuleName>
+// ...
+SparseMatrix<double> A;
+// fill A
+VectorXd b, x;
+// fill b
+// solve Ax = b
+SolverClassName<SparseMatrix<double> > solver;
+solver.compute(A);
+if(solver.info()!=Success) {
+  // decomposition failed
+  return;
+}
+x = solver.solve(b);
+if(solver.info()!=Success) {
+  // solving failed
+  return;
+}
+// solve for another right hand side:
+x1 = solver.solve(b1);
+\endcode
+
+For \c SPD solvers, a second optional template argument allows to specify which triangular part have to be used, e.g.:
+
+\code
+#include <Eigen/IterativeLinearSolvers>
+
+ConjugateGradient<SparseMatrix<double>, Eigen::Upper> solver;
+x = solver.compute(A).solve(b);
+\endcode
+In the above example, only the upper triangular part of the input matrix A is considered for solving. The opposite triangle might either be empty or contain arbitrary values.
+
+In the case where multiple problems with the same sparsity pattern have to be solved, then the "compute" step can be decomposed as follow:
+\code
+SolverClassName<SparseMatrix<double> > solver;
+solver.analyzePattern(A);   // for this step the numerical values of A are not used
+solver.factorize(A);
+x1 = solver.solve(b1);
+x2 = solver.solve(b2);
+...
+A = ...;                    // modify the values of the nonzeros of A, the nonzeros pattern must stay unchanged
+solver.factorize(A);
+x1 = solver.solve(b1);
+x2 = solver.solve(b2);
+...
+\endcode
+The compute() method is equivalent to calling both analyzePattern() and factorize().
+
+Each solver provides some specific features, such as determinant, access to the factors, controls of the iterations, and so on.
+More details are available in the documentations of the respective classes.
+
+Finally, most of the iterative solvers, can also be used in a \b matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+
+\section TheSparseCompute The Compute Step
+In the compute() function, the matrix is generally factorized: LLT for self-adjoint matrices, LDLT for general hermitian matrices, LU for non hermitian matrices and QR for rectangular matrices. These are the results of using direct solvers. For this class of solvers precisely, the compute step is further subdivided into analyzePattern() and factorize(). 
+
+The goal of analyzePattern() is to reorder the nonzero elements of the matrix, such that the factorization step creates less fill-in. This step exploits only the structure of the matrix. Hence, the results of this step can be used for other linear systems where the matrix has the same structure. Note however that sometimes, some external solvers (like SuperLU) require that the values of the matrix are set in this step, for instance to equilibrate the rows and columns of the matrix. In this situation, the results of this step should not be used with other matrices.
+
+Eigen provides a limited set of methods to reorder the matrix in this step, either built-in (COLAMD, AMD) or external (METIS). These methods are set in template parameter list of the solver :
+\code
+DirectSolverClassName<SparseMatrix<double>, OrderingMethod<IndexType> > solver;
+\endcode 
+
+See the \link OrderingMethods_Module OrderingMethods module \endlink for the list of available methods and the associated options. 
+
+In factorize(), the factors of the coefficient matrix are computed. This step should be called each time the values of the matrix change. However, the structural pattern of the matrix should not change between multiple calls. 
+
+For iterative solvers, the compute step is used to eventually setup a preconditioner. For instance, with the ILUT preconditioner, the incomplete factors L and U are computed in this step. Remember that, basically, the goal of the preconditioner is to speedup the convergence of an iterative method by solving a modified linear system where the coefficient matrix has more clustered eigenvalues. For real problems, an iterative solver should always be used with a preconditioner. In Eigen, a preconditioner is  selected by simply adding it as a template parameter to the iterative solver object. 
+\code
+IterativeSolverClassName<SparseMatrix<double>, PreconditionerName<SparseMatrix<double> > solver; 
+\endcode
+The member function preconditioner() returns a read-write reference to the preconditioner 
+ to directly interact with it. See the \link IterativeLinearSolvers_Module Iterative solvers module \endlink and the documentation of each class for the list of available methods.
+
+\section TheSparseSolve The Solve step
+The solve() function computes the solution of the linear systems with one or many right hand sides.
+\code
+X = solver.solve(B);
+\endcode 
+Here, B  can be a vector or a matrix where the columns form the different right hand sides. The solve() function can be called several times as well, for instance when all the right hand sides are not available at once. 
+\code
+x1 = solver.solve(b1);
+// Get the second right hand side b2
+x2 = solver.solve(b2); 
+//  ...
+\endcode
+For direct methods, the solution are computed at the machine precision. Sometimes, the solution need not be too accurate. In this case, the iterative methods are more suitable and the desired accuracy can be set before the solve step using \b setTolerance(). For all the available functions, please, refer to the documentation of the \link IterativeLinearSolvers_Module Iterative solvers module \endlink. 
+
+\section BenchmarkRoutine
+Most of the time, all you need is to know how much time it will take to solve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options. Basically, the matrices to test should be in <a href="http://math.nist.gov/MatrixMarket/formats.html">MatrixMarket Coordinate format</a>, and the routine returns the statistics from all available solvers in Eigen.
+
+To export your matrices and right-hand-side vectors in the matrix-market format, you can the the unsupported SparseExtra module:
+\code
+#include <unsupported/Eigen/SparseExtra>
+...
+Eigen::saveMarket(A, "filename.mtx");
+Eigen::saveMarket(A, "filename_SPD.mtx", Eigen::Symmetric); // if A is symmetric-positive-definite
+Eigen::saveMarketVector(B, "filename_b.mtx");
+\endcode
+
+The following table gives an example of XML statistics from several Eigen built-in and external solvers. 
+<TABLE border="1">
+ <TR><TH>Matrix <TH> N <TH> NNZ <TH>  <TH > UMFPACK <TH > SUPERLU <TH > PASTIX LU <TH >BiCGSTAB <TH > BiCGSTAB+ILUT <TH >GMRES+ILUT<TH > LDLT <TH> CHOLMOD LDLT <TH > PASTIX LDLT <TH > LLT <TH > CHOLMOD SP LLT <TH > CHOLMOD LLT <TH > PASTIX LLT <TH> CG</TR>
+<TR><TH rowspan="4">vector_graphics <TD rowspan="4"> 12855 <TD rowspan="4"> 72069 <TH>Compute Time <TD>0.0254549<TD>0.0215677<TD>0.0701827<TD>0.000153388<TD>0.0140107<TD>0.0153709<TD>0.0101601<TD style="background-color:red">0.00930502<TD>0.0649689
+<TR><TH>Solve Time <TD>0.00337835<TD>0.000951826<TD>0.00484373<TD>0.0374886<TD>0.0046445<TD>0.00847754<TD>0.000541813<TD style="background-color:red">0.000293696<TD>0.00485376
+<TR><TH>Total Time <TD>0.0288333<TD>0.0225195<TD>0.0750265<TD>0.037642<TD>0.0186552<TD>0.0238484<TD>0.0107019<TD style="background-color:red">0.00959871<TD>0.0698227
+<TR><TH>Error(Iter) <TD> 1.299e-16 <TD> 2.04207e-16 <TD> 4.83393e-15 <TD> 3.94856e-11 (80)  <TD> 1.03861e-12 (3)  <TD> 5.81088e-14 (6)  <TD> 1.97578e-16 <TD> 1.83927e-16 <TD> 4.24115e-15
+<TR><TH rowspan="4">poisson_SPD <TD rowspan="4"> 19788 <TD rowspan="4"> 308232 <TH>Compute Time <TD>0.425026<TD>1.82378<TD>0.617367<TD>0.000478921<TD>1.34001<TD>1.33471<TD>0.796419<TD>0.857573<TD>0.473007<TD>0.814826<TD style="background-color:red">0.184719<TD>0.861555<TD>0.470559<TD>0.000458188
+<TR><TH>Solve Time <TD>0.0280053<TD>0.0194402<TD>0.0268747<TD>0.249437<TD>0.0548444<TD>0.0926991<TD>0.00850204<TD>0.0053171<TD>0.0258932<TD>0.00874603<TD style="background-color:red">0.00578155<TD>0.00530361<TD>0.0248942<TD>0.239093
+<TR><TH>Total Time <TD>0.453031<TD>1.84322<TD>0.644241<TD>0.249916<TD>1.39486<TD>1.42741<TD>0.804921<TD>0.862891<TD>0.4989<TD>0.823572<TD style="background-color:red">0.190501<TD>0.866859<TD>0.495453<TD>0.239551
+<TR><TH>Error(Iter) <TD> 4.67146e-16 <TD> 1.068e-15 <TD> 1.3397e-15 <TD> 6.29233e-11 (201)  <TD> 3.68527e-11 (6)  <TD> 3.3168e-15 (16)  <TD> 1.86376e-15 <TD> 1.31518e-16 <TD> 1.42593e-15 <TD> 3.45361e-15 <TD> 3.14575e-16 <TD> 2.21723e-15 <TD> 7.21058e-16 <TD> 9.06435e-12 (261) 
+<TR><TH rowspan="4">sherman2 <TD rowspan="4"> 1080 <TD rowspan="4"> 23094 <TH>Compute Time <TD style="background-color:red">0.00631754<TD>0.015052<TD>0.0247514 <TD> -<TD>0.0214425<TD>0.0217988
+<TR><TH>Solve Time <TD style="background-color:red">0.000478424<TD>0.000337998<TD>0.0010291 <TD> -<TD>0.00243152<TD>0.00246152
+<TR><TH>Total Time <TD style="background-color:red">0.00679597<TD>0.01539<TD>0.0257805 <TD> -<TD>0.023874<TD>0.0242603
+<TR><TH>Error(Iter) <TD> 1.83099e-15 <TD> 8.19351e-15 <TD> 2.625e-14 <TD> 1.3678e+69 (1080)  <TD> 4.1911e-12 (7)  <TD> 5.0299e-13 (12) 
+<TR><TH rowspan="4">bcsstk01_SPD <TD rowspan="4"> 48 <TD rowspan="4"> 400 <TH>Compute Time <TD>0.000169079<TD>0.00010789<TD>0.000572538<TD>1.425e-06<TD>9.1612e-05<TD>8.3985e-05<TD style="background-color:red">5.6489e-05<TD>7.0913e-05<TD>0.000468251<TD>5.7389e-05<TD>8.0212e-05<TD>5.8394e-05<TD>0.000463017<TD>1.333e-06
+<TR><TH>Solve Time <TD>1.2288e-05<TD>1.1124e-05<TD>0.000286387<TD>8.5896e-05<TD>1.6381e-05<TD>1.6984e-05<TD style="background-color:red">3.095e-06<TD>4.115e-06<TD>0.000325438<TD>3.504e-06<TD>7.369e-06<TD>3.454e-06<TD>0.000294095<TD>6.0516e-05
+<TR><TH>Total Time <TD>0.000181367<TD>0.000119014<TD>0.000858925<TD>8.7321e-05<TD>0.000107993<TD>0.000100969<TD style="background-color:red">5.9584e-05<TD>7.5028e-05<TD>0.000793689<TD>6.0893e-05<TD>8.7581e-05<TD>6.1848e-05<TD>0.000757112<TD>6.1849e-05
+<TR><TH>Error(Iter) <TD> 1.03474e-16 <TD> 2.23046e-16 <TD> 2.01273e-16 <TD> 4.87455e-07 (48)  <TD> 1.03553e-16 (2)  <TD> 3.55965e-16 (2)  <TD> 2.48189e-16 <TD> 1.88808e-16 <TD> 1.97976e-16 <TD> 2.37248e-16 <TD> 1.82701e-16 <TD> 2.71474e-16 <TD> 2.11322e-16 <TD> 3.547e-09 (48) 
+<TR><TH rowspan="4">sherman1 <TD rowspan="4"> 1000 <TD rowspan="4"> 3750 <TH>Compute Time <TD>0.00228805<TD>0.00209231<TD>0.00528268<TD>9.846e-06<TD>0.00163522<TD>0.00162155<TD>0.000789259<TD style="background-color:red">0.000804495<TD>0.00438269
+<TR><TH>Solve Time <TD>0.000213788<TD>9.7983e-05<TD>0.000938831<TD>0.00629835<TD>0.000361764<TD>0.00078794<TD>4.3989e-05<TD style="background-color:red">2.5331e-05<TD>0.000917166
+<TR><TH>Total Time <TD>0.00250184<TD>0.00219029<TD>0.00622151<TD>0.0063082<TD>0.00199698<TD>0.00240949<TD>0.000833248<TD style="background-color:red">0.000829826<TD>0.00529986
+<TR><TH>Error(Iter) <TD> 1.16839e-16 <TD> 2.25968e-16 <TD> 2.59116e-16 <TD> 3.76779e-11 (248)  <TD> 4.13343e-11 (4)  <TD> 2.22347e-14 (10)  <TD> 2.05861e-16 <TD> 1.83555e-16 <TD> 1.02917e-15
+<TR><TH rowspan="4">young1c <TD rowspan="4"> 841 <TD rowspan="4"> 4089 <TH>Compute Time <TD>0.00235843<TD style="background-color:red">0.00217228<TD>0.00568075<TD>1.2735e-05<TD>0.00264866<TD>0.00258236
+<TR><TH>Solve Time <TD>0.000329599<TD style="background-color:red">0.000168634<TD>0.00080118<TD>0.0534738<TD>0.00187193<TD>0.00450211
+<TR><TH>Total Time <TD>0.00268803<TD style="background-color:red">0.00234091<TD>0.00648193<TD>0.0534865<TD>0.00452059<TD>0.00708447
+<TR><TH>Error(Iter) <TD> 1.27029e-16 <TD> 2.81321e-16 <TD> 5.0492e-15 <TD> 8.0507e-11 (706)  <TD> 3.00447e-12 (8)  <TD> 1.46532e-12 (16) 
+<TR><TH rowspan="4">mhd1280b <TD rowspan="4"> 1280 <TD rowspan="4"> 22778 <TH>Compute Time <TD>0.00234898<TD>0.00207079<TD>0.00570918<TD>2.5976e-05<TD>0.00302563<TD>0.00298036<TD>0.00144525<TD style="background-color:red">0.000919922<TD>0.00426444
+<TR><TH>Solve Time <TD>0.00103392<TD>0.000211911<TD>0.00105<TD>0.0110432<TD>0.000628287<TD>0.00392089<TD>0.000138303<TD style="background-color:red">6.2446e-05<TD>0.00097564
+<TR><TH>Total Time <TD>0.0033829<TD>0.0022827<TD>0.00675918<TD>0.0110692<TD>0.00365392<TD>0.00690124<TD>0.00158355<TD style="background-color:red">0.000982368<TD>0.00524008
+<TR><TH>Error(Iter) <TD> 1.32953e-16 <TD> 3.08646e-16 <TD> 6.734e-16 <TD> 8.83132e-11 (40)  <TD> 1.51153e-16 (1)  <TD> 6.08556e-16 (8)  <TD> 1.89264e-16 <TD> 1.97477e-16 <TD> 6.68126e-09
+<TR><TH rowspan="4">crashbasis <TD rowspan="4"> 160000 <TD rowspan="4"> 1750416 <TH>Compute Time <TD>3.2019<TD>5.7892<TD>15.7573<TD style="background-color:red">0.00383515<TD>3.1006<TD>3.09921
+<TR><TH>Solve Time <TD>0.261915<TD>0.106225<TD>0.402141<TD style="background-color:red">1.49089<TD>0.24888<TD>0.443673
+<TR><TH>Total Time <TD>3.46381<TD>5.89542<TD>16.1594<TD style="background-color:red">1.49473<TD>3.34948<TD>3.54288
+<TR><TH>Error(Iter) <TD> 1.76348e-16 <TD> 4.58395e-16 <TD> 1.67982e-14 <TD> 8.64144e-11 (61)  <TD> 8.5996e-12 (2)  <TD> 6.04042e-14 (5) 
+
+</TABLE>
+*/
+}

diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
new file mode 100644
index 0000000..9779f3f
--- /dev/null
+++ b/doc/SparseQuickReference.dox

@@ -0,0 +1,272 @@
+namespace Eigen {
+/** \eigenManualPage SparseQuickRefPage Quick reference guide for sparse matrices
+\eigenAutoToc
+
+<hr>
+
+In this page, we give a quick summary of the main operations available for sparse matrices in the class SparseMatrix. First, it is recommended to read  the introductory tutorial at \ref TutorialSparse. The important point to have in mind when working on sparse matrices is how they are stored : 
+i.e either row major or column major. The default is column major. Most arithmetic operations on sparse matrices will assert that they have the same storage order. 
+
+\section SparseMatrixInit Sparse Matrix Initialization
+<table class="manual">
+<tr><th> Category </th> <th> Operations</th> <th>Notes</th></tr>
+<tr><td>Constructor</td>
+<td>
+\code
+  SparseMatrix<double> sm1(1000,1000); 
+  SparseMatrix<std::complex<double>,RowMajor> sm2;
+\endcode
+</td> <td> Default is ColMajor</td> </tr>
+<tr class="alt">
+<td> Resize/Reserve</td>
+<td> 
+ \code
+    sm1.resize(m,n);      // Change sm1 to a m x n matrix.
+    sm1.reserve(nnz);     // Allocate room for nnz nonzeros elements.   
+  \endcode 
+</td>
+<td> Note that when calling reserve(), it is not required that nnz is the exact number of nonzero elements in the final matrix. However, an exact estimation will avoid multiple reallocations during the insertion phase. </td>
+</tr>
+<tr> 
+<td> Assignment </td>
+<td> 
+\code 
+  SparseMatrix<double,Colmajor> sm1;
+ // Initialize sm2 with sm1.
+  SparseMatrix<double,Rowmajor> sm2(sm1), sm3;        
+  // Assignment and evaluations modify the storage order.
+  sm3 = sm1; 
+ \endcode
+</td>
+<td> The copy constructor can be used to convert from a storage order to another</td>
+</tr>
+<tr class="alt">
+<td> Element-wise Insertion</td>
+<td>
+\code 
+// Insert a new element; 
+ sm1.insert(i, j) = v_ij;  
+
+// Update the value v_ij
+ sm1.coeffRef(i,j) = v_ij;
+ sm1.coeffRef(i,j) += v_ij;
+ sm1.coeffRef(i,j) -= v_ij;
+\endcode
+</td>
+<td> insert() assumes that the element does not already exist; otherwise, use coeffRef()</td>
+</tr>
+<tr> 
+<td> Batch insertion</td>
+<td>
+\code
+  std::vector< Eigen::Triplet<double> > tripletList;
+  tripletList.reserve(estimation_of_entries);
+  // -- Fill tripletList with nonzero elements...
+  sm1.setFromTriplets(TripletList.begin(), TripletList.end());
+\endcode
+</td>
+<td>A complete example is available at \link TutorialSparseFilling Triplet Insertion \endlink.</td>
+</tr>
+<tr class="alt"> 
+<td> Constant or Random Insertion</td>
+<td>
+\code
+sm1.setZero();
+\endcode
+</td>
+<td>Remove all non-zero coefficients</td>
+</tr>
+</table>
+
+
+\section SparseBasicInfos Matrix properties
+Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some information from the matrix. 
+<table class="manual">
+<tr>
+  <td> \code
+  sm1.rows();         // Number of rows
+  sm1.cols();         // Number of columns 
+  sm1.nonZeros();     // Number of non zero values   
+  sm1.outerSize();    // Number of columns (resp. rows) for a column major (resp. row major )
+  sm1.innerSize();    // Number of rows (resp. columns) for a row major (resp. column major)
+  sm1.norm();         // Euclidian norm of the matrix
+  sm1.squaredNorm();  // Squared norm of the matrix
+  sm1.blueNorm();
+  sm1.isVector();     // Check if sm1 is a sparse vector or a sparse matrix
+  sm1.isCompressed(); // Check if sm1 is in compressed form
+  ...
+  \endcode </td>
+</tr>
+</table>
+
+\section SparseBasicOps Arithmetic operations
+It is easy to perform arithmetic operations on sparse matrices provided that the dimensions are adequate and that the matrices have the same storage order. Note that the evaluation can always be done in a matrix with a different storage order. In the following, \b sm denotes a sparse matrix, \b dm a dense matrix and \b dv a dense vector.
+<table class="manual">
+<tr><th> Operations </th> <th> Code </th> <th> Notes </th></tr>
+
+<tr>
+  <td> add subtract </td> 
+  <td> \code
+  sm3 = sm1 + sm2; 
+  sm3 = sm1 - sm2;
+  sm2 += sm1; 
+  sm2 -= sm1; \endcode
+  </td>
+  <td> 
+  sm1 and sm2 should have the same storage order
+  </td> 
+</tr>
+
+<tr class="alt"><td>
+  scalar product</td><td>\code
+  sm3 = sm1 * s1;   sm3 *= s1; 
+  sm3 = s1 * sm1 + s2 * sm2; sm3 /= s1;\endcode
+  </td>
+  <td>
+    Many combinations are possible if the dimensions and the storage order agree.
+</tr>
+
+<tr>
+  <td> %Sparse %Product </td>
+  <td> \code
+  sm3 = sm1 * sm2;
+  dm2 = sm1 * dm1;
+  dv2 = sm1 * dv1;
+  \endcode </td>
+  <td>
+  </td>
+</tr> 
+
+<tr class='alt'>
+  <td> transposition, adjoint</td>
+  <td> \code
+  sm2 = sm1.transpose();
+  sm2 = sm1.adjoint();
+  \endcode </td>
+  <td>
+  Note that the transposition change the storage order. There is no support for transposeInPlace().
+  </td>
+</tr> 
+<tr>
+<td> Permutation </td>
+<td> 
+\code 
+perm.indices();      // Reference to the vector of indices
+sm1.twistedBy(perm); // Permute rows and columns
+sm2 = sm1 * perm;    // Permute the columns
+sm2 = perm * sm1;    // Permute the columns
+\endcode 
+</td>
+<td> 
+
+</td>
+</tr>
+<tr>
+  <td>
+  Component-wise ops
+  </td>
+  <td>\code 
+  sm1.cwiseProduct(sm2);
+  sm1.cwiseQuotient(sm2);
+  sm1.cwiseMin(sm2);
+  sm1.cwiseMax(sm2);
+  sm1.cwiseAbs();
+  sm1.cwiseSqrt();
+  \endcode</td>
+  <td>
+  sm1 and sm2 should have the same storage order
+  </td>
+</tr>
+</table>
+
+\section sparseotherops Other supported operations
+<table class="manual">
+<tr><th style="min-width:initial"> Code </th> <th> Notes</th> </tr>
+<tr><td colspan="2">Sub-matrices</td></tr>
+<tr>
+<td> 
+\code 
+  sm1.block(startRow, startCol, rows, cols); 
+  sm1.block(startRow, startCol); 
+  sm1.topLeftCorner(rows, cols); 
+  sm1.topRightCorner(rows, cols);
+  sm1.bottomLeftCorner( rows, cols);
+  sm1.bottomRightCorner( rows, cols);
+  \endcode
+</td><td>
+Contrary to dense matrices, here <strong>all these methods are read-only</strong>.\n
+See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices.
+</td>
+</tr>
+<tr class="alt"><td colspan="2"> Range </td></tr>
+<tr class="alt">
+<td> 
+\code 
+  sm1.innerVector(outer);           // RW
+  sm1.innerVectors(start, size);    // RW
+  sm1.leftCols(size);               // RW
+  sm2.rightCols(size);              // RO because sm2 is row-major
+  sm1.middleRows(start, numRows);   // RO because sm1 is column-major
+  sm1.middleCols(start, numCols);   // RW
+  sm1.col(j);                       // RW
+\endcode
+</td>
+<td>
+A inner vector is either a row (for row-major) or a column (for column-major).\n
+As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done in a matrix with different storage order.
+</td>
+</tr>
+<tr><td colspan="2"> Triangular and selfadjoint views</td></tr>
+<tr>
+<td> 
+\code
+  sm2 = sm1.triangularview<Lower>();
+  sm2 = sm1.selfadjointview<Lower>();
+\endcode
+</td>
+<td> Several combination between triangular views and blocks views are possible
+\code 
+  \endcode </td>
+</tr>
+<tr class="alt"><td colspan="2">Triangular solve </td></tr>
+<tr class="alt">
+<td> 
+\code 
+ dv2 = sm1.triangularView<Upper>().solve(dv1);
+ dv2 = sm1.topLeftCorner(size, size)
+          .triangularView<Lower>().solve(dv1);
+\endcode 
+</td>
+<td> For general sparse solve, Use any suitable module described at \ref TopicSparseSystems </td>
+</tr>
+<tr><td colspan="2"> Low-level API</td></tr>
+<tr>
+<td>
+\code
+sm1.valuePtr();      // Pointer to the values
+sm1.innerIndexPtr();  // Pointer to the indices.
+sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector
+\endcode
+</td>
+<td>
+If the matrix is not in compressed form, makeCompressed() should be called before.\n
+Note that these functions are mostly provided for interoperability purposes with external libraries.\n
+A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
+</tr>
+<tr class="alt"><td colspan="2">Mapping external buffers</td></tr>
+<tr class="alt">
+<td>
+\code
+int outerIndexPtr[cols+1];
+int innerIndices[nnz];
+double values[nnz];
+Map<SparseMatrix<double> > sm1(rows,cols,nnz,outerIndexPtr, // read-write
+                               innerIndices,values);
+Map<const SparseMatrix<double> > sm2(...);                  // read-only
+\endcode
+</td>
+<td>As for dense matrices, class Map<SparseMatrixType> can be used to see external buffers as an %Eigen's SparseMatrix object. </td>
+</tr>
+</table>
+*/
+}

diff --git a/doc/StlContainers.dox b/doc/StlContainers.dox
new file mode 100644
index 0000000..0342573
--- /dev/null
+++ b/doc/StlContainers.dox

@@ -0,0 +1,73 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicStlContainers Using STL Containers with Eigen
+
+\eigenAutoToc
+
+\section StlContainers_summary Executive summary
+
+If you're compiling in \cpp17 mode only with a sufficiently recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then everything is taken care by the compiler and you can stop reading.
+
+Otherwise, using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires the use of an over-aligned allocator.
+That is, an allocator capable of allocating buffers with 16, 32, or even 64 bytes alignment.
+%Eigen does provide one ready for use: aligned_allocator.
+
+Prior to \cpp11, if you want to use the `std::vector` container, then you also have to <code> \#include <Eigen/StdVector> </code>.
+
+These issues arise only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
+For other %Eigen types, such as Vector3f or MatrixXd, no special care is needed when using STL containers.
+
+\section allocator Using an aligned allocator
+
+STL containers take an optional template parameter, the allocator type. When using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you need tell the container to use an allocator that will always allocate memory at 16-byte-aligned (or more) locations. Fortunately, %Eigen does provide such an allocator: Eigen::aligned_allocator.
+
+For example, instead of
+\code
+std::map<int, Eigen::Vector4d>
+\endcode
+you need to use
+\code
+std::map<int, Eigen::Vector4d, std::less<int>, 
+         Eigen::aligned_allocator<std::pair<const int, Eigen::Vector4d> > >
+\endcode
+Note that the third parameter `std::less<int>` is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type.
+
+\section StlContainers_vector The case of std::vector
+
+This section is for c++98/03 users only. \cpp11 (or above) users can stop reading here.
+
+So in c++98/03, the situation with `std::vector` is more complicated because of a bug in the standard (explanation below).
+To workaround the issue, we had to specialize it for the Eigen::aligned_allocator type.
+In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>.
+
+Here is an example:
+\code
+#include<Eigen/StdVector>
+/* ... */
+std::vector<Eigen::Vector4f,Eigen::aligned_allocator<Eigen::Vector4f> >
+\endcode
+
+<span class="note">\b Explanation: The `resize()` method of `std::vector` takes a `value_type` argument (defaulting to `value_type()`). So with `std::vector<Eigen::Vector4d>`, some Eigen::Vector4d objects will be passed by value, which discards any alignment modifiers, so a Eigen::Vector4d can be created at an unaligned location.
+In order to avoid that, the only solution we saw was to specialize `std::vector` to make it work on a slight modification of, here, Eigen::Vector4d, that is able to deal properly with this situation.
+</span>
+
+\subsection vector_spec An alternative - specializing std::vector for Eigen types
+
+As an alternative to the recommended approach described above, you have the option to specialize std::vector for Eigen types requiring alignment. 
+The advantage is that you won't need to declare std::vector all over with Eigen::aligned_allocator. One drawback on the other hand side is that
+the specialization needs to be defined before all code pieces in which e.g. `std::vector<Vector2d>` is used. Otherwise, without knowing the specialization
+the compiler will compile that particular instance with the default `std::allocator` and you program is most likely to crash.
+
+Here is an example:
+\code
+#include<Eigen/StdVector>
+/* ... */
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix2d)
+std::vector<Eigen::Vector2d>
+\endcode
+
+
+
+*/
+
+}

diff --git a/doc/StorageOrders.dox b/doc/StorageOrders.dox
new file mode 100644
index 0000000..6164531
--- /dev/null
+++ b/doc/StorageOrders.dox

@@ -0,0 +1,86 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicStorageOrders Storage orders
+
+There are two different storage orders for matrices and two-dimensional arrays: column-major and row-major.
+This page explains these storage orders and how to specify which one should be used.
+
+\eigenAutoToc
+
+
+\section TopicStorageOrdersIntro Column-major and row-major storage
+
+The entries of a matrix form a two-dimensional grid. However, when the matrix is stored in memory, the entries
+have to somehow be laid out linearly. There are two main ways to do this, by row and by column.
+
+We say that a matrix is stored in \b row-major order if it is stored row by row. The entire first row is
+stored first, followed by the entire second row, and so on. Consider for example the matrix
+
+\f[
+A = \begin{bmatrix}
+8 & 2 & 2 & 9 \\
+9 & 1 & 4 & 4 \\
+3 & 5 & 4 & 5
+\end{bmatrix}.
+\f]
+
+If this matrix is stored in row-major order, then the entries are laid out in memory as follows:
+
+\code 8 2 2 9 9 1 4 4 3 5 4 5 \endcode
+
+On the other hand, a matrix is stored in \b column-major order if it is stored column by column, starting with
+the entire first column, followed by the entire second column, and so on. If the above matrix is stored in
+column-major order, it is laid out as follows:
+
+\code 8 9 3 2 1 5 2 4 4 9 4 5 \endcode
+
+This example is illustrated by the following Eigen code. It uses the PlainObjectBase::data() function, which
+returns a pointer to the memory location of the first entry of the matrix.
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicStorageOrders_example.cpp
+</td>
+<td>
+\verbinclude TopicStorageOrders_example.out
+</td></tr></table>
+
+
+\section TopicStorageOrdersInEigen Storage orders in Eigen
+
+The storage order of a matrix or a two-dimensional array can be set by specifying the \c Options template
+parameter for Matrix or Array. As \ref TutorialMatrixClass explains, the %Matrix class template has six
+template parameters, of which three are compulsory (\c Scalar, \c RowsAtCompileTime and \c ColsAtCompileTime)
+and three are optional (\c Options, \c MaxRowsAtCompileTime and \c MaxColsAtCompileTime). If the \c Options
+parameter is set to \c RowMajor, then the matrix or array is stored in row-major order; if it is set to 
+\c ColMajor, then it is stored in column-major order. This mechanism is used in the above Eigen program to
+specify the storage order.
+
+If the storage order is not specified, then Eigen defaults to storing the entry in column-major. This is also
+the case if one of the convenience typedefs (\c Matrix3f, \c ArrayXXd, etc.) is used.
+
+Matrices and arrays using one storage order can be assigned to matrices and arrays using the other storage
+order, as happens in the above program when \c Arowmajor is initialized using \c Acolmajor. Eigen will reorder
+the entries automatically. More generally, row-major and column-major matrices can be mixed in an expression
+as we want.
+
+
+\section TopicStorageOrdersWhich Which storage order to choose?
+
+So, which storage order should you use in your program? There is no simple answer to this question; it depends
+on your application. Here are some points to keep in mind:
+
+  - Your users may expect you to use a specific storage order. Alternatively, you may use other libraries than
+    Eigen, and these other libraries may expect a certain storage order. In these cases it may be easiest and
+    fastest to use this storage order in your whole program.
+  - Algorithms that traverse a matrix row by row will go faster when the matrix is stored in row-major order
+    because of better data locality. Similarly, column-by-column traversal is faster for column-major
+    matrices. It may be worthwhile to experiment a bit to find out what is faster for your particular
+    application.
+  - The default in Eigen is column-major. Naturally, most of the development and testing of the Eigen library
+    is thus done with column-major matrices. This means that, even though we aim to support column-major and
+    row-major storage orders transparently, the Eigen library may well work best with column-major matrices.
+
+*/
+}

diff --git a/doc/StructHavingEigenMembers.dox b/doc/StructHavingEigenMembers.dox
new file mode 100644
index 0000000..87016cd
--- /dev/null
+++ b/doc/StructHavingEigenMembers.dox

@@ -0,0 +1,203 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicStructHavingEigenMembers Structures Having Eigen Members
+
+\eigenAutoToc
+
+\section StructHavingEigenMembers_summary Executive Summary
+
+
+If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must ensure that calling operator new on it allocates properly aligned buffers.
+If you're compiling in \cpp17 mode only with a sufficiently recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then everything is taken care by the compiler and you can stop reading.
+
+Otherwise, you have to overload its `operator new` so that it generates properly aligned pointers (e.g., 32-bytes-aligned for Vector4d and AVX).
+Fortunately, %Eigen provides you with a macro `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` that does that for you.
+
+\section StructHavingEigenMembers_what What kind of code needs to be changed?
+
+The kind of code that needs to be changed is this:
+
+\code
+class Foo
+{
+  ...
+  Eigen::Vector2d v;
+  ...
+};
+
+...
+
+Foo *foo = new Foo;
+\endcode
+
+In other words: you have a class that has as a member a \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen object", and then you dynamically create an object of that class.
+
+\section StructHavingEigenMembers_how How should such code be modified?
+
+Very easy, you just need to put a `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` macro in a public part of your class, like this:
+
+\code
+class Foo
+{
+  ...
+  Eigen::Vector4d v;
+  ...
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+};
+
+...
+
+Foo *foo = new Foo;
+\endcode
+
+This macro makes `new Foo` always return an aligned pointer.
+
+In \cpp17, this macro is empty.
+
+If this approach is too intrusive, see also the \ref StructHavingEigenMembers_othersolutions "other solutions".
+
+\section StructHavingEigenMembers_why Why is this needed?
+
+OK let's say that your code looks like this:
+
+\code
+class Foo
+{
+  ...
+  Eigen::Vector4d v;
+  ...
+};
+
+...
+
+Foo *foo = new Foo;
+\endcode
+
+A Eigen::Vector4d consists of 4 doubles, which is 256 bits.
+This is exactly the size of an AVX register, which makes it possible to use AVX for all sorts of operations on this vector.
+But AVX instructions (at least the ones that %Eigen uses, which are the fast ones) require 256-bit alignment.
+Otherwise you get a segmentation fault.
+
+For this reason, %Eigen takes care by itself to require 256-bit alignment for Eigen::Vector4d, by doing two things:
+\li %Eigen requires 256-bit alignment for the Eigen::Vector4d's array (of 4 doubles). With \cpp11 this is done with the <a href="https://en.cppreference.com/w/cpp/keyword/alignas">alignas</a> keyword, or compiler's extensions for c++98/03.
+\li %Eigen overloads the `operator new` of Eigen::Vector4d so it will always return 256-bit aligned pointers. (removed in \cpp17)
+
+Thus, normally, you don't have to worry about anything, %Eigen handles alignment of operator new for you...
+
+... except in one case. When you have a `class Foo` like above, and you dynamically allocate a new `Foo` as above, then, since `Foo` doesn't have aligned `operator new`, the returned pointer foo is not necessarily 256-bit aligned.
+
+The alignment attribute of the member `v` is then relative to the start of the class `Foo`. If the `foo` pointer wasn't aligned, then `foo->v` won't be aligned either!
+
+The solution is to let `class Foo` have an aligned `operator new`, as we showed in the previous section.
+
+This explanation also holds for SSE/NEON/MSA/Altivec/VSX targets, which require 16-bytes alignment, and AVX512 which requires 64-bytes alignment for fixed-size objects multiple of 64 bytes (e.g., Eigen::Matrix4d).
+
+\section StructHavingEigenMembers_movetotop Should I then put all the members of Eigen types at the beginning of my class?
+
+That's not required. Since %Eigen takes care of declaring adequate alignment, all members that need it are automatically aligned relatively to the class. So code like this works fine:
+
+\code
+class Foo
+{
+  double x;
+  Eigen::Vector4d v;
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+};
+\endcode
+
+That said, as usual, it is recommended to sort the members so that alignment does not waste memory.
+In the above example, with AVX, the compiler will have to reserve 24 empty bytes between `x` and `v`.
+
+
+\section StructHavingEigenMembers_dynamicsize What about dynamic-size matrices and vectors?
+
+Dynamic-size matrices and vectors, such as Eigen::VectorXd, allocate dynamically their own array of coefficients, so they take care of requiring absolute alignment automatically. So they don't cause this issue. The issue discussed here is only with \ref TopicFixedSizeVectorizable  "fixed-size vectorizable matrices and vectors".
+
+
+\section StructHavingEigenMembers_bugineigen So is this a bug in Eigen?
+
+No, it's not our bug. It's more like an inherent problem of the c++ language specification that has been solved in c++17 through the feature known as <a href="http://wg21.link/p0035r4">dynamic memory allocation for over-aligned data</a>.
+
+
+\section StructHavingEigenMembers_conditional What if I want to do this conditionally (depending on template parameters) ?
+
+For this situation, we offer the macro `EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)`.
+It will generate aligned operators like `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` if `NeedsToAlign` is true.
+It will generate operators with the default alignment if `NeedsToAlign` is false.
+In \cpp17, this macro is empty.
+
+Example:
+
+\code
+template<int n> class Foo
+{
+  typedef Eigen::Matrix<float,n,1> Vector;
+  enum { NeedsToAlign = (sizeof(Vector)%16)==0 };
+  ...
+  Vector v;
+  ...
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+};
+
+...
+
+Foo<4> *foo4 = new Foo<4>; // foo4 is guaranteed to be 128bit-aligned
+Foo<3> *foo3 = new Foo<3>; // foo3 has only the system default alignment guarantee
+\endcode
+
+
+\section StructHavingEigenMembers_othersolutions Other solutions
+
+In case putting the `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` macro everywhere is too intrusive, there exists at least two other solutions.
+
+\subsection othersolutions1 Disabling alignment
+
+The first is to disable alignment requirement for the fixed size members:
+\code
+class Foo
+{
+  ...
+  Eigen::Matrix<double,4,1,Eigen::DontAlign> v;
+  ...
+};
+\endcode
+This `v` is fully compatible with aligned Eigen::Vector4d.
+This has only for effect to make load/stores to `v` more expensive (usually slightly, but that's hardware dependent).
+
+
+\subsection othersolutions2 Private structure
+
+The second consist in storing the fixed-size objects into a private struct which will be dynamically allocated at the construction time of the main object:
+
+\code
+struct Foo_d
+{
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  Vector4d v;
+  ...
+};
+
+
+struct Foo {
+  Foo() { init_d(); }
+  ~Foo() { delete d; }
+  void bar()
+  {
+    // use d->v instead of v
+    ...
+  }
+private:
+  void init_d() { d = new Foo_d; }
+  Foo_d* d;
+};
+\endcode
+
+The clear advantage here is that the class `Foo` remains unchanged regarding alignment issues.
+The drawback is that an additional heap allocation will be required whatsoever.
+
+*/
+
+}

diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox
new file mode 100644
index 0000000..fbf2c70
--- /dev/null
+++ b/doc/TemplateKeyword.dox

@@ -0,0 +1,133 @@
+namespace Eigen {
+
+/** \page TopicTemplateKeyword The template and typename keywords in C++
+
+There are two uses for the \c template and \c typename keywords in C++. One of them is fairly well known
+amongst programmers: to define templates. The other use is more obscure: to specify that an expression refers
+to a template function or a type. This regularly trips up programmers that use the %Eigen library, often
+leading to error messages from the compiler that are difficult to understand, such as "expected expression" or
+"no match for operator<".
+
+\eigenAutoToc
+
+
+\section TopicTemplateKeywordToDefineTemplates Using the template and typename keywords to define templates
+
+The \c template and \c typename keywords are routinely used to define templates. This is not the topic of this
+page as we assume that the reader is aware of this (otherwise consult a C++ book). The following example
+should illustrate this use of the \c template keyword.
+
+\code
+template <typename T>
+bool isPositive(T x)
+{
+    return x > 0;
+}
+\endcode
+
+We could just as well have written <tt>template &lt;class T&gt;</tt>; the keywords \c typename and \c class have the
+same meaning in this context.
+
+
+\section TopicTemplateKeywordExample An example showing the second use of the template keyword
+
+Let us illustrate the second use of the \c template keyword with an example. Suppose we want to write a
+function which copies all entries in the upper triangular part of a matrix into another matrix, while keeping
+the lower triangular part unchanged. A straightforward implementation would be as follows:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include TemplateKeyword_simple.cpp
+</td>
+<td>
+\verbinclude TemplateKeyword_simple.out
+</td></tr></table>
+
+That works fine, but it is not very flexible. First, it only works with dynamic-size matrices of
+single-precision floats; the function \c copyUpperTriangularPart() does not accept static-size matrices or
+matrices with double-precision numbers. Second, if you use an expression such as
+<tt>mat.topLeftCorner(3,3)</tt> as the parameter \c src, then this is copied into a temporary variable of type
+MatrixXf; this copy can be avoided.
+
+As explained in \ref TopicFunctionTakingEigenTypes, both issues can be resolved by making 
+\c copyUpperTriangularPart() accept any object of type MatrixBase. This leads to the following code:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include TemplateKeyword_flexible.cpp
+</td>
+<td>
+\verbinclude TemplateKeyword_flexible.out
+</td></tr></table>
+
+The one line in the body of the function \c copyUpperTriangularPart() shows the second, more obscure use of
+the \c template keyword in C++.  Even though it may look strange, the \c template keywords are necessary
+according to the standard. Without it, the compiler may reject the code with an error message like "no match
+for operator<".
+
+
+\section TopicTemplateKeywordExplanation Explanation
+
+The reason that the \c template keyword is necessary in the last example has to do with the rules for how
+templates are supposed to be compiled in C++. The compiler has to check the code for correct syntax at the
+point where the template is defined, without knowing the actual value of the template arguments (\c Derived1
+and \c Derived2 in the example). That means that the compiler cannot know that <tt>dst.triangularView</tt> is
+a member template and that the following &lt; symbol is part of the delimiter for the template
+parameter. Another possibility would be that <tt>dst.triangularView</tt> is a member variable with the &lt;
+symbol referring to the <tt>operator&lt;()</tt> function. In fact, the compiler should choose the second
+possibility, according to the standard. If <tt>dst.triangularView</tt> is a member template (as in our case),
+the programmer should specify this explicitly with the \c template keyword and write <tt>dst.template
+triangularView</tt>.
+
+The precise rules are rather complicated, but ignoring some subtleties we can summarize them as follows:
+- A <em>dependent name</em> is name that depends (directly or indirectly) on a template parameter. In the
+  example, \c dst is a dependent name because it is of type <tt>MatrixBase&lt;Derived1&gt;</tt> which depends
+  on the template parameter \c Derived1.
+- If the code contains either one of the constructs <tt>xxx.yyy</tt> or <tt>xxx-&gt;yyy</tt> and \c xxx is a
+  dependent name and \c yyy refers to a member template, then the \c template keyword must be used before 
+  \c yyy, leading to <tt>xxx.template yyy</tt> or <tt>xxx-&gt;template yyy</tt>.
+- If the code contains the construct <tt>xxx::yyy</tt> and \c xxx is a dependent name and \c yyy refers to a
+  member typedef, then the \c typename keyword must be used before the whole construct, leading to
+  <tt>typename xxx::yyy</tt>.
+
+As an example where the \c typename keyword is required, consider the following code in \ref TutorialSparse
+for iterating over the non-zero entries of a sparse matrix type:
+
+\code
+SparseMatrixType mat(rows,cols);
+for (int k=0; k<mat.outerSize(); ++k)
+  for (SparseMatrixType::InnerIterator it(mat,k); it; ++it)
+  {
+    /* ... */
+  }
+\endcode
+
+If \c SparseMatrixType depends on a template parameter, then the \c typename keyword is required:
+
+\code
+template <typename T>
+void iterateOverSparseMatrix(const SparseMatrix<T>& mat;
+{
+  for (int k=0; k<m1.outerSize(); ++k)
+    for (typename SparseMatrix<T>::InnerIterator it(mat,k); it; ++it)
+    {
+      /* ... */
+    }
+}
+\endcode
+
+
+\section TopicTemplateKeywordResources Resources for further reading
+
+For more information and a fuller explanation of this topic, the reader may consult the following sources:
+- The book "C++ Template Metaprogramming" by David Abrahams and Aleksey Gurtovoy contains a very good
+  explanation in Appendix B ("The typename and template Keywords") which formed the basis for this page.
+- http://pages.cs.wisc.edu/~driscoll/typename.html
+- http://www.parashift.com/c++-faq-lite/templates.html#faq-35.18
+- http://www.comeaucomputing.com/techtalk/templates/#templateprefix
+- http://www.comeaucomputing.com/techtalk/templates/#typename
+
+*/
+}

diff --git a/doc/TopicAliasing.dox b/doc/TopicAliasing.dox
new file mode 100644
index 0000000..a8f1644
--- /dev/null
+++ b/doc/TopicAliasing.dox

@@ -0,0 +1,237 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicAliasing Aliasing
+
+In %Eigen, aliasing refers to assignment statement in which the same matrix (or array or vector) appears on the
+left and on the right of the assignment operators. Statements like <tt>mat = 2 * mat;</tt> or <tt>mat =
+mat.transpose();</tt> exhibit aliasing. The aliasing in the first example is harmless, but the aliasing in the
+second example leads to unexpected results. This page explains what aliasing is, when it is harmful, and what
+to do about it.
+
+\eigenAutoToc
+
+
+\section TopicAliasingExamples Examples
+
+Here is a simple example exhibiting aliasing:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_block.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_block.out
+</td></tr></table>
+
+The output is not what one would expect. The problem is the assignment
+\code
+mat.bottomRightCorner(2,2) = mat.topLeftCorner(2,2);
+\endcode
+This assignment exhibits aliasing: the coefficient \c mat(1,1) appears both in the block
+<tt>mat.bottomRightCorner(2,2)</tt> on the left-hand side of the assignment and the block
+<tt>mat.topLeftCorner(2,2)</tt> on the right-hand side. After the assignment, the (2,2) entry in the bottom
+right corner should have the value of \c mat(1,1) before the assignment, which is 5. However, the output shows
+that \c mat(2,2) is actually 1. The problem is that %Eigen uses lazy evaluation (see 
+\ref TopicEigenExpressionTemplates) for <tt>mat.topLeftCorner(2,2)</tt>. The result is similar to
+\code
+mat(1,1) = mat(0,0);
+mat(1,2) = mat(0,1);
+mat(2,1) = mat(1,0);
+mat(2,2) = mat(1,1);
+\endcode
+Thus, \c mat(2,2) is assigned the \e new value of \c mat(1,1) instead of the old value. The next section
+explains how to solve this problem by calling \link DenseBase::eval() eval()\endlink.
+
+Aliasing occurs more naturally when trying to shrink a matrix. For example, the expressions <tt>vec =
+vec.head(n)</tt> and <tt>mat = mat.block(i,j,r,c)</tt> exhibit aliasing.
+
+In general, aliasing cannot be detected at compile time: if \c mat in the first example were a bit bigger,
+then the blocks would not overlap, and there would be no aliasing problem. However, %Eigen does detect some
+instances of aliasing, albeit at run time.  The following example exhibiting aliasing was mentioned in \ref
+TutorialMatrixArithmetic :
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include tut_arithmetic_transpose_aliasing.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_transpose_aliasing.out
+</td></tr></table>
+
+Again, the output shows the aliasing issue. However, by default %Eigen uses a run-time assertion to detect this
+and exits with a message like
+
+\verbatim
+void Eigen::DenseBase<Derived>::checkTransposeAliasing(const OtherDerived&) const 
+[with OtherDerived = Eigen::Transpose<Eigen::Matrix<int, 2, 2, 0, 2, 2> >, Derived = Eigen::Matrix<int, 2, 2, 0, 2, 2>]: 
+Assertion `(!internal::check_transpose_aliasing_selector<Scalar,internal::blas_traits<Derived>::IsTransposed,OtherDerived>::run(internal::extract_data(derived()), other)) 
+&& "aliasing detected during transposition, use transposeInPlace() or evaluate the rhs into a temporary using .eval()"' failed.
+\endverbatim
+
+The user can turn %Eigen's run-time assertions like the one to detect this aliasing problem off by defining the
+EIGEN_NO_DEBUG macro, and the above program was compiled with this macro turned off in order to illustrate the
+aliasing problem. See \ref TopicAssertions for more information about %Eigen's run-time assertions.
+
+
+\section TopicAliasingSolution Resolving aliasing issues
+
+If you understand the cause of the aliasing issue, then it is obvious what must happen to solve it: %Eigen has
+to evaluate the right-hand side fully into a temporary matrix/array and then assign it to the left-hand
+side. The function \link DenseBase::eval() eval() \endlink does precisely that.
+
+For example, here is the corrected version of the first example above:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_block_correct.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_block_correct.out
+</td></tr></table>
+
+Now, \c mat(2,2) equals 5 after the assignment, as it should be.
+
+The same solution also works for the second example, with the transpose: simply replace the line 
+<tt>a = a.transpose();</tt> with <tt>a = a.transpose().eval();</tt>. However, in this common case there is a
+better solution. %Eigen provides the special-purpose function 
+\link DenseBase::transposeInPlace() transposeInPlace() \endlink which replaces a matrix by its transpose. 
+This is shown below:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include tut_arithmetic_transpose_inplace.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_transpose_inplace.out
+</td></tr></table>
+
+If an xxxInPlace() function is available, then it is best to use it, because it indicates more clearly what you
+are doing. This may also allow %Eigen to optimize more aggressively. These are some of the xxxInPlace()
+functions provided: 
+
+<table class="manual">
+<tr><th>Original function</th><th>In-place function</th></tr>
+<tr> <td> MatrixBase::adjoint() </td> <td> MatrixBase::adjointInPlace() </td> </tr>
+<tr class="alt"> <td> DenseBase::reverse() </td> <td> DenseBase::reverseInPlace() </td> </tr>
+<tr> <td> LDLT::solve() </td> <td> LDLT::solveInPlace() </td> </tr>
+<tr class="alt"> <td> LLT::solve() </td> <td> LLT::solveInPlace() </td> </tr>
+<tr> <td> TriangularView::solve() </td> <td> TriangularView::solveInPlace() </td> </tr>
+<tr class="alt"> <td> DenseBase::transpose() </td> <td> DenseBase::transposeInPlace() </td> </tr>
+</table>
+
+In the special case where a matrix or vector is shrunk using an expression like <tt>vec = vec.head(n)</tt>,
+you can use \link PlainObjectBase::conservativeResize() conservativeResize() \endlink.
+
+
+\section TopicAliasingCwise Aliasing and component-wise operations
+
+As explained above, it may be dangerous if the same matrix or array occurs on both the left-hand side and the
+right-hand side of an assignment operator, and it is then often necessary to evaluate the right-hand side
+explicitly. However, applying component-wise operations (such as matrix addition, scalar multiplication and
+array multiplication) is safe. 
+
+The following example has only component-wise operations. Thus, there is no need for \link DenseBase::eval()
+eval() \endlink even though the same matrix appears on both sides of the assignments.
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_cwise.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_cwise.out
+</td></tr></table>
+
+In general, an assignment is safe if the (i,j) entry of the expression on the right-hand side depends only on
+the (i,j) entry of the matrix or array on the left-hand side and not on any other entries. In that case it is
+not necessary to evaluate the right-hand side explicitly.
+
+
+\section TopicAliasingMatrixMult Aliasing and matrix multiplication
+
+Matrix multiplication is the only operation in %Eigen that assumes aliasing by default, <strong>under the
+condition that the destination matrix is not resized</strong>.
+Thus, if \c matA is a \b squared matrix, then the statement <tt>matA = matA * matA;</tt> is safe.
+All other operations in %Eigen assume that there are no aliasing problems,
+either because the result is assigned to a different matrix or because it is a component-wise operation.
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult1.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult1.out
+</td></tr></table>
+
+However, this comes at a price. When executing the expression <tt>matA = matA * matA</tt>, %Eigen evaluates the
+product in a temporary matrix which is assigned to \c matA after the computation. This is fine. But %Eigen does
+the same when the product is assigned to a different matrix (e.g., <tt>matB = matA * matA</tt>). In that case,
+it is more efficient to evaluate the product directly into \c matB instead of evaluating it first into a
+temporary matrix and copying that matrix to \c matB.
+
+The user can indicate with the \link MatrixBase::noalias() noalias()\endlink function that there is no
+aliasing, as follows: <tt>matB.noalias() = matA * matA</tt>. This allows %Eigen to evaluate the matrix product
+<tt>matA * matA</tt> directly into \c matB.
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult2.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult2.out
+</td></tr></table>
+
+Of course, you should not use \c noalias() when there is in fact aliasing taking place. If you do, then you
+may get wrong results:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult3.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult3.out
+</td></tr></table>
+
+Moreover, starting in Eigen 3.3, aliasing is \b not assumed if the destination matrix is resized and the product is not directly assigned to the destination.
+Therefore, the following example is also wrong:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult4.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult4.out
+</td></tr></table>
+
+As for any aliasing issue, you can resolve it by explicitly evaluating the expression prior to assignment:
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult5.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult5.out
+</td></tr></table>
+
+\section TopicAliasingSummary Summary
+
+Aliasing occurs when the same matrix or array coefficients appear both on the left- and the right-hand side of
+an assignment operator.
+ - Aliasing is harmless with coefficient-wise computations; this includes scalar multiplication and matrix or
+   array addition.
+ - When you multiply two matrices, %Eigen assumes that aliasing occurs. If you know that there is no aliasing,
+   then you can use \link MatrixBase::noalias() noalias()\endlink.
+ - In all other situations, %Eigen assumes that there is no aliasing issue and thus gives the wrong result if
+   aliasing does in fact occur. To prevent this, you have to use \link DenseBase::eval() eval() \endlink or
+   one of the xxxInPlace() functions.
+
+*/
+}

diff --git a/doc/TopicAssertions.dox b/doc/TopicAssertions.dox
new file mode 100644
index 0000000..c8b4d84
--- /dev/null
+++ b/doc/TopicAssertions.dox

@@ -0,0 +1,108 @@
+namespace Eigen {
+
+/** \page TopicAssertions Assertions
+
+\eigenAutoToc
+
+\section PlainAssert Assertions
+
+The macro eigen_assert is defined to be \c eigen_plain_assert by default. We use eigen_plain_assert instead of \c assert to work around a known bug for GCC <= 4.3. Basically, eigen_plain_assert \a is \c assert.
+
+\subsection RedefineAssert Redefining assertions
+
+Both eigen_assert and eigen_plain_assert are defined in Macros.h. Defining eigen_assert indirectly gives you a chance to change its behavior. You can redefine this macro if you want to do something else such as throwing an exception, and fall back to its default behavior with eigen_plain_assert. The code below tells Eigen to throw an std::runtime_error:
+
+\code
+#include <stdexcept>
+#undef eigen_assert
+#define eigen_assert(x) \
+  if (!(x)) { throw (std::runtime_error("Put your message here")); }
+\endcode
+
+\subsection DisableAssert Disabling assertions
+
+Assertions cost run time and can be turned off. You can suppress eigen_assert by defining \c EIGEN_NO_DEBUG \b before including Eigen headers. \c EIGEN_NO_DEBUG is undefined by default unless \c NDEBUG is defined.
+
+\section StaticAssert Static assertions
+
+Static assertions are not standardized until C++11. However, in the Eigen library, there are many conditions can and should be detectedat compile time. For instance, we use static assertions to prevent the code below from compiling.
+
+\code
+Matrix3d()  + Matrix4d();   // adding matrices of different sizes
+Matrix4cd() * Vector3cd();  // invalid product known at compile time
+\endcode
+
+Static assertions are defined in StaticAssert.h. If there is native static_assert, we use it. Otherwise, we have implemented an assertion macro that can show a limited range of messages.
+
+One can easily come up with static assertions without messages, such as:
+
+\code
+#define STATIC_ASSERT(x) \
+  switch(0) { case 0: case x:; }
+\endcode
+
+However, the example above obviously cannot tell why the assertion failed. Therefore, we define a \c struct in namespace Eigen::internal to handle available messages.
+
+\code
+template<bool condition>
+struct static_assertion {};
+
+template<>
+struct static_assertion<true>
+{
+  enum {
+    YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX,
+    YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES,
+    // see StaticAssert.h for all enums.
+  };
+};
+\endcode
+
+And then, we define EIGEN_STATIC_ASSERT(CONDITION,MSG) to access Eigen::internal::static_assertion<bool(CONDITION)>::MSG. If the condition evaluates into \c false, your compiler displays a lot of messages explaining there is no MSG in static_assert<false>. Nevertheless, this is \a not in what we are interested. As you can see, all members of static_assert<true> are ALL_CAPS_AND_THEY_ARE_SHOUTING.
+
+\warning
+When using this macro, MSG should be a member of static_assertion<true>, or the static assertion \b always fails.
+Currently, it can only be used in function scope.
+
+\subsection DerivedStaticAssert Derived static assertions
+
+There are other macros derived from EIGEN_STATIC_ASSERT to enhance readability. Their names are self-explanatory.
+
+- \b EIGEN_STATIC_ASSERT_FIXED_SIZE(TYPE) - passes if \a TYPE is fixed size.
+- \b EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(TYPE) - passes if \a TYPE is dynamic size.
+- \b EIGEN_STATIC_ASSERT_LVALUE(Derived) - failes if \a Derived is read-only.
+- \b EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) - passes if \a Derived is an array expression.
+- <b>EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2)</b> - failes if the two expressions are an array one and a matrix one.
+
+Because Eigen handles both fixed-size and dynamic-size expressions, some conditions cannot be clearly determined at compile time. We classify them into strict assertions and permissive assertions.
+
+\subsubsection StrictAssertions Strict assertions
+
+These assertions fail if the condition <b>may not</b> be met. For example, MatrixXd may not be a vector, so it fails EIGEN_STATIC_ASSERT_VECTOR_ONLY.
+
+- \b EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) - passes if \a TYPE must be a vector type.
+- <b>EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(TYPE, SIZE)</b> - passes if \a TYPE must be a vector of the given size.
+- <b>EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(TYPE, ROWS, COLS)</b> - passes if \a TYPE must be a matrix with given rows and columns.
+
+\subsubsection PermissiveAssertions Permissive assertions
+
+These assertions fail if the condition \b cannot be met. For example, MatrixXd and Matrix4d may have the same size, so they pass EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE.
+
+- \b EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) - fails if the two vector expression types must have different sizes.
+- \b EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(TYPE0,TYPE1) - fails if the two matrix expression types must have different sizes.
+- \b EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) - fails if \a TYPE cannot be an 1x1 expression.
+
+See StaticAssert.h for details such as what messages they throw.
+
+\subsection DisableStaticAssert Disabling static assertions
+
+If \c EIGEN_NO_STATIC_ASSERT is defined, static assertions turn into <tt>eigen_assert</tt>'s, working like:
+
+\code
+#define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
+\endcode
+
+This saves compile time but consumes more run time. \c EIGEN_NO_STATIC_ASSERT is undefined by default.
+
+*/
+}

diff --git a/doc/TopicCMakeGuide.dox b/doc/TopicCMakeGuide.dox
new file mode 100644
index 0000000..cf767d0
--- /dev/null
+++ b/doc/TopicCMakeGuide.dox

@@ -0,0 +1,56 @@
+namespace Eigen {
+
+/**
+
+\page TopicCMakeGuide Using %Eigen in CMake Projects
+
+%Eigen provides native CMake support which allows the library to be easily
+used in CMake projects.
+
+\note %CMake 3.0 (or later) is required to enable this functionality.
+
+%Eigen exports a CMake target called `Eigen3::Eigen` which can be imported
+using the `find_package` CMake command and used by calling
+`target_link_libraries` as in the following example:
+\code{.cmake}
+cmake_minimum_required (VERSION 3.0)
+project (myproject)
+
+find_package (Eigen3 3.3 REQUIRED NO_MODULE)
+
+add_executable (example example.cpp)
+target_link_libraries (example Eigen3::Eigen)
+\endcode
+
+The above code snippet must be placed in a file called `CMakeLists.txt` alongside
+`example.cpp`. After running
+\code{.sh}
+$ cmake path-to-example-directory
+\endcode
+CMake will produce project files that generate an executable called `example`
+which requires at least version 3.3 of %Eigen. Here, `path-to-example-directory`
+is the path to the directory that contains both `CMakeLists.txt` and
+`example.cpp`.
+
+Do not forget to set the <a href="https://cmake.org/cmake/help/v3.7/variable/CMAKE_PREFIX_PATH.html">\c CMAKE_PREFIX_PATH </a> variable if Eigen is not installed in a default location or if you want to pick a specific version. For instance:
+\code{.sh}
+$ cmake path-to-example-directory -DCMAKE_PREFIX_PATH=$HOME/mypackages
+\endcode
+An alternative is to set the \c Eigen3_DIR cmake's variable to the respective path containing the \c Eigen3*.cmake files. For instance:
+\code{.sh}
+$ cmake path-to-example-directory -DEigen3_DIR=$HOME/mypackages/share/eigen3/cmake/
+\endcode
+
+If the `REQUIRED` option is omitted when locating %Eigen using
+`find_package`, one can check whether the package was found as follows:
+\code{.cmake}
+find_package (Eigen3 3.3 NO_MODULE)
+
+if (TARGET Eigen3::Eigen)
+  # Use the imported target
+endif (TARGET Eigen3::Eigen)
+\endcode
+
+*/
+
+}

diff --git a/doc/TopicEigenExpressionTemplates.dox b/doc/TopicEigenExpressionTemplates.dox
new file mode 100644
index 0000000..b31fd47
--- /dev/null
+++ b/doc/TopicEigenExpressionTemplates.dox

@@ -0,0 +1,12 @@
+namespace Eigen {
+
+/** \page TopicEigenExpressionTemplates Expression templates in Eigen
+
+
+TODO: write this dox page!
+
+Is linked from the tutorial on arithmetic ops.
+
+*/
+
+}

diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox
new file mode 100644
index 0000000..d2a704f
--- /dev/null
+++ b/doc/TopicLazyEvaluation.dox

@@ -0,0 +1,97 @@
+namespace Eigen {
+
+/** \page TopicLazyEvaluation Lazy Evaluation and Aliasing
+
+Executive summary: %Eigen has intelligent compile-time mechanisms to enable lazy evaluation and removing temporaries where appropriate.
+It will handle aliasing automatically in most cases, for example with matrix products. The automatic behavior can be overridden
+manually by using the MatrixBase::eval() and MatrixBase::noalias() methods.
+
+When you write a line of code involving a complex expression such as
+
+\code mat1 = mat2 + mat3 * (mat4 + mat5);
+\endcode
+
+%Eigen determines automatically, for each sub-expression, whether to evaluate it into a temporary variable. Indeed, in certain cases it is better to evaluate a sub-expression into a temporary variable, while in other cases it is better to avoid that.
+
+A traditional math library without expression templates always evaluates all sub-expressions into temporaries. So with this code,
+
+\code vec1 = vec2 + vec3;
+\endcode
+
+a traditional library would evaluate \c vec2 + vec3 into a temporary \c vec4 and then copy \c vec4  into \c vec1. This is of course inefficient: the arrays are traversed twice, so there are a lot of useless load/store operations.
+
+Expression-templates-based libraries can avoid evaluating sub-expressions into temporaries, which in many cases results in large speed improvements.
+This is called <i>lazy evaluation</i> as an expression is getting evaluated as late as possible.
+In %Eigen <b>all expressions are lazy-evaluated</b>.
+More precisely, an expression starts to be evaluated once it is assigned to a matrix.
+Until then nothing happens beyond constructing the abstract expression tree.
+In contrast to most other expression-templates-based libraries, however, <b>%Eigen might choose to evaluate some sub-expressions into temporaries</b>.
+There are two reasons for that: first, pure lazy evaluation is not always a good choice for performance; second, pure lazy evaluation can be very dangerous, for example with matrix products: doing <tt>mat = mat*mat</tt> gives a wrong result if the matrix product is directly evaluated within the destination matrix, because of the way matrix product works.
+
+For these reasons, %Eigen has intelligent compile-time mechanisms to determine automatically which sub-expression should be evaluated into a temporary variable.
+
+So in the basic example,
+
+\code mat1 = mat2 + mat3;
+\endcode
+
+%Eigen chooses not to introduce any temporary. Thus the arrays are traversed only once, producing optimized code.
+If you really want to force immediate evaluation, use \link MatrixBase::eval() eval()\endlink:
+
+\code mat1 = (mat2 + mat3).eval();
+\endcode
+
+Here is now a more involved example:
+
+\code mat1 = -mat2 + mat3 + 5 * mat4;
+\endcode
+
+Here again %Eigen won't introduce any temporary, thus producing a single <b>fused</b> evaluation loop, which is clearly the correct choice.
+
+\section TopicLazyEvaluationWhichExpr Which sub-expressions are evaluated into temporaries?
+
+The default evaluation strategy is to fuse the operations in a single loop, and %Eigen will choose it except in a few circumstances.
+
+<b>The first circumstance</b> in which %Eigen chooses to evaluate a sub-expression is when it sees an assignment <tt>a = b;</tt> and the expression \c b has the evaluate-before-assigning \link flags flag\endlink.
+The most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do
+
+\code mat = mat * mat;
+\endcode
+
+%Eigen will evaluate <tt>mat * mat</tt> into a temporary matrix, and then copies it into the original \c mat.
+This guarantees a correct result as we saw above that lazy evaluation gives wrong results with matrix products.
+It also doesn't cost much, as the cost of the matrix product itself is much higher.
+Note that this temporary is introduced at evaluation time only, that is, within operator= in this example.
+The expression <tt>mat * mat</tt> still return a abstract product type.
+
+What if you know that the result does no alias the operand of the product and want to force lazy evaluation? Then use \link MatrixBase::noalias() .noalias()\endlink instead. Here is an example:
+
+\code mat1.noalias() = mat2 * mat2;
+\endcode
+
+Here, since we know that mat2 is not the same matrix as mat1, we know that lazy evaluation is not dangerous, so we may force lazy evaluation. Concretely, the effect of noalias() here is to bypass the evaluate-before-assigning \link flags flag\endlink.
+
+<b>The second circumstance</b> in which %Eigen chooses to evaluate a sub-expression, is when it sees a nested expression such as <tt>a + b</tt> where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink.
+Again, the most important example of such an expression is the \link Product matrix product expression\endlink.
+For example, when you do
+
+\code mat1 = mat2 * mat3 + mat4 * mat5;
+\endcode
+
+the products <tt>mat2 * mat3</tt> and <tt>mat4 * mat5</tt> gets evaluated separately into temporary matrices before being summed up in <tt>mat1</tt>.
+Indeed, to be efficient matrix products need to be evaluated within a destination matrix at hand, and not as simple "dot products".
+For small matrices, however, you might want to enforce a "dot-product" based lazy evaluation with lazyProduct().
+Again, it is important to understand that those temporaries are created at evaluation time only, that is in operator =.
+See TopicPitfalls_auto_keyword for common pitfalls regarding this remark.
+
+<b>The third circumstance</b> in which %Eigen chooses to evaluate a sub-expression, is when its cost model shows that the total cost of an operation is reduced if a sub-expression gets evaluated into a temporary.
+Indeed, in certain cases, an intermediate result is sufficiently costly to compute and is reused sufficiently many times, that is worth "caching". Here is an example:
+
+\code mat1 = mat2 * (mat3 + mat4);
+\endcode
+
+Here, provided the matrices have at least 2 rows and 2 columns, each coefficient of the expression <tt>mat3 + mat4</tt> is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. %Eigen understands this and evaluates <tt>mat3 + mat4</tt> into a temporary variable before evaluating the product.
+
+*/
+
+}

diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox
new file mode 100644
index 0000000..402b376
--- /dev/null
+++ b/doc/TopicLinearAlgebraDecompositions.dox

@@ -0,0 +1,287 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicLinearAlgebraDecompositions Catalogue of dense decompositions
+
+This page presents a catalogue of the dense matrix decompositions offered by Eigen.
+For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink.
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
+
+\section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen
+
+<table class="manual-vl">
+    <tr>
+        <th class="meta"></th>
+        <th class="meta" colspan="5">Generic information, not Eigen-specific</th>
+        <th class="meta" colspan="3">Eigen-specific</th>
+    </tr>
+
+    <tr>
+        <th>Decomposition</th>
+        <th>Requirements on the matrix</th>
+        <th>Speed</th>
+        <th>Algorithm reliability and accuracy</th>
+        <th>Rank-revealing</th>
+        <th>Allows to compute (besides linear solving)</th>
+        <th>Linear solver provided by Eigen</th>
+        <th>Maturity of Eigen's implementation</th>
+        <th>Optimizations</th>
+    </tr>
+
+    <tr>
+        <td>PartialPivLU</td>
+        <td>Invertible</td>
+        <td>Fast</td>
+        <td>Depends on condition number</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td>Blocking, Implicit MT</td>
+    </tr>
+
+    <tr class="alt">
+        <td>FullPivLU</td>
+        <td>-</td>
+        <td>Slow</td>
+        <td>Proven</td>
+        <td>Yes</td>
+        <td>-</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td>-</td>
+    </tr>
+
+    <tr>
+        <td>HouseholderQR</td>
+        <td>-</td>
+        <td>Fast</td>
+        <td>Depends on condition number</td>
+        <td>-</td>
+        <td>Orthogonalization</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td>Blocking</td>
+    </tr>
+
+    <tr class="alt">
+        <td>ColPivHouseholderQR</td>
+        <td>-</td>
+        <td>Fast</td>
+        <td>Good</td>
+        <td>Yes</td>
+        <td>Orthogonalization</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td><em>-</em></td>
+    </tr>
+
+    <tr>
+        <td>FullPivHouseholderQR</td>
+        <td>-</td>
+        <td>Slow</td>
+        <td>Proven</td>
+        <td>Yes</td>
+        <td>Orthogonalization</td>
+        <td>Yes</td>
+        <td>Average</td>
+        <td>-</td>
+    </tr>
+
+    <tr class="alt">
+        <td>CompleteOrthogonalDecomposition</td>
+        <td>-</td>
+        <td>Fast</td>
+        <td>Good</td>
+        <td>Yes</td>
+        <td>Orthogonalization</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td><em>-</em></td>
+    </tr>
+
+    <tr>
+        <td>LLT</td>
+        <td>Positive definite</td>
+        <td>Very fast</td>
+        <td>Depends on condition number</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td>Blocking</td>
+    </tr>
+
+    <tr class="alt">
+        <td>LDLT</td>
+        <td>Positive or negative semidefinite<sup><a href="#note1">1</a></sup></td>
+        <td>Very fast</td>
+        <td>Good</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td><em>Soon: blocking</em></td>
+    </tr>
+
+    <tr><th class="inter" colspan="9">\n Singular values and eigenvalues decompositions</th></tr>
+
+    <tr>
+        <td>BDCSVD (divide \& conquer)</td>
+        <td>-</td>
+        <td>One of the fastest SVD algorithms</td>
+        <td>Excellent</td>
+        <td>Yes</td>
+        <td>Singular values/vectors, least squares</td>
+        <td>Yes (and does least squares)</td>
+        <td>Excellent</td>
+        <td>Blocked bidiagonalization</td>
+    </tr>
+
+    <tr>
+        <td>JacobiSVD (two-sided)</td>
+        <td>-</td>
+        <td>Slow (but fast for small matrices)</td>
+        <td>Proven<sup><a href="#note3">3</a></sup></td>
+        <td>Yes</td>
+        <td>Singular values/vectors, least squares</td>
+        <td>Yes (and does least squares)</td>
+        <td>Excellent</td>
+        <td>R-SVD</td>
+    </tr>
+
+    <tr class="alt">
+        <td>SelfAdjointEigenSolver</td>
+        <td>Self-adjoint</td>
+        <td>Fast-average<sup><a href="#note2">2</a></sup></td>
+        <td>Good</td>
+        <td>Yes</td>
+        <td>Eigenvalues/vectors</td>
+        <td>-</td>
+        <td>Excellent</td>
+        <td><em>Closed forms for 2x2 and 3x3</em></td>
+    </tr>
+
+    <tr>
+        <td>ComplexEigenSolver</td>
+        <td>Square</td>
+        <td>Slow-very slow<sup><a href="#note2">2</a></sup></td>
+        <td>Depends on condition number</td>
+        <td>Yes</td>
+        <td>Eigenvalues/vectors</td>
+        <td>-</td>
+        <td>Average</td>
+        <td>-</td>
+    </tr>
+
+    <tr class="alt">
+        <td>EigenSolver</td>
+        <td>Square and real</td>
+        <td>Average-slow<sup><a href="#note2">2</a></sup></td>
+        <td>Depends on condition number</td>
+        <td>Yes</td>
+        <td>Eigenvalues/vectors</td>
+        <td>-</td>
+        <td>Average</td>
+        <td>-</td>
+    </tr>
+
+    <tr>
+        <td>GeneralizedSelfAdjointEigenSolver</td>
+        <td>Square</td>
+        <td>Fast-average<sup><a href="#note2">2</a></sup></td>
+        <td>Depends on condition number</td>
+        <td>-</td>
+        <td>Generalized eigenvalues/vectors</td>
+        <td>-</td>
+        <td>Good</td>
+        <td>-</td>
+    </tr>
+
+    <tr><th class="inter" colspan="9">\n Helper decompositions</th></tr>
+
+    <tr>
+        <td>RealSchur</td>
+        <td>Square and real</td>
+        <td>Average-slow<sup><a href="#note2">2</a></sup></td>
+        <td>Depends on condition number</td>
+        <td>Yes</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Average</td>
+        <td>-</td>
+    </tr>
+
+    <tr class="alt">
+        <td>ComplexSchur</td>
+        <td>Square</td>
+        <td>Slow-very slow<sup><a href="#note2">2</a></sup></td>
+        <td>Depends on condition number</td>
+        <td>Yes</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Average</td>
+        <td>-</td>
+    </tr>
+
+    <tr class="alt">
+        <td>Tridiagonalization</td>
+        <td>Self-adjoint</td>
+        <td>Fast</td>
+        <td>Good</td>
+        <td>-</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Good</td>
+        <td><em>Soon: blocking</em></td>
+    </tr>
+
+    <tr>
+        <td>HessenbergDecomposition</td>
+        <td>Square</td>
+        <td>Average</td>
+        <td>Good</td>
+        <td>-</td>
+        <td>-</td>
+        <td>-</td>
+        <td>Good</td>
+        <td><em>Soon: blocking</em></td>
+    </tr>
+
+</table>
+
+\b Notes:
+<ul>
+<li><a name="note1">\b 1: </a>There exist two variants of the LDLT algorithm. Eigen's one produces a pure diagonal D matrix, and therefore it cannot handle indefinite matrices, unlike Lapack's one which produces a block diagonal D matrix.</li>
+<li><a name="note2">\b 2: </a>Eigenvalues, SVD and Schur decompositions rely on iterative algorithms. Their convergence speed depends on how well the eigenvalues are separated.</li>
+<li><a name="note3">\b 3: </a>Our JacobiSVD is two-sided, making for proven and optimal precision for square matrices. For non-square matrices, we have to use a QR preconditioner first. The default choice, ColPivHouseholderQR, is already very reliable, but if you want it to be proven, use FullPivHouseholderQR instead.
+</ul>
+
+\section TopicLinAlgTerminology Terminology
+
+<dl>
+  <dt><b>Selfadjoint</b></dt>
+    <dd>For a real matrix, selfadjoint is a synonym for symmetric. For a complex matrix, selfadjoint is a synonym for \em hermitian.
+        More generally, a matrix \f$ A \f$ is selfadjoint if and only if it is equal to its adjoint \f$ A^* \f$. The adjoint is also called the \em conjugate \em transpose. </dd>
+  <dt><b>Positive/negative definite</b></dt>
+    <dd>A selfadjoint matrix \f$ A \f$ is positive definite if \f$ v^* A v > 0 \f$ for any non zero vector \f$ v \f$.
+        In the same vein, it is negative definite if \f$ v^* A v < 0 \f$ for any non zero vector \f$ v \f$ </dd>
+  <dt><b>Positive/negative semidefinite</b></dt>
+    <dd>A selfadjoint matrix \f$ A \f$ is positive semi-definite if \f$ v^* A v \ge 0 \f$ for any non zero vector \f$ v \f$.
+        In the same vein, it is negative semi-definite if \f$ v^* A v \le 0 \f$ for any non zero vector \f$ v \f$ </dd>
+
+  <dt><b>Blocking</b></dt>
+    <dd>Means the algorithm can work per block, whence guaranteeing a good scaling of the performance for large matrices.</dd>
+  <dt><b>Implicit Multi Threading (MT)</b></dt>
+    <dd>Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.</dd>
+  <dt><b>Explicit Multi Threading (MT)</b></dt>
+    <dd>Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.</dd>
+  <dt><b>Meta-unroller</b></dt>
+    <dd>Means the algorithm is automatically and explicitly unrolled for very small fixed size matrices.</dd>
+  <dt><b></b></dt>
+    <dd></dd>
+</dl>
+
+
+*/
+
+}

diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox
new file mode 100644
index 0000000..7a8ff30
--- /dev/null
+++ b/doc/TopicMultithreading.dox

@@ -0,0 +1,67 @@
+namespace Eigen {
+
+/** \page TopicMultiThreading Eigen and multi-threading
+
+\section TopicMultiThreading_MakingEigenMT Make Eigen run in parallel
+
+Some %Eigen's algorithms can exploit the multiple cores present in your hardware.
+To this end, it is enough to enable OpenMP on your compiler, for instance:
+ - GCC: \c -fopenmp
+ - ICC: \c -openmp
+ - MSVC: check the respective option in the build properties.
+
+You can control the number of threads that will be used using either the OpenMP API or %Eigen's API using the following priority:
+\code
+ OMP_NUM_THREADS=n ./my_program
+ omp_set_num_threads(n);
+ Eigen::setNbThreads(n);
+\endcode
+Unless `setNbThreads` has been called, %Eigen uses the number of threads specified by OpenMP.
+You can restore this behavior by calling `setNbThreads(0);`.
+You can query the number of threads that will be used with:
+\code
+n = Eigen::nbThreads( );
+\endcode
+You can disable %Eigen's multi threading at compile time by defining the \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_PARALLELIZE \endlink preprocessor token.
+
+Currently, the following algorithms can make use of multi-threading:
+ - general dense matrix - matrix products
+ - PartialPivLU
+ - row-major-sparse * dense vector/matrix products
+ - ConjugateGradient with \c Lower|Upper as the \c UpLo template parameter.
+ - BiCGSTAB with a row-major sparse matrix format.
+ - LeastSquaresConjugateGradient
+
+\warning On most OS it is <strong>very important</strong> to limit the number of threads to the number of physical cores, otherwise significant slowdowns are expected, especially for operations involving dense matrices.
+
+Indeed, the principle of hyper-threading is to run multiple threads (in most cases 2) on a single core in an interleaved manner.
+However, %Eigen's matrix-matrix product kernel is fully optimized and already exploits nearly 100% of the CPU capacity.
+Consequently, there is no room for running multiple such threads on a single core, and the performance would drops significantly because of cache pollution and other sources of overheads.
+At this stage of reading you're probably wondering why %Eigen does not limit itself to the number of physical cores?
+This is simply because OpenMP does not allow to know the number of physical cores, and thus %Eigen will launch as many threads as <i>cores</i> reported by OpenMP.
+
+\section TopicMultiThreading_UsingEigenWithMT Using Eigen in a multi-threaded application
+
+In the case your own application is multithreaded, and multiple threads make calls to %Eigen, then you have to initialize %Eigen by calling the following routine \b before creating the threads:
+\code
+#include <Eigen/Core>
+
+int main(int argc, char** argv)
+{
+  Eigen::initParallel();
+  
+  ...
+}
+\endcode
+
+\note With %Eigen 3.3, and a fully C++11 compliant compiler (i.e., <a href="http://en.cppreference.com/w/cpp/language/storage_duration#Static_local_variables">thread-safe static local variable initialization</a>), then calling \c initParallel() is optional.
+
+\warning Note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to `Eigen::initParallel()`. This is because these functions are based on `std::rand` which is not re-entrant.
+For thread-safe random generator, we recommend the use of c++11 random generators (\link DenseBase::NullaryExpr(Index, const CustomNullaryOp&) example \endlink) or `boost::random`.
+
+In the case your application is parallelized with OpenMP, you might want to disable %Eigen's own parallelization as detailed in the previous section.
+
+\warning Using OpenMP with custom scalar types that might throw exceptions can lead to unexpected behaviour in the event of throwing.
+*/
+
+}

diff --git a/doc/TopicResizing.dox b/doc/TopicResizing.dox
new file mode 100644
index 0000000..c323e17
--- /dev/null
+++ b/doc/TopicResizing.dox

@@ -0,0 +1,11 @@
+namespace Eigen {
+
+/** \page TopicResizing Resizing
+
+
+TODO: write this dox page!
+
+Is linked from the tutorial on the Matrix class.
+
+*/
+}

diff --git a/doc/TopicScalarTypes.dox b/doc/TopicScalarTypes.dox
new file mode 100644
index 0000000..2ff03c1
--- /dev/null
+++ b/doc/TopicScalarTypes.dox

@@ -0,0 +1,12 @@
+namespace Eigen {
+
+/** \page TopicScalarTypes Scalar types
+
+
+TODO: write this dox page!
+
+Is linked from the tutorial on the Matrix class.
+
+*/
+
+}

diff --git a/doc/TopicVectorization.dox b/doc/TopicVectorization.dox
new file mode 100644
index 0000000..274d045
--- /dev/null
+++ b/doc/TopicVectorization.dox

@@ -0,0 +1,9 @@
+namespace Eigen {
+
+/** \page TopicVectorization Vectorization
+
+
+TODO: write this dox page!
+
+*/
+}

diff --git a/doc/TutorialAdvancedInitialization.dox b/doc/TutorialAdvancedInitialization.dox
new file mode 100644
index 0000000..50374d0
--- /dev/null
+++ b/doc/TutorialAdvancedInitialization.dox

@@ -0,0 +1,162 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialAdvancedInitialization Advanced initialization
+
+This page discusses several advanced methods for initializing matrices. It gives more details on the
+comma-initializer, which was introduced before. It also explains how to get special matrices such as the
+identity matrix and the zero matrix.
+
+\eigenAutoToc
+
+\section TutorialAdvancedInitializationCommaInitializer The comma initializer
+
+Eigen offers a comma initializer syntax which allows the user to easily set all the coefficients of a matrix,
+vector or array. Simply list the coefficients, starting at the top-left corner and moving from left to right
+and from the top to the bottom. The size of the object needs to be specified beforehand. If you list too few
+or too many coefficients, Eigen will complain.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_commainit_01.cpp
+</td>
+<td>
+\verbinclude Tutorial_commainit_01.out
+</td></tr></table>
+
+Moreover, the elements of the initialization list may themselves be vectors or matrices. A common use is
+to join vectors or matrices together. For example, here is how to join two row vectors together. Remember
+that you have to set the size before you can use the comma initializer.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_AdvancedInitialization_Join.cpp
+</td>
+<td>
+\verbinclude Tutorial_AdvancedInitialization_Join.out
+</td></tr></table>
+
+We can use the same technique to initialize matrices with a block structure.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_AdvancedInitialization_Block.cpp
+</td>
+<td>
+\verbinclude Tutorial_AdvancedInitialization_Block.out
+</td></tr></table>
+
+The comma initializer can also be used to fill block expressions such as <tt>m.row(i)</tt>. Here is a more
+complicated way to get the same result as in the first example above:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_commainit_01b.cpp
+</td>
+<td>
+\verbinclude Tutorial_commainit_01b.out
+</td></tr></table>
+
+
+\section TutorialAdvancedInitializationSpecialMatrices Special matrices and arrays
+
+The Matrix and Array classes have static methods like \link DenseBase::Zero() Zero()\endlink, which can be
+used to initialize all coefficients to zero. There are three variants. The first variant takes no arguments
+and can only be used for fixed-size objects. If you want to initialize a dynamic-size object to zero, you need
+to specify the size. Thus, the second variant requires one argument and can be used for one-dimensional
+dynamic-size objects, while the third variant requires two arguments and can be used for two-dimensional
+objects. All three variants are illustrated in the following example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_AdvancedInitialization_Zero.cpp
+</td>
+<td>
+\verbinclude Tutorial_AdvancedInitialization_Zero.out
+</td></tr></table>
+
+Similarly, the static method \link DenseBase::Constant() Constant\endlink(value) sets all coefficients to \c value.
+If the size of the object needs to be specified, the additional arguments go before the \c value
+argument, as in <tt>MatrixXd::Constant(rows, cols, value)</tt>. The method \link DenseBase::Random() Random()
+\endlink fills the matrix or array with random coefficients. The identity matrix can be obtained by calling
+\link MatrixBase::Identity() Identity()\endlink; this method is only available for Matrix, not for Array,
+because "identity matrix" is a linear algebra concept.  The method
+\link DenseBase::LinSpaced LinSpaced\endlink(size, low, high) is only available for vectors and
+one-dimensional arrays; it yields a vector of the specified size whose coefficients are equally spaced between
+\c low and \c high. The method \c LinSpaced() is illustrated in the following example, which prints a table
+with angles in degrees, the corresponding angle in radians, and their sine and cosine.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_AdvancedInitialization_LinSpaced.cpp
+</td>
+<td>
+\verbinclude Tutorial_AdvancedInitialization_LinSpaced.out
+</td></tr></table>
+
+This example shows that objects like the ones returned by LinSpaced() can be assigned to variables (and
+expressions). Eigen defines utility functions like \link DenseBase::setZero() setZero()\endlink, 
+\link MatrixBase::setIdentity() \endlink and \link DenseBase::setLinSpaced() \endlink to do this
+conveniently. The following example contrasts three ways to construct the matrix
+\f$ J = \bigl[ \begin{smallmatrix} O & I \\ I & O \end{smallmatrix} \bigr] \f$: using static methods and
+assignment, using static methods and the comma-initializer, or using the setXxx() methods.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_AdvancedInitialization_ThreeWays.cpp
+</td>
+<td>
+\verbinclude Tutorial_AdvancedInitialization_ThreeWays.out
+</td></tr></table>
+
+A summary of all pre-defined matrix, vector and array objects can be found in the \ref QuickRefPage.
+
+
+\section TutorialAdvancedInitializationTemporaryObjects Usage as temporary objects
+
+As shown above, static methods as Zero() and Constant() can be used to initialize variables at the time of
+declaration or at the right-hand side of an assignment operator. You can think of these methods as returning a
+matrix or array; in fact, they return so-called \ref TopicEigenExpressionTemplates "expression objects" which
+evaluate to a matrix or array when needed, so that this syntax does not incur any overhead.
+
+These expressions can also be used as a temporary object. The second example in
+the \ref GettingStarted guide, which we reproduce here, already illustrates this.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include QuickStart_example2_dynamic.cpp
+</td>
+<td>
+\verbinclude QuickStart_example2_dynamic.out
+</td></tr></table>
+
+The expression <tt>m + MatrixXf::Constant(3,3,1.2)</tt> constructs the 3-by-3 matrix expression with all its coefficients
+equal to 1.2 plus the corresponding coefficient of \a m.
+
+The comma-initializer, too, can also be used to construct temporary objects. The following example constructs a random
+matrix of size 2-by-3, and then multiplies this matrix on the left with 
+\f$ \bigl[ \begin{smallmatrix} 0 & 1 \\ 1 & 0 \end{smallmatrix} \bigr] \f$.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_AdvancedInitialization_CommaTemporary.cpp
+</td>
+<td>
+\verbinclude Tutorial_AdvancedInitialization_CommaTemporary.out
+</td></tr></table>
+
+The \link CommaInitializer::finished() finished() \endlink method is necessary here to get the actual matrix
+object once the comma initialization of our temporary submatrix is done.
+
+
+*/
+
+}

diff --git a/doc/TutorialArrayClass.dox b/doc/TutorialArrayClass.dox
new file mode 100644
index 0000000..f6f3510
--- /dev/null
+++ b/doc/TutorialArrayClass.dox

@@ -0,0 +1,192 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialArrayClass The Array class and coefficient-wise operations
+
+This page aims to provide an overview and explanations on how to use
+Eigen's Array class.
+
+\eigenAutoToc
+  
+\section TutorialArrayClassIntro What is the Array class?
+
+The Array class provides general-purpose arrays, as opposed to the Matrix class which
+is intended for linear algebra. Furthermore, the Array class provides an easy way to
+perform coefficient-wise operations, which might not have a linear algebraic meaning,
+such as adding a constant to every coefficient in the array or multiplying two arrays coefficient-wise.
+
+
+\section TutorialArrayClassTypes Array types
+Array is a class template taking the same template parameters as Matrix.
+As with Matrix, the first three template parameters are mandatory:
+\code
+Array<typename Scalar, int RowsAtCompileTime, int ColsAtCompileTime>
+\endcode
+The last three template parameters are optional. Since this is exactly the same as for Matrix,
+we won't explain it again here and just refer to \ref TutorialMatrixClass.
+
+Eigen also provides typedefs for some common cases, in a way that is similar to the Matrix typedefs
+but with some slight differences, as the word "array" is used for both 1-dimensional and 2-dimensional arrays.
+We adopt the convention that typedefs of the form ArrayNt stand for 1-dimensional arrays, where N and t are
+the size and the scalar type, as in the Matrix typedefs explained on \ref TutorialMatrixClass "this page". For 2-dimensional arrays, we
+use typedefs of the form ArrayNNt. Some examples are shown in the following table:
+
+<table class="manual">
+  <tr>
+    <th>Type </th>
+    <th>Typedef </th>
+  </tr>
+  <tr>
+    <td> \code Array<float,Dynamic,1> \endcode </td>
+    <td> \code ArrayXf \endcode </td>
+  </tr>
+  <tr>
+    <td> \code Array<float,3,1> \endcode </td>
+    <td> \code Array3f \endcode </td>
+  </tr>
+  <tr>
+    <td> \code Array<double,Dynamic,Dynamic> \endcode </td>
+    <td> \code ArrayXXd \endcode </td>
+  </tr>
+  <tr>
+    <td> \code Array<double,3,3> \endcode </td>
+    <td> \code Array33d \endcode </td>
+  </tr>
+</table>
+
+
+\section TutorialArrayClassAccess Accessing values inside an Array
+
+The parenthesis operator is overloaded to provide write and read access to the coefficients of an array, just as with matrices.
+Furthermore, the \c << operator can be used to initialize arrays (via the comma initializer) or to print them.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ArrayClass_accessors.cpp
+</td>
+<td>
+\verbinclude Tutorial_ArrayClass_accessors.out
+</td></tr></table>
+
+For more information about the comma initializer, see \ref TutorialAdvancedInitialization.
+
+
+\section TutorialArrayClassAddSub Addition and subtraction
+
+Adding and subtracting two arrays is the same as for matrices.
+The operation is valid if both arrays have the same size, and the addition or subtraction is done coefficient-wise.
+
+Arrays also support expressions of the form <tt>array + scalar</tt> which add a scalar to each coefficient in the array.
+This provides a functionality that is not directly available for Matrix objects.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ArrayClass_addition.cpp
+</td>
+<td>
+\verbinclude Tutorial_ArrayClass_addition.out
+</td></tr></table>
+
+
+\section TutorialArrayClassMult Array multiplication
+
+First of all, of course you can multiply an array by a scalar, this works in the same way as matrices. Where arrays
+are fundamentally different from matrices, is when you multiply two together. Matrices interpret
+multiplication as matrix product and arrays interpret multiplication as coefficient-wise product. Thus, two 
+arrays can be multiplied if and only if they have the same dimensions.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ArrayClass_mult.cpp
+</td>
+<td>
+\verbinclude Tutorial_ArrayClass_mult.out
+</td></tr></table>
+
+
+\section TutorialArrayClassCwiseOther Other coefficient-wise operations
+
+The Array class defines other coefficient-wise operations besides the addition, subtraction and multiplication
+operators described above. For example, the \link ArrayBase::abs() .abs() \endlink method takes the absolute
+value of each coefficient, while \link ArrayBase::sqrt() .sqrt() \endlink computes the square root of the
+coefficients. If you have two arrays of the same size, you can call \link ArrayBase::min(const Eigen::ArrayBase<OtherDerived>&) const .min(.) \endlink to
+construct the array whose coefficients are the minimum of the corresponding coefficients of the two given
+arrays. These operations are illustrated in the following example.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ArrayClass_cwise_other.cpp
+</td>
+<td>
+\verbinclude Tutorial_ArrayClass_cwise_other.out
+</td></tr></table>
+
+More coefficient-wise operations can be found in the \ref QuickRefPage.
+
+
+\section TutorialArrayClassConvert Converting between array and matrix expressions
+
+When should you use objects of the Matrix class and when should you use objects of the Array class? You cannot
+apply Matrix operations on arrays, or Array operations on matrices. Thus, if you need to do linear algebraic
+operations such as matrix multiplication, then you should use matrices; if you need to do coefficient-wise
+operations, then you should use arrays. However, sometimes it is not that simple, but you need to use both
+Matrix and Array operations. In that case, you need to convert a matrix to an array or reversely. This gives
+access to all operations regardless of the choice of declaring objects as arrays or as matrices.
+
+\link MatrixBase Matrix expressions \endlink have an \link MatrixBase::array() .array() \endlink method that
+'converts' them into \link ArrayBase array expressions\endlink, so that coefficient-wise operations
+can be applied easily. Conversely, \link ArrayBase array expressions \endlink
+have a \link ArrayBase::matrix() .matrix() \endlink method. As with all Eigen expression abstractions,
+this doesn't have any runtime cost (provided that you let your compiler optimize).
+Both \link MatrixBase::array() .array() \endlink and \link ArrayBase::matrix() .matrix() \endlink 
+can be used as rvalues and as lvalues.
+
+Mixing matrices and arrays in an expression is forbidden with Eigen. For instance, you cannot add a matrix and
+array directly; the operands of a \c + operator should either both be matrices or both be arrays. However,
+it is easy to convert from one to the other with \link MatrixBase::array() .array() \endlink and 
+\link ArrayBase::matrix() .matrix()\endlink. The exception to this rule is the assignment operator: it is
+allowed to assign a matrix expression to an array variable, or to assign an array expression to a matrix
+variable.
+
+The following example shows how to use array operations on a Matrix object by employing the 
+\link MatrixBase::array() .array() \endlink method. For example, the statement 
+<tt>result = m.array() * n.array()</tt> takes two matrices \c m and \c n, converts them both to an array, uses
+* to multiply them coefficient-wise and assigns the result to the matrix variable \c result (this is legal
+because Eigen allows assigning array expressions to matrix variables). 
+
+As a matter of fact, this usage case is so common that Eigen provides a \link MatrixBase::cwiseProduct const
+.cwiseProduct(.) \endlink method for matrices to compute the coefficient-wise product. This is also shown in
+the example program.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ArrayClass_interop_matrix.cpp
+</td>
+<td>
+\verbinclude Tutorial_ArrayClass_interop_matrix.out
+</td></tr></table>
+
+Similarly, if \c array1 and \c array2 are arrays, then the expression <tt>array1.matrix() * array2.matrix()</tt>
+computes their matrix product.
+
+Here is a more advanced example. The expression <tt>(m.array() + 4).matrix() * m</tt> adds 4 to every
+coefficient in the matrix \c m and then computes the matrix product of the result with \c m. Similarly, the
+expression <tt>(m.array() * n.array()).matrix() * m</tt> computes the coefficient-wise product of the matrices
+\c m and \c n and then the matrix product of the result with \c m.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ArrayClass_interop.cpp
+</td>
+<td>
+\verbinclude Tutorial_ArrayClass_interop.out
+</td></tr></table>
+
+*/
+
+}

diff --git a/doc/TutorialBlockOperations.dox b/doc/TutorialBlockOperations.dox
new file mode 100644
index 0000000..df27748
--- /dev/null
+++ b/doc/TutorialBlockOperations.dox

@@ -0,0 +1,242 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialBlockOperations Block operations
+
+This page explains the essentials of block operations.
+A block is a rectangular part of a matrix or array. Blocks expressions can be used both
+as rvalues and as lvalues. As usual with Eigen expressions, this abstraction has zero runtime cost
+provided that you let your compiler optimize.
+
+\eigenAutoToc
+
+\section TutorialBlockOperationsUsing Using block operations
+
+The most general block operation in Eigen is called \link DenseBase::block() .block() \endlink.
+There are two versions, whose syntax is as follows:
+
+<table class="manual">
+<tr><th>\b %Block \b operation</td>
+<th>Version constructing a \n dynamic-size block expression</th>
+<th>Version constructing a \n fixed-size block expression</th></tr>
+<tr><td>%Block of size <tt>(p,q)</tt>, starting at <tt>(i,j)</tt></td>
+    <td>\code
+matrix.block(i,j,p,q);\endcode </td>
+    <td>\code 
+matrix.block<p,q>(i,j);\endcode </td>
+</tr>
+</table>
+
+As always in Eigen, indices start at 0.
+
+Both versions can be used on fixed-size and dynamic-size matrices and arrays.
+These two expressions are semantically equivalent.
+The only difference is that the fixed-size version will typically give you faster code if the block size is small,
+but requires this size to be known at compile time.
+
+The following program uses the dynamic-size and fixed-size versions to print the values of several blocks inside a
+matrix.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_BlockOperations_print_block.cpp
+</td>
+<td>
+\verbinclude Tutorial_BlockOperations_print_block.out
+</td></tr></table>
+
+In the above example the \link DenseBase::block() .block() \endlink function was employed as a \em rvalue, i.e.
+it was only read from. However, blocks can also be used as \em lvalues, meaning that you can assign to a block.
+
+This is illustrated in the following example. This example also demonstrates blocks in arrays, which works exactly like the above-demonstrated blocks in matrices.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_BlockOperations_block_assignment.cpp
+</td>
+<td>
+\verbinclude Tutorial_BlockOperations_block_assignment.out
+</td></tr></table>
+
+While the \link DenseBase::block() .block() \endlink method can be used for any block operation, there are
+other methods for special cases, providing more specialized API and/or better performance. On the topic of performance, all what
+matters is that you give Eigen as much information as possible at compile time. For example, if your block is a single whole column in a matrix,
+using the specialized \link DenseBase::col() .col() \endlink function described below lets Eigen know that, which can give it optimization opportunities.
+
+The rest of this page describes these specialized methods.
+
+\section TutorialBlockOperationsSyntaxColumnRows Columns and rows
+
+Individual columns and rows are special cases of blocks. Eigen provides methods to easily address them:
+\link DenseBase::col() .col() \endlink and \link DenseBase::row() .row()\endlink.
+
+<table class="manual">
+<tr><th>%Block operation</th>
+<th>Method</th>
+<tr><td>i<sup>th</sup> row
+                    \link DenseBase::row() * \endlink</td>
+    <td>\code
+matrix.row(i);\endcode </td>
+</tr>
+<tr><td>j<sup>th</sup> column
+                    \link DenseBase::col() * \endlink</td>
+    <td>\code
+matrix.col(j);\endcode </td>
+</tr>
+</table>
+
+The argument for \p col() and \p row() is the index of the column or row to be accessed. As always in Eigen, indices start at 0.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_BlockOperations_colrow.cpp
+</td>
+<td>
+\verbinclude Tutorial_BlockOperations_colrow.out
+</td></tr></table>
+
+That example also demonstrates that block expressions (here columns) can be used in arithmetic like any other expression.
+
+
+\section TutorialBlockOperationsSyntaxCorners Corner-related operations
+
+Eigen also provides special methods for blocks that are flushed against one of the corners or sides of a
+matrix or array. For instance, \link DenseBase::topLeftCorner() .topLeftCorner() \endlink can be used to refer
+to a block in the top-left corner of a matrix.
+
+The different possibilities are summarized in the following table:
+
+<table class="manual">
+<tr><th>%Block \b operation</td>
+<th>Version constructing a \n dynamic-size block expression</th>
+<th>Version constructing a \n fixed-size block expression</th></tr>
+<tr><td>Top-left p by q block \link DenseBase::topLeftCorner() * \endlink</td>
+    <td>\code
+matrix.topLeftCorner(p,q);\endcode </td>
+    <td>\code 
+matrix.topLeftCorner<p,q>();\endcode </td>
+</tr>
+<tr><td>Bottom-left p by q block
+              \link DenseBase::bottomLeftCorner() * \endlink</td>
+    <td>\code
+matrix.bottomLeftCorner(p,q);\endcode </td>
+    <td>\code 
+matrix.bottomLeftCorner<p,q>();\endcode </td>
+</tr>
+<tr><td>Top-right p by q block
+              \link DenseBase::topRightCorner() * \endlink</td>
+    <td>\code
+matrix.topRightCorner(p,q);\endcode </td>
+    <td>\code 
+matrix.topRightCorner<p,q>();\endcode </td>
+</tr>
+<tr><td>Bottom-right p by q block
+               \link DenseBase::bottomRightCorner() * \endlink</td>
+    <td>\code
+matrix.bottomRightCorner(p,q);\endcode </td>
+    <td>\code 
+matrix.bottomRightCorner<p,q>();\endcode </td>
+</tr>
+<tr><td>%Block containing the first q rows
+                   \link DenseBase::topRows() * \endlink</td>
+    <td>\code
+matrix.topRows(q);\endcode </td>
+    <td>\code 
+matrix.topRows<q>();\endcode </td>
+</tr>
+<tr><td>%Block containing the last q rows
+                    \link DenseBase::bottomRows() * \endlink</td>
+    <td>\code
+matrix.bottomRows(q);\endcode </td>
+    <td>\code 
+matrix.bottomRows<q>();\endcode </td>
+</tr>
+<tr><td>%Block containing the first p columns
+                    \link DenseBase::leftCols() * \endlink</td>
+    <td>\code
+matrix.leftCols(p);\endcode </td>
+    <td>\code 
+matrix.leftCols<p>();\endcode </td>
+</tr>
+<tr><td>%Block containing the last q columns
+                    \link DenseBase::rightCols() * \endlink</td>
+    <td>\code
+matrix.rightCols(q);\endcode </td>
+    <td>\code 
+matrix.rightCols<q>();\endcode </td>
+</tr>
+<tr><td>%Block containing the q columns starting from i
+                    \link DenseBase::middleCols() * \endlink</td>
+    <td>\code
+matrix.middleCols(i,q);\endcode </td>
+    <td>\code 
+matrix.middleCols<q>(i);\endcode </td>
+</tr>
+<tr><td>%Block containing the q rows starting from i
+                    \link DenseBase::middleRows() * \endlink</td>
+    <td>\code
+matrix.middleRows(i,q);\endcode </td>
+    <td>\code 
+matrix.middleRows<q>(i);\endcode </td>
+</tr>
+</table>
+
+Here is a simple example illustrating the use of the operations presented above:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_BlockOperations_corner.cpp
+</td>
+<td>
+\verbinclude Tutorial_BlockOperations_corner.out
+</td></tr></table>
+
+
+\section TutorialBlockOperationsSyntaxVectors Block operations for vectors
+
+Eigen also provides a set of block operations designed specifically for the special case of vectors and one-dimensional arrays:
+
+<table class="manual">
+<tr><th> %Block operation</th>
+<th>Version constructing a \n dynamic-size block expression</th>
+<th>Version constructing a \n fixed-size block expression</th></tr>
+<tr><td>%Block containing the first \p n elements 
+                    \link DenseBase::head() * \endlink</td>
+    <td>\code
+vector.head(n);\endcode </td>
+    <td>\code 
+vector.head<n>();\endcode </td>
+</tr>
+<tr><td>%Block containing the last \p n elements
+                    \link DenseBase::tail() * \endlink</td>
+    <td>\code
+vector.tail(n);\endcode </td>
+    <td>\code 
+vector.tail<n>();\endcode </td>
+</tr>
+<tr><td>%Block containing \p n elements, starting at position \p i
+                    \link DenseBase::segment() * \endlink</td>
+    <td>\code
+vector.segment(i,n);\endcode </td>
+    <td>\code 
+vector.segment<n>(i);\endcode </td>
+</tr>
+</table>
+
+
+An example is presented below:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_BlockOperations_vector.cpp
+</td>
+<td>
+\verbinclude Tutorial_BlockOperations_vector.out
+</td></tr></table>
+
+*/
+
+}

diff --git a/doc/TutorialGeometry.dox b/doc/TutorialGeometry.dox
new file mode 100644
index 0000000..1d214f3
--- /dev/null
+++ b/doc/TutorialGeometry.dox

@@ -0,0 +1,242 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialGeometry Space transformations
+
+In this page, we will introduce the many possibilities offered by the \ref Geometry_Module "geometry module" to deal with 2D and 3D rotations and projective or affine transformations.
+
+\eigenAutoToc
+
+Eigen's Geometry module provides two different kinds of geometric transformations:
+  - Abstract transformations, such as rotations (represented by \ref AngleAxis "angle and axis" or by a \ref Quaternion "quaternion"), \ref Translation "translations", \ref Scaling "scalings". These transformations are NOT represented as matrices, but you can nevertheless mix them with matrices and vectors in expressions, and convert them to matrices if you wish.
+  - Projective or affine transformation matrices: see the Transform class. These are really matrices.
+
+\note If you are working with OpenGL 4x4 matrices then Affine3f and Affine3d are what you want. Since Eigen defaults to column-major storage, you can directly use the Transform::data() method to pass your transformation matrix to OpenGL.
+
+You can construct a Transform from an abstract transformation, like this:
+\code
+  Transform t(AngleAxis(angle,axis));
+\endcode
+or like this:
+\code
+  Transform t;
+  t = AngleAxis(angle,axis);
+\endcode
+But note that unfortunately, because of how C++ works, you can \b not do this:
+\code
+  Transform t = AngleAxis(angle,axis);
+\endcode
+<span class="note">\b Explanation: In the C++ language, this would require Transform to have a non-explicit conversion constructor from AngleAxis, but we really don't want to allow implicit casting here.
+</span>
+
+\section TutorialGeoElementaryTransformations Transformation types
+
+<table class="manual">
+<tr><th>Transformation type</th><th>Typical initialization code</th></tr>
+<tr><td>
+\ref Rotation2D "2D rotation" from an angle</td><td>\code
+Rotation2D<float> rot2(angle_in_radian);\endcode</td></tr>
+<tr class="alt"><td>
+3D rotation as an \ref AngleAxis "angle + axis"</td><td>\code
+AngleAxis<float> aa(angle_in_radian, Vector3f(ax,ay,az));\endcode
+<span class="note">The axis vector must be normalized.</span></td></tr>
+<tr><td>
+3D rotation as a \ref Quaternion "quaternion"</td><td>\code
+Quaternion<float> q;  q = AngleAxis<float>(angle_in_radian, axis);\endcode</td></tr>
+<tr class="alt"><td>
+N-D Scaling</td><td>\code
+Scaling(sx, sy)
+Scaling(sx, sy, sz)
+Scaling(s)
+Scaling(vecN)\endcode</td></tr>
+<tr><td>
+N-D Translation</td><td>\code
+Translation<float,2>(tx, ty)
+Translation<float,3>(tx, ty, tz)
+Translation<float,N>(s)
+Translation<float,N>(vecN)\endcode</td></tr>
+<tr class="alt"><td>
+N-D \ref TutorialGeoTransform "Affine transformation"</td><td>\code
+Transform<float,N,Affine> t = concatenation_of_any_transformations;
+Transform<float,3,Affine> t = Translation3f(p) * AngleAxisf(a,axis) * Scaling(s);\endcode</td></tr>
+<tr><td>
+N-D Linear transformations \n
+<em class=note>(pure rotations, \n scaling, etc.)</em></td><td>\code
+Matrix<float,N> t = concatenation_of_rotations_and_scalings;
+Matrix<float,2> t = Rotation2Df(a) * Scaling(s);
+Matrix<float,3> t = AngleAxisf(a,axis) * Scaling(s);\endcode</td></tr>
+</table>
+
+<strong>Notes on rotations</strong>\n To transform more than a single vector the preferred
+representations are rotation matrices, while for other usages Quaternion is the
+representation of choice as they are compact, fast and stable. Finally Rotation2D and
+AngleAxis are mainly convenient types to create other rotation objects.
+
+<strong>Notes on Translation and Scaling</strong>\n Like AngleAxis, these classes were
+designed to simplify the creation/initialization of linear (Matrix) and affine (Transform)
+transformations. Nevertheless, unlike AngleAxis which is inefficient to use, these classes
+might still be interesting to write generic and efficient algorithms taking as input any
+kind of transformations.
+
+Any of the above transformation types can be converted to any other types of the same nature,
+or to a more generic type. Here are some additional examples:
+<table class="manual">
+<tr><td>\code
+Rotation2Df r;  r  = Matrix2f(..);       // assumes a pure rotation matrix
+AngleAxisf aa;  aa = Quaternionf(..);
+AngleAxisf aa;  aa = Matrix3f(..);       // assumes a pure rotation matrix
+Matrix2f m;     m  = Rotation2Df(..);
+Matrix3f m;     m  = Quaternionf(..);       Matrix3f m;   m = Scaling(..);
+Affine3f m;     m  = AngleAxis3f(..);       Affine3f m;   m = Scaling(..);
+Affine3f m;     m  = Translation3f(..);     Affine3f m;   m = Matrix3f(..);
+\endcode</td></tr>
+</table>
+
+
+<a href="#" class="top">top</a>\section TutorialGeoCommontransformationAPI Common API across transformation types
+
+To some extent, Eigen's \ref Geometry_Module "geometry module" allows you to write
+generic algorithms working on any kind of transformation representations:
+<table class="manual">
+<tr><td>
+Concatenation of two transformations</td><td>\code
+gen1 * gen2;\endcode</td></tr>
+<tr class="alt"><td>Apply the transformation to a vector</td><td>\code
+vec2 = gen1 * vec1;\endcode</td></tr>
+<tr><td>Get the inverse of the transformation</td><td>\code
+gen2 = gen1.inverse();\endcode</td></tr>
+<tr class="alt"><td>Spherical interpolation \n (Rotation2D and Quaternion only)</td><td>\code
+rot3 = rot1.slerp(alpha,rot2);\endcode</td></tr>
+</table>
+
+
+
+<a href="#" class="top">top</a>\section TutorialGeoTransform Affine transformations
+Generic affine transformations are represented by the Transform class which internally
+is a (Dim+1)^2 matrix. In Eigen we have chosen to not distinghish between points and
+vectors such that all points are actually represented by displacement vectors from the
+origin ( \f$ \mathbf{p} \equiv \mathbf{p}-0 \f$ ). With that in mind, real points and
+vector distinguish when the transformation is applied.
+<table class="manual">
+<tr><td>
+Apply the transformation to a \b point </td><td>\code
+VectorNf p1, p2;
+p2 = t * p1;\endcode</td></tr>
+<tr class="alt"><td>
+Apply the transformation to a \b vector </td><td>\code
+VectorNf vec1, vec2;
+vec2 = t.linear() * vec1;\endcode</td></tr>
+<tr><td>
+Apply a \em general transformation \n to a \b normal \b vector \n
+</td><td>\code
+VectorNf n1, n2;
+MatrixNf normalMatrix = t.linear().inverse().transpose();
+n2 = (normalMatrix * n1).normalized();\endcode</td></tr>
+<tr><td colspan="2">(See subject 5.27 of this <a href="http://www.faqs.org/faqs/graphics/algorithms-faq">faq</a> for the explanations)</td></tr>
+<tr class="alt"><td>
+Apply a transformation with \em pure \em rotation \n to a \b normal \b vector
+(no scaling, no shear)</td><td>\code
+n2 = t.linear() * n1;\endcode</td></tr>
+<tr><td>
+OpenGL compatibility \b 3D </td><td>\code
+glLoadMatrixf(t.data());\endcode</td></tr>
+<tr class="alt"><td>
+OpenGL compatibility \b 2D </td><td>\code
+Affine3f aux(Affine3f::Identity());
+aux.linear().topLeftCorner<2,2>() = t.linear();
+aux.translation().start<2>() = t.translation();
+glLoadMatrixf(aux.data());\endcode</td></tr>
+</table>
+
+\b Component \b accessors
+<table class="manual">
+<tr><td>
+full read-write access to the internal matrix</td><td>\code
+t.matrix() = matN1xN1;    // N1 means N+1
+matN1xN1 = t.matrix();
+\endcode</td></tr>
+<tr class="alt"><td>
+coefficient accessors</td><td>\code
+t(i,j) = scalar;   <=>   t.matrix()(i,j) = scalar;
+scalar = t(i,j);   <=>   scalar = t.matrix()(i,j);
+\endcode</td></tr>
+<tr><td>
+translation part</td><td>\code
+t.translation() = vecN;
+vecN = t.translation();
+\endcode</td></tr>
+<tr class="alt"><td>
+linear part</td><td>\code
+t.linear() = matNxN;
+matNxN = t.linear();
+\endcode</td></tr>
+<tr><td>
+extract the rotation matrix</td><td>\code
+matNxN = t.rotation();
+\endcode</td></tr>
+</table>
+
+
+\b Transformation \b creation \n
+While transformation objects can be created and updated concatenating elementary transformations,
+the Transform class also features a procedural API:
+<table class="manual">
+<tr><th></th><th>procedural API</th><th>equivalent natural API </th></tr>
+<tr><td>Translation</td><td>\code
+t.translate(Vector_(tx,ty,..));
+t.pretranslate(Vector_(tx,ty,..));
+\endcode</td><td>\code
+t *= Translation_(tx,ty,..);
+t = Translation_(tx,ty,..) * t;
+\endcode</td></tr>
+<tr class="alt"><td>\b Rotation \n <em class="note">In 2D and for the procedural API, any_rotation can also \n be an angle in radian</em></td><td>\code
+t.rotate(any_rotation);
+t.prerotate(any_rotation);
+\endcode</td><td>\code
+t *= any_rotation;
+t = any_rotation * t;
+\endcode</td></tr>
+<tr><td>Scaling</td><td>\code
+t.scale(Vector_(sx,sy,..));
+t.scale(s);
+t.prescale(Vector_(sx,sy,..));
+t.prescale(s);
+\endcode</td><td>\code
+t *= Scaling(sx,sy,..);
+t *= Scaling(s);
+t = Scaling(sx,sy,..) * t;
+t = Scaling(s) * t;
+\endcode</td></tr>
+<tr class="alt"><td>Shear transformation \n ( \b 2D \b only ! )</td><td>\code
+t.shear(sx,sy);
+t.preshear(sx,sy);
+\endcode</td><td></td></tr>
+</table>
+
+Note that in both API, any many transformations can be concatenated in a single expression as shown in the two following equivalent examples:
+<table class="manual">
+<tr><td>\code
+t.pretranslate(..).rotate(..).translate(..).scale(..);
+\endcode</td></tr>
+<tr><td>\code
+t = Translation_(..) * t * RotationType(..) * Translation_(..) * Scaling(..);
+\endcode</td></tr>
+</table>
+
+
+
+<a href="#" class="top">top</a>\section TutorialGeoEulerAngles Euler angles
+<table class="manual">
+<tr><td style="max-width:30em;">
+Euler angles might be convenient to create rotation objects.
+On the other hand, since there exist 24 different conventions, they are pretty confusing to use. This example shows how
+to create a rotation matrix according to the 2-1-2 convention.</td><td>\code
+Matrix3f m;
+m = AngleAxisf(angle1, Vector3f::UnitZ())
+ *  * AngleAxisf(angle2, Vector3f::UnitY())
+ *  * AngleAxisf(angle3, Vector3f::UnitZ());
+\endcode</td></tr>
+</table>
+
+*/
+
+}

diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox
new file mode 100644
index 0000000..8042fca
--- /dev/null
+++ b/doc/TutorialLinearAlgebra.dox

@@ -0,0 +1,299 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialLinearAlgebra Linear algebra and decompositions
+
+This page explains how to solve linear systems, compute various decompositions such as LU,
+QR, %SVD, eigendecompositions... After reading this page, don't miss our
+\link TopicLinearAlgebraDecompositions catalogue \endlink of dense matrix decompositions.
+
+\eigenAutoToc
+
+\section TutorialLinAlgBasicSolve Basic linear solving
+
+\b The \b problem: You have a system of equations, that you have written as a single matrix equation
+    \f[ Ax \: = \: b \f]
+Where \a A and \a b are matrices (\a b could be a vector, as a special case). You want to find a solution \a x.
+
+\b The \b solution: You can choose between various decompositions, depending on the properties of your matrix \a A,
+and depending on whether you favor speed or accuracy. However, let's start with an example that works in all cases,
+and is a good compromise:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgExSolveColPivHouseholderQR.cpp </td>
+  <td>\verbinclude TutorialLinAlgExSolveColPivHouseholderQR.out </td>
+</tr>
+</table>
+
+In this example, the colPivHouseholderQr() method returns an object of class ColPivHouseholderQR. Since here the
+matrix is of type Matrix3f, this line could have been replaced by:
+\code
+ColPivHouseholderQR<Matrix3f> dec(A);
+Vector3f x = dec.solve(b);
+\endcode
+
+Here, ColPivHouseholderQR is a QR decomposition with column pivoting. It's a good compromise for this tutorial, as it
+works for all matrices while being quite fast. Here is a table of some other decompositions that you can choose from,
+depending on your matrix, the problem you are trying to solve, and the trade-off you want to make:
+
+<table class="manual">
+    <tr>
+        <th>Decomposition</th>
+        <th>Method</th>
+        <th>Requirements<br/>on the matrix</th>
+        <th>Speed<br/> (small-to-medium)</th>
+        <th>Speed<br/> (large)</th>
+        <th>Accuracy</th>
+    </tr>
+    <tr>
+        <td>PartialPivLU</td>
+        <td>partialPivLu()</td>
+        <td>Invertible</td>
+        <td>++</td>
+        <td>++</td>
+        <td>+</td>
+    </tr>
+    <tr class="alt">
+        <td>FullPivLU</td>
+        <td>fullPivLu()</td>
+        <td>None</td>
+        <td>-</td>
+        <td>- -</td>
+        <td>+++</td>
+    </tr>
+    <tr>
+        <td>HouseholderQR</td>
+        <td>householderQr()</td>
+        <td>None</td>
+        <td>++</td>
+        <td>++</td>
+        <td>+</td>
+    </tr>
+    <tr class="alt">
+        <td>ColPivHouseholderQR</td>
+        <td>colPivHouseholderQr()</td>
+        <td>None</td>
+        <td>+</td>
+        <td>-</td>
+        <td>+++</td>
+    </tr>
+    <tr>
+        <td>FullPivHouseholderQR</td>
+        <td>fullPivHouseholderQr()</td>
+        <td>None</td>
+        <td>-</td>
+        <td>- -</td>
+        <td>+++</td>
+    </tr>
+    <tr class="alt">
+        <td>CompleteOrthogonalDecomposition</td>
+        <td>completeOrthogonalDecomposition()</td>
+        <td>None</td>
+        <td>+</td>
+        <td>-</td>
+        <td>+++</td>
+    </tr>
+    <tr class="alt">
+        <td>LLT</td>
+        <td>llt()</td>
+        <td>Positive definite</td>
+        <td>+++</td>
+        <td>+++</td>
+        <td>+</td>
+    </tr>
+    <tr>
+        <td>LDLT</td>
+        <td>ldlt()</td>
+        <td>Positive or negative<br/> semidefinite</td>
+        <td>+++</td>
+        <td>+</td>
+        <td>++</td>
+    </tr>
+    <tr class="alt">
+        <td>BDCSVD</td>
+        <td>bdcSvd()</td>
+        <td>None</td>
+        <td>-</td>
+        <td>-</td>
+        <td>+++</td>
+    </tr>
+    <tr class="alt">
+        <td>JacobiSVD</td>
+        <td>jacobiSvd()</td>
+        <td>None</td>
+        <td>-</td>
+        <td>- - -</td>
+        <td>+++</td>
+    </tr>
+</table>
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
+
+All of these decompositions offer a solve() method that works as in the above example. 
+
+If you know more about the properties of your matrix, you can use the above table to select the best method.
+For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU.
+If you know that your matrix is also symmetric and positive definite, the above table says that
+a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
+matrix (not a vector) as right hand side is possible:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgExSolveLDLT.cpp </td>
+  <td>\verbinclude TutorialLinAlgExSolveLDLT.out </td>
+</tr>
+</table>
+
+For a \ref TopicLinearAlgebraDecompositions "much more complete table" comparing all decompositions supported by Eigen (notice that Eigen
+supports many other decompositions), see our special page on
+\ref TopicLinearAlgebraDecompositions "this topic".
+
+
+\section TutorialLinAlgLeastsquares Least squares solving
+
+The most general and accurate method to solve under- or over-determined linear systems
+in the least squares sense, is the SVD decomposition. Eigen provides two implementations.
+The recommended one is the BDCSVD class, which scales well for large problems
+and automatically falls back to the JacobiSVD class for smaller problems.
+For both classes, their solve() method solved the linear system in the least-squares
+sense. 
+
+Here is an example:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgSVDSolve.cpp </td>
+  <td>\verbinclude TutorialLinAlgSVDSolve.out </td>
+</tr>
+</table>
+
+An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition. 
+
+Again, if you know more about the problem, the table above contains methods that are potentially faster.
+If your matrix is full rank, HouseHolderQR is the method of choice. If your matrix is full rank and well conditioned,
+using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still.
+Our page on \link LeastSquares least squares solving \endlink has more details.
+
+
+\section TutorialLinAlgSolutionExists Checking if a matrix is singular
+
+Only you know what error margin you want to allow for a solution to be considered valid.
+So Eigen lets you do this computation for yourself, if you want to, as in this example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgExComputeSolveError.cpp </td>
+  <td>\verbinclude TutorialLinAlgExComputeSolveError.out </td>
+</tr>
+</table>
+
+\section TutorialLinAlgEigensolving Computing eigenvalues and eigenvectors
+
+You need an eigendecomposition here, see available such decompositions on \ref TopicLinearAlgebraDecompositions "this page".
+Make sure to check if your matrix is self-adjoint, as is often the case in these problems. Here's an example using
+SelfAdjointEigenSolver, it could easily be adapted to general matrices using EigenSolver or ComplexEigenSolver.
+
+The computation of eigenvalues and eigenvectors does not necessarily converge, but such failure to converge is
+very rare. The call to info() is to check for this possibility.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgSelfAdjointEigenSolver.cpp </td>
+  <td>\verbinclude TutorialLinAlgSelfAdjointEigenSolver.out </td>
+</tr>
+</table>
+
+\section TutorialLinAlgInverse Computing inverse and determinant
+
+First of all, make sure that you really want this. While inverse and determinant are fundamental mathematical concepts,
+in \em numerical linear algebra they are not as useful as in pure mathematics. Inverse computations are often
+advantageously replaced by solve() operations, and the determinant is often \em not a good way of checking if a matrix
+is invertible.
+
+However, for \em very \em small matrices, the above may not be true, and inverse and determinant can be very useful.
+
+While certain decompositions, such as PartialPivLU and FullPivLU, offer inverse() and determinant() methods, you can also
+call inverse() and determinant() directly on a matrix. If your matrix is of a very small fixed size (at most 4x4) this
+allows Eigen to avoid performing a LU decomposition, and instead use formulas that are more efficient on such small matrices.
+
+Here is an example:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgInverseDeterminant.cpp </td>
+  <td>\verbinclude TutorialLinAlgInverseDeterminant.out </td>
+</tr>
+</table>
+
+\section TutorialLinAlgSeparateComputation Separating the computation from the construction
+
+In the above examples, the decomposition was computed at the same time that the decomposition object was constructed.
+There are however situations where you might want to separate these two things, for example if you don't know,
+at the time of the construction, the matrix that you will want to decompose; or if you want to reuse an existing
+decomposition object.
+
+What makes this possible is that:
+\li all decompositions have a default constructor,
+\li all decompositions have a compute(matrix) method that does the computation, and that may be called again
+    on an already-computed decomposition, reinitializing it.
+
+For example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgComputeTwice.cpp </td>
+  <td>\verbinclude TutorialLinAlgComputeTwice.out </td>
+</tr>
+</table>
+
+Finally, you can tell the decomposition constructor to preallocate storage for decomposing matrices of a given size,
+so that when you subsequently decompose such matrices, no dynamic memory allocation is performed (of course, if you
+are using fixed-size matrices, no dynamic memory allocation happens at all). This is done by just
+passing the size to the decomposition constructor, as in this example:
+\code
+HouseholderQR<MatrixXf> qr(50,50);
+MatrixXf A = MatrixXf::Random(50,50);
+qr.compute(A); // no dynamic memory allocation
+\endcode
+
+\section TutorialLinAlgRankRevealing Rank-revealing decompositions
+
+Certain decompositions are rank-revealing, i.e. are able to compute the rank of a matrix. These are typically
+also the decompositions that behave best in the face of a non-full-rank matrix (which in the square case means a
+singular matrix). On \ref TopicLinearAlgebraDecompositions "this table" you can see for all our decompositions
+whether they are rank-revealing or not.
+
+Rank-revealing decompositions offer at least a rank() method. They can also offer convenience methods such as isInvertible(),
+and some are also providing methods to compute the kernel (null-space) and image (column-space) of the matrix, as is the
+case with FullPivLU:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgRankRevealing.cpp </td>
+  <td>\verbinclude TutorialLinAlgRankRevealing.out </td>
+</tr>
+</table>
+
+Of course, any rank computation depends on the choice of an arbitrary threshold, since practically no
+floating-point matrix is \em exactly rank-deficient. Eigen picks a sensible default threshold, which depends
+on the decomposition but is typically the diagonal size times machine epsilon. While this is the best default we
+could pick, only you know what is the right threshold for your application. You can set this by calling setThreshold()
+on your decomposition object before calling rank() or any other method that needs to use such a threshold.
+The decomposition itself, i.e. the compute() method, is independent of the threshold. You don't need to recompute the
+decomposition after you've changed the threshold.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgSetThreshold.cpp </td>
+  <td>\verbinclude TutorialLinAlgSetThreshold.out </td>
+</tr>
+</table>
+
+*/
+
+}

diff --git a/doc/TutorialMapClass.dox b/doc/TutorialMapClass.dox
new file mode 100644
index 0000000..caa2539
--- /dev/null
+++ b/doc/TutorialMapClass.dox

@@ -0,0 +1,86 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialMapClass Interfacing with raw buffers: the Map class
+
+This page explains how to work with "raw" C/C++ arrays.
+This can be useful in a variety of contexts, particularly when "importing" vectors and matrices from other libraries into %Eigen.
+
+\eigenAutoToc
+
+\section TutorialMapIntroduction Introduction
+
+Occasionally you may have a pre-defined array of numbers that you want to use within %Eigen as a vector or matrix. While one option is to make a copy of the data, most commonly you probably want to re-use this memory as an %Eigen type. Fortunately, this is very easy with the Map class.
+
+\section TutorialMapTypes Map types and declaring Map variables
+
+A Map object has a type defined by its %Eigen equivalent:
+\code
+Map<Matrix<typename Scalar, int RowsAtCompileTime, int ColsAtCompileTime> >
+\endcode
+Note that, in this default case, a Map requires just a single template parameter.  
+
+To construct a Map variable, you need two other pieces of information: a pointer to the region of memory defining the array of coefficients, and the desired shape of the matrix or vector.  For example, to define a matrix of \c float with sizes determined at compile time, you might do the following:
+\code
+Map<MatrixXf> mf(pf,rows,columns);
+\endcode
+where \c pf is a \c float \c * pointing to the array of memory.  A fixed-size read-only vector of integers might be declared as
+\code
+Map<const Vector4i> mi(pi);
+\endcode
+where \c pi is an \c int \c *. In this case the size does not have to be passed to the constructor, because it is already specified by the Matrix/Array type.
+
+Note that Map does not have a default constructor; you \em must pass a pointer to initialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
+
+Map is flexible enough to accommodate a variety of different data representations.  There are two other (optional) template parameters:
+\code
+Map<typename MatrixType,
+    int MapOptions,
+    typename StrideType>
+\endcode
+\li \c MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.  The default is \c #Unaligned.
+\li \c StrideType allows you to specify a custom layout for the memory array, using the Stride class.  One example would be to specify that the data array is organized in row-major format:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include Tutorial_Map_rowmajor.cpp </td>
+<td>\verbinclude Tutorial_Map_rowmajor.out </td>
+</table>
+However, Stride is even more flexible than this; for details, see the documentation for the Map and Stride classes.
+
+\section TutorialMapUsing Using Map variables
+
+You can use a Map object just like any other %Eigen type:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include Tutorial_Map_using.cpp </td>
+<td>\verbinclude Tutorial_Map_using.out </td>
+</table>
+
+All %Eigen functions are written to accept Map objects just like other %Eigen types. However, when writing your own functions taking %Eigen types, this does \em not happen automatically: a Map type is not identical to its Dense equivalent.  See \ref TopicFunctionTakingEigenTypes for details.
+
+\section TutorialMapPlacementNew Changing the mapped array
+
+It is possible to change the array of a Map object after declaration, using the C++ "placement new" syntax:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include Map_placement_new.cpp </td>
+<td>\verbinclude Map_placement_new.out </td>
+</table>
+Despite appearances, this does not invoke the memory allocator, because the syntax specifies the location for storing the result.
+
+This syntax makes it possible to declare a Map object without first knowing the mapped array's location in memory:
+\code
+Map<Matrix3f> A(NULL);  // don't try to use this matrix yet!
+VectorXf b(n_matrices);
+for (int i = 0; i < n_matrices; i++)
+{
+  new (&A) Map<Matrix3f>(get_matrix_pointer(i));
+  b(i) = A.trace();
+}
+\endcode
+
+*/
+
+}

diff --git a/doc/TutorialMatrixArithmetic.dox b/doc/TutorialMatrixArithmetic.dox
new file mode 100644
index 0000000..5fc569a
--- /dev/null
+++ b/doc/TutorialMatrixArithmetic.dox

@@ -0,0 +1,214 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialMatrixArithmetic Matrix and vector arithmetic
+
+This page aims to provide an overview and some details on how to perform arithmetic
+between matrices, vectors and scalars with Eigen.
+
+\eigenAutoToc
+
+\section TutorialArithmeticIntroduction Introduction
+
+Eigen offers matrix/vector arithmetic operations either through overloads of common C++ arithmetic operators such as +, -, *,
+or through special methods such as dot(), cross(), etc.
+For the Matrix class (matrices and vectors), operators are only overloaded to support
+linear-algebraic operations. For example, \c matrix1 \c * \c matrix2 means matrix-matrix product,
+and \c vector \c + \c scalar is just not allowed. If you want to perform all kinds of array operations,
+not linear algebra, see the \ref TutorialArrayClass "next page".
+
+\section TutorialArithmeticAddSub Addition and subtraction
+
+The left hand side and right hand side must, of course, have the same numbers of rows and of columns. They must
+also have the same \c Scalar type, as Eigen doesn't do automatic type promotion. The operators at hand here are:
+\li binary operator + as in \c a+b
+\li binary operator - as in \c a-b
+\li unary operator - as in \c -a
+\li compound operator += as in \c a+=b
+\li compound operator -= as in \c a-=b
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_add_sub.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_add_sub.out
+</td></tr></table>
+
+\section TutorialArithmeticScalarMulDiv Scalar multiplication and division
+
+Multiplication and division by a scalar is very simple too. The operators at hand here are:
+\li binary operator * as in \c matrix*scalar
+\li binary operator * as in \c scalar*matrix
+\li binary operator / as in \c matrix/scalar
+\li compound operator *= as in \c matrix*=scalar
+\li compound operator /= as in \c matrix/=scalar
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_scalar_mul_div.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_scalar_mul_div.out
+</td></tr></table>
+
+
+\section TutorialArithmeticMentionXprTemplates A note about expression templates
+
+This is an advanced topic that we explain on \ref TopicEigenExpressionTemplates "this page",
+but it is useful to just mention it now. In Eigen, arithmetic operators such as \c operator+ don't
+perform any computation by themselves, they just return an "expression object" describing the computation to be
+performed. The actual computation happens later, when the whole expression is evaluated, typically in \c operator=.
+While this might sound heavy, any modern optimizing compiler is able to optimize away that abstraction and
+the result is perfectly optimized code. For example, when you do:
+\code
+VectorXf a(50), b(50), c(50), d(50);
+...
+a = 3*b + 4*c + 5*d;
+\endcode
+Eigen compiles it to just one for loop, so that the arrays are traversed only once. Simplifying (e.g. ignoring
+SIMD optimizations), this loop looks like this:
+\code
+for(int i = 0; i < 50; ++i)
+  a[i] = 3*b[i] + 4*c[i] + 5*d[i];
+\endcode
+Thus, you should not be afraid of using relatively large arithmetic expressions with Eigen: it only gives Eigen
+more opportunities for optimization.
+
+\section TutorialArithmeticTranspose Transposition and conjugation
+
+The transpose \f$ a^T \f$, conjugate \f$ \bar{a} \f$, and adjoint (i.e., conjugate transpose) \f$ a^* \f$ of a matrix or vector \f$ a \f$ are obtained by the member functions \link DenseBase::transpose() transpose()\endlink, \link MatrixBase::conjugate() conjugate()\endlink, and \link MatrixBase::adjoint() adjoint()\endlink, respectively.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_transpose_conjugate.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_transpose_conjugate.out
+</td></tr></table>
+
+For real matrices, \c conjugate() is a no-operation, and so \c adjoint() is equivalent to \c transpose().
+
+As for basic arithmetic operators, \c transpose() and \c adjoint() simply return a proxy object without doing the actual transposition. If you do <tt>b = a.transpose()</tt>, then the transpose is evaluated at the same time as the result is written into \c b. However, there is a complication here. If you do <tt>a = a.transpose()</tt>, then Eigen starts writing the result into \c a before the evaluation of the transpose is finished. Therefore, the instruction <tt>a = a.transpose()</tt> does not replace \c a with its transpose, as one would expect:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_transpose_aliasing.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_transpose_aliasing.out
+</td></tr></table>
+This is the so-called \ref TopicAliasing "aliasing issue". In "debug mode", i.e., when \ref TopicAssertions "assertions" have not been disabled, such common pitfalls are automatically detected. 
+
+For \em in-place transposition, as for instance in <tt>a = a.transpose()</tt>, simply use the \link DenseBase::transposeInPlace() transposeInPlace()\endlink  function:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_transpose_inplace.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_transpose_inplace.out
+</td></tr></table>
+There is also the \link MatrixBase::adjointInPlace() adjointInPlace()\endlink function for complex matrices.
+
+\section TutorialArithmeticMatrixMul Matrix-matrix and matrix-vector multiplication
+
+Matrix-matrix multiplication is again done with \c operator*. Since vectors are a special
+case of matrices, they are implicitly handled there too, so matrix-vector product is really just a special
+case of matrix-matrix product, and so is vector-vector outer product. Thus, all these cases are handled by just
+two operators:
+\li binary operator * as in \c a*b
+\li compound operator *= as in \c a*=b (this multiplies on the right: \c a*=b is equivalent to <tt>a = a*b</tt>)
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_matrix_mul.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_matrix_mul.out
+</td></tr></table>
+
+Note: if you read the above paragraph on expression templates and are worried that doing \c m=m*m might cause
+aliasing issues, be reassured for now: Eigen treats matrix multiplication as a special case and takes care of
+introducing a temporary here, so it will compile \c m=m*m as:
+\code
+tmp = m*m;
+m = tmp;
+\endcode
+If you know your matrix product can be safely evaluated into the destination matrix without aliasing issue, then you can use the \link MatrixBase::noalias() noalias()\endlink function to avoid the temporary, e.g.:
+\code
+c.noalias() += a * b;
+\endcode
+For more details on this topic, see the page on \ref TopicAliasing "aliasing".
+
+\b Note: for BLAS users worried about performance, expressions such as <tt>c.noalias() -= 2 * a.adjoint() * b;</tt> are fully optimized and trigger a single gemm-like function call.
+
+\section TutorialArithmeticDotAndCross Dot product and cross product
+
+For dot product and cross product, you need the \link MatrixBase::dot() dot()\endlink and \link MatrixBase::cross() cross()\endlink methods. Of course, the dot product can also be obtained as a 1x1 matrix as u.adjoint()*v.
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_dot_cross.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_dot_cross.out
+</td></tr></table>
+
+Remember that cross product is only for vectors of size 3. Dot product is for vectors of any sizes.
+When using complex numbers, Eigen's dot product is conjugate-linear in the first variable and linear in the
+second variable.
+
+\section TutorialArithmeticRedux Basic arithmetic reduction operations
+Eigen also provides some reduction operations to reduce a given matrix or vector to a single value such as the sum (computed by \link DenseBase::sum() sum()\endlink), product (\link DenseBase::prod() prod()\endlink), or the maximum (\link DenseBase::maxCoeff() maxCoeff()\endlink) and minimum (\link DenseBase::minCoeff() minCoeff()\endlink) of all its coefficients.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_redux_basic.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_redux_basic.out
+</td></tr></table>
+
+The \em trace of a matrix, as returned by the function \link MatrixBase::trace() trace()\endlink, is the sum of the diagonal coefficients and can also be computed as efficiently using <tt>a.diagonal().sum()</tt>, as we will see later on.
+
+There also exist variants of the \c minCoeff and \c maxCoeff functions returning the coordinates of the respective coefficient via the arguments:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_redux_minmax.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_redux_minmax.out
+</td></tr></table>
+
+
+\section TutorialArithmeticValidity Validity of operations
+Eigen checks the validity of the operations that you perform. When possible,
+it checks them at compile time, producing compilation errors. These error messages can be long and ugly,
+but Eigen writes the important message in UPPERCASE_LETTERS_SO_IT_STANDS_OUT. For example:
+\code
+  Matrix3f m;
+  Vector4f v;
+  v = m*v;      // Compile-time error: YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES
+\endcode
+
+Of course, in many cases, for example when checking dynamic sizes, the check cannot be performed at compile time.
+Eigen then uses runtime assertions. This means that the program will abort with an error message when executing an illegal operation if it is run in "debug mode", and it will probably crash if assertions are turned off.
+
+\code
+  MatrixXf m(3,3);
+  VectorXf v(4);
+  v = m * v; // Run-time assertion failure here: "invalid matrix product"
+\endcode
+
+For more details on this topic, see \ref TopicAssertions "this page".
+
+*/
+
+}

diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox
new file mode 100644
index 0000000..2c45222
--- /dev/null
+++ b/doc/TutorialMatrixClass.dox

@@ -0,0 +1,293 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialMatrixClass The Matrix class
+
+\eigenAutoToc
+
+In Eigen, all matrices and vectors are objects of the Matrix template class.
+Vectors are just a special case of matrices, with either 1 row or 1 column.
+
+\section TutorialMatrixFirst3Params The first three template parameters of Matrix
+
+The Matrix class takes six template parameters, but for now it's enough to
+learn about the first three first parameters. The three remaining parameters have default
+values, which for now we will leave untouched, and which we
+\ref TutorialMatrixOptTemplParams "discuss below".
+
+The three mandatory template parameters of Matrix are:
+\code
+Matrix<typename Scalar, int RowsAtCompileTime, int ColsAtCompileTime>
+\endcode
+\li \c Scalar is the scalar type, i.e. the type of the coefficients.
+    That is, if you want a matrix of floats, choose \c float here.
+    See \ref TopicScalarTypes "Scalar types" for a list of all supported
+    scalar types and for how to extend support to new types.
+\li \c RowsAtCompileTime and \c ColsAtCompileTime are the number of rows
+    and columns of the matrix as known at compile time (see 
+    \ref TutorialMatrixDynamic "below" for what to do if the number is not
+    known at compile time).
+
+We offer a lot of convenience typedefs to cover the usual cases. For example, \c Matrix4f is
+a 4x4 matrix of floats. Here is how it is defined by Eigen:
+\code
+typedef Matrix<float, 4, 4> Matrix4f;
+\endcode
+We discuss \ref TutorialMatrixTypedefs "below" these convenience typedefs.
+
+\section TutorialMatrixVectors Vectors
+
+As mentioned above, in Eigen, vectors are just a special case of
+matrices, with either 1 row or 1 column. The case where they have 1 column is the most common;
+such vectors are called column-vectors, often abbreviated as just vectors. In the other case
+where they have 1 row, they are called row-vectors.
+
+For example, the convenience typedef \c Vector3f is a (column) vector of 3 floats. It is defined as follows by Eigen:
+\code
+typedef Matrix<float, 3, 1> Vector3f;
+\endcode
+We also offer convenience typedefs for row-vectors, for example:
+\code
+typedef Matrix<int, 1, 2> RowVector2i;
+\endcode
+
+\section TutorialMatrixDynamic The special value Dynamic
+
+Of course, Eigen is not limited to matrices whose dimensions are known at compile time.
+The \c RowsAtCompileTime and \c ColsAtCompileTime template parameters can take the special
+value \c Dynamic which indicates that the size is unknown at compile time, so must
+be handled as a run-time variable. In Eigen terminology, such a size is referred to as a
+\em dynamic \em size; while a size that is known at compile time is called a
+\em fixed \em size. For example, the convenience typedef \c MatrixXd, meaning
+a matrix of doubles with dynamic size, is defined as follows:
+\code
+typedef Matrix<double, Dynamic, Dynamic> MatrixXd;
+\endcode
+And similarly, we define a self-explanatory typedef \c VectorXi as follows:
+\code
+typedef Matrix<int, Dynamic, 1> VectorXi;
+\endcode
+You can perfectly have e.g. a fixed number of rows with a dynamic number of columns, as in:
+\code
+Matrix<float, 3, Dynamic>
+\endcode
+
+\section TutorialMatrixConstructors Constructors
+
+A default constructor is always available, never performs any dynamic memory allocation, and never initializes the matrix coefficients. You can do:
+\code
+Matrix3f a;
+MatrixXf b;
+\endcode
+Here,
+\li \c a is a 3-by-3 matrix, with a plain float[9] array of uninitialized coefficients,
+\li \c b is a dynamic-size matrix whose size is currently 0-by-0, and whose array of
+coefficients hasn't yet been allocated at all.
+
+Constructors taking sizes are also available. For matrices, the number of rows is always passed first.
+For vectors, just pass the vector size. They allocate the array of coefficients
+with the given size, but don't initialize the coefficients themselves:
+\code
+MatrixXf a(10,15);
+VectorXf b(30);
+\endcode
+Here,
+\li \c a is a 10x15 dynamic-size matrix, with allocated but currently uninitialized coefficients.
+\li \c b is a dynamic-size vector of size 30, with allocated but currently uninitialized coefficients.
+
+In order to offer a uniform API across fixed-size and dynamic-size matrices, it is legal to use these
+constructors on fixed-size matrices, even if passing the sizes is useless in this case. So this is legal:
+\code
+Matrix3f a(3,3);
+\endcode
+and is a no-operation.
+
+Matrices and vectors can also be initialized from lists of coefficients.
+Prior to C++11, this feature is limited to small fixed-size column or vectors up to size 4:
+\code
+Vector2d a(5.0, 6.0);
+Vector3d b(5.0, 6.0, 7.0);
+Vector4d c(5.0, 6.0, 7.0, 8.0);
+\endcode
+
+If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients:
+\code
+Vector2i a(1, 2);                      // A column vector containing the elements {1, 2}
+Matrix<int, 5, 1> b {1, 2, 3, 4, 5};   // A row-vector containing the elements {1, 2, 3, 4, 5}
+Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5}
+\endcode
+
+In the general case of matrices and vectors with either fixed or runtime sizes,
+coefficients have to be grouped by rows and passed as an initializer list of initializer list (\link Matrix::Matrix(const std::initializer_list<std::initializer_list<Scalar>>&) details \endlink):
+\code
+MatrixXi a {      // construct a 2x2 matrix
+      {1, 2},     // first row
+      {3, 4}      // second row
+};
+Matrix<double, 2, 3> b {
+      {2, 3, 4},
+      {5, 6, 7},
+};
+\endcode
+
+For column or row vectors, implicit transposition is allowed.
+This means that a column vector can be initialized from a single row:
+\code
+VectorXd a {{1.5, 2.5, 3.5}};             // A column-vector with 3 coefficients
+RowVectorXd b {{1.0, 2.0, 3.0, 4.0}};     // A row-vector with 4 coefficients
+\endcode
+
+\section TutorialMatrixCoeffAccessors Coefficient accessors
+
+The primary coefficient accessors and mutators in Eigen are the overloaded parenthesis operators.
+For matrices, the row index is always passed first. For vectors, just pass one index.
+The numbering starts at 0. This example is self-explanatory:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_matrix_coefficient_accessors.cpp
+</td>
+<td>
+\verbinclude tut_matrix_coefficient_accessors.out
+</td></tr></table>
+
+Note that the syntax <tt> m(index) </tt>
+is not restricted to vectors, it is also available for general matrices, meaning index-based access
+in the array of coefficients. This however depends on the matrix's storage order. All Eigen matrices default to
+column-major storage order, but this can be changed to row-major, see \ref TopicStorageOrders "Storage orders".
+
+The operator[] is also overloaded for index-based access in vectors, but keep in mind that C++ doesn't allow operator[] to
+take more than one argument. We restrict operator[] to vectors, because an awkwardness in the C++ language
+would make matrix[i,j] compile to the same thing as matrix[j] !
+
+\section TutorialMatrixCommaInitializer Comma-initialization
+
+%Matrix and vector coefficients can be conveniently set using the so-called \em comma-initializer syntax.
+For now, it is enough to know this example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include Tutorial_commainit_01.cpp </td>
+<td>\verbinclude Tutorial_commainit_01.out </td>
+</tr></table>
+
+
+The right-hand side can also contain matrix expressions as discussed in \ref TutorialAdvancedInitialization "this page".
+
+\section TutorialMatrixSizesResizing Resizing
+
+The current size of a matrix can be retrieved by \link EigenBase::rows() rows()\endlink, \link EigenBase::cols() cols() \endlink and \link EigenBase::size() size()\endlink. These methods return the number of rows, the number of columns and the number of coefficients, respectively. Resizing a dynamic-size matrix is done by the \link PlainObjectBase::resize(Index,Index) resize() \endlink method.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include tut_matrix_resize.cpp </td>
+<td>\verbinclude tut_matrix_resize.out </td>
+</tr></table>
+
+The resize() method is a no-operation if the actual matrix size doesn't change; otherwise it is destructive: the values of the coefficients may change.
+If you want a conservative variant of resize() which does not change the coefficients, use \link PlainObjectBase::conservativeResize() conservativeResize()\endlink, see \ref TopicResizing "this page" for more details.
+
+All these methods are still available on fixed-size matrices, for the sake of API uniformity. Of course, you can't actually
+resize a fixed-size matrix. Trying to change a fixed size to an actually different value will trigger an assertion failure;
+but the following code is legal:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include tut_matrix_resize_fixed_size.cpp </td>
+<td>\verbinclude tut_matrix_resize_fixed_size.out </td>
+</tr></table>
+
+
+\section TutorialMatrixAssignment Assignment and resizing
+
+Assignment is the action of copying a matrix into another, using \c operator=. Eigen resizes the matrix on the left-hand side automatically so that it matches the size of the matrix on the right-hand size. For example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+<td>\include tut_matrix_assignment_resizing.cpp </td>
+<td>\verbinclude tut_matrix_assignment_resizing.out </td>
+</tr></table>
+
+Of course, if the left-hand side is of fixed size, resizing it is not allowed.
+
+If you do not want this automatic resizing to happen (for example for debugging purposes), you can disable it, see
+\ref TopicResizing "this page".
+
+
+\section TutorialMatrixFixedVsDynamic Fixed vs. Dynamic size
+
+When should one use fixed sizes (e.g. \c Matrix4f), and when should one prefer dynamic sizes (e.g. \c MatrixXf)?
+The simple answer is: use fixed
+sizes for very small sizes where you can, and use dynamic sizes for larger sizes or where you have to. For small sizes,
+especially for sizes smaller than (roughly) 16, using fixed sizes is hugely beneficial
+to performance, as it allows Eigen to avoid dynamic memory allocation and to unroll
+loops. Internally, a fixed-size Eigen matrix is just a plain array, i.e. doing
+\code Matrix4f mymatrix; \endcode
+really amounts to just doing
+\code float mymatrix[16]; \endcode
+so this really has zero runtime cost. By contrast, the array of a dynamic-size matrix
+is always allocated on the heap, so doing
+\code MatrixXf mymatrix(rows,columns); \endcode
+amounts to doing
+\code float *mymatrix = new float[rows*columns]; \endcode
+and in addition to that, the MatrixXf object stores its number of rows and columns as
+member variables.
+
+The limitation of using fixed sizes, of course, is that this is only possible
+when you know the sizes at compile time. Also, for large enough sizes, say for sizes
+greater than (roughly) 32, the performance benefit of using fixed sizes becomes negligible.
+Worse, trying to create a very large matrix using fixed sizes inside a function could result in a
+stack overflow, since Eigen will try to allocate the array automatically as a local variable, and
+this is normally done on the stack.
+Finally, depending on circumstances, Eigen can also be more aggressive trying to vectorize
+(use SIMD instructions) when dynamic sizes are used, see \ref TopicVectorization "Vectorization".
+
+\section TutorialMatrixOptTemplParams Optional template parameters
+
+We mentioned at the beginning of this page that the Matrix class takes six template parameters,
+but so far we only discussed the first three. The remaining three parameters are optional. Here is
+the complete list of template parameters:
+\code
+Matrix<typename Scalar,
+       int RowsAtCompileTime,
+       int ColsAtCompileTime,
+       int Options = 0,
+       int MaxRowsAtCompileTime = RowsAtCompileTime,
+       int MaxColsAtCompileTime = ColsAtCompileTime>
+\endcode
+\li \c Options is a bit field. Here, we discuss only one bit: \c RowMajor. It specifies that the matrices
+      of this type use row-major storage order; by default, the storage order is column-major. See the page on
+      \ref TopicStorageOrders "storage orders". For example, this type means row-major 3x3 matrices:
+      \code
+      Matrix<float, 3, 3, RowMajor>
+      \endcode
+\li \c MaxRowsAtCompileTime and \c MaxColsAtCompileTime are useful when you want to specify that, even though
+      the exact sizes of your matrices are not known at compile time, a fixed upper bound is known at
+      compile time. The biggest reason why you might want to do that is to avoid dynamic memory allocation.
+      For example the following matrix type uses a plain array of 12 floats, without dynamic memory allocation:
+      \code
+      Matrix<float, Dynamic, Dynamic, 0, 3, 4>
+      \endcode
+
+\section TutorialMatrixTypedefs Convenience typedefs
+
+Eigen defines the following Matrix typedefs:
+\li MatrixNt for Matrix<type, N, N>. For example, MatrixXi for Matrix<int, Dynamic, Dynamic>.
+\li VectorNt for Matrix<type, N, 1>. For example, Vector2f for Matrix<float, 2, 1>.
+\li RowVectorNt for Matrix<type, 1, N>. For example, RowVector3d for Matrix<double, 1, 3>.
+
+Where:
+\li N can be any one of \c 2, \c 3, \c 4, or \c X (meaning \c Dynamic).
+\li t can be any one of \c i (meaning int), \c f (meaning float), \c d (meaning double),
+      \c cf (meaning complex<float>), or \c cd (meaning complex<double>). The fact that typedefs are only
+    defined for these five types doesn't mean that they are the only supported scalar types. For example,
+    all standard integer types are supported, see \ref TopicScalarTypes "Scalar types".
+
+
+*/
+
+}

diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox
new file mode 100644
index 0000000..f5322b4
--- /dev/null
+++ b/doc/TutorialReductionsVisitorsBroadcasting.dox

@@ -0,0 +1,266 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialReductionsVisitorsBroadcasting Reductions, visitors and broadcasting
+
+This page explains Eigen's reductions, visitors and broadcasting and how they are used with
+\link MatrixBase matrices \endlink and \link ArrayBase arrays \endlink.
+
+\eigenAutoToc
+
+\section TutorialReductionsVisitorsBroadcastingReductions Reductions
+In Eigen, a reduction is a function taking a matrix or array, and returning a single
+scalar value. One of the most used reductions is \link DenseBase::sum() .sum() \endlink,
+returning the sum of all the coefficients inside a given matrix or array.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include tut_arithmetic_redux_basic.cpp
+</td>
+<td>
+\verbinclude tut_arithmetic_redux_basic.out
+</td></tr></table>
+
+The \em trace of a matrix, as returned by the function \c trace(), is the sum of the diagonal coefficients and can equivalently be computed <tt>a.diagonal().sum()</tt>.
+
+
+\subsection TutorialReductionsVisitorsBroadcastingReductionsNorm Norm computations
+
+The (Euclidean a.k.a. \f$\ell^2\f$) squared norm of a vector can be obtained \link MatrixBase::squaredNorm() squaredNorm() \endlink. It is equal to the dot product of the vector by itself, and equivalently to the sum of squared absolute values of its coefficients.
+
+Eigen also provides the \link MatrixBase::norm() norm() \endlink method, which returns the square root of \link MatrixBase::squaredNorm() squaredNorm() \endlink.
+
+These operations can also operate on matrices; in that case, a n-by-p matrix is seen as a vector of size (n*p), so for example the \link MatrixBase::norm() norm() \endlink method returns the "Frobenius" or "Hilbert-Schmidt" norm. We refrain from speaking of the \f$\ell^2\f$ norm of a matrix because that can mean different things.
+
+If you want other coefficient-wise \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm lpNorm<p>() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients.
+
+The following example demonstrates these methods.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.out
+</td></tr></table>
+
+\b Operator \b norm: The 1-norm and \f$\infty\f$-norm <a href="https://en.wikipedia.org/wiki/Operator_norm">matrix operator norms</a> can easily be computed as follows:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.out
+</td></tr></table>
+See below for more explanations on the syntax of these expressions.
+
+\subsection TutorialReductionsVisitorsBroadcastingReductionsBool Boolean reductions
+
+The following reductions operate on boolean values:
+  - \link DenseBase::all() all() \endlink returns \b true if all of the coefficients in a given Matrix or Array evaluate to \b true .
+  - \link DenseBase::any() any() \endlink returns \b true if at least one of the coefficients in a given Matrix or Array evaluates to \b true .
+  - \link DenseBase::count() count() \endlink returns the number of coefficients in a given Matrix or Array that evaluate to  \b true.
+
+These are typically used in conjunction with the coefficient-wise comparison and equality operators provided by Array. For instance, <tt>array > 0</tt> is an %Array of the same size as \c array , with \b true at those positions where the corresponding coefficient of \c array is positive. Thus, <tt>(array > 0).all()</tt> tests whether all coefficients of \c array are positive. This can be seen in the following example:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.out
+</td></tr></table>
+
+\subsection TutorialReductionsVisitorsBroadcastingReductionsUserdefined User defined reductions
+
+TODO
+
+In the meantime you can have a look at the DenseBase::redux() function.
+
+\section TutorialReductionsVisitorsBroadcastingVisitors Visitors
+Visitors are useful when one wants to obtain the location of a coefficient inside 
+a Matrix or Array. The simplest examples are 
+\link MatrixBase::maxCoeff() maxCoeff(&x,&y) \endlink and 
+\link MatrixBase::minCoeff() minCoeff(&x,&y)\endlink, which can be used to find
+the location of the greatest or smallest coefficient in a Matrix or 
+Array.
+
+The arguments passed to a visitor are pointers to the variables where the
+row and column position are to be stored. These variables should be of type
+\link Eigen::Index Index \endlink, as shown below:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out
+</td></tr></table>
+
+Both functions also return the value of the minimum or maximum coefficient.
+
+\section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions
+Partial reductions are reductions that can operate column- or row-wise on a Matrix or 
+Array, applying the reduction operation on each column or row and 
+returning a column or row vector with the corresponding values. Partial reductions are applied 
+with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink.
+
+A simple example is obtaining the maximum of the elements 
+in each column in a given matrix, storing the result in a row vector:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_colwise.out
+</td></tr></table>
+
+The same operation can be performed row-wise:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
+</td></tr></table>
+
+<b>Note that column-wise operations return a row vector, while row-wise operations return a column vector.</b>
+
+\subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations
+It is also possible to use the result of a partial reduction to do further processing.
+Here is another example that finds the column whose sum of elements is the maximum
+ within a matrix. With column-wise partial reductions this can be coded as:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_maxnorm.out
+</td></tr></table>
+
+The previous example applies the \link DenseBase::sum() sum() \endlink reduction on each column
+though the \link DenseBase::colwise() colwise() \endlink visitor, obtaining a new matrix whose
+size is 1x4.
+
+Therefore, if
+\f[
+\mbox{m} = \begin{bmatrix} 1 & 2 & 6 & 9 \\
+                    3 & 1 & 7 & 2 \end{bmatrix}
+\f]
+
+then
+
+\f[
+\mbox{m.colwise().sum()} = \begin{bmatrix} 4 & 3 & 13 & 11 \end{bmatrix}
+\f]
+
+The \link DenseBase::maxCoeff() maxCoeff() \endlink reduction is finally applied 
+to obtain the column index where the maximum sum is found, 
+which is the column index 2 (third column) in this case.
+
+
+\section TutorialReductionsVisitorsBroadcastingBroadcasting Broadcasting
+The concept behind broadcasting is similar to partial reductions, with the difference that broadcasting 
+constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in 
+one direction.
+
+A simple example is to add a certain column vector to each column in a matrix. 
+This can be accomplished with:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.out
+</td></tr></table>
+
+We can interpret the instruction <tt>mat.colwise() += v</tt> in two equivalent ways. It adds the vector \c v
+to every column of the matrix. Alternatively, it can be interpreted as repeating the vector \c v four times to
+form a four-by-two matrix which is then added to \c mat:
+\f[
+\begin{bmatrix} 1 & 2 & 6 & 9 \\ 3 & 1 & 7 & 2 \end{bmatrix}
++ \begin{bmatrix} 0 & 0 & 0 & 0 \\ 1 & 1 & 1 & 1 \end{bmatrix}
+= \begin{bmatrix} 1 & 2 & 6 & 9 \\ 4 & 2 & 8 & 3 \end{bmatrix}.
+\f]
+The operators <tt>-=</tt>, <tt>+</tt> and <tt>-</tt> can also be used column-wise and row-wise. On arrays, we 
+can also use the operators <tt>*=</tt>, <tt>/=</tt>, <tt>*</tt> and <tt>/</tt> to perform coefficient-wise 
+multiplication and division column-wise or row-wise. These operators are not available on matrices because it
+is not clear what they would do. If you want multiply column 0 of a matrix \c mat with \c v(0), column 1 with 
+\c v(1), and so on, then use <tt>mat = mat * v.asDiagonal()</tt>.
+
+It is important to point out that the vector to be added column-wise or row-wise must be of type Vector,
+and cannot be a Matrix. If this is not met then you will get compile-time error. This also means that
+broadcasting operations can only be applied with an object of type Vector, when operating with Matrix.
+The same applies for the Array class, where the equivalent for VectorXf is ArrayXf. As always, you should
+not mix arrays and matrices in the same expression.
+
+To perform the same operation row-wise we can do:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.out
+</td></tr></table>
+
+\subsection TutorialReductionsVisitorsBroadcastingBroadcastingCombined Combining broadcasting with other operations
+Broadcasting can also be combined with other operations, such as Matrix or Array operations, 
+reductions and partial reductions.
+
+Now that broadcasting, reductions and partial reductions have been introduced, we can dive into a more advanced example that finds
+the nearest neighbour of a vector <tt>v</tt> within the columns of matrix <tt>m</tt>. The Euclidean distance will be used in this example,
+computing the squared Euclidean distance with the partial reduction named \link MatrixBase::squaredNorm() squaredNorm() \endlink:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.out
+</td></tr></table>
+
+The line that does the job is 
+\code
+  (m.colwise() - v).colwise().squaredNorm().minCoeff(&index);
+\endcode
+
+We will go step by step to understand what is happening:
+
+  - <tt>m.colwise() - v</tt> is a broadcasting operation, subtracting <tt>v</tt> from each column in <tt>m</tt>. The result of this operation
+is a new matrix whose size is the same as matrix <tt>m</tt>: \f[
+  \mbox{m.colwise() - v} = 
+  \begin{bmatrix}
+    -1 & 21 & 4 & 7 \\
+     0 & 8  & 4 & -1
+  \end{bmatrix}
+\f]
+
+  - <tt>(m.colwise() - v).colwise().squaredNorm()</tt> is a partial reduction, computing the squared norm column-wise. The result of
+this operation is a row vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
+  \mbox{(m.colwise() - v).colwise().squaredNorm()} =
+  \begin{bmatrix}
+     1 & 505 & 32 & 50
+  \end{bmatrix}
+\f]
+
+  - Finally, <tt>minCoeff(&index)</tt> is used to obtain the index of the column in <tt>m</tt> that is closest to <tt>v</tt> in terms of Euclidean
+distance.
+
+*/
+
+}

diff --git a/doc/TutorialReshape.dox b/doc/TutorialReshape.dox
new file mode 100644
index 0000000..5b4022a
--- /dev/null
+++ b/doc/TutorialReshape.dox

@@ -0,0 +1,82 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialReshape Reshape
+
+Since the version 3.4, %Eigen exposes convenient methods to reshape a matrix to another matrix of different sizes or vector.
+All cases are handled via the DenseBase::reshaped(NRowsType,NColsType) and DenseBase::reshaped() functions.
+Those functions do not perform in-place reshaping, but instead return a <i> view </i> on the input expression.
+
+\eigenAutoToc
+
+\section TutorialReshapeMat2Mat Reshaped 2D views
+
+The more general reshaping transformation is handled via: `reshaped(nrows,ncols)`.
+Here is an example reshaping a 4x4 matrix to a 2x8 one:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include MatrixBase_reshaped_int_int.cpp
+</td>
+<td>
+\verbinclude MatrixBase_reshaped_int_int.out
+</td></tr></table>
+
+By default, the input coefficients are always interpreted in column-major order regardless of the storage order of the input expression.
+For more control on ordering, compile-time sizes, and automatic size deduction, please see de documentation of DenseBase::reshaped(NRowsType,NColsType) that contains all the details with many examples.
+
+
+\section TutorialReshapeMat2Vec 1D linear views
+
+A very common usage of reshaping is to create a 1D linear view over a given 2D matrix or expression.
+In this case, sizes can be deduced and thus omitted as in the following example:
+
+<table class="example">
+<tr><th>Example:</th></tr>
+<tr><td>
+\include MatrixBase_reshaped_to_vector.cpp
+</td></tr>
+<tr><th>Output:</th></tr>
+<tr><td>
+\verbinclude MatrixBase_reshaped_to_vector.out
+</td></tr></table>
+
+This shortcut always returns a column vector and by default input coefficients are always interpreted in column-major order.
+Again, see the documentation of DenseBase::reshaped() for more control on the ordering.
+
+\section TutorialReshapeInPlace
+
+The above examples create reshaped views, but what about reshaping inplace a given matrix?
+Of course this task in only conceivable for matrix and arrays having runtime dimensions.
+In many cases, this can be accomplished via PlainObjectBase::resize(Index,Index):
+
+<table class="example">
+<tr><th>Example:</th></tr>
+<tr><td>
+\include Tutorial_reshaped_vs_resize_1.cpp
+</td></tr>
+<tr><th>Output:</th></tr>
+<tr><td>
+\verbinclude Tutorial_reshaped_vs_resize_1.out
+</td></tr></table>
+
+However beware that unlike \c reshaped, the result of \c resize depends on the input storage order.
+It thus behaves similarly to `reshaped<AutoOrder>`:
+
+<table class="example">
+<tr><th>Example:</th></tr>
+<tr><td>
+\include Tutorial_reshaped_vs_resize_2.cpp
+</td></tr>
+<tr><th>Output:</th></tr>
+<tr><td>
+\verbinclude Tutorial_reshaped_vs_resize_2.out
+</td></tr></table>
+
+Finally, assigning a reshaped matrix to itself is currently not supported and will result to undefined-behavior because of \link TopicAliasing aliasing \endlink.
+The following is forbidden: \code A = A.reshaped(2,8); \endcode
+This is OK: \code A = A.reshaped(2,8).eval(); \endcode
+
+*/
+
+}

diff --git a/doc/TutorialSTL.dox b/doc/TutorialSTL.dox
new file mode 100644
index 0000000..9a825bc
--- /dev/null
+++ b/doc/TutorialSTL.dox

@@ -0,0 +1,66 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialSTL STL iterators and algorithms
+
+Since the version 3.4, %Eigen's dense matrices and arrays provide STL compatible iterators.
+As demonstrated below, this makes them naturally compatible with range-for-loops and STL's algorithms.
+
+\eigenAutoToc
+
+\section TutorialSTLVectors Iterating over 1D arrays and vectors 
+
+Any dense 1D expressions exposes the pair of `begin()/end()` methods to iterate over them.
+
+This directly enables c++11 range for loops:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_range_for_loop_1d_cxx11.cpp
+</td>
+<td>
+\verbinclude Tutorial_range_for_loop_1d_cxx11.out
+</td></tr></table>
+
+One dimensional expressions can also easily be passed to STL algorithms:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_std_sort.cpp
+</td>
+<td>
+\verbinclude Tutorial_std_sort.out
+</td></tr></table>
+
+Similar to `std::vector`, 1D expressions also exposes the pair of `cbegin()/cend()` methods to conveniently get const iterators on non-const object.
+
+\section TutorialSTLMatrices Iterating over coefficients of 2D arrays and matrices
+
+STL iterators are intrinsically designed to iterate over 1D structures.
+This is why `begin()/end()` methods are disabled for 2D expressions.
+Iterating over all coefficients of a 2D expressions is still easily accomplished by creating a 1D linear view through `reshaped()`:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_range_for_loop_2d_cxx11.cpp
+</td>
+<td>
+\verbinclude Tutorial_range_for_loop_2d_cxx11.out
+</td></tr></table>
+
+\section TutorialSTLRowsColumns Iterating over rows or columns of 2D arrays and matrices
+
+It is also possible to get iterators over rows or columns of 2D expressions.
+Those are available through the `rowwise()` and `colwise()` proxies.
+Here is an example sorting each row of a matrix:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_std_sort_rows_cxx11.cpp
+</td>
+<td>
+\verbinclude Tutorial_std_sort_rows_cxx11.out
+</td></tr></table>
+
+*/
+
+}

diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox
new file mode 100644
index 0000000..98ace43
--- /dev/null
+++ b/doc/TutorialSlicingIndexing.dox

@@ -0,0 +1,244 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialSlicingIndexing Slicing and Indexing
+
+This page presents the numerous possibilities offered by `operator()` to index sub-set of rows and columns.
+This API has been introduced in %Eigen 3.4.
+It supports all the feature proposed by the \link TutorialBlockOperations block API \endlink, and much more.
+In particular, it supports \b slicing that consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix or indexed from an array of indices.
+
+\eigenAutoToc
+
+\section TutorialSlicingOverview Overview
+
+All the aforementioned operations are handled through the generic DenseBase::operator()(const RowIndices&, const ColIndices&) method.
+Each argument can be:
+  - An integer indexing a single row or column, including symbolic indices.
+  - The symbol Eigen::all representing the whole set of respective rows or columns in increasing order.
+  - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::lastN functions.
+  - Any 1D vector/array of integers including %Eigen's vector/array, expressions, std::vector, std::array, as well as plain C arrays: `int[N]`.
+
+More generally, it can accepts any object exposing the following two member functions:
+  \code
+  <integral type> operator[](<integral type>) const;
+  <integral type> size() const;
+  \endcode
+where `<integral type>` stands for any integer type compatible with Eigen::Index (i.e. `std::ptrdiff_t`).
+
+\section TutorialSlicingBasic Basic slicing
+
+Taking a set of rows, columns, or elements, uniformly spaced within a matrix or vector is achieved through the Eigen::seq or Eigen::seqN functions where "seq" stands for arithmetic sequence. Their signatures are summarized below:
+
+<table class="manual">
+<tr>
+  <th>function</th>
+  <th>description</th>
+  <th>example</th>
+</tr>
+<tr>
+  <td>\code seq(firstIdx,lastIdx) \endcode</td>
+  <td>represents the sequence of integers ranging from \c firstIdx to \c lastIdx</td>
+  <td>\code seq(2,5) <=> {2,3,4,5} \endcode</td>
+</tr>
+<tr>
+  <td>\code seq(firstIdx,lastIdx,incr) \endcode</td>
+  <td>same but using the increment \c incr to advance from one index to the next</td>
+  <td>\code seq(2,8,2) <=> {2,4,6,8} \endcode</td>
+</tr>
+<tr>
+  <td>\code seqN(firstIdx,size) \endcode</td>
+  <td>represents the sequence of \c size integers starting from \c firstIdx</td>
+  <td>\code seqN(2,5) <=> {2,3,4,5,6} \endcode</td>
+</tr>
+<tr>
+  <td>\code seqN(firstIdx,size,incr) \endcode</td>
+  <td>same but using the increment \c incr to advance from one index to the next</td>
+  <td>\code seqN(2,3,3) <=> {2,5,8} \endcode</td>
+</tr>
+</table>
+
+The \c firstIdx and \c lastIdx parameters can also be defined with the help of the Eigen::last symbol representing the index of the last row, column or element of the underlying matrix/vector once the arithmetic sequence is passed to it through operator().
+Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v.
+<table class="manual">
+<tr>
+  <th>Intent</th>
+  <th>Code</th>
+  <th>Block-API equivalence</th>
+</tr>
+<tr>
+  <td>Bottom-left corner starting at row \c i with \c n columns</td>
+  <td>\code A(seq(i,last), seqN(0,n)) \endcode</td>
+  <td>\code A.bottomLeftCorner(A.rows()-i,n) \endcode</td>
+</tr>
+<tr>
+  <td>%Block starting at \c i,j having \c m rows, and \c n columns</td>
+  <td>\code A(seqN(i,m), seqN(i,n) \endcode</td>
+  <td>\code A.block(i,j,m,n) \endcode</td>
+</tr>
+<tr>
+  <td>%Block starting at \c i0,j0 and ending at \c i1,j1</td>
+  <td>\code A(seq(i0,i1), seq(j0,j1) \endcode</td>
+  <td>\code A.block(i0,j0,i1-i0+1,j1-j0+1) \endcode</td>
+</tr>
+<tr>
+  <td>Even columns of A</td>
+  <td>\code A(all, seq(0,last,2)) \endcode</td>
+  <td></td>
+</tr>
+<tr>
+  <td>First \c n odd rows A</td>
+  <td>\code A(seqN(1,n,2), all) \endcode</td>
+  <td></td>
+</tr>
+<tr>
+  <td>The last past one column</td>
+  <td>\code A(all, last-1) \endcode</td>
+  <td>\code A.col(A.cols()-2) \endcode</td>
+</tr>
+<tr>
+  <td>The middle row</td>
+  <td>\code A(last/2,all) \endcode</td>
+  <td>\code A.row((A.rows()-1)/2) \endcode</td>
+</tr>
+<tr>
+  <td>Last elements of v starting at i</td>
+  <td>\code v(seq(i,last)) \endcode</td>
+  <td>\code v.tail(v.size()-i) \endcode</td>
+</tr>
+<tr>
+  <td>Last \c n elements of v</td>
+  <td>\code v(seq(last+1-n,last)) \endcode</td>
+  <td>\code v.tail(n) \endcode</td>
+</tr>
+</table>
+
+As seen in the last exemple, referencing the <i> last n </i> elements (or rows/columns) is a bit cumbersome to write.
+This becomes even more tricky and error prone with a non-default increment.
+Here comes \link Eigen::lastN(SizeType) Eigen::lastN(size) \endlink, and \link Eigen::lastN(SizeType,IncrType) Eigen::lastN(size,incr) \endlink:
+
+<table class="manual">
+<tr>
+  <th>Intent</th>
+  <th>Code</th>
+  <th>Block-API equivalence</th>
+</tr>
+<tr>
+  <td>Last \c n elements of v</td>
+  <td>\code v(lastN(n)) \endcode</td>
+  <td>\code v.tail(n) \endcode</td>
+</tr>
+<tr>
+  <td>Bottom-right corner of A of size \c m times \c n</td>
+  <td>\code v(lastN(m), lastN(n)) \endcode</td>
+  <td>\code A.bottomRightCorner(m,n) \endcode</td>
+</tr>
+<tr>
+  <td>Bottom-right corner of A of size \c m times \c n</td>
+  <td>\code v(lastN(m), lastN(n)) \endcode</td>
+  <td>\code A.bottomRightCorner(m,n) \endcode</td>
+</tr>
+<tr>
+  <td>Last \c n columns taking 1 column over 3</td>
+  <td>\code A(all, lastN(n,3)) \endcode</td>
+  <td></td>
+</tr>
+</table>
+
+\section TutorialSlicingFixed Compile time size and increment
+
+In terms of performance, %Eigen and the compiler can take advantage of compile-time size and increment.
+To this end, you can enforce compile-time parameters using Eigen::fix<val>.
+Such compile-time value can be combined with the Eigen::last symbol:
+\code v(seq(last-fix<7>, last-fix<2>))
+\endcode
+In this example %Eigen knowns at compile-time that the returned expression has 6 elements.
+It is equivalent to:
+\code v(seqN(last-7, fix<6>))
+\endcode
+
+We can revisit the <i>even columns of A</i> example as follows:
+\code A(all, seq(0,last,fix<2>))
+\endcode
+
+
+\section TutorialSlicingReverse Reverse order
+
+Row/column indices can also be enumerated in decreasing order using a negative increment.
+For instance, one over two columns of A from the column 20 to 10:
+\code A(all, seq(20, 10, fix<-2>))
+\endcode
+The last \c n rows starting from the last one:
+\code A(seqN(last, n, fix<-1>), all)
+\endcode
+You can also use the ArithmeticSequence::reverse() method to reverse its order.
+The previous example can thus also be written as:
+\code A(lastN(n).reverse(), all)
+\endcode
+
+
+\section TutorialSlicingArray Array of indices
+
+The generic `operator()` can also takes as input an arbitrary list of row or column indices stored as either an `ArrayXi`, a `std::vector<int>`, `std::array<int,N>`, etc.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_stdvector_cxx11.cpp
+</td>
+<td>
+\verbinclude Slicing_stdvector_cxx11.out
+</td></tr></table>
+
+You can also directly pass a static array:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_rawarray_cxx11.cpp
+</td>
+<td>
+\verbinclude Slicing_rawarray_cxx11.out
+</td></tr></table>
+
+or expressions:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_arrayexpr.cpp
+</td>
+<td>
+\verbinclude Slicing_arrayexpr.out
+</td></tr></table>
+
+When passing an object with a compile-time size such as `Array4i`, `std::array<int,N>`, or a static array, then the returned expression also exhibit compile-time dimensions.
+
+\section TutorialSlicingCustomArray Custom index list
+
+More generally, `operator()` can accept as inputs any object \c ind of type \c T compatible with:
+\code
+Index s = ind.size(); or Index s = size(ind);
+Index i;
+i = ind[i];
+\endcode
+
+This means you can easily build your own fancy sequence generator and pass it to `operator()`.
+Here is an exemple enlarging a given matrix while padding the additional first rows and columns through repetition:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_custom_padding_cxx11.cpp
+</td>
+<td>
+\verbinclude Slicing_custom_padding_cxx11.out
+</td></tr></table>
+
+<br>
+
+*/
+
+/*
+TODO add:
+so_repeat_inner.cpp
+so_repeleme.cpp
+*/
+}

diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox
new file mode 100644
index 0000000..c69171e
--- /dev/null
+++ b/doc/TutorialSparse.dox

@@ -0,0 +1,365 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialSparse Sparse matrix manipulations
+
+\eigenAutoToc
+
+Manipulating and solving sparse problems involves various modules which are summarized below:
+
+<table class="manual">
+<tr><th>Module</th><th>Header file</th><th>Contents</th></tr>
+<tr><td>\link SparseCore_Module SparseCore \endlink</td><td>\code#include <Eigen/SparseCore>\endcode</td><td>SparseMatrix and SparseVector classes, matrix assembly, basic sparse linear algebra (including sparse triangular solvers)</td></tr>
+<tr><td>\link SparseCholesky_Module SparseCholesky \endlink</td><td>\code#include <Eigen/SparseCholesky>\endcode</td><td>Direct sparse LLT and LDLT Cholesky factorization to solve sparse self-adjoint positive definite problems</td></tr>
+<tr><td>\link SparseLU_Module SparseLU \endlink</td><td>\code #include<Eigen/SparseLU> \endcode</td>
+<td>%Sparse LU factorization to solve general square sparse systems</td></tr>
+<tr><td>\link SparseQR_Module SparseQR \endlink</td><td>\code #include<Eigen/SparseQR>\endcode </td><td>%Sparse QR factorization for solving sparse linear least-squares problems</td></tr>
+<tr><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>\code#include <Eigen/IterativeLinearSolvers>\endcode</td><td>Iterative solvers to solve large general linear square problems (including self-adjoint positive definite problems)</td></tr>
+<tr><td>\link Sparse_Module Sparse \endlink</td><td>\code#include <Eigen/Sparse>\endcode</td><td>Includes all the above modules</td></tr>
+</table>
+
+\section TutorialSparseIntro Sparse matrix format
+
+In many applications (e.g., finite element methods) it is common to deal with very large matrices where only a few coefficients are different from zero.  In such cases, memory consumption can be reduced and performance increased by using a specialized representation storing only the nonzero coefficients. Such a matrix is called a sparse matrix.
+
+\b The \b %SparseMatrix \b class
+
+The class SparseMatrix is the main sparse matrix representation of Eigen's sparse module; it offers high performance and low memory usage.
+It implements a more versatile variant of the widely-used Compressed Column (or Row) Storage scheme.
+It consists of four compact arrays:
+ - \c Values: stores the coefficient values of the non-zeros.
+ - \c InnerIndices: stores the row (resp. column) indices of the non-zeros.
+ - \c OuterStarts: stores for each column (resp. row) the index of the first non-zero in the previous two arrays.
+ - \c InnerNNZs: stores the number of non-zeros of each column (resp. row).
+The word \c inner refers to an \em inner \em vector that is a column for a column-major matrix, or a row for a row-major matrix.
+The word \c outer refers to the other direction.
+
+This storage scheme is better explained on an example. The following matrix
+<table class="manual">
+<tr><td> 0</td><td>3</td><td> 0</td><td>0</td><td> 0</td></tr>
+<tr><td>22</td><td>0</td><td> 0</td><td>0</td><td>17</td></tr>
+<tr><td> 7</td><td>5</td><td> 0</td><td>1</td><td> 0</td></tr>
+<tr><td> 0</td><td>0</td><td> 0</td><td>0</td><td> 0</td></tr>
+<tr><td> 0</td><td>0</td><td>14</td><td>0</td><td> 8</td></tr>
+</table>
+
+and one of its possible sparse, \b column \b major representation:
+<table class="manual">
+<tr><td>Values:</td>        <td>22</td><td>7</td><td>_</td><td>3</td><td>5</td><td>14</td><td>_</td><td>_</td><td>1</td><td>_</td><td>17</td><td>8</td></tr>
+<tr><td>InnerIndices:</td>  <td> 1</td><td>2</td><td>_</td><td>0</td><td>2</td><td> 4</td><td>_</td><td>_</td><td>2</td><td>_</td><td> 1</td><td>4</td></tr>
+</table>
+<table class="manual">
+<tr><td>OuterStarts:</td><td>0</td><td>3</td><td>5</td><td>8</td><td>10</td><td>\em 12 </td></tr>
+<tr><td>InnerNNZs:</td>    <td>2</td><td>2</td><td>1</td><td>1</td><td> 2</td><td></td></tr>
+</table>
+
+Currently the elements of a given inner vector are guaranteed to be always sorted by increasing inner indices.
+The \c "_" indicates available free space to quickly insert new elements.
+Assuming no reallocation is needed, the insertion of a random element is therefore in O(nnz_j) where nnz_j is the number of nonzeros of the respective inner vector.
+On the other hand, inserting elements with increasing inner indices in a given inner vector is much more efficient since this only requires to increase the respective \c InnerNNZs entry that is a O(1) operation.
+
+The case where no empty space is available is a special case, and is referred as the \em compressed mode.
+It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS).
+Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function.
+In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j].
+Therefore, in practice a call to SparseMatrix::makeCompressed() frees this buffer.
+
+It is worth noting that most of our wrappers to external libraries requires compressed matrices as inputs.
+
+The results of %Eigen's operations always produces \b compressed sparse matrices.
+On the other hand, the insertion of a new element into a SparseMatrix converts this later to the \b uncompressed mode.
+
+Here is the previous matrix represented in compressed mode:
+<table class="manual">
+<tr><td>Values:</td>        <td>22</td><td>7</td><td>3</td><td>5</td><td>14</td><td>1</td><td>17</td><td>8</td></tr>
+<tr><td>InnerIndices:</td>  <td> 1</td><td>2</td><td>0</td><td>2</td><td> 4</td><td>2</td><td> 1</td><td>4</td></tr>
+</table>
+<table class="manual">
+<tr><td>OuterStarts:</td><td>0</td><td>2</td><td>4</td><td>5</td><td>6</td><td>\em 8 </td></tr>
+</table>
+
+A SparseVector is a special case of a SparseMatrix where only the \c Values and \c InnerIndices arrays are stored.
+There is no notion of compressed/uncompressed mode for a SparseVector.
+
+
+\section TutorialSparseExample First example
+
+Before describing each individual class, let's start with the following typical example: solving the Laplace equation \f$ \Delta u = 0 \f$ on a regular 2D grid using a finite difference scheme and Dirichlet boundary conditions.
+Such problem can be mathematically expressed as a linear problem of the form \f$ Ax=b \f$ where \f$ x \f$ is the vector of \c m unknowns (in our case, the values of the pixels), \f$ b \f$ is the right hand side vector resulting from the boundary conditions, and \f$ A \f$ is an \f$ m \times m \f$ matrix containing only a few non-zero elements resulting from the discretization of the Laplacian operator.
+
+<table class="manual">
+<tr><td>
+\include Tutorial_sparse_example.cpp
+</td>
+<td>
+\image html Tutorial_sparse_example.jpeg
+</td></tr></table>
+
+In this example, we start by defining a column-major sparse matrix type of double \c SparseMatrix<double>, and a triplet list of the same scalar type \c  Triplet<double>. A triplet is a simple object representing a non-zero entry as the triplet: \c row index, \c column index, \c value.
+
+In the main function, we declare a list \c coefficients of triplets (as a std vector) and the right hand side vector \f$ b \f$ which are filled by the \a buildProblem function.
+The raw and flat list of non-zero entries is then converted to a true SparseMatrix object \c A.
+Note that the elements of the list do not have to be sorted, and possible duplicate entries will be summed up.
+
+The last step consists of effectively solving the assembled problem.
+Since the resulting matrix \c A is symmetric by construction, we can perform a direct Cholesky factorization via the SimplicialLDLT class which behaves like its LDLT counterpart for dense objects.
+
+The resulting vector \c x contains the pixel values as a 1D array which is saved to a jpeg file shown on the right of the code above.
+
+Describing the \a buildProblem and \a save functions is out of the scope of this tutorial. They are given \ref TutorialSparse_example_details "here" for the curious and reproducibility purpose.
+
+
+
+
+\section TutorialSparseSparseMatrix The SparseMatrix class
+
+\b %Matrix \b and \b vector \b properties \n
+
+The SparseMatrix and SparseVector classes take three template arguments:
+ * the scalar type (e.g., double)
+ * the storage order (ColMajor or RowMajor, the default is ColMajor)
+ * the inner index type (default is \c int).
+
+As for dense Matrix objects, constructors takes the size of the object.
+Here are some examples:
+
+\code
+SparseMatrix<std::complex<float> > mat(1000,2000);         // declares a 1000x2000 column-major compressed sparse matrix of complex<float>
+SparseMatrix<double,RowMajor> mat(1000,2000);              // declares a 1000x2000 row-major compressed sparse matrix of double
+SparseVector<std::complex<float> > vec(1000);              // declares a column sparse vector of complex<float> of size 1000
+SparseVector<double,RowMajor> vec(1000);                   // declares a row sparse vector of double of size 1000
+\endcode
+
+In the rest of the tutorial, \c mat and \c vec represent any sparse-matrix and sparse-vector objects, respectively.
+
+The dimensions of a matrix can be queried using the following functions:
+<table class="manual">
+<tr><td>Standard \n dimensions</td><td>\code
+mat.rows()
+mat.cols()\endcode</td>
+<td>\code
+vec.size() \endcode</td>
+</tr>
+<tr><td>Sizes along the \n inner/outer dimensions</td><td>\code
+mat.innerSize()
+mat.outerSize()\endcode</td>
+<td></td>
+</tr>
+<tr><td>Number of non \n zero coefficients</td><td>\code
+mat.nonZeros() \endcode</td>
+<td>\code
+vec.nonZeros() \endcode</td></tr>
+</table>
+
+
+\b Iterating \b over \b the \b nonzero \b coefficients \n
+
+Random access to the elements of a sparse object can be done through the \c coeffRef(i,j) function.
+However, this function involves a quite expensive binary search.
+In most cases, one only wants to iterate over the non-zeros elements. This is achieved by a standard loop over the outer dimension, and then by iterating over the non-zeros of the current inner vector via an InnerIterator. Thus, the non-zero entries have to be visited in the same order than the storage order.
+Here is an example:
+<table class="manual">
+<tr><td>
+\code
+SparseMatrix<double> mat(rows,cols);
+for (int k=0; k<mat.outerSize(); ++k)
+  for (SparseMatrix<double>::InnerIterator it(mat,k); it; ++it)
+  {
+    it.value();
+    it.row();   // row index
+    it.col();   // col index (here it is equal to k)
+    it.index(); // inner index, here it is equal to it.row()
+  }
+\endcode
+</td><td>
+\code
+SparseVector<double> vec(size);
+for (SparseVector<double>::InnerIterator it(vec); it; ++it)
+{
+  it.value(); // == vec[ it.index() ]
+  it.index();
+}
+\endcode
+</td></tr>
+</table>
+For a writable expression, the referenced value can be modified using the valueRef() function.
+If the type of the sparse matrix or vector depends on a template parameter, then the \c typename keyword is
+required to indicate that \c InnerIterator denotes a type; see \ref TopicTemplateKeyword for details.
+
+
+\section TutorialSparseFilling Filling a sparse matrix
+
+Because of the special storage scheme of a SparseMatrix, special care has to be taken when adding new nonzero entries.
+For instance, the cost of a single purely random insertion into a SparseMatrix is \c O(nnz), where \c nnz is the current number of non-zero coefficients.
+
+The simplest way to create a sparse matrix while guaranteeing good performance is thus to first build a list of so-called \em triplets, and then convert it to a SparseMatrix.
+
+Here is a typical usage example:
+\code
+typedef Eigen::Triplet<double> T;
+std::vector<T> tripletList;
+tripletList.reserve(estimation_of_entries);
+for(...)
+{
+  // ...
+  tripletList.push_back(T(i,j,v_ij));
+}
+SparseMatrixType mat(rows,cols);
+mat.setFromTriplets(tripletList.begin(), tripletList.end());
+// mat is ready to go!
+\endcode
+The \c std::vector of triplets might contain the elements in arbitrary order, and might even contain duplicated elements that will be summed up by setFromTriplets().
+See the SparseMatrix::setFromTriplets() function and class Triplet for more details.
+
+
+In some cases, however, slightly higher performance, and lower memory consumption can be reached by directly inserting the non-zeros into the destination matrix.
+A typical scenario of this approach is illustrated below:
+\code
+1: SparseMatrix<double> mat(rows,cols);         // default is column major
+2: mat.reserve(VectorXi::Constant(cols,6));
+3: for each i,j such that v_ij != 0
+4:   mat.insert(i,j) = v_ij;                    // alternative: mat.coeffRef(i,j) += v_ij;
+5: mat.makeCompressed();                        // optional
+\endcode
+
+- The key ingredient here is the line 2 where we reserve room for 6 non-zeros per column. In many cases, the number of non-zeros per column or row can easily be known in advance. If it varies significantly for each inner vector, then it is possible to specify a reserve size for each inner vector by providing a vector object with an operator[](int j) returning the reserve size of the \c j-th inner vector (e.g., via a VectorXi or std::vector<int>). If only a rought estimate of the number of nonzeros per inner-vector can be obtained, it is highly recommended to overestimate it rather than the opposite. If this line is omitted, then the first insertion of a new element will reserve room for 2 elements per inner vector.
+- The line 4 performs a sorted insertion. In this example, the ideal case is when the \c j-th column is not full and contains non-zeros whose inner-indices are smaller than \c i. In this case, this operation boils down to trivial O(1) operation.
+- When calling insert(i,j) the element \c i \c ,j must not already exists, otherwise use the coeffRef(i,j) method that will allow to, e.g., accumulate values. This method first performs a binary search and finally calls insert(i,j) if the element does not already exist. It is more flexible than insert() but also more costly.
+- The line 5 suppresses the remaining empty space and transforms the matrix into a compressed column storage.
+
+
+
+\section TutorialSparseFeatureSet Supported operators and functions
+
+Because of their special storage format, sparse matrices cannot offer the same level of flexibility than dense matrices.
+In Eigen's sparse module we chose to expose only the subset of the dense matrix API which can be efficiently implemented.
+In the following \em sm denotes a sparse matrix, \em sv a sparse vector, \em dm a dense matrix, and \em dv a dense vector.
+
+\subsection TutorialSparse_BasicOps Basic operations
+
+%Sparse expressions support most of the unary and binary coefficient wise operations:
+\code
+sm1.real()   sm1.imag()   -sm1                    0.5*sm1
+sm1+sm2      sm1-sm2      sm1.cwiseProduct(sm2)
+\endcode
+However, <strong>a strong restriction is that the storage orders must match</strong>. For instance, in the following example:
+\code
+sm4 = sm1 + sm2 + sm3;
+\endcode
+sm1, sm2, and sm3 must all be row-major or all column-major.
+On the other hand, there is no restriction on the target matrix sm4.
+For instance, this means that for computing \f$ A^T + A \f$, the matrix \f$ A^T \f$ must be evaluated into a temporary matrix of compatible storage order:
+\code
+SparseMatrix<double> A, B;
+B = SparseMatrix<double>(A.transpose()) + A;
+\endcode
+
+Binary coefficient wise operators can also mix sparse and dense expressions:
+\code
+sm2 = sm1.cwiseProduct(dm1);
+dm2 = sm1 + dm1;
+dm2 = dm1 - sm1;
+\endcode
+Performance-wise, the adding/subtracting sparse and dense matrices is better performed in two steps. For instance, instead of doing <tt>dm2 = sm1 + dm1</tt>, better write:
+\code
+dm2 = dm1;
+dm2 += sm1;
+\endcode
+This version has the advantage to fully exploit the higher performance of dense storage (no indirection, SIMD, etc.), and to pay the cost of slow sparse evaluation on the few non-zeros of the sparse matrix only.
+
+
+%Sparse expressions also support transposition:
+\code
+sm1 = sm2.transpose();
+sm1 = sm2.adjoint();
+\endcode
+However, there is no transposeInPlace() method.
+
+
+\subsection TutorialSparse_Products Matrix products
+
+%Eigen supports various kind of sparse matrix products which are summarize below:
+  - \b sparse-dense:
+    \code
+dv2 = sm1 * dv1;
+dm2 = dm1 * sm1.adjoint();
+dm2 = 2. * sm1 * dm1;
+    \endcode
+  - \b symmetric \b sparse-dense. The product of a sparse symmetric matrix with a dense matrix (or vector) can also be optimized by specifying the symmetry with selfadjointView():
+    \code
+dm2 = sm1.selfadjointView<>() * dm1;        // if all coefficients of A are stored
+dm2 = A.selfadjointView<Upper>() * dm1;     // if only the upper part of A is stored
+dm2 = A.selfadjointView<Lower>() * dm1;     // if only the lower part of A is stored
+    \endcode
+  - \b sparse-sparse. For sparse-sparse products, two different algorithms are available. The default one is conservative and preserve the explicit zeros that might appear:
+    \code
+sm3 = sm1 * sm2;
+sm3 = 4 * sm1.adjoint() * sm2;
+    \endcode
+    The second algorithm prunes on the fly the explicit zeros, or the values smaller than a given threshold. It is enabled and controlled through the prune() functions:
+    \code
+sm3 = (sm1 * sm2).pruned();                  // removes numerical zeros
+sm3 = (sm1 * sm2).pruned(ref);               // removes elements much smaller than ref
+sm3 = (sm1 * sm2).pruned(ref,epsilon);       // removes elements smaller than ref*epsilon
+    \endcode
+
+  - \b permutations. Finally, permutations can be applied to sparse matrices too:
+    \code
+PermutationMatrix<Dynamic,Dynamic> P = ...;
+sm2 = P * sm1;
+sm2 = sm1 * P.inverse();
+sm2 = sm1.transpose() * P;
+    \endcode
+
+
+\subsection TutorialSparse_SubMatrices Block operations
+
+Regarding read-access, sparse matrices expose the same API than for dense matrices to access to sub-matrices such as blocks, columns, and rows. See \ref TutorialBlockOperations for a detailed introduction.
+However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as <tt>block(...)</tt> and <tt>corner*(...)</tt>. The available API for write-access to a SparseMatrix are summarized below:
+\code
+SparseMatrix<double,ColMajor> sm1;
+sm1.col(j) = ...;
+sm1.leftCols(ncols) = ...;
+sm1.middleCols(j,ncols) = ...;
+sm1.rightCols(ncols) = ...;
+
+SparseMatrix<double,RowMajor> sm2;
+sm2.row(i) = ...;
+sm2.topRows(nrows) = ...;
+sm2.middleRows(i,nrows) = ...;
+sm2.bottomRows(nrows) = ...;
+\endcode
+
+In addition, sparse matrices expose the SparseMatrixBase::innerVector() and SparseMatrixBase::innerVectors() methods, which are aliases to the col/middleCols methods for a column-major storage, and to the row/middleRows methods for a row-major storage.
+
+\subsection TutorialSparse_TriangularSelfadjoint Triangular and selfadjoint views
+
+Just as with dense matrices, the triangularView() function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side:
+\code
+dm2 = sm1.triangularView<Lower>(dm1);
+dv2 = sm1.transpose().triangularView<Upper>(dv1);
+\endcode
+
+The selfadjointView() function permits various operations:
+ - optimized sparse-dense matrix products:
+    \code
+dm2 = sm1.selfadjointView<>() * dm1;        // if all coefficients of A are stored
+dm2 = A.selfadjointView<Upper>() * dm1;     // if only the upper part of A is stored
+dm2 = A.selfadjointView<Lower>() * dm1;     // if only the lower part of A is stored
+    \endcode
+ - copy of triangular parts:
+    \code
+sm2 = sm1.selfadjointView<Upper>();                               // makes a full selfadjoint matrix from the upper triangular part
+sm2.selfadjointView<Lower>() = sm1.selfadjointView<Upper>();      // copies the upper triangular part to the lower triangular part
+    \endcode
+ - application of symmetric permutations:
+ \code
+PermutationMatrix<Dynamic,Dynamic> P = ...;
+sm2 = A.selfadjointView<Upper>().twistedBy(P);                                // compute P S P' from the upper triangular part of A, and make it a full matrix
+sm2.selfadjointView<Lower>() = A.selfadjointView<Lower>().twistedBy(P);       // compute P S P' from the lower triangular part of A, and then only compute the lower part
+ \endcode
+
+Please, refer to the \link SparseQuickRefPage Quick Reference \endlink  guide for the list of supported operations. The list of linear solvers available is \link TopicSparseSystems here. \endlink
+
+*/
+
+}

diff --git a/doc/TutorialSparse_example_details.dox b/doc/TutorialSparse_example_details.dox
new file mode 100644
index 0000000..0438da8
--- /dev/null
+++ b/doc/TutorialSparse_example_details.dox

@@ -0,0 +1,4 @@
+/**
+\page TutorialSparse_example_details
+\include Tutorial_sparse_example_details.cpp
+*/

diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
new file mode 100644
index 0000000..410c8a5
--- /dev/null
+++ b/doc/UnalignedArrayAssert.dox

@@ -0,0 +1,133 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicUnalignedArrayAssert Explanation of the assertion on unaligned arrays
+
+Hello! You are seeing this webpage because your program terminated on an assertion failure like this one:
+<pre>
+my_program: path/to/eigen/Eigen/src/Core/DenseStorage.h:44:
+Eigen::internal::matrix_array<T, Size, MatrixOptions, Align>::internal::matrix_array()
+[with T = double, int Size = 2, int MatrixOptions = 2, bool Align = true]:
+Assertion `(reinterpret_cast<size_t>(array) & (sizemask)) == 0 && "this assertion
+is explained here: http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html
+**** READ THIS WEB PAGE !!! ****"' failed.
+</pre>
+
+There are 4 known causes for this issue.
+If you can target \cpp17 only with a recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then you're lucky: enabling c++17 should be enough (if not, please <a href="http://eigen.tuxfamily.org/bz/">report</a> to us).
+Otherwise, please read on to understand those issues and learn how to fix them.
+
+\eigenAutoToc
+
+\section where Where in my own code is the cause of the problem?
+
+First of all, you need to find out where in your own code this assertion was triggered from. At first glance, the error message doesn't look helpful, as it refers to a file inside Eigen! However, since your program crashed, if you can reproduce the crash, you can get a backtrace using any debugger. For example, if you're using GCC, you can use the GDB debugger as follows:
+\code
+$ gdb ./my_program          # Start GDB on your program
+> run                       # Start running your program
+...                         # Now reproduce the crash!
+> bt                        # Obtain the backtrace
+\endcode
+Now that you know precisely where in your own code the problem is happening, read on to understand what you need to change.
+
+\section c1 Cause 1: Structures having Eigen objects as members
+
+If you have code like this,
+
+\code
+class Foo
+{
+  //...
+  Eigen::Vector4d v;
+  //...
+};
+//...
+Foo *foo = new Foo;
+\endcode
+
+then you need to read this separate page: \ref TopicStructHavingEigenMembers "Structures Having Eigen Members".
+
+Note that here, Eigen::Vector4d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
+
+\section c2 Cause 2: STL Containers or manual memory allocation
+
+If you use STL Containers such as std::vector, std::map, ..., with %Eigen objects, or with classes containing %Eigen objects, like this,
+
+\code
+std::vector<Eigen::Matrix2d> my_vector;
+struct my_class { ... Eigen::Matrix2d m; ... };
+std::map<int, my_class> my_map;
+\endcode
+
+then you need to read this separate page: \ref TopicStlContainers "Using STL Containers with Eigen".
+
+Note that here, Eigen::Matrix2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
+
+The same issue will be exhibited by any classes/functions by-passing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c `std::make_shared` or `std::allocate_shared` for which is the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers".
+
+\section c3 Cause 3: Passing Eigen objects by value
+
+If some function in your code is getting an %Eigen object passed by value, like this,
+
+\code
+void func(Eigen::Vector4d v);
+\endcode
+
+then you need to read this separate page: \ref TopicPassingByValue "Passing Eigen objects by value to functions".
+
+Note that here, Eigen::Vector4d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
+
+\section c4 Cause 4: Compiler making a wrong assumption on stack alignment (for instance GCC on Windows)
+
+This is a must-read for people using GCC on Windows (like MinGW or TDM-GCC). If you have this assertion failure in an innocent function declaring a local variable like this:
+
+\code
+void foo()
+{
+  Eigen::Quaternionf q;
+  //...
+}
+\endcode
+
+then you need to read this separate page: \ref TopicWrongStackAlignment "Compiler making a wrong assumption on stack alignment".
+
+Note that here, Eigen::Quaternionf is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
+
+
+\section explanation General explanation of this assertion
+
+\ref TopicFixedSizeVectorizable "Fixed-size vectorizable Eigen objects" must absolutely be created at properly aligned locations, otherwise SIMD instructions addressing them will crash.
+For instance, SSE/NEON/MSA/Altivec/VSX targets will require 16-byte-alignment, whereas AVX and AVX512 targets may require up to 32 and 64 byte alignment respectively.
+
+%Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their `operator new`.
+
+However there are a few corner cases where these alignment settings get overridden: they are the possible causes for this assertion.
+
+\section getrid I don't care about optimal vectorization, how do I get rid of that stuff?
+
+Three possibilities:
+<ul>
+  <li>Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that gives you trouble. This way %Eigen won't try to over-align them, and thus won"t assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.</li>
+  <li>Define \link TopicPreprocessorDirectivesPerformance EIGEN_MAX_STATIC_ALIGN_BYTES \endlink to 0. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of
+      vectorizing fixed-size objects (like Matrix4d) through unaligned stores (as controlled by \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink), while keeping unchanged the vectorization of dynamic-size objects
+      (like MatrixXd). On 64 bytes systems, you might also define it 16 to disable only 32 and 64 bytes of over-alignment. But do note that this breaks ABI compatibility with the default behavior of static alignment.</li>
+  <li>Or define both \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_VECTORIZE \endlink and `EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT`. This keeps the
+      16-byte (or above) alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
+</ul>
+
+If you want to know why defining `EIGEN_DONT_VECTORIZE` does not by itself disable 16-byte (or above) alignment and the assertion, here's the explanation:
+
+It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
+It doesn't disable 16-byte (or above) alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
+
+\section checkmycode How can I check my code is safe regarding alignment issues?
+
+Unfortunately, there is no possibility in c++ to detect any of the aforementioned shortcoming at compile time (though static analyzers are becoming more and more powerful and could detect some of them).
+Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page.
+Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64 bits systems buffer are aligned on 16 bytes boundary and so, if you do not enable AVX instruction set, then your code will run fine. On the other hand, the same code may assert if moving to a more exotic platform, or enabling AVX instructions that required 32 bytes alignment by default.
+
+The situation is not hopeless though. Assuming your code is well covered by unit test, then you can check its alignment safety by linking it to a custom malloc library returning 8 bytes aligned buffers only. This way all alignment shortcomings should pop-up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink.
+
+
+*/
+
+}

diff --git a/doc/UsingBlasLapackBackends.dox b/doc/UsingBlasLapackBackends.dox
new file mode 100644
index 0000000..caa5971
--- /dev/null
+++ b/doc/UsingBlasLapackBackends.dox

@@ -0,0 +1,133 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2011-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Documentation on the use of BLAS/LAPACK libraries through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingBlasLapack Using BLAS/LAPACK from %Eigen
+
+
+Since %Eigen version 3.3 and later, any F77 compatible BLAS or LAPACK libraries can be used as backends for dense matrix products and dense matrix decompositions.
+For instance, one can use <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel® MKL</a>, Apple's Accelerate framework on OSX, <a href="http://www.openblas.net/">OpenBLAS</a>, <a href="http://www.netlib.org/lapack">Netlib LAPACK</a>, etc.
+
+Do not miss this \link TopicUsingIntelMKL page \endlink for further discussions on the specific use of Intel® MKL (also includes VML, PARDISO, etc.)
+
+In order to use an external BLAS and/or LAPACK library, you must link you own application to the respective libraries and their dependencies.
+For LAPACK, you must also link to the standard <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> library, which is used as a convenient think layer between %Eigen's C++ code and LAPACK F77 interface. Then you must activate their usage by defining one or multiple of the following macros (\b before including any %Eigen's header):
+
+\note For Mac users, in order to use the lapack version shipped with the Accelerate framework, you also need the lapacke library.
+Using <a href="https://www.macports.org/">MacPorts</a>, this is as easy as:
+\code
+sudo port install lapack
+\endcode
+and then use the following link flags: \c -framework \c Accelerate \c /opt/local/lib/lapack/liblapacke.dylib
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface)</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack (compatible with any F77 LAPACK interface)</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower numerical robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+</table>
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to BLAS or LAPACK routines.
+These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+The breadth of %Eigen functionality that can be substituted is listed in the table below.
+<table class="manual">
+<tr><th>Functional domain</th><th>Code example</th><th>BLAS/LAPACK routines</th></tr>
+<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1*m2.transpose();
+m1.selfadjointView<Lower>()*m2;
+m1*m2.triangularView<Upper>();
+m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
+\endcode</td><td>\code
+?gemm
+?symm/?hemm
+?trmm
+dsyrk/ssyrk
+\endcode</td></tr>
+<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1.adjoint()*b;
+m1.selfadjointView<Lower>()*b;
+m1.triangularView<Upper>()*b;
+\endcode</td><td>\code
+?gemv
+?symv/?hemv
+?trmv
+\endcode</td></tr>
+<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m1.lu().solve(v2);
+\endcode</td><td>\code
+?getrf
+\endcode</td></tr>
+<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m2.selfadjointView<Upper>().llt().solve(v2);
+\endcode</td><td>\code
+?potrf
+\endcode</td></tr>
+<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+m1.householderQr();
+m1.colPivHouseholderQr();
+\endcode</td><td>\code
+?geqrf
+?geqp3
+\endcode</td></tr>
+<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
+JacobiSVD<MatrixXd> svd;
+svd.compute(m1, ComputeThinV);
+\endcode</td><td>\code
+?gesvd
+\endcode</td></tr>
+<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+EigenSolver<MatrixXd> es(m1);
+ComplexEigenSolver<MatrixXcd> ces(m1);
+SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
+GeneralizedSelfAdjointEigenSolver<MatrixXd>
+    gsaes(m1+m1.transpose(),m2+m2.transpose());
+\endcode</td><td>\code
+?gees
+?gees
+?syev/?heev
+?syev/?heev,
+?potrf
+\endcode</td></tr>
+<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+RealSchur<MatrixXd> schurR(m1);
+ComplexSchur<MatrixXcd> schurC(m1);
+\endcode</td><td>\code
+?gees
+\endcode</td></tr>
+</table>
+In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+
+*/
+
+}

diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox
new file mode 100644
index 0000000..fc35c3c
--- /dev/null
+++ b/doc/UsingIntelMKL.dox

@@ -0,0 +1,113 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Documentation on the use of Intel MKL through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingIntelMKL Using Intel® MKL from %Eigen
+
+<!-- \section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL) -->
+
+Since %Eigen version 3.1 and later, users can benefit from built-in Intel® Math Kernel Library (MKL) optimizations with an installed copy of Intel MKL 10.3 (or later).
+
+<a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php"> Intel MKL </a> provides highly optimized multi-threaded mathematical routines for x86-compatible architectures.
+Intel MKL is available on Linux, Mac and Windows for both Intel64 and IA32 architectures.
+
+\note
+Intel® MKL is a proprietary software and it is the responsibility of users to buy or register for community (free) Intel MKL licenses for their products. Moreover, the license of the user product has to allow linking to proprietary software that excludes any unmodified versions of the GPL.
+
+Using Intel MKL through %Eigen is easy:
+-# define the \c EIGEN_USE_MKL_ALL macro before including any %Eigen's header
+-# link your program to MKL libraries (see the <a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/">MKL linking advisor</a>)
+-# on a 64bits system, you must use the LP64 interface (not the ILP64 one)
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to Intel MKL routines.
+These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+In addition you can choose which parts will be substituted by defining one or multiple of the following macros:
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_MKL_VML </td><td>Enables the use of Intel VML (vector operations)</td></tr>
+<tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
+</table>
+
+The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE* macros can be combined with \c EIGEN_USE_MKL to explicitly tell Eigen that the underlying BLAS/Lapack implementation is Intel MKL.
+The main effect is to enable MKL direct call feature (\c MKL_DIRECT_CALL).
+This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices.
+MKL direct call can be disabled by defining \c EIGEN_MKL_NO_DIRECT_CALL.
+
+
+Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
+
+Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
+
+The following table summarizes the list of functions covered by \c EIGEN_USE_MKL_VML:
+<table class="manual">
+<tr><th>Code example</th><th>MKL routines</th></tr>
+<tr><td>\code
+v2=v1.array().sin();
+v2=v1.array().asin();
+v2=v1.array().cos();
+v2=v1.array().acos();
+v2=v1.array().tan();
+v2=v1.array().exp();
+v2=v1.array().log();
+v2=v1.array().sqrt();
+v2=v1.array().square();
+v2=v1.array().pow(1.5);
+\endcode</td><td>\code
+v?Sin
+v?Asin
+v?Cos
+v?Acos
+v?Tan
+v?Exp
+v?Ln
+v?Sqrt
+v?Sqr
+v?Powx
+\endcode</td></tr>
+</table>
+In the examples, v1 and v2 are dense vectors.
+
+
+\section TopicUsingIntelMKL_Links Links
+- Intel MKL can be purchased and downloaded <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">here</a>.
+- Intel MKL is also bundled with <a href="http://software.intel.com/en-us/articles/intel-composer-xe/">Intel Composer XE</a>.
+
+
+*/
+
+}

diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox
new file mode 100644
index 0000000..36beb2d
--- /dev/null
+++ b/doc/UsingNVCC.dox

@@ -0,0 +1,30 @@
+
+namespace Eigen {
+
+/** \page TopicCUDA Using Eigen in CUDA kernels
+
+Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code.
+This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header.
+This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only.
+However, in both cases, host's SIMD vectorization has to be disabled in .cu files.
+It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
+
+Known issues:
+
+ - \c nvcc with MS Visual Studio does not work (patch welcome)
+ 
+ - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
+   \code
+    // workaround issue between gcc >= 4.7 and cuda 5.5
+    #if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+      #undef _GLIBCXX_ATOMIC_BUILTINS
+      #undef _GLIBCXX_USE_INT128
+    #endif
+   \endcode
+   
+ - On 64bits system Eigen uses \c long \c int as the default type for indexes and sizes. On CUDA device, it would make sense to default to 32 bits \c int.
+   However, to keep host and CUDA code compatible, this cannot be done automatically by %Eigen, and the user is thus required to define \c EIGEN_DEFAULT_DENSE_INDEX_TYPE to \c int throughout his code (or only for CUDA code if there is no interaction between host and CUDA code through %Eigen's object).
+
+*/
+
+}

diff --git a/doc/WrongStackAlignment.dox b/doc/WrongStackAlignment.dox
new file mode 100644
index 0000000..17d5513
--- /dev/null
+++ b/doc/WrongStackAlignment.dox

@@ -0,0 +1,56 @@
+namespace Eigen {
+
+/** \eigenManualPage TopicWrongStackAlignment Compiler making a wrong assumption on stack alignment
+
+<h4>It appears that this was a GCC bug that has been fixed in GCC 4.5.
+If you hit this issue, please upgrade to GCC 4.5 and report to us, so we can update this page.</h4>
+
+This is an issue that, so far, we met only with GCC on Windows: for instance, MinGW and TDM-GCC.
+
+By default, in a function like this,
+
+\code
+void foo()
+{
+  Eigen::Quaternionf q;
+  //...
+}
+\endcode
+
+GCC assumes that the stack is already 16-byte-aligned so that the object \a q will be created at a 16-byte-aligned location. For this reason, it doesn't take any special care to explicitly align the object \a q, as Eigen requires.
+
+The problem is that, in some particular cases, this assumption can be wrong on Windows, where the stack is only guaranteed to have 4-byte alignment. Indeed, even though GCC takes care of aligning the stack in the main function and does its best to keep it aligned, when a function is called from another thread or from a binary compiled with another compiler, the stack alignment can be corrupted. This results in the object 'q' being created at an unaligned location, making your program crash with the \ref TopicUnalignedArrayAssert "assertion on unaligned arrays". So far we found the three following solutions.
+
+
+\section sec_sol1 Local solution
+
+A local solution is to mark such a function with this attribute:
+\code
+__attribute__((force_align_arg_pointer)) void foo()
+{
+  Eigen::Quaternionf q;
+  //...
+}
+\endcode
+Read <a href="http://gcc.gnu.org/onlinedocs/gcc-4.4.0/gcc/Function-Attributes.html#Function-Attributes">this GCC documentation</a> to understand what this does. Of course this should only be done on GCC on Windows, so for portability you'll have to encapsulate this in a macro which you leave empty on other platforms. The advantage of this solution is that you can finely select which function might have a corrupted stack alignment. Of course on the downside this has to be done for every such function, so you may prefer one of the following two global solutions.
+
+
+\section sec_sol2 Global solutions
+
+A global solution is to edit your project so that when compiling with GCC on Windows, you pass this option to GCC:
+\code
+-mincoming-stack-boundary=2
+\endcode
+Explanation: this tells GCC that the stack is only required to be aligned to 2^2=4 bytes, so that GCC now knows that it really must take extra care to honor the 16 byte alignment of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" when needed.
+
+Another global solution is to pass this option to gcc:
+\code
+-mstackrealign
+\endcode
+which has the same effect than adding the \c force_align_arg_pointer attribute to all functions.
+
+These global solutions are easy to use, but note that they may slowdown your program because they lead to extra prologue/epilogue instructions for every function.
+
+*/
+
+}

diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js
new file mode 100644
index 0000000..afb97ed
--- /dev/null
+++ b/doc/eigen_navtree_hacks.js

@@ -0,0 +1,247 @@
+
+// generate a table of contents in the side-nav based on the h1/h2 tags of the current page.
+function generate_autotoc() {
+  var headers = $("h1, h2");
+  if(headers.length > 1) {
+    var toc = $("#side-nav").append('<div id="nav-toc" class="toc"><h3>Table of contents</h3></div>');
+    toc = $("#nav-toc");
+    var footer  = $("#nav-path");
+    var footerHeight = footer.height();
+    toc = toc.append('<ul></ul>');
+    toc = toc.find('ul');
+    var indices = new Array();
+    indices[0] = 0;
+    indices[1] = 0;
+
+    var h1counts = $("h1").length;
+    headers.each(function(i) {
+      var current = $(this);
+      var levelTag = current[0].tagName.charAt(1);
+      if(h1counts==0)
+        levelTag--;
+      var cur_id = current.attr("id");
+
+      indices[levelTag-1]+=1;  
+      var prefix = indices[0];
+      if (levelTag >1) {
+        prefix+="."+indices[1];
+      }
+        
+      // Uncomment to add number prefixes
+      // current.html(prefix + "   " + current.html());
+      for(var l = levelTag; l < 2; ++l){
+          indices[l] = 0;
+      }
+
+      if(cur_id == undefined) {
+        current.attr('id', 'title' + i);
+        current.addClass('anchor');
+        toc.append("<li class='level" + levelTag + "'><a id='link" + i + "' href='#title" +
+                    i + "' title='" + current.prop("tagName") + "'>" + current.text() + "</a></li>");
+      } else {
+        toc.append("<li class='level" + levelTag + "'><a id='" + cur_id + "' href='#title" +
+                    i + "' title='" + current.prop("tagName") + "'>" + current.text() + "</a></li>");
+      }
+    });
+    resizeHeight();
+  }
+}
+
+
+var global_navtree_object;
+
+// Overloaded to remove links to sections/subsections
+function getNode(o, po)
+{
+  po.childrenVisited = true;
+  var l = po.childrenData.length-1;
+  for (var i in po.childrenData) {
+    var nodeData = po.childrenData[i];
+    if((!nodeData[1]) ||  (nodeData[1].indexOf('#')==-1)) // <- we added this line
+      po.children[i] = newNode(o, po, nodeData[0], nodeData[1], nodeData[2], i==l);
+  }
+}
+
+// Overloaded to adjust the size of the navtree wrt the toc
+function resizeHeight() 
+{
+  var header  = $("#top");
+  var sidenav = $("#side-nav");
+  var content = $("#doc-content");
+  var navtree = $("#nav-tree");
+  var footer  = $("#nav-path");
+  var toc     = $("#nav-toc");
+
+  var headerHeight = header.outerHeight();
+  var footerHeight = footer.outerHeight();
+  var tocHeight    = toc.height();
+  var windowHeight = $(window).height() - headerHeight - footerHeight;
+  content.css({height:windowHeight + "px"});
+  navtree.css({height:(windowHeight-tocHeight) + "px"});
+  sidenav.css({height:windowHeight + "px"});
+}
+
+// Overloaded to save the root node into global_navtree_object
+function initNavTree(toroot,relpath)
+{
+  var o = new Object();
+  global_navtree_object = o; // <- we added this line
+  o.toroot = toroot;
+  o.node = new Object();
+  o.node.li = document.getElementById("nav-tree-contents");
+  o.node.childrenData = NAVTREE;
+  o.node.children = new Array();
+  o.node.childrenUL = document.createElement("ul");
+  o.node.getChildrenUL = function() { return o.node.childrenUL; };
+  o.node.li.appendChild(o.node.childrenUL);
+  o.node.depth = 0;
+  o.node.relpath = relpath;
+  o.node.expanded = false;
+  o.node.isLast = true;
+  o.node.plus_img = document.createElement("img");
+  o.node.plus_img.src = relpath+"ftv2pnode.png";
+  o.node.plus_img.width = 16;
+  o.node.plus_img.height = 22;
+
+  if (localStorageSupported()) {
+    var navSync = $('#nav-sync');
+    if (cachedLink()) {
+      showSyncOff(navSync,relpath);
+      navSync.removeClass('sync');
+    } else {
+      showSyncOn(navSync,relpath);
+    }
+    navSync.click(function(){ toggleSyncButton(relpath); });
+  }
+
+  navTo(o,toroot,window.location.hash,relpath);
+
+  $(window).bind('hashchange', function(){
+     if (window.location.hash && window.location.hash.length>1){
+       var a;
+       if ($(location).attr('hash')){
+         var clslink=stripPath($(location).attr('pathname'))+':'+
+                               $(location).attr('hash').substring(1);
+         a=$('.item a[class$="'+clslink+'"]');
+       }
+       if (a==null || !$(a).parent().parent().hasClass('selected')){
+         $('.item').removeClass('selected');
+         $('.item').removeAttr('id');
+       }
+       var link=stripPath2($(location).attr('pathname'));
+       navTo(o,link,$(location).attr('hash'),relpath);
+     } else if (!animationInProgress) {
+       $('#doc-content').scrollTop(0);
+       $('.item').removeClass('selected');
+       $('.item').removeAttr('id');
+       navTo(o,toroot,window.location.hash,relpath);
+     }
+  })
+
+  $(window).on("load", showRoot);
+}
+
+// return false if the the node has no children at all, or has only section/subsection children
+function checkChildrenData(node) {
+  if (!(typeof(node.childrenData)==='string')) {
+    for (var i in node.childrenData) {
+      var url = node.childrenData[i][1];
+      if(url.indexOf("#")==-1)
+        return true;
+    }
+    return false;
+  }
+  return (node.childrenData);
+}
+
+// Modified to:
+// 1 - remove the root node 
+// 2 - remove the section/subsection children
+function createIndent(o,domNode,node,level)
+{
+  var level=-2; // <- we replaced level=-1 by level=-2
+  var n = node;
+  while (n.parentNode) { level++; n=n.parentNode; }
+  if (checkChildrenData(node)) { // <- we modified this line to use checkChildrenData(node) instead of node.childrenData
+    var imgNode = document.createElement("span");
+    imgNode.className = 'arrow';
+    imgNode.style.paddingLeft=(16*level).toString()+'px';
+    imgNode.innerHTML=arrowRight;
+    node.plus_img = imgNode;
+    node.expandToggle = document.createElement("a");
+    node.expandToggle.href = "javascript:void(0)";
+    node.expandToggle.onclick = function() {
+      if (node.expanded) {
+        $(node.getChildrenUL()).slideUp("fast");
+        node.plus_img.innerHTML=arrowRight;
+        node.expanded = false;
+      } else {
+        expandNode(o, node, false, false);
+      }
+    }
+    node.expandToggle.appendChild(imgNode);
+    domNode.appendChild(node.expandToggle);
+  } else {
+    var span = document.createElement("span");
+    span.className = 'arrow';
+    span.style.width   = 16*(level+1)+'px';
+    span.innerHTML = '&#160;';
+    domNode.appendChild(span);
+  }
+}
+
+// Overloaded to automatically expand the selected node
+function selectAndHighlight(hash,n)
+{
+  var a;
+  if (hash) {
+    var link=stripPath($(location).attr('pathname'))+':'+hash.substring(1);
+    a=$('.item a[class$="'+link+'"]');
+  }
+  if (a && a.length) {
+    a.parent().parent().addClass('selected');
+    a.parent().parent().attr('id','selected');
+    highlightAnchor();
+  } else if (n) {
+    $(n.itemDiv).addClass('selected');
+    $(n.itemDiv).attr('id','selected');
+  }
+  if ($('#nav-tree-contents .item:first').hasClass('selected')) {
+    $('#nav-sync').css('top','30px');
+  } else {
+    $('#nav-sync').css('top','5px');
+  }
+  expandNode(global_navtree_object, n, true, true); // <- we added this line
+  showRoot();
+}
+
+
+$(document).ready(function() {
+  
+  generate_autotoc();
+  
+  (function (){ // wait until the first "selected" element has been created
+    try {
+      
+      // this line will triger an exception if there is no #selected element, i.e., before the tree structure is complete.
+      document.getElementById("selected").className = "item selected";
+      
+      // ok, the default tree has been created, we can keep going...
+      
+      // expand the "Chapters" node
+      if(window.location.href.indexOf('unsupported')==-1)
+        expandNode(global_navtree_object, global_navtree_object.node.children[0].children[2], true, true);
+      else
+        expandNode(global_navtree_object, global_navtree_object.node.children[0].children[1], true, true);
+      
+      // Hide the root node "Eigen"
+      $(document.getElementsByClassName('index.html')[0]).parent().parent().css({display:"none"});
+      
+    } catch (err) {
+      setTimeout(arguments.callee, 10);
+    }
+  })();
+
+  $(window).on("load", resizeHeight);
+});
+

diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css
new file mode 100644
index 0000000..4e9d7d1
--- /dev/null
+++ b/doc/eigendoxy.css

@@ -0,0 +1,235 @@
+
+/******** Eigen specific CSS code ************/
+
+/**** Styles removing elements ****/
+
+/* remove the "modules|classes" link for module pages (they are already in the TOC) */
+div.summary {
+  display:none;
+}
+
+/* remove */
+div.contents hr {
+  display:none;
+}
+
+/**** ****/
+
+p, dl.warning, dl.attention, dl.note
+{
+  max-width:60em;
+  text-align:justify;
+}
+
+li {
+  max-width:55em;
+  text-align:justify;  
+}
+
+img {
+  border: 0;
+}
+
+div.fragment {
+  display:table; /* this allows the element to be larger than its parent */
+  padding: 0pt;
+}
+pre.fragment {
+  border: 1px solid #cccccc;
+
+  margin: 2px 0px 2px 0px;
+  padding: 3px 5px 3px 5px;
+}
+
+
+
+/* Common style for all Eigen's tables */
+
+table.example, table.manual, table.manual-vl, table.manual-hl {
+    max-width:100%;
+    border-collapse: collapse;
+    border-style: solid;
+    border-width: 1px;
+    border-color: #cccccc;
+    font-size: 1em;
+    
+    box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+    -moz-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+    -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+}
+
+table.example th, table.manual th, table.manual-vl th, table.manual-hl th {
+  padding: 0.5em 0.5em 0.5em 0.5em;
+  text-align: left;
+  padding-right: 1em;
+  color: #555555;
+  background-color: #F4F4E5;
+  
+  background-image: -webkit-gradient(linear,center top,center bottom,from(#FFFFFF), color-stop(0.3,#FFFFFF), color-stop(0.30,#FFFFFF), color-stop(0.98,#F4F4E5), to(#ECECDE));
+  background-image: -moz-linear-gradient(center top, #FFFFFF 0%, #FFFFFF 30%, #F4F4E5 98%, #ECECDE);
+  filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFFFFF', endColorstr='#F4F4E5');
+}
+
+table.example td, table.manual td, table.manual-vl td, table.manual-hl td {
+  vertical-align:top;
+  border-width: 1px;
+  border-color: #cccccc;
+}
+
+/* header of headers */
+table th.meta {
+  text-align:center;
+  font-size: 1.2em;
+  background-color:#FFFFFF;
+}
+
+/* intermediate header */
+table th.inter {
+  text-align:left;
+  background-color:#FFFFFF;
+  background-image:none;
+  border-style:solid solid solid solid;
+  border-width: 1px;
+	border-color: #cccccc;
+}
+
+/** class for example / output tables **/
+
+table.example {
+}
+
+table.example th {
+}
+
+table.example td {
+  padding: 0.5em 0.5em 0.5em 0.5em;
+  vertical-align:top;
+}
+
+/* standard class for the manual */
+
+table.manual, table.manual-vl, table.manual-hl {
+    padding: 0.2em 0em 0.5em 0em;
+}
+
+table.manual th, table.manual-vl th, table.manual-hl th {
+  margin: 0em 0em 0.3em 0em;
+}
+
+table.manual td, table.manual-vl td, table.manual-hl td {
+  padding: 0.3em 0.5em 0.3em 0.5em;
+  vertical-align:top;
+  border-width: 1px;
+}
+
+table.manual td.alt, table.manual tr.alt, table.manual-vl td.alt, table.manual-vl tr.alt {
+  background-color: #F4F4E5;
+}
+
+table.manual-vl th, table.manual-vl td, table.manual-vl td.alt {
+  border-color: #cccccc;
+  border-width: 1px;
+  border-style: none solid none solid;
+}
+
+table.manual-vl th.inter {
+  border-style: solid solid solid solid;
+}
+
+table.manual-hl td {
+  border-color: #cccccc;
+  border-width: 1px;
+  border-style: solid none solid none;
+}
+
+table td.code {
+  font-family: monospace;
+}
+
+h2 {
+  margin-top:2em;
+  border-style: none none solid none;
+  border-width: 1px;
+  border-color: #cccccc;
+}
+
+/**** Table of content in the side-nav ****/
+
+
+div.toc {
+  margin:0;
+  padding: 0.3em 0 0 0;
+  width:100%;
+  float:none;
+  position:absolute;
+  bottom:0;
+  border-radius:0px;
+  border-style: solid none none none;
+  max-height:50%;
+  overflow-y: scroll;
+}
+
+div.toc h3 {
+  margin-left: 0.5em;
+  margin-bottom: 0.2em;
+}
+
+div.toc ul {
+  margin: 0.2em 0 0.4em 0.5em;
+}
+
+span.cpp11,span.cpp14,span.cpp17 {
+  color: #119911;
+  font-weight: bold;
+}
+
+.newin3x {
+  color: #a37c1a;
+  font-weight: bold;
+}
+
+div.warningbox {
+  max-width:60em;
+  border-style: solid solid solid solid;
+  border-color: red;
+  border-width: 3px;
+}
+
+/**** old Eigen's styles ****/
+
+
+table.tutorial_code td {
+  border-color: transparent; /* required for Firefox */
+  padding: 3pt 5pt 3pt 5pt;
+  vertical-align: top;
+}
+
+
+/* Whenever doxygen meets a '\n' or a '<BR/>', it will put 
+ * the text containing the character into a <p class="starttd">.
+ * This little hack together with table.tutorial_code td.note
+ * aims at fixing this issue. */
+table.tutorial_code td.note p.starttd {
+  margin: 0px;
+  border: none;
+  padding: 0px;
+}
+
+div.eimainmenu {
+  text-align:     center;
+}
+
+/* center version number on main page */
+h3.version { 
+  text-align:     center;
+}
+
+
+td.width20em p.endtd {
+  width:  20em;
+}
+
+/* needed for huge screens */
+.ui-resizable-e {
+  background-repeat: repeat-y;
+}

diff --git a/doc/eigendoxy_footer.html.in b/doc/eigendoxy_footer.html.in
new file mode 100644
index 0000000..1266535
--- /dev/null
+++ b/doc/eigendoxy_footer.html.in

@@ -0,0 +1,23 @@
+<!-- start footer part -->
+<!--BEGIN GENERATE_TREEVIEW-->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    $navpath
+    <li class="footer">$generatedby
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="$relpath^doxygen.png" alt="doxygen"/></a> $doxygenversion </li>
+  </ul>
+</div>
+<!--END GENERATE_TREEVIEW-->
+<!--BEGIN !GENERATE_TREEVIEW-->
+<hr class="footer"/><address class="footer"><small>
+$generatedby &#160;<a href="http://www.doxygen.org/index.html">
+<img class="footer" src="$relpath^doxygen.png" alt="doxygen"/>
+</a> $doxygenversion
+</small></address>
+<!--END !GENERATE_TREEVIEW-->
+
+</body>
+</html>
+
+

diff --git a/doc/eigendoxy_header.html.in b/doc/eigendoxy_header.html.in
new file mode 100644
index 0000000..a6b1c1d
--- /dev/null
+++ b/doc/eigendoxy_header.html.in

@@ -0,0 +1,62 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen $doxygenversion"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
+<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
+<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="$relpath^jquery.js"></script>
+<script type="text/javascript" src="$relpath^dynsections.js"></script>
+$treeview
+$search
+$mathjax
+<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
+<link href="$relpath$eigendoxy.css" rel="stylesheet" type="text/css">
+<!-- $extrastylesheet -->
+<script type="text/javascript" src="$relpath$eigen_navtree_hacks.js"></script>
+
+</head>
+<body>
+
+<div style="background:#FFDDDD;font-size:120%;text-align:center;margin:0;padding:5px">Please, help us to better know about our user community by answering the following short survey:  <a href="https://forms.gle/wpyrxWi18ox9Z5ae9">https://forms.gle/wpyrxWi18ox9Z5ae9</a></div>
+
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+
+<!--BEGIN TITLEAREA-->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <!--BEGIN PROJECT_LOGO-->
+  <td id="projectlogo"><img alt="Logo" src="$relpath^$projectlogo"/></td>
+  <!--END PROJECT_LOGO-->
+  <!--BEGIN PROJECT_NAME-->
+  <td id="projectalign" style="padding-left: 0.5em;">
+   <div id="projectname"><a href="http://eigen.tuxfamily.org">$projectname</a>
+   <!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
+   </div>
+   <!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
+  </td>
+  <!--END PROJECT_NAME-->
+  <!--BEGIN !PROJECT_NAME-->
+   <!--BEGIN PROJECT_BRIEF-->
+    <td id="projectalign" style="padding-left: 0.5em;">
+    <div id="projectbrief">$projectbrief</div>
+    </td>
+   <!--END PROJECT_BRIEF-->
+  <!--END !PROJECT_NAME-->
+  <!--BEGIN DISABLE_INDEX-->
+   <!--BEGIN SEARCHENGINE-->
+   <td>$searchbox</td>
+   <!--END SEARCHENGINE-->
+  <!--END DISABLE_INDEX-->
+ </tr>
+ </tbody>
+</table>
+</div>
+<!--END TITLEAREA-->
+<!-- end header part -->
+

diff --git a/doc/eigendoxy_layout.xml.in b/doc/eigendoxy_layout.xml.in
new file mode 100644
index 0000000..c14b621
--- /dev/null
+++ b/doc/eigendoxy_layout.xml.in

@@ -0,0 +1,178 @@
+<?xml version="1.0"?>
+<doxygenlayout version="1.0">
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="user" url="index.html" title="Overview" />
+    <tab type="user" url="@ref GettingStarted" title="Getting started" />
+    <tab type="modules" visible="yes" title="Chapters" intro=""/>
+    <tab type="mainpage" visible="yes" title=""/>
+    <tab type="classlist" visible="yes" title="" intro=""/>
+<!--     <tab type="classmembers" visible="yes" title="" intro=""/> -->
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="no"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <detaileddescription title=""/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <allmemberslink visible="yes"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="no"/>
+    <detaileddescription title=""/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>

diff --git a/doc/eigendoxy_tabs.css b/doc/eigendoxy_tabs.css
new file mode 100644
index 0000000..2192056
--- /dev/null
+++ b/doc/eigendoxy_tabs.css

@@ -0,0 +1,59 @@
+.tabs, .tabs2, .tabs3 {
+    background-image: url('tab_b.png');
+    width: 100%;
+    z-index: 101;
+    font-size: 13px;
+}
+
+.tabs2 {
+    font-size: 10px;
+}
+.tabs3 {
+    font-size: 9px;
+}
+
+.tablist {
+    margin: 0;
+    padding: 0;
+    display: table;
+}
+
+.tablist li {
+    float: left;
+    display: table-cell;
+    background-image: url('tab_b.png');
+    line-height: 36px;
+    list-style: none;
+}
+
+.tablist a {
+    display: block;
+    padding: 0 20px;
+    font-weight: bold;
+    background-image:url('tab_s.png');
+    background-repeat:no-repeat;
+    background-position:right;
+    color: #283A5D;
+    text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
+    text-decoration: none;
+    outline: none;
+}
+
+.tabs3 .tablist a {
+    padding: 0 10px;
+}
+
+.tablist a:hover {
+    background-image: url('tab_h.png');
+    background-repeat:repeat-x;
+    color: #fff;
+    text-shadow: 0px 1px 1px rgba(0, 0, 0, 1.0);
+    text-decoration: none;
+}
+
+.tablist li.current a {
+    background-image: url('tab_a.png');
+    background-repeat:repeat-x;
+    color: #fff;
+    text-shadow: 0px 1px 1px rgba(0, 0, 0, 1.0);
+}

diff --git a/doc/examples/.krazy b/doc/examples/.krazy
new file mode 100644
index 0000000..00b9940
--- /dev/null
+++ b/doc/examples/.krazy

@@ -0,0 +1,2 @@
+EXCLUDE copyright
+EXCLUDE license

diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt
new file mode 100644
index 0000000..a2c9d05
--- /dev/null
+++ b/doc/examples/CMakeLists.txt

@@ -0,0 +1,20 @@
+file(GLOB examples_SRCS "*.cpp")
+
+foreach(example_src ${examples_SRCS})
+  get_filename_component(example ${example_src} NAME_WE)
+  add_executable(${example} ${example_src})
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${example} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  add_custom_command(
+    TARGET ${example}
+    POST_BUILD
+    COMMAND ${example}
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/${example}.out
+  )
+  add_dependencies(all_examples ${example})
+endforeach()
+
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
+endif()
\ No newline at end of file

diff --git a/doc/examples/CustomizingEigen_Inheritance.cpp b/doc/examples/CustomizingEigen_Inheritance.cpp
new file mode 100644
index 0000000..48df64e
--- /dev/null
+++ b/doc/examples/CustomizingEigen_Inheritance.cpp

@@ -0,0 +1,30 @@
+#include <Eigen/Core>
+#include <iostream>
+
+class MyVectorType : public Eigen::VectorXd
+{
+public:
+    MyVectorType(void):Eigen::VectorXd() {}
+
+    // This constructor allows you to construct MyVectorType from Eigen expressions
+    template<typename OtherDerived>
+    MyVectorType(const Eigen::MatrixBase<OtherDerived>& other)
+        : Eigen::VectorXd(other)
+    { }
+
+    // This method allows you to assign Eigen expressions to MyVectorType
+    template<typename OtherDerived>
+    MyVectorType& operator=(const Eigen::MatrixBase <OtherDerived>& other)
+    {
+        this->Eigen::VectorXd::operator=(other);
+        return *this;
+    }
+};
+
+int main()
+{
+  MyVectorType v = MyVectorType::Ones(4);
+  v(2) += 10;
+  v = 2 * v;
+  std::cout << v.transpose() << std::endl;
+}

diff --git a/doc/examples/Cwise_erf.cpp b/doc/examples/Cwise_erf.cpp
new file mode 100644
index 0000000..e7cd2c1
--- /dev/null
+++ b/doc/examples/Cwise_erf.cpp

@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);
+  std::cout << v.erf() << std::endl;
+}

diff --git a/doc/examples/Cwise_erfc.cpp b/doc/examples/Cwise_erfc.cpp
new file mode 100644
index 0000000..d8bb04c
--- /dev/null
+++ b/doc/examples/Cwise_erfc.cpp

@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);
+  std::cout << v.erfc() << std::endl;
+}

diff --git a/doc/examples/Cwise_lgamma.cpp b/doc/examples/Cwise_lgamma.cpp
new file mode 100644
index 0000000..6bfaccb
--- /dev/null
+++ b/doc/examples/Cwise_lgamma.cpp

@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(0.5,10,0,-1);
+  std::cout << v.lgamma() << std::endl;
+}

diff --git a/doc/examples/DenseBase_middleCols_int.cpp b/doc/examples/DenseBase_middleCols_int.cpp
new file mode 100644
index 0000000..0ebd955
--- /dev/null
+++ b/doc/examples/DenseBase_middleCols_int.cpp

@@ -0,0 +1,15 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main(void)
+{
+    int const N = 5;
+    MatrixXi A(N,N);
+    A.setRandom();
+    cout << "A =\n" << A << '\n' << endl;
+    cout << "A(1..3,:) =\n" << A.middleCols(1,3) << endl;
+    return 0;
+}

diff --git a/doc/examples/DenseBase_middleRows_int.cpp b/doc/examples/DenseBase_middleRows_int.cpp
new file mode 100644
index 0000000..a6fe9e8
--- /dev/null
+++ b/doc/examples/DenseBase_middleRows_int.cpp

@@ -0,0 +1,15 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main(void)
+{
+    int const N = 5;
+    MatrixXi A(N,N);
+    A.setRandom();
+    cout << "A =\n" << A << '\n' << endl;
+    cout << "A(2..3,:) =\n" << A.middleRows(2,2) << endl;
+    return 0;
+}

diff --git a/doc/examples/DenseBase_template_int_middleCols.cpp b/doc/examples/DenseBase_template_int_middleCols.cpp
new file mode 100644
index 0000000..6191d79
--- /dev/null
+++ b/doc/examples/DenseBase_template_int_middleCols.cpp

@@ -0,0 +1,15 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main(void)
+{
+    int const N = 5;
+    MatrixXi A(N,N);
+    A.setRandom();
+    cout << "A =\n" << A << '\n' << endl;
+    cout << "A(:,1..3) =\n" << A.middleCols<3>(1) << endl;
+    return 0;
+}

diff --git a/doc/examples/DenseBase_template_int_middleRows.cpp b/doc/examples/DenseBase_template_int_middleRows.cpp
new file mode 100644
index 0000000..7e8b657
--- /dev/null
+++ b/doc/examples/DenseBase_template_int_middleRows.cpp

@@ -0,0 +1,15 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main(void)
+{
+    int const N = 5;
+    MatrixXi A(N,N);
+    A.setRandom();
+    cout << "A =\n" << A << '\n' << endl;
+    cout << "A(1..3,:) =\n" << A.middleRows<3>(1) << endl;
+    return 0;
+}

diff --git a/doc/examples/QuickStart_example.cpp b/doc/examples/QuickStart_example.cpp
new file mode 100644
index 0000000..7238c0c
--- /dev/null
+++ b/doc/examples/QuickStart_example.cpp

@@ -0,0 +1,14 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using Eigen::MatrixXd;
+
+int main()
+{
+  MatrixXd m(2,2);
+  m(0,0) = 3;
+  m(1,0) = 2.5;
+  m(0,1) = -1;
+  m(1,1) = m(1,0) + m(0,1);
+  std::cout << m << std::endl;
+}

diff --git a/doc/examples/QuickStart_example2_dynamic.cpp b/doc/examples/QuickStart_example2_dynamic.cpp
new file mode 100644
index 0000000..ff6746e
--- /dev/null
+++ b/doc/examples/QuickStart_example2_dynamic.cpp

@@ -0,0 +1,15 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  MatrixXd m = MatrixXd::Random(3,3);
+  m = (m + MatrixXd::Constant(3,3,1.2)) * 50;
+  cout << "m =" << endl << m << endl;
+  VectorXd v(3);
+  v << 1, 2, 3;
+  cout << "m * v =" << endl << m * v << endl;
+}

diff --git a/doc/examples/QuickStart_example2_fixed.cpp b/doc/examples/QuickStart_example2_fixed.cpp
new file mode 100644
index 0000000..d911752
--- /dev/null
+++ b/doc/examples/QuickStart_example2_fixed.cpp

@@ -0,0 +1,15 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  Matrix3d m = Matrix3d::Random();
+  m = (m + Matrix3d::Constant(1.2)) * 50;
+  cout << "m =" << endl << m << endl;
+  Vector3d v(1,2,3);
+  
+  cout << "m * v =" << endl << m * v << endl;
+}

diff --git a/doc/examples/TemplateKeyword_flexible.cpp b/doc/examples/TemplateKeyword_flexible.cpp
new file mode 100644
index 0000000..9d85292
--- /dev/null
+++ b/doc/examples/TemplateKeyword_flexible.cpp

@@ -0,0 +1,22 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+
+template <typename Derived1, typename Derived2>
+void copyUpperTriangularPart(MatrixBase<Derived1>& dst, const MatrixBase<Derived2>& src)
+{
+  /* Note the 'template' keywords in the following line! */
+  dst.template triangularView<Upper>() = src.template triangularView<Upper>();
+}
+
+int main()
+{
+  MatrixXi m1 = MatrixXi::Ones(5,5);
+  MatrixXi m2 = MatrixXi::Random(4,4);
+  std::cout << "m2 before copy:" << std::endl;
+  std::cout << m2 << std::endl << std::endl;
+  copyUpperTriangularPart(m2, m1.topLeftCorner(4,4));
+  std::cout << "m2 after copy:" << std::endl;
+  std::cout << m2 << std::endl << std::endl;
+}

diff --git a/doc/examples/TemplateKeyword_simple.cpp b/doc/examples/TemplateKeyword_simple.cpp
new file mode 100644
index 0000000..6998c17
--- /dev/null
+++ b/doc/examples/TemplateKeyword_simple.cpp

@@ -0,0 +1,20 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+
+void copyUpperTriangularPart(MatrixXf& dst, const MatrixXf& src)
+{
+  dst.triangularView<Upper>() = src.triangularView<Upper>();
+}
+
+int main()
+{
+  MatrixXf m1 = MatrixXf::Ones(4,4);
+  MatrixXf m2 = MatrixXf::Random(4,4);
+  std::cout << "m2 before copy:" << std::endl;
+  std::cout << m2 << std::endl << std::endl;
+  copyUpperTriangularPart(m2, m1);
+  std::cout << "m2 after copy:" << std::endl;
+  std::cout << m2 << std::endl << std::endl;
+}

diff --git a/doc/examples/TutorialInplaceLU.cpp b/doc/examples/TutorialInplaceLU.cpp
new file mode 100644
index 0000000..cb9c59b
--- /dev/null
+++ b/doc/examples/TutorialInplaceLU.cpp

@@ -0,0 +1,61 @@
+#include <iostream>
+struct init {
+  init() { std::cout << "[" << "init" << "]" << std::endl; }
+};
+init init_obj;
+// [init]
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  MatrixXd A(2,2);
+  A << 2, -1, 1, 3;
+  cout << "Here is the input matrix A before decomposition:\n" << A << endl;
+cout << "[init]" << endl;
+
+cout << "[declaration]" << endl;
+  PartialPivLU<Ref<MatrixXd> > lu(A);
+  cout << "Here is the input matrix A after decomposition:\n" << A << endl;
+cout << "[declaration]" << endl;
+
+cout << "[matrixLU]" << endl;
+  cout << "Here is the matrix storing the L and U factors:\n" << lu.matrixLU() << endl;
+cout << "[matrixLU]" << endl;
+
+cout << "[solve]" << endl;
+  MatrixXd A0(2,2); A0 << 2, -1, 1, 3;
+  VectorXd b(2);    b << 1, 2;
+  VectorXd x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[solve]" << endl;
+
+cout << "[modifyA]" << endl;
+  A << 3, 4, -2, 1;
+  x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[modifyA]" << endl;
+
+cout << "[recompute]" << endl;
+  A0 = A; // save A
+  lu.compute(A);
+  x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[recompute]" << endl;
+
+cout << "[recompute_bis0]" << endl;
+  MatrixXd A1(2,2);
+  A1 << 5,-2,3,4;
+  lu.compute(A1);
+  cout << "Here is the input matrix A1 after decomposition:\n" << A1 << endl;
+cout << "[recompute_bis0]" << endl;
+
+cout << "[recompute_bis1]" << endl;
+  x = lu.solve(b);
+  cout << "Residual: " << (A1 * x - b).norm() << endl;
+cout << "[recompute_bis1]" << endl;
+
+}

diff --git a/doc/examples/TutorialLinAlgComputeTwice.cpp b/doc/examples/TutorialLinAlgComputeTwice.cpp
new file mode 100644
index 0000000..06ba646
--- /dev/null
+++ b/doc/examples/TutorialLinAlgComputeTwice.cpp

@@ -0,0 +1,23 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix2f A, b;
+   LLT<Matrix2f> llt;
+   A << 2, -1, -1, 3;
+   b << 1, 2, 3, 1;
+   cout << "Here is the matrix A:\n" << A << endl;
+   cout << "Here is the right hand side b:\n" << b << endl;
+   cout << "Computing LLT decomposition..." << endl;
+   llt.compute(A);
+   cout << "The solution is:\n" << llt.solve(b) << endl;
+   A(1,1)++;
+   cout << "The matrix A is now:\n" << A << endl;
+   cout << "Computing LLT decomposition..." << endl;
+   llt.compute(A);
+   cout << "The solution is now:\n" << llt.solve(b) << endl;
+}

diff --git a/doc/examples/TutorialLinAlgExComputeSolveError.cpp b/doc/examples/TutorialLinAlgExComputeSolveError.cpp
new file mode 100644
index 0000000..f362fb7
--- /dev/null
+++ b/doc/examples/TutorialLinAlgExComputeSolveError.cpp

@@ -0,0 +1,14 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   MatrixXd A = MatrixXd::Random(100,100);
+   MatrixXd b = MatrixXd::Random(100,50);
+   MatrixXd x = A.fullPivLu().solve(b);
+   double relative_error = (A*x - b).norm() / b.norm(); // norm() is L2 norm
+   cout << "The relative error is:\n" << relative_error << endl;
+}

diff --git a/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp b/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp
new file mode 100644
index 0000000..3a99a94
--- /dev/null
+++ b/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp

@@ -0,0 +1,17 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix3f A;
+   Vector3f b;
+   A << 1,2,3,  4,5,6,  7,8,10;
+   b << 3, 3, 4;
+   cout << "Here is the matrix A:\n" << A << endl;
+   cout << "Here is the vector b:\n" << b << endl;
+   Vector3f x = A.colPivHouseholderQr().solve(b);
+   cout << "The solution is:\n" << x << endl;
+}

diff --git a/doc/examples/TutorialLinAlgExSolveLDLT.cpp b/doc/examples/TutorialLinAlgExSolveLDLT.cpp
new file mode 100644
index 0000000..f8beacd
--- /dev/null
+++ b/doc/examples/TutorialLinAlgExSolveLDLT.cpp

@@ -0,0 +1,16 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix2f A, b;
+   A << 2, -1, -1, 3;
+   b << 1, 2, 3, 1;
+   cout << "Here is the matrix A:\n" << A << endl;
+   cout << "Here is the right hand side b:\n" << b << endl;
+   Matrix2f x = A.ldlt().solve(b);
+   cout << "The solution is:\n" << x << endl;
+}

diff --git a/doc/examples/TutorialLinAlgInverseDeterminant.cpp b/doc/examples/TutorialLinAlgInverseDeterminant.cpp
new file mode 100644
index 0000000..14dde5b
--- /dev/null
+++ b/doc/examples/TutorialLinAlgInverseDeterminant.cpp

@@ -0,0 +1,16 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix3f A;
+   A << 1, 2, 1,
+        2, 1, 0,
+        -1, 1, 2;
+   cout << "Here is the matrix A:\n" << A << endl;
+   cout << "The determinant of A is " << A.determinant() << endl;
+   cout << "The inverse of A is:\n" << A.inverse() << endl;
+}

diff --git a/doc/examples/TutorialLinAlgRankRevealing.cpp b/doc/examples/TutorialLinAlgRankRevealing.cpp
new file mode 100644
index 0000000..c516507
--- /dev/null
+++ b/doc/examples/TutorialLinAlgRankRevealing.cpp

@@ -0,0 +1,20 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix3f A;
+   A << 1, 2, 5,
+        2, 1, 4,
+        3, 0, 3;
+   cout << "Here is the matrix A:\n" << A << endl;
+   FullPivLU<Matrix3f> lu_decomp(A);
+   cout << "The rank of A is " << lu_decomp.rank() << endl;
+   cout << "Here is a matrix whose columns form a basis of the null-space of A:\n"
+        << lu_decomp.kernel() << endl;
+   cout << "Here is a matrix whose columns form a basis of the column-space of A:\n"
+        << lu_decomp.image(A) << endl; // yes, have to pass the original A
+}

diff --git a/doc/examples/TutorialLinAlgSVDSolve.cpp b/doc/examples/TutorialLinAlgSVDSolve.cpp
new file mode 100644
index 0000000..f109f04
--- /dev/null
+++ b/doc/examples/TutorialLinAlgSVDSolve.cpp

@@ -0,0 +1,15 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   MatrixXf A = MatrixXf::Random(3, 2);
+   cout << "Here is the matrix A:\n" << A << endl;
+   VectorXf b = VectorXf::Random(3);
+   cout << "Here is the right hand side b:\n" << b << endl;
+   cout << "The least-squares solution is:\n"
+        << A.bdcSvd(ComputeThinU | ComputeThinV).solve(b) << endl;
+}

diff --git a/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp b/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp
new file mode 100644
index 0000000..8d1d1ed
--- /dev/null
+++ b/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp

@@ -0,0 +1,18 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix2f A;
+   A << 1, 2, 2, 3;
+   cout << "Here is the matrix A:\n" << A << endl;
+   SelfAdjointEigenSolver<Matrix2f> eigensolver(A);
+   if (eigensolver.info() != Success) abort();
+   cout << "The eigenvalues of A are:\n" << eigensolver.eigenvalues() << endl;
+   cout << "Here's a matrix whose columns are eigenvectors of A \n"
+        << "corresponding to these eigenvalues:\n"
+        << eigensolver.eigenvectors() << endl;
+}

diff --git a/doc/examples/TutorialLinAlgSetThreshold.cpp b/doc/examples/TutorialLinAlgSetThreshold.cpp
new file mode 100644
index 0000000..3956b13
--- /dev/null
+++ b/doc/examples/TutorialLinAlgSetThreshold.cpp

@@ -0,0 +1,16 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix2d A;
+   A << 2, 1,
+        2, 0.9999999999;
+   FullPivLU<Matrix2d> lu(A);
+   cout << "By default, the rank of A is found to be " << lu.rank() << endl;
+   lu.setThreshold(1e-5);
+   cout << "With threshold 1e-5, the rank of A is found to be " << lu.rank() << endl;
+}

diff --git a/doc/examples/Tutorial_ArrayClass_accessors.cpp b/doc/examples/Tutorial_ArrayClass_accessors.cpp
new file mode 100644
index 0000000..dc720ff
--- /dev/null
+++ b/doc/examples/Tutorial_ArrayClass_accessors.cpp

@@ -0,0 +1,24 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  ArrayXXf  m(2,2);
+  
+  // assign some values coefficient by coefficient
+  m(0,0) = 1.0; m(0,1) = 2.0;
+  m(1,0) = 3.0; m(1,1) = m(0,1) + m(1,0);
+  
+  // print values to standard output
+  cout << m << endl << endl;
+ 
+  // using the comma-initializer is also allowed
+  m << 1.0,2.0,
+       3.0,4.0;
+     
+  // print values to standard output
+  cout << m << endl;
+}

diff --git a/doc/examples/Tutorial_ArrayClass_addition.cpp b/doc/examples/Tutorial_ArrayClass_addition.cpp
new file mode 100644
index 0000000..480ffb0
--- /dev/null
+++ b/doc/examples/Tutorial_ArrayClass_addition.cpp

@@ -0,0 +1,23 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  ArrayXXf a(3,3);
+  ArrayXXf b(3,3);
+  a << 1,2,3,
+       4,5,6,
+       7,8,9;
+  b << 1,2,3,
+       1,2,3,
+       1,2,3;
+       
+  // Adding two arrays
+  cout << "a + b = " << endl << a + b << endl << endl;
+
+  // Subtracting a scalar from an array
+  cout << "a - 2 = " << endl << a - 2 << endl;
+}

diff --git a/doc/examples/Tutorial_ArrayClass_cwise_other.cpp b/doc/examples/Tutorial_ArrayClass_cwise_other.cpp
new file mode 100644
index 0000000..d9046c6
--- /dev/null
+++ b/doc/examples/Tutorial_ArrayClass_cwise_other.cpp

@@ -0,0 +1,19 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  ArrayXf a = ArrayXf::Random(5);
+  a *= 2;
+  cout << "a =" << endl 
+       << a << endl;
+  cout << "a.abs() =" << endl 
+       << a.abs() << endl;
+  cout << "a.abs().sqrt() =" << endl 
+       << a.abs().sqrt() << endl;
+  cout << "a.min(a.abs().sqrt()) =" << endl 
+       << a.min(a.abs().sqrt()) << endl;
+}

diff --git a/doc/examples/Tutorial_ArrayClass_interop.cpp b/doc/examples/Tutorial_ArrayClass_interop.cpp
new file mode 100644
index 0000000..371f070
--- /dev/null
+++ b/doc/examples/Tutorial_ArrayClass_interop.cpp

@@ -0,0 +1,22 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  MatrixXf m(2,2);
+  MatrixXf n(2,2);
+  MatrixXf result(2,2);
+
+  m << 1,2,
+       3,4;
+  n << 5,6,
+       7,8;
+  
+  result = (m.array() + 4).matrix() * m;
+  cout << "-- Combination 1: --" << endl << result << endl << endl;
+  result = (m.array() * n.array()).matrix() * m;
+  cout << "-- Combination 2: --" << endl << result << endl << endl;
+}

diff --git a/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp b/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp
new file mode 100644
index 0000000..1014275
--- /dev/null
+++ b/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp

@@ -0,0 +1,26 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  MatrixXf m(2,2);
+  MatrixXf n(2,2);
+  MatrixXf result(2,2);
+
+  m << 1,2,
+       3,4;
+  n << 5,6,
+       7,8;
+
+  result = m * n;
+  cout << "-- Matrix m*n: --" << endl << result << endl << endl;
+  result = m.array() * n.array();
+  cout << "-- Array m*n: --" << endl << result << endl << endl;
+  result = m.cwiseProduct(n);
+  cout << "-- With cwiseProduct: --" << endl << result << endl << endl;
+  result = m.array() + 4;
+  cout << "-- Array m + 4: --" << endl << result << endl << endl;
+}

diff --git a/doc/examples/Tutorial_ArrayClass_mult.cpp b/doc/examples/Tutorial_ArrayClass_mult.cpp
new file mode 100644
index 0000000..6cb439f
--- /dev/null
+++ b/doc/examples/Tutorial_ArrayClass_mult.cpp

@@ -0,0 +1,16 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  ArrayXXf a(2,2);
+  ArrayXXf b(2,2);
+  a << 1,2,
+       3,4;
+  b << 5,6,
+       7,8;
+  cout << "a * b = " << endl << a * b << endl;
+}

diff --git a/doc/examples/Tutorial_BlockOperations_block_assignment.cpp b/doc/examples/Tutorial_BlockOperations_block_assignment.cpp
new file mode 100644
index 0000000..0b87313
--- /dev/null
+++ b/doc/examples/Tutorial_BlockOperations_block_assignment.cpp

@@ -0,0 +1,18 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Array22f m;
+  m << 1,2,
+       3,4;
+  Array44f a = Array44f::Constant(0.6);
+  cout << "Here is the array a:" << endl << a << endl << endl;
+  a.block<2,2>(1,1) = m;
+  cout << "Here is now a with m copied into its central 2x2 block:" << endl << a << endl << endl;
+  a.block(0,0,2,3) = a.block(2,1,2,3);
+  cout << "Here is now a with bottom-right 2x3 block copied into top-left 2x3 block:" << endl << a << endl << endl;
+}

diff --git a/doc/examples/Tutorial_BlockOperations_colrow.cpp b/doc/examples/Tutorial_BlockOperations_colrow.cpp
new file mode 100644
index 0000000..2e7eb00
--- /dev/null
+++ b/doc/examples/Tutorial_BlockOperations_colrow.cpp

@@ -0,0 +1,17 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+  Eigen::MatrixXf m(3,3);
+  m << 1,2,3,
+       4,5,6,
+       7,8,9;
+  cout << "Here is the matrix m:" << endl << m << endl;
+  cout << "2nd Row: " << m.row(1) << endl;
+  m.col(2) += 3 * m.col(0);
+  cout << "After adding 3 times the first column into the third column, the matrix m is:\n";
+  cout << m << endl;
+}

diff --git a/doc/examples/Tutorial_BlockOperations_corner.cpp b/doc/examples/Tutorial_BlockOperations_corner.cpp
new file mode 100644
index 0000000..3a31507
--- /dev/null
+++ b/doc/examples/Tutorial_BlockOperations_corner.cpp

@@ -0,0 +1,17 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+  Eigen::Matrix4f m;
+  m << 1, 2, 3, 4,
+       5, 6, 7, 8,
+       9, 10,11,12,
+       13,14,15,16;
+  cout << "m.leftCols(2) =" << endl << m.leftCols(2) << endl << endl;
+  cout << "m.bottomRows<2>() =" << endl << m.bottomRows<2>() << endl << endl;
+  m.topLeftCorner(1,3) = m.bottomRightCorner(3,1).transpose();
+  cout << "After assignment, m = " << endl << m << endl;
+}

diff --git a/doc/examples/Tutorial_BlockOperations_print_block.cpp b/doc/examples/Tutorial_BlockOperations_print_block.cpp
new file mode 100644
index 0000000..edea4ae
--- /dev/null
+++ b/doc/examples/Tutorial_BlockOperations_print_block.cpp

@@ -0,0 +1,20 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+  Eigen::MatrixXf m(4,4);
+  m <<  1, 2, 3, 4,
+        5, 6, 7, 8,
+        9,10,11,12,
+       13,14,15,16;
+  cout << "Block in the middle" << endl;
+  cout << m.block<2,2>(1,1) << endl << endl;
+  for (int i = 1; i <= 3; ++i)
+  {
+    cout << "Block of size " << i << "x" << i << endl;
+    cout << m.block(0,0,i,i) << endl << endl;
+  }
+}

diff --git a/doc/examples/Tutorial_BlockOperations_vector.cpp b/doc/examples/Tutorial_BlockOperations_vector.cpp
new file mode 100644
index 0000000..4a0b023
--- /dev/null
+++ b/doc/examples/Tutorial_BlockOperations_vector.cpp

@@ -0,0 +1,14 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+  Eigen::ArrayXf v(6);
+  v << 1, 2, 3, 4, 5, 6;
+  cout << "v.head(3) =" << endl << v.head(3) << endl << endl;
+  cout << "v.tail<3>() = " << endl << v.tail<3>() << endl << endl;
+  v.segment(1,4) *= 2;
+  cout << "after 'v.segment(1,4) *= 2', v =" << endl << v << endl;
+}

diff --git a/doc/examples/Tutorial_PartialLU_solve.cpp b/doc/examples/Tutorial_PartialLU_solve.cpp
new file mode 100644
index 0000000..a560879
--- /dev/null
+++ b/doc/examples/Tutorial_PartialLU_solve.cpp

@@ -0,0 +1,18 @@
+#include <Eigen/Core>
+#include <Eigen/LU>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+   Matrix3f A;
+   Vector3f b;
+   A << 1,2,3,  4,5,6,  7,8,10;
+   b << 3, 3, 4;
+   cout << "Here is the matrix A:" << endl << A << endl;
+   cout << "Here is the vector b:" << endl << b << endl;
+   Vector3f x = A.lu().solve(b);
+   cout << "The solution is:" << endl << x << endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
new file mode 100644
index 0000000..334b4d8
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp

@@ -0,0 +1,24 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Eigen::MatrixXf m(2,4);
+  Eigen::VectorXf v(2);
+  
+  m << 1, 23, 6, 9,
+       3, 11, 7, 2;
+       
+  v << 2,
+       3;
+
+  MatrixXf::Index index;
+  // find nearest neighbour
+  (m.colwise() - v).colwise().squaredNorm().minCoeff(&index);
+
+  cout << "Nearest neighbour is column " << index << ":" << endl;
+  cout << m.col(index) << endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
new file mode 100644
index 0000000..e6c87c6
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp

@@ -0,0 +1,21 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  Eigen::VectorXf v(2);
+  
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+         
+  v << 0,
+       1;
+       
+  //add v to each column of m
+  mat.colwise() += v;
+  
+  std::cout << "Broadcasting result: " << std::endl;
+  std::cout << mat << std::endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
new file mode 100644
index 0000000..d87c96a
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp

@@ -0,0 +1,20 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  Eigen::VectorXf v(4);
+  
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+         
+  v << 0,1,2,3;
+       
+  //add v to each row of m
+  mat.rowwise() += v.transpose();
+  
+  std::cout << "Broadcasting result: " << std::endl;
+  std::cout << mat << std::endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
new file mode 100644
index 0000000..df68256
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp

@@ -0,0 +1,13 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+  
+  std::cout << "Column's maximum: " << std::endl
+   << mat.colwise().maxCoeff() << std::endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
new file mode 100644
index 0000000..049c747
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp

@@ -0,0 +1,20 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+int main()
+{
+  MatrixXf mat(2,4);
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+  
+  MatrixXf::Index   maxIndex;
+  float maxNorm = mat.colwise().sum().maxCoeff(&maxIndex);
+  
+  std::cout << "Maximum sum at position " << maxIndex << std::endl;
+
+  std::cout << "The corresponding vector is: " << std::endl;
+  std::cout << mat.col( maxIndex ) << std::endl;
+  std::cout << "And its sum is is: " << maxNorm << std::endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp
new file mode 100644
index 0000000..0cca37f
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp

@@ -0,0 +1,21 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  ArrayXXf a(2,2);
+  
+  a << 1,2,
+       3,4;
+
+  cout << "(a > 0).all()   = " << (a > 0).all() << endl;
+  cout << "(a > 0).any()   = " << (a > 0).any() << endl;
+  cout << "(a > 0).count() = " << (a > 0).count() << endl;
+  cout << endl;
+  cout << "(a > 2).all()   = " << (a > 2).all() << endl;
+  cout << "(a > 2).any()   = " << (a > 2).any() << endl;
+  cout << "(a > 2).count() = " << (a > 2).count() << endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp
new file mode 100644
index 0000000..740439f
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp

@@ -0,0 +1,28 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  VectorXf v(2);
+  MatrixXf m(2,2), n(2,2);
+  
+  v << -1,
+       2;
+  
+  m << 1,-2,
+       -3,4;
+
+  cout << "v.squaredNorm() = " << v.squaredNorm() << endl;
+  cout << "v.norm() = " << v.norm() << endl;
+  cout << "v.lpNorm<1>() = " << v.lpNorm<1>() << endl;
+  cout << "v.lpNorm<Infinity>() = " << v.lpNorm<Infinity>() << endl;
+
+  cout << endl;
+  cout << "m.squaredNorm() = " << m.squaredNorm() << endl;
+  cout << "m.norm() = " << m.norm() << endl;
+  cout << "m.lpNorm<1>() = " << m.lpNorm<1>() << endl;
+  cout << "m.lpNorm<Infinity>() = " << m.lpNorm<Infinity>() << endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
new file mode 100644
index 0000000..62e28fc
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp

@@ -0,0 +1,18 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  MatrixXf m(2,2);
+  m << 1,-2,
+       -3,4;
+
+  cout << "1-norm(m)     = " << m.cwiseAbs().colwise().sum().maxCoeff()
+       << " == "             << m.colwise().lpNorm<1>().maxCoeff() << endl;
+
+  cout << "infty-norm(m) = " << m.cwiseAbs().rowwise().sum().maxCoeff()
+       << " == "             << m.rowwise().lpNorm<1>().maxCoeff() << endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
new file mode 100644
index 0000000..80427c9
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp

@@ -0,0 +1,13 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+  
+  std::cout << "Row's maximum: " << std::endl
+   << mat.rowwise().maxCoeff() << std::endl;
+}

diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
new file mode 100644
index 0000000..b54e9aa
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp

@@ -0,0 +1,26 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Eigen::MatrixXf m(2,2);
+  
+  m << 1, 2,
+       3, 4;
+
+  //get location of maximum
+  MatrixXf::Index maxRow, maxCol;
+  float max = m.maxCoeff(&maxRow, &maxCol);
+
+  //get location of minimum
+  MatrixXf::Index minRow, minCol;
+  float min = m.minCoeff(&minRow, &minCol);
+
+  cout << "Max: " << max <<  ", at: " <<
+     maxRow << "," << maxCol << endl;
+  cout << "Min: " << min << ", at: " <<
+     minRow << "," << minCol << endl;
+}

diff --git a/doc/examples/Tutorial_simple_example_dynamic_size.cpp b/doc/examples/Tutorial_simple_example_dynamic_size.cpp
new file mode 100644
index 0000000..defcb1e
--- /dev/null
+++ b/doc/examples/Tutorial_simple_example_dynamic_size.cpp

@@ -0,0 +1,22 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  for (int size=1; size<=4; ++size)
+  {
+    MatrixXi m(size,size+1);         // a (size)x(size+1)-matrix of int's
+    for (int j=0; j<m.cols(); ++j)   // loop over columns
+      for (int i=0; i<m.rows(); ++i) // loop over rows
+        m(i,j) = i+j*size;           // to access matrix coefficients,
+                                     // use operator()(int,int)
+    std::cout << m << "\n\n";
+  }
+
+  VectorXf v(4); // a vector of 4 float's
+  // to access vector coefficients, use either operator () or operator []
+  v[0] = 1; v[1] = 2; v(2) = 3; v(3) = 4;
+  std::cout << "\nv:\n" << v << std::endl;
+}

diff --git a/doc/examples/Tutorial_simple_example_fixed_size.cpp b/doc/examples/Tutorial_simple_example_fixed_size.cpp
new file mode 100644
index 0000000..bc4f95d
--- /dev/null
+++ b/doc/examples/Tutorial_simple_example_fixed_size.cpp

@@ -0,0 +1,15 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix3f m3;
+  m3 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  Matrix4f m4 = Matrix4f::Identity();
+  Vector4i v4(1, 2, 3, 4);
+
+  std::cout << "m3\n" << m3 << "\nm4:\n"
+    << m4 << "\nv4:\n" << v4 << std::endl;
+}

diff --git a/doc/examples/class_Block.cpp b/doc/examples/class_Block.cpp
new file mode 100644
index 0000000..ace719a
--- /dev/null
+++ b/doc/examples/class_Block.cpp

@@ -0,0 +1,27 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+template<typename Derived>
+Eigen::Block<Derived>
+topLeftCorner(MatrixBase<Derived>& m, int rows, int cols)
+{
+  return Eigen::Block<Derived>(m.derived(), 0, 0, rows, cols);
+}
+
+template<typename Derived>
+const Eigen::Block<const Derived>
+topLeftCorner(const MatrixBase<Derived>& m, int rows, int cols)
+{
+  return Eigen::Block<const Derived>(m.derived(), 0, 0, rows, cols);
+}
+
+int main(int, char**)
+{
+  Matrix4d m = Matrix4d::Identity();
+  cout << topLeftCorner(4*m, 2, 3) << endl; // calls the const version
+  topLeftCorner(m, 2, 3) *= 5;              // calls the non-const version
+  cout << "Now the matrix m is:" << endl << m << endl;
+  return 0;
+}

diff --git a/doc/examples/class_CwiseBinaryOp.cpp b/doc/examples/class_CwiseBinaryOp.cpp
new file mode 100644
index 0000000..682af46
--- /dev/null
+++ b/doc/examples/class_CwiseBinaryOp.cpp

@@ -0,0 +1,18 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+// define a custom template binary functor
+template<typename Scalar> struct MakeComplexOp {
+  EIGEN_EMPTY_STRUCT_CTOR(MakeComplexOp)
+  typedef complex<Scalar> result_type;
+  complex<Scalar> operator()(const Scalar& a, const Scalar& b) const { return complex<Scalar>(a,b); }
+};
+
+int main(int, char**)
+{
+  Matrix4d m1 = Matrix4d::Random(), m2 = Matrix4d::Random();
+  cout << m1.binaryExpr(m2, MakeComplexOp<double>()) << endl;
+  return 0;
+}

diff --git a/doc/examples/class_CwiseUnaryOp.cpp b/doc/examples/class_CwiseUnaryOp.cpp
new file mode 100644
index 0000000..a5fcc15
--- /dev/null
+++ b/doc/examples/class_CwiseUnaryOp.cpp

@@ -0,0 +1,19 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+// define a custom template unary functor
+template<typename Scalar>
+struct CwiseClampOp {
+  CwiseClampOp(const Scalar& inf, const Scalar& sup) : m_inf(inf), m_sup(sup) {}
+  const Scalar operator()(const Scalar& x) const { return x<m_inf ? m_inf : (x>m_sup ? m_sup : x); }
+  Scalar m_inf, m_sup;
+};
+
+int main(int, char**)
+{
+  Matrix4d m1 = Matrix4d::Random();
+  cout << m1 << endl << "becomes: " << endl << m1.unaryExpr(CwiseClampOp<double>(-0.5,0.5)) << endl;
+  return 0;
+}

diff --git a/doc/examples/class_CwiseUnaryOp_ptrfun.cpp b/doc/examples/class_CwiseUnaryOp_ptrfun.cpp
new file mode 100644
index 0000000..36706d8
--- /dev/null
+++ b/doc/examples/class_CwiseUnaryOp_ptrfun.cpp

@@ -0,0 +1,20 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+// define function to be applied coefficient-wise
+double ramp(double x)
+{
+  if (x > 0)
+    return x;
+  else 
+    return 0;
+}
+
+int main(int, char**)
+{
+  Matrix4d m1 = Matrix4d::Random();
+  cout << m1 << endl << "becomes: " << endl << m1.unaryExpr(ptr_fun(ramp)) << endl;
+  return 0;
+}

diff --git a/doc/examples/class_FixedBlock.cpp b/doc/examples/class_FixedBlock.cpp
new file mode 100644
index 0000000..9978b32
--- /dev/null
+++ b/doc/examples/class_FixedBlock.cpp

@@ -0,0 +1,27 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+template<typename Derived>
+Eigen::Block<Derived, 2, 2>
+topLeft2x2Corner(MatrixBase<Derived>& m)
+{
+  return Eigen::Block<Derived, 2, 2>(m.derived(), 0, 0);
+}
+
+template<typename Derived>
+const Eigen::Block<const Derived, 2, 2>
+topLeft2x2Corner(const MatrixBase<Derived>& m)
+{
+  return Eigen::Block<const Derived, 2, 2>(m.derived(), 0, 0);
+}
+
+int main(int, char**)
+{
+  Matrix3d m = Matrix3d::Identity();
+  cout << topLeft2x2Corner(4*m) << endl; // calls the const version
+  topLeft2x2Corner(m) *= 2;              // calls the non-const version
+  cout << "Now the matrix m is:" << endl << m << endl;
+  return 0;
+}

diff --git a/doc/examples/class_FixedReshaped.cpp b/doc/examples/class_FixedReshaped.cpp
new file mode 100644
index 0000000..b6d4085
--- /dev/null
+++ b/doc/examples/class_FixedReshaped.cpp

@@ -0,0 +1,22 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+template<typename Derived>
+Eigen::Reshaped<Derived, 4, 2>
+reshape_helper(MatrixBase<Derived>& m)
+{
+  return Eigen::Reshaped<Derived, 4, 2>(m.derived());
+}
+
+int main(int, char**)
+{
+  MatrixXd m(2, 4);
+  m << 1, 2, 3, 4,
+       5, 6, 7, 8;
+  MatrixXd n = reshape_helper(m);
+  cout << "matrix m is:" << endl << m << endl;
+  cout << "matrix n is:" << endl << n << endl;
+  return 0;
+}

diff --git a/doc/examples/class_FixedVectorBlock.cpp b/doc/examples/class_FixedVectorBlock.cpp
new file mode 100644
index 0000000..c88c9fb
--- /dev/null
+++ b/doc/examples/class_FixedVectorBlock.cpp

@@ -0,0 +1,27 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+template<typename Derived>
+Eigen::VectorBlock<Derived, 2>
+firstTwo(MatrixBase<Derived>& v)
+{
+  return Eigen::VectorBlock<Derived, 2>(v.derived(), 0);
+}
+
+template<typename Derived>
+const Eigen::VectorBlock<const Derived, 2>
+firstTwo(const MatrixBase<Derived>& v)
+{
+  return Eigen::VectorBlock<const Derived, 2>(v.derived(), 0);
+}
+
+int main(int, char**)
+{
+  Matrix<int,1,6> v; v << 1,2,3,4,5,6;
+  cout << firstTwo(4*v) << endl; // calls the const version
+  firstTwo(v) *= 2;              // calls the non-const version
+  cout << "Now the vector v is:" << endl << v << endl;
+  return 0;
+}

diff --git a/doc/examples/class_Reshaped.cpp b/doc/examples/class_Reshaped.cpp
new file mode 100644
index 0000000..18fb454
--- /dev/null
+++ b/doc/examples/class_Reshaped.cpp

@@ -0,0 +1,23 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace std;
+using namespace Eigen;
+
+template<typename Derived>
+const Reshaped<const Derived>
+reshape_helper(const MatrixBase<Derived>& m, int rows, int cols)
+{
+  return Reshaped<const Derived>(m.derived(), rows, cols);
+}
+
+int main(int, char**)
+{
+  MatrixXd m(3, 4);
+  m << 1, 4, 7, 10,
+       2, 5, 8, 11,
+       3, 6, 9, 12;
+  cout << m << endl;
+  Ref<const MatrixXd> n = reshape_helper(m, 2, 6);
+  cout << "Matrix m is:" << endl << m << endl;
+  cout << "Matrix n is:" << endl << n << endl;
+}

diff --git a/doc/examples/class_VectorBlock.cpp b/doc/examples/class_VectorBlock.cpp
new file mode 100644
index 0000000..dc213df
--- /dev/null
+++ b/doc/examples/class_VectorBlock.cpp

@@ -0,0 +1,27 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+template<typename Derived>
+Eigen::VectorBlock<Derived>
+segmentFromRange(MatrixBase<Derived>& v, int start, int end)
+{
+  return Eigen::VectorBlock<Derived>(v.derived(), start, end-start);
+}
+
+template<typename Derived>
+const Eigen::VectorBlock<const Derived>
+segmentFromRange(const MatrixBase<Derived>& v, int start, int end)
+{
+  return Eigen::VectorBlock<const Derived>(v.derived(), start, end-start);
+}
+
+int main(int, char**)
+{
+  Matrix<int,1,6> v; v << 1,2,3,4,5,6;
+  cout << segmentFromRange(2*v, 2, 4) << endl; // calls the const version
+  segmentFromRange(v, 1, 3) *= 5;              // calls the non-const version
+  cout << "Now the vector v is:" << endl << v << endl;
+  return 0;
+}

diff --git a/doc/examples/function_taking_eigenbase.cpp b/doc/examples/function_taking_eigenbase.cpp
new file mode 100644
index 0000000..49d94b3
--- /dev/null
+++ b/doc/examples/function_taking_eigenbase.cpp

@@ -0,0 +1,18 @@
+#include <iostream>
+#include <Eigen/Core>
+using namespace Eigen;
+
+template <typename Derived>
+void print_size(const EigenBase<Derived>& b)
+{
+  std::cout << "size (rows, cols): " << b.size() << " (" << b.rows()
+            << ", " << b.cols() << ")" << std::endl;
+}
+
+int main()
+{
+    Vector3f v;
+    print_size(v);
+    // v.asDiagonal() returns a 3x3 diagonal matrix pseudo-expression
+    print_size(v.asDiagonal());
+}

diff --git a/doc/examples/function_taking_ref.cpp b/doc/examples/function_taking_ref.cpp
new file mode 100644
index 0000000..162a202
--- /dev/null
+++ b/doc/examples/function_taking_ref.cpp

@@ -0,0 +1,19 @@
+#include <iostream>
+#include <Eigen/SVD>
+using namespace Eigen;
+using namespace std;
+
+float inv_cond(const Ref<const MatrixXf>& a)
+{
+  const VectorXf sing_vals = a.jacobiSvd().singularValues();
+  return sing_vals(sing_vals.size()-1) / sing_vals(0);
+}
+
+int main()
+{
+  Matrix4f m = Matrix4f::Random();
+  cout << "matrix m:" << endl << m << endl << endl;
+  cout << "inv_cond(m):          " << inv_cond(m)                      << endl;
+  cout << "inv_cond(m(1:3,1:3)): " << inv_cond(m.topLeftCorner(3,3))   << endl;
+  cout << "inv_cond(m+I):        " << inv_cond(m+Matrix4f::Identity()) << endl;
+}

diff --git a/doc/examples/make_circulant.cpp b/doc/examples/make_circulant.cpp
new file mode 100644
index 0000000..92e6aaa
--- /dev/null
+++ b/doc/examples/make_circulant.cpp

@@ -0,0 +1,11 @@
+/*
+This program is presented in several fragments in the doc page.
+Every fragment is in its own file; this file simply combines them.
+*/
+
+#include "make_circulant.cpp.preamble"
+#include "make_circulant.cpp.traits"
+#include "make_circulant.cpp.expression"
+#include "make_circulant.cpp.evaluator"
+#include "make_circulant.cpp.entry"
+#include "make_circulant.cpp.main"

diff --git a/doc/examples/make_circulant.cpp.entry b/doc/examples/make_circulant.cpp.entry
new file mode 100644
index 0000000..f9d2eb8
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.entry

@@ -0,0 +1,5 @@
+template <class ArgType>
+Circulant<ArgType> makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
+{
+  return Circulant<ArgType>(arg.derived());
+}

diff --git a/doc/examples/make_circulant.cpp.evaluator b/doc/examples/make_circulant.cpp.evaluator
new file mode 100644
index 0000000..2ba79e7
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.evaluator

@@ -0,0 +1,32 @@
+namespace Eigen {
+  namespace internal {
+    template<typename ArgType>
+    struct evaluator<Circulant<ArgType> >
+      : evaluator_base<Circulant<ArgType> >
+    {
+      typedef Circulant<ArgType> XprType;
+      typedef typename nested_eval<ArgType, XprType::ColsAtCompileTime>::type ArgTypeNested;
+      typedef typename remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+      typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+      enum { 
+        CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+        Flags = Eigen::ColMajor 
+      };
+      
+      evaluator(const XprType& xpr)
+        : m_argImpl(xpr.m_arg), m_rows(xpr.rows())
+      { }
+
+      CoeffReturnType coeff(Index row, Index col) const
+      {
+        Index index = row - col;
+        if (index < 0) index += m_rows;
+        return m_argImpl.coeff(index);
+      }
+
+      evaluator<ArgTypeNestedCleaned> m_argImpl;
+      const Index m_rows;
+    };
+  }
+}

diff --git a/doc/examples/make_circulant.cpp.expression b/doc/examples/make_circulant.cpp.expression
new file mode 100644
index 0000000..380cd44
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.expression

@@ -0,0 +1,20 @@
+template <class ArgType>
+class Circulant : public Eigen::MatrixBase<Circulant<ArgType> >
+{
+public:
+  Circulant(const ArgType& arg)
+    : m_arg(arg)
+  { 
+    EIGEN_STATIC_ASSERT(ArgType::ColsAtCompileTime == 1,
+                        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+  }
+
+  typedef typename Eigen::internal::ref_selector<Circulant>::type Nested; 
+
+  typedef Eigen::Index Index;
+  Index rows() const { return m_arg.rows(); }
+  Index cols() const { return m_arg.rows(); }
+
+  typedef typename Eigen::internal::ref_selector<ArgType>::type ArgTypeNested;
+  ArgTypeNested m_arg;
+};

diff --git a/doc/examples/make_circulant.cpp.main b/doc/examples/make_circulant.cpp.main
new file mode 100644
index 0000000..877f97f
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.main

@@ -0,0 +1,8 @@
+int main()
+{
+  Eigen::VectorXd vec(4);
+  vec << 1, 2, 4, 8;
+  Eigen::MatrixXd mat;
+  mat = makeCirculant(vec);
+  std::cout << mat << std::endl;
+}

diff --git a/doc/examples/make_circulant.cpp.preamble b/doc/examples/make_circulant.cpp.preamble
new file mode 100644
index 0000000..e575cce
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.preamble

@@ -0,0 +1,4 @@
+#include <Eigen/Core>
+#include <iostream>
+
+template <class ArgType> class Circulant;

diff --git a/doc/examples/make_circulant.cpp.traits b/doc/examples/make_circulant.cpp.traits
new file mode 100644
index 0000000..4e04535
--- /dev/null
+++ b/doc/examples/make_circulant.cpp.traits

@@ -0,0 +1,19 @@
+namespace Eigen {
+  namespace internal {
+    template <class ArgType>
+    struct traits<Circulant<ArgType> >
+    {
+      typedef Eigen::Dense StorageKind;
+      typedef Eigen::MatrixXpr XprKind;
+      typedef typename ArgType::StorageIndex StorageIndex;
+      typedef typename ArgType::Scalar Scalar;
+      enum { 
+        Flags = Eigen::ColMajor,
+        RowsAtCompileTime = ArgType::RowsAtCompileTime,
+        ColsAtCompileTime = ArgType::RowsAtCompileTime,
+        MaxRowsAtCompileTime = ArgType::MaxRowsAtCompileTime,
+        MaxColsAtCompileTime = ArgType::MaxRowsAtCompileTime
+      };
+    };
+  }
+}

diff --git a/doc/examples/make_circulant2.cpp b/doc/examples/make_circulant2.cpp
new file mode 100644
index 0000000..95d3dd3
--- /dev/null
+++ b/doc/examples/make_circulant2.cpp

@@ -0,0 +1,52 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [circulant_func]
+template<class ArgType>
+class circulant_functor {
+  const ArgType &m_vec;
+public:
+  circulant_functor(const ArgType& arg) : m_vec(arg) {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    Index index = row - col;
+    if (index < 0) index += m_vec.size();
+    return m_vec(index);
+  }
+};
+// [circulant_func]
+
+// [square]
+template<class ArgType>
+struct circulant_helper {
+  typedef Matrix<typename ArgType::Scalar,
+                 ArgType::SizeAtCompileTime,
+                 ArgType::SizeAtCompileTime,
+                 ColMajor,
+                 ArgType::MaxSizeAtCompileTime,
+                 ArgType::MaxSizeAtCompileTime> MatrixType;
+};
+// [square]
+
+// [makeCirculant]
+template <class ArgType>
+CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
+makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
+{
+  typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
+}
+// [makeCirculant]
+
+// [main]
+int main()
+{
+  Eigen::VectorXd vec(4);
+  vec << 1, 2, 4, 8;
+  Eigen::MatrixXd mat;
+  mat = makeCirculant(vec);
+  std::cout << mat << std::endl;
+}
+// [main]

diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp
new file mode 100644
index 0000000..7469938
--- /dev/null
+++ b/doc/examples/matrixfree_cg.cpp

@@ -0,0 +1,129 @@
+#include <iostream>
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <Eigen/IterativeLinearSolvers>
+#include <unsupported/Eigen/IterativeSolvers>
+
+class MatrixReplacement;
+using Eigen::SparseMatrix;
+
+namespace Eigen {
+namespace internal {
+  // MatrixReplacement looks-like a SparseMatrix, so let's inherits its traits:
+  template<>
+  struct traits<MatrixReplacement> :  public Eigen::internal::traits<Eigen::SparseMatrix<double> >
+  {};
+}
+}
+
+// Example of a matrix-free wrapper from a user type to Eigen's compatible type
+// For the sake of simplicity, this example simply wrap a Eigen::SparseMatrix.
+class MatrixReplacement : public Eigen::EigenBase<MatrixReplacement> {
+public:
+  // Required typedefs, constants, and method:
+  typedef double Scalar;
+  typedef double RealScalar;
+  typedef int StorageIndex;
+  enum {
+    ColsAtCompileTime = Eigen::Dynamic,
+    MaxColsAtCompileTime = Eigen::Dynamic,
+    IsRowMajor = false
+  };
+
+  Index rows() const { return mp_mat->rows(); }
+  Index cols() const { return mp_mat->cols(); }
+
+  template<typename Rhs>
+  Eigen::Product<MatrixReplacement,Rhs,Eigen::AliasFreeProduct> operator*(const Eigen::MatrixBase<Rhs>& x) const {
+    return Eigen::Product<MatrixReplacement,Rhs,Eigen::AliasFreeProduct>(*this, x.derived());
+  }
+
+  // Custom API:
+  MatrixReplacement() : mp_mat(0) {}
+
+  void attachMyMatrix(const SparseMatrix<double> &mat) {
+    mp_mat = &mat;
+  }
+  const SparseMatrix<double> my_matrix() const { return *mp_mat; }
+
+private:
+  const SparseMatrix<double> *mp_mat;
+};
+
+
+// Implementation of MatrixReplacement * Eigen::DenseVector though a specialization of internal::generic_product_impl:
+namespace Eigen {
+namespace internal {
+
+  template<typename Rhs>
+  struct generic_product_impl<MatrixReplacement, Rhs, SparseShape, DenseShape, GemvProduct> // GEMV stands for matrix-vector
+  : generic_product_impl_base<MatrixReplacement,Rhs,generic_product_impl<MatrixReplacement,Rhs> >
+  {
+    typedef typename Product<MatrixReplacement,Rhs>::Scalar Scalar;
+
+    template<typename Dest>
+    static void scaleAndAddTo(Dest& dst, const MatrixReplacement& lhs, const Rhs& rhs, const Scalar& alpha)
+    {
+      // This method should implement "dst += alpha * lhs * rhs" inplace,
+      // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it.
+      assert(alpha==Scalar(1) && "scaling is not implemented");
+      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
+
+      // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs,
+      // but let's do something fancier (and less efficient):
+      for(Index i=0; i<lhs.cols(); ++i)
+        dst += rhs(i) * lhs.my_matrix().col(i);
+    }
+  };
+
+}
+}
+
+int main()
+{
+  int n = 10;
+  Eigen::SparseMatrix<double> S = Eigen::MatrixXd::Random(n,n).sparseView(0.5,1);
+  S = S.transpose()*S;
+
+  MatrixReplacement A;
+  A.attachMyMatrix(S);
+
+  Eigen::VectorXd b(n), x;
+  b.setRandom();
+
+  // Solve Ax = b using various iterative solver with matrix-free version:
+  {
+    Eigen::ConjugateGradient<MatrixReplacement, Eigen::Lower|Eigen::Upper, Eigen::IdentityPreconditioner> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "CG:       #iterations: " << cg.iterations() << ", estimated error: " << cg.error() << std::endl;
+  }
+
+  {
+    Eigen::BiCGSTAB<MatrixReplacement, Eigen::IdentityPreconditioner> bicg;
+    bicg.compute(A);
+    x = bicg.solve(b);
+    std::cout << "BiCGSTAB: #iterations: " << bicg.iterations() << ", estimated error: " << bicg.error() << std::endl;
+  }
+
+  {
+    Eigen::GMRES<MatrixReplacement, Eigen::IdentityPreconditioner> gmres;
+    gmres.compute(A);
+    x = gmres.solve(b);
+    std::cout << "GMRES:    #iterations: " << gmres.iterations() << ", estimated error: " << gmres.error() << std::endl;
+  }
+
+  {
+    Eigen::DGMRES<MatrixReplacement, Eigen::IdentityPreconditioner> gmres;
+    gmres.compute(A);
+    x = gmres.solve(b);
+    std::cout << "DGMRES:   #iterations: " << gmres.iterations() << ", estimated error: " << gmres.error() << std::endl;
+  }
+
+  {
+    Eigen::MINRES<MatrixReplacement, Eigen::Lower|Eigen::Upper, Eigen::IdentityPreconditioner> minres;
+    minres.compute(A);
+    x = minres.solve(b);
+    std::cout << "MINRES:   #iterations: " << minres.iterations() << ", estimated error: " << minres.error() << std::endl;
+  }
+}

diff --git a/doc/examples/nullary_indexing.cpp b/doc/examples/nullary_indexing.cpp
new file mode 100644
index 0000000..b74db5f
--- /dev/null
+++ b/doc/examples/nullary_indexing.cpp

@@ -0,0 +1,66 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [functor]
+template<class ArgType, class RowIndexType, class ColIndexType>
+class indexing_functor {
+  const ArgType &m_arg;
+  const RowIndexType &m_rowIndices;
+  const ColIndexType &m_colIndices;
+public:
+  typedef Matrix<typename ArgType::Scalar,
+                 RowIndexType::SizeAtCompileTime,
+                 ColIndexType::SizeAtCompileTime,
+                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
+                 RowIndexType::MaxSizeAtCompileTime,
+                 ColIndexType::MaxSizeAtCompileTime> MatrixType;
+
+  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
+  {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    return m_arg(m_rowIndices[row], m_colIndices[col]);
+  }
+};
+// [functor]
+
+// [function]
+template <class ArgType, class RowIndexType, class ColIndexType>
+CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
+mat_indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+{
+  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
+  typedef typename Func::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
+}
+// [function]
+
+
+int main()
+{
+  std::cout << "[main1]\n";
+  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
+  Array3i ri(1,2,1);
+  ArrayXi ci(6); ci << 3,2,1,0,0,2;
+  Eigen::MatrixXi B = mat_indexing(A, ri, ci);
+  std::cout << "A =" << std::endl;
+  std::cout << A << std::endl << std::endl;
+  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
+  std::cout << B << std::endl;
+  std::cout << "[main1]\n";
+
+  std::cout << "[main2]\n";
+  B =  mat_indexing(A, ri+1, ci);
+  std::cout << "A(ri+1,ci) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#if EIGEN_COMP_CXXVER >= 11
+  B =  mat_indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
+  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#endif
+  std::cout << "[main2]\n";
+}
+

diff --git a/doc/examples/tut_arithmetic_add_sub.cpp b/doc/examples/tut_arithmetic_add_sub.cpp
new file mode 100644
index 0000000..e97477b
--- /dev/null
+++ b/doc/examples/tut_arithmetic_add_sub.cpp

@@ -0,0 +1,22 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix2d a;
+  a << 1, 2,
+       3, 4;
+  MatrixXd b(2,2);
+  b << 2, 3,
+       1, 4;
+  std::cout << "a + b =\n" << a + b << std::endl;
+  std::cout << "a - b =\n" << a - b << std::endl;
+  std::cout << "Doing a += b;" << std::endl;
+  a += b;
+  std::cout << "Now a =\n" << a << std::endl;
+  Vector3d v(1,2,3);
+  Vector3d w(1,0,0);
+  std::cout << "-v + w - v =\n" << -v + w - v << std::endl;
+}

diff --git a/doc/examples/tut_arithmetic_dot_cross.cpp b/doc/examples/tut_arithmetic_dot_cross.cpp
new file mode 100644
index 0000000..631c9a5
--- /dev/null
+++ b/doc/examples/tut_arithmetic_dot_cross.cpp

@@ -0,0 +1,15 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+using namespace std;
+int main()
+{
+  Vector3d v(1,2,3);
+  Vector3d w(0,1,2);
+
+  cout << "Dot product: " << v.dot(w) << endl;
+  double dp = v.adjoint()*w; // automatic conversion of the inner product to a scalar
+  cout << "Dot product via a matrix product: " << dp << endl;
+  cout << "Cross product:\n" << v.cross(w) << endl;
+}

diff --git a/doc/examples/tut_arithmetic_matrix_mul.cpp b/doc/examples/tut_arithmetic_matrix_mul.cpp
new file mode 100644
index 0000000..f213902
--- /dev/null
+++ b/doc/examples/tut_arithmetic_matrix_mul.cpp

@@ -0,0 +1,19 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+int main()
+{
+  Matrix2d mat;
+  mat << 1, 2,
+         3, 4;
+  Vector2d u(-1,1), v(2,0);
+  std::cout << "Here is mat*mat:\n" << mat*mat << std::endl;
+  std::cout << "Here is mat*u:\n" << mat*u << std::endl;
+  std::cout << "Here is u^T*mat:\n" << u.transpose()*mat << std::endl;
+  std::cout << "Here is u^T*v:\n" << u.transpose()*v << std::endl;
+  std::cout << "Here is u*v^T:\n" << u*v.transpose() << std::endl;
+  std::cout << "Let's multiply mat by itself" << std::endl;
+  mat = mat*mat;
+  std::cout << "Now mat is mat:\n" << mat << std::endl;
+}

diff --git a/doc/examples/tut_arithmetic_redux_basic.cpp b/doc/examples/tut_arithmetic_redux_basic.cpp
new file mode 100644
index 0000000..5632fb5
--- /dev/null
+++ b/doc/examples/tut_arithmetic_redux_basic.cpp

@@ -0,0 +1,16 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::Matrix2d mat;
+  mat << 1, 2,
+         3, 4;
+  cout << "Here is mat.sum():       " << mat.sum()       << endl;
+  cout << "Here is mat.prod():      " << mat.prod()      << endl;
+  cout << "Here is mat.mean():      " << mat.mean()      << endl;
+  cout << "Here is mat.minCoeff():  " << mat.minCoeff()  << endl;
+  cout << "Here is mat.maxCoeff():  " << mat.maxCoeff()  << endl;
+  cout << "Here is mat.trace():     " << mat.trace()     << endl;
+}

diff --git a/doc/examples/tut_arithmetic_scalar_mul_div.cpp b/doc/examples/tut_arithmetic_scalar_mul_div.cpp
new file mode 100644
index 0000000..d5f65b5
--- /dev/null
+++ b/doc/examples/tut_arithmetic_scalar_mul_div.cpp

@@ -0,0 +1,17 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix2d a;
+  a << 1, 2,
+       3, 4;
+  Vector3d v(1,2,3);
+  std::cout << "a * 2.5 =\n" << a * 2.5 << std::endl;
+  std::cout << "0.1 * v =\n" << 0.1 * v << std::endl;
+  std::cout << "Doing v *= 2;" << std::endl;
+  v *= 2;
+  std::cout << "Now v =\n" << v << std::endl;
+}

diff --git a/doc/examples/tut_matrix_coefficient_accessors.cpp b/doc/examples/tut_matrix_coefficient_accessors.cpp
new file mode 100644
index 0000000..c2da171
--- /dev/null
+++ b/doc/examples/tut_matrix_coefficient_accessors.cpp

@@ -0,0 +1,18 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+int main()
+{
+  MatrixXd m(2,2);
+  m(0,0) = 3;
+  m(1,0) = 2.5;
+  m(0,1) = -1;
+  m(1,1) = m(1,0) + m(0,1);
+  std::cout << "Here is the matrix m:\n" << m << std::endl;
+  VectorXd v(2);
+  v(0) = 4;
+  v(1) = v(0) - 1;
+  std::cout << "Here is the vector v:\n" << v << std::endl;
+}

diff --git a/doc/examples/tut_matrix_resize.cpp b/doc/examples/tut_matrix_resize.cpp
new file mode 100644
index 0000000..0392c3a
--- /dev/null
+++ b/doc/examples/tut_matrix_resize.cpp

@@ -0,0 +1,18 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+int main()
+{
+  MatrixXd m(2,5);
+  m.resize(4,3);
+  std::cout << "The matrix m is of size "
+            << m.rows() << "x" << m.cols() << std::endl;
+  std::cout << "It has " << m.size() << " coefficients" << std::endl;
+  VectorXd v(2);
+  v.resize(5);
+  std::cout << "The vector v is of size " << v.size() << std::endl;
+  std::cout << "As a matrix, v is of size "
+            << v.rows() << "x" << v.cols() << std::endl;
+}

diff --git a/doc/examples/tut_matrix_resize_fixed_size.cpp b/doc/examples/tut_matrix_resize_fixed_size.cpp
new file mode 100644
index 0000000..dcbdfa7
--- /dev/null
+++ b/doc/examples/tut_matrix_resize_fixed_size.cpp

@@ -0,0 +1,12 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix4d m;
+  m.resize(4,4); // no operation
+  std::cout << "The matrix m is of size "
+            << m.rows() << "x" << m.cols() << std::endl;
+}

diff --git a/doc/ftv2node.png b/doc/ftv2node.png
new file mode 100644
index 0000000..63c605b
--- /dev/null
+++ b/doc/ftv2node.png
Binary files differ

diff --git a/doc/ftv2pnode.png b/doc/ftv2pnode.png
new file mode 100644
index 0000000..c6ee22f
--- /dev/null
+++ b/doc/ftv2pnode.png
Binary files differ

diff --git a/doc/snippets/.krazy b/doc/snippets/.krazy
new file mode 100644
index 0000000..00b9940
--- /dev/null
+++ b/doc/snippets/.krazy

@@ -0,0 +1,2 @@
+EXCLUDE copyright
+EXCLUDE license

diff --git a/doc/snippets/AngleAxis_mimic_euler.cpp b/doc/snippets/AngleAxis_mimic_euler.cpp
new file mode 100644
index 0000000..456de7f
--- /dev/null
+++ b/doc/snippets/AngleAxis_mimic_euler.cpp

@@ -0,0 +1,5 @@
+Matrix3f m;
+m = AngleAxisf(0.25*M_PI, Vector3f::UnitX())
+  * AngleAxisf(0.5*M_PI,  Vector3f::UnitY())
+  * AngleAxisf(0.33*M_PI, Vector3f::UnitZ());
+cout << m << endl << "is unitary: " << m.isUnitary() << endl;

diff --git a/doc/snippets/Array_initializer_list_23_cxx11.cpp b/doc/snippets/Array_initializer_list_23_cxx11.cpp
new file mode 100644
index 0000000..2c2166e
--- /dev/null
+++ b/doc/snippets/Array_initializer_list_23_cxx11.cpp

@@ -0,0 +1,5 @@
+ArrayXXi a {
+  {1, 2, 3},
+  {3, 4, 5}
+};
+cout << a << endl;

diff --git a/doc/snippets/Array_initializer_list_vector_cxx11.cpp b/doc/snippets/Array_initializer_list_vector_cxx11.cpp
new file mode 100644
index 0000000..a668d84
--- /dev/null
+++ b/doc/snippets/Array_initializer_list_vector_cxx11.cpp

@@ -0,0 +1,2 @@
+Array<int, Dynamic, 1> v {{1, 2, 3, 4, 5}};
+cout << v << endl;

diff --git a/doc/snippets/Array_variadic_ctor_cxx11.cpp b/doc/snippets/Array_variadic_ctor_cxx11.cpp
new file mode 100644
index 0000000..0e4ec44
--- /dev/null
+++ b/doc/snippets/Array_variadic_ctor_cxx11.cpp

@@ -0,0 +1,3 @@
+Array<int, 1, 6> a(1, 2, 3, 4, 5, 6);
+Array<int, 3, 1> b {1, 2, 3};
+cout << a << "\n\n" << b << endl;

diff --git a/doc/snippets/BiCGSTAB_simple.cpp b/doc/snippets/BiCGSTAB_simple.cpp
new file mode 100644
index 0000000..8c8829f
--- /dev/null
+++ b/doc/snippets/BiCGSTAB_simple.cpp

@@ -0,0 +1,11 @@
+  int n = 10000;
+  VectorXd x(n), b(n);
+  SparseMatrix<double> A(n,n);
+  /* ... fill A and b ... */ 
+  BiCGSTAB<SparseMatrix<double> > solver;
+  solver.compute(A);
+  x = solver.solve(b);
+  std::cout << "#iterations:     " << solver.iterations() << std::endl;
+  std::cout << "estimated error: " << solver.error()      << std::endl;
+  /* ... update b ... */
+  x = solver.solve(b); // solve again

diff --git a/doc/snippets/BiCGSTAB_step_by_step.cpp b/doc/snippets/BiCGSTAB_step_by_step.cpp
new file mode 100644
index 0000000..6c95d5a
--- /dev/null
+++ b/doc/snippets/BiCGSTAB_step_by_step.cpp

@@ -0,0 +1,14 @@
+  int n = 10000;
+  VectorXd x(n), b(n);
+  SparseMatrix<double> A(n,n);
+  /* ... fill A and b ... */ 
+  BiCGSTAB<SparseMatrix<double> > solver(A);
+  // start from a random solution
+  x = VectorXd::Random(n);
+  solver.setMaxIterations(1);
+  int i = 0;
+  do {
+    x = solver.solveWithGuess(b,x);
+    std::cout << i << " : " << solver.error() << std::endl;
+    ++i;
+  } while (solver.info()!=Success && i<100);

diff --git a/doc/snippets/CMakeLists.txt b/doc/snippets/CMakeLists.txt
new file mode 100644
index 0000000..65f195a
--- /dev/null
+++ b/doc/snippets/CMakeLists.txt

@@ -0,0 +1,36 @@
+file(GLOB snippets_SRCS "*.cpp")
+
+add_custom_target(all_snippets)
+
+foreach(snippet_src ${snippets_SRCS})
+  get_filename_component(snippet ${snippet_src} NAME_WE)
+  set(compile_snippet_target compile_${snippet})
+  set(compile_snippet_src ${compile_snippet_target}.cpp)
+  if((NOT ${snippet_src} MATCHES "cxx11") OR EIGEN_COMPILER_SUPPORT_CPP11)
+    file(READ ${snippet_src} snippet_source_code)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/compile_snippet.cpp.in
+                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+    add_executable(${compile_snippet_target}
+                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+    if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+      target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+    endif()
+    if(${snippet_src} MATCHES "cxx11")
+      set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-std=c++11")
+    endif()
+    if(${snippet_src} MATCHES "deprecated")
+      set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-DEIGEN_NO_DEPRECATED_WARNING")
+    endif()
+    add_custom_command(
+      TARGET ${compile_snippet_target}
+      POST_BUILD
+      COMMAND ${compile_snippet_target}
+      ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
+    )
+    add_dependencies(all_snippets ${compile_snippet_target})
+    set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
+                                PROPERTIES OBJECT_DEPENDS ${snippet_src})
+  else()
+    message("skip snippet ${snippet_src} because compiler does not support C++11")
+  endif()
+endforeach()

diff --git a/doc/snippets/ColPivHouseholderQR_solve.cpp b/doc/snippets/ColPivHouseholderQR_solve.cpp
new file mode 100644
index 0000000..b7b204a
--- /dev/null
+++ b/doc/snippets/ColPivHouseholderQR_solve.cpp

@@ -0,0 +1,8 @@
+Matrix3f m = Matrix3f::Random();
+Matrix3f y = Matrix3f::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the matrix y:" << endl << y << endl;
+Matrix3f x;
+x = m.colPivHouseholderQr().solve(y);
+assert(y.isApprox(m*x));
+cout << "Here is a solution x to the equation mx=y:" << endl << x << endl;

diff --git a/doc/snippets/ComplexEigenSolver_compute.cpp b/doc/snippets/ComplexEigenSolver_compute.cpp
new file mode 100644
index 0000000..11d6bd3
--- /dev/null
+++ b/doc/snippets/ComplexEigenSolver_compute.cpp

@@ -0,0 +1,16 @@
+MatrixXcf A = MatrixXcf::Random(4,4);
+cout << "Here is a random 4x4 matrix, A:" << endl << A << endl << endl;
+
+ComplexEigenSolver<MatrixXcf> ces;
+ces.compute(A);
+cout << "The eigenvalues of A are:" << endl << ces.eigenvalues() << endl;
+cout << "The matrix of eigenvectors, V, is:" << endl << ces.eigenvectors() << endl << endl;
+
+complex<float> lambda = ces.eigenvalues()[0];
+cout << "Consider the first eigenvalue, lambda = " << lambda << endl;
+VectorXcf v = ces.eigenvectors().col(0);
+cout << "If v is the corresponding eigenvector, then lambda * v = " << endl << lambda * v << endl;
+cout << "... and A * v = " << endl << A * v << endl << endl;
+
+cout << "Finally, V * D * V^(-1) = " << endl
+     << ces.eigenvectors() * ces.eigenvalues().asDiagonal() * ces.eigenvectors().inverse() << endl;

diff --git a/doc/snippets/ComplexEigenSolver_eigenvalues.cpp b/doc/snippets/ComplexEigenSolver_eigenvalues.cpp
new file mode 100644
index 0000000..5509bd8
--- /dev/null
+++ b/doc/snippets/ComplexEigenSolver_eigenvalues.cpp

@@ -0,0 +1,4 @@
+MatrixXcf ones = MatrixXcf::Ones(3,3);
+ComplexEigenSolver<MatrixXcf> ces(ones, /* computeEigenvectors = */ false);
+cout << "The eigenvalues of the 3x3 matrix of ones are:" 
+     << endl << ces.eigenvalues() << endl;

diff --git a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp
new file mode 100644
index 0000000..adeed9a
--- /dev/null
+++ b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp

@@ -0,0 +1,4 @@
+MatrixXcf ones = MatrixXcf::Ones(3,3);
+ComplexEigenSolver<MatrixXcf> ces(ones);
+cout << "The first eigenvector of the 3x3 matrix of ones is:" 
+     << endl << ces.eigenvectors().col(0) << endl;

diff --git a/doc/snippets/ComplexSchur_compute.cpp b/doc/snippets/ComplexSchur_compute.cpp
new file mode 100644
index 0000000..3a51701
--- /dev/null
+++ b/doc/snippets/ComplexSchur_compute.cpp

@@ -0,0 +1,6 @@
+MatrixXcf A = MatrixXcf::Random(4,4);
+ComplexSchur<MatrixXcf> schur(4);
+schur.compute(A);
+cout << "The matrix T in the decomposition of A is:" << endl << schur.matrixT() << endl;
+schur.compute(A.inverse());
+cout << "The matrix T in the decomposition of A^(-1) is:" << endl << schur.matrixT() << endl;

diff --git a/doc/snippets/ComplexSchur_matrixT.cpp b/doc/snippets/ComplexSchur_matrixT.cpp
new file mode 100644
index 0000000..8380571
--- /dev/null
+++ b/doc/snippets/ComplexSchur_matrixT.cpp

@@ -0,0 +1,4 @@
+MatrixXcf A = MatrixXcf::Random(4,4);
+cout << "Here is a random 4x4 matrix, A:" << endl << A << endl << endl;
+ComplexSchur<MatrixXcf> schurOfA(A, false); // false means do not compute U
+cout << "The triangular matrix T is:" << endl << schurOfA.matrixT() << endl;

diff --git a/doc/snippets/ComplexSchur_matrixU.cpp b/doc/snippets/ComplexSchur_matrixU.cpp
new file mode 100644
index 0000000..ba3d9c2
--- /dev/null
+++ b/doc/snippets/ComplexSchur_matrixU.cpp

@@ -0,0 +1,4 @@
+MatrixXcf A = MatrixXcf::Random(4,4);
+cout << "Here is a random 4x4 matrix, A:" << endl << A << endl << endl;
+ComplexSchur<MatrixXcf> schurOfA(A);
+cout << "The unitary matrix U is:" << endl << schurOfA.matrixU() << endl;

diff --git a/doc/snippets/Cwise_abs.cpp b/doc/snippets/Cwise_abs.cpp
new file mode 100644
index 0000000..0aeec3a
--- /dev/null
+++ b/doc/snippets/Cwise_abs.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,-2,-3);
+cout << v.abs() << endl;

diff --git a/doc/snippets/Cwise_abs2.cpp b/doc/snippets/Cwise_abs2.cpp
new file mode 100644
index 0000000..2c4f9b3
--- /dev/null
+++ b/doc/snippets/Cwise_abs2.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,-2,-3);
+cout << v.abs2() << endl;

diff --git a/doc/snippets/Cwise_acos.cpp b/doc/snippets/Cwise_acos.cpp
new file mode 100644
index 0000000..34432cb
--- /dev/null
+++ b/doc/snippets/Cwise_acos.cpp

@@ -0,0 +1,2 @@
+Array3d v(0, sqrt(2.)/2, 1);
+cout << v.acos() << endl;

diff --git a/doc/snippets/Cwise_arg.cpp b/doc/snippets/Cwise_arg.cpp
new file mode 100644
index 0000000..3f45133
--- /dev/null
+++ b/doc/snippets/Cwise_arg.cpp

@@ -0,0 +1,3 @@
+ArrayXcf v = ArrayXcf::Random(3);
+cout << v << endl << endl;
+cout << arg(v) << endl;

diff --git a/doc/snippets/Cwise_array_power_array.cpp b/doc/snippets/Cwise_array_power_array.cpp
new file mode 100644
index 0000000..432a76e
--- /dev/null
+++ b/doc/snippets/Cwise_array_power_array.cpp

@@ -0,0 +1,4 @@
+Array<double,1,3> x(8,25,3),
+                  e(1./3.,0.5,2.);
+cout << "[" << x << "]^[" << e << "] = " << x.pow(e) << endl; // using ArrayBase::pow
+cout << "[" << x << "]^[" << e << "] = " << pow(x,e) << endl; // using Eigen::pow

diff --git a/doc/snippets/Cwise_asin.cpp b/doc/snippets/Cwise_asin.cpp
new file mode 100644
index 0000000..8dad838
--- /dev/null
+++ b/doc/snippets/Cwise_asin.cpp

@@ -0,0 +1,2 @@
+Array3d v(0, sqrt(2.)/2, 1);
+cout << v.asin() << endl;

diff --git a/doc/snippets/Cwise_atan.cpp b/doc/snippets/Cwise_atan.cpp
new file mode 100644
index 0000000..4468447
--- /dev/null
+++ b/doc/snippets/Cwise_atan.cpp

@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << v.atan() << endl;

diff --git a/doc/snippets/Cwise_boolean_and.cpp b/doc/snippets/Cwise_boolean_and.cpp
new file mode 100644
index 0000000..df6b60d
--- /dev/null
+++ b/doc/snippets/Cwise_boolean_and.cpp

@@ -0,0 +1,2 @@
+Array3d v(-1,2,1), w(-3,2,3);
+cout << ((v<w) && (v<0)) << endl;

diff --git a/doc/snippets/Cwise_boolean_not.cpp b/doc/snippets/Cwise_boolean_not.cpp
new file mode 100644
index 0000000..40009f1
--- /dev/null
+++ b/doc/snippets/Cwise_boolean_not.cpp

@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << !isfinite(v) << endl;

diff --git a/doc/snippets/Cwise_boolean_or.cpp b/doc/snippets/Cwise_boolean_or.cpp
new file mode 100644
index 0000000..83eb006
--- /dev/null
+++ b/doc/snippets/Cwise_boolean_or.cpp

@@ -0,0 +1,2 @@
+Array3d v(-1,2,1), w(-3,2,3);
+cout << ((v<w) || (v<0)) << endl;

diff --git a/doc/snippets/Cwise_boolean_xor.cpp b/doc/snippets/Cwise_boolean_xor.cpp
new file mode 100644
index 0000000..fafbec8
--- /dev/null
+++ b/doc/snippets/Cwise_boolean_xor.cpp

@@ -0,0 +1,2 @@
+Array3d v(-1,2,1), w(-3,2,3);
+cout << ((v<w) ^ (v<0)) << endl;

diff --git a/doc/snippets/Cwise_ceil.cpp b/doc/snippets/Cwise_ceil.cpp
new file mode 100644
index 0000000..76cf661
--- /dev/null
+++ b/doc/snippets/Cwise_ceil.cpp

@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << ceil(v) << endl;

diff --git a/doc/snippets/Cwise_cos.cpp b/doc/snippets/Cwise_cos.cpp
new file mode 100644
index 0000000..f589f07
--- /dev/null
+++ b/doc/snippets/Cwise_cos.cpp

@@ -0,0 +1,2 @@
+Array3d v(M_PI, M_PI/2, M_PI/3);
+cout << v.cos() << endl;

diff --git a/doc/snippets/Cwise_cosh.cpp b/doc/snippets/Cwise_cosh.cpp
new file mode 100644
index 0000000..80ee75d
--- /dev/null
+++ b/doc/snippets/Cwise_cosh.cpp

@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << cosh(v) << endl;

diff --git a/doc/snippets/Cwise_cube.cpp b/doc/snippets/Cwise_cube.cpp
new file mode 100644
index 0000000..85e41dc
--- /dev/null
+++ b/doc/snippets/Cwise_cube.cpp

@@ -0,0 +1,2 @@
+Array3d v(2,3,4);
+cout << v.cube() << endl;

diff --git a/doc/snippets/Cwise_equal_equal.cpp b/doc/snippets/Cwise_equal_equal.cpp
new file mode 100644
index 0000000..0ba96f6
--- /dev/null
+++ b/doc/snippets/Cwise_equal_equal.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3), w(3,2,1);
+cout << (v==w) << endl;

diff --git a/doc/snippets/Cwise_exp.cpp b/doc/snippets/Cwise_exp.cpp
new file mode 100644
index 0000000..db23618
--- /dev/null
+++ b/doc/snippets/Cwise_exp.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3);
+cout << v.exp() << endl;

diff --git a/doc/snippets/Cwise_floor.cpp b/doc/snippets/Cwise_floor.cpp
new file mode 100644
index 0000000..73756b4
--- /dev/null
+++ b/doc/snippets/Cwise_floor.cpp

@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << floor(v) << endl;

diff --git a/doc/snippets/Cwise_greater.cpp b/doc/snippets/Cwise_greater.cpp
new file mode 100644
index 0000000..40ad029
--- /dev/null
+++ b/doc/snippets/Cwise_greater.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3), w(3,2,1);
+cout << (v>w) << endl;

diff --git a/doc/snippets/Cwise_greater_equal.cpp b/doc/snippets/Cwise_greater_equal.cpp
new file mode 100644
index 0000000..6a08f89
--- /dev/null
+++ b/doc/snippets/Cwise_greater_equal.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3), w(3,2,1);
+cout << (v>=w) << endl;

diff --git a/doc/snippets/Cwise_inverse.cpp b/doc/snippets/Cwise_inverse.cpp
new file mode 100644
index 0000000..3967a7e
--- /dev/null
+++ b/doc/snippets/Cwise_inverse.cpp

@@ -0,0 +1,2 @@
+Array3d v(2,3,4);
+cout << v.inverse() << endl;

diff --git a/doc/snippets/Cwise_isFinite.cpp b/doc/snippets/Cwise_isFinite.cpp
new file mode 100644
index 0000000..1da55fd
--- /dev/null
+++ b/doc/snippets/Cwise_isFinite.cpp

@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << isfinite(v) << endl;

diff --git a/doc/snippets/Cwise_isInf.cpp b/doc/snippets/Cwise_isInf.cpp
new file mode 100644
index 0000000..be79308
--- /dev/null
+++ b/doc/snippets/Cwise_isInf.cpp

@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << isinf(v) << endl;

diff --git a/doc/snippets/Cwise_isNaN.cpp b/doc/snippets/Cwise_isNaN.cpp
new file mode 100644
index 0000000..7b2a930
--- /dev/null
+++ b/doc/snippets/Cwise_isNaN.cpp

@@ -0,0 +1,5 @@
+Array3d v(1,2,3);
+v(1) *= 0.0/0.0;
+v(2) /= 0.0;
+cout << v << endl << endl;
+cout << isnan(v) << endl;

diff --git a/doc/snippets/Cwise_less.cpp b/doc/snippets/Cwise_less.cpp
new file mode 100644
index 0000000..cafd3b6
--- /dev/null
+++ b/doc/snippets/Cwise_less.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3), w(3,2,1);
+cout << (v<w) << endl;

diff --git a/doc/snippets/Cwise_less_equal.cpp b/doc/snippets/Cwise_less_equal.cpp
new file mode 100644
index 0000000..1600e39
--- /dev/null
+++ b/doc/snippets/Cwise_less_equal.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3), w(3,2,1);
+cout << (v<=w) << endl;

diff --git a/doc/snippets/Cwise_log.cpp b/doc/snippets/Cwise_log.cpp
new file mode 100644
index 0000000..f7aca72
--- /dev/null
+++ b/doc/snippets/Cwise_log.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3);
+cout << v.log() << endl;

diff --git a/doc/snippets/Cwise_log10.cpp b/doc/snippets/Cwise_log10.cpp
new file mode 100644
index 0000000..b7ae4a8
--- /dev/null
+++ b/doc/snippets/Cwise_log10.cpp

@@ -0,0 +1,2 @@
+Array4d v(-1,0,1,2);
+cout << log10(v) << endl;

diff --git a/doc/snippets/Cwise_max.cpp b/doc/snippets/Cwise_max.cpp
new file mode 100644
index 0000000..6602881
--- /dev/null
+++ b/doc/snippets/Cwise_max.cpp

@@ -0,0 +1,2 @@
+Array3d v(2,3,4), w(4,2,3);
+cout << v.max(w) << endl;

diff --git a/doc/snippets/Cwise_min.cpp b/doc/snippets/Cwise_min.cpp
new file mode 100644
index 0000000..1c01c76
--- /dev/null
+++ b/doc/snippets/Cwise_min.cpp

@@ -0,0 +1,2 @@
+Array3d v(2,3,4), w(4,2,3);
+cout << v.min(w) << endl;

diff --git a/doc/snippets/Cwise_minus.cpp b/doc/snippets/Cwise_minus.cpp
new file mode 100644
index 0000000..b89b9fb
--- /dev/null
+++ b/doc/snippets/Cwise_minus.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3);
+cout << v-5 << endl;

diff --git a/doc/snippets/Cwise_minus_equal.cpp b/doc/snippets/Cwise_minus_equal.cpp
new file mode 100644
index 0000000..dfde49d
--- /dev/null
+++ b/doc/snippets/Cwise_minus_equal.cpp

@@ -0,0 +1,3 @@
+Array3d v(1,2,3);
+v -= 5;
+cout << v << endl;

diff --git a/doc/snippets/Cwise_not_equal.cpp b/doc/snippets/Cwise_not_equal.cpp
new file mode 100644
index 0000000..57a407a
--- /dev/null
+++ b/doc/snippets/Cwise_not_equal.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3), w(3,2,1);
+cout << (v!=w) << endl;

diff --git a/doc/snippets/Cwise_plus.cpp b/doc/snippets/Cwise_plus.cpp
new file mode 100644
index 0000000..9d47327
--- /dev/null
+++ b/doc/snippets/Cwise_plus.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,3);
+cout << v+5 << endl;

diff --git a/doc/snippets/Cwise_plus_equal.cpp b/doc/snippets/Cwise_plus_equal.cpp
new file mode 100644
index 0000000..d744b1e
--- /dev/null
+++ b/doc/snippets/Cwise_plus_equal.cpp

@@ -0,0 +1,3 @@
+Array3d v(1,2,3);
+v += 5;
+cout << v << endl;

diff --git a/doc/snippets/Cwise_pow.cpp b/doc/snippets/Cwise_pow.cpp
new file mode 100644
index 0000000..a723ed8
--- /dev/null
+++ b/doc/snippets/Cwise_pow.cpp

@@ -0,0 +1,2 @@
+Array3d v(8,27,64);
+cout << v.pow(0.333333) << endl;

diff --git a/doc/snippets/Cwise_product.cpp b/doc/snippets/Cwise_product.cpp
new file mode 100644
index 0000000..714d66d
--- /dev/null
+++ b/doc/snippets/Cwise_product.cpp

@@ -0,0 +1,4 @@
+Array33i a = Array33i::Random(), b = Array33i::Random();
+Array33i c = a * b;
+cout << "a:\n" << a << "\nb:\n" << b << "\nc:\n" << c << endl;
+

diff --git a/doc/snippets/Cwise_quotient.cpp b/doc/snippets/Cwise_quotient.cpp
new file mode 100644
index 0000000..7cb9f7f
--- /dev/null
+++ b/doc/snippets/Cwise_quotient.cpp

@@ -0,0 +1,2 @@
+Array3d v(2,3,4), w(4,2,3);
+cout << v/w << endl;

diff --git a/doc/snippets/Cwise_rint.cpp b/doc/snippets/Cwise_rint.cpp
new file mode 100644
index 0000000..1dc7b2f
--- /dev/null
+++ b/doc/snippets/Cwise_rint.cpp

@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << rint(v) << endl;

diff --git a/doc/snippets/Cwise_round.cpp b/doc/snippets/Cwise_round.cpp
new file mode 100644
index 0000000..e5c8823
--- /dev/null
+++ b/doc/snippets/Cwise_round.cpp

@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << round(v) << endl;

diff --git a/doc/snippets/Cwise_scalar_power_array.cpp b/doc/snippets/Cwise_scalar_power_array.cpp
new file mode 100644
index 0000000..c968b2c
--- /dev/null
+++ b/doc/snippets/Cwise_scalar_power_array.cpp

@@ -0,0 +1,2 @@
+Array<double,1,3> e(2,-3,1./3.);
+cout << "10^[" << e << "] = " << pow(10,e) << endl;

diff --git a/doc/snippets/Cwise_sign.cpp b/doc/snippets/Cwise_sign.cpp
new file mode 100644
index 0000000..49920e4
--- /dev/null
+++ b/doc/snippets/Cwise_sign.cpp

@@ -0,0 +1,2 @@
+Array3d v(-3,5,0);
+cout << v.sign() << endl;

diff --git a/doc/snippets/Cwise_sin.cpp b/doc/snippets/Cwise_sin.cpp
new file mode 100644
index 0000000..46fa908
--- /dev/null
+++ b/doc/snippets/Cwise_sin.cpp

@@ -0,0 +1,2 @@
+Array3d v(M_PI, M_PI/2, M_PI/3);
+cout << v.sin() << endl;

diff --git a/doc/snippets/Cwise_sinh.cpp b/doc/snippets/Cwise_sinh.cpp
new file mode 100644
index 0000000..fac9b19
--- /dev/null
+++ b/doc/snippets/Cwise_sinh.cpp

@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << sinh(v) << endl;

diff --git a/doc/snippets/Cwise_slash_equal.cpp b/doc/snippets/Cwise_slash_equal.cpp
new file mode 100644
index 0000000..2efd32d
--- /dev/null
+++ b/doc/snippets/Cwise_slash_equal.cpp

@@ -0,0 +1,3 @@
+Array3d v(3,2,4), w(5,4,2);
+v /= w;
+cout << v << endl;

diff --git a/doc/snippets/Cwise_sqrt.cpp b/doc/snippets/Cwise_sqrt.cpp
new file mode 100644
index 0000000..97bafe8
--- /dev/null
+++ b/doc/snippets/Cwise_sqrt.cpp

@@ -0,0 +1,2 @@
+Array3d v(1,2,4);
+cout << v.sqrt() << endl;

diff --git a/doc/snippets/Cwise_square.cpp b/doc/snippets/Cwise_square.cpp
new file mode 100644
index 0000000..f704c5e
--- /dev/null
+++ b/doc/snippets/Cwise_square.cpp

@@ -0,0 +1,2 @@
+Array3d v(2,3,4);
+cout << v.square() << endl;

diff --git a/doc/snippets/Cwise_tan.cpp b/doc/snippets/Cwise_tan.cpp
new file mode 100644
index 0000000..b758ef0
--- /dev/null
+++ b/doc/snippets/Cwise_tan.cpp

@@ -0,0 +1,2 @@
+Array3d v(M_PI, M_PI/2, M_PI/3);
+cout << v.tan() << endl;

diff --git a/doc/snippets/Cwise_tanh.cpp b/doc/snippets/Cwise_tanh.cpp
new file mode 100644
index 0000000..30cd045
--- /dev/null
+++ b/doc/snippets/Cwise_tanh.cpp

@@ -0,0 +1,2 @@
+ArrayXd v = ArrayXd::LinSpaced(5,0,1);
+cout << tanh(v) << endl;

diff --git a/doc/snippets/Cwise_times_equal.cpp b/doc/snippets/Cwise_times_equal.cpp
new file mode 100644
index 0000000..147556c
--- /dev/null
+++ b/doc/snippets/Cwise_times_equal.cpp

@@ -0,0 +1,3 @@
+Array3d v(1,2,3), w(2,3,0);
+v *= w;
+cout << v << endl;

diff --git a/doc/snippets/DenseBase_LinSpaced.cpp b/doc/snippets/DenseBase_LinSpaced.cpp
new file mode 100644
index 0000000..8e54b17
--- /dev/null
+++ b/doc/snippets/DenseBase_LinSpaced.cpp

@@ -0,0 +1,2 @@
+cout << VectorXi::LinSpaced(4,7,10).transpose() << endl;
+cout << VectorXd::LinSpaced(5,0.0,1.0).transpose() << endl;

diff --git a/doc/snippets/DenseBase_LinSpacedInt.cpp b/doc/snippets/DenseBase_LinSpacedInt.cpp
new file mode 100644
index 0000000..0d7ae06
--- /dev/null
+++ b/doc/snippets/DenseBase_LinSpacedInt.cpp

@@ -0,0 +1,8 @@
+cout << "Even spacing inputs:" << endl;
+cout << VectorXi::LinSpaced(8,1,4).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,8).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,15).transpose() << endl;
+cout << "Uneven spacing inputs:" << endl;
+cout << VectorXi::LinSpaced(8,1,7).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,9).transpose() << endl;
+cout << VectorXi::LinSpaced(8,1,16).transpose() << endl;

diff --git a/doc/snippets/DenseBase_LinSpaced_seq_deprecated.cpp b/doc/snippets/DenseBase_LinSpaced_seq_deprecated.cpp
new file mode 100644
index 0000000..f55c508
--- /dev/null
+++ b/doc/snippets/DenseBase_LinSpaced_seq_deprecated.cpp

@@ -0,0 +1,2 @@
+cout << VectorXi::LinSpaced(Sequential,4,7,10).transpose() << endl;
+cout << VectorXd::LinSpaced(Sequential,5,0.0,1.0).transpose() << endl;

diff --git a/doc/snippets/DenseBase_setLinSpaced.cpp b/doc/snippets/DenseBase_setLinSpaced.cpp
new file mode 100644
index 0000000..46054f2
--- /dev/null
+++ b/doc/snippets/DenseBase_setLinSpaced.cpp

@@ -0,0 +1,3 @@
+VectorXf v;
+v.setLinSpaced(5,0.5f,1.5f);
+cout << v << endl;

diff --git a/doc/snippets/DirectionWise_hnormalized.cpp b/doc/snippets/DirectionWise_hnormalized.cpp
new file mode 100644
index 0000000..2451f6e
--- /dev/null
+++ b/doc/snippets/DirectionWise_hnormalized.cpp

@@ -0,0 +1,6 @@
+Matrix4Xd M = Matrix4Xd::Random(4,5);
+Projective3d P(Matrix4d::Random());
+cout << "The matrix M is:" << endl << M << endl << endl;
+cout << "M.colwise().hnormalized():" << endl << M.colwise().hnormalized() << endl << endl;
+cout << "P*M:" << endl << P*M << endl << endl;
+cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl;

diff --git a/doc/snippets/DirectionWise_replicate.cpp b/doc/snippets/DirectionWise_replicate.cpp
new file mode 100644
index 0000000..d92d4a3
--- /dev/null
+++ b/doc/snippets/DirectionWise_replicate.cpp

@@ -0,0 +1,4 @@
+MatrixXi m = MatrixXi::Random(2,3);
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "m.colwise().replicate<3>() = ..." << endl;
+cout << m.colwise().replicate<3>() << endl;

diff --git a/doc/snippets/DirectionWise_replicate_int.cpp b/doc/snippets/DirectionWise_replicate_int.cpp
new file mode 100644
index 0000000..f9b1b53
--- /dev/null
+++ b/doc/snippets/DirectionWise_replicate_int.cpp

@@ -0,0 +1,4 @@
+Vector3i v = Vector3i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "v.rowwise().replicate(5) = ..." << endl;
+cout << v.rowwise().replicate(5) << endl;

diff --git a/doc/snippets/EigenSolver_EigenSolver_MatrixType.cpp b/doc/snippets/EigenSolver_EigenSolver_MatrixType.cpp
new file mode 100644
index 0000000..c1d9fa8
--- /dev/null
+++ b/doc/snippets/EigenSolver_EigenSolver_MatrixType.cpp

@@ -0,0 +1,16 @@
+MatrixXd A = MatrixXd::Random(6,6);
+cout << "Here is a random 6x6 matrix, A:" << endl << A << endl << endl;
+
+EigenSolver<MatrixXd> es(A);
+cout << "The eigenvalues of A are:" << endl << es.eigenvalues() << endl;
+cout << "The matrix of eigenvectors, V, is:" << endl << es.eigenvectors() << endl << endl;
+
+complex<double> lambda = es.eigenvalues()[0];
+cout << "Consider the first eigenvalue, lambda = " << lambda << endl;
+VectorXcd v = es.eigenvectors().col(0);
+cout << "If v is the corresponding eigenvector, then lambda * v = " << endl << lambda * v << endl;
+cout << "... and A * v = " << endl << A.cast<complex<double> >() * v << endl << endl;
+
+MatrixXcd D = es.eigenvalues().asDiagonal();
+MatrixXcd V = es.eigenvectors();
+cout << "Finally, V * D * V^(-1) = " << endl << V * D * V.inverse() << endl;

diff --git a/doc/snippets/EigenSolver_compute.cpp b/doc/snippets/EigenSolver_compute.cpp
new file mode 100644
index 0000000..a5c96e9
--- /dev/null
+++ b/doc/snippets/EigenSolver_compute.cpp

@@ -0,0 +1,6 @@
+EigenSolver<MatrixXf> es;
+MatrixXf A = MatrixXf::Random(4,4);
+es.compute(A, /* computeEigenvectors = */ false);
+cout << "The eigenvalues of A are: " << es.eigenvalues().transpose() << endl;
+es.compute(A + MatrixXf::Identity(4,4), false); // re-use es to compute eigenvalues of A+I
+cout << "The eigenvalues of A+I are: " << es.eigenvalues().transpose() << endl;

diff --git a/doc/snippets/EigenSolver_eigenvalues.cpp b/doc/snippets/EigenSolver_eigenvalues.cpp
new file mode 100644
index 0000000..ed28869
--- /dev/null
+++ b/doc/snippets/EigenSolver_eigenvalues.cpp

@@ -0,0 +1,4 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+EigenSolver<MatrixXd> es(ones, false);
+cout << "The eigenvalues of the 3x3 matrix of ones are:" 
+     << endl << es.eigenvalues() << endl;

diff --git a/doc/snippets/EigenSolver_eigenvectors.cpp b/doc/snippets/EigenSolver_eigenvectors.cpp
new file mode 100644
index 0000000..8355f76
--- /dev/null
+++ b/doc/snippets/EigenSolver_eigenvectors.cpp

@@ -0,0 +1,4 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+EigenSolver<MatrixXd> es(ones);
+cout << "The first eigenvector of the 3x3 matrix of ones is:"
+     << endl << es.eigenvectors().col(0) << endl;

diff --git a/doc/snippets/EigenSolver_pseudoEigenvectors.cpp b/doc/snippets/EigenSolver_pseudoEigenvectors.cpp
new file mode 100644
index 0000000..85e2569
--- /dev/null
+++ b/doc/snippets/EigenSolver_pseudoEigenvectors.cpp

@@ -0,0 +1,9 @@
+MatrixXd A = MatrixXd::Random(6,6);
+cout << "Here is a random 6x6 matrix, A:" << endl << A << endl << endl;
+
+EigenSolver<MatrixXd> es(A);
+MatrixXd D = es.pseudoEigenvalueMatrix();
+MatrixXd V = es.pseudoEigenvectors();
+cout << "The pseudo-eigenvalue matrix D is:" << endl << D << endl;
+cout << "The pseudo-eigenvector matrix V is:" << endl << V << endl;
+cout << "Finally, V * D * V^(-1) = " << endl << V * D * V.inverse() << endl;

diff --git a/doc/snippets/FullPivHouseholderQR_solve.cpp b/doc/snippets/FullPivHouseholderQR_solve.cpp
new file mode 100644
index 0000000..23bc074
--- /dev/null
+++ b/doc/snippets/FullPivHouseholderQR_solve.cpp

@@ -0,0 +1,8 @@
+Matrix3f m = Matrix3f::Random();
+Matrix3f y = Matrix3f::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the matrix y:" << endl << y << endl;
+Matrix3f x;
+x = m.fullPivHouseholderQr().solve(y);
+assert(y.isApprox(m*x));
+cout << "Here is a solution x to the equation mx=y:" << endl << x << endl;

diff --git a/doc/snippets/FullPivLU_image.cpp b/doc/snippets/FullPivLU_image.cpp
new file mode 100644
index 0000000..817bc1e
--- /dev/null
+++ b/doc/snippets/FullPivLU_image.cpp

@@ -0,0 +1,9 @@
+Matrix3d m;
+m << 1,1,0,
+     1,3,2,
+     0,1,1;
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Notice that the middle column is the sum of the two others, so the "
+     << "columns are linearly dependent." << endl;
+cout << "Here is a matrix whose columns have the same span but are linearly independent:"
+     << endl << m.fullPivLu().image(m) << endl;

diff --git a/doc/snippets/FullPivLU_kernel.cpp b/doc/snippets/FullPivLU_kernel.cpp
new file mode 100644
index 0000000..7086e01
--- /dev/null
+++ b/doc/snippets/FullPivLU_kernel.cpp

@@ -0,0 +1,7 @@
+MatrixXf m = MatrixXf::Random(3,5);
+cout << "Here is the matrix m:" << endl << m << endl;
+MatrixXf ker = m.fullPivLu().kernel();
+cout << "Here is a matrix whose columns form a basis of the kernel of m:"
+     << endl << ker << endl;
+cout << "By definition of the kernel, m*ker is zero:"
+     << endl << m*ker << endl;

diff --git a/doc/snippets/FullPivLU_solve.cpp b/doc/snippets/FullPivLU_solve.cpp
new file mode 100644
index 0000000..c1f8823
--- /dev/null
+++ b/doc/snippets/FullPivLU_solve.cpp

@@ -0,0 +1,11 @@
+Matrix<float,2,3> m = Matrix<float,2,3>::Random();
+Matrix2f y = Matrix2f::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the matrix y:" << endl << y << endl;
+Matrix<float,3,2> x = m.fullPivLu().solve(y);
+if((m*x).isApprox(y))
+{
+  cout << "Here is a solution x to the equation mx=y:" << endl << x << endl;
+}
+else
+  cout << "The equation mx=y does not have any solution." << endl;

diff --git a/doc/snippets/GeneralizedEigenSolver.cpp b/doc/snippets/GeneralizedEigenSolver.cpp
new file mode 100644
index 0000000..2acda45
--- /dev/null
+++ b/doc/snippets/GeneralizedEigenSolver.cpp

@@ -0,0 +1,7 @@
+GeneralizedEigenSolver<MatrixXf> ges;
+MatrixXf A = MatrixXf::Random(4,4);
+MatrixXf B = MatrixXf::Random(4,4);
+ges.compute(A, B);
+cout << "The (complex) numerators of the generalzied eigenvalues are: " << ges.alphas().transpose() << endl;
+cout << "The (real) denominatore of the generalzied eigenvalues are: " << ges.betas().transpose() << endl;
+cout << "The (complex) generalzied eigenvalues are (alphas./beta): " << ges.eigenvalues().transpose() << endl;

diff --git a/doc/snippets/HessenbergDecomposition_compute.cpp b/doc/snippets/HessenbergDecomposition_compute.cpp
new file mode 100644
index 0000000..50e3783
--- /dev/null
+++ b/doc/snippets/HessenbergDecomposition_compute.cpp

@@ -0,0 +1,6 @@
+MatrixXcf A = MatrixXcf::Random(4,4);
+HessenbergDecomposition<MatrixXcf> hd(4);
+hd.compute(A);
+cout << "The matrix H in the decomposition of A is:" << endl << hd.matrixH() << endl;
+hd.compute(2*A); // re-use hd to compute and store decomposition of 2A
+cout << "The matrix H in the decomposition of 2A is:" << endl << hd.matrixH() << endl;

diff --git a/doc/snippets/HessenbergDecomposition_matrixH.cpp b/doc/snippets/HessenbergDecomposition_matrixH.cpp
new file mode 100644
index 0000000..af01366
--- /dev/null
+++ b/doc/snippets/HessenbergDecomposition_matrixH.cpp

@@ -0,0 +1,8 @@
+Matrix4f A = MatrixXf::Random(4,4);
+cout << "Here is a random 4x4 matrix:" << endl << A << endl;
+HessenbergDecomposition<MatrixXf> hessOfA(A);
+MatrixXf H = hessOfA.matrixH();
+cout << "The Hessenberg matrix H is:" << endl << H << endl;
+MatrixXf Q = hessOfA.matrixQ();
+cout << "The orthogonal matrix Q is:" << endl << Q << endl;
+cout << "Q H Q^T is:" << endl << Q * H * Q.transpose() << endl;

diff --git a/doc/snippets/HessenbergDecomposition_packedMatrix.cpp b/doc/snippets/HessenbergDecomposition_packedMatrix.cpp
new file mode 100644
index 0000000..4fa5957
--- /dev/null
+++ b/doc/snippets/HessenbergDecomposition_packedMatrix.cpp

@@ -0,0 +1,9 @@
+Matrix4d A = Matrix4d::Random(4,4);
+cout << "Here is a random 4x4 matrix:" << endl << A << endl;
+HessenbergDecomposition<Matrix4d> hessOfA(A);
+Matrix4d pm = hessOfA.packedMatrix();
+cout << "The packed matrix M is:" << endl << pm << endl;
+cout << "The upper Hessenberg part corresponds to the matrix H, which is:" 
+     << endl << hessOfA.matrixH() << endl;
+Vector3d hc = hessOfA.householderCoefficients();
+cout << "The vector of Householder coefficients is:" << endl << hc << endl;

diff --git a/doc/snippets/HouseholderQR_householderQ.cpp b/doc/snippets/HouseholderQR_householderQ.cpp
new file mode 100644
index 0000000..e859ce5
--- /dev/null
+++ b/doc/snippets/HouseholderQR_householderQ.cpp

@@ -0,0 +1,7 @@
+MatrixXf A(MatrixXf::Random(5,3)), thinQ(MatrixXf::Identity(5,3)), Q;
+A.setRandom();
+HouseholderQR<MatrixXf> qr(A);
+Q = qr.householderQ();
+thinQ = qr.householderQ() * thinQ;
+std::cout << "The complete unitary matrix Q is:\n" << Q << "\n\n";
+std::cout << "The thin matrix Q is:\n" << thinQ << "\n\n";

diff --git a/doc/snippets/HouseholderQR_solve.cpp b/doc/snippets/HouseholderQR_solve.cpp
new file mode 100644
index 0000000..8cce6ce
--- /dev/null
+++ b/doc/snippets/HouseholderQR_solve.cpp

@@ -0,0 +1,9 @@
+typedef Matrix<float,3,3> Matrix3x3;
+Matrix3x3 m = Matrix3x3::Random();
+Matrix3f y = Matrix3f::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the matrix y:" << endl << y << endl;
+Matrix3f x;
+x = m.householderQr().solve(y);
+assert(y.isApprox(m*x));
+cout << "Here is a solution x to the equation mx=y:" << endl << x << endl;

diff --git a/doc/snippets/HouseholderSequence_HouseholderSequence.cpp b/doc/snippets/HouseholderSequence_HouseholderSequence.cpp
new file mode 100644
index 0000000..2632b83
--- /dev/null
+++ b/doc/snippets/HouseholderSequence_HouseholderSequence.cpp

@@ -0,0 +1,31 @@
+Matrix3d v = Matrix3d::Random();
+cout << "The matrix v is:" << endl;
+cout << v << endl;
+
+Vector3d v0(1, v(1,0), v(2,0));
+cout << "The first Householder vector is: v_0 = " << v0.transpose() << endl;
+Vector3d v1(0, 1, v(2,1));
+cout << "The second Householder vector is: v_1 = " << v1.transpose()  << endl;
+Vector3d v2(0, 0, 1);
+cout << "The third Householder vector is: v_2 = " << v2.transpose() << endl;
+
+Vector3d h = Vector3d::Random();
+cout << "The Householder coefficients are: h = " << h.transpose() << endl;
+
+Matrix3d H0 = Matrix3d::Identity() - h(0) * v0 * v0.adjoint();
+cout << "The first Householder reflection is represented by H_0 = " << endl;
+cout << H0 << endl;
+Matrix3d H1 = Matrix3d::Identity() - h(1) * v1 * v1.adjoint();
+cout << "The second Householder reflection is represented by H_1 = " << endl;
+cout << H1 << endl;
+Matrix3d H2 = Matrix3d::Identity() - h(2) * v2 * v2.adjoint();
+cout << "The third Householder reflection is represented by H_2 = " << endl;
+cout << H2 << endl;
+cout << "Their product is H_0 H_1 H_2 = " << endl;
+cout << H0 * H1 * H2 << endl;
+
+HouseholderSequence<Matrix3d, Vector3d> hhSeq(v, h);
+Matrix3d hhSeqAsMatrix(hhSeq);
+cout << "If we construct a HouseholderSequence from v and h" << endl;
+cout << "and convert it to a matrix, we get:" << endl;
+cout << hhSeqAsMatrix << endl;

diff --git a/doc/snippets/IOFormat.cpp b/doc/snippets/IOFormat.cpp
new file mode 100644
index 0000000..735f5dd
--- /dev/null
+++ b/doc/snippets/IOFormat.cpp

@@ -0,0 +1,14 @@
+std::string sep = "\n----------------------------------------\n";
+Matrix3d m1;
+m1 << 1.111111, 2, 3.33333, 4, 5, 6, 7, 8.888888, 9;
+
+IOFormat CommaInitFmt(StreamPrecision, DontAlignCols, ", ", ", ", "", "", " << ", ";");
+IOFormat CleanFmt(4, 0, ", ", "\n", "[", "]");
+IOFormat OctaveFmt(StreamPrecision, 0, ", ", ";\n", "", "", "[", "]");
+IOFormat HeavyFmt(FullPrecision, 0, ", ", ";\n", "[", "]", "[", "]");
+
+std::cout << m1 << sep;
+std::cout << m1.format(CommaInitFmt) << sep;
+std::cout << m1.format(CleanFmt) << sep;
+std::cout << m1.format(OctaveFmt) << sep;
+std::cout << m1.format(HeavyFmt) << sep;

diff --git a/doc/snippets/JacobiSVD_basic.cpp b/doc/snippets/JacobiSVD_basic.cpp
new file mode 100644
index 0000000..ab24b9b
--- /dev/null
+++ b/doc/snippets/JacobiSVD_basic.cpp

@@ -0,0 +1,9 @@
+MatrixXf m = MatrixXf::Random(3,2);
+cout << "Here is the matrix m:" << endl << m << endl;
+JacobiSVD<MatrixXf> svd(m, ComputeThinU | ComputeThinV);
+cout << "Its singular values are:" << endl << svd.singularValues() << endl;
+cout << "Its left singular vectors are the columns of the thin U matrix:" << endl << svd.matrixU() << endl;
+cout << "Its right singular vectors are the columns of the thin V matrix:" << endl << svd.matrixV() << endl;
+Vector3f rhs(1, 0, 0);
+cout << "Now consider this rhs vector:" << endl << rhs << endl;
+cout << "A least-squares solution of m*x = rhs is:" << endl << svd.solve(rhs) << endl;

diff --git a/doc/snippets/Jacobi_makeGivens.cpp b/doc/snippets/Jacobi_makeGivens.cpp
new file mode 100644
index 0000000..6f8ec05
--- /dev/null
+++ b/doc/snippets/Jacobi_makeGivens.cpp

@@ -0,0 +1,6 @@
+Vector2f v = Vector2f::Random();
+JacobiRotation<float> G;
+G.makeGivens(v.x(), v.y());
+cout << "Here is the vector v:" << endl << v << endl;
+v.applyOnTheLeft(0, 1, G.adjoint());
+cout << "Here is the vector J' * v:" << endl << v << endl;

diff --git a/doc/snippets/Jacobi_makeJacobi.cpp b/doc/snippets/Jacobi_makeJacobi.cpp
new file mode 100644
index 0000000..a86e80a
--- /dev/null
+++ b/doc/snippets/Jacobi_makeJacobi.cpp

@@ -0,0 +1,8 @@
+Matrix2f m = Matrix2f::Random();
+m = (m + m.adjoint()).eval();
+JacobiRotation<float> J;
+J.makeJacobi(m, 0, 1);
+cout << "Here is the matrix m:" << endl << m << endl;
+m.applyOnTheLeft(0, 1, J.adjoint());
+m.applyOnTheRight(0, 1, J);
+cout << "Here is the matrix J' * m * J:" << endl << m << endl;

diff --git a/doc/snippets/LLT_example.cpp b/doc/snippets/LLT_example.cpp
new file mode 100644
index 0000000..46fb407
--- /dev/null
+++ b/doc/snippets/LLT_example.cpp

@@ -0,0 +1,12 @@
+MatrixXd A(3,3);
+A << 4,-1,2, -1,6,0, 2,0,5;
+cout << "The matrix A is" << endl << A << endl;
+
+LLT<MatrixXd> lltOfA(A); // compute the Cholesky decomposition of A
+MatrixXd L = lltOfA.matrixL(); // retrieve factor L  in the decomposition
+// The previous two lines can also be written as "L = A.llt().matrixL()"
+
+cout << "The Cholesky factor L is" << endl << L << endl;
+cout << "To check this, let us compute L * L.transpose()" << endl;
+cout << L * L.transpose() << endl;
+cout << "This should equal the matrix A" << endl;

diff --git a/doc/snippets/LLT_solve.cpp b/doc/snippets/LLT_solve.cpp
new file mode 100644
index 0000000..7095d2c
--- /dev/null
+++ b/doc/snippets/LLT_solve.cpp

@@ -0,0 +1,8 @@
+typedef Matrix<float,Dynamic,2> DataMatrix;
+// let's generate some samples on the 3D plane of equation z = 2x+3y (with some noise)
+DataMatrix samples = DataMatrix::Random(12,2);
+VectorXf elevations = 2*samples.col(0) + 3*samples.col(1) + VectorXf::Random(12)*0.1;
+// and let's solve samples * [x y]^T = elevations in least square sense:
+Matrix<float,2,1> xy
+ = (samples.adjoint() * samples).llt().solve((samples.adjoint()*elevations));
+cout << xy << endl;

diff --git a/doc/snippets/LeastSquaresNormalEquations.cpp b/doc/snippets/LeastSquaresNormalEquations.cpp
new file mode 100644
index 0000000..997cf17
--- /dev/null
+++ b/doc/snippets/LeastSquaresNormalEquations.cpp

@@ -0,0 +1,4 @@
+MatrixXf A = MatrixXf::Random(3, 2);
+VectorXf b = VectorXf::Random(3);
+cout << "The solution using normal equations is:\n"
+     << (A.transpose() * A).ldlt().solve(A.transpose() * b) << endl;

diff --git a/doc/snippets/LeastSquaresQR.cpp b/doc/snippets/LeastSquaresQR.cpp
new file mode 100644
index 0000000..6c97045
--- /dev/null
+++ b/doc/snippets/LeastSquaresQR.cpp

@@ -0,0 +1,4 @@
+MatrixXf A = MatrixXf::Random(3, 2);
+VectorXf b = VectorXf::Random(3);
+cout << "The solution using the QR decomposition is:\n"
+     << A.colPivHouseholderQr().solve(b) << endl;

diff --git a/doc/snippets/Map_general_stride.cpp b/doc/snippets/Map_general_stride.cpp
new file mode 100644
index 0000000..0657e7f
--- /dev/null
+++ b/doc/snippets/Map_general_stride.cpp

@@ -0,0 +1,5 @@
+int array[24];
+for(int i = 0; i < 24; ++i) array[i] = i;
+cout << Map<MatrixXi, 0, Stride<Dynamic,2> >
+         (array, 3, 3, Stride<Dynamic,2>(8, 2))
+     << endl;

diff --git a/doc/snippets/Map_inner_stride.cpp b/doc/snippets/Map_inner_stride.cpp
new file mode 100644
index 0000000..d95ae9b
--- /dev/null
+++ b/doc/snippets/Map_inner_stride.cpp

@@ -0,0 +1,5 @@
+int array[12];
+for(int i = 0; i < 12; ++i) array[i] = i;
+cout << Map<VectorXi, 0, InnerStride<2> >
+         (array, 6) // the inner stride has already been passed as template parameter
+     << endl;

diff --git a/doc/snippets/Map_outer_stride.cpp b/doc/snippets/Map_outer_stride.cpp
new file mode 100644
index 0000000..2f6f052
--- /dev/null
+++ b/doc/snippets/Map_outer_stride.cpp

@@ -0,0 +1,3 @@
+int array[12];
+for(int i = 0; i < 12; ++i) array[i] = i;
+cout << Map<MatrixXi, 0, OuterStride<> >(array, 3, 3, OuterStride<>(4)) << endl;

diff --git a/doc/snippets/Map_placement_new.cpp b/doc/snippets/Map_placement_new.cpp
new file mode 100644
index 0000000..83b83a8
--- /dev/null
+++ b/doc/snippets/Map_placement_new.cpp

@@ -0,0 +1,5 @@
+int data[] = {1,2,3,4,5,6,7,8,9};
+Map<RowVectorXi> v(data,4);
+cout << "The mapped vector v is: " << v << "\n";
+new (&v) Map<RowVectorXi>(data+4,5);
+cout << "Now v is: " << v << "\n";

diff --git a/doc/snippets/Map_simple.cpp b/doc/snippets/Map_simple.cpp
new file mode 100644
index 0000000..423bb52
--- /dev/null
+++ b/doc/snippets/Map_simple.cpp

@@ -0,0 +1,3 @@
+int array[9];
+for(int i = 0; i < 9; ++i) array[i] = i;
+cout << Map<Matrix3i>(array) << endl;

diff --git a/doc/snippets/MatrixBase_adjoint.cpp b/doc/snippets/MatrixBase_adjoint.cpp
new file mode 100644
index 0000000..4680d59
--- /dev/null
+++ b/doc/snippets/MatrixBase_adjoint.cpp

@@ -0,0 +1,3 @@
+Matrix2cf m = Matrix2cf::Random();
+cout << "Here is the 2x2 complex matrix m:" << endl << m << endl;
+cout << "Here is the adjoint of m:" << endl << m.adjoint() << endl;

diff --git a/doc/snippets/MatrixBase_all.cpp b/doc/snippets/MatrixBase_all.cpp
new file mode 100644
index 0000000..46f26f1
--- /dev/null
+++ b/doc/snippets/MatrixBase_all.cpp

@@ -0,0 +1,7 @@
+Vector3f boxMin(Vector3f::Zero()), boxMax(Vector3f::Ones());
+Vector3f p0 = Vector3f::Random(), p1 = Vector3f::Random().cwiseAbs();
+// let's check if p0 and p1 are inside the axis aligned box defined by the corners boxMin,boxMax:
+cout << "Is (" << p0.transpose() << ") inside the box: "
+     << ((boxMin.array()<p0.array()).all() && (boxMax.array()>p0.array()).all()) << endl;
+cout << "Is (" << p1.transpose() << ") inside the box: "
+     << ((boxMin.array()<p1.array()).all() && (boxMax.array()>p1.array()).all()) << endl;

diff --git a/doc/snippets/MatrixBase_applyOnTheLeft.cpp b/doc/snippets/MatrixBase_applyOnTheLeft.cpp
new file mode 100644
index 0000000..6398c87
--- /dev/null
+++ b/doc/snippets/MatrixBase_applyOnTheLeft.cpp

@@ -0,0 +1,7 @@
+Matrix3f A = Matrix3f::Random(3,3), B;
+B << 0,1,0,  
+     0,0,1,  
+     1,0,0;
+cout << "At start, A = " << endl << A << endl;
+A.applyOnTheLeft(B); 
+cout << "After applyOnTheLeft, A = " << endl << A << endl;

diff --git a/doc/snippets/MatrixBase_applyOnTheRight.cpp b/doc/snippets/MatrixBase_applyOnTheRight.cpp
new file mode 100644
index 0000000..e4b71b2
--- /dev/null
+++ b/doc/snippets/MatrixBase_applyOnTheRight.cpp

@@ -0,0 +1,9 @@
+Matrix3f A = Matrix3f::Random(3,3), B;
+B << 0,1,0,  
+     0,0,1,  
+     1,0,0;
+cout << "At start, A = " << endl << A << endl;
+A *= B;
+cout << "After A *= B, A = " << endl << A << endl;
+A.applyOnTheRight(B);  // equivalent to A *= B
+cout << "After applyOnTheRight, A = " << endl << A << endl;

diff --git a/doc/snippets/MatrixBase_array.cpp b/doc/snippets/MatrixBase_array.cpp
new file mode 100644
index 0000000..f215086
--- /dev/null
+++ b/doc/snippets/MatrixBase_array.cpp

@@ -0,0 +1,4 @@
+Vector3d v(1,2,3);
+v.array() += 3;
+v.array() -= 2;
+cout << v << endl;

diff --git a/doc/snippets/MatrixBase_array_const.cpp b/doc/snippets/MatrixBase_array_const.cpp
new file mode 100644
index 0000000..cd3b26a
--- /dev/null
+++ b/doc/snippets/MatrixBase_array_const.cpp

@@ -0,0 +1,4 @@
+Vector3d v(-1,2,-3);
+cout << "the absolute values:" << endl << v.array().abs() << endl;
+cout << "the absolute values plus one:" << endl << v.array().abs()+1 << endl;
+cout << "sum of the squares: " << v.array().square().sum() << endl;

diff --git a/doc/snippets/MatrixBase_asDiagonal.cpp b/doc/snippets/MatrixBase_asDiagonal.cpp
new file mode 100644
index 0000000..b01082d
--- /dev/null
+++ b/doc/snippets/MatrixBase_asDiagonal.cpp

@@ -0,0 +1 @@
+cout << Matrix3i(Vector3i(2,5,6).asDiagonal()) << endl;

diff --git a/doc/snippets/MatrixBase_block_int_int.cpp b/doc/snippets/MatrixBase_block_int_int.cpp
new file mode 100644
index 0000000..f99b6d4
--- /dev/null
+++ b/doc/snippets/MatrixBase_block_int_int.cpp

@@ -0,0 +1,5 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.block<2,2>(1,1):" << endl << m.block<2,2>(1,1) << endl;
+m.block<2,2>(1,1).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_block_int_int_int_int.cpp b/doc/snippets/MatrixBase_block_int_int_int_int.cpp
new file mode 100644
index 0000000..7238cbb
--- /dev/null
+++ b/doc/snippets/MatrixBase_block_int_int_int_int.cpp

@@ -0,0 +1,5 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.block(1, 1, 2, 2):" << endl << m.block(1, 1, 2, 2) << endl;
+m.block(1, 1, 2, 2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_bottomLeftCorner_int_int.cpp b/doc/snippets/MatrixBase_bottomLeftCorner_int_int.cpp
new file mode 100644
index 0000000..ebae95e
--- /dev/null
+++ b/doc/snippets/MatrixBase_bottomLeftCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.bottomLeftCorner(2, 2):" << endl;
+cout << m.bottomLeftCorner(2, 2) << endl;
+m.bottomLeftCorner(2, 2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_bottomRightCorner_int_int.cpp b/doc/snippets/MatrixBase_bottomRightCorner_int_int.cpp
new file mode 100644
index 0000000..bf05093
--- /dev/null
+++ b/doc/snippets/MatrixBase_bottomRightCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.bottomRightCorner(2, 2):" << endl;
+cout << m.bottomRightCorner(2, 2) << endl;
+m.bottomRightCorner(2, 2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_bottomRows_int.cpp b/doc/snippets/MatrixBase_bottomRows_int.cpp
new file mode 100644
index 0000000..47ca92e
--- /dev/null
+++ b/doc/snippets/MatrixBase_bottomRows_int.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.bottomRows(2):" << endl;
+cout << a.bottomRows(2) << endl;
+a.bottomRows(2).setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_cast.cpp b/doc/snippets/MatrixBase_cast.cpp
new file mode 100644
index 0000000..016880b
--- /dev/null
+++ b/doc/snippets/MatrixBase_cast.cpp

@@ -0,0 +1,3 @@
+Matrix2d md = Matrix2d::Identity() * 0.45;
+Matrix2f mf = Matrix2f::Identity();
+cout << md + mf.cast<double>() << endl;

diff --git a/doc/snippets/MatrixBase_col.cpp b/doc/snippets/MatrixBase_col.cpp
new file mode 100644
index 0000000..87c91b1
--- /dev/null
+++ b/doc/snippets/MatrixBase_col.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Identity();
+m.col(1) = Vector3d(4,5,6);
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_colwise.cpp b/doc/snippets/MatrixBase_colwise.cpp
new file mode 100644
index 0000000..a048bef
--- /dev/null
+++ b/doc/snippets/MatrixBase_colwise.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the sum of each column:" << endl << m.colwise().sum() << endl;
+cout << "Here is the maximum absolute value of each column:"
+     << endl << m.cwiseAbs().colwise().maxCoeff() << endl;

diff --git a/doc/snippets/MatrixBase_colwise_iterator_cxx11.cpp b/doc/snippets/MatrixBase_colwise_iterator_cxx11.cpp
new file mode 100644
index 0000000..116063f
--- /dev/null
+++ b/doc/snippets/MatrixBase_colwise_iterator_cxx11.cpp

@@ -0,0 +1,12 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the initial matrix m:" << endl << m << endl;
+int i = -1;
+for(auto c: m.colwise()) {
+  c *= i;
+  ++i;
+}
+cout << "Here is the matrix m after the for-range-loop:" << endl << m << endl;
+auto cols = m.colwise();
+auto it = std::find_if(cols.cbegin(), cols.cend(),
+                       [](Matrix3i::ConstColXpr x) { return x.squaredNorm() == 0; });
+cout << "The first empty column is: " << distance(cols.cbegin(),it) << endl;

diff --git a/doc/snippets/MatrixBase_computeInverseAndDetWithCheck.cpp b/doc/snippets/MatrixBase_computeInverseAndDetWithCheck.cpp
new file mode 100644
index 0000000..a7b084f
--- /dev/null
+++ b/doc/snippets/MatrixBase_computeInverseAndDetWithCheck.cpp

@@ -0,0 +1,13 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+Matrix3d inverse;
+bool invertible;
+double determinant;
+m.computeInverseAndDetWithCheck(inverse,determinant,invertible);
+cout << "Its determinant is " << determinant << endl;
+if(invertible) {
+  cout << "It is invertible, and its inverse is:" << endl << inverse << endl;
+}
+else {
+  cout << "It is not invertible." << endl;
+}

diff --git a/doc/snippets/MatrixBase_computeInverseWithCheck.cpp b/doc/snippets/MatrixBase_computeInverseWithCheck.cpp
new file mode 100644
index 0000000..873a9f8
--- /dev/null
+++ b/doc/snippets/MatrixBase_computeInverseWithCheck.cpp

@@ -0,0 +1,11 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+Matrix3d inverse;
+bool invertible;
+m.computeInverseWithCheck(inverse,invertible);
+if(invertible) {
+  cout << "It is invertible, and its inverse is:" << endl << inverse << endl;
+}
+else {
+  cout << "It is not invertible." << endl;
+}

diff --git a/doc/snippets/MatrixBase_cwiseAbs.cpp b/doc/snippets/MatrixBase_cwiseAbs.cpp
new file mode 100644
index 0000000..28a3160
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseAbs.cpp

@@ -0,0 +1,4 @@
+MatrixXd m(2,3);
+m << 2, -4, 6,   
+     -5, 1, 0;
+cout << m.cwiseAbs() << endl;

diff --git a/doc/snippets/MatrixBase_cwiseAbs2.cpp b/doc/snippets/MatrixBase_cwiseAbs2.cpp
new file mode 100644
index 0000000..889a2e2
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseAbs2.cpp

@@ -0,0 +1,4 @@
+MatrixXd m(2,3);
+m << 2, -4, 6,   
+     -5, 1, 0;
+cout << m.cwiseAbs2() << endl;

diff --git a/doc/snippets/MatrixBase_cwiseArg.cpp b/doc/snippets/MatrixBase_cwiseArg.cpp
new file mode 100644
index 0000000..e0857cf
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseArg.cpp

@@ -0,0 +1,3 @@
+MatrixXcf v = MatrixXcf::Random(2, 3);
+cout << v << endl << endl;
+cout << v.cwiseArg() << endl;
\ No newline at end of file

diff --git a/doc/snippets/MatrixBase_cwiseEqual.cpp b/doc/snippets/MatrixBase_cwiseEqual.cpp
new file mode 100644
index 0000000..469af64
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseEqual.cpp

@@ -0,0 +1,7 @@
+MatrixXi m(2,2);
+m << 1, 0,
+     1, 1;
+cout << "Comparing m with identity matrix:" << endl;
+cout << m.cwiseEqual(MatrixXi::Identity(2,2)) << endl;
+Index count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
+cout << "Number of coefficients that are equal: " << count << endl;

diff --git a/doc/snippets/MatrixBase_cwiseInverse.cpp b/doc/snippets/MatrixBase_cwiseInverse.cpp
new file mode 100644
index 0000000..23e08f7
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseInverse.cpp

@@ -0,0 +1,4 @@
+MatrixXd m(2,3);
+m << 2, 0.5, 1,   
+     3, 0.25, 1;
+cout << m.cwiseInverse() << endl;

diff --git a/doc/snippets/MatrixBase_cwiseMax.cpp b/doc/snippets/MatrixBase_cwiseMax.cpp
new file mode 100644
index 0000000..3c95681
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseMax.cpp

@@ -0,0 +1,2 @@
+Vector3d v(2,3,4), w(4,2,3);
+cout << v.cwiseMax(w) << endl;

diff --git a/doc/snippets/MatrixBase_cwiseMin.cpp b/doc/snippets/MatrixBase_cwiseMin.cpp
new file mode 100644
index 0000000..82fc761
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseMin.cpp

@@ -0,0 +1,2 @@
+Vector3d v(2,3,4), w(4,2,3);
+cout << v.cwiseMin(w) << endl;

diff --git a/doc/snippets/MatrixBase_cwiseNotEqual.cpp b/doc/snippets/MatrixBase_cwiseNotEqual.cpp
new file mode 100644
index 0000000..7f0a105
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseNotEqual.cpp

@@ -0,0 +1,7 @@
+MatrixXi m(2,2);
+m << 1, 0,
+     1, 1;
+cout << "Comparing m with identity matrix:" << endl;
+cout << m.cwiseNotEqual(MatrixXi::Identity(2,2)) << endl;
+Index count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
+cout << "Number of coefficients that are not equal: " << count << endl;

diff --git a/doc/snippets/MatrixBase_cwiseProduct.cpp b/doc/snippets/MatrixBase_cwiseProduct.cpp
new file mode 100644
index 0000000..1db3a11
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseProduct.cpp

@@ -0,0 +1,4 @@
+Matrix3i a = Matrix3i::Random(), b = Matrix3i::Random();
+Matrix3i c = a.cwiseProduct(b);
+cout << "a:\n" << a << "\nb:\n" << b << "\nc:\n" << c << endl;
+

diff --git a/doc/snippets/MatrixBase_cwiseQuotient.cpp b/doc/snippets/MatrixBase_cwiseQuotient.cpp
new file mode 100644
index 0000000..9691212
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseQuotient.cpp

@@ -0,0 +1,2 @@
+Vector3d v(2,3,4), w(4,2,3);
+cout << v.cwiseQuotient(w) << endl;

diff --git a/doc/snippets/MatrixBase_cwiseSign.cpp b/doc/snippets/MatrixBase_cwiseSign.cpp
new file mode 100644
index 0000000..efd7179
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseSign.cpp

@@ -0,0 +1,4 @@
+MatrixXd m(2,3);
+m <<  2, -4, 6,
+     -5,  1, 0;
+cout << m.cwiseSign() << endl;

diff --git a/doc/snippets/MatrixBase_cwiseSqrt.cpp b/doc/snippets/MatrixBase_cwiseSqrt.cpp
new file mode 100644
index 0000000..4bfd75d
--- /dev/null
+++ b/doc/snippets/MatrixBase_cwiseSqrt.cpp

@@ -0,0 +1,2 @@
+Vector3d v(1,2,4);
+cout << v.cwiseSqrt() << endl;

diff --git a/doc/snippets/MatrixBase_diagonal.cpp b/doc/snippets/MatrixBase_diagonal.cpp
new file mode 100644
index 0000000..cd63413
--- /dev/null
+++ b/doc/snippets/MatrixBase_diagonal.cpp

@@ -0,0 +1,4 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here are the coefficients on the main diagonal of m:" << endl
+     << m.diagonal() << endl;

diff --git a/doc/snippets/MatrixBase_diagonal_int.cpp b/doc/snippets/MatrixBase_diagonal_int.cpp
new file mode 100644
index 0000000..7b66abf
--- /dev/null
+++ b/doc/snippets/MatrixBase_diagonal_int.cpp

@@ -0,0 +1,5 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here are the coefficients on the 1st super-diagonal and 2nd sub-diagonal of m:" << endl
+     << m.diagonal(1).transpose() << endl
+     << m.diagonal(-2).transpose() << endl;

diff --git a/doc/snippets/MatrixBase_diagonal_template_int.cpp b/doc/snippets/MatrixBase_diagonal_template_int.cpp
new file mode 100644
index 0000000..0e73d1c
--- /dev/null
+++ b/doc/snippets/MatrixBase_diagonal_template_int.cpp

@@ -0,0 +1,5 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here are the coefficients on the 1st super-diagonal and 2nd sub-diagonal of m:" << endl
+     << m.diagonal<1>().transpose() << endl
+     << m.diagonal<-2>().transpose() << endl;

diff --git a/doc/snippets/MatrixBase_eigenvalues.cpp b/doc/snippets/MatrixBase_eigenvalues.cpp
new file mode 100644
index 0000000..039f887
--- /dev/null
+++ b/doc/snippets/MatrixBase_eigenvalues.cpp

@@ -0,0 +1,3 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+VectorXcd eivals = ones.eigenvalues();
+cout << "The eigenvalues of the 3x3 matrix of ones are:" << endl << eivals << endl;

diff --git a/doc/snippets/MatrixBase_end_int.cpp b/doc/snippets/MatrixBase_end_int.cpp
new file mode 100644
index 0000000..03c54a9
--- /dev/null
+++ b/doc/snippets/MatrixBase_end_int.cpp

@@ -0,0 +1,5 @@
+RowVector4i v = RowVector4i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "Here is v.tail(2):" << endl << v.tail(2) << endl;
+v.tail(2).setZero();
+cout << "Now the vector v is:" << endl << v << endl;

diff --git a/doc/snippets/MatrixBase_eval.cpp b/doc/snippets/MatrixBase_eval.cpp
new file mode 100644
index 0000000..1df3aa0
--- /dev/null
+++ b/doc/snippets/MatrixBase_eval.cpp

@@ -0,0 +1,12 @@
+Matrix2f M = Matrix2f::Random();
+Matrix2f m;
+m = M;
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Now we want to copy a column into a row." << endl;
+cout << "If we do m.col(1) = m.row(0), then m becomes:" << endl;
+m.col(1) = m.row(0);
+cout << m << endl << "which is wrong!" << endl;
+cout << "Now let us instead do m.col(1) = m.row(0).eval(). Then m becomes" << endl;
+m = M;
+m.col(1) = m.row(0).eval();
+cout << m << endl << "which is right." << endl;

diff --git a/doc/snippets/MatrixBase_fixedBlock_int_int.cpp b/doc/snippets/MatrixBase_fixedBlock_int_int.cpp
new file mode 100644
index 0000000..3201127
--- /dev/null
+++ b/doc/snippets/MatrixBase_fixedBlock_int_int.cpp

@@ -0,0 +1,5 @@
+Matrix4d m = Vector4d(1,2,3,4).asDiagonal();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.fixed<2, 2>(2, 2):" << endl << m.block<2, 2>(2, 2) << endl;
+m.block<2, 2>(2, 0) = m.block<2, 2>(2, 2);
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_hnormalized.cpp b/doc/snippets/MatrixBase_hnormalized.cpp
new file mode 100644
index 0000000..b714adc
--- /dev/null
+++ b/doc/snippets/MatrixBase_hnormalized.cpp

@@ -0,0 +1,6 @@
+Vector4d v = Vector4d::Random();
+Projective3d P(Matrix4d::Random());
+cout << "v                   = " << v.transpose() << "]^T" << endl;
+cout << "v.hnormalized()     = " << v.hnormalized().transpose() << "]^T" << endl;
+cout << "P*v                 = " << (P*v).transpose() << "]^T" << endl;
+cout << "(P*v).hnormalized() = " << (P*v).hnormalized().transpose() << "]^T" << endl;

diff --git a/doc/snippets/MatrixBase_homogeneous.cpp b/doc/snippets/MatrixBase_homogeneous.cpp
new file mode 100644
index 0000000..2631960
--- /dev/null
+++ b/doc/snippets/MatrixBase_homogeneous.cpp

@@ -0,0 +1,6 @@
+Vector3d v = Vector3d::Random(), w;
+Projective3d P(Matrix4d::Random());
+cout << "v                                   = [" << v.transpose() << "]^T" << endl;
+cout << "h.homogeneous()                     = [" << v.homogeneous().transpose() << "]^T" << endl;
+cout << "(P * v.homogeneous())               = [" << (P * v.homogeneous()).transpose() << "]^T" << endl;
+cout << "(P * v.homogeneous()).hnormalized() = [" << (P * v.homogeneous()).eval().hnormalized().transpose() << "]^T" << endl;

diff --git a/doc/snippets/MatrixBase_identity.cpp b/doc/snippets/MatrixBase_identity.cpp
new file mode 100644
index 0000000..b5c1e59
--- /dev/null
+++ b/doc/snippets/MatrixBase_identity.cpp

@@ -0,0 +1 @@
+cout << Matrix<double, 3, 4>::Identity() << endl;

diff --git a/doc/snippets/MatrixBase_identity_int_int.cpp b/doc/snippets/MatrixBase_identity_int_int.cpp
new file mode 100644
index 0000000..918649d
--- /dev/null
+++ b/doc/snippets/MatrixBase_identity_int_int.cpp

@@ -0,0 +1 @@
+cout << MatrixXd::Identity(4, 3) << endl;

diff --git a/doc/snippets/MatrixBase_inverse.cpp b/doc/snippets/MatrixBase_inverse.cpp
new file mode 100644
index 0000000..a56142e
--- /dev/null
+++ b/doc/snippets/MatrixBase_inverse.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Its inverse is:" << endl << m.inverse() << endl;

diff --git a/doc/snippets/MatrixBase_isDiagonal.cpp b/doc/snippets/MatrixBase_isDiagonal.cpp
new file mode 100644
index 0000000..5b1d599
--- /dev/null
+++ b/doc/snippets/MatrixBase_isDiagonal.cpp

@@ -0,0 +1,6 @@
+Matrix3d m = 10000 * Matrix3d::Identity();
+m(0,2) = 1;
+cout << "Here's the matrix m:" << endl << m << endl;
+cout << "m.isDiagonal() returns: " << m.isDiagonal() << endl;
+cout << "m.isDiagonal(1e-3) returns: " << m.isDiagonal(1e-3) << endl;
+

diff --git a/doc/snippets/MatrixBase_isIdentity.cpp b/doc/snippets/MatrixBase_isIdentity.cpp
new file mode 100644
index 0000000..17b756c
--- /dev/null
+++ b/doc/snippets/MatrixBase_isIdentity.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Identity();
+m(0,2) = 1e-4;
+cout << "Here's the matrix m:" << endl << m << endl;
+cout << "m.isIdentity() returns: " << m.isIdentity() << endl;
+cout << "m.isIdentity(1e-3) returns: " << m.isIdentity(1e-3) << endl;

diff --git a/doc/snippets/MatrixBase_isOnes.cpp b/doc/snippets/MatrixBase_isOnes.cpp
new file mode 100644
index 0000000..f82f628
--- /dev/null
+++ b/doc/snippets/MatrixBase_isOnes.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Ones();
+m(0,2) += 1e-4;
+cout << "Here's the matrix m:" << endl << m << endl;
+cout << "m.isOnes() returns: " << m.isOnes() << endl;
+cout << "m.isOnes(1e-3) returns: " << m.isOnes(1e-3) << endl;

diff --git a/doc/snippets/MatrixBase_isOrthogonal.cpp b/doc/snippets/MatrixBase_isOrthogonal.cpp
new file mode 100644
index 0000000..b22af06
--- /dev/null
+++ b/doc/snippets/MatrixBase_isOrthogonal.cpp

@@ -0,0 +1,6 @@
+Vector3d v(1,0,0);
+Vector3d w(1e-4,0,1);
+cout << "Here's the vector v:" << endl << v << endl;
+cout << "Here's the vector w:" << endl << w << endl;
+cout << "v.isOrthogonal(w) returns: " << v.isOrthogonal(w) << endl;
+cout << "v.isOrthogonal(w,1e-3) returns: " << v.isOrthogonal(w,1e-3) << endl;

diff --git a/doc/snippets/MatrixBase_isUnitary.cpp b/doc/snippets/MatrixBase_isUnitary.cpp
new file mode 100644
index 0000000..3877da3
--- /dev/null
+++ b/doc/snippets/MatrixBase_isUnitary.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Identity();
+m(0,2) = 1e-4;
+cout << "Here's the matrix m:" << endl << m << endl;
+cout << "m.isUnitary() returns: " << m.isUnitary() << endl;
+cout << "m.isUnitary(1e-3) returns: " << m.isUnitary(1e-3) << endl;

diff --git a/doc/snippets/MatrixBase_isZero.cpp b/doc/snippets/MatrixBase_isZero.cpp
new file mode 100644
index 0000000..c2cfe22
--- /dev/null
+++ b/doc/snippets/MatrixBase_isZero.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Zero();
+m(0,2) = 1e-4;
+cout << "Here's the matrix m:" << endl << m << endl;
+cout << "m.isZero() returns: " << m.isZero() << endl;
+cout << "m.isZero(1e-3) returns: " << m.isZero(1e-3) << endl;

diff --git a/doc/snippets/MatrixBase_leftCols_int.cpp b/doc/snippets/MatrixBase_leftCols_int.cpp
new file mode 100644
index 0000000..6ea984e
--- /dev/null
+++ b/doc/snippets/MatrixBase_leftCols_int.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.leftCols(2):" << endl;
+cout << a.leftCols(2) << endl;
+a.leftCols(2).setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_noalias.cpp b/doc/snippets/MatrixBase_noalias.cpp
new file mode 100644
index 0000000..3b54a79
--- /dev/null
+++ b/doc/snippets/MatrixBase_noalias.cpp

@@ -0,0 +1,3 @@
+Matrix2d a, b, c; a << 1,2,3,4; b << 5,6,7,8;
+c.noalias() = a * b; // this computes the product directly to c
+cout << c << endl;

diff --git a/doc/snippets/MatrixBase_ones.cpp b/doc/snippets/MatrixBase_ones.cpp
new file mode 100644
index 0000000..02c767c
--- /dev/null
+++ b/doc/snippets/MatrixBase_ones.cpp

@@ -0,0 +1,2 @@
+cout << Matrix2d::Ones() << endl;
+cout << 6 * RowVector4i::Ones() << endl;

diff --git a/doc/snippets/MatrixBase_ones_int.cpp b/doc/snippets/MatrixBase_ones_int.cpp
new file mode 100644
index 0000000..2ef188e
--- /dev/null
+++ b/doc/snippets/MatrixBase_ones_int.cpp

@@ -0,0 +1,2 @@
+cout << 6 * RowVectorXi::Ones(4) << endl;
+cout << VectorXf::Ones(2) << endl;

diff --git a/doc/snippets/MatrixBase_ones_int_int.cpp b/doc/snippets/MatrixBase_ones_int_int.cpp
new file mode 100644
index 0000000..60f5a31
--- /dev/null
+++ b/doc/snippets/MatrixBase_ones_int_int.cpp

@@ -0,0 +1 @@
+cout << MatrixXi::Ones(2,3) << endl;

diff --git a/doc/snippets/MatrixBase_operatorNorm.cpp b/doc/snippets/MatrixBase_operatorNorm.cpp
new file mode 100644
index 0000000..355246f
--- /dev/null
+++ b/doc/snippets/MatrixBase_operatorNorm.cpp

@@ -0,0 +1,3 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+cout << "The operator norm of the 3x3 matrix of ones is "
+     << ones.operatorNorm() << endl;

diff --git a/doc/snippets/MatrixBase_prod.cpp b/doc/snippets/MatrixBase_prod.cpp
new file mode 100644
index 0000000..d2f27bd
--- /dev/null
+++ b/doc/snippets/MatrixBase_prod.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the product of all the coefficients:" << endl << m.prod() << endl;

diff --git a/doc/snippets/MatrixBase_random.cpp b/doc/snippets/MatrixBase_random.cpp
new file mode 100644
index 0000000..65fc524
--- /dev/null
+++ b/doc/snippets/MatrixBase_random.cpp

@@ -0,0 +1 @@
+cout << 100 * Matrix2i::Random() << endl;

diff --git a/doc/snippets/MatrixBase_random_int.cpp b/doc/snippets/MatrixBase_random_int.cpp
new file mode 100644
index 0000000..f161d03
--- /dev/null
+++ b/doc/snippets/MatrixBase_random_int.cpp

@@ -0,0 +1 @@
+cout << VectorXi::Random(2) << endl;

diff --git a/doc/snippets/MatrixBase_random_int_int.cpp b/doc/snippets/MatrixBase_random_int_int.cpp
new file mode 100644
index 0000000..3f0f7dd
--- /dev/null
+++ b/doc/snippets/MatrixBase_random_int_int.cpp

@@ -0,0 +1 @@
+cout << MatrixXi::Random(2,3) << endl;

diff --git a/doc/snippets/MatrixBase_replicate.cpp b/doc/snippets/MatrixBase_replicate.cpp
new file mode 100644
index 0000000..3ce52bc
--- /dev/null
+++ b/doc/snippets/MatrixBase_replicate.cpp

@@ -0,0 +1,4 @@
+MatrixXi m = MatrixXi::Random(2,3);
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "m.replicate<3,2>() = ..." << endl;
+cout << m.replicate<3,2>() << endl;

diff --git a/doc/snippets/MatrixBase_replicate_int_int.cpp b/doc/snippets/MatrixBase_replicate_int_int.cpp
new file mode 100644
index 0000000..b1dbc70
--- /dev/null
+++ b/doc/snippets/MatrixBase_replicate_int_int.cpp

@@ -0,0 +1,4 @@
+Vector3i v = Vector3i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "v.replicate(2,5) = ..." << endl;
+cout << v.replicate(2,5) << endl;

diff --git a/doc/snippets/MatrixBase_reshaped_auto.cpp b/doc/snippets/MatrixBase_reshaped_auto.cpp
new file mode 100644
index 0000000..59f9d3f
--- /dev/null
+++ b/doc/snippets/MatrixBase_reshaped_auto.cpp

@@ -0,0 +1,4 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, AutoSize):" << endl << m.reshaped(2, AutoSize) << endl;
+cout << "Here is m.reshaped<RowMajor>(AutoSize, fix<8>):" << endl << m.reshaped<RowMajor>(AutoSize, fix<8>) << endl;

diff --git a/doc/snippets/MatrixBase_reshaped_fixed.cpp b/doc/snippets/MatrixBase_reshaped_fixed.cpp
new file mode 100644
index 0000000..3e9e2cf
--- /dev/null
+++ b/doc/snippets/MatrixBase_reshaped_fixed.cpp

@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(fix<2>,fix<8>):" << endl << m.reshaped(fix<2>,fix<8>) << endl;

diff --git a/doc/snippets/MatrixBase_reshaped_int_int.cpp b/doc/snippets/MatrixBase_reshaped_int_int.cpp
new file mode 100644
index 0000000..af4ca59
--- /dev/null
+++ b/doc/snippets/MatrixBase_reshaped_int_int.cpp

@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, 8):" << endl << m.reshaped(2, 8) << endl;

diff --git a/doc/snippets/MatrixBase_reshaped_to_vector.cpp b/doc/snippets/MatrixBase_reshaped_to_vector.cpp
new file mode 100644
index 0000000..37f65f7
--- /dev/null
+++ b/doc/snippets/MatrixBase_reshaped_to_vector.cpp

@@ -0,0 +1,4 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped().transpose():" << endl << m.reshaped().transpose() << endl;
+cout << "Here is m.reshaped<RowMajor>().transpose():  " << endl << m.reshaped<RowMajor>().transpose() << endl;

diff --git a/doc/snippets/MatrixBase_reverse.cpp b/doc/snippets/MatrixBase_reverse.cpp
new file mode 100644
index 0000000..f545a28
--- /dev/null
+++ b/doc/snippets/MatrixBase_reverse.cpp

@@ -0,0 +1,8 @@
+MatrixXi m = MatrixXi::Random(3,4);
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the reverse of m:" << endl << m.reverse() << endl;
+cout << "Here is the coefficient (1,0) in the reverse of m:" << endl
+     << m.reverse()(1,0) << endl;
+cout << "Let us overwrite this coefficient with the value 4." << endl;
+m.reverse()(1,0) = 4;
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_rightCols_int.cpp b/doc/snippets/MatrixBase_rightCols_int.cpp
new file mode 100644
index 0000000..cb51340
--- /dev/null
+++ b/doc/snippets/MatrixBase_rightCols_int.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.rightCols(2):" << endl;
+cout << a.rightCols(2) << endl;
+a.rightCols(2).setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_row.cpp b/doc/snippets/MatrixBase_row.cpp
new file mode 100644
index 0000000..b15e626
--- /dev/null
+++ b/doc/snippets/MatrixBase_row.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Identity();
+m.row(1) = Vector3d(4,5,6);
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_rowwise.cpp b/doc/snippets/MatrixBase_rowwise.cpp
new file mode 100644
index 0000000..ae93964
--- /dev/null
+++ b/doc/snippets/MatrixBase_rowwise.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the sum of each row:" << endl << m.rowwise().sum() << endl;
+cout << "Here is the maximum absolute value of each row:"
+     << endl << m.cwiseAbs().rowwise().maxCoeff() << endl;

diff --git a/doc/snippets/MatrixBase_segment_int_int.cpp b/doc/snippets/MatrixBase_segment_int_int.cpp
new file mode 100644
index 0000000..70cd6d2
--- /dev/null
+++ b/doc/snippets/MatrixBase_segment_int_int.cpp

@@ -0,0 +1,5 @@
+RowVector4i v = RowVector4i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "Here is v.segment(1, 2):" << endl << v.segment(1, 2) << endl;
+v.segment(1, 2).setZero();
+cout << "Now the vector v is:" << endl << v << endl;

diff --git a/doc/snippets/MatrixBase_select.cpp b/doc/snippets/MatrixBase_select.cpp
new file mode 100644
index 0000000..ae5477f
--- /dev/null
+++ b/doc/snippets/MatrixBase_select.cpp

@@ -0,0 +1,6 @@
+MatrixXi m(3, 3);
+m << 1, 2, 3,
+     4, 5, 6,
+     7, 8, 9;
+m = (m.array() >= 5).select(-m, m);
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_selfadjointView.cpp b/doc/snippets/MatrixBase_selfadjointView.cpp
new file mode 100644
index 0000000..4bd3c7e
--- /dev/null
+++ b/doc/snippets/MatrixBase_selfadjointView.cpp

@@ -0,0 +1,6 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the symmetric matrix extracted from the upper part of m:" << endl
+     << Matrix3i(m.selfadjointView<Upper>()) << endl;
+cout << "Here is the symmetric matrix extracted from the lower part of m:" << endl
+     << Matrix3i(m.selfadjointView<Lower>()) << endl;

diff --git a/doc/snippets/MatrixBase_set.cpp b/doc/snippets/MatrixBase_set.cpp
new file mode 100644
index 0000000..50ecf5f
--- /dev/null
+++ b/doc/snippets/MatrixBase_set.cpp

@@ -0,0 +1,13 @@
+Matrix3i m1;
+m1 << 1, 2, 3,
+      4, 5, 6,
+      7, 8, 9;
+cout << m1 << endl << endl;
+Matrix3i m2 = Matrix3i::Identity();
+m2.block(0,0, 2,2) << 10, 11, 12, 13;
+cout << m2 << endl << endl;
+Vector2i v1;
+v1 << 14, 15;
+m2 << v1.transpose(), 16,
+      v1, m1.block(1,1,2,2);
+cout << m2 << endl;

diff --git a/doc/snippets/MatrixBase_setIdentity.cpp b/doc/snippets/MatrixBase_setIdentity.cpp
new file mode 100644
index 0000000..4fd0aa2
--- /dev/null
+++ b/doc/snippets/MatrixBase_setIdentity.cpp

@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Zero();
+m.block<3,3>(1,0).setIdentity();
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_setOnes.cpp b/doc/snippets/MatrixBase_setOnes.cpp
new file mode 100644
index 0000000..4cef9c1
--- /dev/null
+++ b/doc/snippets/MatrixBase_setOnes.cpp

@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Random();
+m.row(1).setOnes();
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_setRandom.cpp b/doc/snippets/MatrixBase_setRandom.cpp
new file mode 100644
index 0000000..e2c257d
--- /dev/null
+++ b/doc/snippets/MatrixBase_setRandom.cpp

@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Zero();
+m.col(1).setRandom();
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_setZero.cpp b/doc/snippets/MatrixBase_setZero.cpp
new file mode 100644
index 0000000..9b5b958
--- /dev/null
+++ b/doc/snippets/MatrixBase_setZero.cpp

@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Random();
+m.row(1).setZero();
+cout << m << endl;

diff --git a/doc/snippets/MatrixBase_start_int.cpp b/doc/snippets/MatrixBase_start_int.cpp
new file mode 100644
index 0000000..c261d2b
--- /dev/null
+++ b/doc/snippets/MatrixBase_start_int.cpp

@@ -0,0 +1,5 @@
+RowVector4i v = RowVector4i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "Here is v.head(2):" << endl << v.head(2) << endl;
+v.head(2).setZero();
+cout << "Now the vector v is:" << endl << v << endl;

diff --git a/doc/snippets/MatrixBase_template_int_bottomRows.cpp b/doc/snippets/MatrixBase_template_int_bottomRows.cpp
new file mode 100644
index 0000000..f9ea892
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_bottomRows.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.bottomRows<2>():" << endl;
+cout << a.bottomRows<2>() << endl;
+a.bottomRows<2>().setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_template_int_end.cpp b/doc/snippets/MatrixBase_template_int_end.cpp
new file mode 100644
index 0000000..f5ccb00
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_end.cpp

@@ -0,0 +1,5 @@
+RowVector4i v = RowVector4i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "Here is v.tail(2):" << endl << v.tail<2>() << endl;
+v.tail<2>().setZero();
+cout << "Now the vector v is:" << endl << v << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_block_int_int_int_int.cpp b/doc/snippets/MatrixBase_template_int_int_block_int_int_int_int.cpp
new file mode 100644
index 0000000..4dced03
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_block_int_int_int_int.cpp

@@ -0,0 +1,5 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the block:" << endl << m.block<2, Dynamic>(1, 1, 2, 3) << endl;
+m.block<2, Dynamic>(1, 1, 2, 3).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_bottomLeftCorner.cpp b/doc/snippets/MatrixBase_template_int_int_bottomLeftCorner.cpp
new file mode 100644
index 0000000..847892a
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_bottomLeftCorner.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.bottomLeftCorner<2,2>():" << endl;
+cout << m.bottomLeftCorner<2,2>() << endl;
+m.bottomLeftCorner<2,2>().setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp b/doc/snippets/MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
new file mode 100644
index 0000000..a1edcc8
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.bottomLeftCorner<2,Dynamic>(2,2):" << endl;
+cout << m.bottomLeftCorner<2,Dynamic>(2,2) << endl;
+m.bottomLeftCorner<2,Dynamic>(2,2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_bottomRightCorner.cpp b/doc/snippets/MatrixBase_template_int_int_bottomRightCorner.cpp
new file mode 100644
index 0000000..abacb01
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_bottomRightCorner.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.bottomRightCorner<2,2>():" << endl;
+cout << m.bottomRightCorner<2,2>() << endl;
+m.bottomRightCorner<2,2>().setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_bottomRightCorner_int_int.cpp b/doc/snippets/MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
new file mode 100644
index 0000000..a65508f
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_bottomRightCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.bottomRightCorner<2,Dynamic>(2,2):" << endl;
+cout << m.bottomRightCorner<2,Dynamic>(2,2) << endl;
+m.bottomRightCorner<2,Dynamic>(2,2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_topLeftCorner.cpp b/doc/snippets/MatrixBase_template_int_int_topLeftCorner.cpp
new file mode 100644
index 0000000..1899d90
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_topLeftCorner.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.topLeftCorner<2,2>():" << endl;
+cout << m.topLeftCorner<2,2>() << endl;
+m.topLeftCorner<2,2>().setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_topLeftCorner_int_int.cpp b/doc/snippets/MatrixBase_template_int_int_topLeftCorner_int_int.cpp
new file mode 100644
index 0000000..fac761f
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_topLeftCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.topLeftCorner<2,Dynamic>(2,2):" << endl;
+cout << m.topLeftCorner<2,Dynamic>(2,2) << endl;
+m.topLeftCorner<2,Dynamic>(2,2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_topRightCorner.cpp b/doc/snippets/MatrixBase_template_int_int_topRightCorner.cpp
new file mode 100644
index 0000000..c3a1771
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_topRightCorner.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.topRightCorner<2,2>():" << endl;
+cout << m.topRightCorner<2,2>() << endl;
+m.topRightCorner<2,2>().setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_int_topRightCorner_int_int.cpp b/doc/snippets/MatrixBase_template_int_int_topRightCorner_int_int.cpp
new file mode 100644
index 0000000..a17acc0
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_int_topRightCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.topRightCorner<2,Dynamic>(2,2):" << endl;
+cout << m.topRightCorner<2,Dynamic>(2,2) << endl;
+m.topRightCorner<2,Dynamic>(2,2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_template_int_leftCols.cpp b/doc/snippets/MatrixBase_template_int_leftCols.cpp
new file mode 100644
index 0000000..1c425d9
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_leftCols.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.leftCols<2>():" << endl;
+cout << a.leftCols<2>() << endl;
+a.leftCols<2>().setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_template_int_rightCols.cpp b/doc/snippets/MatrixBase_template_int_rightCols.cpp
new file mode 100644
index 0000000..fc8c0d9
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_rightCols.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.rightCols<2>():" << endl;
+cout << a.rightCols<2>() << endl;
+a.rightCols<2>().setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_template_int_segment.cpp b/doc/snippets/MatrixBase_template_int_segment.cpp
new file mode 100644
index 0000000..e448b40
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_segment.cpp

@@ -0,0 +1,5 @@
+RowVector4i v = RowVector4i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "Here is v.segment<2>(1):" << endl << v.segment<2>(1) << endl;
+v.segment<2>(2).setZero();
+cout << "Now the vector v is:" << endl << v << endl;

diff --git a/doc/snippets/MatrixBase_template_int_start.cpp b/doc/snippets/MatrixBase_template_int_start.cpp
new file mode 100644
index 0000000..d336b37
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_start.cpp

@@ -0,0 +1,5 @@
+RowVector4i v = RowVector4i::Random();
+cout << "Here is the vector v:" << endl << v << endl;
+cout << "Here is v.head(2):" << endl << v.head<2>() << endl;
+v.head<2>().setZero();
+cout << "Now the vector v is:" << endl << v << endl;

diff --git a/doc/snippets/MatrixBase_template_int_topRows.cpp b/doc/snippets/MatrixBase_template_int_topRows.cpp
new file mode 100644
index 0000000..0110251
--- /dev/null
+++ b/doc/snippets/MatrixBase_template_int_topRows.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.topRows<2>():" << endl;
+cout << a.topRows<2>() << endl;
+a.topRows<2>().setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_topLeftCorner_int_int.cpp b/doc/snippets/MatrixBase_topLeftCorner_int_int.cpp
new file mode 100644
index 0000000..e52cb3b
--- /dev/null
+++ b/doc/snippets/MatrixBase_topLeftCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.topLeftCorner(2, 2):" << endl;
+cout << m.topLeftCorner(2, 2) << endl;
+m.topLeftCorner(2, 2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_topRightCorner_int_int.cpp b/doc/snippets/MatrixBase_topRightCorner_int_int.cpp
new file mode 100644
index 0000000..811fa56
--- /dev/null
+++ b/doc/snippets/MatrixBase_topRightCorner_int_int.cpp

@@ -0,0 +1,6 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.topRightCorner(2, 2):" << endl;
+cout << m.topRightCorner(2, 2) << endl;
+m.topRightCorner(2, 2).setZero();
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_topRows_int.cpp b/doc/snippets/MatrixBase_topRows_int.cpp
new file mode 100644
index 0000000..f2d75f1
--- /dev/null
+++ b/doc/snippets/MatrixBase_topRows_int.cpp

@@ -0,0 +1,6 @@
+Array44i a = Array44i::Random();
+cout << "Here is the array a:" << endl << a << endl;
+cout << "Here is a.topRows(2):" << endl;
+cout << a.topRows(2) << endl;
+a.topRows(2).setZero();
+cout << "Now the array a is:" << endl << a << endl;

diff --git a/doc/snippets/MatrixBase_transpose.cpp b/doc/snippets/MatrixBase_transpose.cpp
new file mode 100644
index 0000000..88eea83
--- /dev/null
+++ b/doc/snippets/MatrixBase_transpose.cpp

@@ -0,0 +1,8 @@
+Matrix2i m = Matrix2i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the transpose of m:" << endl << m.transpose() << endl;
+cout << "Here is the coefficient (1,0) in the transpose of m:" << endl
+     << m.transpose()(1,0) << endl;
+cout << "Let us overwrite this coefficient with the value 0." << endl;
+m.transpose()(1,0) = 0;
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/MatrixBase_triangularView.cpp b/doc/snippets/MatrixBase_triangularView.cpp
new file mode 100644
index 0000000..03aa303
--- /dev/null
+++ b/doc/snippets/MatrixBase_triangularView.cpp

@@ -0,0 +1,9 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the upper-triangular matrix extracted from m:" << endl
+     << Matrix3i(m.triangularView<Eigen::Upper>()) << endl;
+cout << "Here is the strictly-upper-triangular matrix extracted from m:" << endl
+     << Matrix3i(m.triangularView<Eigen::StrictlyUpper>()) << endl;
+cout << "Here is the unit-lower-triangular matrix extracted from m:" << endl
+     << Matrix3i(m.triangularView<Eigen::UnitLower>()) << endl;
+// FIXME need to implement output for triangularViews (Bug 885)

diff --git a/doc/snippets/MatrixBase_zero.cpp b/doc/snippets/MatrixBase_zero.cpp
new file mode 100644
index 0000000..6064936
--- /dev/null
+++ b/doc/snippets/MatrixBase_zero.cpp

@@ -0,0 +1,2 @@
+cout << Matrix2d::Zero() << endl;
+cout << RowVector4i::Zero() << endl;

diff --git a/doc/snippets/MatrixBase_zero_int.cpp b/doc/snippets/MatrixBase_zero_int.cpp
new file mode 100644
index 0000000..370a9ba
--- /dev/null
+++ b/doc/snippets/MatrixBase_zero_int.cpp

@@ -0,0 +1,2 @@
+cout << RowVectorXi::Zero(4) << endl;
+cout << VectorXf::Zero(2) << endl;

diff --git a/doc/snippets/MatrixBase_zero_int_int.cpp b/doc/snippets/MatrixBase_zero_int_int.cpp
new file mode 100644
index 0000000..4099c5d
--- /dev/null
+++ b/doc/snippets/MatrixBase_zero_int_int.cpp

@@ -0,0 +1 @@
+cout << MatrixXi::Zero(2,3) << endl;

diff --git a/doc/snippets/Matrix_Map_stride.cpp b/doc/snippets/Matrix_Map_stride.cpp
new file mode 100644
index 0000000..ae42a12
--- /dev/null
+++ b/doc/snippets/Matrix_Map_stride.cpp

@@ -0,0 +1,7 @@
+Matrix4i A;
+A << 1,  2,  3,  4,
+     5,  6,  7,  8,
+     9, 10, 11, 12,
+    13, 14, 15, 16;
+
+std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl;

diff --git a/doc/snippets/Matrix_initializer_list_23_cxx11.cpp b/doc/snippets/Matrix_initializer_list_23_cxx11.cpp
new file mode 100644
index 0000000..60280ab
--- /dev/null
+++ b/doc/snippets/Matrix_initializer_list_23_cxx11.cpp

@@ -0,0 +1,5 @@
+MatrixXd m {
+  {1, 2, 3},
+  {4, 5, 6}
+};
+cout << m << endl;

diff --git a/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp b/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp
new file mode 100644
index 0000000..325257c
--- /dev/null
+++ b/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp

@@ -0,0 +1,2 @@
+VectorXi v {{1, 2}};
+cout << v << endl;

diff --git a/doc/snippets/Matrix_resize_NoChange_int.cpp b/doc/snippets/Matrix_resize_NoChange_int.cpp
new file mode 100644
index 0000000..acdf18c
--- /dev/null
+++ b/doc/snippets/Matrix_resize_NoChange_int.cpp

@@ -0,0 +1,3 @@
+MatrixXd m(3,4);
+m.resize(NoChange, 5);
+cout << "m: " << m.rows() << " rows, " << m.cols() << " cols" << endl;

diff --git a/doc/snippets/Matrix_resize_int.cpp b/doc/snippets/Matrix_resize_int.cpp
new file mode 100644
index 0000000..044c789
--- /dev/null
+++ b/doc/snippets/Matrix_resize_int.cpp

@@ -0,0 +1,6 @@
+VectorXd v(10);
+v.resize(3);
+RowVector3d w;
+w.resize(3); // this is legal, but has no effect
+cout << "v: " << v.rows() << " rows, " << v.cols() << " cols" << endl;
+cout << "w: " << w.rows() << " rows, " << w.cols() << " cols" << endl;

diff --git a/doc/snippets/Matrix_resize_int_NoChange.cpp b/doc/snippets/Matrix_resize_int_NoChange.cpp
new file mode 100644
index 0000000..5c37c90
--- /dev/null
+++ b/doc/snippets/Matrix_resize_int_NoChange.cpp

@@ -0,0 +1,3 @@
+MatrixXd m(3,4);
+m.resize(5, NoChange);
+cout << "m: " << m.rows() << " rows, " << m.cols() << " cols" << endl;

diff --git a/doc/snippets/Matrix_resize_int_int.cpp b/doc/snippets/Matrix_resize_int_int.cpp
new file mode 100644
index 0000000..bfd4741
--- /dev/null
+++ b/doc/snippets/Matrix_resize_int_int.cpp

@@ -0,0 +1,9 @@
+MatrixXd m(2,3);
+m << 1,2,3,4,5,6;
+cout << "here's the 2x3 matrix m:" << endl << m << endl;
+cout << "let's resize m to 3x2. This is a conservative resizing because 2*3==3*2." << endl;
+m.resize(3,2);
+cout << "here's the 3x2 matrix m:" << endl << m << endl;
+cout << "now let's resize m to size 2x2. This is NOT a conservative resizing, so it becomes uninitialized:" << endl;
+m.resize(2,2);
+cout << m << endl;

diff --git a/doc/snippets/Matrix_setConstant_int.cpp b/doc/snippets/Matrix_setConstant_int.cpp
new file mode 100644
index 0000000..ff5a86c
--- /dev/null
+++ b/doc/snippets/Matrix_setConstant_int.cpp

@@ -0,0 +1,3 @@
+VectorXf v;
+v.setConstant(3, 5);
+cout << v << endl;

diff --git a/doc/snippets/Matrix_setConstant_int_int.cpp b/doc/snippets/Matrix_setConstant_int_int.cpp
new file mode 100644
index 0000000..32b950c
--- /dev/null
+++ b/doc/snippets/Matrix_setConstant_int_int.cpp

@@ -0,0 +1,3 @@
+MatrixXf m;
+m.setConstant(3, 3, 5);
+cout << m << endl;

diff --git a/doc/snippets/Matrix_setIdentity_int_int.cpp b/doc/snippets/Matrix_setIdentity_int_int.cpp
new file mode 100644
index 0000000..a659671
--- /dev/null
+++ b/doc/snippets/Matrix_setIdentity_int_int.cpp

@@ -0,0 +1,3 @@
+MatrixXf m;
+m.setIdentity(3, 3);
+cout << m << endl;

diff --git a/doc/snippets/Matrix_setOnes_int.cpp b/doc/snippets/Matrix_setOnes_int.cpp
new file mode 100644
index 0000000..752cb35
--- /dev/null
+++ b/doc/snippets/Matrix_setOnes_int.cpp

@@ -0,0 +1,3 @@
+VectorXf v;
+v.setOnes(3);
+cout << v << endl;

diff --git a/doc/snippets/Matrix_setOnes_int_int.cpp b/doc/snippets/Matrix_setOnes_int_int.cpp
new file mode 100644
index 0000000..1ffb66b
--- /dev/null
+++ b/doc/snippets/Matrix_setOnes_int_int.cpp

@@ -0,0 +1,3 @@
+MatrixXf m;
+m.setOnes(3, 3);
+cout << m << endl;

diff --git a/doc/snippets/Matrix_setRandom_int.cpp b/doc/snippets/Matrix_setRandom_int.cpp
new file mode 100644
index 0000000..e160dd7
--- /dev/null
+++ b/doc/snippets/Matrix_setRandom_int.cpp

@@ -0,0 +1,3 @@
+VectorXf v;
+v.setRandom(3);
+cout << v << endl;

diff --git a/doc/snippets/Matrix_setRandom_int_int.cpp b/doc/snippets/Matrix_setRandom_int_int.cpp
new file mode 100644
index 0000000..80cda11
--- /dev/null
+++ b/doc/snippets/Matrix_setRandom_int_int.cpp

@@ -0,0 +1,3 @@
+MatrixXf m;
+m.setRandom(3, 3);
+cout << m << endl;

diff --git a/doc/snippets/Matrix_setZero_int.cpp b/doc/snippets/Matrix_setZero_int.cpp
new file mode 100644
index 0000000..0fb16c1
--- /dev/null
+++ b/doc/snippets/Matrix_setZero_int.cpp

@@ -0,0 +1,3 @@
+VectorXf v;
+v.setZero(3);
+cout << v << endl;

diff --git a/doc/snippets/Matrix_setZero_int_int.cpp b/doc/snippets/Matrix_setZero_int_int.cpp
new file mode 100644
index 0000000..ad883b9
--- /dev/null
+++ b/doc/snippets/Matrix_setZero_int_int.cpp

@@ -0,0 +1,3 @@
+MatrixXf m;
+m.setZero(3, 3);
+cout << m << endl;

diff --git a/doc/snippets/Matrix_variadic_ctor_cxx11.cpp b/doc/snippets/Matrix_variadic_ctor_cxx11.cpp
new file mode 100644
index 0000000..06d33f5
--- /dev/null
+++ b/doc/snippets/Matrix_variadic_ctor_cxx11.cpp

@@ -0,0 +1,3 @@
+Matrix<int, 1, 6> a(1, 2, 3, 4, 5, 6);
+Matrix<int, 3, 1> b {1, 2, 3};
+cout << a << "\n\n" << b << endl;

diff --git a/doc/snippets/PartialPivLU_solve.cpp b/doc/snippets/PartialPivLU_solve.cpp
new file mode 100644
index 0000000..fa3570a
--- /dev/null
+++ b/doc/snippets/PartialPivLU_solve.cpp

@@ -0,0 +1,7 @@
+MatrixXd A = MatrixXd::Random(3,3);
+MatrixXd B = MatrixXd::Random(3,2);
+cout << "Here is the invertible matrix A:" << endl << A << endl;
+cout << "Here is the matrix B:" << endl << B << endl;
+MatrixXd X = A.lu().solve(B);
+cout << "Here is the (unique) solution X to the equation AX=B:" << endl << X << endl;
+cout << "Relative error: " << (A*X-B).norm() / B.norm() << endl;

diff --git a/doc/snippets/PartialRedux_count.cpp b/doc/snippets/PartialRedux_count.cpp
new file mode 100644
index 0000000..1c3b3a2
--- /dev/null
+++ b/doc/snippets/PartialRedux_count.cpp

@@ -0,0 +1,5 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+Matrix<ptrdiff_t, 3, 1> res = (m.array() >= 0.5).rowwise().count();
+cout << "Here is the count of elements larger or equal than 0.5 of each row:" << endl;
+cout << res << endl;

diff --git a/doc/snippets/PartialRedux_maxCoeff.cpp b/doc/snippets/PartialRedux_maxCoeff.cpp
new file mode 100644
index 0000000..e8fd382
--- /dev/null
+++ b/doc/snippets/PartialRedux_maxCoeff.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the maximum of each column:" << endl << m.colwise().maxCoeff() << endl;

diff --git a/doc/snippets/PartialRedux_minCoeff.cpp b/doc/snippets/PartialRedux_minCoeff.cpp
new file mode 100644
index 0000000..d717bc0
--- /dev/null
+++ b/doc/snippets/PartialRedux_minCoeff.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the minimum of each column:" << endl << m.colwise().minCoeff() << endl;

diff --git a/doc/snippets/PartialRedux_norm.cpp b/doc/snippets/PartialRedux_norm.cpp
new file mode 100644
index 0000000..dbcf290
--- /dev/null
+++ b/doc/snippets/PartialRedux_norm.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the norm of each column:" << endl << m.colwise().norm() << endl;

diff --git a/doc/snippets/PartialRedux_prod.cpp b/doc/snippets/PartialRedux_prod.cpp
new file mode 100644
index 0000000..aacf09c
--- /dev/null
+++ b/doc/snippets/PartialRedux_prod.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the product of each row:" << endl << m.rowwise().prod() << endl;

diff --git a/doc/snippets/PartialRedux_squaredNorm.cpp b/doc/snippets/PartialRedux_squaredNorm.cpp
new file mode 100644
index 0000000..9f3293e
--- /dev/null
+++ b/doc/snippets/PartialRedux_squaredNorm.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the square norm of each row:" << endl << m.rowwise().squaredNorm() << endl;

diff --git a/doc/snippets/PartialRedux_sum.cpp b/doc/snippets/PartialRedux_sum.cpp
new file mode 100644
index 0000000..ec82d3e
--- /dev/null
+++ b/doc/snippets/PartialRedux_sum.cpp

@@ -0,0 +1,3 @@
+Matrix3d m = Matrix3d::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the sum of each row:" << endl << m.rowwise().sum() << endl;

diff --git a/doc/snippets/RealQZ_compute.cpp b/doc/snippets/RealQZ_compute.cpp
new file mode 100644
index 0000000..a18da42
--- /dev/null
+++ b/doc/snippets/RealQZ_compute.cpp

@@ -0,0 +1,17 @@
+MatrixXf A = MatrixXf::Random(4,4);
+MatrixXf B = MatrixXf::Random(4,4);
+RealQZ<MatrixXf> qz(4); // preallocate space for 4x4 matrices
+qz.compute(A,B);  // A = Q S Z,  B = Q T Z
+
+// print original matrices and result of decomposition
+cout << "A:\n" << A << "\n" << "B:\n" << B << "\n";
+cout << "S:\n" << qz.matrixS() << "\n" << "T:\n" << qz.matrixT() << "\n";
+cout << "Q:\n" << qz.matrixQ() << "\n" << "Z:\n" << qz.matrixZ() << "\n";
+
+// verify precision
+cout << "\nErrors:"
+  << "\n|A-QSZ|: " << (A-qz.matrixQ()*qz.matrixS()*qz.matrixZ()).norm()
+  << ", |B-QTZ|: " << (B-qz.matrixQ()*qz.matrixT()*qz.matrixZ()).norm()
+  << "\n|QQ* - I|: " << (qz.matrixQ()*qz.matrixQ().adjoint() - MatrixXf::Identity(4,4)).norm()
+  << ", |ZZ* - I|: " << (qz.matrixZ()*qz.matrixZ().adjoint() - MatrixXf::Identity(4,4)).norm()
+  << "\n";

diff --git a/doc/snippets/RealSchur_RealSchur_MatrixType.cpp b/doc/snippets/RealSchur_RealSchur_MatrixType.cpp
new file mode 100644
index 0000000..a5530dc
--- /dev/null
+++ b/doc/snippets/RealSchur_RealSchur_MatrixType.cpp

@@ -0,0 +1,10 @@
+MatrixXd A = MatrixXd::Random(6,6);
+cout << "Here is a random 6x6 matrix, A:" << endl << A << endl << endl;
+
+RealSchur<MatrixXd> schur(A);
+cout << "The orthogonal matrix U is:" << endl << schur.matrixU() << endl;
+cout << "The quasi-triangular matrix T is:" << endl << schur.matrixT() << endl << endl;
+
+MatrixXd U = schur.matrixU();
+MatrixXd T = schur.matrixT();
+cout << "U * T * U^T = " << endl << U * T * U.transpose() << endl;

diff --git a/doc/snippets/RealSchur_compute.cpp b/doc/snippets/RealSchur_compute.cpp
new file mode 100644
index 0000000..20c2611
--- /dev/null
+++ b/doc/snippets/RealSchur_compute.cpp

@@ -0,0 +1,6 @@
+MatrixXf A = MatrixXf::Random(4,4);
+RealSchur<MatrixXf> schur(4);
+schur.compute(A, /* computeU = */ false);
+cout << "The matrix T in the decomposition of A is:" << endl << schur.matrixT() << endl;
+schur.compute(A.inverse(), /* computeU = */ false);
+cout << "The matrix T in the decomposition of A^(-1) is:" << endl << schur.matrixT() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp b/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp
new file mode 100644
index 0000000..73a7f62
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp

@@ -0,0 +1,7 @@
+SelfAdjointEigenSolver<Matrix4f> es;
+Matrix4f X = Matrix4f::Random(4,4);
+Matrix4f A = X + X.transpose();
+es.compute(A);
+cout << "The eigenvalues of A are: " << es.eigenvalues().transpose() << endl;
+es.compute(A + Matrix4f::Identity(4,4)); // re-use es to compute eigenvalues of A+I
+cout << "The eigenvalues of A+I are: " << es.eigenvalues().transpose() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.cpp b/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.cpp
new file mode 100644
index 0000000..3599b17
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.cpp

@@ -0,0 +1,17 @@
+MatrixXd X = MatrixXd::Random(5,5);
+MatrixXd A = X + X.transpose();
+cout << "Here is a random symmetric 5x5 matrix, A:" << endl << A << endl << endl;
+
+SelfAdjointEigenSolver<MatrixXd> es(A);
+cout << "The eigenvalues of A are:" << endl << es.eigenvalues() << endl;
+cout << "The matrix of eigenvectors, V, is:" << endl << es.eigenvectors() << endl << endl;
+
+double lambda = es.eigenvalues()[0];
+cout << "Consider the first eigenvalue, lambda = " << lambda << endl;
+VectorXd v = es.eigenvectors().col(0);
+cout << "If v is the corresponding eigenvector, then lambda * v = " << endl << lambda * v << endl;
+cout << "... and A * v = " << endl << A * v << endl << endl;
+
+MatrixXd D = es.eigenvalues().asDiagonal();
+MatrixXd V = es.eigenvectors();
+cout << "Finally, V * D * V^(-1) = " << endl << V * D * V.inverse() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp b/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp
new file mode 100644
index 0000000..bbb821e
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp

@@ -0,0 +1,16 @@
+MatrixXd X = MatrixXd::Random(5,5);
+MatrixXd A = X + X.transpose();
+cout << "Here is a random symmetric matrix, A:" << endl << A << endl;
+X = MatrixXd::Random(5,5);
+MatrixXd B = X * X.transpose();
+cout << "and a random postive-definite matrix, B:" << endl << B << endl << endl;
+
+GeneralizedSelfAdjointEigenSolver<MatrixXd> es(A,B);
+cout << "The eigenvalues of the pencil (A,B) are:" << endl << es.eigenvalues() << endl;
+cout << "The matrix of eigenvectors, V, is:" << endl << es.eigenvectors() << endl << endl;
+
+double lambda = es.eigenvalues()[0];
+cout << "Consider the first eigenvalue, lambda = " << lambda << endl;
+VectorXd v = es.eigenvectors().col(0);
+cout << "If v is the corresponding eigenvector, then A * v = " << endl << A * v << endl;
+cout << "... and lambda * B * v = " << endl << lambda * B * v << endl << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_compute_MatrixType.cpp b/doc/snippets/SelfAdjointEigenSolver_compute_MatrixType.cpp
new file mode 100644
index 0000000..2975cc3
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_compute_MatrixType.cpp

@@ -0,0 +1,7 @@
+SelfAdjointEigenSolver<MatrixXf> es(4);
+MatrixXf X = MatrixXf::Random(4,4);
+MatrixXf A = X + X.transpose();
+es.compute(A);
+cout << "The eigenvalues of A are: " << es.eigenvalues().transpose() << endl;
+es.compute(A + MatrixXf::Identity(4,4)); // re-use es to compute eigenvalues of A+I
+cout << "The eigenvalues of A+I are: " << es.eigenvalues().transpose() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_compute_MatrixType2.cpp b/doc/snippets/SelfAdjointEigenSolver_compute_MatrixType2.cpp
new file mode 100644
index 0000000..07c92a1
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_compute_MatrixType2.cpp

@@ -0,0 +1,9 @@
+MatrixXd X = MatrixXd::Random(5,5);
+MatrixXd A = X * X.transpose();
+X = MatrixXd::Random(5,5);
+MatrixXd B = X * X.transpose();
+
+GeneralizedSelfAdjointEigenSolver<MatrixXd> es(A,B,EigenvaluesOnly);
+cout << "The eigenvalues of the pencil (A,B) are:" << endl << es.eigenvalues() << endl;
+es.compute(B,A,false);
+cout << "The eigenvalues of the pencil (B,A) are:" << endl << es.eigenvalues() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_eigenvalues.cpp b/doc/snippets/SelfAdjointEigenSolver_eigenvalues.cpp
new file mode 100644
index 0000000..0ff33c6
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_eigenvalues.cpp

@@ -0,0 +1,4 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+SelfAdjointEigenSolver<MatrixXd> es(ones);
+cout << "The eigenvalues of the 3x3 matrix of ones are:" 
+     << endl << es.eigenvalues() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp
new file mode 100644
index 0000000..94b0d6e
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp

@@ -0,0 +1,4 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+SelfAdjointEigenSolver<MatrixXd> es(ones);
+cout << "The first eigenvector of the 3x3 matrix of ones is:" 
+     << endl << es.eigenvectors().col(0) << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_operatorInverseSqrt.cpp b/doc/snippets/SelfAdjointEigenSolver_operatorInverseSqrt.cpp
new file mode 100644
index 0000000..114c65f
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_operatorInverseSqrt.cpp

@@ -0,0 +1,9 @@
+MatrixXd X = MatrixXd::Random(4,4);
+MatrixXd A = X * X.transpose();
+cout << "Here is a random positive-definite matrix, A:" << endl << A << endl << endl;
+
+SelfAdjointEigenSolver<MatrixXd> es(A);
+cout << "The inverse square root of A is: " << endl;
+cout << es.operatorInverseSqrt() << endl;
+cout << "We can also compute it with operatorSqrt() and inverse(). That yields: " << endl;
+cout << es.operatorSqrt().inverse() << endl;

diff --git a/doc/snippets/SelfAdjointEigenSolver_operatorSqrt.cpp b/doc/snippets/SelfAdjointEigenSolver_operatorSqrt.cpp
new file mode 100644
index 0000000..eeacca7
--- /dev/null
+++ b/doc/snippets/SelfAdjointEigenSolver_operatorSqrt.cpp

@@ -0,0 +1,8 @@
+MatrixXd X = MatrixXd::Random(4,4);
+MatrixXd A = X * X.transpose();
+cout << "Here is a random positive-definite matrix, A:" << endl << A << endl << endl;
+
+SelfAdjointEigenSolver<MatrixXd> es(A);
+MatrixXd sqrtA = es.operatorSqrt();
+cout << "The square root of A is: " << endl << sqrtA << endl;
+cout << "If we square this, we get: " << endl << sqrtA*sqrtA << endl;

diff --git a/doc/snippets/SelfAdjointView_eigenvalues.cpp b/doc/snippets/SelfAdjointView_eigenvalues.cpp
new file mode 100644
index 0000000..be19867
--- /dev/null
+++ b/doc/snippets/SelfAdjointView_eigenvalues.cpp

@@ -0,0 +1,3 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+VectorXd eivals = ones.selfadjointView<Lower>().eigenvalues();
+cout << "The eigenvalues of the 3x3 matrix of ones are:" << endl << eivals << endl;

diff --git a/doc/snippets/SelfAdjointView_operatorNorm.cpp b/doc/snippets/SelfAdjointView_operatorNorm.cpp
new file mode 100644
index 0000000..f380f55
--- /dev/null
+++ b/doc/snippets/SelfAdjointView_operatorNorm.cpp

@@ -0,0 +1,3 @@
+MatrixXd ones = MatrixXd::Ones(3,3);
+cout << "The operator norm of the 3x3 matrix of ones is "
+     << ones.selfadjointView<Lower>().operatorNorm() << endl;

diff --git a/doc/snippets/Slicing_arrayexpr.cpp b/doc/snippets/Slicing_arrayexpr.cpp
new file mode 100644
index 0000000..2df8180
--- /dev/null
+++ b/doc/snippets/Slicing_arrayexpr.cpp

@@ -0,0 +1,4 @@
+ArrayXi ind(5); ind<<4,2,5,5,3;
+MatrixXi A = MatrixXi::Random(4,6);
+cout << "Initial matrix A:\n" << A << "\n\n";
+cout << "A(all,ind-1):\n" << A(all,ind-1) << "\n\n";

diff --git a/doc/snippets/Slicing_custom_padding_cxx11.cpp b/doc/snippets/Slicing_custom_padding_cxx11.cpp
new file mode 100644
index 0000000..24db98b
--- /dev/null
+++ b/doc/snippets/Slicing_custom_padding_cxx11.cpp

@@ -0,0 +1,12 @@
+struct pad {
+  Index size() const { return out_size; }
+  Index operator[] (Index i) const { return std::max<Index>(0,i-(out_size-in_size)); }
+  Index in_size, out_size;
+};
+
+Matrix3i A;
+A.reshaped() = VectorXi::LinSpaced(9,1,9);
+cout << "Initial matrix A:\n" << A << "\n\n";
+MatrixXi B(5,5);
+B = A(pad{3,5}, pad{3,5});
+cout << "A(pad{3,N}, pad{3,N}):\n" << B << "\n\n";

diff --git a/doc/snippets/Slicing_rawarray_cxx11.cpp b/doc/snippets/Slicing_rawarray_cxx11.cpp
new file mode 100644
index 0000000..1087131
--- /dev/null
+++ b/doc/snippets/Slicing_rawarray_cxx11.cpp

@@ -0,0 +1,5 @@
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+MatrixXi A = MatrixXi::Random(4,6);
+cout << "Initial matrix A:\n" << A << "\n\n";
+cout << "A(all,{4,2,5,5,3}):\n" << A(all,{4,2,5,5,3}) << "\n\n";
+#endif

diff --git a/doc/snippets/Slicing_stdvector_cxx11.cpp b/doc/snippets/Slicing_stdvector_cxx11.cpp
new file mode 100644
index 0000000..555f662
--- /dev/null
+++ b/doc/snippets/Slicing_stdvector_cxx11.cpp

@@ -0,0 +1,4 @@
+std::vector<int> ind{4,2,5,5,3};
+MatrixXi A = MatrixXi::Random(4,6);
+cout << "Initial matrix A:\n" << A << "\n\n";
+cout << "A(all,ind):\n" << A(all,ind) << "\n\n";

diff --git a/doc/snippets/SparseMatrix_coeffs.cpp b/doc/snippets/SparseMatrix_coeffs.cpp
new file mode 100644
index 0000000..f71a69b
--- /dev/null
+++ b/doc/snippets/SparseMatrix_coeffs.cpp

@@ -0,0 +1,9 @@
+SparseMatrix<double> A(3,3);
+A.insert(1,2) = 0;
+A.insert(0,1) = 1;
+A.insert(2,0) = 2;
+A.makeCompressed();
+cout << "The matrix A is:" << endl << MatrixXd(A) << endl;
+cout << "it has " << A.nonZeros() << " stored non zero coefficients that are: " << A.coeffs().transpose() << endl;
+A.coeffs() += 10;
+cout << "After adding 10 to every stored non zero coefficient, the matrix A is:" << endl << MatrixXd(A) << endl;

diff --git a/doc/snippets/TopicAliasing_block.cpp b/doc/snippets/TopicAliasing_block.cpp
new file mode 100644
index 0000000..03282f4
--- /dev/null
+++ b/doc/snippets/TopicAliasing_block.cpp

@@ -0,0 +1,7 @@
+MatrixXi mat(3,3); 
+mat << 1, 2, 3,   4, 5, 6,   7, 8, 9;
+cout << "Here is the matrix mat:\n" << mat << endl;
+
+// This assignment shows the aliasing problem
+mat.bottomRightCorner(2,2) = mat.topLeftCorner(2,2);
+cout << "After the assignment, mat = \n" << mat << endl;

diff --git a/doc/snippets/TopicAliasing_block_correct.cpp b/doc/snippets/TopicAliasing_block_correct.cpp
new file mode 100644
index 0000000..6fee580
--- /dev/null
+++ b/doc/snippets/TopicAliasing_block_correct.cpp

@@ -0,0 +1,7 @@
+MatrixXi mat(3,3); 
+mat << 1, 2, 3,   4, 5, 6,   7, 8, 9;
+cout << "Here is the matrix mat:\n" << mat << endl;
+
+// The eval() solves the aliasing problem
+mat.bottomRightCorner(2,2) = mat.topLeftCorner(2,2).eval();
+cout << "After the assignment, mat = \n" << mat << endl;

diff --git a/doc/snippets/TopicAliasing_cwise.cpp b/doc/snippets/TopicAliasing_cwise.cpp
new file mode 100644
index 0000000..7049f6c
--- /dev/null
+++ b/doc/snippets/TopicAliasing_cwise.cpp

@@ -0,0 +1,20 @@
+MatrixXf mat(2,2); 
+mat << 1, 2,  4, 7;
+cout << "Here is the matrix mat:\n" << mat << endl << endl;
+
+mat = 2 * mat;
+cout << "After 'mat = 2 * mat', mat = \n" << mat << endl << endl;
+
+
+mat = mat - MatrixXf::Identity(2,2);
+cout << "After the subtraction, it becomes\n" << mat << endl << endl;
+
+
+ArrayXXf arr = mat;
+arr = arr.square();
+cout << "After squaring, it becomes\n" << arr << endl << endl;
+
+// Combining all operations in one statement:
+mat << 1, 2,  4, 7;
+mat = (2 * mat - MatrixXf::Identity(2,2)).array().square();
+cout << "Doing everything at once yields\n" << mat << endl << endl;

diff --git a/doc/snippets/TopicAliasing_mult1.cpp b/doc/snippets/TopicAliasing_mult1.cpp
new file mode 100644
index 0000000..cd7e900
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult1.cpp

@@ -0,0 +1,4 @@
+MatrixXf matA(2,2); 
+matA << 2, 0,  0, 2;
+matA = matA * matA;
+cout << matA;

diff --git a/doc/snippets/TopicAliasing_mult2.cpp b/doc/snippets/TopicAliasing_mult2.cpp
new file mode 100644
index 0000000..a3ff568
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult2.cpp

@@ -0,0 +1,10 @@
+MatrixXf matA(2,2), matB(2,2); 
+matA << 2, 0,  0, 2;
+
+// Simple but not quite as efficient
+matB = matA * matA;
+cout << matB << endl << endl;
+
+// More complicated but also more efficient
+matB.noalias() = matA * matA;
+cout << matB;

diff --git a/doc/snippets/TopicAliasing_mult3.cpp b/doc/snippets/TopicAliasing_mult3.cpp
new file mode 100644
index 0000000..1d12a6c
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult3.cpp

@@ -0,0 +1,4 @@
+MatrixXf matA(2,2); 
+matA << 2, 0,  0, 2;
+matA.noalias() = matA * matA;
+cout << matA;

diff --git a/doc/snippets/TopicAliasing_mult4.cpp b/doc/snippets/TopicAliasing_mult4.cpp
new file mode 100644
index 0000000..01c1c6d
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult4.cpp

@@ -0,0 +1,5 @@
+MatrixXf A(2,2), B(3,2);
+B << 2, 0,  0, 3, 1, 1;
+A << 2, 0, 0, -2;
+A = (B * A).cwiseAbs();
+cout << A;

diff --git a/doc/snippets/TopicAliasing_mult5.cpp b/doc/snippets/TopicAliasing_mult5.cpp
new file mode 100644
index 0000000..1a36def
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult5.cpp

@@ -0,0 +1,5 @@
+MatrixXf A(2,2), B(3,2);
+B << 2, 0,  0, 3, 1, 1;
+A << 2, 0, 0, -2;
+A = (B * A).eval().cwiseAbs();
+cout << A;

diff --git a/doc/snippets/TopicStorageOrders_example.cpp b/doc/snippets/TopicStorageOrders_example.cpp
new file mode 100644
index 0000000..0623ef0
--- /dev/null
+++ b/doc/snippets/TopicStorageOrders_example.cpp

@@ -0,0 +1,18 @@
+Matrix<int, 3, 4, ColMajor> Acolmajor;
+Acolmajor << 8, 2, 2, 9,
+             9, 1, 4, 4,
+	     3, 5, 4, 5;
+cout << "The matrix A:" << endl;
+cout << Acolmajor << endl << endl; 
+
+cout << "In memory (column-major):" << endl;
+for (int i = 0; i < Acolmajor.size(); i++)
+  cout << *(Acolmajor.data() + i) << "  ";
+cout << endl << endl;
+
+Matrix<int, 3, 4, RowMajor> Arowmajor = Acolmajor;
+cout << "In memory (row-major):" << endl;
+for (int i = 0; i < Arowmajor.size(); i++)
+  cout << *(Arowmajor.data() + i) << "  ";
+cout << endl;
+

diff --git a/doc/snippets/Triangular_solve.cpp b/doc/snippets/Triangular_solve.cpp
new file mode 100644
index 0000000..5484424
--- /dev/null
+++ b/doc/snippets/Triangular_solve.cpp

@@ -0,0 +1,11 @@
+Matrix3d m = Matrix3d::Zero();
+m.triangularView<Eigen::Upper>().setOnes();
+cout << "Here is the matrix m:\n" << m << endl;
+Matrix3d n = Matrix3d::Ones();
+n.triangularView<Eigen::Lower>() *= 2;
+cout << "Here is the matrix n:\n" << n << endl;
+cout << "And now here is m.inverse()*n, taking advantage of the fact that"
+        " m is upper-triangular:\n"
+     << m.triangularView<Eigen::Upper>().solve(n) << endl;
+cout << "And this is n*m.inverse():\n"
+     << m.triangularView<Eigen::Upper>().solve<Eigen::OnTheRight>(n);

diff --git a/doc/snippets/Tridiagonalization_Tridiagonalization_MatrixType.cpp b/doc/snippets/Tridiagonalization_Tridiagonalization_MatrixType.cpp
new file mode 100644
index 0000000..a260124
--- /dev/null
+++ b/doc/snippets/Tridiagonalization_Tridiagonalization_MatrixType.cpp

@@ -0,0 +1,9 @@
+MatrixXd X = MatrixXd::Random(5,5);
+MatrixXd A = X + X.transpose();
+cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl;
+Tridiagonalization<MatrixXd> triOfA(A);
+MatrixXd Q = triOfA.matrixQ();
+cout << "The orthogonal matrix Q is:" << endl << Q << endl;
+MatrixXd T = triOfA.matrixT();
+cout << "The tridiagonal matrix T is:" << endl << T << endl << endl;
+cout << "Q * T * Q^T = " << endl << Q * T * Q.transpose() << endl;

diff --git a/doc/snippets/Tridiagonalization_compute.cpp b/doc/snippets/Tridiagonalization_compute.cpp
new file mode 100644
index 0000000..0062a99
--- /dev/null
+++ b/doc/snippets/Tridiagonalization_compute.cpp

@@ -0,0 +1,9 @@
+Tridiagonalization<MatrixXf> tri;
+MatrixXf X = MatrixXf::Random(4,4);
+MatrixXf A = X + X.transpose();
+tri.compute(A);
+cout << "The matrix T in the tridiagonal decomposition of A is: " << endl;
+cout << tri.matrixT() << endl;
+tri.compute(2*A); // re-use tri to compute eigenvalues of 2A
+cout << "The matrix T in the tridiagonal decomposition of 2A is: " << endl;
+cout << tri.matrixT() << endl;

diff --git a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp
new file mode 100644
index 0000000..3cdce67
--- /dev/null
+++ b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp

@@ -0,0 +1,11 @@
+MatrixXd X = MatrixXd::Random(5,5);
+MatrixXd A = X + X.transpose();
+cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl;
+
+VectorXd diag(5);
+VectorXd subdiag(4);
+VectorXd hcoeffs(4);  // Scratch space for householder reflector.
+internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, true);
+cout << "The orthogonal matrix Q is:" << endl << A << endl;
+cout << "The diagonal of the tridiagonal matrix T is:" << endl << diag << endl;
+cout << "The subdiagonal of the tridiagonal matrix T is:" << endl << subdiag << endl;

diff --git a/doc/snippets/Tridiagonalization_diagonal.cpp b/doc/snippets/Tridiagonalization_diagonal.cpp
new file mode 100644
index 0000000..6eec821
--- /dev/null
+++ b/doc/snippets/Tridiagonalization_diagonal.cpp

@@ -0,0 +1,13 @@
+MatrixXcd X = MatrixXcd::Random(4,4);
+MatrixXcd A = X + X.adjoint();
+cout << "Here is a random self-adjoint 4x4 matrix:" << endl << A << endl << endl;
+
+Tridiagonalization<MatrixXcd> triOfA(A);
+MatrixXd T = triOfA.matrixT();
+cout << "The tridiagonal matrix T is:" << endl << T << endl << endl;
+
+cout << "We can also extract the diagonals of T directly ..." << endl;
+VectorXd diag = triOfA.diagonal();
+cout << "The diagonal is:" << endl << diag << endl; 
+VectorXd subdiag = triOfA.subDiagonal();
+cout << "The subdiagonal is:" << endl << subdiag << endl;

diff --git a/doc/snippets/Tridiagonalization_householderCoefficients.cpp b/doc/snippets/Tridiagonalization_householderCoefficients.cpp
new file mode 100644
index 0000000..e5d8728
--- /dev/null
+++ b/doc/snippets/Tridiagonalization_householderCoefficients.cpp

@@ -0,0 +1,6 @@
+Matrix4d X = Matrix4d::Random(4,4);
+Matrix4d A = X + X.transpose();
+cout << "Here is a random symmetric 4x4 matrix:" << endl << A << endl;
+Tridiagonalization<Matrix4d> triOfA(A);
+Vector3d hc = triOfA.householderCoefficients();
+cout << "The vector of Householder coefficients is:" << endl << hc << endl;

diff --git a/doc/snippets/Tridiagonalization_packedMatrix.cpp b/doc/snippets/Tridiagonalization_packedMatrix.cpp
new file mode 100644
index 0000000..0f55d0c
--- /dev/null
+++ b/doc/snippets/Tridiagonalization_packedMatrix.cpp

@@ -0,0 +1,8 @@
+Matrix4d X = Matrix4d::Random(4,4);
+Matrix4d A = X + X.transpose();
+cout << "Here is a random symmetric 4x4 matrix:" << endl << A << endl;
+Tridiagonalization<Matrix4d> triOfA(A);
+Matrix4d pm = triOfA.packedMatrix();
+cout << "The packed matrix M is:" << endl << pm << endl;
+cout << "The diagonal and subdiagonal corresponds to the matrix T, which is:" 
+     << endl << triOfA.matrixT() << endl;

diff --git a/doc/snippets/Tutorial_AdvancedInitialization_Block.cpp b/doc/snippets/Tutorial_AdvancedInitialization_Block.cpp
new file mode 100644
index 0000000..96e40ac
--- /dev/null
+++ b/doc/snippets/Tutorial_AdvancedInitialization_Block.cpp

@@ -0,0 +1,5 @@
+MatrixXf matA(2, 2);
+matA << 1, 2, 3, 4;
+MatrixXf matB(4, 4);
+matB << matA, matA/10, matA/10, matA;
+std::cout << matB << std::endl;

diff --git a/doc/snippets/Tutorial_AdvancedInitialization_CommaTemporary.cpp b/doc/snippets/Tutorial_AdvancedInitialization_CommaTemporary.cpp
new file mode 100644
index 0000000..50cff4c
--- /dev/null
+++ b/doc/snippets/Tutorial_AdvancedInitialization_CommaTemporary.cpp

@@ -0,0 +1,4 @@
+MatrixXf mat = MatrixXf::Random(2, 3);
+std::cout << mat << std::endl << std::endl;
+mat = (MatrixXf(2,2) << 0, 1, 1, 0).finished() * mat;
+std::cout << mat << std::endl;

diff --git a/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp b/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp
new file mode 100644
index 0000000..55a2153
--- /dev/null
+++ b/doc/snippets/Tutorial_AdvancedInitialization_Join.cpp

@@ -0,0 +1,11 @@
+RowVectorXd vec1(3);
+vec1 << 1, 2, 3;
+std::cout << "vec1 = " << vec1 << std::endl;
+
+RowVectorXd vec2(4);
+vec2 << 1, 4, 9, 16;
+std::cout << "vec2 = " << vec2 << std::endl;
+
+RowVectorXd joined(7);
+joined << vec1, vec2;
+std::cout << "joined = " << joined << std::endl;

diff --git a/doc/snippets/Tutorial_AdvancedInitialization_LinSpaced.cpp b/doc/snippets/Tutorial_AdvancedInitialization_LinSpaced.cpp
new file mode 100644
index 0000000..c6a73ab
--- /dev/null
+++ b/doc/snippets/Tutorial_AdvancedInitialization_LinSpaced.cpp

@@ -0,0 +1,7 @@
+ArrayXXf table(10, 4);
+table.col(0) = ArrayXf::LinSpaced(10, 0, 90);
+table.col(1) = M_PI / 180 * table.col(0);
+table.col(2) = table.col(1).sin();
+table.col(3) = table.col(1).cos();
+std::cout << "  Degrees   Radians      Sine    Cosine\n";
+std::cout << table << std::endl;

diff --git a/doc/snippets/Tutorial_AdvancedInitialization_ThreeWays.cpp b/doc/snippets/Tutorial_AdvancedInitialization_ThreeWays.cpp
new file mode 100644
index 0000000..cb74576
--- /dev/null
+++ b/doc/snippets/Tutorial_AdvancedInitialization_ThreeWays.cpp

@@ -0,0 +1,20 @@
+const int size = 6;
+MatrixXd mat1(size, size);
+mat1.topLeftCorner(size/2, size/2)     = MatrixXd::Zero(size/2, size/2);
+mat1.topRightCorner(size/2, size/2)    = MatrixXd::Identity(size/2, size/2);
+mat1.bottomLeftCorner(size/2, size/2)  = MatrixXd::Identity(size/2, size/2);
+mat1.bottomRightCorner(size/2, size/2) = MatrixXd::Zero(size/2, size/2);
+std::cout << mat1 << std::endl << std::endl;
+
+MatrixXd mat2(size, size);
+mat2.topLeftCorner(size/2, size/2).setZero();
+mat2.topRightCorner(size/2, size/2).setIdentity();
+mat2.bottomLeftCorner(size/2, size/2).setIdentity();
+mat2.bottomRightCorner(size/2, size/2).setZero();
+std::cout << mat2 << std::endl << std::endl;
+
+MatrixXd mat3(size, size);
+mat3 << MatrixXd::Zero(size/2, size/2), MatrixXd::Identity(size/2, size/2),
+        MatrixXd::Identity(size/2, size/2), MatrixXd::Zero(size/2, size/2);
+std::cout << mat3 << std::endl;
+

diff --git a/doc/snippets/Tutorial_AdvancedInitialization_Zero.cpp b/doc/snippets/Tutorial_AdvancedInitialization_Zero.cpp
new file mode 100644
index 0000000..76a36a3
--- /dev/null
+++ b/doc/snippets/Tutorial_AdvancedInitialization_Zero.cpp

@@ -0,0 +1,13 @@
+std::cout << "A fixed-size array:\n";
+Array33f a1 = Array33f::Zero();
+std::cout << a1 << "\n\n";
+
+
+std::cout << "A one-dimensional dynamic-size array:\n";
+ArrayXf a2 = ArrayXf::Zero(3);
+std::cout << a2 << "\n\n";
+
+
+std::cout << "A two-dimensional dynamic-size array:\n";
+ArrayXXf a3 = ArrayXXf::Zero(3, 4);
+std::cout << a3 << "\n";

diff --git a/doc/snippets/Tutorial_Map_rowmajor.cpp b/doc/snippets/Tutorial_Map_rowmajor.cpp
new file mode 100644
index 0000000..fd45ace
--- /dev/null
+++ b/doc/snippets/Tutorial_Map_rowmajor.cpp

@@ -0,0 +1,7 @@
+int array[8];
+for(int i = 0; i < 8; ++i) array[i] = i;
+cout << "Column-major:\n" << Map<Matrix<int,2,4> >(array) << endl;
+cout << "Row-major:\n" << Map<Matrix<int,2,4,RowMajor> >(array) << endl;
+cout << "Row-major using stride:\n" <<
+  Map<Matrix<int,2,4>, Unaligned, Stride<1,4> >(array) << endl;
+

diff --git a/doc/snippets/Tutorial_Map_using.cpp b/doc/snippets/Tutorial_Map_using.cpp
new file mode 100644
index 0000000..e5e499f
--- /dev/null
+++ b/doc/snippets/Tutorial_Map_using.cpp

@@ -0,0 +1,21 @@
+typedef Matrix<float,1,Dynamic> MatrixType;
+typedef Map<MatrixType> MapType;
+typedef Map<const MatrixType> MapTypeConst;   // a read-only map
+const int n_dims = 5;
+  
+MatrixType m1(n_dims), m2(n_dims);
+m1.setRandom();
+m2.setRandom();
+float *p = &m2(0);  // get the address storing the data for m2
+MapType m2map(p,m2.size());   // m2map shares data with m2
+MapTypeConst m2mapconst(p,m2.size());  // a read-only accessor for m2
+
+cout << "m1: " << m1 << endl;
+cout << "m2: " << m2 << endl;
+cout << "Squared euclidean distance: " << (m1-m2).squaredNorm() << endl;
+cout << "Squared euclidean distance, using map: " <<
+  (m1-m2map).squaredNorm() << endl;
+m2map(3) = 7;   // this will change m2, since they share the same array
+cout << "Updated m2: " << m2 << endl;
+cout << "m2 coefficient 2, constant accessor: " << m2mapconst(2) << endl;
+/* m2mapconst(2) = 5; */   // this yields a compile-time error

diff --git a/doc/snippets/Tutorial_ReshapeMat2Mat.cpp b/doc/snippets/Tutorial_ReshapeMat2Mat.cpp
new file mode 100644
index 0000000..737afec
--- /dev/null
+++ b/doc/snippets/Tutorial_ReshapeMat2Mat.cpp

@@ -0,0 +1,6 @@
+MatrixXf M1(2,6);    // Column-major storage
+M1 << 1, 2, 3,  4,  5,  6,
+      7, 8, 9, 10, 11, 12;
+
+Map<MatrixXf> M2(M1.data(), 6,2);
+cout << "M2:" << endl << M2 << endl;

diff --git a/doc/snippets/Tutorial_ReshapeMat2Vec.cpp b/doc/snippets/Tutorial_ReshapeMat2Vec.cpp
new file mode 100644
index 0000000..32980a7
--- /dev/null
+++ b/doc/snippets/Tutorial_ReshapeMat2Vec.cpp

@@ -0,0 +1,11 @@
+MatrixXf M1(3,3);    // Column-major storage
+M1 << 1, 2, 3,
+      4, 5, 6,
+      7, 8, 9;
+
+Map<RowVectorXf> v1(M1.data(), M1.size());
+cout << "v1:" << endl << v1 << endl;
+
+Matrix<float,Dynamic,Dynamic,RowMajor> M2(M1);
+Map<RowVectorXf> v2(M2.data(), M2.size());
+cout << "v2:" << endl << v2 << endl;

diff --git a/doc/snippets/Tutorial_SlicingCol.cpp b/doc/snippets/Tutorial_SlicingCol.cpp
new file mode 100644
index 0000000..695d130
--- /dev/null
+++ b/doc/snippets/Tutorial_SlicingCol.cpp

@@ -0,0 +1,11 @@
+MatrixXf M1 = MatrixXf::Random(3,8);
+cout << "Column major input:" << endl << M1 << "\n";
+Map<MatrixXf,0,OuterStride<> > M2(M1.data(), M1.rows(), (M1.cols()+2)/3, OuterStride<>(M1.outerStride()*3));
+cout << "1 column over 3:" << endl << M2 << "\n";
+
+typedef Matrix<float,Dynamic,Dynamic,RowMajor> RowMajorMatrixXf;
+RowMajorMatrixXf M3(M1);
+cout << "Row major input:" << endl << M3 << "\n";
+Map<RowMajorMatrixXf,0,Stride<Dynamic,3> > M4(M3.data(), M3.rows(), (M3.cols()+2)/3,
+                                              Stride<Dynamic,3>(M3.outerStride(),3));
+cout << "1 column over 3:" << endl << M4 << "\n";

diff --git a/doc/snippets/Tutorial_SlicingVec.cpp b/doc/snippets/Tutorial_SlicingVec.cpp
new file mode 100644
index 0000000..9b82246
--- /dev/null
+++ b/doc/snippets/Tutorial_SlicingVec.cpp

@@ -0,0 +1,4 @@
+RowVectorXf v = RowVectorXf::LinSpaced(20,0,19);
+cout << "Input:" << endl << v << endl;
+Map<RowVectorXf,0,InnerStride<2> > v2(v.data(), v.size()/2);
+cout << "Even:" << v2 << endl;

diff --git a/doc/snippets/Tutorial_commainit_01.cpp b/doc/snippets/Tutorial_commainit_01.cpp
new file mode 100644
index 0000000..47ba31d
--- /dev/null
+++ b/doc/snippets/Tutorial_commainit_01.cpp

@@ -0,0 +1,5 @@
+Matrix3f m;
+m << 1, 2, 3,
+     4, 5, 6,
+     7, 8, 9;
+std::cout << m;

diff --git a/doc/snippets/Tutorial_commainit_01b.cpp b/doc/snippets/Tutorial_commainit_01b.cpp
new file mode 100644
index 0000000..2adb2e2
--- /dev/null
+++ b/doc/snippets/Tutorial_commainit_01b.cpp

@@ -0,0 +1,5 @@
+Matrix3f m;
+m.row(0) << 1, 2, 3;
+m.block(1,0,2,2) << 4, 5, 7, 8;
+m.col(2).tail(2) << 6, 9;		    
+std::cout << m;

diff --git a/doc/snippets/Tutorial_commainit_02.cpp b/doc/snippets/Tutorial_commainit_02.cpp
new file mode 100644
index 0000000..c960d6a
--- /dev/null
+++ b/doc/snippets/Tutorial_commainit_02.cpp

@@ -0,0 +1,7 @@
+int rows=5, cols=5;
+MatrixXf m(rows,cols);
+m << (Matrix3f() << 1, 2, 3, 4, 5, 6, 7, 8, 9).finished(),
+     MatrixXf::Zero(3,cols-3),
+     MatrixXf::Zero(rows-3,3),
+     MatrixXf::Identity(rows-3,cols-3);
+cout << m;

diff --git a/doc/snippets/Tutorial_range_for_loop_1d_cxx11.cpp b/doc/snippets/Tutorial_range_for_loop_1d_cxx11.cpp
new file mode 100644
index 0000000..e72e715
--- /dev/null
+++ b/doc/snippets/Tutorial_range_for_loop_1d_cxx11.cpp

@@ -0,0 +1,4 @@
+VectorXi v = VectorXi::Random(4);
+cout << "Here is the vector v:\n";
+for(auto x : v) cout << x << " ";
+cout << "\n";

diff --git a/doc/snippets/Tutorial_range_for_loop_2d_cxx11.cpp b/doc/snippets/Tutorial_range_for_loop_2d_cxx11.cpp
new file mode 100644
index 0000000..4a12d26
--- /dev/null
+++ b/doc/snippets/Tutorial_range_for_loop_2d_cxx11.cpp

@@ -0,0 +1,5 @@
+Matrix2i A = Matrix2i::Random();
+cout << "Here are the coeffs of the 2x2 matrix A:\n";
+for(auto x : A.reshaped())
+  cout << x << " ";
+cout << "\n";

diff --git a/doc/snippets/Tutorial_reshaped_vs_resize_1.cpp b/doc/snippets/Tutorial_reshaped_vs_resize_1.cpp
new file mode 100644
index 0000000..e520e8e
--- /dev/null
+++ b/doc/snippets/Tutorial_reshaped_vs_resize_1.cpp

@@ -0,0 +1,5 @@
+MatrixXi m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, 8):" << endl << m.reshaped(2, 8) << endl;
+m.resize(2,8);
+cout << "Here is the matrix m after m.resize(2,8):" << endl << m << endl;

diff --git a/doc/snippets/Tutorial_reshaped_vs_resize_2.cpp b/doc/snippets/Tutorial_reshaped_vs_resize_2.cpp
new file mode 100644
index 0000000..50dc454
--- /dev/null
+++ b/doc/snippets/Tutorial_reshaped_vs_resize_2.cpp

@@ -0,0 +1,6 @@
+Matrix<int,Dynamic,Dynamic,RowMajor> m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, 8):" << endl << m.reshaped(2, 8) << endl;
+cout << "Here is m.reshaped<AutoOrder>(2, 8):" << endl << m.reshaped<AutoOrder>(2, 8) << endl;
+m.resize(2,8);
+cout << "Here is the matrix m after m.resize(2,8):" << endl << m << endl;

diff --git a/doc/snippets/Tutorial_solve_matrix_inverse.cpp b/doc/snippets/Tutorial_solve_matrix_inverse.cpp
new file mode 100644
index 0000000..fff3244
--- /dev/null
+++ b/doc/snippets/Tutorial_solve_matrix_inverse.cpp

@@ -0,0 +1,6 @@
+Matrix3f A;
+Vector3f b;
+A << 1,2,3,  4,5,6,  7,8,10;
+b << 3, 3, 4;
+Vector3f x = A.inverse() * b;
+cout << "The solution is:" << endl << x << endl;

diff --git a/doc/snippets/Tutorial_solve_multiple_rhs.cpp b/doc/snippets/Tutorial_solve_multiple_rhs.cpp
new file mode 100644
index 0000000..5411a44
--- /dev/null
+++ b/doc/snippets/Tutorial_solve_multiple_rhs.cpp

@@ -0,0 +1,10 @@
+Matrix3f A(3,3);
+A << 1,2,3,  4,5,6,  7,8,10;
+Matrix<float,3,2> B;
+B << 3,1, 3,1, 4,1;
+Matrix<float,3,2> X;
+X = A.fullPivLu().solve(B);
+cout << "The solution with right-hand side (3,3,4) is:" << endl;
+cout << X.col(0) << endl;
+cout << "The solution with right-hand side (1,1,1) is:" << endl;
+cout << X.col(1) << endl;

diff --git a/doc/snippets/Tutorial_solve_reuse_decomposition.cpp b/doc/snippets/Tutorial_solve_reuse_decomposition.cpp
new file mode 100644
index 0000000..3ca0645
--- /dev/null
+++ b/doc/snippets/Tutorial_solve_reuse_decomposition.cpp

@@ -0,0 +1,13 @@
+Matrix3f A(3,3);
+A << 1,2,3,  4,5,6,  7,8,10;
+PartialPivLU<Matrix3f> luOfA(A); // compute LU decomposition of A
+Vector3f b;
+b << 3,3,4;
+Vector3f x;
+x = luOfA.solve(b);
+cout << "The solution with right-hand side (3,3,4) is:" << endl;
+cout << x << endl;
+b << 1,1,1;
+x = luOfA.solve(b);
+cout << "The solution with right-hand side (1,1,1) is:" << endl;
+cout << x << endl;

diff --git a/doc/snippets/Tutorial_solve_singular.cpp b/doc/snippets/Tutorial_solve_singular.cpp
new file mode 100644
index 0000000..abff1ef
--- /dev/null
+++ b/doc/snippets/Tutorial_solve_singular.cpp

@@ -0,0 +1,9 @@
+Matrix3f A;
+Vector3f b;
+A << 1,2,3,  4,5,6,  7,8,9;
+b << 3, 3, 4;
+cout << "Here is the matrix A:" << endl << A << endl;
+cout << "Here is the vector b:" << endl << b << endl;
+Vector3f x;
+x = A.lu().solve(b);
+cout << "The solution is:" << endl << x << endl;

diff --git a/doc/snippets/Tutorial_solve_triangular.cpp b/doc/snippets/Tutorial_solve_triangular.cpp
new file mode 100644
index 0000000..9d13f22
--- /dev/null
+++ b/doc/snippets/Tutorial_solve_triangular.cpp

@@ -0,0 +1,8 @@
+Matrix3f A;
+Vector3f b;
+A << 1,2,3,  0,5,6,  0,0,10;
+b << 3, 3, 4;
+cout << "Here is the matrix A:" << endl << A << endl;
+cout << "Here is the vector b:" << endl << b << endl;
+Vector3f x = A.triangularView<Upper>().solve(b);
+cout << "The solution is:" << endl << x << endl;

diff --git a/doc/snippets/Tutorial_solve_triangular_inplace.cpp b/doc/snippets/Tutorial_solve_triangular_inplace.cpp
new file mode 100644
index 0000000..16ae633
--- /dev/null
+++ b/doc/snippets/Tutorial_solve_triangular_inplace.cpp

@@ -0,0 +1,6 @@
+Matrix3f A;
+Vector3f b;
+A << 1,2,3,  0,5,6,  0,0,10;
+b << 3, 3, 4;
+A.triangularView<Upper>().solveInPlace(b);
+cout << "The solution is:" << endl << b << endl;

diff --git a/doc/snippets/Tutorial_std_sort.cpp b/doc/snippets/Tutorial_std_sort.cpp
new file mode 100644
index 0000000..cde2a6f
--- /dev/null
+++ b/doc/snippets/Tutorial_std_sort.cpp

@@ -0,0 +1,4 @@
+Array4i v = Array4i::Random().abs();
+cout << "Here is the initial vector v:\n" << v.transpose() << "\n";
+std::sort(v.begin(), v.end());
+cout << "Here is the sorted vector v:\n" << v.transpose() << "\n";

diff --git a/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp b/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp
new file mode 100644
index 0000000..0364160
--- /dev/null
+++ b/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp

@@ -0,0 +1,5 @@
+ArrayXXi A = ArrayXXi::Random(4,4).abs();
+cout << "Here is the initial matrix A:\n" << A << "\n";
+for(auto row : A.rowwise())
+  std::sort(row.begin(), row.end());
+cout << "Here is the sorted matrix A:\n" << A << "\n";

diff --git a/doc/snippets/VectorwiseOp_homogeneous.cpp b/doc/snippets/VectorwiseOp_homogeneous.cpp
new file mode 100644
index 0000000..67cf573
--- /dev/null
+++ b/doc/snippets/VectorwiseOp_homogeneous.cpp

@@ -0,0 +1,6 @@
+Matrix3Xd M = Matrix3Xd::Random(3,5);
+Projective3d P(Matrix4d::Random());
+cout << "The matrix M is:" << endl << M << endl << endl;
+cout << "M.colwise().homogeneous():" << endl << M.colwise().homogeneous() << endl << endl;
+cout << "P * M.colwise().homogeneous():" << endl << P * M.colwise().homogeneous() << endl << endl;
+cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;

diff --git a/doc/snippets/Vectorwise_reverse.cpp b/doc/snippets/Vectorwise_reverse.cpp
new file mode 100644
index 0000000..2f6a350
--- /dev/null
+++ b/doc/snippets/Vectorwise_reverse.cpp

@@ -0,0 +1,10 @@
+MatrixXi m = MatrixXi::Random(3,4);
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the rowwise reverse of m:" << endl << m.rowwise().reverse() << endl;
+cout << "Here is the colwise reverse of m:" << endl << m.colwise().reverse() << endl;
+
+cout << "Here is the coefficient (1,0) in the rowise reverse of m:" << endl
+<< m.rowwise().reverse()(1,0) << endl;
+cout << "Let us overwrite this coefficient with the value 4." << endl;
+//m.colwise().reverse()(1,0) = 4;
+cout << "Now the matrix m is:" << endl << m << endl;

diff --git a/doc/snippets/class_FullPivLU.cpp b/doc/snippets/class_FullPivLU.cpp
new file mode 100644
index 0000000..fce7fac
--- /dev/null
+++ b/doc/snippets/class_FullPivLU.cpp

@@ -0,0 +1,16 @@
+typedef Matrix<double, 5, 3> Matrix5x3;
+typedef Matrix<double, 5, 5> Matrix5x5;
+Matrix5x3 m = Matrix5x3::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+Eigen::FullPivLU<Matrix5x3> lu(m);
+cout << "Here is, up to permutations, its LU decomposition matrix:"
+     << endl << lu.matrixLU() << endl;
+cout << "Here is the L part:" << endl;
+Matrix5x5 l = Matrix5x5::Identity();
+l.block<5,3>(0,0).triangularView<StrictlyLower>() = lu.matrixLU();
+cout << l << endl;
+cout << "Here is the U part:" << endl;
+Matrix5x3 u = lu.matrixLU().triangularView<Upper>();
+cout << u << endl;
+cout << "Let us now reconstruct the original matrix m:" << endl;
+cout << lu.permutationP().inverse() * l * u * lu.permutationQ().inverse() << endl;

diff --git a/doc/snippets/compile_snippet.cpp.in b/doc/snippets/compile_snippet.cpp.in
new file mode 100644
index 0000000..c11457a
--- /dev/null
+++ b/doc/snippets/compile_snippet.cpp.in

@@ -0,0 +1,23 @@
+static bool eigen_did_assert = false;
+#define eigen_assert(X) if(!eigen_did_assert && !(X)){ std::cout << "### Assertion raised in " << __FILE__ << ":" << __LINE__ << ":\n" #X << "\n### The following would happen without assertions:\n"; eigen_did_assert = true;}
+
+#include <iostream>
+#include <Eigen/Eigen>
+
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+#endif
+
+
+using namespace Eigen;
+using namespace std;
+
+int main(int, char**)
+{
+  cout.precision(3);
+// intentionally remove indentation of snippet
+{
+${snippet_source_code}
+}
+  return 0;
+}

diff --git a/doc/snippets/tut_arithmetic_redux_minmax.cpp b/doc/snippets/tut_arithmetic_redux_minmax.cpp
new file mode 100644
index 0000000..f4ae7f4
--- /dev/null
+++ b/doc/snippets/tut_arithmetic_redux_minmax.cpp

@@ -0,0 +1,12 @@
+  Matrix3f m = Matrix3f::Random();
+  std::ptrdiff_t i, j;
+  float minOfM = m.minCoeff(&i,&j);
+  cout << "Here is the matrix m:\n" << m << endl;
+  cout << "Its minimum coefficient (" << minOfM 
+       << ") is at position (" << i << "," << j << ")\n\n";
+
+  RowVector4i v = RowVector4i::Random();
+  int maxOfV = v.maxCoeff(&i);
+  cout << "Here is the vector v: " << v << endl;
+  cout << "Its maximum coefficient (" << maxOfV 
+       << ") is at position " << i << endl;

diff --git a/doc/snippets/tut_arithmetic_transpose_aliasing.cpp b/doc/snippets/tut_arithmetic_transpose_aliasing.cpp
new file mode 100644
index 0000000..f82e6f2
--- /dev/null
+++ b/doc/snippets/tut_arithmetic_transpose_aliasing.cpp

@@ -0,0 +1,5 @@
+Matrix2i a; a << 1, 2, 3, 4;
+cout << "Here is the matrix a:\n" << a << endl;
+
+a = a.transpose(); // !!! do NOT do this !!!
+cout << "and the result of the aliasing effect:\n" << a << endl;

diff --git a/doc/snippets/tut_arithmetic_transpose_conjugate.cpp b/doc/snippets/tut_arithmetic_transpose_conjugate.cpp
new file mode 100644
index 0000000..88496b2
--- /dev/null
+++ b/doc/snippets/tut_arithmetic_transpose_conjugate.cpp

@@ -0,0 +1,12 @@
+MatrixXcf a = MatrixXcf::Random(2,2);
+cout << "Here is the matrix a\n" << a << endl;
+
+cout << "Here is the matrix a^T\n" << a.transpose() << endl;
+
+
+cout << "Here is the conjugate of a\n" << a.conjugate() << endl;
+
+
+cout << "Here is the matrix a^*\n" << a.adjoint() << endl;
+
+

diff --git a/doc/snippets/tut_arithmetic_transpose_inplace.cpp b/doc/snippets/tut_arithmetic_transpose_inplace.cpp
new file mode 100644
index 0000000..5c81c9e
--- /dev/null
+++ b/doc/snippets/tut_arithmetic_transpose_inplace.cpp

@@ -0,0 +1,6 @@
+MatrixXf a(2,3); a << 1, 2, 3, 4, 5, 6;
+cout << "Here is the initial matrix a:\n" << a << endl;
+
+
+a.transposeInPlace();
+cout << "and after being transposed:\n" << a << endl;

diff --git a/doc/snippets/tut_matrix_assignment_resizing.cpp b/doc/snippets/tut_matrix_assignment_resizing.cpp
new file mode 100644
index 0000000..cf18998
--- /dev/null
+++ b/doc/snippets/tut_matrix_assignment_resizing.cpp

@@ -0,0 +1,5 @@
+MatrixXf a(2,2);
+std::cout << "a is of size " << a.rows() << "x" << a.cols() << std::endl;
+MatrixXf b(3,3);
+a = b;
+std::cout << "a is now of size " << a.rows() << "x" << a.cols() << std::endl;

diff --git a/doc/special_examples/CMakeLists.txt b/doc/special_examples/CMakeLists.txt
new file mode 100644
index 0000000..5b00e8b
--- /dev/null
+++ b/doc/special_examples/CMakeLists.txt

@@ -0,0 +1,34 @@
+if(NOT EIGEN_TEST_NOQT)
+  find_package(Qt4)
+  if(QT4_FOUND)
+    include(${QT_USE_FILE})
+  endif()
+endif()
+
+if(QT4_FOUND)
+  add_executable(Tutorial_sparse_example Tutorial_sparse_example.cpp Tutorial_sparse_example_details.cpp)
+  target_link_libraries(Tutorial_sparse_example ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${QT_QTCORE_LIBRARY} ${QT_QTGUI_LIBRARY})
+
+  add_custom_command(
+    TARGET Tutorial_sparse_example
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/../html/
+    COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg
+  )
+
+  add_dependencies(all_examples Tutorial_sparse_example)
+endif()
+
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+  add_executable(random_cpp11 random_cpp11.cpp)
+  target_link_libraries(random_cpp11 ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  add_dependencies(all_examples random_cpp11)
+  ei_add_target_property(random_cpp11 COMPILE_FLAGS "-std=c++11")
+
+  add_custom_command(
+    TARGET random_cpp11
+    POST_BUILD
+    COMMAND random_cpp11
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/random_cpp11.out
+  )
+endif()

diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp
new file mode 100644
index 0000000..8850db0
--- /dev/null
+++ b/doc/special_examples/Tutorial_sparse_example.cpp

@@ -0,0 +1,38 @@
+#include <Eigen/Sparse>
+#include <vector>
+#include <iostream>
+
+typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
+typedef Eigen::Triplet<double> T;
+
+void buildProblem(std::vector<T>& coefficients, Eigen::VectorXd& b, int n);
+void saveAsBitmap(const Eigen::VectorXd& x, int n, const char* filename);
+
+int main(int argc, char** argv)
+{
+  if(argc!=2) {
+    std::cerr << "Error: expected one and only one argument.\n";
+    return -1;
+  }
+  
+  int n = 300;  // size of the image
+  int m = n*n;  // number of unknowns (=number of pixels)
+
+  // Assembly:
+  std::vector<T> coefficients;            // list of non-zeros coefficients
+  Eigen::VectorXd b(m);                   // the right hand side-vector resulting from the constraints
+  buildProblem(coefficients, b, n);
+
+  SpMat A(m,m);
+  A.setFromTriplets(coefficients.begin(), coefficients.end());
+
+  // Solving:
+  Eigen::SimplicialCholesky<SpMat> chol(A);  // performs a Cholesky factorization of A
+  Eigen::VectorXd x = chol.solve(b);         // use the factorization to solve for the given right hand side
+
+  // Export the result to a file:
+  saveAsBitmap(x, n, argv[1]);
+
+  return 0;
+}
+

diff --git a/doc/special_examples/Tutorial_sparse_example_details.cpp b/doc/special_examples/Tutorial_sparse_example_details.cpp
new file mode 100644
index 0000000..bc18b01
--- /dev/null
+++ b/doc/special_examples/Tutorial_sparse_example_details.cpp

@@ -0,0 +1,44 @@
+#include <Eigen/Sparse>
+#include <vector>
+#include <QImage>
+
+typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
+typedef Eigen::Triplet<double> T;
+
+void insertCoefficient(int id, int i, int j, double w, std::vector<T>& coeffs,
+                       Eigen::VectorXd& b, const Eigen::VectorXd& boundary)
+{
+  int n = int(boundary.size());
+  int id1 = i+j*n;
+
+        if(i==-1 || i==n) b(id) -= w * boundary(j); // constrained coefficient
+  else  if(j==-1 || j==n) b(id) -= w * boundary(i); // constrained coefficient
+  else  coeffs.push_back(T(id,id1,w));              // unknown coefficient
+}
+
+void buildProblem(std::vector<T>& coefficients, Eigen::VectorXd& b, int n)
+{
+  b.setZero();
+  Eigen::ArrayXd boundary = Eigen::ArrayXd::LinSpaced(n, 0,M_PI).sin().pow(2);
+  for(int j=0; j<n; ++j)
+  {
+    for(int i=0; i<n; ++i)
+    {
+      int id = i+j*n;
+      insertCoefficient(id, i-1,j, -1, coefficients, b, boundary);
+      insertCoefficient(id, i+1,j, -1, coefficients, b, boundary);
+      insertCoefficient(id, i,j-1, -1, coefficients, b, boundary);
+      insertCoefficient(id, i,j+1, -1, coefficients, b, boundary);
+      insertCoefficient(id, i,j,    4, coefficients, b, boundary);
+    }
+  }
+}
+
+void saveAsBitmap(const Eigen::VectorXd& x, int n, const char* filename)
+{
+  Eigen::Array<unsigned char,Eigen::Dynamic,Eigen::Dynamic> bits = (x*255).cast<unsigned char>();
+  QImage img(bits.data(), n,n,QImage::Format_Indexed8);
+  img.setColorCount(256);
+  for(int i=0;i<256;i++) img.setColor(i,qRgb(i,i,i));
+  img.save(filename);
+}

diff --git a/doc/special_examples/random_cpp11.cpp b/doc/special_examples/random_cpp11.cpp
new file mode 100644
index 0000000..33744c0
--- /dev/null
+++ b/doc/special_examples/random_cpp11.cpp

@@ -0,0 +1,14 @@
+#include <Eigen/Core>
+#include <iostream>
+#include <random>
+
+using namespace Eigen;
+
+int main() {
+  std::default_random_engine generator;
+  std::poisson_distribution<int> distribution(4.1);
+  auto poisson = [&] () {return distribution(generator);};
+
+  RowVectorXi v = RowVectorXi::NullaryExpr(10, poisson );
+  std::cout << v << "\n";
+}

diff --git a/doc/tutorial.cpp b/doc/tutorial.cpp
new file mode 100644
index 0000000..62be7c2
--- /dev/null
+++ b/doc/tutorial.cpp

@@ -0,0 +1,62 @@
+#include <Eigen/Array>
+
+int main(int argc, char *argv[])
+{
+  std::cout.precision(2);
+
+  // demo static functions
+  Eigen::Matrix3f m3 = Eigen::Matrix3f::Random();
+  Eigen::Matrix4f m4 = Eigen::Matrix4f::Identity();
+
+  std::cout << "*** Step 1 ***\nm3:\n" << m3 << "\nm4:\n" << m4 << std::endl;
+
+  // demo non-static set... functions
+  m4.setZero();
+  m3.diagonal().setOnes();
+  
+  std::cout << "*** Step 2 ***\nm3:\n" << m3 << "\nm4:\n" << m4 << std::endl;
+
+  // demo fixed-size block() expression as lvalue and as rvalue
+  m4.block<3,3>(0,1) = m3;
+  m3.row(2) = m4.block<1,3>(2,0);
+
+  std::cout << "*** Step 3 ***\nm3:\n" << m3 << "\nm4:\n" << m4 << std::endl;
+
+  // demo dynamic-size block()
+  {
+    int rows = 3, cols = 3;
+    m4.block(0,1,3,3).setIdentity();
+    std::cout << "*** Step 4 ***\nm4:\n" << m4 << std::endl;
+  }
+
+  // demo vector blocks
+  m4.diagonal().block(1,2).setOnes();
+  std::cout << "*** Step 5 ***\nm4.diagonal():\n" << m4.diagonal() << std::endl;
+  std::cout << "m4.diagonal().start(3)\n" << m4.diagonal().start(3) << std::endl;
+
+  // demo coeff-wise operations
+  m4 = m4.cwise()*m4;
+  m3 = m3.cwise().cos();
+  std::cout << "*** Step 6 ***\nm3:\n" << m3 << "\nm4:\n" << m4 << std::endl;
+
+  // sums of coefficients
+  std::cout << "*** Step 7 ***\n m4.sum(): " << m4.sum() << std::endl;
+  std::cout << "m4.col(2).sum(): " << m4.col(2).sum() << std::endl;
+  std::cout << "m4.colwise().sum():\n" << m4.colwise().sum() << std::endl;
+  std::cout << "m4.rowwise().sum():\n" << m4.rowwise().sum() << std::endl;
+
+  // demo intelligent auto-evaluation
+  m4 = m4 * m4; // auto-evaluates so no aliasing problem (performance penalty is low)
+  Eigen::Matrix4f other = (m4 * m4).lazy(); // forces lazy evaluation
+  m4 = m4 + m4; // here Eigen goes for lazy evaluation, as with most expressions
+  m4 = -m4 + m4 + 5 * m4; // same here, Eigen chooses lazy evaluation for all that.
+  m4 = m4 * (m4 + m4); // here Eigen chooses to first evaluate m4 + m4 into a temporary.
+                       // indeed, here it is an optimization to cache this intermediate result.
+  m3 = m3 * m4.block<3,3>(1,1); // here Eigen chooses NOT to evaluate block() into a temporary
+    // because accessing coefficients of that block expression is not more costly than accessing
+    // coefficients of a plain matrix.
+  m4 = m4 * m4.transpose(); // same here, lazy evaluation of the transpose.
+  m4 = m4 * m4.transpose().eval(); // forces immediate evaluation of the transpose
+
+  std::cout << "*** Step 8 ***\nm3:\n" << m3 << "\nm4:\n" << m4 << std::endl;
+}

diff --git a/eigen3.pc.in b/eigen3.pc.in
new file mode 100644
index 0000000..3368a3a
--- /dev/null
+++ b/eigen3.pc.in

@@ -0,0 +1,9 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+
+Name: Eigen3
+Description: A C++ template library for linear algebra: vectors, matrices, and related algorithms
+Requires:
+Version: @EIGEN_VERSION_NUMBER@
+Libs:
+Cflags: -I${prefix}/@INCLUDE_INSTALL_DIR@

diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt
new file mode 100644
index 0000000..256e541
--- /dev/null
+++ b/failtest/CMakeLists.txt

@@ -0,0 +1,70 @@
+
+ei_add_failtest("failtest_sanity_check")
+
+ei_add_failtest("block_nonconst_ctor_on_const_xpr_0")
+ei_add_failtest("block_nonconst_ctor_on_const_xpr_1")
+ei_add_failtest("block_nonconst_ctor_on_const_xpr_2")
+ei_add_failtest("transpose_nonconst_ctor_on_const_xpr")
+ei_add_failtest("diagonal_nonconst_ctor_on_const_xpr")
+ei_add_failtest("cwiseunaryview_nonconst_ctor_on_const_xpr")
+ei_add_failtest("triangularview_nonconst_ctor_on_const_xpr")
+ei_add_failtest("selfadjointview_nonconst_ctor_on_const_xpr")
+
+ei_add_failtest("const_qualified_block_method_retval_0")
+ei_add_failtest("const_qualified_block_method_retval_1")
+ei_add_failtest("const_qualified_transpose_method_retval")
+ei_add_failtest("const_qualified_diagonal_method_retval")
+
+ei_add_failtest("map_nonconst_ctor_on_const_ptr_0")
+ei_add_failtest("map_nonconst_ctor_on_const_ptr_1")
+ei_add_failtest("map_nonconst_ctor_on_const_ptr_2")
+ei_add_failtest("map_nonconst_ctor_on_const_ptr_3")
+ei_add_failtest("map_nonconst_ctor_on_const_ptr_4")
+
+ei_add_failtest("map_on_const_type_actually_const_0")
+ei_add_failtest("map_on_const_type_actually_const_1")
+ei_add_failtest("block_on_const_type_actually_const_0")
+ei_add_failtest("block_on_const_type_actually_const_1")
+ei_add_failtest("transpose_on_const_type_actually_const")
+ei_add_failtest("diagonal_on_const_type_actually_const")
+ei_add_failtest("cwiseunaryview_on_const_type_actually_const")
+ei_add_failtest("triangularview_on_const_type_actually_const")
+ei_add_failtest("selfadjointview_on_const_type_actually_const")
+
+ei_add_failtest("ref_1")
+ei_add_failtest("ref_2")
+ei_add_failtest("ref_3")
+ei_add_failtest("ref_4")
+ei_add_failtest("ref_5")
+
+ei_add_failtest("swap_1")
+ei_add_failtest("swap_2")
+
+ei_add_failtest("ternary_1")
+ei_add_failtest("ternary_2")
+
+ei_add_failtest("sparse_ref_1")
+ei_add_failtest("sparse_ref_2")
+ei_add_failtest("sparse_ref_3")
+ei_add_failtest("sparse_ref_4")
+ei_add_failtest("sparse_ref_5")
+
+ei_add_failtest("sparse_storage_mismatch")
+
+ei_add_failtest("partialpivlu_int")
+ei_add_failtest("fullpivlu_int")
+ei_add_failtest("llt_int")
+ei_add_failtest("ldlt_int")
+ei_add_failtest("qr_int")
+ei_add_failtest("colpivqr_int")
+ei_add_failtest("fullpivqr_int")
+ei_add_failtest("jacobisvd_int")
+ei_add_failtest("bdcsvd_int")
+ei_add_failtest("eigensolver_int")
+ei_add_failtest("eigensolver_cplx")
+
+if(EIGEN_TEST_CXX11)
+  ei_add_failtest("initializer_list_1")
+  ei_add_failtest("initializer_list_2")
+endif()
+

diff --git a/failtest/bdcsvd_int.cpp b/failtest/bdcsvd_int.cpp
new file mode 100644
index 0000000..670752c
--- /dev/null
+++ b/failtest/bdcsvd_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/SVD"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  BDCSVD<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/block_nonconst_ctor_on_const_xpr_0.cpp b/failtest/block_nonconst_ctor_on_const_xpr_0.cpp
new file mode 100644
index 0000000..40b8201
--- /dev/null
+++ b/failtest/block_nonconst_ctor_on_const_xpr_0.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Block<Matrix3d,3,3> b(m,0,0);
+}
+
+int main() {}

diff --git a/failtest/block_nonconst_ctor_on_const_xpr_1.cpp b/failtest/block_nonconst_ctor_on_const_xpr_1.cpp
new file mode 100644
index 0000000..ef6d537
--- /dev/null
+++ b/failtest/block_nonconst_ctor_on_const_xpr_1.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Block<Matrix3d> b(m,0,0,3,3);
+}
+
+int main() {}

diff --git a/failtest/block_nonconst_ctor_on_const_xpr_2.cpp b/failtest/block_nonconst_ctor_on_const_xpr_2.cpp
new file mode 100644
index 0000000..43f18ae
--- /dev/null
+++ b/failtest/block_nonconst_ctor_on_const_xpr_2.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    // row/column constructor
+    Block<Matrix3d,3,1> b(m,0);
+}
+
+int main() {}

diff --git a/failtest/block_on_const_type_actually_const_0.cpp b/failtest/block_on_const_type_actually_const_0.cpp
new file mode 100644
index 0000000..009bebe
--- /dev/null
+++ b/failtest/block_on_const_type_actually_const_0.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    Matrix3f m;
+    Block<CV_QUALIFIER Matrix3f>(m, 0, 0, 3, 3).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/block_on_const_type_actually_const_1.cpp b/failtest/block_on_const_type_actually_const_1.cpp
new file mode 100644
index 0000000..4c3e93f
--- /dev/null
+++ b/failtest/block_on_const_type_actually_const_1.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    Block<CV_QUALIFIER MatrixXf, 3, 3>(m, 0, 0).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/colpivqr_int.cpp b/failtest/colpivqr_int.cpp
new file mode 100644
index 0000000..db11910
--- /dev/null
+++ b/failtest/colpivqr_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/QR"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  ColPivHouseholderQR<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/const_qualified_block_method_retval_0.cpp b/failtest/const_qualified_block_method_retval_0.cpp
new file mode 100644
index 0000000..a6bd5fe
--- /dev/null
+++ b/failtest/const_qualified_block_method_retval_0.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Block<Matrix3d,3,3> b(m.block<3,3>(0,0));
+}
+
+int main() {}

diff --git a/failtest/const_qualified_block_method_retval_1.cpp b/failtest/const_qualified_block_method_retval_1.cpp
new file mode 100644
index 0000000..ef40c24
--- /dev/null
+++ b/failtest/const_qualified_block_method_retval_1.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Block<Matrix3d> b(m.block(0,0,3,3));
+}
+
+int main() {}

diff --git a/failtest/const_qualified_diagonal_method_retval.cpp b/failtest/const_qualified_diagonal_method_retval.cpp
new file mode 100644
index 0000000..809594a
--- /dev/null
+++ b/failtest/const_qualified_diagonal_method_retval.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Diagonal<Matrix3d> b(m.diagonal());
+}
+
+int main() {}

diff --git a/failtest/const_qualified_transpose_method_retval.cpp b/failtest/const_qualified_transpose_method_retval.cpp
new file mode 100644
index 0000000..2d7f19c
--- /dev/null
+++ b/failtest/const_qualified_transpose_method_retval.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Transpose<Matrix3d> b(m.transpose());
+}
+
+int main() {}

diff --git a/failtest/cwiseunaryview_nonconst_ctor_on_const_xpr.cpp b/failtest/cwiseunaryview_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 0000000..e23cf8f
--- /dev/null
+++ b/failtest/cwiseunaryview_nonconst_ctor_on_const_xpr.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    CwiseUnaryView<internal::scalar_real_ref_op<double>,Matrix3d> t(m);
+}
+
+int main() {}

diff --git a/failtest/cwiseunaryview_on_const_type_actually_const.cpp b/failtest/cwiseunaryview_on_const_type_actually_const.cpp
new file mode 100644
index 0000000..fcd41df
--- /dev/null
+++ b/failtest/cwiseunaryview_on_const_type_actually_const.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    CwiseUnaryView<internal::scalar_real_ref_op<double>,CV_QUALIFIER MatrixXf>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/diagonal_nonconst_ctor_on_const_xpr.cpp b/failtest/diagonal_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 0000000..76398a2
--- /dev/null
+++ b/failtest/diagonal_nonconst_ctor_on_const_xpr.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Diagonal<Matrix3d> d(m);
+}
+
+int main() {}

diff --git a/failtest/diagonal_on_const_type_actually_const.cpp b/failtest/diagonal_on_const_type_actually_const.cpp
new file mode 100644
index 0000000..d4b2fd9
--- /dev/null
+++ b/failtest/diagonal_on_const_type_actually_const.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    Diagonal<CV_QUALIFIER MatrixXf>(m).coeffRef(0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/eigensolver_cplx.cpp b/failtest/eigensolver_cplx.cpp
new file mode 100644
index 0000000..c2e21e1
--- /dev/null
+++ b/failtest/eigensolver_cplx.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Eigenvalues"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR std::complex<double>
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  EigenSolver<Matrix<SCALAR,Dynamic,Dynamic> > eig(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/eigensolver_int.cpp b/failtest/eigensolver_int.cpp
new file mode 100644
index 0000000..eda8dc2
--- /dev/null
+++ b/failtest/eigensolver_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Eigenvalues"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  EigenSolver<Matrix<SCALAR,Dynamic,Dynamic> > eig(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/failtest_sanity_check.cpp b/failtest/failtest_sanity_check.cpp
new file mode 100644
index 0000000..769fa94
--- /dev/null
+++ b/failtest/failtest_sanity_check.cpp

@@ -0,0 +1,5 @@
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+This is just some text that won't compile as a C++ file, as a basic sanity check for failtest.
+#else
+int main() {}
+#endif

diff --git a/failtest/fullpivlu_int.cpp b/failtest/fullpivlu_int.cpp
new file mode 100644
index 0000000..e9d2c6e
--- /dev/null
+++ b/failtest/fullpivlu_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/LU"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  FullPivLU<Matrix<SCALAR,Dynamic,Dynamic> > lu(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/fullpivqr_int.cpp b/failtest/fullpivqr_int.cpp
new file mode 100644
index 0000000..d182a7b
--- /dev/null
+++ b/failtest/fullpivqr_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/QR"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  FullPivHouseholderQR<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/initializer_list_1.cpp b/failtest/initializer_list_1.cpp
new file mode 100644
index 0000000..92dfd1f
--- /dev/null
+++ b/failtest/initializer_list_1.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define ROWS Dynamic
+#else
+#define ROWS 3
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix<int, ROWS, 1> {1, 2, 3};
+}

diff --git a/failtest/initializer_list_2.cpp b/failtest/initializer_list_2.cpp
new file mode 100644
index 0000000..1996050
--- /dev/null
+++ b/failtest/initializer_list_2.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define ROWS Dynamic
+#define COLS Dynamic
+#else
+#define ROWS 3
+#define COLS 1
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix<int, ROWS, COLS> {1, 2, 3};
+}

diff --git a/failtest/jacobisvd_int.cpp b/failtest/jacobisvd_int.cpp
new file mode 100644
index 0000000..12790ae
--- /dev/null
+++ b/failtest/jacobisvd_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/SVD"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  JacobiSVD<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/ldlt_int.cpp b/failtest/ldlt_int.cpp
new file mode 100644
index 0000000..243e457
--- /dev/null
+++ b/failtest/ldlt_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Cholesky"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  LDLT<Matrix<SCALAR,Dynamic,Dynamic> > ldlt(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/llt_int.cpp b/failtest/llt_int.cpp
new file mode 100644
index 0000000..cb02065
--- /dev/null
+++ b/failtest/llt_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Cholesky"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  LLT<Matrix<SCALAR,Dynamic,Dynamic> > llt(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/map_nonconst_ctor_on_const_ptr_0.cpp b/failtest/map_nonconst_ctor_on_const_ptr_0.cpp
new file mode 100644
index 0000000..d75686f
--- /dev/null
+++ b/failtest/map_nonconst_ctor_on_const_ptr_0.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER float *ptr){
+    Map<Matrix3f> m(ptr);
+}
+
+int main() {}

diff --git a/failtest/map_nonconst_ctor_on_const_ptr_1.cpp b/failtest/map_nonconst_ctor_on_const_ptr_1.cpp
new file mode 100644
index 0000000..eda134d
--- /dev/null
+++ b/failtest/map_nonconst_ctor_on_const_ptr_1.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER float *ptr, DenseIndex size){
+    Map<ArrayXf> m(ptr, size);
+}
+
+int main() {}

diff --git a/failtest/map_nonconst_ctor_on_const_ptr_2.cpp b/failtest/map_nonconst_ctor_on_const_ptr_2.cpp
new file mode 100644
index 0000000..06b4b62
--- /dev/null
+++ b/failtest/map_nonconst_ctor_on_const_ptr_2.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER float *ptr, DenseIndex rows, DenseIndex cols){
+    Map<MatrixXf> m(ptr, rows, cols);
+}
+
+int main() {}

diff --git a/failtest/map_nonconst_ctor_on_const_ptr_3.cpp b/failtest/map_nonconst_ctor_on_const_ptr_3.cpp
new file mode 100644
index 0000000..830f6f0
--- /dev/null
+++ b/failtest/map_nonconst_ctor_on_const_ptr_3.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER float *ptr, DenseIndex rows, DenseIndex cols){
+    Map<MatrixXf, Aligned, InnerStride<2> > m(ptr, rows, cols, InnerStride<2>());
+}
+
+int main() {}

diff --git a/failtest/map_nonconst_ctor_on_const_ptr_4.cpp b/failtest/map_nonconst_ctor_on_const_ptr_4.cpp
new file mode 100644
index 0000000..c3e8c95
--- /dev/null
+++ b/failtest/map_nonconst_ctor_on_const_ptr_4.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER
+#else
+#define CV_QUALIFIER const
+#endif
+
+using namespace Eigen;
+
+void foo(const float *ptr, DenseIndex rows, DenseIndex cols){
+    Map<CV_QUALIFIER MatrixXf, Unaligned, OuterStride<> > m(ptr, rows, cols, OuterStride<>(2));
+}
+
+int main() {}

diff --git a/failtest/map_on_const_type_actually_const_0.cpp b/failtest/map_on_const_type_actually_const_0.cpp
new file mode 100644
index 0000000..8cb6aa0
--- /dev/null
+++ b/failtest/map_on_const_type_actually_const_0.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(float *ptr){
+    Map<CV_QUALIFIER MatrixXf>(ptr, 1, 1).coeffRef(0,0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/map_on_const_type_actually_const_1.cpp b/failtest/map_on_const_type_actually_const_1.cpp
new file mode 100644
index 0000000..04e067c
--- /dev/null
+++ b/failtest/map_on_const_type_actually_const_1.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(float *ptr){
+    Map<CV_QUALIFIER Vector3f>(ptr).coeffRef(0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/partialpivlu_int.cpp b/failtest/partialpivlu_int.cpp
new file mode 100644
index 0000000..98ef282
--- /dev/null
+++ b/failtest/partialpivlu_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/LU"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  PartialPivLU<Matrix<SCALAR,Dynamic,Dynamic> > lu(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/qr_int.cpp b/failtest/qr_int.cpp
new file mode 100644
index 0000000..ce200e8
--- /dev/null
+++ b/failtest/qr_int.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/QR"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  HouseholderQR<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}

diff --git a/failtest/ref_1.cpp b/failtest/ref_1.cpp
new file mode 100644
index 0000000..8b798d5
--- /dev/null
+++ b/failtest/ref_1.cpp

@@ -0,0 +1,18 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void call_ref(Ref<VectorXf> a) { }
+
+int main()
+{
+  VectorXf a(10);
+  CV_QUALIFIER VectorXf& ac(a);
+  call_ref(ac);
+}

diff --git a/failtest/ref_2.cpp b/failtest/ref_2.cpp
new file mode 100644
index 0000000..0b779cc
--- /dev/null
+++ b/failtest/ref_2.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+void call_ref(Ref<VectorXf> a) { }
+
+int main()
+{
+  MatrixXf A(10,10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(A.row(3));
+#else
+  call_ref(A.col(3));
+#endif
+}

diff --git a/failtest/ref_3.cpp b/failtest/ref_3.cpp
new file mode 100644
index 0000000..f46027d
--- /dev/null
+++ b/failtest/ref_3.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+void call_ref(Ref<VectorXf> a) { }
+#else
+void call_ref(const Ref<const VectorXf> &a) { }
+#endif
+
+int main()
+{
+  VectorXf a(10);
+  call_ref(a+a);
+}

diff --git a/failtest/ref_4.cpp b/failtest/ref_4.cpp
new file mode 100644
index 0000000..6c11fa4
--- /dev/null
+++ b/failtest/ref_4.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+void call_ref(Ref<MatrixXf,0,OuterStride<> > a) {}
+
+int main()
+{
+  MatrixXf A(10,10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(A.transpose());
+#else
+  call_ref(A);
+#endif
+}

diff --git a/failtest/ref_5.cpp b/failtest/ref_5.cpp
new file mode 100644
index 0000000..846d527
--- /dev/null
+++ b/failtest/ref_5.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+void call_ref(Ref<VectorXf> a) { }
+
+int main()
+{
+  VectorXf a(10);
+  DenseBase<VectorXf> &ac(a);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(ac);
+#else
+  call_ref(ac.derived());
+#endif
+}

diff --git a/failtest/selfadjointview_nonconst_ctor_on_const_xpr.cpp b/failtest/selfadjointview_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 0000000..a240f81
--- /dev/null
+++ b/failtest/selfadjointview_nonconst_ctor_on_const_xpr.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    SelfAdjointView<Matrix3d,Upper> t(m);
+}
+
+int main() {}

diff --git a/failtest/selfadjointview_on_const_type_actually_const.cpp b/failtest/selfadjointview_on_const_type_actually_const.cpp
new file mode 100644
index 0000000..19aaad6
--- /dev/null
+++ b/failtest/selfadjointview_on_const_type_actually_const.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    SelfAdjointView<CV_QUALIFIER MatrixXf,Upper>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/sparse_ref_1.cpp b/failtest/sparse_ref_1.cpp
new file mode 100644
index 0000000..d78d1f9
--- /dev/null
+++ b/failtest/sparse_ref_1.cpp

@@ -0,0 +1,18 @@
+#include "../Eigen/Sparse"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) { }
+
+int main()
+{
+  SparseMatrix<float> a(10,10);
+  CV_QUALIFIER SparseMatrix<float>& ac(a);
+  call_ref(ac);
+}

diff --git a/failtest/sparse_ref_2.cpp b/failtest/sparse_ref_2.cpp
new file mode 100644
index 0000000..46c9440
--- /dev/null
+++ b/failtest/sparse_ref_2.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) { }
+
+int main()
+{
+  SparseMatrix<float> A(10,10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(A.row(3));
+#else
+  call_ref(A.col(3));
+#endif
+}

diff --git a/failtest/sparse_ref_3.cpp b/failtest/sparse_ref_3.cpp
new file mode 100644
index 0000000..a9949b5
--- /dev/null
+++ b/failtest/sparse_ref_3.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+void call_ref(Ref<SparseMatrix<float> > a) { }
+#else
+void call_ref(const Ref<const SparseMatrix<float> > &a) { }
+#endif
+
+int main()
+{
+  SparseMatrix<float> a(10,10);
+  call_ref(a+a);
+}

diff --git a/failtest/sparse_ref_4.cpp b/failtest/sparse_ref_4.cpp
new file mode 100644
index 0000000..57bb6a1
--- /dev/null
+++ b/failtest/sparse_ref_4.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) {}
+
+int main()
+{
+  SparseMatrix<float> A(10,10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(A.transpose());
+#else
+  call_ref(A);
+#endif
+}

diff --git a/failtest/sparse_ref_5.cpp b/failtest/sparse_ref_5.cpp
new file mode 100644
index 0000000..4478f6f
--- /dev/null
+++ b/failtest/sparse_ref_5.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Sparse"
+
+using namespace Eigen;
+
+void call_ref(Ref<SparseMatrix<float> > a) { }
+
+int main()
+{
+  SparseMatrix<float> a(10,10);
+  SparseMatrixBase<SparseMatrix<float> > &ac(a);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  call_ref(ac);
+#else
+  call_ref(ac.derived());
+#endif
+}

diff --git a/failtest/sparse_storage_mismatch.cpp b/failtest/sparse_storage_mismatch.cpp
new file mode 100644
index 0000000..51840d4
--- /dev/null
+++ b/failtest/sparse_storage_mismatch.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Sparse"
+using namespace Eigen;
+
+typedef SparseMatrix<double,ColMajor> Mat1;
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+typedef SparseMatrix<double,RowMajor> Mat2;
+#else
+typedef SparseMatrix<double,ColMajor> Mat2;
+#endif
+
+int main()
+{
+  Mat1 a(10,10);
+  Mat2 b(10,10);
+  a += b;
+}

diff --git a/failtest/swap_1.cpp b/failtest/swap_1.cpp
new file mode 100644
index 0000000..1063797
--- /dev/null
+++ b/failtest/swap_1.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main()
+{
+  VectorXf a(10), b(10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  const DenseBase<VectorXf> &ac(a);
+#else
+  DenseBase<VectorXf> &ac(a);
+#endif
+  b.swap(ac);
+}

diff --git a/failtest/swap_2.cpp b/failtest/swap_2.cpp
new file mode 100644
index 0000000..b386cf4
--- /dev/null
+++ b/failtest/swap_2.cpp

@@ -0,0 +1,14 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main()
+{
+  VectorXf a(10), b(10);
+  VectorXf const &ac(a);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  b.swap(ac);
+#else
+  b.swap(ac.const_cast_derived());
+#endif
+}

diff --git a/failtest/ternary_1.cpp b/failtest/ternary_1.cpp
new file mode 100644
index 0000000..b40bcb0
--- /dev/null
+++ b/failtest/ternary_1.cpp

@@ -0,0 +1,13 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main(int argc,char **)
+{
+  VectorXf a(10), b(10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  b = argc>1 ? 2*a : -a;
+#else
+  b = argc>1 ? 2*a : VectorXf(-a);
+#endif
+}

diff --git a/failtest/ternary_2.cpp b/failtest/ternary_2.cpp
new file mode 100644
index 0000000..a46b12b
--- /dev/null
+++ b/failtest/ternary_2.cpp

@@ -0,0 +1,13 @@
+#include "../Eigen/Core"
+
+using namespace Eigen;
+
+int main(int argc,char **)
+{
+  VectorXf a(10), b(10);
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+  b = argc>1 ? 2*a : a+a;
+#else
+  b = argc>1 ? VectorXf(2*a) : VectorXf(a+a);
+#endif
+}

diff --git a/failtest/transpose_nonconst_ctor_on_const_xpr.cpp b/failtest/transpose_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 0000000..4223e7f
--- /dev/null
+++ b/failtest/transpose_nonconst_ctor_on_const_xpr.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+    Transpose<Matrix3d> t(m);
+}
+
+int main() {}

diff --git a/failtest/transpose_on_const_type_actually_const.cpp b/failtest/transpose_on_const_type_actually_const.cpp
new file mode 100644
index 0000000..d0b7d0d
--- /dev/null
+++ b/failtest/transpose_on_const_type_actually_const.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    Transpose<CV_QUALIFIER MatrixXf>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}

diff --git a/failtest/triangularview_nonconst_ctor_on_const_xpr.cpp b/failtest/triangularview_nonconst_ctor_on_const_xpr.cpp
new file mode 100644
index 0000000..807447e
--- /dev/null
+++ b/failtest/triangularview_nonconst_ctor_on_const_xpr.cpp

@@ -0,0 +1,15 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(CV_QUALIFIER Matrix3d &m){
+  TriangularView<Matrix3d,Upper> t(m);
+}
+
+int main() {}

diff --git a/failtest/triangularview_on_const_type_actually_const.cpp b/failtest/triangularview_on_const_type_actually_const.cpp
new file mode 100644
index 0000000..0a381a6
--- /dev/null
+++ b/failtest/triangularview_on_const_type_actually_const.cpp

@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define CV_QUALIFIER const
+#else
+#define CV_QUALIFIER
+#endif
+
+using namespace Eigen;
+
+void foo(){
+    MatrixXf m;
+    TriangularView<CV_QUALIFIER MatrixXf,Upper>(m).coeffRef(0, 0) = 1.0f;
+}
+
+int main() {}

diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
new file mode 100644
index 0000000..c8ca640
--- /dev/null
+++ b/lapack/CMakeLists.txt

@@ -0,0 +1,459 @@
+project(EigenLapack CXX)
+
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU")
+    if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER_EQUAL 10.0)
+      # We use an old version of LAPACK with argument type mismatches.
+      # Allow them to compile anyway with newer GNU versions.
+      set(CMAKE_Fortran_FLAGS  "${CMAKE_Fortran_FLAGS} -fallow-argument-mismatch")
+    endif()
+  endif()
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
+endif()
+
+add_custom_target(lapack)
+include_directories(../blas)
+
+set(EigenLapack_SRCS
+single.cpp double.cpp complex_single.cpp complex_double.cpp ../blas/xerbla.cpp
+)
+
+if(EIGEN_Fortran_COMPILER_WORKS)
+
+set(EigenLapack_SRCS  ${EigenLapack_SRCS}
+  slarft.f  dlarft.f  clarft.f  zlarft.f
+  slarfb.f  dlarfb.f  clarfb.f  zlarfb.f
+  slarfg.f  dlarfg.f  clarfg.f  zlarfg.f
+  slarf.f   dlarf.f   clarf.f   zlarf.f
+  sladiv.f  dladiv.f  cladiv.f  zladiv.f
+  ilaslr.f  iladlr.f  ilaclr.f  ilazlr.f
+  ilaslc.f  iladlc.f  ilaclc.f  ilazlc.f
+  dlapy2.f  dlapy3.f  slapy2.f  slapy3.f
+  clacgv.f  zlacgv.f
+  slamch.f  dlamch.f
+  second_NONE.f dsecnd_NONE.f
+)
+
+option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests")
+
+if(EIGEN_ENABLE_LAPACK_TESTS)
+
+  get_filename_component(eigen_full_path_to_reference_lapack "./reference/" ABSOLUTE)
+  if(NOT EXISTS ${eigen_full_path_to_reference_lapack})
+    # Download lapack and install sources and testing at the right place
+    message(STATUS "Download lapack_addons_3.4.1.tgz...")
+    
+    file(DOWNLOAD "http://downloads.tuxfamily.org/eigen/lapack_addons_3.4.1.tgz"
+                  "${CMAKE_CURRENT_SOURCE_DIR}/lapack_addons_3.4.1.tgz"
+                  INACTIVITY_TIMEOUT 15
+                  TIMEOUT 240
+                  STATUS download_status
+                  EXPECTED_MD5 ab5742640617e3221a873aba44bbdc93
+                  SHOW_PROGRESS)
+                  
+    message(STATUS ${download_status})
+    list(GET download_status 0 download_status_num)
+    set(download_status_num 0)
+    if(download_status_num EQUAL 0)
+      message(STATUS "Setup lapack reference and lapack unit tests")
+      execute_process(COMMAND tar xzf  "lapack_addons_3.4.1.tgz" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    else()
+      message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests won't be enabled")
+      set(EIGEN_ENABLE_LAPACK_TESTS false)
+    endif()
+                  
+  endif()
+  
+  get_filename_component(eigen_full_path_to_reference_lapack "./reference/" ABSOLUTE)
+  if(EXISTS ${eigen_full_path_to_reference_lapack})
+    set(EigenLapack_funcfilenames
+        ssyev.f   dsyev.f   csyev.f   zsyev.f
+        spotrf.f  dpotrf.f  cpotrf.f  zpotrf.f
+        spotrs.f  dpotrs.f  cpotrs.f  zpotrs.f
+        sgetrf.f  dgetrf.f  cgetrf.f  zgetrf.f
+        sgetrs.f  dgetrs.f  cgetrs.f  zgetrs.f)
+    
+    file(GLOB ReferenceLapack_SRCS0 RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "reference/*.f")
+    foreach(filename1 IN LISTS ReferenceLapack_SRCS0)
+      string(REPLACE "reference/" "" filename ${filename1})
+      list(FIND EigenLapack_SRCS ${filename} id1)
+      list(FIND EigenLapack_funcfilenames ${filename} id2)
+      if((id1 EQUAL -1) AND (id2 EQUAL -1))
+        set(ReferenceLapack_SRCS ${ReferenceLapack_SRCS} reference/${filename})
+      endif()
+    endforeach()
+  endif()
+  
+  
+endif()
+
+endif()
+
+set(EIGEN_LAPACK_TARGETS "")
+
+add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS})
+list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack_static)
+
+if (EIGEN_BUILD_SHARED_LIBS)
+  add_library(eigen_lapack SHARED ${EigenLapack_SRCS})
+  list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack)
+  target_link_libraries(eigen_lapack  eigen_blas)
+endif()
+
+foreach(target IN LISTS EIGEN_LAPACK_TARGETS)
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  add_dependencies(lapack ${target})
+  install(TARGETS ${target}
+          RUNTIME DESTINATION bin
+          LIBRARY DESTINATION lib
+          ARCHIVE DESTINATION lib)
+endforeach()
+
+
+get_filename_component(eigen_full_path_to_testing_lapack "./testing/" ABSOLUTE)
+if(EXISTS ${eigen_full_path_to_testing_lapack})
+  
+  # The following comes from lapack/TESTING/CMakeLists.txt
+  # Get Python
+  find_package(PythonInterp)
+  message(STATUS "Looking for Python found - ${PYTHONINTERP_FOUND}")
+  if (PYTHONINTERP_FOUND)
+    message(STATUS "Using Python version ${PYTHON_VERSION_STRING}")
+  endif()
+
+  set(LAPACK_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  set(LAPACK_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+  set(BUILD_SINGLE      true)
+  set(BUILD_DOUBLE      true)
+  set(BUILD_COMPLEX     true)
+  set(BUILD_COMPLEX16E  true)
+  
+  if(MSVC_VERSION)
+#  string(REPLACE "/STACK:10000000" "/STACK:900000000000000000"
+#    CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
+  string(REGEX REPLACE "(.*)/STACK:(.*) (.*)" "\\1/STACK:900000000000000000 \\3"
+    CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
+  endif()
+  file(MAKE_DIRECTORY "${LAPACK_BINARY_DIR}/TESTING")
+  add_subdirectory(testing/MATGEN)
+  add_subdirectory(testing/LIN)
+  add_subdirectory(testing/EIG)
+  macro(add_lapack_test output input target)
+    set(TEST_INPUT "${LAPACK_SOURCE_DIR}/testing/${input}")
+    set(TEST_OUTPUT "${LAPACK_BINARY_DIR}/TESTING/${output}")
+    string(REPLACE "." "_" input_name ${input})
+    set(testName "${target}_${input_name}")
+    if(EXISTS "${TEST_INPUT}")
+      add_dependencies(buildtests ${target})
+      add_test(NAME LAPACK-${testName}
+        COMMAND "${CMAKE_COMMAND}"
+        -DTEST=$<TARGET_FILE:${target}>
+        -DINPUT=${TEST_INPUT} 
+        -DOUTPUT=${TEST_OUTPUT} 
+        -DINTDIR=${CMAKE_CFG_INTDIR}
+        -P "${LAPACK_SOURCE_DIR}/testing/runtest.cmake")
+    endif()
+  endmacro()
+
+  if (BUILD_SINGLE)
+  add_lapack_test(stest.out stest.in xlintsts)
+  #
+  # ======== SINGLE RFP LIN TESTS ========================
+  add_lapack_test(stest_rfp.out stest_rfp.in xlintstrfs)
+  #
+  #
+  # ======== SINGLE EIG TESTS ===========================
+  #
+
+  add_lapack_test(snep.out nep.in xeigtsts)
+
+
+  add_lapack_test(ssep.out sep.in xeigtsts)
+
+
+  add_lapack_test(ssvd.out svd.in xeigtsts)
+
+
+  add_lapack_test(sec.out sec.in xeigtsts)
+
+
+  add_lapack_test(sed.out sed.in xeigtsts)
+
+
+  add_lapack_test(sgg.out sgg.in xeigtsts)
+
+
+  add_lapack_test(sgd.out sgd.in xeigtsts)
+
+
+  add_lapack_test(ssb.out ssb.in xeigtsts)
+
+
+  add_lapack_test(ssg.out ssg.in xeigtsts)
+
+
+  add_lapack_test(sbal.out sbal.in xeigtsts)
+
+
+  add_lapack_test(sbak.out sbak.in xeigtsts)
+
+
+  add_lapack_test(sgbal.out sgbal.in xeigtsts)
+
+
+  add_lapack_test(sgbak.out sgbak.in xeigtsts)
+
+
+  add_lapack_test(sbb.out sbb.in xeigtsts)
+
+
+  add_lapack_test(sglm.out glm.in xeigtsts)
+
+
+  add_lapack_test(sgqr.out gqr.in xeigtsts)
+
+
+  add_lapack_test(sgsv.out gsv.in xeigtsts)
+
+
+  add_lapack_test(scsd.out csd.in xeigtsts)
+
+
+  add_lapack_test(slse.out lse.in xeigtsts)
+  endif()
+
+  if (BUILD_DOUBLE)
+  #
+  # ======== DOUBLE LIN TESTS ===========================
+  add_lapack_test(dtest.out dtest.in xlintstd)
+  #
+  # ======== DOUBLE RFP LIN TESTS ========================
+  add_lapack_test(dtest_rfp.out dtest_rfp.in xlintstrfd)
+  #
+  # ======== DOUBLE EIG TESTS ===========================
+
+  add_lapack_test(dnep.out nep.in xeigtstd)
+
+
+  add_lapack_test(dsep.out sep.in xeigtstd)
+
+
+  add_lapack_test(dsvd.out svd.in xeigtstd)
+
+
+  add_lapack_test(dec.out dec.in xeigtstd)
+
+
+  add_lapack_test(ded.out ded.in xeigtstd)
+
+
+  add_lapack_test(dgg.out dgg.in xeigtstd)
+
+
+  add_lapack_test(dgd.out dgd.in xeigtstd)
+
+
+  add_lapack_test(dsb.out dsb.in xeigtstd)
+
+
+  add_lapack_test(dsg.out dsg.in xeigtstd)
+
+
+  add_lapack_test(dbal.out dbal.in xeigtstd)
+
+
+  add_lapack_test(dbak.out dbak.in xeigtstd)
+
+
+  add_lapack_test(dgbal.out dgbal.in xeigtstd)
+
+
+  add_lapack_test(dgbak.out dgbak.in xeigtstd)
+
+
+  add_lapack_test(dbb.out dbb.in xeigtstd)
+
+
+  add_lapack_test(dglm.out glm.in xeigtstd)
+
+
+  add_lapack_test(dgqr.out gqr.in xeigtstd)
+
+
+  add_lapack_test(dgsv.out gsv.in xeigtstd)
+
+
+  add_lapack_test(dcsd.out csd.in xeigtstd)
+
+
+  add_lapack_test(dlse.out lse.in xeigtstd)
+  endif()
+
+  if (BUILD_COMPLEX)
+  add_lapack_test(ctest.out ctest.in xlintstc)
+  #
+  # ======== COMPLEX RFP LIN TESTS ========================
+  add_lapack_test(ctest_rfp.out ctest_rfp.in xlintstrfc)
+  #
+  # ======== COMPLEX EIG TESTS ===========================
+
+  add_lapack_test(cnep.out nep.in xeigtstc)
+
+
+  add_lapack_test(csep.out sep.in xeigtstc)
+
+
+  add_lapack_test(csvd.out svd.in xeigtstc)
+
+
+  add_lapack_test(cec.out cec.in xeigtstc)
+
+
+  add_lapack_test(ced.out ced.in xeigtstc)
+
+
+  add_lapack_test(cgg.out cgg.in xeigtstc)
+
+
+  add_lapack_test(cgd.out cgd.in xeigtstc)
+
+
+  add_lapack_test(csb.out csb.in xeigtstc)
+
+
+  add_lapack_test(csg.out csg.in xeigtstc)
+
+
+  add_lapack_test(cbal.out cbal.in xeigtstc)
+
+
+  add_lapack_test(cbak.out cbak.in xeigtstc)
+
+
+  add_lapack_test(cgbal.out cgbal.in xeigtstc)
+
+
+  add_lapack_test(cgbak.out cgbak.in xeigtstc)
+
+
+  add_lapack_test(cbb.out cbb.in xeigtstc)
+
+
+  add_lapack_test(cglm.out glm.in xeigtstc)
+
+
+  add_lapack_test(cgqr.out gqr.in xeigtstc)
+
+
+  add_lapack_test(cgsv.out gsv.in xeigtstc)
+
+
+  add_lapack_test(ccsd.out csd.in xeigtstc)
+
+
+  add_lapack_test(clse.out lse.in xeigtstc)
+  endif()
+
+  if (BUILD_COMPLEX16)
+  #
+  # ======== COMPLEX16 LIN TESTS ========================
+  add_lapack_test(ztest.out ztest.in xlintstz)
+  #
+  # ======== COMPLEX16 RFP LIN TESTS ========================
+  add_lapack_test(ztest_rfp.out ztest_rfp.in xlintstrfz)
+  #
+  # ======== COMPLEX16 EIG TESTS ===========================
+
+  add_lapack_test(znep.out nep.in xeigtstz)
+
+
+  add_lapack_test(zsep.out sep.in xeigtstz)
+
+
+  add_lapack_test(zsvd.out svd.in xeigtstz)
+
+
+  add_lapack_test(zec.out zec.in xeigtstz)
+
+
+  add_lapack_test(zed.out zed.in xeigtstz)
+
+
+  add_lapack_test(zgg.out zgg.in xeigtstz)
+
+
+  add_lapack_test(zgd.out zgd.in xeigtstz)
+
+
+  add_lapack_test(zsb.out zsb.in xeigtstz)
+
+
+  add_lapack_test(zsg.out zsg.in xeigtstz)
+
+
+  add_lapack_test(zbal.out zbal.in xeigtstz)
+
+
+  add_lapack_test(zbak.out zbak.in xeigtstz)
+
+
+  add_lapack_test(zgbal.out zgbal.in xeigtstz)
+
+
+  add_lapack_test(zgbak.out zgbak.in xeigtstz)
+
+
+  add_lapack_test(zbb.out zbb.in xeigtstz)
+
+
+  add_lapack_test(zglm.out glm.in xeigtstz)
+
+
+  add_lapack_test(zgqr.out gqr.in xeigtstz)
+
+
+  add_lapack_test(zgsv.out gsv.in xeigtstz)
+
+
+  add_lapack_test(zcsd.out csd.in xeigtstz)
+
+
+  add_lapack_test(zlse.out lse.in xeigtstz)
+  endif()
+
+
+  if (BUILD_SIMPLE)
+      if (BUILD_DOUBLE)
+  #
+  # ======== SINGLE-DOUBLE PROTO LIN TESTS ==============
+          add_lapack_test(dstest.out dstest.in xlintstds)
+      endif()
+  endif()
+
+
+  if (BUILD_COMPLEX)
+      if (BUILD_COMPLEX16)
+  #
+  # ======== COMPLEX-COMPLEX16 LIN TESTS ========================
+          add_lapack_test(zctest.out zctest.in xlintstzc)
+      endif()
+  endif()
+
+  # ==============================================================================
+
+  execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${LAPACK_SOURCE_DIR}/testing/lapack_testing.py ${LAPACK_BINARY_DIR})
+  add_test(
+    NAME LAPACK_Test_Summary
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}
+    COMMAND ${PYTHON_EXECUTABLE} "lapack_testing.py"
+  )
+
+endif()
+

diff --git a/lapack/cholesky.cpp b/lapack/cholesky.cpp
new file mode 100644
index 0000000..ea3bc12
--- /dev/null
+++ b/lapack/cholesky.cpp

@@ -0,0 +1,72 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "lapack_common.h"
+#include <Eigen/Cholesky>
+
+// POTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
+EIGEN_LAPACK_FUNC(potrf,(char* uplo, int *n, RealScalar *pa, int *lda, int *info))
+{
+  *info = 0;
+        if(UPLO(*uplo)==INVALID) *info = -1;
+  else  if(*n<0)                 *info = -2;
+  else  if(*lda<std::max(1,*n))  *info = -4;
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"POTRF", &e, 6);
+  }
+
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  MatrixType A(a,*n,*n,*lda);
+  int ret;
+  if(UPLO(*uplo)==UP) ret = int(internal::llt_inplace<Scalar, Upper>::blocked(A));
+  else                ret = int(internal::llt_inplace<Scalar, Lower>::blocked(A));
+
+  if(ret>=0)
+    *info = ret+1;
+  
+  return 0;
+}
+
+// POTRS solves a system of linear equations A*X = B with a symmetric
+// positive definite matrix A using the Cholesky factorization
+// A = U**T*U or A = L*L**T computed by DPOTRF.
+EIGEN_LAPACK_FUNC(potrs,(char* uplo, int *n, int *nrhs, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, int *info))
+{
+  *info = 0;
+        if(UPLO(*uplo)==INVALID) *info = -1;
+  else  if(*n<0)                 *info = -2;
+  else  if(*nrhs<0)              *info = -3;
+  else  if(*lda<std::max(1,*n))  *info = -5;
+  else  if(*ldb<std::max(1,*n))  *info = -7;
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"POTRS", &e, 6);
+  }
+
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  MatrixType A(a,*n,*n,*lda);
+  MatrixType B(b,*n,*nrhs,*ldb);
+
+  if(UPLO(*uplo)==UP)
+  {
+    A.triangularView<Upper>().adjoint().solveInPlace(B);
+    A.triangularView<Upper>().solveInPlace(B);
+  }
+  else
+  {
+    A.triangularView<Lower>().solveInPlace(B);
+    A.triangularView<Lower>().adjoint().solveInPlace(B);
+  }
+
+  return 0;
+}

diff --git a/lapack/clacgv.f b/lapack/clacgv.f
new file mode 100644
index 0000000..359eb07
--- /dev/null
+++ b/lapack/clacgv.f

@@ -0,0 +1,116 @@
+*> \brief \b CLACGV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CLACGV + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clacgv.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clacgv.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clacgv.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLACGV( N, X, INCX )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            X( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLACGV conjugates a complex vector of length N.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The length of the vector X.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension
+*>                         (1+(N-1)*abs(INCX))
+*>          On entry, the vector of length N to be conjugated.
+*>          On exit, X is overwritten with conjg(X).
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The spacing between successive elements of X.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE CLACGV( N, X, INCX )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            X( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Local Scalars ..
+      INTEGER            I, IOFF
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG
+*     ..
+*     .. Executable Statements ..
+*
+      IF( INCX.EQ.1 ) THEN
+         DO 10 I = 1, N
+            X( I ) = CONJG( X( I ) )
+   10    CONTINUE
+      ELSE
+         IOFF = 1
+         IF( INCX.LT.0 )
+     $      IOFF = 1 - ( N-1 )*INCX
+         DO 20 I = 1, N
+            X( IOFF ) = CONJG( X( IOFF ) )
+            IOFF = IOFF + INCX
+   20    CONTINUE
+      END IF
+      RETURN
+*
+*     End of CLACGV
+*
+      END

diff --git a/lapack/cladiv.f b/lapack/cladiv.f
new file mode 100644
index 0000000..2807ac5
--- /dev/null
+++ b/lapack/cladiv.f

@@ -0,0 +1,97 @@
+*> \brief \b CLADIV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CLADIV + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cladiv.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cladiv.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cladiv.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       COMPLEX FUNCTION CLADIV( X, Y )
+* 
+*       .. Scalar Arguments ..
+*       COMPLEX            X, Y
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLADIV := X / Y, where X and Y are complex.  The computation of X / Y
+*> will not overflow on an intermediary step unless the results
+*> overflows.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] X
+*> \verbatim
+*>          X is COMPLEX
+*> \endverbatim
+*>
+*> \param[in] Y
+*> \verbatim
+*>          Y is COMPLEX
+*>          The complex scalars X and Y.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      COMPLEX FUNCTION CLADIV( X, Y )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      COMPLEX            X, Y
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      REAL               ZI, ZR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLADIV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          AIMAG, CMPLX, REAL
+*     ..
+*     .. Executable Statements ..
+*
+      CALL SLADIV( REAL( X ), AIMAG( X ), REAL( Y ), AIMAG( Y ), ZR,
+     $             ZI )
+      CLADIV = CMPLX( ZR, ZI )
+*
+      RETURN
+*
+*     End of CLADIV
+*
+      END

diff --git a/lapack/clarf.f b/lapack/clarf.f
new file mode 100644
index 0000000..ca0328f
--- /dev/null
+++ b/lapack/clarf.f

@@ -0,0 +1,232 @@
+*> \brief \b CLARF
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CLARF + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clarf.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clarf.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clarf.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          SIDE
+*       INTEGER            INCV, LDC, M, N
+*       COMPLEX            TAU
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            C( LDC, * ), V( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARF applies a complex elementary reflector H to a complex M-by-N
+*> matrix C, from either the left or the right. H is represented in the
+*> form
+*>
+*>       H = I - tau * v * v**H
+*>
+*> where tau is a complex scalar and v is a complex vector.
+*>
+*> If tau = 0, then H is taken to be the unit matrix.
+*>
+*> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+*> tau.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': form  H * C
+*>          = 'R': form  C * H
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>                     (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+*>                  or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+*>          The vector v in the representation of H. V is not used if
+*>          TAU = 0.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between elements of v. INCV <> 0.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is COMPLEX
+*>          The value tau in the representation of H.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+*>          or C * H if SIDE = 'R'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension
+*>                         (N) if SIDE = 'L'
+*>                      or (M) if SIDE = 'R'
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE CLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          SIDE
+      INTEGER            INCV, LDC, M, N
+      COMPLEX            TAU
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            ONE, ZERO
+      PARAMETER          ( ONE = ( 1.0E+0, 0.0E+0 ),
+     $                   ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            APPLYLEFT
+      INTEGER            I, LASTV, LASTC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMV, CGERC
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILACLR, ILACLC
+      EXTERNAL           LSAME, ILACLR, ILACLC
+*     ..
+*     .. Executable Statements ..
+*
+      APPLYLEFT = LSAME( SIDE, 'L' )
+      LASTV = 0
+      LASTC = 0
+      IF( TAU.NE.ZERO ) THEN
+!     Set up variables for scanning V.  LASTV begins pointing to the end
+!     of V.
+         IF( APPLYLEFT ) THEN
+            LASTV = M
+         ELSE
+            LASTV = N
+         END IF
+         IF( INCV.GT.0 ) THEN
+            I = 1 + (LASTV-1) * INCV
+         ELSE
+            I = 1
+         END IF
+!     Look for the last non-zero row in V.
+         DO WHILE( LASTV.GT.0 .AND. V( I ).EQ.ZERO )
+            LASTV = LASTV - 1
+            I = I - INCV
+         END DO
+         IF( APPLYLEFT ) THEN
+!     Scan for the last non-zero column in C(1:lastv,:).
+            LASTC = ILACLC(LASTV, N, C, LDC)
+         ELSE
+!     Scan for the last non-zero row in C(:,1:lastv).
+            LASTC = ILACLR(M, LASTV, C, LDC)
+         END IF
+      END IF
+!     Note that lastc.eq.0 renders the BLAS operations null; no special
+!     case is needed at this level.
+      IF( APPLYLEFT ) THEN
+*
+*        Form  H * C
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastv,1:lastc)**H * v(1:lastv,1)
+*
+            CALL CGEMV( 'Conjugate transpose', LASTV, LASTC, ONE,
+     $           C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+*           C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**H
+*
+            CALL CGERC( LASTV, LASTC, -TAU, V, INCV, WORK, 1, C, LDC )
+         END IF
+      ELSE
+*
+*        Form  C * H
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1)
+*
+            CALL CGEMV( 'No transpose', LASTC, LASTV, ONE, C, LDC,
+     $           V, INCV, ZERO, WORK, 1 )
+*
+*           C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)**H
+*
+            CALL CGERC( LASTC, LASTV, -TAU, WORK, 1, V, INCV, C, LDC )
+         END IF
+      END IF
+      RETURN
+*
+*     End of CLARF
+*
+      END

diff --git a/lapack/clarfb.f b/lapack/clarfb.f
new file mode 100644
index 0000000..40bbdf4
--- /dev/null
+++ b/lapack/clarfb.f

@@ -0,0 +1,771 @@
+*> \brief \b CLARFB
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CLARFB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clarfb.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clarfb.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clarfb.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+*                          T, LDT, C, LDC, WORK, LDWORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, SIDE, STOREV, TRANS
+*       INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            C( LDC, * ), T( LDT, * ), V( LDV, * ),
+*      $                   WORK( LDWORK, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFB applies a complex block reflector H or its transpose H**H to a
+*> complex M-by-N matrix C, from either the left or the right.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**H from the Left
+*>          = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,K)
+*>          The triangular K-by-K matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (LDWORK,K)
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK.
+*>          If SIDE = 'L', LDWORK >= max(1,N);
+*>          if SIDE = 'R', LDWORK >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complexOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored; the corresponding
+*>  array elements are modified but restored on exit. The rest of the
+*>  array is not used.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+     $                   T, LDT, C, LDC, WORK, LDWORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, SIDE, STOREV, TRANS
+      INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            C( LDC, * ), T( LDT, * ), V( LDV, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER          ( ONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          TRANST
+      INTEGER            I, J, LASTV, LASTC
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILACLR, ILACLC
+      EXTERNAL           LSAME, ILACLR, ILACLC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CGEMM, CLACGV, CTRMM
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LE.0 .OR. N.LE.0 )
+     $   RETURN
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+         TRANST = 'C'
+      ELSE
+         TRANST = 'N'
+      END IF
+*
+      IF( LSAME( STOREV, 'C' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1 )    (first K rows)
+*                     ( V2 )
+*           where  V1  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILACLR( M, K, V, LDV ) )
+               LASTC = ILACLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V  =  (C1**H * V1 + C2**H * V2)  (stored in WORK)
+*
+*              W := C1**H
+*
+               DO 10 J = 1, K
+                  CALL CCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+                  CALL CLACGV( LASTC, WORK( 1, J ), 1 )
+   10          CONTINUE
+*
+*              W := W * V1
+*
+               CALL CTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**H *V2
+*
+                  CALL CGEMM( 'Conjugate transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C( K+1, 1 ), LDC,
+     $                 V( K+1, 1 ), LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL CTRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**H
+*
+               IF( M.GT.K ) THEN
+*
+*                 C2 := C2 - V2 * W**H
+*
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTV-K, LASTC, K, -ONE, V( K+1, 1 ), LDV,
+     $                 WORK, LDWORK, ONE, C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1**H
+*
+               CALL CTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**H
+*
+               DO 30 J = 1, K
+                  DO 20 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - CONJG( WORK( I, J ) )
+   20             CONTINUE
+   30          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILACLR( N, K, V, LDV ) )
+               LASTC = ILACLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 40 J = 1, K
+                  CALL CCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+   40          CONTINUE
+*
+*              W := W * V1
+*
+               CALL CTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2
+*
+                  CALL CGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( 1, K+1 ), LDC, V( K+1, 1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL CTRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2**H
+*
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( K+1, 1 ), LDV,
+     $                 ONE, C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1**H
+*
+               CALL CTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 60 J = 1, K
+                  DO 50 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+   50             CONTINUE
+   60          CONTINUE
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1 )
+*                     ( V2 )    (last K rows)
+*           where  V2  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILACLR( M, K, V, LDV ) )
+               LASTC = ILACLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V  =  (C1**H * V1 + C2**H * V2)  (stored in WORK)
+*
+*              W := C2**H
+*
+               DO 70 J = 1, K
+                  CALL CCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+                  CALL CLACGV( LASTC, WORK( 1, J ), 1 )
+   70          CONTINUE
+*
+*              W := W * V2
+*
+               CALL CTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**H*V1
+*
+                  CALL CGEMM( 'Conjugate transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL CTRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1 * W**H
+*
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTV-K, LASTC, K, -ONE, V, LDV, WORK, LDWORK,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**H
+*
+               CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**H
+*
+               DO 90 J = 1, K
+                  DO 80 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) -
+     $                               CONJG( WORK( I, J ) )
+   80             CONTINUE
+   90          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILACLR( N, K, V, LDV ) )
+               LASTC = ILACLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 100 J = 1, K
+                  CALL CCOPY( LASTC, C( 1, LASTV-K+J ), 1,
+     $                 WORK( 1, J ), 1 )
+  100          CONTINUE
+*
+*              W := W * V2
+*
+               CALL CTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1
+*
+                  CALL CGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C, LDC, V, LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL CTRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1**H
+*
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**H
+*
+               CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W
+*
+               DO 120 J = 1, K
+                  DO 110 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J )
+     $                    - WORK( I, J )
+  110             CONTINUE
+  120          CONTINUE
+            END IF
+         END IF
+*
+      ELSE IF( LSAME( STOREV, 'R' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1  V2 )    (V1: first K columns)
+*           where  V1  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILACLC( K, M, V, LDV ) )
+               LASTC = ILACLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V**H  =  (C1**H * V1**H + C2**H * V2**H) (stored in WORK)
+*
+*              W := C1**H
+*
+               DO 130 J = 1, K
+                  CALL CCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+                  CALL CLACGV( LASTC, WORK( 1, J ), 1 )
+  130          CONTINUE
+*
+*              W := W * V1**H
+*
+               CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $                     'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**H*V2**H
+*
+                  CALL CGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTC, K, LASTV-K,
+     $                 ONE, C( K+1, 1 ), LDC, V( 1, K+1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL CTRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**H * W**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - V2**H * W**H
+*
+                  CALL CGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTV-K, LASTC, K,
+     $                 -ONE, V( 1, K+1 ), LDV, WORK, LDWORK,
+     $                 ONE, C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL CTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**H
+*
+               DO 150 J = 1, K
+                  DO 140 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - CONJG( WORK( I, J ) )
+  140             CONTINUE
+  150          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILACLC( K, N, V, LDV ) )
+               LASTC = ILACLR( M, LASTV, C, LDC )
+*
+*              W := C * V**H  =  (C1*V1**H + C2*V2**H)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 160 J = 1, K
+                  CALL CCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+  160          CONTINUE
+*
+*              W := W * V1**H
+*
+               CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $                     'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2**H
+*
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, K, LASTV-K, ONE, C( 1, K+1 ), LDC,
+     $                 V( 1, K+1 ), LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL CTRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2
+*
+                  CALL CGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( 1, K+1 ), LDV,
+     $                 ONE, C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL CTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 180 J = 1, K
+                  DO 170 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+  170             CONTINUE
+  180          CONTINUE
+*
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1  V2 )    (V2: last K columns)
+*           where  V2  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILACLC( K, M, V, LDV ) )
+               LASTC = ILACLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V**H  =  (C1**H * V1**H + C2**H * V2**H) (stored in WORK)
+*
+*              W := C2**H
+*
+               DO 190 J = 1, K
+                  CALL CCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+                  CALL CLACGV( LASTC, WORK( 1, J ), 1 )
+  190          CONTINUE
+*
+*              W := W * V2**H
+*
+               CALL CTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**H * V1**H
+*
+                  CALL CGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTC, K, LASTV-K,
+     $                 ONE, C, LDC, V, LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL CTRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**H * W**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1**H * W**H
+*
+                  CALL CGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTV-K, LASTC, K,
+     $                 -ONE, V, LDV, WORK, LDWORK, ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL CTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**H
+*
+               DO 210 J = 1, K
+                  DO 200 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) -
+     $                               CONJG( WORK( I, J ) )
+  200             CONTINUE
+  210          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILACLC( K, N, V, LDV ) )
+               LASTC = ILACLR( M, LASTV, C, LDC )
+*
+*              W := C * V**H  =  (C1*V1**H + C2*V2**H)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 220 J = 1, K
+                  CALL CCOPY( LASTC, C( 1, LASTV-K+J ), 1,
+     $                 WORK( 1, J ), 1 )
+  220          CONTINUE
+*
+*              W := W * V2**H
+*
+               CALL CTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1**H
+*
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV, ONE,
+     $                 WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL CTRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1
+*
+                  CALL CGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL CTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 240 J = 1, K
+                  DO 230 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J )
+     $                    - WORK( I, J )
+  230             CONTINUE
+  240          CONTINUE
+*
+            END IF
+*
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CLARFB
+*
+      END

diff --git a/lapack/clarfg.f b/lapack/clarfg.f
new file mode 100644
index 0000000..d64f396
--- /dev/null
+++ b/lapack/clarfg.f

@@ -0,0 +1,203 @@
+*> \brief \b CLARFG
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CLARFG + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clarfg.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clarfg.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clarfg.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLARFG( N, ALPHA, X, INCX, TAU )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       COMPLEX            ALPHA, TAU
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            X( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFG generates a complex elementary reflector H of order n, such
+*> that
+*>
+*>       H**H * ( alpha ) = ( beta ),   H**H * H = I.
+*>              (   x   )   (   0  )
+*>
+*> where alpha and beta are scalars, with beta real, and x is an
+*> (n-1)-element complex vector. H is represented in the form
+*>
+*>       H = I - tau * ( 1 ) * ( 1 v**H ) ,
+*>                     ( v )
+*>
+*> where tau is a complex scalar and v is a complex (n-1)-element
+*> vector. Note that H is not hermitian.
+*>
+*> If the elements of x are all zero and alpha is real, then tau = 0
+*> and H is taken to be the unit matrix.
+*>
+*> Otherwise  1 <= real(tau) <= 2  and  abs(tau-1) <= 1 .
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the elementary reflector.
+*> \endverbatim
+*>
+*> \param[in,out] ALPHA
+*> \verbatim
+*>          ALPHA is COMPLEX
+*>          On entry, the value alpha.
+*>          On exit, it is overwritten with the value beta.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension
+*>                         (1+(N-2)*abs(INCX))
+*>          On entry, the vector x.
+*>          On exit, it is overwritten with the vector v.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between elements of X. INCX > 0.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX
+*>          The value tau.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE CLARFG( N, ALPHA, X, INCX, TAU )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      COMPLEX            ALPHA, TAU
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            X( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            J, KNT
+      REAL               ALPHI, ALPHR, BETA, RSAFMN, SAFMIN, XNORM
+*     ..
+*     .. External Functions ..
+      REAL               SCNRM2, SLAMCH, SLAPY3
+      COMPLEX            CLADIV
+      EXTERNAL           SCNRM2, SLAMCH, SLAPY3, CLADIV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, AIMAG, CMPLX, REAL, SIGN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CSCAL, CSSCAL
+*     ..
+*     .. Executable Statements ..
+*
+      IF( N.LE.0 ) THEN
+         TAU = ZERO
+         RETURN
+      END IF
+*
+      XNORM = SCNRM2( N-1, X, INCX )
+      ALPHR = REAL( ALPHA )
+      ALPHI = AIMAG( ALPHA )
+*
+      IF( XNORM.EQ.ZERO .AND. ALPHI.EQ.ZERO ) THEN
+*
+*        H  =  I
+*
+         TAU = ZERO
+      ELSE
+*
+*        general case
+*
+         BETA = -SIGN( SLAPY3( ALPHR, ALPHI, XNORM ), ALPHR )
+         SAFMIN = SLAMCH( 'S' ) / SLAMCH( 'E' )
+         RSAFMN = ONE / SAFMIN
+*
+         KNT = 0
+         IF( ABS( BETA ).LT.SAFMIN ) THEN
+*
+*           XNORM, BETA may be inaccurate; scale X and recompute them
+*
+   10       CONTINUE
+            KNT = KNT + 1
+            CALL CSSCAL( N-1, RSAFMN, X, INCX )
+            BETA = BETA*RSAFMN
+            ALPHI = ALPHI*RSAFMN
+            ALPHR = ALPHR*RSAFMN
+            IF( ABS( BETA ).LT.SAFMIN )
+     $         GO TO 10
+*
+*           New BETA is at most 1, at least SAFMIN
+*
+            XNORM = SCNRM2( N-1, X, INCX )
+            ALPHA = CMPLX( ALPHR, ALPHI )
+            BETA = -SIGN( SLAPY3( ALPHR, ALPHI, XNORM ), ALPHR )
+         END IF
+         TAU = CMPLX( ( BETA-ALPHR ) / BETA, -ALPHI / BETA )
+         ALPHA = CLADIV( CMPLX( ONE ), ALPHA-BETA )
+         CALL CSCAL( N-1, ALPHA, X, INCX )
+*
+*        If ALPHA is subnormal, it may lose relative accuracy
+*
+         DO 20 J = 1, KNT
+            BETA = BETA*SAFMIN
+ 20      CONTINUE
+         ALPHA = BETA
+      END IF
+*
+      RETURN
+*
+*     End of CLARFG
+*
+      END

diff --git a/lapack/clarft.f b/lapack/clarft.f
new file mode 100644
index 0000000..981447f
--- /dev/null
+++ b/lapack/clarft.f

@@ -0,0 +1,328 @@
+*> \brief \b CLARFT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download CLARFT + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clarft.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clarft.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clarft.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, STOREV
+*       INTEGER            K, LDT, LDV, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            T( LDT, * ), TAU( * ), V( LDV, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFT forms the triangular factor T of a complex block reflector H
+*> of order n, which is defined as a product of k elementary reflectors.
+*>
+*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+*>
+*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+*>
+*> If STOREV = 'C', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th column of the array V, and
+*>
+*>    H  =  I - V * T * V**H
+*>
+*> If STOREV = 'R', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th row of the array V, and
+*>
+*>    H  =  I - V**H * T * V
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Specifies the order in which the elementary reflectors are
+*>          multiplied to form the block reflector:
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Specifies how the vectors which define the elementary
+*>          reflectors are stored (see also Further Details):
+*>          = 'C': columnwise
+*>          = 'R': rowwise
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the block reflector H. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the triangular factor T (= the number of
+*>          elementary reflectors). K >= 1.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>                               (LDV,K) if STOREV = 'C'
+*>                               (LDV,N) if STOREV = 'R'
+*>          The matrix V. See further details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (K)
+*>          TAU(i) must contain the scalar factor of the elementary
+*>          reflector H(i).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,K)
+*>          The k by k triangular factor T of the block reflector.
+*>          If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+*>          lower triangular. The rest of the array is not used.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complexOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, STOREV
+      INTEGER            K, LDT, LDV, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            T( LDT, * ), TAU( * ), V( LDV, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            ONE, ZERO
+      PARAMETER          ( ONE = ( 1.0E+0, 0.0E+0 ),
+     $                   ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J, PREVLASTV, LASTV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMV, CLACGV, CTRMV
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( LSAME( DIRECT, 'F' ) ) THEN
+         PREVLASTV = N
+         DO I = 1, K
+            PREVLASTV = MAX( PREVLASTV, I )
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = 1, I
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( LSAME( STOREV, 'C' ) ) THEN
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( LASTV, I ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * CONJG( V( I , J ) )
+                  END DO                     
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**H * V(i:j,i)
+*
+                  CALL CGEMV( 'Conjugate transpose', J-I, I-1,
+     $                        -TAU( I ), V( I+1, 1 ), LDV, 
+     $                        V( I+1, I ), 1,
+     $                        ONE, T( 1, I ), 1 )
+               ELSE
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( I, LASTV ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * V( J , I )
+                  END DO                     
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**H
+*
+                  CALL CGEMM( 'N', 'C', I-1, 1, J-I, -TAU( I ),
+     $                        V( 1, I+1 ), LDV, V( I, I+1 ), LDV,
+     $                        ONE, T( 1, I ), LDT )                  
+               END IF
+*
+*              T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i)
+*
+               CALL CTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T,
+     $                     LDT, T( 1, I ), 1 )
+               T( I, I ) = TAU( I )
+               IF( I.GT.1 ) THEN
+                  PREVLASTV = MAX( PREVLASTV, LASTV )
+               ELSE
+                  PREVLASTV = LASTV
+               END IF
+            END IF
+         END DO
+      ELSE
+         PREVLASTV = 1
+         DO I = K, 1, -1
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = I, K
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( I.LT.K ) THEN
+                  IF( LSAME( STOREV, 'C' ) ) THEN
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( LASTV, I ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * CONJG( V( N-K+I , J ) )
+                     END DO                        
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**H * V(j:n-k+i,i)
+*
+                     CALL CGEMV( 'Conjugate transpose', N-K+I-J, K-I,
+     $                           -TAU( I ), V( J, I+1 ), LDV, V( J, I ),
+     $                           1, ONE, T( I+1, I ), 1 )
+                  ELSE
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( I, LASTV ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * V( J, N-K+I )
+                     END DO                      
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**H
+*
+                     CALL CGEMM( 'N', 'C', K-I, 1, N-K+I-J, -TAU( I ),
+     $                           V( I+1, J ), LDV, V( I, J ), LDV,
+     $                           ONE, T( I+1, I ), LDT )                     
+                  END IF
+*
+*                 T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i)
+*
+                  CALL CTRMV( 'Lower', 'No transpose', 'Non-unit', K-I,
+     $                        T( I+1, I+1 ), LDT, T( I+1, I ), 1 )
+                  IF( I.GT.1 ) THEN
+                     PREVLASTV = MIN( PREVLASTV, LASTV )
+                  ELSE
+                     PREVLASTV = LASTV
+                  END IF
+               END IF
+               T( I, I ) = TAU( I )
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of CLARFT
+*
+      END

diff --git a/lapack/complex_double.cpp b/lapack/complex_double.cpp
new file mode 100644
index 0000000..c9c5752
--- /dev/null
+++ b/lapack/complex_double.cpp

@@ -0,0 +1,18 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        std::complex<double>
+#define SCALAR_SUFFIX z
+#define SCALAR_SUFFIX_UP "Z"
+#define REAL_SCALAR_SUFFIX d
+#define ISCOMPLEX     1
+
+#include "cholesky.cpp"
+#include "lu.cpp"
+#include "svd.cpp"

diff --git a/lapack/complex_single.cpp b/lapack/complex_single.cpp
new file mode 100644
index 0000000..6d11b26
--- /dev/null
+++ b/lapack/complex_single.cpp

@@ -0,0 +1,18 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        std::complex<float>
+#define SCALAR_SUFFIX c
+#define SCALAR_SUFFIX_UP "C"
+#define REAL_SCALAR_SUFFIX s
+#define ISCOMPLEX     1
+
+#include "cholesky.cpp"
+#include "lu.cpp"
+#include "svd.cpp"

diff --git a/lapack/dladiv.f b/lapack/dladiv.f
new file mode 100644
index 0000000..090a906
--- /dev/null
+++ b/lapack/dladiv.f

@@ -0,0 +1,128 @@
+*> \brief \b DLADIV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLADIV + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dladiv.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dladiv.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dladiv.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLADIV( A, B, C, D, P, Q )
+* 
+*       .. Scalar Arguments ..
+*       DOUBLE PRECISION   A, B, C, D, P, Q
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLADIV performs complex division in  real arithmetic
+*>
+*>                       a + i*b
+*>            p + i*q = ---------
+*>                       c + i*d
+*>
+*> The algorithm is due to Robert L. Smith and can be found
+*> in D. Knuth, The art of Computer Programming, Vol.2, p.195
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] A
+*> \verbatim
+*>          A is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*>          C is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] D
+*> \verbatim
+*>          D is DOUBLE PRECISION
+*>          The scalars a, b, c, and d in the above expression.
+*> \endverbatim
+*>
+*> \param[out] P
+*> \verbatim
+*>          P is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*>          Q is DOUBLE PRECISION
+*>          The scalars p and q in the above expression.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE DLADIV( A, B, C, D, P, Q )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   A, B, C, D, P, Q
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      DOUBLE PRECISION   E, F
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS
+*     ..
+*     .. Executable Statements ..
+*
+      IF( ABS( D ).LT.ABS( C ) ) THEN
+         E = D / C
+         F = C + D*E
+         P = ( A+B*E ) / F
+         Q = ( B-A*E ) / F
+      ELSE
+         E = C / D
+         F = D + C*E
+         P = ( B+A*E ) / F
+         Q = ( -A+B*E ) / F
+      END IF
+*
+      RETURN
+*
+*     End of DLADIV
+*
+      END

diff --git a/lapack/dlamch.f b/lapack/dlamch.f
new file mode 100644
index 0000000..eb307e5
--- /dev/null
+++ b/lapack/dlamch.f

@@ -0,0 +1,189 @@
+*> \brief \b DLAMCH
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*      DOUBLE PRECISION FUNCTION DLAMCH( CMACH )
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAMCH determines double precision machine parameters.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] CMACH
+*> \verbatim
+*>          Specifies the value to be returned by DLAMCH:
+*>          = 'E' or 'e',   DLAMCH := eps
+*>          = 'S' or 's ,   DLAMCH := sfmin
+*>          = 'B' or 'b',   DLAMCH := base
+*>          = 'P' or 'p',   DLAMCH := eps*base
+*>          = 'N' or 'n',   DLAMCH := t
+*>          = 'R' or 'r',   DLAMCH := rnd
+*>          = 'M' or 'm',   DLAMCH := emin
+*>          = 'U' or 'u',   DLAMCH := rmin
+*>          = 'L' or 'l',   DLAMCH := emax
+*>          = 'O' or 'o',   DLAMCH := rmax
+*>          where
+*>          eps   = relative machine precision
+*>          sfmin = safe minimum, such that 1/sfmin does not overflow
+*>          base  = base of the machine
+*>          prec  = eps*base
+*>          t     = number of (base) digits in the mantissa
+*>          rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise
+*>          emin  = minimum exponent before (gradual) underflow
+*>          rmin  = underflow threshold - base**(emin-1)
+*>          emax  = largest exponent before overflow
+*>          rmax  = overflow threshold  - (base**emax)*(1-eps)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      DOUBLE PRECISION FUNCTION DLAMCH( CMACH )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          CMACH
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   RND, EPS, SFMIN, SMALL, RMACH
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DIGITS, EPSILON, HUGE, MAXEXPONENT,
+     $                   MINEXPONENT, RADIX, TINY
+*     ..
+*     .. Executable Statements ..
+*
+*
+*     Assume rounding, not chopping. Always.
+*
+      RND = ONE
+*
+      IF( ONE.EQ.RND ) THEN
+         EPS = EPSILON(ZERO) * 0.5
+      ELSE
+         EPS = EPSILON(ZERO)
+      END IF
+*
+      IF( LSAME( CMACH, 'E' ) ) THEN
+         RMACH = EPS
+      ELSE IF( LSAME( CMACH, 'S' ) ) THEN
+         SFMIN = TINY(ZERO)
+         SMALL = ONE / HUGE(ZERO)
+         IF( SMALL.GE.SFMIN ) THEN
+*
+*           Use SMALL plus a bit, to avoid the possibility of rounding
+*           causing overflow when computing  1/sfmin.
+*
+            SFMIN = SMALL*( ONE+EPS )
+         END IF
+         RMACH = SFMIN
+      ELSE IF( LSAME( CMACH, 'B' ) ) THEN
+         RMACH = RADIX(ZERO)
+      ELSE IF( LSAME( CMACH, 'P' ) ) THEN
+         RMACH = EPS * RADIX(ZERO)
+      ELSE IF( LSAME( CMACH, 'N' ) ) THEN
+         RMACH = DIGITS(ZERO)
+      ELSE IF( LSAME( CMACH, 'R' ) ) THEN
+         RMACH = RND
+      ELSE IF( LSAME( CMACH, 'M' ) ) THEN
+         RMACH = MINEXPONENT(ZERO)
+      ELSE IF( LSAME( CMACH, 'U' ) ) THEN
+         RMACH = tiny(zero)
+      ELSE IF( LSAME( CMACH, 'L' ) ) THEN
+         RMACH = MAXEXPONENT(ZERO)
+      ELSE IF( LSAME( CMACH, 'O' ) ) THEN
+         RMACH = HUGE(ZERO)
+      ELSE
+         RMACH = ZERO
+      END IF
+*
+      DLAMCH = RMACH
+      RETURN
+*
+*     End of DLAMCH
+*
+      END
+************************************************************************
+*> \brief \b DLAMC3
+*> \details
+*> \b Purpose:
+*> \verbatim
+*> DLAMC3  is intended to force  A  and  B  to be stored prior to doing
+*> the addition of  A  and  B ,  for use in situations where optimizers
+*> might hold one of these in a register.
+*> \endverbatim
+*> \author LAPACK is a software package provided by Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+*> \date November 2011
+*> \ingroup auxOTHERauxiliary
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is a DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is a DOUBLE PRECISION
+*>          The values A and B.
+*> \endverbatim
+*>
+      DOUBLE PRECISION FUNCTION DLAMC3( A, B )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+*     November 2010
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   A, B
+*     ..
+* =====================================================================
+*
+*     .. Executable Statements ..
+*
+      DLAMC3 = A + B
+*
+      RETURN
+*
+*     End of DLAMC3
+*
+      END
+*
+************************************************************************

diff --git a/lapack/dlapy2.f b/lapack/dlapy2.f
new file mode 100644
index 0000000..e6a62bf
--- /dev/null
+++ b/lapack/dlapy2.f

@@ -0,0 +1,104 @@
+*> \brief \b DLAPY2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLAPY2 + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlapy2.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlapy2.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlapy2.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       DOUBLE PRECISION FUNCTION DLAPY2( X, Y )
+* 
+*       .. Scalar Arguments ..
+*       DOUBLE PRECISION   X, Y
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAPY2 returns sqrt(x**2+y**2), taking care not to cause unnecessary
+*> overflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] X
+*> \verbatim
+*>          X is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] Y
+*> \verbatim
+*>          Y is DOUBLE PRECISION
+*>          X and Y specify the values x and y.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      DOUBLE PRECISION FUNCTION DLAPY2( X, Y )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   X, Y
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D0 )
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   W, XABS, YABS, Z
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+      XABS = ABS( X )
+      YABS = ABS( Y )
+      W = MAX( XABS, YABS )
+      Z = MIN( XABS, YABS )
+      IF( Z.EQ.ZERO ) THEN
+         DLAPY2 = W
+      ELSE
+         DLAPY2 = W*SQRT( ONE+( Z / W )**2 )
+      END IF
+      RETURN
+*
+*     End of DLAPY2
+*
+      END

diff --git a/lapack/dlapy3.f b/lapack/dlapy3.f
new file mode 100644
index 0000000..ae9844f
--- /dev/null
+++ b/lapack/dlapy3.f

@@ -0,0 +1,111 @@
+*> \brief \b DLAPY3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLAPY3 + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlapy3.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlapy3.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlapy3.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       DOUBLE PRECISION FUNCTION DLAPY3( X, Y, Z )
+* 
+*       .. Scalar Arguments ..
+*       DOUBLE PRECISION   X, Y, Z
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAPY3 returns sqrt(x**2+y**2+z**2), taking care not to cause
+*> unnecessary overflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] X
+*> \verbatim
+*>          X is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] Y
+*> \verbatim
+*>          Y is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] Z
+*> \verbatim
+*>          Z is DOUBLE PRECISION
+*>          X, Y and Z specify the values x, y and z.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      DOUBLE PRECISION FUNCTION DLAPY3( X, Y, Z )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   X, Y, Z
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   W, XABS, YABS, ZABS
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+      XABS = ABS( X )
+      YABS = ABS( Y )
+      ZABS = ABS( Z )
+      W = MAX( XABS, YABS, ZABS )
+      IF( W.EQ.ZERO ) THEN
+*     W can be zero for max(0,nan,0)
+*     adding all three entries together will make sure
+*     NaN will not disappear.
+         DLAPY3 =  XABS + YABS + ZABS
+      ELSE
+         DLAPY3 = W*SQRT( ( XABS / W )**2+( YABS / W )**2+
+     $            ( ZABS / W )**2 )
+      END IF
+      RETURN
+*
+*     End of DLAPY3
+*
+      END

diff --git a/lapack/dlarf.f b/lapack/dlarf.f
new file mode 100644
index 0000000..2a82ff4
--- /dev/null
+++ b/lapack/dlarf.f

@@ -0,0 +1,227 @@
+*> \brief \b DLARF
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLARF + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlarf.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlarf.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlarf.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          SIDE
+*       INTEGER            INCV, LDC, M, N
+*       DOUBLE PRECISION   TAU
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   C( LDC, * ), V( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARF applies a real elementary reflector H to a real m by n matrix
+*> C, from either the left or the right. H is represented in the form
+*>
+*>       H = I - tau * v * v**T
+*>
+*> where tau is a real scalar and v is a real vector.
+*>
+*> If tau = 0, then H is taken to be the unit matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': form  H * C
+*>          = 'R': form  C * H
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>                     (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+*>                  or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+*>          The vector v in the representation of H. V is not used if
+*>          TAU = 0.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between elements of v. INCV <> 0.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION
+*>          The value tau in the representation of H.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+*>          or C * H if SIDE = 'R'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension
+*>                         (N) if SIDE = 'L'
+*>                      or (M) if SIDE = 'R'
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE DLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          SIDE
+      INTEGER            INCV, LDC, M, N
+      DOUBLE PRECISION   TAU
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            APPLYLEFT
+      INTEGER            I, LASTV, LASTC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMV, DGER
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILADLR, ILADLC
+      EXTERNAL           LSAME, ILADLR, ILADLC
+*     ..
+*     .. Executable Statements ..
+*
+      APPLYLEFT = LSAME( SIDE, 'L' )
+      LASTV = 0
+      LASTC = 0
+      IF( TAU.NE.ZERO ) THEN
+!     Set up variables for scanning V.  LASTV begins pointing to the end
+!     of V.
+         IF( APPLYLEFT ) THEN
+            LASTV = M
+         ELSE
+            LASTV = N
+         END IF
+         IF( INCV.GT.0 ) THEN
+            I = 1 + (LASTV-1) * INCV
+         ELSE
+            I = 1
+         END IF
+!     Look for the last non-zero row in V.
+         DO WHILE( LASTV.GT.0 .AND. V( I ).EQ.ZERO )
+            LASTV = LASTV - 1
+            I = I - INCV
+         END DO
+         IF( APPLYLEFT ) THEN
+!     Scan for the last non-zero column in C(1:lastv,:).
+            LASTC = ILADLC(LASTV, N, C, LDC)
+         ELSE
+!     Scan for the last non-zero row in C(:,1:lastv).
+            LASTC = ILADLR(M, LASTV, C, LDC)
+         END IF
+      END IF
+!     Note that lastc.eq.0 renders the BLAS operations null; no special
+!     case is needed at this level.
+      IF( APPLYLEFT ) THEN
+*
+*        Form  H * C
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastv,1:lastc)**T * v(1:lastv,1)
+*
+            CALL DGEMV( 'Transpose', LASTV, LASTC, ONE, C, LDC, V, INCV,
+     $           ZERO, WORK, 1 )
+*
+*           C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**T
+*
+            CALL DGER( LASTV, LASTC, -TAU, V, INCV, WORK, 1, C, LDC )
+         END IF
+      ELSE
+*
+*        Form  C * H
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1)
+*
+            CALL DGEMV( 'No transpose', LASTC, LASTV, ONE, C, LDC,
+     $           V, INCV, ZERO, WORK, 1 )
+*
+*           C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)**T
+*
+            CALL DGER( LASTC, LASTV, -TAU, WORK, 1, V, INCV, C, LDC )
+         END IF
+      END IF
+      RETURN
+*
+*     End of DLARF
+*
+      END

diff --git a/lapack/dlarfb.f b/lapack/dlarfb.f
new file mode 100644
index 0000000..206d3b2
--- /dev/null
+++ b/lapack/dlarfb.f

@@ -0,0 +1,762 @@
+*> \brief \b DLARFB
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLARFB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlarfb.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlarfb.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlarfb.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+*                          T, LDT, C, LDC, WORK, LDWORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, SIDE, STOREV, TRANS
+*       INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   C( LDC, * ), T( LDT, * ), V( LDV, * ),
+*      $                   WORK( LDWORK, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFB applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C, from either the left or the right.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**T from the Left
+*>          = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,K)
+*>          The triangular k by k matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (LDWORK,K)
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK.
+*>          If SIDE = 'L', LDWORK >= max(1,N);
+*>          if SIDE = 'R', LDWORK >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored; the corresponding
+*>  array elements are modified but restored on exit. The rest of the
+*>  array is not used.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+     $                   T, LDT, C, LDC, WORK, LDWORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, SIDE, STOREV, TRANS
+      INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   C( LDC, * ), T( LDT, * ), V( LDV, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          TRANST
+      INTEGER            I, J, LASTV, LASTC
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILADLR, ILADLC
+      EXTERNAL           LSAME, ILADLR, ILADLC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGEMM, DTRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LE.0 .OR. N.LE.0 )
+     $   RETURN
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+         TRANST = 'T'
+      ELSE
+         TRANST = 'N'
+      END IF
+*
+      IF( LSAME( STOREV, 'C' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1 )    (first K rows)
+*                     ( V2 )
+*           where  V1  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILADLR( M, K, V, LDV ) )
+               LASTC = ILADLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V  =  (C1**T * V1 + C2**T * V2)  (stored in WORK)
+*
+*              W := C1**T
+*
+               DO 10 J = 1, K
+                  CALL DCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+   10          CONTINUE
+*
+*              W := W * V1
+*
+               CALL DTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**T *V2
+*
+                  CALL DGEMM( 'Transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( K+1, 1 ), LDC, V( K+1, 1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL DTRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - V2 * W**T
+*
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K,
+     $                 -ONE, V( K+1, 1 ), LDV, WORK, LDWORK, ONE,
+     $                 C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1**T
+*
+               CALL DTRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**T
+*
+               DO 30 J = 1, K
+                  DO 20 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - WORK( I, J )
+   20             CONTINUE
+   30          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILADLR( N, K, V, LDV ) )
+               LASTC = ILADLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 40 J = 1, K
+                  CALL DCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+   40          CONTINUE
+*
+*              W := W * V1
+*
+               CALL DTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2
+*
+                  CALL DGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( 1, K+1 ), LDC, V( K+1, 1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL DTRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2**T
+*
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( K+1, 1 ), LDV, ONE,
+     $                 C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1**T
+*
+               CALL DTRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 60 J = 1, K
+                  DO 50 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+   50             CONTINUE
+   60          CONTINUE
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1 )
+*                     ( V2 )    (last K rows)
+*           where  V2  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILADLR( M, K, V, LDV ) )
+               LASTC = ILADLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V  =  (C1**T * V1 + C2**T * V2)  (stored in WORK)
+*
+*              W := C2**T
+*
+               DO 70 J = 1, K
+                  CALL DCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+   70          CONTINUE
+*
+*              W := W * V2
+*
+               CALL DTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**T*V1
+*
+                  CALL DGEMM( 'Transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL DTRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1 * W**T
+*
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K, -ONE, V, LDV, WORK, LDWORK,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**T
+*
+               CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**T
+*
+               DO 90 J = 1, K
+                  DO 80 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) - WORK(I, J)
+   80             CONTINUE
+   90          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILADLR( N, K, V, LDV ) )
+               LASTC = ILADLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 100 J = 1, K
+                  CALL DCOPY( LASTC, C( 1, N-K+J ), 1, WORK( 1, J ), 1 )
+  100          CONTINUE
+*
+*              W := W * V2
+*
+               CALL DTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1
+*
+                  CALL DGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL DTRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1**T
+*
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**T
+*
+               CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W
+*
+               DO 120 J = 1, K
+                  DO 110 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J ) - WORK(I, J)
+  110             CONTINUE
+  120          CONTINUE
+            END IF
+         END IF
+*
+      ELSE IF( LSAME( STOREV, 'R' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1  V2 )    (V1: first K columns)
+*           where  V1  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILADLC( K, M, V, LDV ) )
+               LASTC = ILADLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V**T  =  (C1**T * V1**T + C2**T * V2**T) (stored in WORK)
+*
+*              W := C1**T
+*
+               DO 130 J = 1, K
+                  CALL DCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+  130          CONTINUE
+*
+*              W := W * V1**T
+*
+               CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**T*V2**T
+*
+                  CALL DGEMM( 'Transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( K+1, 1 ), LDC, V( 1, K+1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL DTRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**T * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - V2**T * W**T
+*
+                  CALL DGEMM( 'Transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K,
+     $                 -ONE, V( 1, K+1 ), LDV, WORK, LDWORK,
+     $                 ONE, C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL DTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**T
+*
+               DO 150 J = 1, K
+                  DO 140 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - WORK( I, J )
+  140             CONTINUE
+  150          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILADLC( K, N, V, LDV ) )
+               LASTC = ILADLR( M, LASTV, C, LDC )
+*
+*              W := C * V**T  =  (C1*V1**T + C2*V2**T)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 160 J = 1, K
+                  CALL DCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+  160          CONTINUE
+*
+*              W := W * V1**T
+*
+               CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2**T
+*
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( 1, K+1 ), LDC, V( 1, K+1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL DTRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2
+*
+                  CALL DGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( 1, K+1 ), LDV,
+     $                 ONE, C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL DTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 180 J = 1, K
+                  DO 170 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+  170             CONTINUE
+  180          CONTINUE
+*
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1  V2 )    (V2: last K columns)
+*           where  V2  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILADLC( K, M, V, LDV ) )
+               LASTC = ILADLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V**T  =  (C1**T * V1**T + C2**T * V2**T) (stored in WORK)
+*
+*              W := C2**T
+*
+               DO 190 J = 1, K
+                  CALL DCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+  190          CONTINUE
+*
+*              W := W * V2**T
+*
+               CALL DTRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**T * V1**T
+*
+                  CALL DGEMM( 'Transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL DTRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**T * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1**T * W**T
+*
+                  CALL DGEMM( 'Transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K, -ONE, V, LDV, WORK, LDWORK,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL DTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**T
+*
+               DO 210 J = 1, K
+                  DO 200 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) - WORK(I, J)
+  200             CONTINUE
+  210          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILADLC( K, N, V, LDV ) )
+               LASTC = ILADLR( M, LASTV, C, LDC )
+*
+*              W := C * V**T  =  (C1*V1**T + C2*V2**T)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 220 J = 1, K
+                  CALL DCOPY( LASTC, C( 1, LASTV-K+J ), 1,
+     $                 WORK( 1, J ), 1 )
+  220          CONTINUE
+*
+*              W := W * V2**T
+*
+               CALL DTRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1**T
+*
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL DTRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1
+*
+                  CALL DGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL DTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 240 J = 1, K
+                  DO 230 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J ) - WORK(I, J)
+  230             CONTINUE
+  240          CONTINUE
+*
+            END IF
+*
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DLARFB
+*
+      END

diff --git a/lapack/dlarfg.f b/lapack/dlarfg.f
new file mode 100644
index 0000000..458ad2e
--- /dev/null
+++ b/lapack/dlarfg.f

@@ -0,0 +1,196 @@
+*> \brief \b DLARFG
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLARFG + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlarfg.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlarfg.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlarfg.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLARFG( N, ALPHA, X, INCX, TAU )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       DOUBLE PRECISION   ALPHA, TAU
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   X( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFG generates a real elementary reflector H of order n, such
+*> that
+*>
+*>       H * ( alpha ) = ( beta ),   H**T * H = I.
+*>           (   x   )   (   0  )
+*>
+*> where alpha and beta are scalars, and x is an (n-1)-element real
+*> vector. H is represented in the form
+*>
+*>       H = I - tau * ( 1 ) * ( 1 v**T ) ,
+*>                     ( v )
+*>
+*> where tau is a real scalar and v is a real (n-1)-element
+*> vector.
+*>
+*> If the elements of x are all zero, then tau = 0 and H is taken to be
+*> the unit matrix.
+*>
+*> Otherwise  1 <= tau <= 2.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the elementary reflector.
+*> \endverbatim
+*>
+*> \param[in,out] ALPHA
+*> \verbatim
+*>          ALPHA is DOUBLE PRECISION
+*>          On entry, the value alpha.
+*>          On exit, it is overwritten with the value beta.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is DOUBLE PRECISION array, dimension
+*>                         (1+(N-2)*abs(INCX))
+*>          On entry, the vector x.
+*>          On exit, it is overwritten with the vector v.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between elements of X. INCX > 0.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION
+*>          The value tau.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE DLARFG( N, ALPHA, X, INCX, TAU )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      DOUBLE PRECISION   ALPHA, TAU
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   X( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            J, KNT
+      DOUBLE PRECISION   BETA, RSAFMN, SAFMIN, XNORM
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, DLAPY2, DNRM2
+      EXTERNAL           DLAMCH, DLAPY2, DNRM2
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, SIGN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSCAL
+*     ..
+*     .. Executable Statements ..
+*
+      IF( N.LE.1 ) THEN
+         TAU = ZERO
+         RETURN
+      END IF
+*
+      XNORM = DNRM2( N-1, X, INCX )
+*
+      IF( XNORM.EQ.ZERO ) THEN
+*
+*        H  =  I
+*
+         TAU = ZERO
+      ELSE
+*
+*        general case
+*
+         BETA = -SIGN( DLAPY2( ALPHA, XNORM ), ALPHA )
+         SAFMIN = DLAMCH( 'S' ) / DLAMCH( 'E' )
+         KNT = 0
+         IF( ABS( BETA ).LT.SAFMIN ) THEN
+*
+*           XNORM, BETA may be inaccurate; scale X and recompute them
+*
+            RSAFMN = ONE / SAFMIN
+   10       CONTINUE
+            KNT = KNT + 1
+            CALL DSCAL( N-1, RSAFMN, X, INCX )
+            BETA = BETA*RSAFMN
+            ALPHA = ALPHA*RSAFMN
+            IF( ABS( BETA ).LT.SAFMIN )
+     $         GO TO 10
+*
+*           New BETA is at most 1, at least SAFMIN
+*
+            XNORM = DNRM2( N-1, X, INCX )
+            BETA = -SIGN( DLAPY2( ALPHA, XNORM ), ALPHA )
+         END IF
+         TAU = ( BETA-ALPHA ) / BETA
+         CALL DSCAL( N-1, ONE / ( ALPHA-BETA ), X, INCX )
+*
+*        If ALPHA is subnormal, it may lose relative accuracy
+*
+         DO 20 J = 1, KNT
+            BETA = BETA*SAFMIN
+ 20      CONTINUE
+         ALPHA = BETA
+      END IF
+*
+      RETURN
+*
+*     End of DLARFG
+*
+      END

diff --git a/lapack/dlarft.f b/lapack/dlarft.f
new file mode 100644
index 0000000..4b75504
--- /dev/null
+++ b/lapack/dlarft.f

@@ -0,0 +1,326 @@
+*> \brief \b DLARFT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download DLARFT + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlarft.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlarft.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlarft.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, STOREV
+*       INTEGER            K, LDT, LDV, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   T( LDT, * ), TAU( * ), V( LDV, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFT forms the triangular factor T of a real block reflector H
+*> of order n, which is defined as a product of k elementary reflectors.
+*>
+*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+*>
+*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+*>
+*> If STOREV = 'C', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th column of the array V, and
+*>
+*>    H  =  I - V * T * V**T
+*>
+*> If STOREV = 'R', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th row of the array V, and
+*>
+*>    H  =  I - V**T * T * V
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Specifies the order in which the elementary reflectors are
+*>          multiplied to form the block reflector:
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Specifies how the vectors which define the elementary
+*>          reflectors are stored (see also Further Details):
+*>          = 'C': columnwise
+*>          = 'R': rowwise
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the block reflector H. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the triangular factor T (= the number of
+*>          elementary reflectors). K >= 1.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>                               (LDV,K) if STOREV = 'C'
+*>                               (LDV,N) if STOREV = 'R'
+*>          The matrix V. See further details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (K)
+*>          TAU(i) must contain the scalar factor of the elementary
+*>          reflector H(i).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,K)
+*>          The k by k triangular factor T of the block reflector.
+*>          If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+*>          lower triangular. The rest of the array is not used.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, STOREV
+      INTEGER            K, LDT, LDV, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   T( LDT, * ), TAU( * ), V( LDV, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J, PREVLASTV, LASTV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMV, DTRMV
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( LSAME( DIRECT, 'F' ) ) THEN
+         PREVLASTV = N
+         DO I = 1, K
+            PREVLASTV = MAX( I, PREVLASTV )
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = 1, I
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( LSAME( STOREV, 'C' ) ) THEN
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( LASTV, I ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * V( I , J )
+                  END DO   
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**T * V(i:j,i)
+*
+                  CALL DGEMV( 'Transpose', J-I, I-1, -TAU( I ), 
+     $                        V( I+1, 1 ), LDV, V( I+1, I ), 1, ONE, 
+     $                        T( 1, I ), 1 )
+               ELSE
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( I, LASTV ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * V( J , I )
+                  END DO   
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**T
+*
+                  CALL DGEMV( 'No transpose', I-1, J-I, -TAU( I ),
+     $                        V( 1, I+1 ), LDV, V( I, I+1 ), LDV, ONE,
+     $                        T( 1, I ), 1 )
+               END IF
+*
+*              T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i)
+*
+               CALL DTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T,
+     $                     LDT, T( 1, I ), 1 )
+               T( I, I ) = TAU( I )
+               IF( I.GT.1 ) THEN
+                  PREVLASTV = MAX( PREVLASTV, LASTV )
+               ELSE
+                  PREVLASTV = LASTV
+               END IF
+            END IF
+         END DO
+      ELSE
+         PREVLASTV = 1
+         DO I = K, 1, -1
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = I, K
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( I.LT.K ) THEN
+                  IF( LSAME( STOREV, 'C' ) ) THEN
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( LASTV, I ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * V( N-K+I , J )
+                     END DO   
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**T * V(j:n-k+i,i)
+*
+                     CALL DGEMV( 'Transpose', N-K+I-J, K-I, -TAU( I ),
+     $                           V( J, I+1 ), LDV, V( J, I ), 1, ONE,
+     $                           T( I+1, I ), 1 )
+                  ELSE
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( I, LASTV ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * V( J, N-K+I )
+                     END DO   
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**T
+*
+                     CALL DGEMV( 'No transpose', K-I, N-K+I-J,
+     $                    -TAU( I ), V( I+1, J ), LDV, V( I, J ), LDV,
+     $                    ONE, T( I+1, I ), 1 )
+                  END IF
+*
+*                 T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i)
+*
+                  CALL DTRMV( 'Lower', 'No transpose', 'Non-unit', K-I,
+     $                        T( I+1, I+1 ), LDT, T( I+1, I ), 1 )
+                  IF( I.GT.1 ) THEN
+                     PREVLASTV = MIN( PREVLASTV, LASTV )
+                  ELSE
+                     PREVLASTV = LASTV
+                  END IF
+               END IF
+               T( I, I ) = TAU( I )
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of DLARFT
+*
+      END

diff --git a/lapack/double.cpp b/lapack/double.cpp
new file mode 100644
index 0000000..ea78bb6
--- /dev/null
+++ b/lapack/double.cpp

@@ -0,0 +1,18 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        double
+#define SCALAR_SUFFIX d
+#define SCALAR_SUFFIX_UP "D"
+#define ISCOMPLEX     0
+
+#include "cholesky.cpp"
+#include "lu.cpp"
+#include "eigenvalues.cpp"
+#include "svd.cpp"

diff --git a/lapack/dsecnd_NONE.f b/lapack/dsecnd_NONE.f
new file mode 100644
index 0000000..61a8dff
--- /dev/null
+++ b/lapack/dsecnd_NONE.f

@@ -0,0 +1,52 @@
+*> \brief \b DSECND returns nothing
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*      DOUBLE PRECISION FUNCTION DSECND( )
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>  DSECND returns nothing instead of returning the user time for a process in seconds.
+*>  If you are using that routine, it means that neither EXTERNAL ETIME,
+*>  EXTERNAL ETIME_, INTERNAL ETIME, INTERNAL CPU_TIME is available  on
+*>  your machine.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      DOUBLE PRECISION FUNCTION DSECND( )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+* =====================================================================
+*
+      DSECND = 0.0D+0
+      RETURN
+*
+*     End of DSECND
+*
+      END

diff --git a/lapack/eigenvalues.cpp b/lapack/eigenvalues.cpp
new file mode 100644
index 0000000..921c515
--- /dev/null
+++ b/lapack/eigenvalues.cpp

@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "lapack_common.h"
+#include <Eigen/Eigenvalues>
+
+// computes eigen values and vectors of a general N-by-N matrix A
+EIGEN_LAPACK_FUNC(syev,(char *jobz, char *uplo, int* n, Scalar* a, int *lda, Scalar* w, Scalar* /*work*/, int* lwork, int *info))
+{
+  // TODO exploit the work buffer
+  bool query_size = *lwork==-1;
+  
+  *info = 0;
+        if(*jobz!='N' && *jobz!='V')                    *info = -1;
+  else  if(UPLO(*uplo)==INVALID)                        *info = -2;
+  else  if(*n<0)                                        *info = -3;
+  else  if(*lda<std::max(1,*n))                         *info = -5;
+  else  if((!query_size) && *lwork<std::max(1,3**n-1))  *info = -8;
+    
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"SYEV ", &e, 6);
+  }
+  
+  if(query_size)
+  {
+    *lwork = 0;
+    return 0;
+  }
+  
+  if(*n==0)
+    return 0;
+  
+  PlainMatrixType mat(*n,*n);
+  if(UPLO(*uplo)==UP) mat = matrix(a,*n,*n,*lda).adjoint();
+  else                mat = matrix(a,*n,*n,*lda);
+  
+  bool computeVectors = *jobz=='V' || *jobz=='v';
+  SelfAdjointEigenSolver<PlainMatrixType> eig(mat,computeVectors?ComputeEigenvectors:EigenvaluesOnly);
+  
+  if(eig.info()==NoConvergence)
+  {
+    make_vector(w,*n).setZero();
+    if(computeVectors)
+      matrix(a,*n,*n,*lda).setIdentity();
+    //*info = 1;
+    return 0;
+  }
+  
+  make_vector(w,*n) = eig.eigenvalues();
+  if(computeVectors)
+    matrix(a,*n,*n,*lda) = eig.eigenvectors();
+  
+  return 0;
+}

diff --git a/lapack/ilaclc.f b/lapack/ilaclc.f
new file mode 100644
index 0000000..4ceb61c
--- /dev/null
+++ b/lapack/ilaclc.f

@@ -0,0 +1,118 @@
+*> \brief \b ILACLC
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILACLC + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ilaclc.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ilaclc.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ilaclc.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILACLC( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILACLC scans A for its last non-zero column.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILACLC( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX          ZERO
+      PARAMETER ( ZERO = (0.0E+0, 0.0E+0) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( N.EQ.0 ) THEN
+         ILACLC = N
+      ELSE IF( A(1, N).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILACLC = N
+      ELSE
+*     Now scan each column from the end, returning with the first non-zero.
+         DO ILACLC = N, 1, -1
+            DO I = 1, M
+               IF( A(I, ILACLC).NE.ZERO ) RETURN
+            END DO
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/ilaclr.f b/lapack/ilaclr.f
new file mode 100644
index 0000000..d8ab09c
--- /dev/null
+++ b/lapack/ilaclr.f

@@ -0,0 +1,121 @@
+*> \brief \b ILACLR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILACLR + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ilaclr.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ilaclr.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ilaclr.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILACLR( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILACLR scans A for its last non-zero row.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILACLR( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX          ZERO
+      PARAMETER ( ZERO = (0.0E+0, 0.0E+0) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( M.EQ.0 ) THEN
+         ILACLR = M
+      ELSE IF( A(M, 1).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILACLR = M
+      ELSE
+*     Scan up each column tracking the last zero row seen.
+         ILACLR = 0
+         DO J = 1, N
+            I=M
+            DO WHILE((A(MAX(I,1),J).EQ.ZERO).AND.(I.GE.1))
+               I=I-1
+            ENDDO
+            ILACLR = MAX( ILACLR, I )
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/iladlc.f b/lapack/iladlc.f
new file mode 100644
index 0000000..f84bd83
--- /dev/null
+++ b/lapack/iladlc.f

@@ -0,0 +1,118 @@
+*> \brief \b ILADLC
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILADLC + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/iladlc.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/iladlc.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/iladlc.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILADLC( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILADLC scans A for its last non-zero column.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILADLC( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION ZERO
+      PARAMETER ( ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( N.EQ.0 ) THEN
+         ILADLC = N
+      ELSE IF( A(1, N).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILADLC = N
+      ELSE
+*     Now scan each column from the end, returning with the first non-zero.
+         DO ILADLC = N, 1, -1
+            DO I = 1, M
+               IF( A(I, ILADLC).NE.ZERO ) RETURN
+            END DO
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/iladlr.f b/lapack/iladlr.f
new file mode 100644
index 0000000..2114c61
--- /dev/null
+++ b/lapack/iladlr.f

@@ -0,0 +1,121 @@
+*> \brief \b ILADLR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILADLR + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/iladlr.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/iladlr.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/iladlr.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILADLR( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILADLR scans A for its last non-zero row.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILADLR( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION ZERO
+      PARAMETER ( ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( M.EQ.0 ) THEN
+         ILADLR = M
+      ELSE IF( A(M, 1).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILADLR = M
+      ELSE
+*     Scan up each column tracking the last zero row seen.
+         ILADLR = 0
+         DO J = 1, N
+            I=M
+            DO WHILE((A(MAX(I,1),J).EQ.ZERO).AND.(I.GE.1))
+               I=I-1
+            ENDDO
+            ILADLR = MAX( ILADLR, I )
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/ilaslc.f b/lapack/ilaslc.f
new file mode 100644
index 0000000..e3db0f4
--- /dev/null
+++ b/lapack/ilaslc.f

@@ -0,0 +1,118 @@
+*> \brief \b ILASLC
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILASLC + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ilaslc.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ilaslc.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ilaslc.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILASLC( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILASLC scans A for its last non-zero column.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup realOTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILASLC( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL             ZERO
+      PARAMETER ( ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( N.EQ.0 ) THEN
+         ILASLC = N
+      ELSE IF( A(1, N).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILASLC = N
+      ELSE
+*     Now scan each column from the end, returning with the first non-zero.
+         DO ILASLC = N, 1, -1
+            DO I = 1, M
+               IF( A(I, ILASLC).NE.ZERO ) RETURN
+            END DO
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/ilaslr.f b/lapack/ilaslr.f
new file mode 100644
index 0000000..48b73f4
--- /dev/null
+++ b/lapack/ilaslr.f

@@ -0,0 +1,121 @@
+*> \brief \b ILASLR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILASLR + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ilaslr.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ilaslr.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ilaslr.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILASLR( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILASLR scans A for its last non-zero row.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup realOTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILASLR( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL             ZERO
+      PARAMETER ( ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( M.EQ.0 ) THEN
+         ILASLR = M
+      ELSEIF( A(M, 1).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILASLR = M
+      ELSE
+*     Scan up each column tracking the last zero row seen.
+         ILASLR = 0
+         DO J = 1, N
+            I=M
+            DO WHILE((A(MAX(I,1),J).EQ.ZERO).AND.(I.GE.1))
+               I=I-1
+            ENDDO
+            ILASLR = MAX( ILASLR, I )
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/ilazlc.f b/lapack/ilazlc.f
new file mode 100644
index 0000000..15b1490
--- /dev/null
+++ b/lapack/ilazlc.f

@@ -0,0 +1,118 @@
+*> \brief \b ILAZLC
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILAZLC + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ilazlc.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ilazlc.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ilazlc.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILAZLC( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILAZLC scans A for its last non-zero column.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILAZLC( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16       ZERO
+      PARAMETER ( ZERO = (0.0D+0, 0.0D+0) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( N.EQ.0 ) THEN
+         ILAZLC = N
+      ELSE IF( A(1, N).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILAZLC = N
+      ELSE
+*     Now scan each column from the end, returning with the first non-zero.
+         DO ILAZLC = N, 1, -1
+            DO I = 1, M
+               IF( A(I, ILAZLC).NE.ZERO ) RETURN
+            END DO
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/ilazlr.f b/lapack/ilazlr.f
new file mode 100644
index 0000000..b2ab943
--- /dev/null
+++ b/lapack/ilazlr.f

@@ -0,0 +1,121 @@
+*> \brief \b ILAZLR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ILAZLR + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ilazlr.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ilazlr.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ilazlr.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       INTEGER FUNCTION ILAZLR( M, N, A, LDA )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            M, N, LDA
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ILAZLR scans A for its last non-zero row.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          The m by n matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      INTEGER FUNCTION ILAZLR( M, N, A, LDA )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, LDA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16       ZERO
+      PARAMETER ( ZERO = (0.0D+0, 0.0D+0) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick test for the common case where one corner is non-zero.
+      IF( M.EQ.0 ) THEN
+         ILAZLR = M
+      ELSE IF( A(M, 1).NE.ZERO .OR. A(M, N).NE.ZERO ) THEN
+         ILAZLR = M
+      ELSE
+*     Scan up each column tracking the last zero row seen.
+         ILAZLR = 0
+         DO J = 1, N
+            I=M
+            DO WHILE((A(MAX(I,1),J).EQ.ZERO).AND.(I.GE.1))
+               I=I-1
+            ENDDO
+            ILAZLR = MAX( ILAZLR, I )
+         END DO
+      END IF
+      RETURN
+      END

diff --git a/lapack/lapack_common.h b/lapack/lapack_common.h
new file mode 100644
index 0000000..c872a81
--- /dev/null
+++ b/lapack/lapack_common.h

@@ -0,0 +1,29 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LAPACK_COMMON_H
+#define EIGEN_LAPACK_COMMON_H
+
+#include "../blas/common.h"
+#include "../Eigen/src/misc/lapack.h"
+
+#define EIGEN_LAPACK_FUNC(FUNC,ARGLIST)               \
+  extern "C" { int EIGEN_BLAS_FUNC(FUNC) ARGLIST; }   \
+  int EIGEN_BLAS_FUNC(FUNC) ARGLIST
+
+typedef Eigen::Map<Eigen::Transpositions<Eigen::Dynamic,Eigen::Dynamic,int> > PivotsType;
+
+#if ISCOMPLEX
+#define EIGEN_LAPACK_ARG_IF_COMPLEX(X) X,
+#else
+#define EIGEN_LAPACK_ARG_IF_COMPLEX(X)
+#endif
+
+
+#endif // EIGEN_LAPACK_COMMON_H

diff --git a/lapack/lu.cpp b/lapack/lu.cpp
new file mode 100644
index 0000000..90cebe0
--- /dev/null
+++ b/lapack/lu.cpp

@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "common.h"
+#include <Eigen/LU>
+
+// computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges
+EIGEN_LAPACK_FUNC(getrf,(int *m, int *n, RealScalar *pa, int *lda, int *ipiv, int *info))
+{
+  *info = 0;
+        if(*m<0)                  *info = -1;
+  else  if(*n<0)                  *info = -2;
+  else  if(*lda<std::max(1,*m))   *info = -4;
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"GETRF", &e, 6);
+  }
+
+  if(*m==0 || *n==0)
+    return 0;
+
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  int nb_transpositions;
+  int ret = int(Eigen::internal::partial_lu_impl<Scalar,ColMajor,int>
+                     ::blocked_lu(*m, *n, a, *lda, ipiv, nb_transpositions));
+
+  for(int i=0; i<std::min(*m,*n); ++i)
+    ipiv[i]++;
+
+  if(ret>=0)
+    *info = ret+1;
+
+  return 0;
+}
+
+//GETRS solves a system of linear equations
+//    A * X = B  or  A' * X = B
+//  with a general N-by-N matrix A using the LU factorization computed  by GETRF
+EIGEN_LAPACK_FUNC(getrs,(char *trans, int *n, int *nrhs, RealScalar *pa, int *lda, int *ipiv, RealScalar *pb, int *ldb, int *info))
+{
+  *info = 0;
+        if(OP(*trans)==INVALID)  *info = -1;
+  else  if(*n<0)                 *info = -2;
+  else  if(*nrhs<0)              *info = -3;
+  else  if(*lda<std::max(1,*n))  *info = -5;
+  else  if(*ldb<std::max(1,*n))  *info = -8;
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"GETRS", &e, 6);
+  }
+
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar* b = reinterpret_cast<Scalar*>(pb);
+  MatrixType lu(a,*n,*n,*lda);
+  MatrixType B(b,*n,*nrhs,*ldb);
+
+  for(int i=0; i<*n; ++i)
+    ipiv[i]--;
+  if(OP(*trans)==NOTR)
+  {
+    B = PivotsType(ipiv,*n) * B;
+    lu.triangularView<UnitLower>().solveInPlace(B);
+    lu.triangularView<Upper>().solveInPlace(B);
+  }
+  else if(OP(*trans)==TR)
+  {
+    lu.triangularView<Upper>().transpose().solveInPlace(B);
+    lu.triangularView<UnitLower>().transpose().solveInPlace(B);
+    B = PivotsType(ipiv,*n).transpose() * B;
+  }
+  else if(OP(*trans)==ADJ)
+  {
+    lu.triangularView<Upper>().adjoint().solveInPlace(B);
+    lu.triangularView<UnitLower>().adjoint().solveInPlace(B);
+    B = PivotsType(ipiv,*n).transpose() * B;
+  }
+  for(int i=0; i<*n; ++i)
+    ipiv[i]++;
+
+  return 0;
+}

diff --git a/lapack/second_NONE.f b/lapack/second_NONE.f
new file mode 100644
index 0000000..d3e6d33
--- /dev/null
+++ b/lapack/second_NONE.f

@@ -0,0 +1,52 @@
+*> \brief \b SECOND returns nothing
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*      REAL FUNCTION SECOND( )
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>  SECOND returns nothing instead of returning the user time for a process in seconds.
+*>  If you are using that routine, it means that neither EXTERNAL ETIME,
+*>  EXTERNAL ETIME_, INTERNAL ETIME, INTERNAL CPU_TIME is available  on
+*>  your machine.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      REAL FUNCTION SECOND( )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+* =====================================================================
+*
+      SECOND = 0.0E+0
+      RETURN
+*
+*     End of SECOND
+*
+      END

diff --git a/lapack/single.cpp b/lapack/single.cpp
new file mode 100644
index 0000000..c7da3ef
--- /dev/null
+++ b/lapack/single.cpp

@@ -0,0 +1,18 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define SCALAR        float
+#define SCALAR_SUFFIX s
+#define SCALAR_SUFFIX_UP "S"
+#define ISCOMPLEX     0
+
+#include "cholesky.cpp"
+#include "lu.cpp"
+#include "eigenvalues.cpp"
+#include "svd.cpp"

diff --git a/lapack/sladiv.f b/lapack/sladiv.f
new file mode 100644
index 0000000..da3afa3
--- /dev/null
+++ b/lapack/sladiv.f

@@ -0,0 +1,128 @@
+*> \brief \b SLADIV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLADIV + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sladiv.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sladiv.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sladiv.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLADIV( A, B, C, D, P, Q )
+* 
+*       .. Scalar Arguments ..
+*       REAL               A, B, C, D, P, Q
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLADIV performs complex division in  real arithmetic
+*>
+*>                       a + i*b
+*>            p + i*q = ---------
+*>                       c + i*d
+*>
+*> The algorithm is due to Robert L. Smith and can be found
+*> in D. Knuth, The art of Computer Programming, Vol.2, p.195
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] A
+*> \verbatim
+*>          A is REAL
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is REAL
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*>          C is REAL
+*> \endverbatim
+*>
+*> \param[in] D
+*> \verbatim
+*>          D is REAL
+*>          The scalars a, b, c, and d in the above expression.
+*> \endverbatim
+*>
+*> \param[out] P
+*> \verbatim
+*>          P is REAL
+*> \endverbatim
+*>
+*> \param[out] Q
+*> \verbatim
+*>          Q is REAL
+*>          The scalars p and q in the above expression.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE SLADIV( A, B, C, D, P, Q )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      REAL               A, B, C, D, P, Q
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      REAL               E, F
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS
+*     ..
+*     .. Executable Statements ..
+*
+      IF( ABS( D ).LT.ABS( C ) ) THEN
+         E = D / C
+         F = C + D*E
+         P = ( A+B*E ) / F
+         Q = ( B-A*E ) / F
+      ELSE
+         E = C / D
+         F = D + C*E
+         P = ( B+A*E ) / F
+         Q = ( -A+B*E ) / F
+      END IF
+*
+      RETURN
+*
+*     End of SLADIV
+*
+      END

diff --git a/lapack/slamch.f b/lapack/slamch.f
new file mode 100644
index 0000000..4bffad0
--- /dev/null
+++ b/lapack/slamch.f

@@ -0,0 +1,192 @@
+*> \brief \b SLAMCH
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*  Definition:
+*  ===========
+*
+*      REAL             FUNCTION SLAMCH( CMACH )
+*
+*     .. Scalar Arguments ..
+*      CHARACTER          CMACH
+*     ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAMCH determines single precision machine parameters.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] CMACH
+*> \verbatim
+*>          Specifies the value to be returned by SLAMCH:
+*>          = 'E' or 'e',   SLAMCH := eps
+*>          = 'S' or 's ,   SLAMCH := sfmin
+*>          = 'B' or 'b',   SLAMCH := base
+*>          = 'P' or 'p',   SLAMCH := eps*base
+*>          = 'N' or 'n',   SLAMCH := t
+*>          = 'R' or 'r',   SLAMCH := rnd
+*>          = 'M' or 'm',   SLAMCH := emin
+*>          = 'U' or 'u',   SLAMCH := rmin
+*>          = 'L' or 'l',   SLAMCH := emax
+*>          = 'O' or 'o',   SLAMCH := rmax
+*>          where
+*>          eps   = relative machine precision
+*>          sfmin = safe minimum, such that 1/sfmin does not overflow
+*>          base  = base of the machine
+*>          prec  = eps*base
+*>          t     = number of (base) digits in the mantissa
+*>          rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise
+*>          emin  = minimum exponent before (gradual) underflow
+*>          rmin  = underflow threshold - base**(emin-1)
+*>          emax  = largest exponent before overflow
+*>          rmax  = overflow threshold  - (base**emax)*(1-eps)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      REAL             FUNCTION SLAMCH( CMACH )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          CMACH
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               RND, EPS, SFMIN, SMALL, RMACH
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DIGITS, EPSILON, HUGE, MAXEXPONENT,
+     $                   MINEXPONENT, RADIX, TINY
+*     ..
+*     .. Executable Statements ..
+*
+*
+*     Assume rounding, not chopping. Always.
+*
+      RND = ONE
+*
+      IF( ONE.EQ.RND ) THEN
+         EPS = EPSILON(ZERO) * 0.5
+      ELSE
+         EPS = EPSILON(ZERO)
+      END IF
+*
+      IF( LSAME( CMACH, 'E' ) ) THEN
+         RMACH = EPS
+      ELSE IF( LSAME( CMACH, 'S' ) ) THEN
+         SFMIN = TINY(ZERO)
+         SMALL = ONE / HUGE(ZERO)
+         IF( SMALL.GE.SFMIN ) THEN
+*
+*           Use SMALL plus a bit, to avoid the possibility of rounding
+*           causing overflow when computing  1/sfmin.
+*
+            SFMIN = SMALL*( ONE+EPS )
+         END IF
+         RMACH = SFMIN
+      ELSE IF( LSAME( CMACH, 'B' ) ) THEN
+         RMACH = RADIX(ZERO)
+      ELSE IF( LSAME( CMACH, 'P' ) ) THEN
+         RMACH = EPS * RADIX(ZERO)
+      ELSE IF( LSAME( CMACH, 'N' ) ) THEN
+         RMACH = DIGITS(ZERO)
+      ELSE IF( LSAME( CMACH, 'R' ) ) THEN
+         RMACH = RND
+      ELSE IF( LSAME( CMACH, 'M' ) ) THEN
+         RMACH = MINEXPONENT(ZERO)
+      ELSE IF( LSAME( CMACH, 'U' ) ) THEN
+         RMACH = tiny(zero)
+      ELSE IF( LSAME( CMACH, 'L' ) ) THEN
+         RMACH = MAXEXPONENT(ZERO)
+      ELSE IF( LSAME( CMACH, 'O' ) ) THEN
+         RMACH = HUGE(ZERO)
+      ELSE
+         RMACH = ZERO
+      END IF
+*
+      SLAMCH = RMACH
+      RETURN
+*
+*     End of SLAMCH
+*
+      END
+************************************************************************
+*> \brief \b SLAMC3
+*> \details
+*> \b Purpose:
+*> \verbatim
+*> SLAMC3  is intended to force  A  and  B  to be stored prior to doing
+*> the addition of  A  and  B ,  for use in situations where optimizers
+*> might hold one of these in a register.
+*> \endverbatim
+*> \author LAPACK is a software package provided by Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+*> \date November 2011
+*> \ingroup auxOTHERauxiliary
+*>
+*> \param[in] A
+*> \verbatim
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          The values A and B.
+*> \endverbatim
+*>
+*
+      REAL             FUNCTION SLAMC3( A, B )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+*     November 2010
+*
+*     .. Scalar Arguments ..
+      REAL               A, B
+*     ..
+* =====================================================================
+*
+*     .. Executable Statements ..
+*
+      SLAMC3 = A + B
+*
+      RETURN
+*
+*     End of SLAMC3
+*
+      END
+*
+************************************************************************

diff --git a/lapack/slapy2.f b/lapack/slapy2.f
new file mode 100644
index 0000000..1f6b1ca
--- /dev/null
+++ b/lapack/slapy2.f

@@ -0,0 +1,104 @@
+*> \brief \b SLAPY2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLAPY2 + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slapy2.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slapy2.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slapy2.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       REAL             FUNCTION SLAPY2( X, Y )
+* 
+*       .. Scalar Arguments ..
+*       REAL               X, Y
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAPY2 returns sqrt(x**2+y**2), taking care not to cause unnecessary
+*> overflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] X
+*> \verbatim
+*>          X is REAL
+*> \endverbatim
+*>
+*> \param[in] Y
+*> \verbatim
+*>          Y is REAL
+*>          X and Y specify the values x and y.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      REAL             FUNCTION SLAPY2( X, Y )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0E0 )
+      REAL               ONE
+      PARAMETER          ( ONE = 1.0E0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               W, XABS, YABS, Z
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+      XABS = ABS( X )
+      YABS = ABS( Y )
+      W = MAX( XABS, YABS )
+      Z = MIN( XABS, YABS )
+      IF( Z.EQ.ZERO ) THEN
+         SLAPY2 = W
+      ELSE
+         SLAPY2 = W*SQRT( ONE+( Z / W )**2 )
+      END IF
+      RETURN
+*
+*     End of SLAPY2
+*
+      END

diff --git a/lapack/slapy3.f b/lapack/slapy3.f
new file mode 100644
index 0000000..aa2f5bf
--- /dev/null
+++ b/lapack/slapy3.f

@@ -0,0 +1,111 @@
+*> \brief \b SLAPY3
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLAPY3 + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slapy3.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slapy3.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slapy3.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       REAL             FUNCTION SLAPY3( X, Y, Z )
+* 
+*       .. Scalar Arguments ..
+*       REAL               X, Y, Z
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAPY3 returns sqrt(x**2+y**2+z**2), taking care not to cause
+*> unnecessary overflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] X
+*> \verbatim
+*>          X is REAL
+*> \endverbatim
+*>
+*> \param[in] Y
+*> \verbatim
+*>          Y is REAL
+*> \endverbatim
+*>
+*> \param[in] Z
+*> \verbatim
+*>          Z is REAL
+*>          X, Y and Z specify the values x, y and z.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup auxOTHERauxiliary
+*
+*  =====================================================================
+      REAL             FUNCTION SLAPY3( X, Y, Z )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y, Z
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0E0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               W, XABS, YABS, ZABS
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+      XABS = ABS( X )
+      YABS = ABS( Y )
+      ZABS = ABS( Z )
+      W = MAX( XABS, YABS, ZABS )
+      IF( W.EQ.ZERO ) THEN
+*     W can be zero for max(0,nan,0)
+*     adding all three entries together will make sure
+*     NaN will not disappear.
+         SLAPY3 =  XABS + YABS + ZABS
+      ELSE
+         SLAPY3 = W*SQRT( ( XABS / W )**2+( YABS / W )**2+
+     $            ( ZABS / W )**2 )
+      END IF
+      RETURN
+*
+*     End of SLAPY3
+*
+      END

diff --git a/lapack/slarf.f b/lapack/slarf.f
new file mode 100644
index 0000000..8a8ff30
--- /dev/null
+++ b/lapack/slarf.f

@@ -0,0 +1,227 @@
+*> \brief \b SLARF
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLARF + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slarf.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slarf.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slarf.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          SIDE
+*       INTEGER            INCV, LDC, M, N
+*       REAL               TAU
+*       ..
+*       .. Array Arguments ..
+*       REAL               C( LDC, * ), V( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARF applies a real elementary reflector H to a real m by n matrix
+*> C, from either the left or the right. H is represented in the form
+*>
+*>       H = I - tau * v * v**T
+*>
+*> where tau is a real scalar and v is a real vector.
+*>
+*> If tau = 0, then H is taken to be the unit matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': form  H * C
+*>          = 'R': form  C * H
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>                     (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+*>                  or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+*>          The vector v in the representation of H. V is not used if
+*>          TAU = 0.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between elements of v. INCV <> 0.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is REAL
+*>          The value tau in the representation of H.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+*>          or C * H if SIDE = 'R'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension
+*>                         (N) if SIDE = 'L'
+*>                      or (M) if SIDE = 'R'
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup realOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE SLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          SIDE
+      INTEGER            INCV, LDC, M, N
+      REAL               TAU
+*     ..
+*     .. Array Arguments ..
+      REAL               C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            APPLYLEFT
+      INTEGER            I, LASTV, LASTC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMV, SGER
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILASLR, ILASLC
+      EXTERNAL           LSAME, ILASLR, ILASLC
+*     ..
+*     .. Executable Statements ..
+*
+      APPLYLEFT = LSAME( SIDE, 'L' )
+      LASTV = 0
+      LASTC = 0
+      IF( TAU.NE.ZERO ) THEN
+!     Set up variables for scanning V.  LASTV begins pointing to the end
+!     of V.
+         IF( APPLYLEFT ) THEN
+            LASTV = M
+         ELSE
+            LASTV = N
+         END IF
+         IF( INCV.GT.0 ) THEN
+            I = 1 + (LASTV-1) * INCV
+         ELSE
+            I = 1
+         END IF
+!     Look for the last non-zero row in V.
+         DO WHILE( LASTV.GT.0 .AND. V( I ).EQ.ZERO )
+            LASTV = LASTV - 1
+            I = I - INCV
+         END DO
+         IF( APPLYLEFT ) THEN
+!     Scan for the last non-zero column in C(1:lastv,:).
+            LASTC = ILASLC(LASTV, N, C, LDC)
+         ELSE
+!     Scan for the last non-zero row in C(:,1:lastv).
+            LASTC = ILASLR(M, LASTV, C, LDC)
+         END IF
+      END IF
+!     Note that lastc.eq.0 renders the BLAS operations null; no special
+!     case is needed at this level.
+      IF( APPLYLEFT ) THEN
+*
+*        Form  H * C
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastv,1:lastc)**T * v(1:lastv,1)
+*
+            CALL SGEMV( 'Transpose', LASTV, LASTC, ONE, C, LDC, V, INCV,
+     $           ZERO, WORK, 1 )
+*
+*           C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**T
+*
+            CALL SGER( LASTV, LASTC, -TAU, V, INCV, WORK, 1, C, LDC )
+         END IF
+      ELSE
+*
+*        Form  C * H
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1)
+*
+            CALL SGEMV( 'No transpose', LASTC, LASTV, ONE, C, LDC,
+     $           V, INCV, ZERO, WORK, 1 )
+*
+*           C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)**T
+*
+            CALL SGER( LASTC, LASTV, -TAU, WORK, 1, V, INCV, C, LDC )
+         END IF
+      END IF
+      RETURN
+*
+*     End of SLARF
+*
+      END

diff --git a/lapack/slarfb.f b/lapack/slarfb.f
new file mode 100644
index 0000000..eb95990
--- /dev/null
+++ b/lapack/slarfb.f

@@ -0,0 +1,763 @@
+*> \brief \b SLARFB
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLARFB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slarfb.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slarfb.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slarfb.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+*                          T, LDT, C, LDC, WORK, LDWORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, SIDE, STOREV, TRANS
+*       INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               C( LDC, * ), T( LDT, * ), V( LDV, * ),
+*      $                   WORK( LDWORK, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARFB applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C, from either the left or the right.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**T from the Left
+*>          = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is REAL array, dimension (LDT,K)
+*>          The triangular k by k matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (LDC,N)
+*>          On entry, the m by n matrix C.
+*>          On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (LDWORK,K)
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK.
+*>          If SIDE = 'L', LDWORK >= max(1,N);
+*>          if SIDE = 'R', LDWORK >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup realOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored; the corresponding
+*>  array elements are modified but restored on exit. The rest of the
+*>  array is not used.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+     $                   T, LDT, C, LDC, WORK, LDWORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, SIDE, STOREV, TRANS
+      INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      REAL               C( LDC, * ), T( LDT, * ), V( LDV, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE
+      PARAMETER          ( ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          TRANST
+      INTEGER            I, J, LASTV, LASTC
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILASLR, ILASLC
+      EXTERNAL           LSAME, ILASLR, ILASLC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SGEMM, STRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LE.0 .OR. N.LE.0 )
+     $   RETURN
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+         TRANST = 'T'
+      ELSE
+         TRANST = 'N'
+      END IF
+*
+      IF( LSAME( STOREV, 'C' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1 )    (first K rows)
+*                     ( V2 )
+*           where  V1  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILASLR( M, K, V, LDV ) )
+               LASTC = ILASLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V  =  (C1**T * V1 + C2**T * V2)  (stored in WORK)
+*
+*              W := C1**T
+*
+               DO 10 J = 1, K
+                  CALL SCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+   10          CONTINUE
+*
+*              W := W * V1
+*
+               CALL STRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**T *V2
+*
+                  CALL SGEMM( 'Transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( K+1, 1 ), LDC, V( K+1, 1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL STRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - V2 * W**T
+*
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K,
+     $                 -ONE, V( K+1, 1 ), LDV, WORK, LDWORK, ONE,
+     $                 C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1**T
+*
+               CALL STRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**T
+*
+               DO 30 J = 1, K
+                  DO 20 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - WORK( I, J )
+   20             CONTINUE
+   30          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILASLR( N, K, V, LDV ) )
+               LASTC = ILASLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 40 J = 1, K
+                  CALL SCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+   40          CONTINUE
+*
+*              W := W * V1
+*
+               CALL STRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2
+*
+                  CALL SGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( 1, K+1 ), LDC, V( K+1, 1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL STRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2**T
+*
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( K+1, 1 ), LDV, ONE,
+     $                 C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1**T
+*
+               CALL STRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 60 J = 1, K
+                  DO 50 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+   50             CONTINUE
+   60          CONTINUE
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1 )
+*                     ( V2 )    (last K rows)
+*           where  V2  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILASLR( M, K, V, LDV ) )
+               LASTC = ILASLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V  =  (C1**T * V1 + C2**T * V2)  (stored in WORK)
+*
+*              W := C2**T
+*
+               DO 70 J = 1, K
+                  CALL SCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+   70          CONTINUE
+*
+*              W := W * V2
+*
+               CALL STRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**T*V1
+*
+                  CALL SGEMM( 'Transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL STRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1 * W**T
+*
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K, -ONE, V, LDV, WORK, LDWORK,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**T
+*
+               CALL STRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**T
+*
+               DO 90 J = 1, K
+                  DO 80 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) - WORK(I, J)
+   80             CONTINUE
+   90          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILASLR( N, K, V, LDV ) )
+               LASTC = ILASLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 100 J = 1, K
+                  CALL SCOPY( LASTC, C( 1, N-K+J ), 1, WORK( 1, J ), 1 )
+  100          CONTINUE
+*
+*              W := W * V2
+*
+               CALL STRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1
+*
+                  CALL SGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL STRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1**T
+*
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**T
+*
+               CALL STRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W
+*
+               DO 120 J = 1, K
+                  DO 110 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J ) - WORK(I, J)
+  110             CONTINUE
+  120          CONTINUE
+            END IF
+         END IF
+*
+      ELSE IF( LSAME( STOREV, 'R' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1  V2 )    (V1: first K columns)
+*           where  V1  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILASLC( K, M, V, LDV ) )
+               LASTC = ILASLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V**T  =  (C1**T * V1**T + C2**T * V2**T) (stored in WORK)
+*
+*              W := C1**T
+*
+               DO 130 J = 1, K
+                  CALL SCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+  130          CONTINUE
+*
+*              W := W * V1**T
+*
+               CALL STRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**T*V2**T
+*
+                  CALL SGEMM( 'Transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( K+1, 1 ), LDC, V( 1, K+1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL STRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**T * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - V2**T * W**T
+*
+                  CALL SGEMM( 'Transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K,
+     $                 -ONE, V( 1, K+1 ), LDV, WORK, LDWORK,
+     $                 ONE, C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL STRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**T
+*
+               DO 150 J = 1, K
+                  DO 140 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - WORK( I, J )
+  140             CONTINUE
+  150          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILASLC( K, N, V, LDV ) )
+               LASTC = ILASLR( M, LASTV, C, LDC )
+*
+*              W := C * V**T  =  (C1*V1**T + C2*V2**T)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 160 J = 1, K
+                  CALL SCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+  160          CONTINUE
+*
+*              W := W * V1**T
+*
+               CALL STRMM( 'Right', 'Upper', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2**T
+*
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( 1, K+1 ), LDC, V( 1, K+1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL STRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2
+*
+                  CALL SGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( 1, K+1 ), LDV,
+     $                 ONE, C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL STRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 180 J = 1, K
+                  DO 170 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+  170             CONTINUE
+  180          CONTINUE
+*
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1  V2 )    (V2: last K columns)
+*           where  V2  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**T * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILASLC( K, M, V, LDV ) )
+               LASTC = ILASLC( LASTV, N, C, LDC )
+*
+*              W := C**T * V**T  =  (C1**T * V1**T + C2**T * V2**T) (stored in WORK)
+*
+*              W := C2**T
+*
+               DO 190 J = 1, K
+                  CALL SCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+  190          CONTINUE
+*
+*              W := W * V2**T
+*
+               CALL STRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**T * V1**T
+*
+                  CALL SGEMM( 'Transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**T  or  W * T
+*
+               CALL STRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**T * W**T
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1**T * W**T
+*
+                  CALL SGEMM( 'Transpose', 'Transpose',
+     $                 LASTV-K, LASTC, K, -ONE, V, LDV, WORK, LDWORK,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL STRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**T
+*
+               DO 210 J = 1, K
+                  DO 200 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) - WORK(I, J)
+  200             CONTINUE
+  210          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**T  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILASLC( K, N, V, LDV ) )
+               LASTC = ILASLR( M, LASTV, C, LDC )
+*
+*              W := C * V**T  =  (C1*V1**T + C2*V2**T)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 220 J = 1, K
+                  CALL SCOPY( LASTC, C( 1, LASTV-K+J ), 1,
+     $                 WORK( 1, J ), 1 )
+  220          CONTINUE
+*
+*              W := W * V2**T
+*
+               CALL STRMM( 'Right', 'Lower', 'Transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1**T
+*
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**T
+*
+               CALL STRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1
+*
+                  CALL SGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL STRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 240 J = 1, K
+                  DO 230 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J )
+     $                    - WORK( I, J )
+  230             CONTINUE
+  240          CONTINUE
+*
+            END IF
+*
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SLARFB
+*
+      END

diff --git a/lapack/slarfg.f b/lapack/slarfg.f
new file mode 100644
index 0000000..4f10ffc
--- /dev/null
+++ b/lapack/slarfg.f

@@ -0,0 +1,196 @@
+*> \brief \b SLARFG
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLARFG + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slarfg.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slarfg.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slarfg.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLARFG( N, ALPHA, X, INCX, TAU )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       REAL               ALPHA, TAU
+*       ..
+*       .. Array Arguments ..
+*       REAL               X( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARFG generates a real elementary reflector H of order n, such
+*> that
+*>
+*>       H * ( alpha ) = ( beta ),   H**T * H = I.
+*>           (   x   )   (   0  )
+*>
+*> where alpha and beta are scalars, and x is an (n-1)-element real
+*> vector. H is represented in the form
+*>
+*>       H = I - tau * ( 1 ) * ( 1 v**T ) ,
+*>                     ( v )
+*>
+*> where tau is a real scalar and v is a real (n-1)-element
+*> vector.
+*>
+*> If the elements of x are all zero, then tau = 0 and H is taken to be
+*> the unit matrix.
+*>
+*> Otherwise  1 <= tau <= 2.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the elementary reflector.
+*> \endverbatim
+*>
+*> \param[in,out] ALPHA
+*> \verbatim
+*>          ALPHA is REAL
+*>          On entry, the value alpha.
+*>          On exit, it is overwritten with the value beta.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is REAL array, dimension
+*>                         (1+(N-2)*abs(INCX))
+*>          On entry, the vector x.
+*>          On exit, it is overwritten with the vector v.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between elements of X. INCX > 0.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL
+*>          The value tau.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup realOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE SLARFG( N, ALPHA, X, INCX, TAU )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      REAL               ALPHA, TAU
+*     ..
+*     .. Array Arguments ..
+      REAL               X( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            J, KNT
+      REAL               BETA, RSAFMN, SAFMIN, XNORM
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, SLAPY2, SNRM2
+      EXTERNAL           SLAMCH, SLAPY2, SNRM2
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, SIGN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SSCAL
+*     ..
+*     .. Executable Statements ..
+*
+      IF( N.LE.1 ) THEN
+         TAU = ZERO
+         RETURN
+      END IF
+*
+      XNORM = SNRM2( N-1, X, INCX )
+*
+      IF( XNORM.EQ.ZERO ) THEN
+*
+*        H  =  I
+*
+         TAU = ZERO
+      ELSE
+*
+*        general case
+*
+         BETA = -SIGN( SLAPY2( ALPHA, XNORM ), ALPHA )
+         SAFMIN = SLAMCH( 'S' ) / SLAMCH( 'E' )
+         KNT = 0
+         IF( ABS( BETA ).LT.SAFMIN ) THEN
+*
+*           XNORM, BETA may be inaccurate; scale X and recompute them
+*
+            RSAFMN = ONE / SAFMIN
+   10       CONTINUE
+            KNT = KNT + 1
+            CALL SSCAL( N-1, RSAFMN, X, INCX )
+            BETA = BETA*RSAFMN
+            ALPHA = ALPHA*RSAFMN
+            IF( ABS( BETA ).LT.SAFMIN )
+     $         GO TO 10
+*
+*           New BETA is at most 1, at least SAFMIN
+*
+            XNORM = SNRM2( N-1, X, INCX )
+            BETA = -SIGN( SLAPY2( ALPHA, XNORM ), ALPHA )
+         END IF
+         TAU = ( BETA-ALPHA ) / BETA
+         CALL SSCAL( N-1, ONE / ( ALPHA-BETA ), X, INCX )
+*
+*        If ALPHA is subnormal, it may lose relative accuracy
+*
+         DO 20 J = 1, KNT
+            BETA = BETA*SAFMIN
+ 20      CONTINUE
+         ALPHA = BETA
+      END IF
+*
+      RETURN
+*
+*     End of SLARFG
+*
+      END

diff --git a/lapack/slarft.f b/lapack/slarft.f
new file mode 100644
index 0000000..30b0668
--- /dev/null
+++ b/lapack/slarft.f

@@ -0,0 +1,326 @@
+*> \brief \b SLARFT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download SLARFT + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slarft.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slarft.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slarft.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, STOREV
+*       INTEGER            K, LDT, LDV, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               T( LDT, * ), TAU( * ), V( LDV, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLARFT forms the triangular factor T of a real block reflector H
+*> of order n, which is defined as a product of k elementary reflectors.
+*>
+*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+*>
+*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+*>
+*> If STOREV = 'C', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th column of the array V, and
+*>
+*>    H  =  I - V * T * V**T
+*>
+*> If STOREV = 'R', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th row of the array V, and
+*>
+*>    H  =  I - V**T * T * V
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Specifies the order in which the elementary reflectors are
+*>          multiplied to form the block reflector:
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Specifies how the vectors which define the elementary
+*>          reflectors are stored (see also Further Details):
+*>          = 'C': columnwise
+*>          = 'R': rowwise
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the block reflector H. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the triangular factor T (= the number of
+*>          elementary reflectors). K >= 1.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>                               (LDV,K) if STOREV = 'C'
+*>                               (LDV,N) if STOREV = 'R'
+*>          The matrix V. See further details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (K)
+*>          TAU(i) must contain the scalar factor of the elementary
+*>          reflector H(i).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is REAL array, dimension (LDT,K)
+*>          The k by k triangular factor T of the block reflector.
+*>          If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+*>          lower triangular. The rest of the array is not used.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup realOTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, STOREV
+      INTEGER            K, LDT, LDV, N
+*     ..
+*     .. Array Arguments ..
+      REAL               T( LDT, * ), TAU( * ), V( LDV, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J, PREVLASTV, LASTV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMV, STRMV
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( LSAME( DIRECT, 'F' ) ) THEN
+         PREVLASTV = N
+         DO I = 1, K
+            PREVLASTV = MAX( I, PREVLASTV )
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = 1, I
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( LSAME( STOREV, 'C' ) ) THEN
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( LASTV, I ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * V( I , J )
+                  END DO   
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**T * V(i:j,i)
+*
+                  CALL SGEMV( 'Transpose', J-I, I-1, -TAU( I ),
+     $                        V( I+1, 1 ), LDV, V( I+1, I ), 1, ONE,
+     $                        T( 1, I ), 1 )
+               ELSE
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( I, LASTV ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * V( J , I )
+                  END DO   
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**T
+*
+                  CALL SGEMV( 'No transpose', I-1, J-I, -TAU( I ),
+     $                        V( 1, I+1 ), LDV, V( I, I+1 ), LDV, 
+     $                        ONE, T( 1, I ), 1 )
+               END IF
+*
+*              T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i)
+*
+               CALL STRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T,
+     $                     LDT, T( 1, I ), 1 )
+               T( I, I ) = TAU( I )
+               IF( I.GT.1 ) THEN
+                  PREVLASTV = MAX( PREVLASTV, LASTV )
+               ELSE
+                  PREVLASTV = LASTV
+               END IF
+            END IF
+         END DO
+      ELSE
+         PREVLASTV = 1
+         DO I = K, 1, -1
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = I, K
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( I.LT.K ) THEN
+                  IF( LSAME( STOREV, 'C' ) ) THEN
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( LASTV, I ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * V( N-K+I , J )
+                     END DO   
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**T * V(j:n-k+i,i)
+*
+                     CALL SGEMV( 'Transpose', N-K+I-J, K-I, -TAU( I ),
+     $                           V( J, I+1 ), LDV, V( J, I ), 1, ONE,
+     $                           T( I+1, I ), 1 )
+                  ELSE
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( I, LASTV ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * V( J, N-K+I )
+                     END DO   
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**T
+*
+                     CALL SGEMV( 'No transpose', K-I, N-K+I-J,
+     $                    -TAU( I ), V( I+1, J ), LDV, V( I, J ), LDV,
+     $                    ONE, T( I+1, I ), 1 )
+                  END IF
+*
+*                 T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i)
+*
+                  CALL STRMV( 'Lower', 'No transpose', 'Non-unit', K-I,
+     $                        T( I+1, I+1 ), LDT, T( I+1, I ), 1 )
+                  IF( I.GT.1 ) THEN
+                     PREVLASTV = MIN( PREVLASTV, LASTV )
+                  ELSE
+                     PREVLASTV = LASTV
+                  END IF
+               END IF
+               T( I, I ) = TAU( I )
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of SLARFT
+*
+      END

diff --git a/lapack/svd.cpp b/lapack/svd.cpp
new file mode 100644
index 0000000..77b302b
--- /dev/null
+++ b/lapack/svd.cpp

@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "lapack_common.h"
+#include <Eigen/SVD>
+
+// computes the singular values/vectors a general M-by-N matrix A using divide-and-conquer
+EIGEN_LAPACK_FUNC(gesdd,(char *jobz, int *m, int* n, Scalar* a, int *lda, RealScalar *s, Scalar *u, int *ldu, Scalar *vt, int *ldvt, Scalar* /*work*/, int* lwork,
+                         EIGEN_LAPACK_ARG_IF_COMPLEX(RealScalar */*rwork*/) int * /*iwork*/, int *info))
+{
+  // TODO exploit the work buffer
+  bool query_size = *lwork==-1;
+  int diag_size = (std::min)(*m,*n);
+  
+  *info = 0;
+        if(*jobz!='A' && *jobz!='S' && *jobz!='O' && *jobz!='N')  *info = -1;
+  else  if(*m<0)                                                  *info = -2;
+  else  if(*n<0)                                                  *info = -3;
+  else  if(*lda<std::max(1,*m))                                   *info = -5;
+  else  if(*lda<std::max(1,*m))                                   *info = -8;
+  else  if(*ldu <1 || (*jobz=='A' && *ldu <*m)
+                   || (*jobz=='O' && *m<*n && *ldu<*m))           *info = -8;
+  else  if(*ldvt<1 || (*jobz=='A' && *ldvt<*n)
+                   || (*jobz=='S' && *ldvt<diag_size)
+                   || (*jobz=='O' && *m>=*n && *ldvt<*n))         *info = -10;
+  
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"GESDD ", &e, 6);
+  }
+  
+  if(query_size)
+  {
+    *lwork = 0;
+    return 0;
+  }
+  
+  if(*n==0 || *m==0)
+    return 0;
+  
+  PlainMatrixType mat(*m,*n);
+  mat = matrix(a,*m,*n,*lda);
+  
+  int option = *jobz=='A' ? ComputeFullU|ComputeFullV
+             : *jobz=='S' ? ComputeThinU|ComputeThinV
+             : *jobz=='O' ? ComputeThinU|ComputeThinV
+             : 0;
+
+  BDCSVD<PlainMatrixType> svd(mat,option);
+  
+  make_vector(s,diag_size) = svd.singularValues().head(diag_size);
+
+  if(*jobz=='A')
+  {
+    matrix(u,*m,*m,*ldu)   = svd.matrixU();
+    matrix(vt,*n,*n,*ldvt) = svd.matrixV().adjoint();
+  }
+  else if(*jobz=='S')
+  {
+    matrix(u,*m,diag_size,*ldu)   = svd.matrixU();
+    matrix(vt,diag_size,*n,*ldvt) = svd.matrixV().adjoint();
+  }
+  else if(*jobz=='O' && *m>=*n)
+  {
+    matrix(a,*m,*n,*lda)   = svd.matrixU();
+    matrix(vt,*n,*n,*ldvt) = svd.matrixV().adjoint();
+  }
+  else if(*jobz=='O')
+  {
+    matrix(u,*m,*m,*ldu)        = svd.matrixU();
+    matrix(a,diag_size,*n,*lda) = svd.matrixV().adjoint();
+  }
+    
+  return 0;
+}
+
+// computes the singular values/vectors a general M-by-N matrix A using two sided jacobi algorithm
+EIGEN_LAPACK_FUNC(gesvd,(char *jobu, char *jobv, int *m, int* n, Scalar* a, int *lda, RealScalar *s, Scalar *u, int *ldu, Scalar *vt, int *ldvt, Scalar* /*work*/, int* lwork,
+                         EIGEN_LAPACK_ARG_IF_COMPLEX(RealScalar */*rwork*/) int *info))
+{
+  // TODO exploit the work buffer
+  bool query_size = *lwork==-1;
+  int diag_size = (std::min)(*m,*n);
+  
+  *info = 0;
+        if( *jobu!='A' && *jobu!='S' && *jobu!='O' && *jobu!='N') *info = -1;
+  else  if((*jobv!='A' && *jobv!='S' && *jobv!='O' && *jobv!='N')
+           || (*jobu=='O' && *jobv=='O'))                         *info = -2;
+  else  if(*m<0)                                                  *info = -3;
+  else  if(*n<0)                                                  *info = -4;
+  else  if(*lda<std::max(1,*m))                                   *info = -6;
+  else  if(*ldu <1 || ((*jobu=='A' || *jobu=='S') && *ldu<*m))    *info = -9;
+  else  if(*ldvt<1 || (*jobv=='A' && *ldvt<*n)
+                   || (*jobv=='S' && *ldvt<diag_size))            *info = -11;
+  
+  if(*info!=0)
+  {
+    int e = -*info;
+    return xerbla_(SCALAR_SUFFIX_UP"GESVD ", &e, 6);
+  }
+  
+  if(query_size)
+  {
+    *lwork = 0;
+    return 0;
+  }
+  
+  if(*n==0 || *m==0)
+    return 0;
+  
+  PlainMatrixType mat(*m,*n);
+  mat = matrix(a,*m,*n,*lda);
+  
+  int option = (*jobu=='A' ? ComputeFullU : *jobu=='S' || *jobu=='O' ? ComputeThinU : 0)
+             | (*jobv=='A' ? ComputeFullV : *jobv=='S' || *jobv=='O' ? ComputeThinV : 0);
+  
+  JacobiSVD<PlainMatrixType> svd(mat,option);
+  
+  make_vector(s,diag_size) = svd.singularValues().head(diag_size);
+  {
+        if(*jobu=='A') matrix(u,*m,*m,*ldu)           = svd.matrixU();
+  else  if(*jobu=='S') matrix(u,*m,diag_size,*ldu)    = svd.matrixU();
+  else  if(*jobu=='O') matrix(a,*m,diag_size,*lda)    = svd.matrixU();
+  }
+  {
+        if(*jobv=='A') matrix(vt,*n,*n,*ldvt)         = svd.matrixV().adjoint();
+  else  if(*jobv=='S') matrix(vt,diag_size,*n,*ldvt)  = svd.matrixV().adjoint();
+  else  if(*jobv=='O') matrix(a,diag_size,*n,*lda)    = svd.matrixV().adjoint();
+  }
+  return 0;
+}

diff --git a/lapack/zlacgv.f b/lapack/zlacgv.f
new file mode 100644
index 0000000..16c2e2e
--- /dev/null
+++ b/lapack/zlacgv.f

@@ -0,0 +1,116 @@
+*> \brief \b ZLACGV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZLACGV + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlacgv.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlacgv.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlacgv.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLACGV( N, X, INCX )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         X( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLACGV conjugates a complex vector of length N.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The length of the vector X.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension
+*>                         (1+(N-1)*abs(INCX))
+*>          On entry, the vector of length N to be conjugated.
+*>          On exit, X is overwritten with conjg(X).
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The spacing between successive elements of X.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE ZLACGV( N, X, INCX )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         X( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Local Scalars ..
+      INTEGER            I, IOFF
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG
+*     ..
+*     .. Executable Statements ..
+*
+      IF( INCX.EQ.1 ) THEN
+         DO 10 I = 1, N
+            X( I ) = DCONJG( X( I ) )
+   10    CONTINUE
+      ELSE
+         IOFF = 1
+         IF( INCX.LT.0 )
+     $      IOFF = 1 - ( N-1 )*INCX
+         DO 20 I = 1, N
+            X( IOFF ) = DCONJG( X( IOFF ) )
+            IOFF = IOFF + INCX
+   20    CONTINUE
+      END IF
+      RETURN
+*
+*     End of ZLACGV
+*
+      END

diff --git a/lapack/zladiv.f b/lapack/zladiv.f
new file mode 100644
index 0000000..aa71db1
--- /dev/null
+++ b/lapack/zladiv.f

@@ -0,0 +1,97 @@
+*> \brief \b ZLADIV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZLADIV + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zladiv.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zladiv.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zladiv.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       COMPLEX*16     FUNCTION ZLADIV( X, Y )
+* 
+*       .. Scalar Arguments ..
+*       COMPLEX*16         X, Y
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLADIV := X / Y, where X and Y are complex.  The computation of X / Y
+*> will not overflow on an intermediary step unless the results
+*> overflows.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] X
+*> \verbatim
+*>          X is COMPLEX*16
+*> \endverbatim
+*>
+*> \param[in] Y
+*> \verbatim
+*>          Y is COMPLEX*16
+*>          The complex scalars X and Y.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      COMPLEX*16     FUNCTION ZLADIV( X, Y )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      COMPLEX*16         X, Y
+*     ..
+*
+*  =====================================================================
+*
+*     .. Local Scalars ..
+      DOUBLE PRECISION   ZI, ZR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLADIV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, DCMPLX, DIMAG
+*     ..
+*     .. Executable Statements ..
+*
+      CALL DLADIV( DBLE( X ), DIMAG( X ), DBLE( Y ), DIMAG( Y ), ZR,
+     $             ZI )
+      ZLADIV = DCMPLX( ZR, ZI )
+*
+      RETURN
+*
+*     End of ZLADIV
+*
+      END

diff --git a/lapack/zlarf.f b/lapack/zlarf.f
new file mode 100644
index 0000000..53f314d
--- /dev/null
+++ b/lapack/zlarf.f

@@ -0,0 +1,232 @@
+*> \brief \b ZLARF
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZLARF + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlarf.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlarf.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlarf.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          SIDE
+*       INTEGER            INCV, LDC, M, N
+*       COMPLEX*16         TAU
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         C( LDC, * ), V( * ), WORK( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARF applies a complex elementary reflector H to a complex M-by-N
+*> matrix C, from either the left or the right. H is represented in the
+*> form
+*>
+*>       H = I - tau * v * v**H
+*>
+*> where tau is a complex scalar and v is a complex vector.
+*>
+*> If tau = 0, then H is taken to be the unit matrix.
+*>
+*> To apply H**H, supply conjg(tau) instead
+*> tau.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': form  H * C
+*>          = 'R': form  C * H
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>                     (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+*>                  or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+*>          The vector v in the representation of H. V is not used if
+*>          TAU = 0.
+*> \endverbatim
+*>
+*> \param[in] INCV
+*> \verbatim
+*>          INCV is INTEGER
+*>          The increment between elements of v. INCV <> 0.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16
+*>          The value tau in the representation of H.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX*16 array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+*>          or C * H if SIDE = 'R'.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension
+*>                         (N) if SIDE = 'L'
+*>                      or (M) if SIDE = 'R'
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE ZLARF( SIDE, M, N, V, INCV, TAU, C, LDC, WORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          SIDE
+      INTEGER            INCV, LDC, M, N
+      COMPLEX*16         TAU
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         C( LDC, * ), V( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE, ZERO
+      PARAMETER          ( ONE = ( 1.0D+0, 0.0D+0 ),
+     $                   ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            APPLYLEFT
+      INTEGER            I, LASTV, LASTC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMV, ZGERC
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAZLR, ILAZLC
+      EXTERNAL           LSAME, ILAZLR, ILAZLC
+*     ..
+*     .. Executable Statements ..
+*
+      APPLYLEFT = LSAME( SIDE, 'L' )
+      LASTV = 0
+      LASTC = 0
+      IF( TAU.NE.ZERO ) THEN
+*     Set up variables for scanning V.  LASTV begins pointing to the end
+*     of V.
+         IF( APPLYLEFT ) THEN
+            LASTV = M
+         ELSE
+            LASTV = N
+         END IF
+         IF( INCV.GT.0 ) THEN
+            I = 1 + (LASTV-1) * INCV
+         ELSE
+            I = 1
+         END IF
+*     Look for the last non-zero row in V.
+         DO WHILE( LASTV.GT.0 .AND. V( I ).EQ.ZERO )
+            LASTV = LASTV - 1
+            I = I - INCV
+         END DO
+         IF( APPLYLEFT ) THEN
+*     Scan for the last non-zero column in C(1:lastv,:).
+            LASTC = ILAZLC(LASTV, N, C, LDC)
+         ELSE
+*     Scan for the last non-zero row in C(:,1:lastv).
+            LASTC = ILAZLR(M, LASTV, C, LDC)
+         END IF
+      END IF
+*     Note that lastc.eq.0 renders the BLAS operations null; no special
+*     case is needed at this level.
+      IF( APPLYLEFT ) THEN
+*
+*        Form  H * C
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastv,1:lastc)**H * v(1:lastv,1)
+*
+            CALL ZGEMV( 'Conjugate transpose', LASTV, LASTC, ONE,
+     $           C, LDC, V, INCV, ZERO, WORK, 1 )
+*
+*           C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**H
+*
+            CALL ZGERC( LASTV, LASTC, -TAU, V, INCV, WORK, 1, C, LDC )
+         END IF
+      ELSE
+*
+*        Form  C * H
+*
+         IF( LASTV.GT.0 ) THEN
+*
+*           w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1)
+*
+            CALL ZGEMV( 'No transpose', LASTC, LASTV, ONE, C, LDC,
+     $           V, INCV, ZERO, WORK, 1 )
+*
+*           C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)**H
+*
+            CALL ZGERC( LASTC, LASTV, -TAU, WORK, 1, V, INCV, C, LDC )
+         END IF
+      END IF
+      RETURN
+*
+*     End of ZLARF
+*
+      END

diff --git a/lapack/zlarfb.f b/lapack/zlarfb.f
new file mode 100644
index 0000000..30fc4b9
--- /dev/null
+++ b/lapack/zlarfb.f

@@ -0,0 +1,774 @@
+*> \brief \b ZLARFB
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZLARFB + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlarfb.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlarfb.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlarfb.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+*                          T, LDT, C, LDC, WORK, LDWORK )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, SIDE, STOREV, TRANS
+*       INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         C( LDC, * ), T( LDT, * ), V( LDV, * ),
+*      $                   WORK( LDWORK, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFB applies a complex block reflector H or its transpose H**H to a
+*> complex M-by-N matrix C, from either the left or the right.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          = 'L': apply H or H**H from the Left
+*>          = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          = 'N': apply H (No transpose)
+*>          = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Indicates how H is formed from a product of elementary
+*>          reflectors
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Indicates how the vectors which define the elementary
+*>          reflectors are stored:
+*>          = 'C': Columnwise
+*>          = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the matrix T (= the number of elementary
+*>          reflectors whose product defines the block reflector).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>                                (LDV,K) if STOREV = 'C'
+*>                                (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*>                                (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*>          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*>          if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX*16 array, dimension (LDT,K)
+*>          The triangular K-by-K matrix T in the representation of the
+*>          block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX*16 array, dimension (LDC,N)
+*>          On entry, the M-by-N matrix C.
+*>          On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>          The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (LDWORK,K)
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK.
+*>          If SIDE = 'L', LDWORK >= max(1,N);
+*>          if SIDE = 'R', LDWORK >= max(1,M).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored; the corresponding
+*>  array elements are modified but restored on exit. The rest of the
+*>  array is not used.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, LDV,
+     $                   T, LDT, C, LDC, WORK, LDWORK )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, SIDE, STOREV, TRANS
+      INTEGER            K, LDC, LDT, LDV, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         C( LDC, * ), T( LDT, * ), V( LDV, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER          ( ONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          TRANST
+      INTEGER            I, J, LASTV, LASTC
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAZLR, ILAZLC
+      EXTERNAL           LSAME, ILAZLR, ILAZLC
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZCOPY, ZGEMM, ZLACGV, ZTRMM
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LE.0 .OR. N.LE.0 )
+     $   RETURN
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+         TRANST = 'C'
+      ELSE
+         TRANST = 'N'
+      END IF
+*
+      IF( LSAME( STOREV, 'C' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1 )    (first K rows)
+*                     ( V2 )
+*           where  V1  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILAZLR( M, K, V, LDV ) )
+               LASTC = ILAZLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V  =  (C1**H * V1 + C2**H * V2)  (stored in WORK)
+*
+*              W := C1**H
+*
+               DO 10 J = 1, K
+                  CALL ZCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+                  CALL ZLACGV( LASTC, WORK( 1, J ), 1 )
+   10          CONTINUE
+*
+*              W := W * V1
+*
+               CALL ZTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**H *V2
+*
+                  CALL ZGEMM( 'Conjugate transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K, ONE, C( K+1, 1 ), LDC,
+     $                 V( K+1, 1 ), LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL ZTRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**H
+*
+               IF( M.GT.K ) THEN
+*
+*                 C2 := C2 - V2 * W**H
+*
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTV-K, LASTC, K,
+     $                 -ONE, V( K+1, 1 ), LDV, WORK, LDWORK,
+     $                 ONE, C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1**H
+*
+               CALL ZTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**H
+*
+               DO 30 J = 1, K
+                  DO 20 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - DCONJG( WORK( I, J ) )
+   20             CONTINUE
+   30          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILAZLR( N, K, V, LDV ) )
+               LASTC = ILAZLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 40 J = 1, K
+                  CALL ZCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+   40          CONTINUE
+*
+*              W := W * V1
+*
+               CALL ZTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2
+*
+                  CALL ZGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C( 1, K+1 ), LDC, V( K+1, 1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL ZTRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2**H
+*
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( K+1, 1 ), LDV,
+     $                 ONE, C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1**H
+*
+               CALL ZTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 60 J = 1, K
+                  DO 50 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+   50             CONTINUE
+   60          CONTINUE
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1 )
+*                     ( V2 )    (last K rows)
+*           where  V2  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILAZLR( M, K, V, LDV ) )
+               LASTC = ILAZLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V  =  (C1**H * V1 + C2**H * V2)  (stored in WORK)
+*
+*              W := C2**H
+*
+               DO 70 J = 1, K
+                  CALL ZCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+                  CALL ZLACGV( LASTC, WORK( 1, J ), 1 )
+   70          CONTINUE
+*
+*              W := W * V2
+*
+               CALL ZTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**H*V1
+*
+                  CALL ZGEMM( 'Conjugate transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C, LDC, V, LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL ZTRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V * W**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1 * W**H
+*
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTV-K, LASTC, K,
+     $                 -ONE, V, LDV, WORK, LDWORK,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**H
+*
+               CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**H
+*
+               DO 90 J = 1, K
+                  DO 80 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) -
+     $                               DCONJG( WORK( I, J ) )
+   80             CONTINUE
+   90          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILAZLR( N, K, V, LDV ) )
+               LASTC = ILAZLR( M, LASTV, C, LDC )
+*
+*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 100 J = 1, K
+                  CALL ZCOPY( LASTC, C( 1, LASTV-K+J ), 1,
+     $                 WORK( 1, J ), 1 )
+  100          CONTINUE
+*
+*              W := W * V2
+*
+               CALL ZTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1
+*
+                  CALL ZGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, K, LASTV-K,
+     $                 ONE, C, LDC, V, LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL ZTRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1**H
+*
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2**H
+*
+               CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( LASTV-K+1, 1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W
+*
+               DO 120 J = 1, K
+                  DO 110 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J )
+     $                    - WORK( I, J )
+  110             CONTINUE
+  120          CONTINUE
+            END IF
+         END IF
+*
+      ELSE IF( LSAME( STOREV, 'R' ) ) THEN
+*
+         IF( LSAME( DIRECT, 'F' ) ) THEN
+*
+*           Let  V =  ( V1  V2 )    (V1: first K columns)
+*           where  V1  is unit upper triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILAZLC( K, M, V, LDV ) )
+               LASTC = ILAZLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V**H  =  (C1**H * V1**H + C2**H * V2**H) (stored in WORK)
+*
+*              W := C1**H
+*
+               DO 130 J = 1, K
+                  CALL ZCOPY( LASTC, C( J, 1 ), LDC, WORK( 1, J ), 1 )
+                  CALL ZLACGV( LASTC, WORK( 1, J ), 1 )
+  130          CONTINUE
+*
+*              W := W * V1**H
+*
+               CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $                     'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2**H*V2**H
+*
+                  CALL ZGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTC, K, LASTV-K,
+     $                 ONE, C( K+1, 1 ), LDC, V( 1, K+1 ), LDV,
+     $                 ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL ZTRMM( 'Right', 'Upper', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**H * W**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - V2**H * W**H
+*
+                  CALL ZGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTV-K, LASTC, K,
+     $                 -ONE, V( 1, K+1 ), LDV, WORK, LDWORK,
+     $                 ONE, C( K+1, 1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL ZTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W**H
+*
+               DO 150 J = 1, K
+                  DO 140 I = 1, LASTC
+                     C( J, I ) = C( J, I ) - DCONJG( WORK( I, J ) )
+  140             CONTINUE
+  150          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILAZLC( K, N, V, LDV ) )
+               LASTC = ILAZLR( M, LASTV, C, LDC )
+*
+*              W := C * V**H  =  (C1*V1**H + C2*V2**H)  (stored in WORK)
+*
+*              W := C1
+*
+               DO 160 J = 1, K
+                  CALL ZCOPY( LASTC, C( 1, J ), 1, WORK( 1, J ), 1 )
+  160          CONTINUE
+*
+*              W := W * V1**H
+*
+               CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose',
+     $                     'Unit', LASTC, K, ONE, V, LDV, WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C2 * V2**H
+*
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, K, LASTV-K, ONE, C( 1, K+1 ), LDC,
+     $                 V( 1, K+1 ), LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL ZTRMM( 'Right', 'Upper', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C2 := C2 - W * V2
+*
+                  CALL ZGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K,
+     $                 -ONE, WORK, LDWORK, V( 1, K+1 ), LDV,
+     $                 ONE, C( 1, K+1 ), LDC )
+               END IF
+*
+*              W := W * V1
+*
+               CALL ZTRMM( 'Right', 'Upper', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V, LDV, WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 180 J = 1, K
+                  DO 170 I = 1, LASTC
+                     C( I, J ) = C( I, J ) - WORK( I, J )
+  170             CONTINUE
+  180          CONTINUE
+*
+            END IF
+*
+         ELSE
+*
+*           Let  V =  ( V1  V2 )    (V2: last K columns)
+*           where  V2  is unit lower triangular.
+*
+            IF( LSAME( SIDE, 'L' ) ) THEN
+*
+*              Form  H * C  or  H**H * C  where  C = ( C1 )
+*                                                    ( C2 )
+*
+               LASTV = MAX( K, ILAZLC( K, M, V, LDV ) )
+               LASTC = ILAZLC( LASTV, N, C, LDC )
+*
+*              W := C**H * V**H  =  (C1**H * V1**H + C2**H * V2**H) (stored in WORK)
+*
+*              W := C2**H
+*
+               DO 190 J = 1, K
+                  CALL ZCOPY( LASTC, C( LASTV-K+J, 1 ), LDC,
+     $                 WORK( 1, J ), 1 )
+                  CALL ZLACGV( LASTC, WORK( 1, J ), 1 )
+  190          CONTINUE
+*
+*              W := W * V2**H
+*
+               CALL ZTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1**H * V1**H
+*
+                  CALL ZGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTC, K, LASTV-K,
+     $                 ONE, C, LDC, V, LDV, ONE, WORK, LDWORK )
+               END IF
+*
+*              W := W * T**H  or  W * T
+*
+               CALL ZTRMM( 'Right', 'Lower', TRANST, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - V**H * W**H
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - V1**H * W**H
+*
+                  CALL ZGEMM( 'Conjugate transpose',
+     $                 'Conjugate transpose', LASTV-K, LASTC, K,
+     $                 -ONE, V, LDV, WORK, LDWORK, ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL ZTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C2 := C2 - W**H
+*
+               DO 210 J = 1, K
+                  DO 200 I = 1, LASTC
+                     C( LASTV-K+J, I ) = C( LASTV-K+J, I ) -
+     $                               DCONJG( WORK( I, J ) )
+  200             CONTINUE
+  210          CONTINUE
+*
+            ELSE IF( LSAME( SIDE, 'R' ) ) THEN
+*
+*              Form  C * H  or  C * H**H  where  C = ( C1  C2 )
+*
+               LASTV = MAX( K, ILAZLC( K, N, V, LDV ) )
+               LASTC = ILAZLR( M, LASTV, C, LDC )
+*
+*              W := C * V**H  =  (C1*V1**H + C2*V2**H)  (stored in WORK)
+*
+*              W := C2
+*
+               DO 220 J = 1, K
+                  CALL ZCOPY( LASTC, C( 1, LASTV-K+J ), 1,
+     $                 WORK( 1, J ), 1 )
+  220          CONTINUE
+*
+*              W := W * V2**H
+*
+               CALL ZTRMM( 'Right', 'Lower', 'Conjugate transpose',
+     $              'Unit', LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+               IF( LASTV.GT.K ) THEN
+*
+*                 W := W + C1 * V1**H
+*
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                 LASTC, K, LASTV-K, ONE, C, LDC, V, LDV, ONE,
+     $                 WORK, LDWORK )
+               END IF
+*
+*              W := W * T  or  W * T**H
+*
+               CALL ZTRMM( 'Right', 'Lower', TRANS, 'Non-unit',
+     $              LASTC, K, ONE, T, LDT, WORK, LDWORK )
+*
+*              C := C - W * V
+*
+               IF( LASTV.GT.K ) THEN
+*
+*                 C1 := C1 - W * V1
+*
+                  CALL ZGEMM( 'No transpose', 'No transpose',
+     $                 LASTC, LASTV-K, K, -ONE, WORK, LDWORK, V, LDV,
+     $                 ONE, C, LDC )
+               END IF
+*
+*              W := W * V2
+*
+               CALL ZTRMM( 'Right', 'Lower', 'No transpose', 'Unit',
+     $              LASTC, K, ONE, V( 1, LASTV-K+1 ), LDV,
+     $              WORK, LDWORK )
+*
+*              C1 := C1 - W
+*
+               DO 240 J = 1, K
+                  DO 230 I = 1, LASTC
+                     C( I, LASTV-K+J ) = C( I, LASTV-K+J )
+     $                    - WORK( I, J )
+  230             CONTINUE
+  240          CONTINUE
+*
+            END IF
+*
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZLARFB
+*
+      END

diff --git a/lapack/zlarfg.f b/lapack/zlarfg.f
new file mode 100644
index 0000000..a90ae9f
--- /dev/null
+++ b/lapack/zlarfg.f

@@ -0,0 +1,203 @@
+*> \brief \b ZLARFG
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZLARFG + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlarfg.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlarfg.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlarfg.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLARFG( N, ALPHA, X, INCX, TAU )
+* 
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       COMPLEX*16         ALPHA, TAU
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         X( * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFG generates a complex elementary reflector H of order n, such
+*> that
+*>
+*>       H**H * ( alpha ) = ( beta ),   H**H * H = I.
+*>              (   x   )   (   0  )
+*>
+*> where alpha and beta are scalars, with beta real, and x is an
+*> (n-1)-element complex vector. H is represented in the form
+*>
+*>       H = I - tau * ( 1 ) * ( 1 v**H ) ,
+*>                     ( v )
+*>
+*> where tau is a complex scalar and v is a complex (n-1)-element
+*> vector. Note that H is not hermitian.
+*>
+*> If the elements of x are all zero and alpha is real, then tau = 0
+*> and H is taken to be the unit matrix.
+*>
+*> Otherwise  1 <= real(tau) <= 2  and  abs(tau-1) <= 1 .
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the elementary reflector.
+*> \endverbatim
+*>
+*> \param[in,out] ALPHA
+*> \verbatim
+*>          ALPHA is COMPLEX*16
+*>          On entry, the value alpha.
+*>          On exit, it is overwritten with the value beta.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension
+*>                         (1+(N-2)*abs(INCX))
+*>          On entry, the vector x.
+*>          On exit, it is overwritten with the vector v.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between elements of X. INCX > 0.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16
+*>          The value tau.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date November 2011
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE ZLARFG( N, ALPHA, X, INCX, TAU )
+*
+*  -- LAPACK auxiliary routine (version 3.4.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2011
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      COMPLEX*16         ALPHA, TAU
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         X( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            J, KNT
+      DOUBLE PRECISION   ALPHI, ALPHR, BETA, RSAFMN, SAFMIN, XNORM
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, DLAPY3, DZNRM2
+      COMPLEX*16         ZLADIV
+      EXTERNAL           DLAMCH, DLAPY3, DZNRM2, ZLADIV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCMPLX, DIMAG, SIGN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZDSCAL, ZSCAL
+*     ..
+*     .. Executable Statements ..
+*
+      IF( N.LE.0 ) THEN
+         TAU = ZERO
+         RETURN
+      END IF
+*
+      XNORM = DZNRM2( N-1, X, INCX )
+      ALPHR = DBLE( ALPHA )
+      ALPHI = DIMAG( ALPHA )
+*
+      IF( XNORM.EQ.ZERO .AND. ALPHI.EQ.ZERO ) THEN
+*
+*        H  =  I
+*
+         TAU = ZERO
+      ELSE
+*
+*        general case
+*
+         BETA = -SIGN( DLAPY3( ALPHR, ALPHI, XNORM ), ALPHR )
+         SAFMIN = DLAMCH( 'S' ) / DLAMCH( 'E' )
+         RSAFMN = ONE / SAFMIN
+*
+         KNT = 0
+         IF( ABS( BETA ).LT.SAFMIN ) THEN
+*
+*           XNORM, BETA may be inaccurate; scale X and recompute them
+*
+   10       CONTINUE
+            KNT = KNT + 1
+            CALL ZDSCAL( N-1, RSAFMN, X, INCX )
+            BETA = BETA*RSAFMN
+            ALPHI = ALPHI*RSAFMN
+            ALPHR = ALPHR*RSAFMN
+            IF( ABS( BETA ).LT.SAFMIN )
+     $         GO TO 10
+*
+*           New BETA is at most 1, at least SAFMIN
+*
+            XNORM = DZNRM2( N-1, X, INCX )
+            ALPHA = DCMPLX( ALPHR, ALPHI )
+            BETA = -SIGN( DLAPY3( ALPHR, ALPHI, XNORM ), ALPHR )
+         END IF
+         TAU = DCMPLX( ( BETA-ALPHR ) / BETA, -ALPHI / BETA )
+         ALPHA = ZLADIV( DCMPLX( ONE ), ALPHA-BETA )
+         CALL ZSCAL( N-1, ALPHA, X, INCX )
+*
+*        If ALPHA is subnormal, it may lose relative accuracy
+*
+         DO 20 J = 1, KNT
+            BETA = BETA*SAFMIN
+ 20      CONTINUE
+         ALPHA = BETA
+      END IF
+*
+      RETURN
+*
+*     End of ZLARFG
+*
+      END

diff --git a/lapack/zlarft.f b/lapack/zlarft.f
new file mode 100644
index 0000000..6a6151f
--- /dev/null
+++ b/lapack/zlarft.f

@@ -0,0 +1,327 @@
+*> \brief \b ZLARFT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at 
+*            http://www.netlib.org/lapack/explore-html/ 
+*
+*> \htmlonly
+*> Download ZLARFT + dependencies 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlarft.f"> 
+*> [TGZ]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlarft.f"> 
+*> [ZIP]</a> 
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlarft.f"> 
+*> [TXT]</a>
+*> \endhtmlonly 
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+* 
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, STOREV
+*       INTEGER            K, LDT, LDV, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         T( LDT, * ), TAU( * ), V( LDV, * )
+*       ..
+*  
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFT forms the triangular factor T of a complex block reflector H
+*> of order n, which is defined as a product of k elementary reflectors.
+*>
+*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+*>
+*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+*>
+*> If STOREV = 'C', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th column of the array V, and
+*>
+*>    H  =  I - V * T * V**H
+*>
+*> If STOREV = 'R', the vector which defines the elementary reflector
+*> H(i) is stored in the i-th row of the array V, and
+*>
+*>    H  =  I - V**H * T * V
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DIRECT
+*> \verbatim
+*>          DIRECT is CHARACTER*1
+*>          Specifies the order in which the elementary reflectors are
+*>          multiplied to form the block reflector:
+*>          = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*>          = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*>          STOREV is CHARACTER*1
+*>          Specifies how the vectors which define the elementary
+*>          reflectors are stored (see also Further Details):
+*>          = 'C': columnwise
+*>          = 'R': rowwise
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The order of the block reflector H. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The order of the triangular factor T (= the number of
+*>          elementary reflectors). K >= 1.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>                               (LDV,K) if STOREV = 'C'
+*>                               (LDV,N) if STOREV = 'R'
+*>          The matrix V. See further details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (K)
+*>          TAU(i) must contain the scalar factor of the elementary
+*>          reflector H(i).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX*16 array, dimension (LDT,K)
+*>          The k by k triangular factor T of the block reflector.
+*>          If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+*>          lower triangular. The rest of the array is not used.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee 
+*> \author Univ. of California Berkeley 
+*> \author Univ. of Colorado Denver 
+*> \author NAG Ltd. 
+*
+*> \date April 2012
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  The shape of the matrix V and the storage of the vectors which define
+*>  the H(i) is best illustrated by the following example with n = 5 and
+*>  k = 3. The elements equal to 1 are not stored.
+*>
+*>  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R':
+*>
+*>               V = (  1       )                 V = (  1 v1 v1 v1 v1 )
+*>                   ( v1  1    )                     (     1 v2 v2 v2 )
+*>                   ( v1 v2  1 )                     (        1 v3 v3 )
+*>                   ( v1 v2 v3 )
+*>                   ( v1 v2 v3 )
+*>
+*>  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R':
+*>
+*>               V = ( v1 v2 v3 )                 V = ( v1 v1  1       )
+*>                   ( v1 v2 v3 )                     ( v2 v2 v2  1    )
+*>                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
+*>                   (     1 v3 )
+*>                   (        1 )
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+*
+*  -- LAPACK auxiliary routine (version 3.4.1) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
+*
+*     .. Scalar Arguments ..
+      CHARACTER          DIRECT, STOREV
+      INTEGER            K, LDT, LDV, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         T( LDT, * ), TAU( * ), V( LDV, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE, ZERO
+      PARAMETER          ( ONE = ( 1.0D+0, 0.0D+0 ),
+     $                   ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, J, PREVLASTV, LASTV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMV, ZLACGV, ZTRMV
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      IF( LSAME( DIRECT, 'F' ) ) THEN
+         PREVLASTV = N
+         DO I = 1, K
+            PREVLASTV = MAX( PREVLASTV, I )
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = 1, I
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( LSAME( STOREV, 'C' ) ) THEN
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( LASTV, I ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * CONJG( V( I , J ) )
+                  END DO                     
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**H * V(i:j,i)
+*
+                  CALL ZGEMV( 'Conjugate transpose', J-I, I-1,
+     $                        -TAU( I ), V( I+1, 1 ), LDV, 
+     $                        V( I+1, I ), 1, ONE, T( 1, I ), 1 )
+               ELSE
+*                 Skip any trailing zeros.
+                  DO LASTV = N, I+1, -1
+                     IF( V( I, LASTV ).NE.ZERO ) EXIT
+                  END DO
+                  DO J = 1, I-1
+                     T( J, I ) = -TAU( I ) * V( J , I )
+                  END DO                     
+                  J = MIN( LASTV, PREVLASTV )
+*
+*                 T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**H
+*
+                  CALL ZGEMM( 'N', 'C', I-1, 1, J-I, -TAU( I ),
+     $                        V( 1, I+1 ), LDV, V( I, I+1 ), LDV,
+     $                        ONE, T( 1, I ), LDT )                  
+               END IF
+*
+*              T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i)
+*
+               CALL ZTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T,
+     $                     LDT, T( 1, I ), 1 )
+               T( I, I ) = TAU( I )
+               IF( I.GT.1 ) THEN
+                  PREVLASTV = MAX( PREVLASTV, LASTV )
+               ELSE
+                  PREVLASTV = LASTV
+               END IF
+             END IF
+         END DO
+      ELSE
+         PREVLASTV = 1
+         DO I = K, 1, -1
+            IF( TAU( I ).EQ.ZERO ) THEN
+*
+*              H(i)  =  I
+*
+               DO J = I, K
+                  T( J, I ) = ZERO
+               END DO
+            ELSE
+*
+*              general case
+*
+               IF( I.LT.K ) THEN
+                  IF( LSAME( STOREV, 'C' ) ) THEN
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( LASTV, I ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * CONJG( V( N-K+I , J ) )
+                     END DO                        
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**H * V(j:n-k+i,i)
+*
+                     CALL ZGEMV( 'Conjugate transpose', N-K+I-J, K-I,
+     $                           -TAU( I ), V( J, I+1 ), LDV, V( J, I ),
+     $                           1, ONE, T( I+1, I ), 1 )
+                  ELSE
+*                    Skip any leading zeros.
+                     DO LASTV = 1, I-1
+                        IF( V( I, LASTV ).NE.ZERO ) EXIT
+                     END DO
+                     DO J = I+1, K
+                        T( J, I ) = -TAU( I ) * V( J, N-K+I )
+                     END DO                                           
+                     J = MAX( LASTV, PREVLASTV )
+*
+*                    T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**H
+*
+                     CALL ZGEMM( 'N', 'C', K-I, 1, N-K+I-J, -TAU( I ),
+     $                           V( I+1, J ), LDV, V( I, J ), LDV,
+     $                           ONE, T( I+1, I ), LDT )                     
+                  END IF
+*
+*                 T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i)
+*
+                  CALL ZTRMV( 'Lower', 'No transpose', 'Non-unit', K-I,
+     $                        T( I+1, I+1 ), LDT, T( I+1, I ), 1 )
+                  IF( I.GT.1 ) THEN
+                     PREVLASTV = MIN( PREVLASTV, LASTV )
+                  ELSE
+                     PREVLASTV = LASTV
+                  END IF
+               END IF
+               T( I, I ) = TAU( I )
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of ZLARFT
+*
+      END

diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
new file mode 100644
index 0000000..0d9a631
--- /dev/null
+++ b/scripts/CMakeLists.txt

@@ -0,0 +1,6 @@
+get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+configure_file(buildtests.in ${CMAKE_BINARY_DIR}/buildtests.sh @ONLY)
+
+configure_file(check.in ${CMAKE_BINARY_DIR}/check.sh COPYONLY)
+configure_file(debug.in ${CMAKE_BINARY_DIR}/debug.sh COPYONLY)
+configure_file(release.in ${CMAKE_BINARY_DIR}/release.sh COPYONLY)

diff --git a/scripts/buildtests.in b/scripts/buildtests.in
new file mode 100755
index 0000000..ab9c18f
--- /dev/null
+++ b/scripts/buildtests.in

@@ -0,0 +1,22 @@
+#!/bin/bash
+
+if [[ $# != 1 || $1 == *help ]]
+then
+  echo "usage: $0 regexp"
+  echo "  Builds tests matching the regexp."
+  echo "  The EIGEN_MAKE_ARGS environment variable allows to pass args to 'make'."
+  echo "    For example, to launch 5 concurrent builds, use EIGEN_MAKE_ARGS='-j5'"
+  exit 0
+fi
+
+TESTSLIST="@EIGEN_TESTS_LIST@"
+targets_to_make=$(echo "$TESTSLIST" | grep -E "$1" | xargs echo)
+
+if [ -n "${EIGEN_MAKE_ARGS:+x}" ]
+then
+  @CMAKE_MAKE_PROGRAM@ $targets_to_make ${EIGEN_MAKE_ARGS}
+else
+  @CMAKE_MAKE_PROGRAM@ $targets_to_make @EIGEN_TEST_BUILD_FLAGS@
+fi
+exit $?
+

diff --git a/scripts/cdashtesting.cmake.in b/scripts/cdashtesting.cmake.in
new file mode 100644
index 0000000..0bf0fac
--- /dev/null
+++ b/scripts/cdashtesting.cmake.in

@@ -0,0 +1,49 @@
+
+set(CTEST_SOURCE_DIRECTORY  "@CMAKE_SOURCE_DIR@")
+set(CTEST_BINARY_DIRECTORY  "@CMAKE_BINARY_DIR@")
+set(CTEST_CMAKE_GENERATOR   "@CMAKE_GENERATOR@")
+set(CTEST_BUILD_NAME        "@BUILDNAME@")
+set(CTEST_SITE              "@SITE@")
+
+set(MODEL Experimental)
+if(${CTEST_SCRIPT_ARG} MATCHES Nightly)
+  set(MODEL Nightly)
+elseif(${CTEST_SCRIPT_ARG} MATCHES Continuous)
+  set(MODEL Continuous)
+endif()
+
+find_program(CTEST_GIT_COMMAND NAMES git)
+set(CTEST_UPDATE_COMMAND "${CTEST_GIT_COMMAND}")
+
+ctest_start(${MODEL} ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
+
+ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}")
+ctest_submit(PARTS Update Notes)
+
+# to get CTEST_PROJECT_SUBPROJECTS definition:
+include("${CTEST_SOURCE_DIRECTORY}/CTestConfig.cmake")
+
+foreach(subproject ${CTEST_PROJECT_SUBPROJECTS})
+  message("")
+  message("Process ${subproject}")
+  
+  set_property(GLOBAL PROPERTY SubProject ${subproject})
+  set_property(GLOBAL PROPERTY Label ${subproject})
+
+  ctest_configure(BUILD ${CTEST_BINARY_DIRECTORY} SOURCE ${CTEST_SOURCE_DIRECTORY} )
+  ctest_submit(PARTS Configure)
+
+  set(CTEST_BUILD_TARGET "Build${subproject}")
+  message("Build ${CTEST_BUILD_TARGET}")
+  ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
+  # builds target ${CTEST_BUILD_TARGET}
+  ctest_submit(PARTS Build)
+
+  ctest_test(BUILD "${CTEST_BINARY_DIRECTORY}" INCLUDE_LABEL "${subproject}" )
+  # runs only tests that have a LABELS property matching "${subproject}"
+  
+  ctest_coverage(BUILD "${CTEST_BINARY_DIRECTORY}" LABELS "${subproject}" )
+  
+  ctest_submit(PARTS Test)
+  
+endforeach()

diff --git a/scripts/check.in b/scripts/check.in
new file mode 100755
index 0000000..7717e2d
--- /dev/null
+++ b/scripts/check.in

@@ -0,0 +1,21 @@
+#!/bin/bash
+# check : shorthand for make and ctest -R
+
+if [[ $# != 1 || $1 == *help ]]
+then
+  echo "usage: $0 regexp"
+  echo "  Builds and runs tests matching the regexp."
+  echo "  The EIGEN_MAKE_ARGS environment variable allows to pass args to 'make'."
+  echo "    For example, to launch 5 concurrent builds, use EIGEN_MAKE_ARGS='-j5'"
+  echo "  The EIGEN_CTEST_ARGS environment variable allows to pass args to 'ctest'."
+  echo "    For example, with CTest 2.8, you can use EIGEN_CTEST_ARGS='-j5'."
+  exit 0
+fi
+
+if [ -n "${EIGEN_CTEST_ARGS:+x}" ]
+then
+  ./buildtests.sh "$1" && ctest -R "$1" ${EIGEN_CTEST_ARGS}
+else
+  ./buildtests.sh "$1" && ctest -R "$1"
+fi
+exit $?

diff --git a/scripts/debug.in b/scripts/debug.in
new file mode 100755
index 0000000..d339d3d
--- /dev/null
+++ b/scripts/debug.in

@@ -0,0 +1,3 @@
+#!/bin/sh
+
+cmake -DCMAKE_BUILD_TYPE=Debug .

diff --git a/scripts/eigen_gen_credits.cpp b/scripts/eigen_gen_credits.cpp
new file mode 100644
index 0000000..f2e8163
--- /dev/null
+++ b/scripts/eigen_gen_credits.cpp

@@ -0,0 +1,232 @@
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <map>
+#include <list>
+
+using namespace std;
+
+// this function takes a line that may contain a name and/or email address,
+// and returns just the name, while fixing the "bad cases".
+std::string contributor_name(const std::string& line)
+{
+  string result;
+
+  // let's first take care of the case of isolated email addresses, like
+  // "user@localhost.localdomain" entries
+  if(line.find("markb@localhost.localdomain") != string::npos)
+  {
+    return "Mark Borgerding";
+  }
+
+  if(line.find("kayhman@contact.intra.cea.fr") != string::npos)
+  {
+    return "Guillaume Saupin";
+  }
+
+  // from there on we assume that we have a entry of the form
+  // either:
+  //   Bla bli Blurp
+  // or:
+  //   Bla bli Blurp <bblurp@email.com>
+  
+  size_t position_of_email_address = line.find_first_of('<');
+  if(position_of_email_address != string::npos)
+  {
+    // there is an e-mail address in <...>.
+    
+    // Hauke once committed as "John Smith", fix that.
+    if(line.find("hauke.heibel") != string::npos)
+      result = "Hauke Heibel";
+    else
+    {
+      // just remove the e-mail address
+      result = line.substr(0, position_of_email_address);
+    }
+  }
+  else
+  {
+    // there is no e-mail address in <...>.
+    
+    if(line.find("convert-repo") != string::npos)
+      result = "";
+    else
+      result = line;
+  }
+
+  // remove trailing spaces
+  size_t length = result.length();
+  while(length >= 1 && result[length-1] == ' ') result.erase(--length);
+
+  return result;
+}
+
+// parses hg churn output to generate a contributors map.
+map<string,int> contributors_map_from_churn_output(const char *filename)
+{
+  map<string,int> contributors_map;
+
+  string line;
+  ifstream churn_out;
+  churn_out.open(filename, ios::in);
+  while(!getline(churn_out,line).eof())
+  {
+    // remove the histograms "******" that hg churn may draw at the end of some lines
+    size_t first_star = line.find_first_of('*');
+    if(first_star != string::npos) line.erase(first_star);
+    
+    // remove trailing spaces
+    size_t length = line.length();
+    while(length >= 1 && line[length-1] == ' ') line.erase(--length);
+
+    // now the last space indicates where the number starts
+    size_t last_space = line.find_last_of(' ');
+    
+    // get the number (of changesets or of modified lines for each contributor)
+    int number;
+    istringstream(line.substr(last_space+1)) >> number;
+
+    // get the name of the contributor
+    line.erase(last_space);    
+    string name = contributor_name(line);
+    
+    map<string,int>::iterator it = contributors_map.find(name);
+    // if new contributor, insert
+    if(it == contributors_map.end())
+      contributors_map.insert(pair<string,int>(name, number));
+    // if duplicate, just add the number
+    else
+      it->second += number;
+  }
+  churn_out.close();
+
+  return contributors_map;
+}
+
+// find the last name, i.e. the last word.
+// for "van den Schbling" types of last names, that's not a problem, that's actually what we want.
+string lastname(const string& name)
+{
+  size_t last_space = name.find_last_of(' ');
+  if(last_space >= name.length()-1) return name;
+  else return name.substr(last_space+1);
+}
+
+struct contributor
+{
+  string name;
+  int changedlines;
+  int changesets;
+  string url;
+  string misc;
+  
+  contributor() : changedlines(0), changesets(0) {}
+  
+  bool operator < (const contributor& other)
+  {
+    return lastname(name).compare(lastname(other.name)) < 0;
+  }
+};
+
+void add_online_info_into_contributors_list(list<contributor>& contributors_list, const char *filename)
+{
+  string line;
+  ifstream online_info;
+  online_info.open(filename, ios::in);
+  while(!getline(online_info,line).eof())
+  {
+    string hgname, realname, url, misc;
+    
+    size_t last_bar = line.find_last_of('|');
+    if(last_bar == string::npos) continue;
+    if(last_bar < line.length())
+      misc = line.substr(last_bar+1);
+    line.erase(last_bar);
+    
+    last_bar = line.find_last_of('|');
+    if(last_bar == string::npos) continue;
+    if(last_bar < line.length())
+      url = line.substr(last_bar+1);
+    line.erase(last_bar);
+
+    last_bar = line.find_last_of('|');
+    if(last_bar == string::npos) continue;
+    if(last_bar < line.length())
+      realname = line.substr(last_bar+1);
+    line.erase(last_bar);
+
+    hgname = line;
+    
+    // remove the example line
+    if(hgname.find("MercurialName") != string::npos) continue;
+    
+    list<contributor>::iterator it;
+    for(it=contributors_list.begin(); it != contributors_list.end() && it->name != hgname; ++it)
+    {}
+    
+    if(it == contributors_list.end())
+    {
+      contributor c;
+      c.name = realname;
+      c.url = url;
+      c.misc = misc;
+      contributors_list.push_back(c);
+    }
+    else
+    {
+      it->name = realname;
+      it->url = url;
+      it->misc = misc;
+    }
+  }
+}
+
+int main()
+{
+  // parse the hg churn output files
+  map<string,int> contributors_map_for_changedlines = contributors_map_from_churn_output("churn-changedlines.out");
+  //map<string,int> contributors_map_for_changesets = contributors_map_from_churn_output("churn-changesets.out");
+  
+  // merge into the contributors list
+  list<contributor> contributors_list;
+  map<string,int>::iterator it;
+  for(it=contributors_map_for_changedlines.begin(); it != contributors_map_for_changedlines.end(); ++it)
+  {
+    contributor c;
+    c.name = it->first;
+    c.changedlines = it->second;
+    c.changesets = 0; //contributors_map_for_changesets.find(it->first)->second;
+    contributors_list.push_back(c);
+  }
+  
+  add_online_info_into_contributors_list(contributors_list, "online-info.out");
+  
+  contributors_list.sort();
+  
+  cout << "{| cellpadding=\"5\"\n";
+  cout << "!\n";
+  cout << "! Lines changed\n";
+  cout << "!\n";
+
+  list<contributor>::iterator itc;
+  int i = 0;
+  for(itc=contributors_list.begin(); itc != contributors_list.end(); ++itc)
+  {
+    if(itc->name.length() == 0) continue;
+    if(i%2) cout << "|-\n";
+    else cout << "|- style=\"background:#FFFFD0\"\n";
+    if(itc->url.length())
+      cout << "| [" << itc->url << " " << itc->name << "]\n";
+    else
+      cout << "| " << itc->name << "\n";
+    if(itc->changedlines)
+      cout << "| " << itc->changedlines << "\n";
+    else
+      cout << "| (no information)\n";
+    cout << "| " << itc->misc << "\n";
+    i++;
+  }
+  cout << "|}" << endl;
+}

diff --git a/scripts/eigen_gen_docs b/scripts/eigen_gen_docs
new file mode 100644
index 0000000..787dcb3
--- /dev/null
+++ b/scripts/eigen_gen_docs

@@ -0,0 +1,24 @@
+#!/bin/sh
+
+# configuration
+# You should call this script with USER set as you want, else some default
+# will be used
+USER=${USER:-'orzel'}
+UPLOAD_DIR=dox-devel
+
+#ulimit -v 1024000
+
+# step 1 : build
+rm build/doc/html -Rf
+mkdir build -p
+(cd build && cmake .. && make doc) || { echo "make failed"; exit 1; }
+
+#step 2 : upload
+# (the '/' at the end of path is very important, see rsync documentation)
+rsync -az --no-p --delete build/doc/html/ $USER@ssh.tuxfamily.org:eigen/eigen.tuxfamily.org-web/htdocs/$UPLOAD_DIR/ || { echo "upload failed"; exit 1; }
+
+#step 3 : fix the perm
+ssh $USER@ssh.tuxfamily.org "chmod -R g+w /home/eigen/eigen.tuxfamily.org-web/htdocs/$UPLOAD_DIR" || { echo "perm failed"; exit 1; }
+
+echo "Uploaded successfully"
+

diff --git a/scripts/eigen_gen_split_test_help.cmake b/scripts/eigen_gen_split_test_help.cmake
new file mode 100644
index 0000000..e43f5aa
--- /dev/null
+++ b/scripts/eigen_gen_split_test_help.cmake

@@ -0,0 +1,11 @@
+#!cmake -P
+file(WRITE split_test_helper.h "")
+foreach(i RANGE 1 999)
+  file(APPEND split_test_helper.h
+    "#if defined(EIGEN_TEST_PART_${i}) || defined(EIGEN_TEST_PART_ALL)\n"
+    "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+    "#else\n"
+    "#define CALL_SUBTEST_${i}(FUNC)\n"
+    "#endif\n\n"
+  )
+endforeach()
\ No newline at end of file

diff --git a/scripts/eigen_monitor_perf.sh b/scripts/eigen_monitor_perf.sh
new file mode 100755
index 0000000..8f3425d
--- /dev/null
+++ b/scripts/eigen_monitor_perf.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This is a script example to automatically update and upload performance unit tests.
+# The following five variables must be adjusted to match your settings.
+
+USER='ggael'
+UPLOAD_DIR=perf_monitoring/ggaelmacbook26
+EIGEN_SOURCE_PATH=$HOME/Eigen/eigen
+export PREFIX="haswell-fma"
+export CXX_FLAGS="-mfma -w"
+
+####
+
+BENCH_PATH=$EIGEN_SOURCE_PATH/bench/perf_monitoring/$PREFIX
+PREVPATH=$(pwd)
+cd $EIGEN_SOURCE_PATH/bench/perf_monitoring && ./runall.sh "Haswell 2.6GHz, FMA, Apple's clang" "$@"
+cd $PREVPATH || exit 1
+
+ALLFILES="$BENCH_PATH/*.png $BENCH_PATH/*.html $BENCH_PATH/index.html $BENCH_PATH/s1.js $BENCH_PATH/s2.js"
+
+# (the '/' at the end of path is very important, see rsync documentation)
+rsync -az --no-p --delete $ALLFILES $USER@ssh.tuxfamily.org:eigen/eigen.tuxfamily.org-web/htdocs/$UPLOAD_DIR/ || { echo "upload failed"; exit 1; }
+
+# fix the perm
+ssh $USER@ssh.tuxfamily.org "chmod -R g+w /home/eigen/eigen.tuxfamily.org-web/htdocs/perf_monitoring" || { echo "perm failed"; exit 1; }

diff --git a/scripts/release.in b/scripts/release.in
new file mode 100755
index 0000000..db2d9d9
--- /dev/null
+++ b/scripts/release.in

@@ -0,0 +1,3 @@
+#!/bin/sh
+
+cmake -DCMAKE_BUILD_TYPE=Release .

diff --git a/scripts/relicense.py b/scripts/relicense.py
new file mode 100644
index 0000000..8a5265f
--- /dev/null
+++ b/scripts/relicense.py

@@ -0,0 +1,69 @@
+# This file is part of Eigen, a lightweight C++ template library
+# for linear algebra.
+#
+# Copyright (C) 2012 Keir Mierle <mierle@gmail.com>
+#
+# This Source Code Form is subject to the terms of the Mozilla
+# Public License v. 2.0. If a copy of the MPL was not distributed
+# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Author: mierle@gmail.com (Keir Mierle)
+#
+# Make the long-awaited conversion to MPL.
+
+lgpl3_header = '''
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+'''
+
+mpl2_header = """
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+"""
+
+import os
+import sys
+
+exclusions = set(['relicense.py'])
+
+def update(text):
+  if text.find(lgpl3_header) == -1:
+    return text, False
+  return text.replace(lgpl3_header, mpl2_header), True
+
+rootdir = sys.argv[1]
+for root, sub_folders, files in os.walk(rootdir):
+    for basename in files:
+        if basename in exclusions:
+          print 'SKIPPED', filename
+          continue
+        filename = os.path.join(root, basename)
+        fo = file(filename)
+        text = fo.read()
+        fo.close()
+
+        text, updated = update(text)
+        if updated:
+          fo = file(filename, "w")
+          fo.write(text)
+          fo.close()
+          print 'UPDATED', filename
+        else:
+          print '       ', filename

diff --git a/signature_of_eigen3_matrix_library b/signature_of_eigen3_matrix_library
new file mode 100644
index 0000000..80aaf46
--- /dev/null
+++ b/signature_of_eigen3_matrix_library

@@ -0,0 +1 @@
+This file is just there as a signature to help identify directories containing Eigen3. When writing a script looking for Eigen3, just look for this file. This is especially useful to help disambiguate with Eigen2...

diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h
new file mode 100644
index 0000000..7ace083
--- /dev/null
+++ b/test/AnnoyingScalar.h

@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_H
+#define EIGEN_TEST_ANNOYING_SCALAR_H
+
+#include <ostream>
+
+#if EIGEN_COMP_GNUC
+#pragma GCC diagnostic ignored "-Wshadow"
+#endif
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+struct my_exception
+{
+  my_exception() {}
+  ~my_exception() {}
+};
+#endif
+
+// An AnnoyingScalar is a pseudo scalar type that:
+// - can randomly through an exception in operator +
+// - randomly allocate on the heap or initialize a reference to itself making it non trivially copyable, nor movable, nor relocatable.
+
+class AnnoyingScalar
+{
+  public:
+    AnnoyingScalar()                { init(); *v = 0;  }
+    AnnoyingScalar(long double _v)  { init(); *v = _v; }
+    AnnoyingScalar(double _v)       { init(); *v = _v; }
+    AnnoyingScalar(float _v)        { init(); *v = _v; }
+    AnnoyingScalar(int _v)          { init(); *v = _v; }
+    AnnoyingScalar(long _v)         { init(); *v = _v; }
+    #if EIGEN_HAS_CXX11
+    AnnoyingScalar(long long _v)    { init(); *v = _v; }
+    #endif
+    AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); }
+    ~AnnoyingScalar() {
+      if(v!=&data)
+        delete v;
+      instances--;
+    }
+
+    void init() {
+      if(internal::random<bool>())
+        v = new float;
+      else
+        v = &data;
+      instances++;
+    }
+
+    AnnoyingScalar operator+(const AnnoyingScalar& other) const
+    {
+      #ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+      countdown--;
+      if(countdown<=0 && !dont_throw)
+        throw my_exception();
+      #endif
+      return AnnoyingScalar(*v+*other.v);
+    }
+
+    AnnoyingScalar operator-() const
+    { return AnnoyingScalar(-*v); }
+    
+    AnnoyingScalar operator-(const AnnoyingScalar& other) const
+    { return AnnoyingScalar(*v-*other.v); }
+    
+    AnnoyingScalar operator*(const AnnoyingScalar& other) const
+    { return AnnoyingScalar((*v)*(*other.v)); }
+
+    AnnoyingScalar operator/(const AnnoyingScalar& other) const
+    { return AnnoyingScalar((*v)/(*other.v)); }
+
+    AnnoyingScalar& operator+=(const AnnoyingScalar& other) { *v += *other.v; return *this; }
+    AnnoyingScalar& operator-=(const AnnoyingScalar& other) { *v -= *other.v; return *this; }
+    AnnoyingScalar& operator*=(const AnnoyingScalar& other) { *v *= *other.v; return *this; }
+    AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
+    AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v  = *other.v; return *this; }
+
+    bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
+    bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
+    bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
+    bool operator< (const AnnoyingScalar& other) const { return *v <  *other.v; }
+    bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
+    bool operator> (const AnnoyingScalar& other) const { return *v >  *other.v; }
+  
+    float* v;
+    float data;
+    static int instances;
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    static int countdown;
+    static bool dont_throw;
+#endif
+};
+
+AnnoyingScalar real(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar imag(const AnnoyingScalar & ) { return 0; }
+AnnoyingScalar conj(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar sqrt(const AnnoyingScalar &x) { return std::sqrt(*x.v); }
+AnnoyingScalar abs (const AnnoyingScalar &x) { return std::abs(*x.v); }
+AnnoyingScalar cos (const AnnoyingScalar &x) { return std::cos(*x.v); }
+AnnoyingScalar sin (const AnnoyingScalar &x) { return std::sin(*x.v); }
+AnnoyingScalar acos(const AnnoyingScalar &x) { return std::acos(*x.v); }
+AnnoyingScalar atan2(const AnnoyingScalar &y,const AnnoyingScalar &x) { return std::atan2(*y.v,*x.v); }
+
+std::ostream& operator<<(std::ostream& stream,const AnnoyingScalar& x) {
+  stream << (*(x.v));
+  return stream;
+}
+
+int AnnoyingScalar::instances = 0;
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+int AnnoyingScalar::countdown = 0;
+bool AnnoyingScalar::dont_throw = false;
+#endif
+
+namespace Eigen {
+template<>
+struct NumTraits<AnnoyingScalar> : NumTraits<float>
+{
+  enum {
+    RequireInitialization = 1,
+  };
+  typedef AnnoyingScalar Real;
+  typedef AnnoyingScalar Nested;
+  typedef AnnoyingScalar Literal;
+  typedef AnnoyingScalar NonInteger;
+};
+
+template<> inline AnnoyingScalar test_precision<AnnoyingScalar>() { return test_precision<float>(); }
+
+namespace numext {
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const AnnoyingScalar& x) {
+  return (numext::isfinite)(*x.v);
+}
+}
+
+namespace internal {
+  template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); }
+  template<> EIGEN_STRONG_INLINE float  cast(const AnnoyingScalar& x) { return *x.v; }
+}
+}  // namespace Eigen
+
+AnnoyingScalar get_test_precision(const AnnoyingScalar&)
+{ return Eigen::test_precision<AnnoyingScalar>(); }
+
+AnnoyingScalar test_relative_error(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_relative_error(*a.v, *b.v); }
+
+inline bool test_isApprox(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return internal::isApprox(*a.v, *b.v, test_precision<float>()); }
+
+inline bool test_isMuchSmallerThan(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_isMuchSmallerThan(*a.v, *b.v); }
+
+#endif // EIGEN_TEST_ANNOYING_SCALAR_H

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..5136f82
--- /dev/null
+++ b/test/CMakeLists.txt

@@ -0,0 +1,465 @@
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+endif()
+
+# check if we have a Fortran compiler
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
+  # search for a default Lapack library to complete Eigen's one
+  find_package(LAPACK QUIET)
+endif()
+
+# TODO do the same for EXTERNAL_LAPACK
+option(EIGEN_TEST_EXTERNAL_BLAS "Use external BLAS library for testsuite" OFF)
+if(EIGEN_TEST_EXTERNAL_BLAS)
+  find_package(BLAS REQUIRED)
+  message(STATUS "BLAS_COMPILER_FLAGS: ${BLAS_COMPILER_FLAGS}")
+  add_definitions("-DEIGEN_USE_BLAS") # is adding  ${BLAS_COMPILER_FLAGS} necessary?
+  list(APPEND EXTERNAL_LIBS "${BLAS_LIBRARIES}")
+endif()
+
+# configure blas/lapack (use Eigen's ones)
+set(EIGEN_BLAS_LIBRARIES eigen_blas)
+set(EIGEN_LAPACK_LIBRARIES eigen_lapack)
+
+set(EIGEN_TEST_MATRIX_DIR "" CACHE STRING "Enable testing of realword sparse matrices contained in the specified path")
+if(EIGEN_TEST_MATRIX_DIR)
+  if(NOT WIN32)
+    message(STATUS "Test realworld sparse matrices: ${EIGEN_TEST_MATRIX_DIR}")
+    add_definitions( -DTEST_REAL_CASES="${EIGEN_TEST_MATRIX_DIR}" )
+  else()
+    message(STATUS "REAL CASES CAN NOT BE CURRENTLY TESTED ON WIN32")
+  endif()
+endif()
+
+set(SPARSE_LIBS " ")
+
+find_package(CHOLMOD)
+if(CHOLMOD_FOUND)
+  add_definitions("-DEIGEN_CHOLMOD_SUPPORT")
+  include_directories(${CHOLMOD_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
+  set(CHOLMOD_ALL_LIBS  ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ")
+endif()
+
+find_package(UMFPACK)
+if(UMFPACK_FOUND)
+  add_definitions("-DEIGEN_UMFPACK_SUPPORT")
+  include_directories(${UMFPACK_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ")
+endif()
+
+find_package(KLU)
+if(KLU_FOUND)
+  add_definitions("-DEIGEN_KLU_SUPPORT")
+  include_directories(${KLU_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ")
+endif()
+
+find_package(SuperLU 4.0)
+if(SuperLU_FOUND)
+  add_definitions("-DEIGEN_SUPERLU_SUPPORT")
+  include_directories(${SUPERLU_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  ei_add_property(EIGEN_TESTED_BACKENDS  "SuperLU, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS  "SuperLU, ")
+endif()
+
+
+find_package(PASTIX QUIET COMPONENTS METIS SEQ)
+# check that the PASTIX found is a version without MPI
+find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS
+  NAMES pastix_nompi.h
+  HINTS ${PASTIX_INCLUDE_DIRS}
+)
+if (NOT PASTIX_pastix_nompi.h_INCLUDE_DIRS)
+  message(STATUS "A version of Pastix has been found but pastix_nompi.h does not exist in the include directory."
+                 " Because Eigen tests require a version without MPI, we disable the Pastix backend.")
+endif()
+if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS)
+  add_definitions("-DEIGEN_PASTIX_SUPPORT")
+  include_directories(${PASTIX_INCLUDE_DIRS_DEP})
+  if(SCOTCH_FOUND)
+    include_directories(${SCOTCH_INCLUDE_DIRS})
+    set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${SCOTCH_LIBRARIES})
+  elseif(METIS_FOUND)
+    include_directories(${METIS_INCLUDE_DIRS})
+    set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES})
+  else()
+    ei_add_property(EIGEN_MISSING_BACKENDS  "PaStiX, ")
+  endif()
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES})
+  set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP})
+  ei_add_property(EIGEN_TESTED_BACKENDS  "PaStiX, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS  "PaStiX, ")
+endif()
+
+if(METIS_FOUND)
+  add_definitions("-DEIGEN_METIS_SUPPORT")
+  include_directories(${METIS_INCLUDE_DIRS})
+  ei_add_property(EIGEN_TESTED_BACKENDS "METIS, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "METIS, ")
+endif()
+
+find_package(SPQR)
+if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) )
+  add_definitions("-DEIGEN_SPQR_SUPPORT")
+  include_directories(${SPQR_INCLUDES})
+  set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS})
+  ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ")
+endif()
+
+option(EIGEN_TEST_NOQT "Disable Qt support in unit tests" OFF)
+if(NOT EIGEN_TEST_NOQT)
+  find_package(Qt4)
+  if(QT4_FOUND)
+    include(${QT_USE_FILE})
+    ei_add_property(EIGEN_TESTED_BACKENDS  "Qt4 support, ")
+  else()
+    ei_add_property(EIGEN_MISSING_BACKENDS  "Qt4 support, ")
+  endif()
+endif()
+
+if(TEST_LIB)
+  add_definitions("-DEIGEN_EXTERN_INSTANTIATIONS=1")
+endif()
+
+set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official")
+add_custom_target(BuildOfficial)
+
+ei_add_test(rand)
+ei_add_test(meta)
+ei_add_test(numext)
+ei_add_test(sizeof)
+ei_add_test(dynalloc)
+ei_add_test(nomalloc)
+ei_add_test(first_aligned)
+ei_add_test(type_alias)
+ei_add_test(nullary)
+ei_add_test(mixingtypes)
+ei_add_test(io)
+ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
+ei_add_test(vectorization_logic)
+ei_add_test(basicstuff)
+ei_add_test(constructor)
+ei_add_test(linearstructure)
+ei_add_test(integer_types)
+ei_add_test(unalignedcount)
+if(NOT EIGEN_TEST_NO_EXCEPTIONS AND NOT EIGEN_TEST_OPENMP)
+  ei_add_test(exceptions)
+endif()
+ei_add_test(redux)
+ei_add_test(visitor)
+ei_add_test(block)
+ei_add_test(corners)
+ei_add_test(symbolic_index)
+ei_add_test(indexed_view)
+ei_add_test(reshape)
+ei_add_test(swap)
+ei_add_test(resize)
+ei_add_test(conservative_resize)
+ei_add_test(product_small)
+ei_add_test(product_large)
+ei_add_test(product_extra)
+ei_add_test(diagonalmatrices)
+ei_add_test(adjoint)
+ei_add_test(diagonal)
+ei_add_test(miscmatrices)
+ei_add_test(commainitializer)
+ei_add_test(smallvectors)
+ei_add_test(mapped_matrix)
+ei_add_test(mapstride)
+ei_add_test(mapstaticmethods)
+ei_add_test(array_cwise)
+ei_add_test(array_for_matrix)
+ei_add_test(array_replicate)
+ei_add_test(array_reverse)
+ei_add_test(ref)
+ei_add_test(is_same_dense)
+ei_add_test(triangular)
+ei_add_test(selfadjoint)
+ei_add_test(product_selfadjoint)
+ei_add_test(product_symm)
+ei_add_test(product_syrk)
+ei_add_test(product_trmv)
+ei_add_test(product_trmm)
+ei_add_test(product_trsolve)
+ei_add_test(product_mmtr)
+ei_add_test(product_notemporary)
+ei_add_test(stable_norm)
+ei_add_test(permutationmatrices)
+ei_add_test(bandmatrix)
+ei_add_test(cholesky)
+ei_add_test(lu)
+ei_add_test(determinant)
+ei_add_test(inverse)
+ei_add_test(qr)
+ei_add_test(qr_colpivoting)
+ei_add_test(qr_fullpivoting)
+ei_add_test(upperbidiagonalization)
+ei_add_test(hessenberg)
+ei_add_test(schur_real)
+ei_add_test(schur_complex)
+ei_add_test(eigensolver_selfadjoint)
+ei_add_test(eigensolver_generic)
+ei_add_test(eigensolver_complex)
+ei_add_test(real_qz)
+ei_add_test(eigensolver_generalized_real)
+ei_add_test(jacobi)
+ei_add_test(jacobisvd)
+ei_add_test(bdcsvd)
+ei_add_test(householder)
+ei_add_test(geo_orthomethods)
+ei_add_test(geo_quaternion)
+ei_add_test(geo_eulerangles)
+ei_add_test(geo_parametrizedline)
+ei_add_test(geo_alignedbox)
+ei_add_test(geo_hyperplane)
+ei_add_test(geo_transformations)
+ei_add_test(geo_homogeneous)
+ei_add_test(stdvector)
+ei_add_test(stdvector_overload)
+ei_add_test(stdlist)
+ei_add_test(stdlist_overload)
+ei_add_test(stddeque)
+ei_add_test(stddeque_overload)
+ei_add_test(sparse_basic)
+ei_add_test(sparse_block)
+ei_add_test(sparse_vector)
+ei_add_test(sparse_product)
+ei_add_test(sparse_ref)
+ei_add_test(sparse_solvers)
+ei_add_test(sparse_permutations)
+ei_add_test(simplicial_cholesky)
+ei_add_test(conjugate_gradient)
+ei_add_test(incomplete_cholesky)
+ei_add_test(bicgstab)
+ei_add_test(lscg)
+ei_add_test(sparselu)
+ei_add_test(sparseqr)
+ei_add_test(umeyama)
+ei_add_test(nesting_ops "${CMAKE_CXX_FLAGS_DEBUG}")
+ei_add_test(nestbyvalue)
+ei_add_test(zerosized)
+ei_add_test(dontalign)
+ei_add_test(evaluators)
+if(NOT EIGEN_TEST_NO_EXCEPTIONS)
+  ei_add_test(sizeoverflow)
+endif()
+ei_add_test(prec_inverse_4x4)
+ei_add_test(vectorwiseop)
+ei_add_test(special_numbers)
+ei_add_test(rvalue_types)
+ei_add_test(dense_storage)
+ei_add_test(ctorleak)
+ei_add_test(mpl2only)
+ei_add_test(inplace_decomposition)
+ei_add_test(half_float)
+ei_add_test(bfloat16_float)
+ei_add_test(array_of_string)
+ei_add_test(num_dimensions)
+ei_add_test(stl_iterators)
+ei_add_test(blasutil)
+if(EIGEN_TEST_CXX11)
+  ei_add_test(initializer_list_construction)
+  ei_add_test(diagonal_matrix_variadic_ctor)
+endif()
+
+add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
+
+check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
+if(COMPILER_SUPPORT_FASTMATH)
+  set(EIGEN_FASTMATH_FLAGS "-ffast-math")
+else()
+  check_cxx_compiler_flag("/fp:fast" COMPILER_SUPPORT_FPFAST)
+  if(COMPILER_SUPPORT_FPFAST)
+    set(EIGEN_FASTMATH_FLAGS "/fp:fast")
+  endif()
+endif()
+
+ei_add_test(fastmath " ${EIGEN_FASTMATH_FLAGS} ")
+
+# # ei_add_test(denseLM)
+
+if(QT4_FOUND)
+  ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}")
+endif()
+
+if(UMFPACK_FOUND)
+  ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
+endif()
+
+if(KLU_FOUND OR SuiteSparse_FOUND)
+  ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
+endif()
+
+if(SUPERLU_FOUND)
+  ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}")
+endif()
+
+if(CHOLMOD_FOUND)
+  ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}")
+endif()
+
+if(PARDISO_FOUND)
+  ei_add_test(pardiso_support "" "${PARDISO_ALL_LIBS}")
+endif()
+
+if(PASTIX_FOUND AND (SCOTCH_FOUND OR METIS_FOUND))
+  ei_add_test(pastix_support "" "${PASTIX_ALL_LIBS}")
+endif()
+
+if(SPQR_FOUND AND CHOLMOD_FOUND)
+  ei_add_test(spqr_support "" "${SPQR_ALL_LIBS}")
+endif()
+
+if(METIS_FOUND)
+ei_add_test(metis_support "" "${METIS_LIBRARIES}")
+endif()
+
+string(TOLOWER "${CMAKE_CXX_COMPILER}" cmake_cxx_compiler_tolower)
+if(cmake_cxx_compiler_tolower MATCHES "qcc")
+  set(CXX_IS_QCC "ON")
+endif()
+
+ei_add_property(EIGEN_TESTING_SUMMARY "CXX:               ${CMAKE_CXX_COMPILER}\n")
+if(CMAKE_COMPILER_IS_GNUCXX AND NOT CXX_IS_QCC)
+  execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version COMMAND head -n 1 OUTPUT_VARIABLE EIGEN_CXX_VERSION_STRING OUTPUT_STRIP_TRAILING_WHITESPACE)
+  ei_add_property(EIGEN_TESTING_SUMMARY "CXX_VERSION:       ${EIGEN_CXX_VERSION_STRING}\n")
+endif()
+ei_add_property(EIGEN_TESTING_SUMMARY "CXX_FLAGS:         ${CMAKE_CXX_FLAGS}\n")
+if (EIGEN_TEST_CUSTOM_CXX_FLAGS)
+  ei_add_property(EIGEN_TESTING_SUMMARY "Custom CXX flags:  ${EIGEN_TEST_CUSTOM_CXX_FLAGS}\n")
+endif()
+ei_add_property(EIGEN_TESTING_SUMMARY "Sparse lib flags:  ${SPARSE_LIBS}\n")
+
+option(EIGEN_TEST_EIGEN2 "Run whole Eigen2 test suite against EIGEN2_SUPPORT" OFF)
+mark_as_advanced(EIGEN_TEST_EIGEN2)
+if(EIGEN_TEST_EIGEN2)
+  message(WARNING "The Eigen2 test suite has been removed")
+endif()
+
+# boost MP unit test
+find_package(Boost 1.53.0)
+if(Boost_FOUND)
+  include_directories(${Boost_INCLUDE_DIRS})
+  ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}")
+  ei_add_property(EIGEN_TESTED_BACKENDS "Boost.Multiprecision, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "Boost.Multiprecision, ")
+endif()
+
+
+# CUDA unit tests
+option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF)
+option(EIGEN_TEST_CUDA_CLANG "Use clang instead of nvcc to compile the CUDA tests" OFF)
+
+if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang")
+  message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.")
+endif()
+
+if(EIGEN_TEST_CUDA)
+
+find_package(CUDA 5.0)
+if(CUDA_FOUND)
+  
+  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+  
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
+  endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
+    endforeach()
+  else()
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
+    endforeach()
+  endif()
+  string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
+  set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+  
+  ei_add_test(gpu_basic)
+  
+  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+endif()
+
+endif()
+
+
+# HIP unit tests
+option(EIGEN_TEST_HIP "Add HIP support." OFF)
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+    
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) 
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+	include_directories(${HIP_PATH}/include)
+
+	set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+	ei_add_test(gpu_basic)
+	unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+	
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+	message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+	message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif() 
+    endif()
+  else ()
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+  endif()
+endif()
+
+cmake_dependent_option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF "EIGEN_BUILD_DOC" OFF)
+if(EIGEN_TEST_BUILD_DOCUMENTATION)
+  add_dependencies(buildtests doc)
+endif()
+
+# Register all smoke tests
+include("EigenSmokeTestList")
+ei_add_smoke_tests("${ei_smoke_test_list}")

diff --git a/test/MovableScalar.h b/test/MovableScalar.h
new file mode 100644
index 0000000..6a90d03
--- /dev/null
+++ b/test/MovableScalar.h

@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Sebastien Boisvert <seb@boisvert.info>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MISC_MOVABLE_SCALAR_H
+#define EIGEN_MISC_MOVABLE_SCALAR_H
+
+#include <vector>
+
+namespace Eigen
+{
+template <typename Scalar, typename Base = std::vector<Scalar>>
+struct MovableScalar : public Base
+{
+  MovableScalar() = default;
+  ~MovableScalar() = default;
+  MovableScalar(const MovableScalar&) = default;
+  MovableScalar(MovableScalar&& other) = default;
+  MovableScalar& operator=(const MovableScalar&) = default;
+  MovableScalar& operator=(MovableScalar&& other) = default;
+  MovableScalar(Scalar scalar) : Base(100, scalar) {}
+
+  operator Scalar() const { return this->size() > 0 ? this->back() : Scalar(); }
+};
+
+template<> struct NumTraits<MovableScalar<float>> : GenericNumTraits<float> {};
+}
+
+#endif
+

diff --git a/test/SafeScalar.h b/test/SafeScalar.h
new file mode 100644
index 0000000..c5cb757
--- /dev/null
+++ b/test/SafeScalar.h

@@ -0,0 +1,30 @@
+
+// A Scalar that asserts for uninitialized access.
+template<typename T>
+class SafeScalar {
+ public:
+  SafeScalar() : initialized_(false) {}
+  SafeScalar(const SafeScalar& other) {
+    *this = other;
+  }
+  SafeScalar& operator=(const SafeScalar& other) {
+    val_ = T(other);
+    initialized_ = true;
+    return *this;
+  }
+  
+  SafeScalar(T val) : val_(val), initialized_(true) {}
+  SafeScalar& operator=(T val) {
+    val_ = val;
+    initialized_ = true;
+  }
+  
+  operator T() const {
+    VERIFY(initialized_ && "Uninitialized access.");
+    return val_;
+  }
+ 
+ private:
+  T val_;
+  bool initialized_;
+};

diff --git a/test/adjoint.cpp b/test/adjoint.cpp
new file mode 100644
index 0000000..4c4f98b
--- /dev/null
+++ b/test/adjoint.cpp

@@ -0,0 +1,219 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+template<bool IsInteger> struct adjoint_specific;
+
+template<> struct adjoint_specific<true> {
+  template<typename Vec, typename Mat, typename Scalar>
+  static void run(const Vec& v1, const Vec& v2, Vec& v3, const Mat& square, Scalar s1, Scalar s2) {
+    VERIFY(test_isApproxWithRef((s1 * v1 + s2 * v2).dot(v3),     numext::conj(s1) * v1.dot(v3) + numext::conj(s2) * v2.dot(v3), 0));
+    VERIFY(test_isApproxWithRef(v3.dot(s1 * v1 + s2 * v2),       s1*v3.dot(v1)+s2*v3.dot(v2), 0));
+    
+    // check compatibility of dot and adjoint
+    VERIFY(test_isApproxWithRef(v1.dot(square * v2), (square.adjoint() * v1).dot(v2), 0));
+  }
+};
+
+template<> struct adjoint_specific<false> {
+  template<typename Vec, typename Mat, typename Scalar>
+  static void run(const Vec& v1, const Vec& v2, Vec& v3, const Mat& square, Scalar s1, Scalar s2) {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    using std::abs;
+    
+    RealScalar ref = NumTraits<Scalar>::IsInteger ? RealScalar(0) : (std::max)((s1 * v1 + s2 * v2).norm(),v3.norm());
+    VERIFY(test_isApproxWithRef((s1 * v1 + s2 * v2).dot(v3),     numext::conj(s1) * v1.dot(v3) + numext::conj(s2) * v2.dot(v3), ref));
+    VERIFY(test_isApproxWithRef(v3.dot(s1 * v1 + s2 * v2),       s1*v3.dot(v1)+s2*v3.dot(v2), ref));
+  
+    VERIFY_IS_APPROX(v1.squaredNorm(),                v1.norm() * v1.norm());
+    // check normalized() and normalize()
+    VERIFY_IS_APPROX(v1, v1.norm() * v1.normalized());
+    v3 = v1;
+    v3.normalize();
+    VERIFY_IS_APPROX(v1, v1.norm() * v3);
+    VERIFY_IS_APPROX(v3, v1.normalized());
+    VERIFY_IS_APPROX(v3.norm(), RealScalar(1));
+
+    // check null inputs
+    VERIFY_IS_APPROX((v1*0).normalized(), (v1*0));
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE)
+    RealScalar very_small = (std::numeric_limits<RealScalar>::min)();
+    VERIFY( (v1*very_small).norm() == 0 );
+    VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small));
+    v3 = v1*very_small;
+    v3.normalize();
+    VERIFY_IS_APPROX(v3, (v1*very_small));
+#endif
+    
+    // check compatibility of dot and adjoint
+    ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm()));
+    VERIFY(internal::isMuchSmallerThan(abs(v1.dot(square * v2) - (square.adjoint() * v1).dot(v2)), ref, test_precision<Scalar>()));
+    
+    // check that Random().normalized() works: tricky as the random xpr must be evaluated by
+    // normalized() in order to produce a consistent result.
+    VERIFY_IS_APPROX(Vec::Random(v1.size()).normalized().norm(), RealScalar(1));
+  }
+};
+
+template<typename MatrixType> void adjoint(const MatrixType& m)
+{
+  /* this test covers the following files:
+     Transpose.h Conjugate.h Dot.h
+  */
+  using std::abs;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             square = SquareMatrixType::Random(rows, rows);
+  VectorType v1 = VectorType::Random(rows),
+             v2 = VectorType::Random(rows),
+             v3 = VectorType::Random(rows),
+             vzero = VectorType::Zero(rows);
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>();
+
+  // check basic compatibility of adjoint, transpose, conjugate
+  VERIFY_IS_APPROX(m1.transpose().conjugate().adjoint(),    m1);
+  VERIFY_IS_APPROX(m1.adjoint().conjugate().transpose(),    m1);
+
+  // check multiplicative behavior
+  VERIFY_IS_APPROX((m1.adjoint() * m2).adjoint(),           m2.adjoint() * m1);
+  VERIFY_IS_APPROX((s1 * m1).adjoint(),                     numext::conj(s1) * m1.adjoint());
+
+  // check basic properties of dot, squaredNorm
+  VERIFY_IS_APPROX(numext::conj(v1.dot(v2)),               v2.dot(v1));
+  VERIFY_IS_APPROX(numext::real(v1.dot(v1)),               v1.squaredNorm());
+  
+  adjoint_specific<NumTraits<Scalar>::IsInteger>::run(v1, v2, v3, square, s1, s2);
+  
+  VERIFY_IS_MUCH_SMALLER_THAN(abs(vzero.dot(v1)),  static_cast<RealScalar>(1));
+  
+  // like in testBasicStuff, test operator() to check const-qualification
+  Index r = internal::random<Index>(0, rows-1),
+      c = internal::random<Index>(0, cols-1);
+  VERIFY_IS_APPROX(m1.conjugate()(r,c), numext::conj(m1(r,c)));
+  VERIFY_IS_APPROX(m1.adjoint()(c,r), numext::conj(m1(r,c)));
+
+  // check inplace transpose
+  m3 = m1;
+  m3.transposeInPlace();
+  VERIFY_IS_APPROX(m3,m1.transpose());
+  m3.transposeInPlace();
+  VERIFY_IS_APPROX(m3,m1);
+  
+  if(PacketSize<m3.rows() && PacketSize<m3.cols())
+  {
+    m3 = m1;
+    Index i = internal::random<Index>(0,m3.rows()-PacketSize);
+    Index j = internal::random<Index>(0,m3.cols()-PacketSize);
+    m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace();
+    VERIFY_IS_APPROX( (m3.template block<PacketSize,PacketSize>(i,j)), (m1.template block<PacketSize,PacketSize>(i,j).transpose()) );
+    m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace();
+    VERIFY_IS_APPROX(m3,m1);
+  }
+
+  // check inplace adjoint
+  m3 = m1;
+  m3.adjointInPlace();
+  VERIFY_IS_APPROX(m3,m1.adjoint());
+  m3.transposeInPlace();
+  VERIFY_IS_APPROX(m3,m1.conjugate());
+
+  // check mixed dot product
+  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealVectorType;
+  RealVectorType rv1 = RealVectorType::Random(rows);
+  VERIFY_IS_APPROX(v1.dot(rv1.template cast<Scalar>()), v1.dot(rv1));
+  VERIFY_IS_APPROX(rv1.template cast<Scalar>().dot(v1), rv1.dot(v1));
+
+  VERIFY( is_same_type(m1,m1.template conjugateIf<false>()) );
+  VERIFY( is_same_type(m1.conjugate(),m1.template conjugateIf<true>()) );
+}
+
+template<int>
+void adjoint_extra()
+{
+  MatrixXcf a(10,10), b(10,10);
+  VERIFY_RAISES_ASSERT(a = a.transpose());
+  VERIFY_RAISES_ASSERT(a = a.transpose() + b);
+  VERIFY_RAISES_ASSERT(a = b + a.transpose());
+  VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
+  VERIFY_RAISES_ASSERT(a = a.adjoint());
+  VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
+  VERIFY_RAISES_ASSERT(a = b + a.adjoint());
+
+  // no assertion should be triggered for these cases:
+  a.transpose() = a.transpose();
+  a.transpose() += a.transpose();
+  a.transpose() += a.transpose() + b;
+  a.transpose() = a.adjoint();
+  a.transpose() += a.adjoint();
+  a.transpose() += a.adjoint() + b;
+
+  // regression tests for check_for_aliasing
+  MatrixXd c(10,10);
+  c = 1.0 * MatrixXd::Ones(10,10) + c;
+  c = MatrixXd::Ones(10,10) * 1.0 + c;
+  c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
+  c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
+
+  // regression for bug 1646
+  for (int j = 0; j < 10; ++j) {
+    c.col(j).head(j) = c.row(j).head(j);
+  }
+
+  for (int j = 0; j < 10; ++j) {
+    c.col(j) = c.row(j);
+  }
+
+  a.conservativeResize(1,1);
+  a = a.transpose();
+
+  a.conservativeResize(0,0);
+  a = a.transpose();
+}
+
+EIGEN_DECLARE_TEST(adjoint)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( adjoint(Matrix3d()) );
+    CALL_SUBTEST_3( adjoint(Matrix4f()) );
+    
+    CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_5( adjoint(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( adjoint(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    
+    // Complement for 128 bits vectorization:
+    CALL_SUBTEST_8( adjoint(Matrix2d()) );
+    CALL_SUBTEST_9( adjoint(Matrix<int,4,4>()) );
+    
+    // 256 bits vectorization:
+    CALL_SUBTEST_10( adjoint(Matrix<float,8,8>()) );
+    CALL_SUBTEST_11( adjoint(Matrix<double,4,4>()) );
+    CALL_SUBTEST_12( adjoint(Matrix<int,8,8>()) );
+  }
+  // test a large static matrix only once
+  CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) );
+
+  CALL_SUBTEST_13( adjoint_extra<0>() );
+}
+

diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
new file mode 100644
index 0000000..0cc438b
--- /dev/null
+++ b/test/array_cwise.cpp

@@ -0,0 +1,710 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+
+// Test the corner cases of pow(x, y) for real types.
+template<typename Scalar>
+void pow_test() {
+  const Scalar zero = Scalar(0);
+  const Scalar eps = Eigen::NumTraits<Scalar>::epsilon();
+  const Scalar one = Scalar(1);
+  const Scalar two = Scalar(2);
+  const Scalar three = Scalar(3);
+  const Scalar sqrt_half = Scalar(std::sqrt(0.5));
+  const Scalar sqrt2 = Scalar(std::sqrt(2));
+  const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
+  const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
+  const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
+  const Scalar min = (std::numeric_limits<Scalar>::min)();
+  const Scalar max = (std::numeric_limits<Scalar>::max)();
+  const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
+
+  const static Scalar abs_vals[] = {zero,
+                                    denorm_min,
+                                    min,
+                                    eps,
+                                    sqrt_half,
+                                    one,
+                                    sqrt2,
+                                    two,
+                                    three,
+                                    max_exp,
+                                    max,
+                                    inf,
+                                    nan};
+  const int abs_cases = 13;
+  const int num_cases = 2*abs_cases * 2*abs_cases;
+  // Repeat the same value to make sure we hit the vectorized path.
+  const int num_repeats = 32;
+  Array<Scalar, Dynamic, Dynamic> x(num_repeats, num_cases);
+  Array<Scalar, Dynamic, Dynamic> y(num_repeats, num_cases);
+  int count = 0;
+  for (int i = 0; i < abs_cases; ++i) {
+    const Scalar abs_x = abs_vals[i];
+    for (int sign_x = 0; sign_x < 2; ++sign_x) {
+      Scalar x_case = sign_x == 0 ? -abs_x : abs_x;
+      for (int j = 0; j < abs_cases; ++j) {
+        const Scalar abs_y = abs_vals[j];
+        for (int sign_y = 0; sign_y < 2; ++sign_y) {
+          Scalar y_case = sign_y == 0 ? -abs_y : abs_y;
+          for (int repeat = 0; repeat < num_repeats; ++repeat) {
+            x(repeat, count) = x_case;
+            y(repeat, count) = y_case;
+          }
+          ++count;
+        }
+      }
+    }
+  }
+
+  Array<Scalar, Dynamic, Dynamic> actual = x.pow(y);
+  const Scalar tol = test_precision<Scalar>();
+  bool all_pass = true;
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < num_cases; ++j) {
+      Scalar e = static_cast<Scalar>(std::pow(x(i,j), y(i,j)));
+      Scalar a = actual(i, j);
+      bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e));
+      all_pass &= !fail;
+      if (fail) {
+        std::cout << "pow(" << x(i,j) << "," << y(i,j) << ")   =   " << a << " !=  " << e << std::endl;
+      }
+    }
+  }
+  VERIFY(all_pass);
+}
+
+template<typename ArrayType> void array(const ArrayType& m)
+{
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename ArrayType::RealScalar RealScalar;
+  typedef Array<Scalar, ArrayType::RowsAtCompileTime, 1> ColVectorType;
+  typedef Array<Scalar, 1, ArrayType::ColsAtCompileTime> RowVectorType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+             m2 = ArrayType::Random(rows, cols),
+             m3(rows, cols);
+  ArrayType m4 = m1; // copy constructor
+  VERIFY_IS_APPROX(m1, m4);
+
+  ColVectorType cv1 = ColVectorType::Random(rows);
+  RowVectorType rv1 = RowVectorType::Random(cols);
+
+  Scalar  s1 = internal::random<Scalar>(),
+          s2 = internal::random<Scalar>();
+
+  // scalar addition
+  VERIFY_IS_APPROX(m1 + s1, s1 + m1);
+  VERIFY_IS_APPROX(m1 + s1, ArrayType::Constant(rows,cols,s1) + m1);
+  VERIFY_IS_APPROX(s1 - m1, (-m1)+s1 );
+  VERIFY_IS_APPROX(m1 - s1, m1 - ArrayType::Constant(rows,cols,s1));
+  VERIFY_IS_APPROX(s1 - m1, ArrayType::Constant(rows,cols,s1) - m1);
+  VERIFY_IS_APPROX((m1*Scalar(2)) - s2, (m1+m1) - ArrayType::Constant(rows,cols,s2) );
+  m3 = m1;
+  m3 += s2;
+  VERIFY_IS_APPROX(m3, m1 + s2);
+  m3 = m1;
+  m3 -= s1;
+  VERIFY_IS_APPROX(m3, m1 - s1);
+
+  // scalar operators via Maps
+  m3 = m1;
+  ArrayType::Map(m1.data(), m1.rows(), m1.cols()) -= ArrayType::Map(m2.data(), m2.rows(), m2.cols());
+  VERIFY_IS_APPROX(m1, m3 - m2);
+
+  m3 = m1;
+  ArrayType::Map(m1.data(), m1.rows(), m1.cols()) += ArrayType::Map(m2.data(), m2.rows(), m2.cols());
+  VERIFY_IS_APPROX(m1, m3 + m2);
+
+  m3 = m1;
+  ArrayType::Map(m1.data(), m1.rows(), m1.cols()) *= ArrayType::Map(m2.data(), m2.rows(), m2.cols());
+  VERIFY_IS_APPROX(m1, m3 * m2);
+
+  m3 = m1;
+  m2 = ArrayType::Random(rows,cols);
+  m2 = (m2==0).select(1,m2);
+  ArrayType::Map(m1.data(), m1.rows(), m1.cols()) /= ArrayType::Map(m2.data(), m2.rows(), m2.cols());
+  VERIFY_IS_APPROX(m1, m3 / m2);
+
+  // reductions
+  VERIFY_IS_APPROX(m1.abs().colwise().sum().sum(), m1.abs().sum());
+  VERIFY_IS_APPROX(m1.abs().rowwise().sum().sum(), m1.abs().sum());
+  using std::abs;
+  VERIFY_IS_MUCH_SMALLER_THAN(abs(m1.colwise().sum().sum() - m1.sum()), m1.abs().sum());
+  VERIFY_IS_MUCH_SMALLER_THAN(abs(m1.rowwise().sum().sum() - m1.sum()), m1.abs().sum());
+  if (!internal::isMuchSmallerThan(abs(m1.sum() - (m1+m2).sum()), m1.abs().sum(), test_precision<Scalar>()))
+      VERIFY_IS_NOT_APPROX(((m1+m2).rowwise().sum()).sum(), m1.sum());
+  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar,Scalar>()));
+
+  // vector-wise ops
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.colwise() += cv1, m1.colwise() + cv1);
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.colwise() -= cv1, m1.colwise() - cv1);
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.rowwise() += rv1, m1.rowwise() + rv1);
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
+
+  // Conversion from scalar
+  VERIFY_IS_APPROX((m3 = s1), ArrayType::Constant(rows,cols,s1));
+  VERIFY_IS_APPROX((m3 = 1),  ArrayType::Constant(rows,cols,1));
+  VERIFY_IS_APPROX((m3.topLeftCorner(rows,cols) = 1),  ArrayType::Constant(rows,cols,1));
+  typedef Array<Scalar,
+                ArrayType::RowsAtCompileTime==Dynamic?2:ArrayType::RowsAtCompileTime,
+                ArrayType::ColsAtCompileTime==Dynamic?2:ArrayType::ColsAtCompileTime,
+                ArrayType::Options> FixedArrayType;
+  {
+    FixedArrayType f1(s1);
+    VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1));
+    FixedArrayType f2(numext::real(s1));
+    VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1)));
+    FixedArrayType f3((int)100*numext::real(s1));
+    VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1)));
+    f1.setRandom();
+    FixedArrayType f4(f1.data());
+    VERIFY_IS_APPROX(f4, f1);
+  }
+  #if EIGEN_HAS_CXX11
+  {
+    FixedArrayType f1{s1};
+    VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1));
+    FixedArrayType f2{numext::real(s1)};
+    VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1)));
+    FixedArrayType f3{(int)100*numext::real(s1)};
+    VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1)));
+    f1.setRandom();
+    FixedArrayType f4{f1.data()};
+    VERIFY_IS_APPROX(f4, f1);
+  }
+  #endif
+
+  // pow
+  VERIFY_IS_APPROX(m1.pow(2), m1.square());
+  VERIFY_IS_APPROX(pow(m1,2), m1.square());
+  VERIFY_IS_APPROX(m1.pow(3), m1.cube());
+  VERIFY_IS_APPROX(pow(m1,3), m1.cube());
+  VERIFY_IS_APPROX((-m1).pow(3), -m1.cube());
+  VERIFY_IS_APPROX(pow(2*m1,3), 8*m1.cube());
+  ArrayType exponents = ArrayType::Constant(rows, cols, RealScalar(2));
+  VERIFY_IS_APPROX(Eigen::pow(m1,exponents), m1.square());
+  VERIFY_IS_APPROX(m1.pow(exponents), m1.square());
+  VERIFY_IS_APPROX(Eigen::pow(2*m1,exponents), 4*m1.square());
+  VERIFY_IS_APPROX((2*m1).pow(exponents), 4*m1.square());
+  VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square());
+  VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square());
+  VERIFY_IS_APPROX(Eigen::pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0)));
+
+  // Check possible conflicts with 1D ctor
+  typedef Array<Scalar, Dynamic, 1> OneDArrayType;
+  {
+    OneDArrayType o1(rows);
+    VERIFY(o1.size()==rows);
+    OneDArrayType o2(static_cast<int>(rows));
+    VERIFY(o2.size()==rows);
+  }
+  #if EIGEN_HAS_CXX11
+  {
+    OneDArrayType o1{rows};
+    VERIFY(o1.size()==rows);
+    OneDArrayType o4{int(rows)};
+    VERIFY(o4.size()==rows);
+  }
+  #endif
+  // Check possible conflicts with 2D ctor
+  typedef Array<Scalar, Dynamic, Dynamic> TwoDArrayType;
+  typedef Array<Scalar, 2, 1> ArrayType2;
+  {
+    TwoDArrayType o1(rows,cols);
+    VERIFY(o1.rows()==rows);
+    VERIFY(o1.cols()==cols);
+    TwoDArrayType o2(static_cast<int>(rows),static_cast<int>(cols));
+    VERIFY(o2.rows()==rows);
+    VERIFY(o2.cols()==cols);
+
+    ArrayType2 o3(rows,cols);
+    VERIFY(o3(0)==Scalar(rows) && o3(1)==Scalar(cols));
+    ArrayType2 o4(static_cast<int>(rows),static_cast<int>(cols));
+    VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols));
+  }
+  #if EIGEN_HAS_CXX11
+  {
+    TwoDArrayType o1{rows,cols};
+    VERIFY(o1.rows()==rows);
+    VERIFY(o1.cols()==cols);
+    TwoDArrayType o2{int(rows),int(cols)};
+    VERIFY(o2.rows()==rows);
+    VERIFY(o2.cols()==cols);
+
+    ArrayType2 o3{rows,cols};
+    VERIFY(o3(0)==Scalar(rows) && o3(1)==Scalar(cols));
+    ArrayType2 o4{int(rows),int(cols)};
+    VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols));
+  }
+  #endif
+}
+
+template<typename ArrayType> void comparisons(const ArrayType& m)
+{
+  using std::abs;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+            m2 = ArrayType::Random(rows, cols),
+            m3(rows, cols),
+            m4 = m1;
+
+  m4 = (m4.abs()==Scalar(0)).select(1,m4);
+
+  VERIFY(((m1 + Scalar(1)) > m1).all());
+  VERIFY(((m1 - Scalar(1)) < m1).all());
+  if (rows*cols>1)
+  {
+    m3 = m1;
+    m3(r,c) += 1;
+    VERIFY(! (m1 < m3).all() );
+    VERIFY(! (m1 > m3).all() );
+  }
+  VERIFY(!(m1 > m2 && m1 < m2).any());
+  VERIFY((m1 <= m2 || m1 >= m2).all());
+
+  // comparisons array to scalar
+  VERIFY( (m1 != (m1(r,c)+1) ).any() );
+  VERIFY( (m1 >  (m1(r,c)-1) ).any() );
+  VERIFY( (m1 <  (m1(r,c)+1) ).any() );
+  VERIFY( (m1 ==  m1(r,c)    ).any() );
+
+  // comparisons scalar to array
+  VERIFY( ( (m1(r,c)+1) != m1).any() );
+  VERIFY( ( (m1(r,c)-1) <  m1).any() );
+  VERIFY( ( (m1(r,c)+1) >  m1).any() );
+  VERIFY( (  m1(r,c)    == m1).any() );
+
+  // test Select
+  VERIFY_IS_APPROX( (m1<m2).select(m1,m2), m1.cwiseMin(m2) );
+  VERIFY_IS_APPROX( (m1>m2).select(m1,m2), m1.cwiseMax(m2) );
+  Scalar mid = (m1.cwiseAbs().minCoeff() + m1.cwiseAbs().maxCoeff())/Scalar(2);
+  for (int j=0; j<cols; ++j)
+  for (int i=0; i<rows; ++i)
+    m3(i,j) = abs(m1(i,j))<mid ? 0 : m1(i,j);
+  VERIFY_IS_APPROX( (m1.abs()<ArrayType::Constant(rows,cols,mid))
+                        .select(ArrayType::Zero(rows,cols),m1), m3);
+  // shorter versions:
+  VERIFY_IS_APPROX( (m1.abs()<ArrayType::Constant(rows,cols,mid))
+                        .select(0,m1), m3);
+  VERIFY_IS_APPROX( (m1.abs()>=ArrayType::Constant(rows,cols,mid))
+                        .select(m1,0), m3);
+  // even shorter version:
+  VERIFY_IS_APPROX( (m1.abs()<mid).select(0,m1), m3);
+
+  // count
+  VERIFY(((m1.abs()+1)>RealScalar(0.1)).count() == rows*cols);
+
+  // and/or
+  VERIFY( (m1<RealScalar(0) && m1>RealScalar(0)).count() == 0);
+  VERIFY( (m1<RealScalar(0) || m1>=RealScalar(0)).count() == rows*cols);
+  RealScalar a = m1.abs().mean();
+  VERIFY( (m1<-a || m1>a).count() == (m1.abs()>a).count());
+
+  typedef Array<Index, Dynamic, 1> ArrayOfIndices;
+
+  // TODO allows colwise/rowwise for array
+  VERIFY_IS_APPROX(((m1.abs()+1)>RealScalar(0.1)).colwise().count(), ArrayOfIndices::Constant(cols,rows).transpose());
+  VERIFY_IS_APPROX(((m1.abs()+1)>RealScalar(0.1)).rowwise().count(), ArrayOfIndices::Constant(rows, cols));
+}
+
+template<typename ArrayType> void array_real(const ArrayType& m)
+{
+  using std::abs;
+  using std::sqrt;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+            m2 = ArrayType::Random(rows, cols),
+            m3(rows, cols),
+            m4 = m1;
+
+  m4 = (m4.abs()==Scalar(0)).select(Scalar(1),m4);
+
+  Scalar  s1 = internal::random<Scalar>();
+
+  // these tests are mostly to check possible compilation issues with free-functions.
+  VERIFY_IS_APPROX(m1.sin(), sin(m1));
+  VERIFY_IS_APPROX(m1.cos(), cos(m1));
+  VERIFY_IS_APPROX(m1.tan(), tan(m1));
+  VERIFY_IS_APPROX(m1.asin(), asin(m1));
+  VERIFY_IS_APPROX(m1.acos(), acos(m1));
+  VERIFY_IS_APPROX(m1.atan(), atan(m1));
+  VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
+  VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
+  VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+#if EIGEN_HAS_CXX11_MATH
+  VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
+  VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1)));
+  VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1)));
+#endif
+  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
+
+  VERIFY_IS_APPROX(m1.arg(), arg(m1));
+  VERIFY_IS_APPROX(m1.round(), round(m1));
+  VERIFY_IS_APPROX(m1.rint(), rint(m1));
+  VERIFY_IS_APPROX(m1.floor(), floor(m1));
+  VERIFY_IS_APPROX(m1.ceil(), ceil(m1));
+  VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
+  VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
+  VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
+  VERIFY_IS_APPROX(m4.inverse(), inverse(m4));
+  VERIFY_IS_APPROX(m1.abs(), abs(m1));
+  VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
+  VERIFY_IS_APPROX(m1.square(), square(m1));
+  VERIFY_IS_APPROX(m1.cube(), cube(m1));
+  VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval()));
+  VERIFY_IS_APPROX(m1.sign(), sign(m1));
+  VERIFY((m1.sqrt().sign().isNaN() == (Eigen::isnan)(sign(sqrt(m1)))).all());
+
+  // avoid inf and NaNs so verification doesn't fail
+  m3 = m4.abs();
+  VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
+  VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
+  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
+  VERIFY_IS_APPROX(m3.log(), log(m3));
+  VERIFY_IS_APPROX(m3.log1p(), log1p(m3));
+  VERIFY_IS_APPROX(m3.log10(), log10(m3));
+  VERIFY_IS_APPROX(m3.log2(), log2(m3));
+
+
+  VERIFY((!(m1>m2) == (m1<=m2)).all());
+
+  VERIFY_IS_APPROX(sin(m1.asin()), m1);
+  VERIFY_IS_APPROX(cos(m1.acos()), m1);
+  VERIFY_IS_APPROX(tan(m1.atan()), m1);
+  VERIFY_IS_APPROX(sinh(m1), Scalar(0.5)*(exp(m1)-exp(-m1)));
+  VERIFY_IS_APPROX(cosh(m1), Scalar(0.5)*(exp(m1)+exp(-m1)));
+  VERIFY_IS_APPROX(tanh(m1), (Scalar(0.5)*(exp(m1)-exp(-m1)))/(Scalar(0.5)*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (Scalar(1)/(Scalar(1)+exp(-m1))));
+  VERIFY_IS_APPROX(arg(m1), ((m1<Scalar(0)).template cast<Scalar>())*Scalar(std::acos(Scalar(-1))));
+  VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
+  VERIFY((rint(m1) <= ceil(m1) && rint(m1) >= floor(m1)).all());
+  VERIFY(((ceil(m1) - round(m1)) <= Scalar(0.5) || (round(m1) - floor(m1)) <= Scalar(0.5)).all());
+  VERIFY(((ceil(m1) - round(m1)) <= Scalar(1.0) && (round(m1) - floor(m1)) <= Scalar(1.0)).all());
+  VERIFY(((ceil(m1) - rint(m1)) <= Scalar(0.5) || (rint(m1) - floor(m1)) <= Scalar(0.5)).all());
+  VERIFY(((ceil(m1) - rint(m1)) <= Scalar(1.0) && (rint(m1) - floor(m1)) <= Scalar(1.0)).all());
+  VERIFY((Eigen::isnan)((m1*Scalar(0))/Scalar(0)).all());
+  VERIFY((Eigen::isinf)(m4/Scalar(0)).all());
+  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*Scalar(0)/Scalar(0))) && (!(Eigen::isfinite)(m4/Scalar(0)))).all());
+  VERIFY_IS_APPROX(inverse(inverse(m4)),m4);
+  VERIFY((abs(m1) == m1 || abs(m1) == -m1).all());
+  VERIFY_IS_APPROX(m3, sqrt(abs2(m3)));
+  VERIFY_IS_APPROX(m1.absolute_difference(m2), (m1 > m2).select(m1 - m2, m2 - m1));
+  VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
+  VERIFY_IS_APPROX( m1*m1.sign(),m1.abs());
+  VERIFY_IS_APPROX(m1.sign() * m1.abs(), m1);
+
+  VERIFY_IS_APPROX(numext::abs2(numext::real(m1)) + numext::abs2(numext::imag(m1)), numext::abs2(m1));
+  VERIFY_IS_APPROX(numext::abs2(Eigen::real(m1)) + numext::abs2(Eigen::imag(m1)), numext::abs2(m1));
+  if(!NumTraits<Scalar>::IsComplex)
+    VERIFY_IS_APPROX(numext::real(m1), m1);
+
+  // shift argument of logarithm so that it is not zero
+  Scalar smallNumber = NumTraits<Scalar>::dummy_precision();
+  VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m3) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber + Scalar(1)).log() , log1p(abs(m3) + smallNumber));
+
+  VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
+  VERIFY_IS_APPROX(m1.exp(), exp(m1));
+  VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
+
+  VERIFY_IS_APPROX(m1.expm1(), expm1(m1));
+  VERIFY_IS_APPROX((m3 + smallNumber).exp() - Scalar(1), expm1(abs(m3) + smallNumber));
+
+  VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
+  VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
+
+  VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
+  VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
+
+  // Avoid inf and NaN.
+  m3 = (m1.square()<NumTraits<Scalar>::epsilon()).select(Scalar(1),m3);
+  VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse());
+  pow_test<Scalar>();
+
+  VERIFY_IS_APPROX(log10(m3), log(m3)/numext::log(Scalar(10)));
+  VERIFY_IS_APPROX(log2(m3), log(m3)/numext::log(Scalar(2)));
+
+  // scalar by array division
+  const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
+  s1 += Scalar(tiny);
+  m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
+  VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
+
+  // check inplace transpose
+  m3 = m1;
+  m3.transposeInPlace();
+  VERIFY_IS_APPROX(m3, m1.transpose());
+  m3.transposeInPlace();
+  VERIFY_IS_APPROX(m3, m1);
+}
+
+template<typename ArrayType> void array_complex(const ArrayType& m)
+{
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+            m2(rows, cols),
+            m4 = m1;
+
+  m4.real() = (m4.real().abs()==RealScalar(0)).select(RealScalar(1),m4.real());
+  m4.imag() = (m4.imag().abs()==RealScalar(0)).select(RealScalar(1),m4.imag());
+
+  Array<RealScalar, -1, -1> m3(rows, cols);
+
+  for (Index i = 0; i < m.rows(); ++i)
+    for (Index j = 0; j < m.cols(); ++j)
+      m2(i,j) = sqrt(m1(i,j));
+
+  // these tests are mostly to check possible compilation issues with free-functions.
+  VERIFY_IS_APPROX(m1.sin(), sin(m1));
+  VERIFY_IS_APPROX(m1.cos(), cos(m1));
+  VERIFY_IS_APPROX(m1.tan(), tan(m1));
+  VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
+  VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
+  VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
+  VERIFY_IS_APPROX(m1.arg(), arg(m1));
+  VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
+  VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
+  VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
+  VERIFY_IS_APPROX(m4.inverse(), inverse(m4));
+  VERIFY_IS_APPROX(m1.log(), log(m1));
+  VERIFY_IS_APPROX(m1.log10(), log10(m1));
+  VERIFY_IS_APPROX(m1.log2(), log2(m1));
+  VERIFY_IS_APPROX(m1.abs(), abs(m1));
+  VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
+  VERIFY_IS_APPROX(m1.sqrt(), sqrt(m1));
+  VERIFY_IS_APPROX(m1.square(), square(m1));
+  VERIFY_IS_APPROX(m1.cube(), cube(m1));
+  VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval()));
+  VERIFY_IS_APPROX(m1.sign(), sign(m1));
+
+
+  VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
+  VERIFY_IS_APPROX(m1.exp(), exp(m1));
+  VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
+
+  VERIFY_IS_APPROX(m1.expm1(), expm1(m1));
+  VERIFY_IS_APPROX(expm1(m1), exp(m1) - 1.);
+  // Check for larger magnitude complex numbers that expm1 matches exp - 1.
+  VERIFY_IS_APPROX(expm1(10. * m1), exp(10. * m1) - 1.);
+
+  VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
+  VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
+  VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0 + exp(-m1))));
+
+  for (Index i = 0; i < m.rows(); ++i)
+    for (Index j = 0; j < m.cols(); ++j)
+      m3(i,j) = std::atan2(m1(i,j).imag(), m1(i,j).real());
+  VERIFY_IS_APPROX(arg(m1), m3);
+
+  std::complex<RealScalar> zero(0.0,0.0);
+  VERIFY((Eigen::isnan)(m1*zero/zero).all());
+#if EIGEN_COMP_MSVC
+  // msvc complex division is not robust
+  VERIFY((Eigen::isinf)(m4/RealScalar(0)).all());
+#else
+#if EIGEN_COMP_CLANG
+  // clang's complex division is notoriously broken too
+  if((numext::isinf)(m4(0,0)/RealScalar(0))) {
+#endif
+    VERIFY((Eigen::isinf)(m4/zero).all());
+#if EIGEN_COMP_CLANG
+  }
+  else
+  {
+    VERIFY((Eigen::isinf)(m4.real()/zero.real()).all());
+  }
+#endif
+#endif // MSVC
+
+  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*zero/zero)) && (!(Eigen::isfinite)(m1/zero))).all());
+
+  VERIFY_IS_APPROX(inverse(inverse(m4)),m4);
+  VERIFY_IS_APPROX(conj(m1.conjugate()), m1);
+  VERIFY_IS_APPROX(abs(m1), sqrt(square(m1.real())+square(m1.imag())));
+  VERIFY_IS_APPROX(abs(m1), sqrt(abs2(m1)));
+  VERIFY_IS_APPROX(log10(m1), log(m1)/log(10));
+  VERIFY_IS_APPROX(log2(m1), log(m1)/log(2));
+
+  VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
+  VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1);
+
+  // scalar by array division
+  Scalar  s1 = internal::random<Scalar>();
+  const RealScalar tiny = std::sqrt(std::numeric_limits<RealScalar>::epsilon());
+  s1 += Scalar(tiny);
+  m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
+  VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
+
+  // check inplace transpose
+  m2 = m1;
+  m2.transposeInPlace();
+  VERIFY_IS_APPROX(m2, m1.transpose());
+  m2.transposeInPlace();
+  VERIFY_IS_APPROX(m2, m1);
+  // Check vectorized inplace transpose.
+  ArrayType m5 = ArrayType::Random(131, 131);
+  ArrayType m6 = m5;
+  m6.transposeInPlace();
+  VERIFY_IS_APPROX(m6, m5.transpose());
+}
+
+template<typename ArrayType> void min_max(const ArrayType& m)
+{
+  typedef typename ArrayType::Scalar Scalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ArrayType m1 = ArrayType::Random(rows, cols);
+
+  // min/max with array
+  Scalar maxM1 = m1.maxCoeff();
+  Scalar minM1 = m1.minCoeff();
+
+  VERIFY_IS_APPROX(ArrayType::Constant(rows,cols, minM1), (m1.min)(ArrayType::Constant(rows,cols, minM1)));
+  VERIFY_IS_APPROX(m1, (m1.min)(ArrayType::Constant(rows,cols, maxM1)));
+
+  VERIFY_IS_APPROX(ArrayType::Constant(rows,cols, maxM1), (m1.max)(ArrayType::Constant(rows,cols, maxM1)));
+  VERIFY_IS_APPROX(m1, (m1.max)(ArrayType::Constant(rows,cols, minM1)));
+
+  // min/max with scalar input
+  VERIFY_IS_APPROX(ArrayType::Constant(rows,cols, minM1), (m1.min)( minM1));
+  VERIFY_IS_APPROX(m1, (m1.min)( maxM1));
+
+  VERIFY_IS_APPROX(ArrayType::Constant(rows,cols, maxM1), (m1.max)( maxM1));
+  VERIFY_IS_APPROX(m1, (m1.max)( minM1));
+
+
+  // min/max with various NaN propagation options.
+  if (m1.size() > 1 && !NumTraits<Scalar>::IsInteger) {
+    m1(0,0) = NumTraits<Scalar>::quiet_NaN();
+    maxM1 = m1.template maxCoeff<PropagateNaN>();
+    minM1 = m1.template minCoeff<PropagateNaN>();
+    VERIFY((numext::isnan)(maxM1));
+    VERIFY((numext::isnan)(minM1));
+
+    maxM1 = m1.template maxCoeff<PropagateNumbers>();
+    minM1 = m1.template minCoeff<PropagateNumbers>();
+    VERIFY(!(numext::isnan)(maxM1));
+    VERIFY(!(numext::isnan)(minM1));
+  }
+}
+
+template<int N>
+struct shift_left {
+  template<typename Scalar>
+  Scalar operator()(const Scalar& v) const {
+    return v << N;
+  }
+};
+
+template<int N>
+struct arithmetic_shift_right {
+  template<typename Scalar>
+  Scalar operator()(const Scalar& v) const {
+    return v >> N;
+  }
+};
+
+template<typename ArrayType> void array_integer(const ArrayType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+            m2(rows, cols);
+
+  m2 = m1.template shiftLeft<2>();
+  VERIFY( (m2 == m1.unaryExpr(shift_left<2>())).all() );
+  m2 = m1.template shiftLeft<9>();
+  VERIFY( (m2 == m1.unaryExpr(shift_left<9>())).all() );
+  
+  m2 = m1.template shiftRight<2>();
+  VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<2>())).all() );
+  m2 = m1.template shiftRight<9>();
+  VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() );
+}
+
+EIGEN_DECLARE_TEST(array_cwise)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( array(Array<float, 1, 1>()) );
+    CALL_SUBTEST_2( array(Array22f()) );
+    CALL_SUBTEST_3( array(Array44d()) );
+    CALL_SUBTEST_4( array(ArrayXXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_5( array(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array(Array<Index,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array_integer(Array<Index,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( comparisons(Array<float, 1, 1>()) );
+    CALL_SUBTEST_2( comparisons(Array22f()) );
+    CALL_SUBTEST_3( comparisons(Array44d()) );
+    CALL_SUBTEST_5( comparisons(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( comparisons(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( min_max(Array<float, 1, 1>()) );
+    CALL_SUBTEST_2( min_max(Array22f()) );
+    CALL_SUBTEST_3( min_max(Array44d()) );
+    CALL_SUBTEST_5( min_max(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( min_max(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( array_real(Array<float, 1, 1>()) );
+    CALL_SUBTEST_2( array_real(Array22f()) );
+    CALL_SUBTEST_3( array_real(Array44d()) );
+    CALL_SUBTEST_5( array_real(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( array_real(Array<Eigen::half, 32, 32>()) );
+    CALL_SUBTEST_8( array_real(Array<Eigen::bfloat16, 32, 32>()) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_4( array_complex(ArrayXXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+
+  VERIFY((internal::is_same< internal::global_math_functions_filtering_base<int>::type, int >::value));
+  VERIFY((internal::is_same< internal::global_math_functions_filtering_base<float>::type, float >::value));
+  VERIFY((internal::is_same< internal::global_math_functions_filtering_base<Array2i>::type, ArrayBase<Array2i> >::value));
+  typedef CwiseUnaryOp<internal::scalar_abs_op<double>, ArrayXd > Xpr;
+  VERIFY((internal::is_same< internal::global_math_functions_filtering_base<Xpr>::type,
+                           ArrayBase<Xpr>
+                         >::value));
+}

diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
new file mode 100644
index 0000000..8086b34
--- /dev/null
+++ b/test/array_for_matrix.cpp

@@ -0,0 +1,332 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void array_for_matrix(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVectorType;
+  typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType; 
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+
+  ColVectorType cv1 = ColVectorType::Random(rows);
+  RowVectorType rv1 = RowVectorType::Random(cols);
+  
+  Scalar  s1 = internal::random<Scalar>(),
+          s2 = internal::random<Scalar>();
+          
+  // scalar addition
+  VERIFY_IS_APPROX(m1.array() + s1, s1 + m1.array());
+  VERIFY_IS_APPROX((m1.array() + s1).matrix(), MatrixType::Constant(rows,cols,s1) + m1);
+  VERIFY_IS_APPROX(((m1*Scalar(2)).array() - s2).matrix(), (m1+m1) - MatrixType::Constant(rows,cols,s2) );
+  m3 = m1;
+  m3.array() += s2;
+  VERIFY_IS_APPROX(m3, (m1.array() + s2).matrix());
+  m3 = m1;
+  m3.array() -= s1;
+  VERIFY_IS_APPROX(m3, (m1.array() - s1).matrix());
+
+  // reductions
+  VERIFY_IS_MUCH_SMALLER_THAN(m1.colwise().sum().sum() - m1.sum(), m1.squaredNorm());
+  VERIFY_IS_MUCH_SMALLER_THAN(m1.rowwise().sum().sum() - m1.sum(), m1.squaredNorm());
+  VERIFY_IS_MUCH_SMALLER_THAN(m1.colwise().sum() + m2.colwise().sum() - (m1+m2).colwise().sum(), (m1+m2).squaredNorm());
+  VERIFY_IS_MUCH_SMALLER_THAN(m1.rowwise().sum() - m2.rowwise().sum() - (m1-m2).rowwise().sum(), (m1-m2).squaredNorm());
+  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar,Scalar>()));
+
+  // vector-wise ops
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.colwise() += cv1, m1.colwise() + cv1);
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.colwise() -= cv1, m1.colwise() - cv1);
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.rowwise() += rv1, m1.rowwise() + rv1);
+  m3 = m1;
+  VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
+  
+  // empty objects
+  VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().sum()), RowVectorType::Zero(cols));
+  VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().sum()), ColVectorType::Zero(rows));
+  VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().prod()), RowVectorType::Ones(cols));
+  VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().prod()), ColVectorType::Ones(rows));
+
+  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().sum(), RowVectorType::Zero(cols));
+  VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().sum(), ColVectorType::Zero(rows));
+  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().prod(), RowVectorType::Ones(cols));
+  VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().prod(), ColVectorType::Ones(rows));
+  
+  // verify the const accessors exist
+  const Scalar& ref_m1 = m.matrix().array().coeffRef(0);
+  const Scalar& ref_m2 = m.matrix().array().coeffRef(0,0);
+  const Scalar& ref_a1 = m.array().matrix().coeffRef(0);
+  const Scalar& ref_a2 = m.array().matrix().coeffRef(0,0);
+  VERIFY(&ref_a1 == &ref_m1);
+  VERIFY(&ref_a2 == &ref_m2);
+
+  // Check write accessors:
+  m1.array().coeffRef(0,0) = 1;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(1));
+  m1.array()(0,0) = 2;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(2));
+  m1.array().matrix().coeffRef(0,0) = 3;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(3));
+  m1.array().matrix()(0,0) = 4;
+  VERIFY_IS_APPROX(m1(0,0),Scalar(4));
+}
+
+template<typename MatrixType> void comparisons(const MatrixType& m)
+{
+  using std::abs;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+
+  VERIFY(((m1.array() + Scalar(1)) > m1.array()).all());
+  VERIFY(((m1.array() - Scalar(1)) < m1.array()).all());
+  if (rows*cols>1)
+  {
+    m3 = m1;
+    m3(r,c) += 1;
+    VERIFY(! (m1.array() < m3.array()).all() );
+    VERIFY(! (m1.array() > m3.array()).all() );
+  }
+
+  // comparisons to scalar
+  VERIFY( (m1.array() != (m1(r,c)+1) ).any() );
+  VERIFY( (m1.array() > (m1(r,c)-1) ).any() );
+  VERIFY( (m1.array() < (m1(r,c)+1) ).any() );
+  VERIFY( (m1.array() == m1(r,c) ).any() );
+  VERIFY( m1.cwiseEqual(m1(r,c)).any() );
+
+  // test Select
+  VERIFY_IS_APPROX( (m1.array()<m2.array()).select(m1,m2), m1.cwiseMin(m2) );
+  VERIFY_IS_APPROX( (m1.array()>m2.array()).select(m1,m2), m1.cwiseMax(m2) );
+  Scalar mid = (m1.cwiseAbs().minCoeff() + m1.cwiseAbs().maxCoeff())/Scalar(2);
+  for (int j=0; j<cols; ++j)
+  for (int i=0; i<rows; ++i)
+    m3(i,j) = abs(m1(i,j))<mid ? 0 : m1(i,j);
+  VERIFY_IS_APPROX( (m1.array().abs()<MatrixType::Constant(rows,cols,mid).array())
+                        .select(MatrixType::Zero(rows,cols),m1), m3);
+  // shorter versions:
+  VERIFY_IS_APPROX( (m1.array().abs()<MatrixType::Constant(rows,cols,mid).array())
+                        .select(0,m1), m3);
+  VERIFY_IS_APPROX( (m1.array().abs()>=MatrixType::Constant(rows,cols,mid).array())
+                        .select(m1,0), m3);
+  // even shorter version:
+  VERIFY_IS_APPROX( (m1.array().abs()<mid).select(0,m1), m3);
+
+  // count
+  VERIFY(((m1.array().abs()+1)>RealScalar(0.1)).count() == rows*cols);
+
+  // and/or
+  VERIFY( ((m1.array()<RealScalar(0)).matrix() && (m1.array()>RealScalar(0)).matrix()).count() == 0);
+  VERIFY( ((m1.array()<RealScalar(0)).matrix() || (m1.array()>=RealScalar(0)).matrix()).count() == rows*cols);
+  RealScalar a = m1.cwiseAbs().mean();
+  VERIFY( ((m1.array()<-a).matrix() || (m1.array()>a).matrix()).count() == (m1.cwiseAbs().array()>a).count());
+
+  typedef Matrix<Index, Dynamic, 1> VectorOfIndices;
+
+  // TODO allows colwise/rowwise for array
+  VERIFY_IS_APPROX(((m1.array().abs()+1)>RealScalar(0.1)).matrix().colwise().count(), VectorOfIndices::Constant(cols,rows).transpose());
+  VERIFY_IS_APPROX(((m1.array().abs()+1)>RealScalar(0.1)).matrix().rowwise().count(), VectorOfIndices::Constant(rows, cols));
+}
+
+template<typename VectorType> void lpNorm(const VectorType& v)
+{
+  using std::sqrt;
+  typedef typename VectorType::RealScalar RealScalar;
+  VectorType u = VectorType::Random(v.size());
+
+  if(v.size()==0)
+  {
+    VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<1>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<2>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<5>(), RealScalar(0));
+  }
+  else
+  {
+    VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), u.cwiseAbs().maxCoeff());
+  }
+
+  VERIFY_IS_APPROX(u.template lpNorm<1>(), u.cwiseAbs().sum());
+  VERIFY_IS_APPROX(u.template lpNorm<2>(), sqrt(u.array().abs().square().sum()));
+  VERIFY_IS_APPROX(numext::pow(u.template lpNorm<5>(), typename VectorType::RealScalar(5)), u.array().abs().pow(5).sum());
+}
+
+template<typename MatrixType> void cwise_min_max(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols);
+
+  // min/max with array
+  Scalar maxM1 = m1.maxCoeff();
+  Scalar minM1 = m1.minCoeff();
+
+  VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, minM1), m1.cwiseMin(MatrixType::Constant(rows,cols, minM1)));
+  VERIFY_IS_APPROX(m1, m1.cwiseMin(MatrixType::Constant(rows,cols, maxM1)));
+
+  VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1), m1.cwiseMax(MatrixType::Constant(rows,cols, maxM1)));
+  VERIFY_IS_APPROX(m1, m1.cwiseMax(MatrixType::Constant(rows,cols, minM1)));
+
+  // min/max with scalar input
+  VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, minM1), m1.cwiseMin( minM1));
+  VERIFY_IS_APPROX(m1, m1.cwiseMin(maxM1));
+  VERIFY_IS_APPROX(-m1, (-m1).cwiseMin(-minM1));
+  VERIFY_IS_APPROX(-m1.array(), ((-m1).array().min)( -minM1));
+
+  VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1), m1.cwiseMax( maxM1));
+  VERIFY_IS_APPROX(m1, m1.cwiseMax(minM1));
+  VERIFY_IS_APPROX(-m1, (-m1).cwiseMax(-maxM1));
+  VERIFY_IS_APPROX(-m1.array(), ((-m1).array().max)(-maxM1));
+
+  VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, minM1).array(), (m1.array().min)( minM1));
+  VERIFY_IS_APPROX(m1.array(), (m1.array().min)( maxM1));
+
+  VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1).array(), (m1.array().max)( maxM1));
+  VERIFY_IS_APPROX(m1.array(), (m1.array().max)( minM1));
+
+  // Test NaN propagation for min/max.
+  if (!NumTraits<Scalar>::IsInteger) {
+    m1(0,0) = NumTraits<Scalar>::quiet_NaN();
+    // Elementwise.
+    VERIFY((numext::isnan)(m1.template cwiseMax<PropagateNaN>(MatrixType::Constant(rows,cols, Scalar(1)))(0,0)));
+    VERIFY((numext::isnan)(m1.template cwiseMin<PropagateNaN>(MatrixType::Constant(rows,cols, Scalar(1)))(0,0)));
+    VERIFY(!(numext::isnan)(m1.template cwiseMax<PropagateNumbers>(MatrixType::Constant(rows,cols, Scalar(1)))(0,0)));
+    VERIFY(!(numext::isnan)(m1.template cwiseMin<PropagateNumbers>(MatrixType::Constant(rows,cols, Scalar(1)))(0,0)));
+
+    VERIFY((numext::isnan)(m1.array().template max<PropagateNaN>(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0)));
+    VERIFY((numext::isnan)(m1.array().template min<PropagateNaN>(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0)));
+    VERIFY(!(numext::isnan)(m1.array().template max<PropagateNumbers>(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0)));
+    VERIFY(!(numext::isnan)(m1.array().template min<PropagateNumbers>(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0)));
+
+    // Reductions.
+    VERIFY((numext::isnan)(m1.template maxCoeff<PropagateNaN>()));
+    VERIFY((numext::isnan)(m1.template minCoeff<PropagateNaN>()));
+    if (m1.size() > 1) {
+      VERIFY(!(numext::isnan)(m1.template maxCoeff<PropagateNumbers>()));
+      VERIFY(!(numext::isnan)(m1.template minCoeff<PropagateNumbers>()));
+    } else {
+      VERIFY((numext::isnan)(m1.template maxCoeff<PropagateNumbers>()));
+      VERIFY((numext::isnan)(m1.template minCoeff<PropagateNumbers>()));
+    }
+  }
+}
+
+template<typename MatrixTraits> void resize(const MatrixTraits& t)
+{
+  typedef typename MatrixTraits::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+  typedef Array<Scalar,Dynamic,Dynamic> Array2DType;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  typedef Array<Scalar,Dynamic,1> Array1DType;
+
+  Index rows = t.rows(), cols = t.cols();
+
+  MatrixType m(rows,cols);
+  VectorType v(rows);
+  Array2DType a2(rows,cols);
+  Array1DType a1(rows);
+
+  m.array().resize(rows+1,cols+1);
+  VERIFY(m.rows()==rows+1 && m.cols()==cols+1);
+  a2.matrix().resize(rows+1,cols+1);
+  VERIFY(a2.rows()==rows+1 && a2.cols()==cols+1);
+  v.array().resize(cols);
+  VERIFY(v.size()==cols);
+  a1.matrix().resize(cols);
+  VERIFY(a1.size()==cols);
+}
+
+template<int>
+void regression_bug_654()
+{
+  ArrayXf a = RowVectorXf(3);
+  VectorXf v = Array<float,1,Dynamic>(3);
+}
+
+// Check propagation of LvalueBit through Array/Matrix-Wrapper
+template<int>
+void regrrssion_bug_1410()
+{
+  const Matrix4i M;
+  const Array4i A;
+  ArrayWrapper<const Matrix4i> MA = M.array();
+  MA.row(0);
+  MatrixWrapper<const Array4i> AM = A.matrix();
+  AM.row(0);
+
+  VERIFY((internal::traits<ArrayWrapper<const Matrix4i> >::Flags&LvalueBit)==0);
+  VERIFY((internal::traits<MatrixWrapper<const Array4i> >::Flags&LvalueBit)==0);
+
+  VERIFY((internal::traits<ArrayWrapper<Matrix4i> >::Flags&LvalueBit)==LvalueBit);
+  VERIFY((internal::traits<MatrixWrapper<Array4i> >::Flags&LvalueBit)==LvalueBit);
+}
+
+EIGEN_DECLARE_TEST(array_for_matrix)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( array_for_matrix(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( array_for_matrix(Matrix2f()) );
+    CALL_SUBTEST_3( array_for_matrix(Matrix4d()) );
+    CALL_SUBTEST_4( array_for_matrix(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_5( array_for_matrix(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array_for_matrix(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( comparisons(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( comparisons(Matrix2f()) );
+    CALL_SUBTEST_3( comparisons(Matrix4d()) );
+    CALL_SUBTEST_5( comparisons(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( comparisons(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( cwise_min_max(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( cwise_min_max(Matrix2f()) );
+    CALL_SUBTEST_3( cwise_min_max(Matrix4d()) );
+    CALL_SUBTEST_5( cwise_min_max(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( cwise_min_max(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( lpNorm(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( lpNorm(Vector2f()) );
+    CALL_SUBTEST_7( lpNorm(Vector3d()) );
+    CALL_SUBTEST_8( lpNorm(Vector4f()) );
+    CALL_SUBTEST_5( lpNorm(VectorXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_4( lpNorm(VectorXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  CALL_SUBTEST_5( lpNorm(VectorXf(0)) );
+  CALL_SUBTEST_4( lpNorm(VectorXcf(0)) );
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_4( resize(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_5( resize(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( resize(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  CALL_SUBTEST_6( regression_bug_654<0>() );
+  CALL_SUBTEST_6( regrrssion_bug_1410<0>() );
+}

diff --git a/test/array_of_string.cpp b/test/array_of_string.cpp
new file mode 100644
index 0000000..23e5152
--- /dev/null
+++ b/test/array_of_string.cpp

@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+EIGEN_DECLARE_TEST(array_of_string)
+{
+  typedef Array<std::string,1,Dynamic> ArrayXs;
+  ArrayXs a1(3), a2(3), a3(3), a3ref(3);
+  a1 << "one", "two", "three";
+  a2 << "1", "2", "3";
+  a3ref << "one (1)", "two (2)", "three (3)";
+  std::stringstream s1;
+  s1 << a1;
+  VERIFY_IS_EQUAL(s1.str(), std::string("  one    two  three"));
+  a3 = a1 + std::string(" (") + a2 + std::string(")");
+  VERIFY((a3==a3ref).all());
+
+  a3 = a1;
+  a3 += std::string(" (") + a2 + std::string(")");
+  VERIFY((a3==a3ref).all());
+
+  a1.swap(a3);
+  VERIFY((a1==a3ref).all());
+  VERIFY((a3!=a3ref).all());
+}

diff --git a/test/array_replicate.cpp b/test/array_replicate.cpp
new file mode 100644
index 0000000..057c3c7
--- /dev/null
+++ b/test/array_replicate.cpp

@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void replicate(const MatrixType& m)
+{
+  /* this test covers the following files:
+     Replicate.cpp
+  */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixX;
+  typedef Matrix<Scalar, Dynamic, 1> VectorX;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols);
+
+  VectorType v1 = VectorType::Random(rows);
+
+  MatrixX x1, x2;
+  VectorX vx1;
+
+  int  f1 = internal::random<int>(1,10),
+       f2 = internal::random<int>(1,10);
+
+  x1.resize(rows*f1,cols*f2);
+  for(int j=0; j<f2; j++)
+  for(int i=0; i<f1; i++)
+    x1.block(i*rows,j*cols,rows,cols) = m1;
+  VERIFY_IS_APPROX(x1, m1.replicate(f1,f2));
+
+  x2.resize(2*rows,3*cols);
+  x2 << m2, m2, m2,
+        m2, m2, m2;
+  VERIFY_IS_APPROX(x2, (m2.template replicate<2,3>()));
+  
+  x2.resize(rows,3*cols);
+  x2 << m2, m2, m2;
+  VERIFY_IS_APPROX(x2, (m2.template replicate<1,3>()));
+  
+  vx1.resize(3*rows,cols);
+  vx1 << m2, m2, m2;
+  VERIFY_IS_APPROX(vx1+vx1, vx1+(m2.template replicate<3,1>()));
+  
+  vx1=m2+(m2.colwise().replicate(1));
+  
+  if(m2.cols()==1)
+    VERIFY_IS_APPROX(m2.coeff(0), (m2.template replicate<3,1>().coeff(m2.rows())));
+
+  x2.resize(rows,f1);
+  for (int j=0; j<f1; ++j)
+    x2.col(j) = v1;
+  VERIFY_IS_APPROX(x2, v1.rowwise().replicate(f1));
+
+  vx1.resize(rows*f2);
+  for (int j=0; j<f2; ++j)
+    vx1.segment(j*rows,rows) = v1;
+  VERIFY_IS_APPROX(vx1, v1.colwise().replicate(f2));
+}
+
+EIGEN_DECLARE_TEST(array_replicate)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( replicate(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( replicate(Vector2f()) );
+    CALL_SUBTEST_3( replicate(Vector3d()) );
+    CALL_SUBTEST_4( replicate(Vector4f()) );
+    CALL_SUBTEST_5( replicate(VectorXf(16)) );
+    CALL_SUBTEST_6( replicate(VectorXcd(10)) );
+  }
+}

diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp
new file mode 100644
index 0000000..c77528a
--- /dev/null
+++ b/test/array_reverse.cpp

@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009 Ricard Marxer <email@ricardmarxer.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <iostream>
+
+using namespace std;
+
+template<typename MatrixType> void reverse(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  // this test relies a lot on Random.h, and there's not much more that we can do
+  // to test it, hence I consider that we will have tested Random.h
+  MatrixType m1 = MatrixType::Random(rows, cols), m2;
+  VectorType v1 = VectorType::Random(rows);
+
+  MatrixType m1_r = m1.reverse();
+  // Verify that MatrixBase::reverse() works
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_r(i, j), m1(rows - 1 - i, cols - 1 - j));
+    }
+  }
+
+  Reverse<MatrixType> m1_rd(m1);
+  // Verify that a Reverse default (in both directions) of an expression works
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_rd(i, j), m1(rows - 1 - i, cols - 1 - j));
+    }
+  }
+
+  Reverse<MatrixType, BothDirections> m1_rb(m1);
+  // Verify that a Reverse in both directions of an expression works
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_rb(i, j), m1(rows - 1 - i, cols - 1 - j));
+    }
+  }
+
+  Reverse<MatrixType, Vertical> m1_rv(m1);
+  // Verify that a Reverse in the vertical directions of an expression works
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_rv(i, j), m1(rows - 1 - i, j));
+    }
+  }
+
+  Reverse<MatrixType, Horizontal> m1_rh(m1);
+  // Verify that a Reverse in the horizontal directions of an expression works
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_rh(i, j), m1(i, cols - 1 - j));
+    }
+  }
+
+  VectorType v1_r = v1.reverse();
+  // Verify that a VectorType::reverse() of an expression works
+  for ( int i = 0; i < rows; i++ ) {
+    VERIFY_IS_APPROX(v1_r(i), v1(rows - 1 - i));
+  }
+
+  MatrixType m1_cr = m1.colwise().reverse();
+  // Verify that PartialRedux::reverse() works (for colwise())
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_cr(i, j), m1(rows - 1 - i, j));
+    }
+  }
+
+  MatrixType m1_rr = m1.rowwise().reverse();
+  // Verify that PartialRedux::reverse() works (for rowwise())
+  for ( int i = 0; i < rows; i++ ) {
+    for ( int j = 0; j < cols; j++ ) {
+      VERIFY_IS_APPROX(m1_rr(i, j), m1(i, cols - 1 - j));
+    }
+  }
+
+  Scalar x = internal::random<Scalar>();
+
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  m1.reverse()(r, c) = x;
+  VERIFY_IS_APPROX(x, m1(rows - 1 - r, cols - 1 - c));
+  
+  m2 = m1;
+  m2.reverseInPlace();
+  VERIFY_IS_APPROX(m2,m1.reverse().eval());
+  
+  m2 = m1;
+  m2.col(0).reverseInPlace();
+  VERIFY_IS_APPROX(m2.col(0),m1.col(0).reverse().eval());
+  
+  m2 = m1;
+  m2.row(0).reverseInPlace();
+  VERIFY_IS_APPROX(m2.row(0),m1.row(0).reverse().eval());
+  
+  m2 = m1;
+  m2.rowwise().reverseInPlace();
+  VERIFY_IS_APPROX(m2,m1.rowwise().reverse().eval());
+  
+  m2 = m1;
+  m2.colwise().reverseInPlace();
+  VERIFY_IS_APPROX(m2,m1.colwise().reverse().eval());
+
+  m1.colwise().reverse()(r, c) = x;
+  VERIFY_IS_APPROX(x, m1(rows - 1 - r, c));
+
+  m1.rowwise().reverse()(r, c) = x;
+  VERIFY_IS_APPROX(x, m1(r, cols - 1 - c));
+}
+
+template<int>
+void array_reverse_extra()
+{
+  Vector4f x; x << 1, 2, 3, 4;
+  Vector4f y; y << 4, 3, 2, 1;
+  VERIFY(x.reverse()[1] == 3);
+  VERIFY(x.reverse() == y);
+}
+
+// Simpler version of reverseInPlace leveraging a bug
+// in clang 6/7 with -O2 and AVX or AVX512 enabled.
+// This simpler version ensure that the clang bug is not simply hidden
+// through mis-inlining of reverseInPlace or other minor changes.
+template<typename MatrixType>
+EIGEN_DONT_INLINE
+void bug1684_job1(MatrixType& m1, MatrixType& m2)
+{
+  m2 = m1;
+  m2.col(0).swap(m2.col(3));
+  m2.col(1).swap(m2.col(2));
+}
+
+template<typename MatrixType>
+EIGEN_DONT_INLINE
+void bug1684_job2(MatrixType& m1, MatrixType& m2)
+{
+  m2 = m1; // load m1/m2 in AVX registers
+  m1.col(0) = m2.col(3); // perform 128 bits moves
+  m1.col(1) = m2.col(2);
+  m1.col(2) = m2.col(1);
+  m1.col(3) = m2.col(0);
+}
+
+template<typename MatrixType>
+EIGEN_DONT_INLINE
+void bug1684_job3(MatrixType& m1, MatrixType& m2)
+{
+  m2 = m1;
+  Vector4f tmp;
+  tmp = m2.col(0);
+  m2.col(0) = m2.col(3);
+  m2.col(3) = tmp;
+  tmp = m2.col(1);
+  m2.col(1) = m2.col(2);
+  m2.col(2) = tmp;
+  
+}
+
+template<int>
+void bug1684()
+{
+  Matrix4f m1 = Matrix4f::Random();
+  Matrix4f m2 = Matrix4f::Random();
+  bug1684_job1(m1,m2);
+  VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval());
+  bug1684_job2(m1,m2);
+  VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval());
+  // This one still fail after our swap's workaround,
+  // but I expect users not to implement their own swap.
+  // bug1684_job3(m1,m2);
+  // VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval());
+}
+
+EIGEN_DECLARE_TEST(array_reverse)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( reverse(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( reverse(Matrix2f()) );
+    CALL_SUBTEST_3( reverse(Matrix4f()) );
+    CALL_SUBTEST_4( reverse(Matrix4d()) );
+    CALL_SUBTEST_5( reverse(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( reverse(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( reverse(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) );
+    CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_3( bug1684<0>() );
+  }
+  CALL_SUBTEST_3( array_reverse_extra<0>() );
+}

diff --git a/test/bandmatrix.cpp b/test/bandmatrix.cpp
new file mode 100644
index 0000000..66a1b0d
--- /dev/null
+++ b/test/bandmatrix.cpp

@@ -0,0 +1,71 @@
+// This file is triangularView of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void bandmatrix(const MatrixType& _m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrixType;
+
+  Index rows = _m.rows();
+  Index cols = _m.cols();
+  Index supers = _m.supers();
+  Index subs = _m.subs();
+
+  MatrixType m(rows,cols,supers,subs);
+
+  DenseMatrixType dm1(rows,cols);
+  dm1.setZero();
+
+  m.diagonal().setConstant(123);
+  dm1.diagonal().setConstant(123);
+  for (int i=1; i<=m.supers();++i)
+  {
+    m.diagonal(i).setConstant(static_cast<RealScalar>(i));
+    dm1.diagonal(i).setConstant(static_cast<RealScalar>(i));
+  }
+  for (int i=1; i<=m.subs();++i)
+  {
+    m.diagonal(-i).setConstant(-static_cast<RealScalar>(i));
+    dm1.diagonal(-i).setConstant(-static_cast<RealScalar>(i));
+  }
+  //std::cerr << m.m_data << "\n\n" << m.toDense() << "\n\n" << dm1 << "\n\n\n\n";
+  VERIFY_IS_APPROX(dm1,m.toDenseMatrix());
+
+  for (int i=0; i<cols; ++i)
+  {
+    m.col(i).setConstant(static_cast<RealScalar>(i+1));
+    dm1.col(i).setConstant(static_cast<RealScalar>(i+1));
+  }
+  Index d = (std::min)(rows,cols);
+  Index a = std::max<Index>(0,cols-d-supers);
+  Index b = std::max<Index>(0,rows-d-subs);
+  if(a>0) dm1.block(0,d+supers,rows,a).setZero();
+  dm1.block(0,supers+1,cols-supers-1-a,cols-supers-1-a).template triangularView<Upper>().setZero();
+  dm1.block(subs+1,0,rows-subs-1-b,rows-subs-1-b).template triangularView<Lower>().setZero();
+  if(b>0) dm1.block(d+subs,0,b,cols).setZero();
+  //std::cerr << m.m_data << "\n\n" << m.toDense() << "\n\n" << dm1 << "\n\n";
+  VERIFY_IS_APPROX(dm1,m.toDenseMatrix());
+
+}
+
+using Eigen::internal::BandMatrix;
+
+EIGEN_DECLARE_TEST(bandmatrix)
+{
+  for(int i = 0; i < 10*g_repeat ; i++) {
+    Index rows = internal::random<Index>(1,10);
+    Index cols = internal::random<Index>(1,10);
+    Index sups = internal::random<Index>(0,cols-1);
+    Index subs = internal::random<Index>(0,rows-1);
+    CALL_SUBTEST(bandmatrix(BandMatrix<float>(rows,cols,sups,subs)) );
+  }
+}

diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp
new file mode 100644
index 0000000..4ca607c
--- /dev/null
+++ b/test/basicstuff.cpp

@@ -0,0 +1,356 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+#include "random_without_cast_overflow.h"
+
+template<typename MatrixType> void basicStuff(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  // this test relies a lot on Random.h, and there's not much more that we can do
+  // to test it, hence I consider that we will have tested Random.h
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             mzero = MatrixType::Zero(rows, cols),
+             square = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>::Random(rows, rows);
+  VectorType v1 = VectorType::Random(rows),
+             vzero = VectorType::Zero(rows);
+  SquareMatrixType sm1 = SquareMatrixType::Random(rows,rows), sm2(rows,rows);
+
+  Scalar x = 0;
+  while(x == Scalar(0)) x = internal::random<Scalar>();
+
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  m1.coeffRef(r,c) = x;
+  VERIFY_IS_APPROX(x, m1.coeff(r,c));
+  m1(r,c) = x;
+  VERIFY_IS_APPROX(x, m1(r,c));
+  v1.coeffRef(r) = x;
+  VERIFY_IS_APPROX(x, v1.coeff(r));
+  v1(r) = x;
+  VERIFY_IS_APPROX(x, v1(r));
+  v1[r] = x;
+  VERIFY_IS_APPROX(x, v1[r]);
+
+  // test fetching with various index types.
+  Index r1 = internal::random<Index>(0, numext::mini(Index(127),rows-1));
+  x = v1(static_cast<char>(r1));
+  x = v1(static_cast<signed char>(r1));
+  x = v1(static_cast<unsigned char>(r1));
+  x = v1(static_cast<signed short>(r1));
+  x = v1(static_cast<unsigned short>(r1));
+  x = v1(static_cast<signed int>(r1));
+  x = v1(static_cast<unsigned int>(r1));
+  x = v1(static_cast<signed long>(r1));
+  x = v1(static_cast<unsigned long>(r1));
+#if EIGEN_HAS_CXX11
+  x = v1(static_cast<long long int>(r1));
+  x = v1(static_cast<unsigned long long int>(r1));
+#endif
+
+  VERIFY_IS_APPROX(               v1,    v1);
+  VERIFY_IS_NOT_APPROX(           v1,    2*v1);
+  VERIFY_IS_MUCH_SMALLER_THAN(    vzero, v1);
+  VERIFY_IS_MUCH_SMALLER_THAN(  vzero, v1.squaredNorm());
+  VERIFY_IS_NOT_MUCH_SMALLER_THAN(v1,    v1);
+  VERIFY_IS_APPROX(               vzero, v1-v1);
+  VERIFY_IS_APPROX(               m1,    m1);
+  VERIFY_IS_NOT_APPROX(           m1,    2*m1);
+  VERIFY_IS_MUCH_SMALLER_THAN(    mzero, m1);
+  VERIFY_IS_NOT_MUCH_SMALLER_THAN(m1,    m1);
+  VERIFY_IS_APPROX(               mzero, m1-m1);
+
+  // always test operator() on each read-only expression class,
+  // in order to check const-qualifiers.
+  // indeed, if an expression class (here Zero) is meant to be read-only,
+  // hence has no _write() method, the corresponding MatrixBase method (here zero())
+  // should return a const-qualified object so that it is the const-qualified
+  // operator() that gets called, which in turn calls _read().
+  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows,cols)(r,c), static_cast<Scalar>(1));
+
+  // now test copying a row-vector into a (column-)vector and conversely.
+  square.col(r) = square.row(r).eval();
+  Matrix<Scalar, 1, MatrixType::RowsAtCompileTime> rv(rows);
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> cv(rows);
+  rv = square.row(r);
+  cv = square.col(r);
+
+  VERIFY_IS_APPROX(rv, cv.transpose());
+
+  if(cols!=1 && rows!=1 && MatrixType::SizeAtCompileTime!=Dynamic)
+  {
+    VERIFY_RAISES_ASSERT(m1 = (m2.block(0,0, rows-1, cols-1)));
+  }
+
+  if(cols!=1 && rows!=1)
+  {
+    VERIFY_RAISES_ASSERT(m1[0]);
+    VERIFY_RAISES_ASSERT((m1+m1)[0]);
+  }
+
+  VERIFY_IS_APPROX(m3 = m1,m1);
+  MatrixType m4;
+  VERIFY_IS_APPROX(m4 = m1,m1);
+
+  m3.real() = m1.real();
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(m3).real(), static_cast<const MatrixType&>(m1).real());
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(m3).real(), m1.real());
+
+  // check == / != operators
+  VERIFY(m1==m1);
+  VERIFY(m1!=m2);
+  VERIFY(!(m1==m2));
+  VERIFY(!(m1!=m1));
+  m1 = m2;
+  VERIFY(m1==m2);
+  VERIFY(!(m1!=m2));
+
+  // check automatic transposition
+  sm2.setZero();
+  for(Index i=0;i<rows;++i)
+    sm2.col(i) = sm1.row(i);
+  VERIFY_IS_APPROX(sm2,sm1.transpose());
+
+  sm2.setZero();
+  for(Index i=0;i<rows;++i)
+    sm2.col(i).noalias() = sm1.row(i);
+  VERIFY_IS_APPROX(sm2,sm1.transpose());
+
+  sm2.setZero();
+  for(Index i=0;i<rows;++i)
+    sm2.col(i).noalias() += sm1.row(i);
+  VERIFY_IS_APPROX(sm2,sm1.transpose());
+
+  sm2.setZero();
+  for(Index i=0;i<rows;++i)
+    sm2.col(i).noalias() -= sm1.row(i);
+  VERIFY_IS_APPROX(sm2,-sm1.transpose());
+
+  // check ternary usage
+  {
+    bool b = internal::random<int>(0,10)>5;
+    m3 = b ? m1 : m2;
+    if(b) VERIFY_IS_APPROX(m3,m1);
+    else  VERIFY_IS_APPROX(m3,m2);
+    m3 = b ? -m1 : m2;
+    if(b) VERIFY_IS_APPROX(m3,-m1);
+    else  VERIFY_IS_APPROX(m3,m2);
+    m3 = b ? m1 : -m2;
+    if(b) VERIFY_IS_APPROX(m3,m1);
+    else  VERIFY_IS_APPROX(m3,-m2);
+  }
+}
+
+template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> RealMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>();
+
+  VERIFY(numext::real(s1)==numext::real_ref(s1));
+  VERIFY(numext::imag(s1)==numext::imag_ref(s1));
+  numext::real_ref(s1) = numext::real(s2);
+  numext::imag_ref(s1) = numext::imag(s2);
+  VERIFY(internal::isApprox(s1, s2, NumTraits<RealScalar>::epsilon()));
+  // extended precision in Intel FPUs means that s1 == s2 in the line above is not guaranteed.
+
+  RealMatrixType rm1 = RealMatrixType::Random(rows,cols),
+                 rm2 = RealMatrixType::Random(rows,cols);
+  MatrixType cm(rows,cols);
+  cm.real() = rm1;
+  cm.imag() = rm2;
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(cm).real(), rm1);
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(cm).imag(), rm2);
+  rm1.setZero();
+  rm2.setZero();
+  rm1 = cm.real();
+  rm2 = cm.imag();
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(cm).real(), rm1);
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(cm).imag(), rm2);
+  cm.real().setZero();
+  VERIFY(static_cast<const MatrixType&>(cm).real().isZero());
+  VERIFY(!static_cast<const MatrixType&>(cm).imag().isZero());
+}
+
+template<typename SrcScalar, typename TgtScalar>
+struct casting_test {
+  static void run() {
+    Matrix<SrcScalar,4,4> m;
+    for (int i=0; i<m.rows(); ++i) {
+      for (int j=0; j<m.cols(); ++j) {
+        m(i, j) = internal::random_without_cast_overflow<SrcScalar,TgtScalar>::value();
+      }
+    }
+    Matrix<TgtScalar,4,4> n = m.template cast<TgtScalar>();
+    for (int i=0; i<m.rows(); ++i) {
+      for (int j=0; j<m.cols(); ++j) {
+        VERIFY_IS_APPROX(n(i, j), (internal::cast<SrcScalar,TgtScalar>(m(i, j))));
+      }
+    }
+  }
+};
+
+template<typename SrcScalar, typename EnableIf = void>
+struct casting_test_runner {
+  static void run() {
+    casting_test<SrcScalar, bool>::run();
+    casting_test<SrcScalar, int8_t>::run();
+    casting_test<SrcScalar, uint8_t>::run();
+    casting_test<SrcScalar, int16_t>::run();
+    casting_test<SrcScalar, uint16_t>::run();
+    casting_test<SrcScalar, int32_t>::run();
+    casting_test<SrcScalar, uint32_t>::run();
+#if EIGEN_HAS_CXX11
+    casting_test<SrcScalar, int64_t>::run();
+    casting_test<SrcScalar, uint64_t>::run();
+#endif
+    casting_test<SrcScalar, half>::run();
+    casting_test<SrcScalar, bfloat16>::run();
+    casting_test<SrcScalar, float>::run();
+    casting_test<SrcScalar, double>::run();
+    casting_test<SrcScalar, std::complex<float> >::run();
+    casting_test<SrcScalar, std::complex<double> >::run();
+  }
+};
+
+template<typename SrcScalar>
+struct casting_test_runner<SrcScalar, typename internal::enable_if<(NumTraits<SrcScalar>::IsComplex)>::type>
+{
+  static void run() {
+    // Only a few casts from std::complex<T> are defined.
+    casting_test<SrcScalar, half>::run();
+    casting_test<SrcScalar, bfloat16>::run();
+    casting_test<SrcScalar, std::complex<float> >::run();
+    casting_test<SrcScalar, std::complex<double> >::run();
+  }
+};
+
+void casting_all() {
+  casting_test_runner<bool>::run();
+  casting_test_runner<int8_t>::run();
+  casting_test_runner<uint8_t>::run();
+  casting_test_runner<int16_t>::run();
+  casting_test_runner<uint16_t>::run();
+  casting_test_runner<int32_t>::run();
+  casting_test_runner<uint32_t>::run();
+#if EIGEN_HAS_CXX11
+  casting_test_runner<int64_t>::run();
+  casting_test_runner<uint64_t>::run();
+#endif
+  casting_test_runner<half>::run();
+  casting_test_runner<bfloat16>::run();
+  casting_test_runner<float>::run();
+  casting_test_runner<double>::run();
+  casting_test_runner<std::complex<float> >::run();
+  casting_test_runner<std::complex<double> >::run();
+}
+
+template <typename Scalar>
+void fixedSizeMatrixConstruction()
+{
+  Scalar raw[4];
+  for(int k=0; k<4; ++k)
+    raw[k] = internal::random<Scalar>();
+
+  {
+    Matrix<Scalar,4,1> m(raw);
+    Array<Scalar,4,1> a(raw);
+    for(int k=0; k<4; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<4; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,4,1>(raw[0],raw[1],raw[2],raw[3])));
+    VERIFY((a==(Array<Scalar,4,1>(raw[0],raw[1],raw[2],raw[3]))).all());
+  }
+  {
+    Matrix<Scalar,3,1> m(raw);
+    Array<Scalar,3,1> a(raw);
+    for(int k=0; k<3; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<3; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,3,1>(raw[0],raw[1],raw[2])));
+    VERIFY((a==Array<Scalar,3,1>(raw[0],raw[1],raw[2])).all());
+  }
+  {
+    Matrix<Scalar,2,1> m(raw), m2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) );
+    Array<Scalar,2,1> a(raw),  a2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) );
+    for(int k=0; k<2; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<2; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,2,1>(raw[0],raw[1])));
+    VERIFY((a==Array<Scalar,2,1>(raw[0],raw[1])).all());
+    for(int k=0; k<2; ++k) VERIFY(m2(k) == DenseIndex(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY(a2(k) == DenseIndex(raw[k]));
+  }
+  {
+    Matrix<Scalar,1,2> m(raw),
+                       m2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) ),
+                       m3( (int(raw[0])), (int(raw[1])) ),
+                       m4( (float(raw[0])), (float(raw[1])) );
+    Array<Scalar,1,2> a(raw),  a2( (DenseIndex(raw[0])), (DenseIndex(raw[1])) );
+    for(int k=0; k<2; ++k) VERIFY(m(k) == raw[k]);
+    for(int k=0; k<2; ++k) VERIFY(a(k) == raw[k]);
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,1,2>(raw[0],raw[1])));
+    VERIFY((a==Array<Scalar,1,2>(raw[0],raw[1])).all());
+    for(int k=0; k<2; ++k) VERIFY(m2(k) == DenseIndex(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY(a2(k) == DenseIndex(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY(m3(k) == int(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY((m4(k)) == Scalar(float(raw[k])));
+  }
+  {
+    Matrix<Scalar,1,1> m(raw), m1(raw[0]), m2( (DenseIndex(raw[0])) ), m3( (int(raw[0])) );
+    Array<Scalar,1,1> a(raw), a1(raw[0]), a2( (DenseIndex(raw[0])) );
+    VERIFY(m(0) == raw[0]);
+    VERIFY(a(0) == raw[0]);
+    VERIFY(m1(0) == raw[0]);
+    VERIFY(a1(0) == raw[0]);
+    VERIFY(m2(0) == DenseIndex(raw[0]));
+    VERIFY(a2(0) == DenseIndex(raw[0]));
+    VERIFY(m3(0) == int(raw[0]));
+    VERIFY_IS_EQUAL(m,(Matrix<Scalar,1,1>(raw[0])));
+    VERIFY((a==Array<Scalar,1,1>(raw[0])).all());
+  }
+}
+
+EIGEN_DECLARE_TEST(basicstuff)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( basicStuff(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( basicStuff(Matrix4d()) );
+    CALL_SUBTEST_3( basicStuff(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_4( basicStuff(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_5( basicStuff(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( basicStuff(Matrix<float, 100, 100>()) );
+    CALL_SUBTEST_7( basicStuff(Matrix<long double,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_8( casting_all() );
+
+    CALL_SUBTEST_3( basicStuffComplex(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_5( basicStuffComplex(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<unsigned char>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<float>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<double>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<int>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<long int>());
+  CALL_SUBTEST_1(fixedSizeMatrixConstruction<std::ptrdiff_t>());
+}

diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
new file mode 100644
index 0000000..4130377
--- /dev/null
+++ b/test/bdcsvd.cpp

@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
+
+// discard stack allocation as that too bypasses malloc
+#define EIGEN_STACK_ALLOCATION_LIMIT 0
+#define EIGEN_RUNTIME_NO_MALLOC
+
+#include "main.h"
+#include <Eigen/SVD>
+#include <iostream>
+#include <Eigen/LU>
+
+
+#define SVD_DEFAULT(M) BDCSVD<M>
+#define SVD_FOR_MIN_NORM(M) BDCSVD<M>
+#include "svd_common.h"
+
+// Check all variants of JacobiSVD
+template<typename MatrixType>
+void bdcsvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
+{
+  MatrixType m;
+  if(pickrandom) {
+    m.resizeLike(a);
+    svd_fill_random(m);
+  }
+  else
+    m = a;
+
+  CALL_SUBTEST(( svd_test_all_computation_options<BDCSVD<MatrixType> >(m, false)  ));
+}
+
+template<typename MatrixType>
+void bdcsvd_method()
+{
+  enum { Size = MatrixType::RowsAtCompileTime };
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<RealScalar, Size, 1> RealVecType;
+  MatrixType m = MatrixType::Identity();
+  VERIFY_IS_APPROX(m.bdcSvd().singularValues(), RealVecType::Ones());
+  VERIFY_RAISES_ASSERT(m.bdcSvd().matrixU());
+  VERIFY_RAISES_ASSERT(m.bdcSvd().matrixV());
+  VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).solve(m), m);
+  VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m);
+  VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
+}
+
+// Compare the Singular values returned with Jacobi and Bdc.
+template<typename MatrixType> 
+void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0, int algoswap = 16, bool random = true)
+{
+  MatrixType m = random ? MatrixType::Random(a.rows(), a.cols()) : a;
+
+  BDCSVD<MatrixType> bdc_svd(m.rows(), m.cols(), computationOptions);
+  bdc_svd.setSwitchSize(algoswap);
+  bdc_svd.compute(m);
+
+  JacobiSVD<MatrixType> jacobi_svd(m);
+  VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues());
+
+  if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
+  if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU());
+  if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
+  if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
+}
+
+// Verifies total deflation is **not** triggered.
+void compare_bdc_jacobi_instance(bool structure_as_m, int algoswap = 16)
+{
+  MatrixXd m(4, 3);
+  if (structure_as_m) {
+    // The first 3 rows are the reduced form of Matrix 1 as shown below, and it
+    // has nonzero elements in the first column and diagonals only.
+    m << 1.056293, 0, 0,
+         -0.336468, 0.907359, 0,
+         -1.566245, 0, 0.149150,
+         -0.1, 0, 0;
+  } else {
+    // Matrix 1.
+    m << 0.882336, 18.3914, -26.7921,
+         -5.58135, 17.1931, -24.0892,
+         -20.794, 8.68496, -4.83103,
+         -8.4981, -10.5451, 23.9072;
+  }
+  compare_bdc_jacobi(m, 0, algoswap, false);
+}
+
+EIGEN_DECLARE_TEST(bdcsvd)
+{
+  CALL_SUBTEST_3(( svd_verify_assert<BDCSVD<Matrix3f>  >(Matrix3f()) ));
+  CALL_SUBTEST_4(( svd_verify_assert<BDCSVD<Matrix4d>  >(Matrix4d()) ));
+  CALL_SUBTEST_7(( svd_verify_assert<BDCSVD<MatrixXf>  >(MatrixXf(10,12)) ));
+  CALL_SUBTEST_8(( svd_verify_assert<BDCSVD<MatrixXcd> >(MatrixXcd(7,5)) ));
+  
+  CALL_SUBTEST_101(( svd_all_trivial_2x2(bdcsvd<Matrix2cd>) ));
+  CALL_SUBTEST_102(( svd_all_trivial_2x2(bdcsvd<Matrix2d>) ));
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_3(( bdcsvd<Matrix3f>() ));
+    CALL_SUBTEST_4(( bdcsvd<Matrix4d>() ));
+    CALL_SUBTEST_5(( bdcsvd<Matrix<float,3,5> >() ));
+
+    int r = internal::random<int>(1, EIGEN_TEST_MAX_SIZE/2),
+        c = internal::random<int>(1, EIGEN_TEST_MAX_SIZE/2);
+    
+    TEST_SET_BUT_UNUSED_VARIABLE(r)
+    TEST_SET_BUT_UNUSED_VARIABLE(c)
+    
+    CALL_SUBTEST_6((  bdcsvd(Matrix<double,Dynamic,2>(r,2)) ));
+    CALL_SUBTEST_7((  bdcsvd(MatrixXf(r,c)) ));
+    CALL_SUBTEST_7((  compare_bdc_jacobi(MatrixXf(r,c)) ));
+    CALL_SUBTEST_10(( bdcsvd(MatrixXd(r,c)) ));
+    CALL_SUBTEST_10(( compare_bdc_jacobi(MatrixXd(r,c)) ));
+    CALL_SUBTEST_8((  bdcsvd(MatrixXcd(r,c)) ));
+    CALL_SUBTEST_8((  compare_bdc_jacobi(MatrixXcd(r,c)) ));
+
+    // Test on inf/nan matrix
+    CALL_SUBTEST_7(  (svd_inf_nan<BDCSVD<MatrixXf>, MatrixXf>()) );
+    CALL_SUBTEST_10( (svd_inf_nan<BDCSVD<MatrixXd>, MatrixXd>()) );
+  }
+
+  // test matrixbase method
+  CALL_SUBTEST_1(( bdcsvd_method<Matrix2cd>() ));
+  CALL_SUBTEST_3(( bdcsvd_method<Matrix3f>() ));
+
+  // Test problem size constructors
+  CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
+
+  // Check that preallocation avoids subsequent mallocs
+  // Disabled because not supported by BDCSVD
+  // CALL_SUBTEST_9( svd_preallocate<void>() );
+
+  CALL_SUBTEST_2( svd_underoverflow<void>() );
+
+  // Without total deflation issues.
+  CALL_SUBTEST_11((  compare_bdc_jacobi_instance(true) ));
+  CALL_SUBTEST_12((  compare_bdc_jacobi_instance(false) ));
+
+  // With total deflation issues before, when it shouldn't be triggered.
+  CALL_SUBTEST_13((  compare_bdc_jacobi_instance(true, 3) ));
+  CALL_SUBTEST_14((  compare_bdc_jacobi_instance(false, 3) ));
+}
+

diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp
new file mode 100644
index 0000000..c3de0b1
--- /dev/null
+++ b/test/bfloat16_float.cpp

@@ -0,0 +1,378 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+#include <memory>
+#include <math.h>
+
+#include "main.h"
+
+#include <Eigen/src/Core/arch/Default/BFloat16.h>
+
+#define VERIFY_BFLOAT16_BITS_EQUAL(h, bits) \
+  VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
+
+// Make sure it's possible to forward declare Eigen::bfloat16
+namespace Eigen {
+struct bfloat16;
+}
+
+using Eigen::bfloat16;
+
+float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
+                    uint32_t low_mantissa) {
+  float dest;
+  uint32_t src = (sign << 31) + (exponent << 23) + (high_mantissa << 16) + low_mantissa;
+  memcpy(static_cast<void*>(&dest),
+         static_cast<const void*>(&src), sizeof(dest));
+  return dest;
+}
+
+template<typename T>
+ void test_roundtrip() {
+  // Representable T round trip via bfloat16
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(-std::numeric_limits<T>::infinity()))), -std::numeric_limits<T>::infinity());
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(std::numeric_limits<T>::infinity()))), std::numeric_limits<T>::infinity());
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-1.0)))), T(-1.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-0.5)))), T(-0.5));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-0.0)))), T(-0.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(1.0)))), T(1.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(0.5)))), T(0.5));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(0.0)))), T(0.0));
+}
+
+void test_conversion()
+{
+  using Eigen::bfloat16_impl::__bfloat16_raw;
+
+  // Round-trip casts
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(1.0f))),
+    bfloat16(1.0f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(0.5f))),
+    bfloat16(0.5f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(-0.33333f))),
+    bfloat16(-0.33333f));
+   VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(0.0f))),
+    bfloat16(0.0f));
+
+  // Conversion from float.
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(1.0f), 0x3f80);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f), 0x3f00);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.33333f), 0x3eab);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3.38e38f), 0x7f7e);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3.40e38f), 0x7f80);  // Becomes infinity.
+
+  // Verify round-to-nearest-even behavior.
+  float val1 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c00)));
+  float val2 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c01)));
+  float val3 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c02)));
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f * (val1 + val2)), 0x3c00);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f * (val2 + val3)), 0x3c02);
+
+  // Conversion from int.
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-1), 0xbf80);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0), 0x0000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(1), 0x3f80);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(2), 0x4000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3), 0x4040);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(12), 0x4140);
+
+  // Conversion from bool.
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(false), 0x0000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(true), 0x3f80);
+
+  // Conversion to bool
+  VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(3)), true);
+  VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(0.33333f)), true);
+  VERIFY_IS_EQUAL(bfloat16(-0.0), false);
+  VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(0.0)), false);
+
+  // Explicit conversion to float.
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(__bfloat16_raw(0x0000))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(__bfloat16_raw(0x3f80))), 1.0f);
+
+  // Implicit conversion to float
+  VERIFY_IS_EQUAL(bfloat16(__bfloat16_raw(0x0000)), 0.0f);
+  VERIFY_IS_EQUAL(bfloat16(__bfloat16_raw(0x3f80)), 1.0f);
+
+  // Zero representations
+  VERIFY_IS_EQUAL(bfloat16(0.0f), bfloat16(0.0f));
+  VERIFY_IS_EQUAL(bfloat16(-0.0f), bfloat16(0.0f));
+  VERIFY_IS_EQUAL(bfloat16(-0.0f), bfloat16(-0.0f));
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000);
+
+  // Default is zero
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16()), 0.0f);
+
+  // Representable floats round trip via bfloat16
+  test_roundtrip<float>();
+  test_roundtrip<double>();
+  test_roundtrip<std::complex<float> >();
+  test_roundtrip<std::complex<double> >();
+
+  // Conversion
+  Array<float,1,100> a;
+  for (int i = 0; i < 100; i++) a(i) = i + 1.25;
+  Array<bfloat16,1,100> b = a.cast<bfloat16>();
+  Array<float,1,100> c = b.cast<float>();
+  for (int i = 0; i < 100; ++i) {
+    VERIFY_LE(numext::abs(c(i) - a(i)), a(i) / 128);
+  }
+
+  // Epsilon
+  VERIFY_LE(1.0f, static_cast<float>((std::numeric_limits<bfloat16>::epsilon)() + bfloat16(1.0f)));
+  VERIFY_IS_EQUAL(1.0f, static_cast<float>((std::numeric_limits<bfloat16>::epsilon)() / bfloat16(2.0f) + bfloat16(1.0f)));
+
+  // Negate
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(3.0f)), -3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(-4.5f)), 4.5f);
+
+
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(static_cast<float>(bfloat16(0.0 / 0.0))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(1.0 / 0.0))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(-1.0 / 0.0))));
+
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(bfloat16(0.0 / 0.0)));
+  VERIFY((numext::isinf)(bfloat16(1.0 / 0.0)));
+  VERIFY((numext::isinf)(bfloat16(-1.0 / 0.0)));
+#endif
+
+  // NaNs and infinities.
+  VERIFY(!(numext::isinf)(static_cast<float>(bfloat16(3.38e38f))));  // Largest finite number.
+  VERIFY(!(numext::isnan)(static_cast<float>(bfloat16(0.0f))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(__bfloat16_raw(0xff80)))));
+  VERIFY((numext::isnan)(static_cast<float>(bfloat16(__bfloat16_raw(0xffc0)))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(__bfloat16_raw(0x7f80)))));
+  VERIFY((numext::isnan)(static_cast<float>(bfloat16(__bfloat16_raw(0x7fc0)))));
+
+  // Exactly same checks as above, just directly on the bfloat16 representation.
+  VERIFY(!(numext::isinf)(bfloat16(__bfloat16_raw(0x7bff))));
+  VERIFY(!(numext::isnan)(bfloat16(__bfloat16_raw(0x0000))));
+  VERIFY((numext::isinf)(bfloat16(__bfloat16_raw(0xff80))));
+  VERIFY((numext::isnan)(bfloat16(__bfloat16_raw(0xffc0))));
+  VERIFY((numext::isinf)(bfloat16(__bfloat16_raw(0x7f80))));
+  VERIFY((numext::isnan)(bfloat16(__bfloat16_raw(0x7fc0))));
+
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0);
+}
+
+void test_numtraits()
+{
+  std::cout << "epsilon       = " << NumTraits<bfloat16>::epsilon() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::epsilon()) << ")" << std::endl;
+  std::cout << "highest       = " << NumTraits<bfloat16>::highest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::highest()) << ")" << std::endl;
+  std::cout << "lowest        = " << NumTraits<bfloat16>::lowest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::lowest()) << ")" << std::endl;
+  std::cout << "min           = " << (std::numeric_limits<bfloat16>::min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>((std::numeric_limits<bfloat16>::min)()) << ")" << std::endl;
+  std::cout << "denorm min    = " << (std::numeric_limits<bfloat16>::denorm_min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>((std::numeric_limits<bfloat16>::denorm_min)()) << ")" << std::endl;
+  std::cout << "infinity      = " << NumTraits<bfloat16>::infinity() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::infinity()) << ")" << std::endl;
+  std::cout << "quiet nan     = " << NumTraits<bfloat16>::quiet_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::quiet_NaN()) << ")" << std::endl;
+  std::cout << "signaling nan = " << std::numeric_limits<bfloat16>::signaling_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::signaling_NaN()) << ")" << std::endl;
+
+  VERIFY(NumTraits<bfloat16>::IsSigned);
+
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::infinity()),
+    numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::infinity())) );
+  // There is no guarantee that casting a 32-bit NaN to bfloat16 has a precise
+  // bit pattern.  We test that it is in fact a NaN, then test the signaling
+  // bit (msb of significand is 1 for quiet, 0 for signaling).
+  const numext::uint16_t BFLOAT16_QUIET_BIT = 0x0040;
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<bfloat16>::quiet_NaN())
+    && (numext::isnan)(bfloat16(std::numeric_limits<float>::quiet_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::quiet_NaN()) & BFLOAT16_QUIET_BIT) > 0)
+    && ((numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::quiet_NaN())) & BFLOAT16_QUIET_BIT) > 0) );
+  // After a cast to bfloat16, a signaling NaN may become non-signaling. Thus,
+  // we check that both are NaN, and that only the `numeric_limits` version is
+  // signaling.
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<bfloat16>::signaling_NaN())
+    && (numext::isnan)(bfloat16(std::numeric_limits<float>::signaling_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::signaling_NaN()) & BFLOAT16_QUIET_BIT) == 0) );
+
+  VERIFY( (std::numeric_limits<bfloat16>::min)() > bfloat16(0.f) );
+  VERIFY( (std::numeric_limits<bfloat16>::denorm_min)() > bfloat16(0.f) );
+  VERIFY_IS_EQUAL( (std::numeric_limits<bfloat16>::denorm_min)()/bfloat16(2), bfloat16(0.f) );
+}
+
+void test_arithmetic()
+{
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(2) + bfloat16(2)), 4);
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(2) + bfloat16(-2)), 0);
+  VERIFY_IS_APPROX(static_cast<float>(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(2.0f) * bfloat16(-5.5f)), -11.0f);
+  VERIFY_IS_APPROX(static_cast<float>(bfloat16(1.0f) / bfloat16(3.0f)), 0.3339f);
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(4096.0f)), -4096.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(-4096.0f)), 4096.0f);
+}
+
+void test_comparison()
+{
+  VERIFY(bfloat16(1.0f) > bfloat16(0.5f));
+  VERIFY(bfloat16(0.5f) < bfloat16(1.0f));
+  VERIFY(!(bfloat16(1.0f) < bfloat16(0.5f)));
+  VERIFY(!(bfloat16(0.5f) > bfloat16(1.0f)));
+
+  VERIFY(!(bfloat16(4.0f) > bfloat16(4.0f)));
+  VERIFY(!(bfloat16(4.0f) < bfloat16(4.0f)));
+
+  VERIFY(!(bfloat16(0.0f) < bfloat16(-0.0f)));
+  VERIFY(!(bfloat16(-0.0f) < bfloat16(0.0f)));
+  VERIFY(!(bfloat16(0.0f) > bfloat16(-0.0f)));
+  VERIFY(!(bfloat16(-0.0f) > bfloat16(0.0f)));
+
+  VERIFY(bfloat16(0.2f) > bfloat16(-1.0f));
+  VERIFY(bfloat16(-1.0f) < bfloat16(0.2f));
+  VERIFY(bfloat16(-16.0f) < bfloat16(-15.0f));
+
+  VERIFY(bfloat16(1.0f) == bfloat16(1.0f));
+  VERIFY(bfloat16(1.0f) != bfloat16(2.0f));
+
+  // Comparisons with NaNs and infinities.
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY(!(bfloat16(0.0 / 0.0) == bfloat16(0.0 / 0.0)));
+  VERIFY(bfloat16(0.0 / 0.0) != bfloat16(0.0 / 0.0));
+
+  VERIFY(!(bfloat16(1.0) == bfloat16(0.0 / 0.0)));
+  VERIFY(!(bfloat16(1.0) < bfloat16(0.0 / 0.0)));
+  VERIFY(!(bfloat16(1.0) > bfloat16(0.0 / 0.0)));
+  VERIFY(bfloat16(1.0) != bfloat16(0.0 / 0.0));
+
+  VERIFY(bfloat16(1.0) < bfloat16(1.0 / 0.0));
+  VERIFY(bfloat16(1.0) > bfloat16(-1.0 / 0.0));
+#endif
+}
+
+void test_basic_functions()
+{
+  VERIFY_IS_EQUAL(static_cast<float>(numext::abs(bfloat16(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(static_cast<float>(abs(bfloat16(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(static_cast<float>(numext::abs(bfloat16(-3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(static_cast<float>(abs(bfloat16(-3.5f))), 3.5f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::floor(bfloat16(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(floor(bfloat16(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(numext::floor(bfloat16(-3.5f))), -4.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(floor(bfloat16(-3.5f))), -4.0f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::ceil(bfloat16(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(ceil(bfloat16(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(numext::ceil(bfloat16(-3.5f))), -3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(ceil(bfloat16(-3.5f))), -3.0f);
+
+  VERIFY_IS_APPROX(static_cast<float>(numext::sqrt(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(sqrt(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::sqrt(bfloat16(4.0f))), 2.0f);
+  VERIFY_IS_APPROX(static_cast<float>(sqrt(bfloat16(4.0f))), 2.0f);
+
+  VERIFY_IS_APPROX(static_cast<float>(numext::pow(bfloat16(0.0f), bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(pow(bfloat16(0.0f), bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::pow(bfloat16(2.0f), bfloat16(2.0f))), 4.0f);
+  VERIFY_IS_APPROX(static_cast<float>(pow(bfloat16(2.0f), bfloat16(2.0f))), 4.0f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::exp(bfloat16(0.0f))), 1.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(exp(bfloat16(0.0f))), 1.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::exp(bfloat16(EIGEN_PI))), 20.f + static_cast<float>(EIGEN_PI));
+  VERIFY_IS_APPROX(static_cast<float>(exp(bfloat16(EIGEN_PI))), 20.f + static_cast<float>(EIGEN_PI));
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::expm1(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(expm1(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::expm1(bfloat16(2.0f))), 6.375f);
+  VERIFY_IS_APPROX(static_cast<float>(expm1(bfloat16(2.0f))), 6.375f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::log(bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(log(bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::log(bfloat16(10.0f))), 2.296875f);
+  VERIFY_IS_APPROX(static_cast<float>(log(bfloat16(10.0f))), 2.296875f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::log1p(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(log1p(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::log1p(bfloat16(10.0f))), 2.390625f);
+  VERIFY_IS_APPROX(static_cast<float>(log1p(bfloat16(10.0f))), 2.390625f);
+}
+
+void test_trigonometric_functions()
+{
+  VERIFY_IS_APPROX(numext::cos(bfloat16(0.0f)), bfloat16(cosf(0.0f)));
+  VERIFY_IS_APPROX(cos(bfloat16(0.0f)), bfloat16(cosf(0.0f)));
+  VERIFY_IS_APPROX(numext::cos(bfloat16(EIGEN_PI)), bfloat16(cosf(EIGEN_PI)));
+  // VERIFY_IS_APPROX(numext::cos(bfloat16(EIGEN_PI/2)), bfloat16(cosf(EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::cos(bfloat16(3*EIGEN_PI/2)), bfloat16(cosf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::cos(bfloat16(3.5f)), bfloat16(cosf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::sin(bfloat16(0.0f)), bfloat16(sinf(0.0f)));
+  VERIFY_IS_APPROX(sin(bfloat16(0.0f)), bfloat16(sinf(0.0f)));
+  // VERIFY_IS_APPROX(numext::sin(bfloat16(EIGEN_PI)), bfloat16(sinf(EIGEN_PI)));
+  VERIFY_IS_APPROX(numext::sin(bfloat16(EIGEN_PI/2)), bfloat16(sinf(EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(bfloat16(3*EIGEN_PI/2)), bfloat16(sinf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(bfloat16(3.5f)), bfloat16(sinf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::tan(bfloat16(0.0f)), bfloat16(tanf(0.0f)));
+  VERIFY_IS_APPROX(tan(bfloat16(0.0f)), bfloat16(tanf(0.0f)));
+  // VERIFY_IS_APPROX(numext::tan(bfloat16(EIGEN_PI)), bfloat16(tanf(EIGEN_PI)));
+  // VERIFY_IS_APPROX(numext::tan(bfloat16(EIGEN_PI/2)), bfloat16(tanf(EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::tan(bfloat16(3*EIGEN_PI/2)), bfloat16(tanf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::tan(bfloat16(3.5f)), bfloat16(tanf(3.5f)));
+}
+
+void test_array()
+{
+  typedef Array<bfloat16,1,Dynamic> ArrayXh;
+  Index size = internal::random<Index>(1,10);
+  Index i = internal::random<Index>(0,size-1);
+  ArrayXh a1 = ArrayXh::Random(size), a2 = ArrayXh::Random(size);
+  VERIFY_IS_APPROX( a1+a1, bfloat16(2)*a1 );
+  VERIFY( (a1.abs() >= bfloat16(0)).all() );
+  VERIFY_IS_APPROX( (a1*a1).sqrt(), a1.abs() );
+
+  VERIFY( ((a1.min)(a2) <= (a1.max)(a2)).all() );
+  a1(i) = bfloat16(-10.);
+  VERIFY_IS_EQUAL( a1.minCoeff(), bfloat16(-10.) );
+  a1(i) = bfloat16(10.);
+  VERIFY_IS_EQUAL( a1.maxCoeff(), bfloat16(10.) );
+
+  std::stringstream ss;
+  ss << a1;
+}
+
+void test_product()
+{
+  typedef Matrix<bfloat16,Dynamic,Dynamic> MatrixXh;
+  Index rows  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index cols  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  MatrixXh Ah = MatrixXh::Random(rows,depth);
+  MatrixXh Bh = MatrixXh::Random(depth,cols);
+  MatrixXh Ch = MatrixXh::Random(rows,cols);
+  MatrixXf Af = Ah.cast<float>();
+  MatrixXf Bf = Bh.cast<float>();
+  MatrixXf Cf = Ch.cast<float>();
+  VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<bfloat16>());
+}
+
+EIGEN_DECLARE_TEST(bfloat16_float)
+{
+  CALL_SUBTEST(test_numtraits());
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(test_conversion());
+    CALL_SUBTEST(test_arithmetic());
+    CALL_SUBTEST(test_comparison());
+    CALL_SUBTEST(test_basic_functions());
+    CALL_SUBTEST(test_trigonometric_functions());
+    CALL_SUBTEST(test_array());
+    CALL_SUBTEST(test_product());
+  }
+}

diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
new file mode 100644
index 0000000..59c4b50
--- /dev/null
+++ b/test/bicgstab.cpp

@@ -0,0 +1,34 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+
+template<typename T, typename I_> void test_bicgstab_T()
+{
+  BiCGSTAB<SparseMatrix<T,0,I_>, DiagonalPreconditioner<T> >     bicgstab_colmajor_diag;
+  BiCGSTAB<SparseMatrix<T,0,I_>, IdentityPreconditioner    >     bicgstab_colmajor_I;
+  BiCGSTAB<SparseMatrix<T,0,I_>, IncompleteLUT<T,I_> >              bicgstab_colmajor_ilut;
+  //BiCGSTAB<SparseMatrix<T>, SSORPreconditioner<T> >     bicgstab_colmajor_ssor;
+
+  bicgstab_colmajor_diag.setTolerance(NumTraits<T>::epsilon()*4);
+  bicgstab_colmajor_ilut.setTolerance(NumTraits<T>::epsilon()*4);
+  
+  CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_diag)  );
+//   CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_I)     );
+  CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ilut)     );
+  //CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ssor)     );
+}
+
+EIGEN_DECLARE_TEST(bicgstab)
+{
+  CALL_SUBTEST_1((test_bicgstab_T<double,int>()) );
+  CALL_SUBTEST_2((test_bicgstab_T<std::complex<double>, int>()));
+  CALL_SUBTEST_3((test_bicgstab_T<double,long int>()));
+}

diff --git a/test/blasutil.cpp b/test/blasutil.cpp
new file mode 100644
index 0000000..845a498
--- /dev/null
+++ b/test/blasutil.cpp

@@ -0,0 +1,210 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino <everton.constantino@ibm.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
+
+#include "main.h"
+
+// Disable "ignoring attributes on template argument"
+// for packet_traits<Packet*>
+// => The only workaround would be to wrap _m128 and the likes
+//    within wrappers.
+#if EIGEN_GNUC_AT_LEAST(6,0)
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
+#define GET(i,j) (StorageOrder == RowMajor ? (i)*stride + (j) : (i) + (j)*stride)
+#define SCATTER(i,j,k) (StorageOrder == RowMajor ? ((i)+(k))*stride + (j) : (i) + ((j)+(k))*stride)
+
+template<typename Scalar, typename Packet>
+void compare(const Packet& a, const Packet& b)
+{
+    int pktsz = internal::packet_traits<Scalar>::size;
+    Scalar *buffA = new Scalar[pktsz];
+    Scalar *buffB = new Scalar[pktsz];
+
+    internal::pstoreu<Scalar, Packet>(buffA, a);
+    internal::pstoreu<Scalar, Packet>(buffB, b);
+
+    for(int i = 0; i < pktsz; i++)
+    {
+        VERIFY_IS_EQUAL(buffA[i], buffB[i]);
+    }
+
+    delete[] buffA;
+    delete[] buffB;
+}
+
+template<typename Scalar, int StorageOrder, int n>
+struct PacketBlockSet
+{
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    void setPacketBlock(internal::PacketBlock<Packet,n>& block, Scalar value)
+    {
+        for(int idx = 0; idx < n; idx++)
+        {
+            block.packet[idx] = internal::pset1<Packet>(value);
+        }
+    }
+
+    void comparePacketBlock(Scalar *data, int i, int j, int stride, internal::PacketBlock<Packet, n>& block)
+    {
+        for(int idx = 0; idx < n; idx++)
+        {
+            Packet line = internal::ploadu<Packet>(data + SCATTER(i,j,idx));
+            compare<Scalar, Packet>(block.packet[idx], line);
+        }
+    }
+};
+
+template<typename Scalar, int StorageOrder, int BlockSize>
+void run_bdmp_spec_1()
+{
+    typedef internal::blas_data_mapper<Scalar, int, StorageOrder> BlasDataMapper;
+    int packetSize = internal::packet_traits<Scalar>::size;
+    int minSize = std::max<int>(packetSize, BlockSize);
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    int szm = internal::random<int>(minSize,500), szn = internal::random<int>(minSize,500);
+    int stride = StorageOrder == RowMajor ? szn : szm;
+    Scalar *d = new Scalar[szn*szm];
+
+    // Initializing with random entries
+    for(int i = 0; i < szm*szn; i++)
+    {
+        d[i] = internal::random<Scalar>(static_cast<Scalar>(3), static_cast<Scalar>(10));
+    }
+
+    BlasDataMapper bdm(d, stride);
+
+    // Testing operator()
+    for(int i = 0; i < szm; i++)
+    {
+        for(int j = 0; j < szn; j++)
+        {
+            VERIFY_IS_EQUAL(d[GET(i,j)], bdm(i,j));
+        }
+    }
+
+    // Testing getSubMapper and getLinearMapper
+    int i0 = internal::random<int>(0,szm-2);
+    int j0 = internal::random<int>(0,szn-2);
+    for(int i = i0; i < szm; i++)
+    {
+        for(int j = j0; j < szn; j++)
+        {
+            const BlasDataMapper& bdmSM = bdm.getSubMapper(i0,j0);
+            const internal::BlasLinearMapper<Scalar, int, 0>& bdmLM = bdm.getLinearMapper(i0,j0);
+
+            Scalar v = bdmSM(i - i0, j - j0);
+            Scalar vd = d[GET(i,j)];
+            VERIFY_IS_EQUAL(vd, v);
+            VERIFY_IS_EQUAL(vd, bdmLM(GET(i-i0, j-j0)));
+        }
+    }
+
+    // Testing loadPacket
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            Packet pktBDM = bdm.template loadPacket<Packet>(i,j);
+            Packet pktD = internal::ploadu<Packet>(d + GET(i,j));
+
+            compare<Scalar, Packet>(pktBDM, pktD);
+        }
+    }
+
+    // Testing gatherPacket
+    Scalar *buff = new Scalar[packetSize];
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            Packet p = bdm.template gatherPacket<Packet>(i,j);
+            internal::pstoreu<Scalar, Packet>(buff, p);
+
+            for(int k = 0; k < packetSize; k++)
+            {
+                VERIFY_IS_EQUAL(d[SCATTER(i,j,k)], buff[k]);
+            }
+
+        }
+    }
+    delete[] buff;
+
+    // Testing scatterPacket
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            Packet p = internal::pset1<Packet>(static_cast<Scalar>(1));
+            bdm.template scatterPacket<Packet>(i,j,p);
+            for(int k = 0; k < packetSize; k++)
+            {
+                VERIFY_IS_EQUAL(d[SCATTER(i,j,k)], static_cast<Scalar>(1));
+            }
+        }
+    }
+
+    //Testing storePacketBlock
+    internal::PacketBlock<Packet, BlockSize> block;
+
+    PacketBlockSet<Scalar, StorageOrder, BlockSize> pbs;
+    pbs.setPacketBlock(block, static_cast<Scalar>(2));
+
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            bdm.template storePacketBlock<Packet, BlockSize>(i, j, block);
+
+            pbs.comparePacketBlock(d, i, j, stride, block);
+        }
+    }
+
+    delete[] d;
+}
+
+template<typename Scalar>
+void run_test()
+{
+    run_bdmp_spec_1<Scalar, RowMajor, 1>();
+    run_bdmp_spec_1<Scalar, ColMajor, 1>();
+    run_bdmp_spec_1<Scalar, RowMajor, 2>();
+    run_bdmp_spec_1<Scalar, ColMajor, 2>();
+    run_bdmp_spec_1<Scalar, RowMajor, 4>();
+    run_bdmp_spec_1<Scalar, ColMajor, 4>();
+    run_bdmp_spec_1<Scalar, RowMajor, 8>();
+    run_bdmp_spec_1<Scalar, ColMajor, 8>();
+    run_bdmp_spec_1<Scalar, RowMajor, 16>();
+    run_bdmp_spec_1<Scalar, ColMajor, 16>();
+}
+
+EIGEN_DECLARE_TEST(blasutil)
+{
+    for(int i = 0; i < g_repeat; i++)
+    {
+        CALL_SUBTEST_1(run_test<numext::int8_t>());
+        CALL_SUBTEST_2(run_test<numext::int16_t>());
+        CALL_SUBTEST_3(run_test<numext::int32_t>());
+
+// TODO: Replace this by a call to numext::int64_t as soon as we have a way to
+// detect the typedef for int64_t on all platforms
+#if EIGEN_HAS_CXX11
+        CALL_SUBTEST_4(run_test<signed long long>());
+#else
+        CALL_SUBTEST_4(run_test<signed long>());
+#endif
+
+        CALL_SUBTEST_5(run_test<float_t>());
+        CALL_SUBTEST_6(run_test<double_t>());
+        CALL_SUBTEST_7(run_test<std::complex<float> >());
+        CALL_SUBTEST_8(run_test<std::complex<double> >());
+    }
+}

diff --git a/test/block.cpp b/test/block.cpp
new file mode 100644
index 0000000..84124ab
--- /dev/null
+++ b/test/block.cpp

@@ -0,0 +1,316 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT // otherwise we fail at compile time on unused paths
+#include "main.h"
+
+template<typename MatrixType, typename Index, typename Scalar>
+typename Eigen::internal::enable_if<!NumTraits<typename MatrixType::Scalar>::IsComplex,typename MatrixType::Scalar>::type
+block_real_only(const MatrixType &m1, Index r1, Index r2, Index c1, Index c2, const Scalar& s1) {
+  // check cwise-Functions:
+  VERIFY_IS_APPROX(m1.row(r1).cwiseMax(s1), m1.cwiseMax(s1).row(r1));
+  VERIFY_IS_APPROX(m1.col(c1).cwiseMin(s1), m1.cwiseMin(s1).col(c1));
+
+  VERIFY_IS_APPROX(m1.block(r1,c1,r2-r1+1,c2-c1+1).cwiseMin(s1), m1.cwiseMin(s1).block(r1,c1,r2-r1+1,c2-c1+1));
+  VERIFY_IS_APPROX(m1.block(r1,c1,r2-r1+1,c2-c1+1).cwiseMax(s1), m1.cwiseMax(s1).block(r1,c1,r2-r1+1,c2-c1+1));
+  
+  return Scalar(0);
+}
+
+template<typename MatrixType, typename Index, typename Scalar>
+typename Eigen::internal::enable_if<NumTraits<typename MatrixType::Scalar>::IsComplex,typename MatrixType::Scalar>::type
+block_real_only(const MatrixType &, Index, Index, Index, Index, const Scalar&) {
+  return Scalar(0);
+}
+
+// Check at compile-time that T1==T2, and at runtime-time that a==b
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_block(const T1& a, const T2& b)
+{
+  return a.isApprox(b);
+}
+
+template<typename MatrixType> void block(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, MatrixType::IsRowMajor?RowMajor:ColMajor> DynamicMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> DynamicVectorType;
+  
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m1_copy = m1,
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             ones = MatrixType::Ones(rows, cols);
+  VectorType v1 = VectorType::Random(rows);
+
+  Scalar s1 = internal::random<Scalar>();
+
+  Index r1 = internal::random<Index>(0,rows-1);
+  Index r2 = internal::random<Index>(r1,rows-1);
+  Index c1 = internal::random<Index>(0,cols-1);
+  Index c2 = internal::random<Index>(c1,cols-1);
+
+  block_real_only(m1, r1, r2, c1, c1, s1);
+
+  //check row() and col()
+  VERIFY_IS_EQUAL(m1.col(c1).transpose(), m1.transpose().row(c1));
+  //check operator(), both constant and non-constant, on row() and col()
+  m1 = m1_copy;
+  m1.row(r1) += s1 * m1_copy.row(r2);
+  VERIFY_IS_APPROX(m1.row(r1), m1_copy.row(r1) + s1 * m1_copy.row(r2));
+  // check nested block xpr on lhs
+  m1.row(r1).row(0) += s1 * m1_copy.row(r2);
+  VERIFY_IS_APPROX(m1.row(r1), m1_copy.row(r1) + Scalar(2) * s1 * m1_copy.row(r2));
+  m1 = m1_copy;
+  m1.col(c1) += s1 * m1_copy.col(c2);
+  VERIFY_IS_APPROX(m1.col(c1), m1_copy.col(c1) + s1 * m1_copy.col(c2));
+  m1.col(c1).col(0) += s1 * m1_copy.col(c2);
+  VERIFY_IS_APPROX(m1.col(c1), m1_copy.col(c1) + Scalar(2) * s1 * m1_copy.col(c2));
+  
+  
+  //check block()
+  Matrix<Scalar,Dynamic,Dynamic> b1(1,1); b1(0,0) = m1(r1,c1);
+
+  RowVectorType br1(m1.block(r1,0,1,cols));
+  VectorType bc1(m1.block(0,c1,rows,1));
+  VERIFY_IS_EQUAL(b1, m1.block(r1,c1,1,1));
+  VERIFY_IS_EQUAL(m1.row(r1), br1);
+  VERIFY_IS_EQUAL(m1.col(c1), bc1);
+  //check operator(), both constant and non-constant, on block()
+  m1.block(r1,c1,r2-r1+1,c2-c1+1) = s1 * m2.block(0, 0, r2-r1+1,c2-c1+1);
+  m1.block(r1,c1,r2-r1+1,c2-c1+1)(r2-r1,c2-c1) = m2.block(0, 0, r2-r1+1,c2-c1+1)(0,0);
+
+  const Index BlockRows = 2;
+  const Index BlockCols = 5;
+
+  if (rows>=5 && cols>=8)
+  {
+    // test fixed block() as lvalue
+    m1.template block<BlockRows,BlockCols>(1,1) *= s1;
+    // test operator() on fixed block() both as constant and non-constant
+    m1.template block<BlockRows,BlockCols>(1,1)(0, 3) = m1.template block<2,5>(1,1)(1,2);
+    // check that fixed block() and block() agree
+    Matrix<Scalar,Dynamic,Dynamic> b = m1.template block<BlockRows,BlockCols>(3,3);
+    VERIFY_IS_EQUAL(b, m1.block(3,3,BlockRows,BlockCols));
+
+    // same tests with mixed fixed/dynamic size
+    m1.template block<BlockRows,Dynamic>(1,1,BlockRows,BlockCols) *= s1;
+    m1.template block<BlockRows,Dynamic>(1,1,BlockRows,BlockCols)(0,3) = m1.template block<2,5>(1,1)(1,2);
+    Matrix<Scalar,Dynamic,Dynamic> b2 = m1.template block<Dynamic,BlockCols>(3,3,2,5);
+    VERIFY_IS_EQUAL(b2, m1.block(3,3,BlockRows,BlockCols));
+
+    VERIFY(is_same_block(m1.block(3,3,BlockRows,BlockCols), m1.block(3,3,fix<Dynamic>(BlockRows),fix<Dynamic>(BlockCols))));
+    VERIFY(is_same_block(m1.template block<BlockRows,Dynamic>(1,1,BlockRows,BlockCols), m1.block(1,1,fix<BlockRows>,BlockCols)));
+    VERIFY(is_same_block(m1.template block<BlockRows,BlockCols>(1,1,BlockRows,BlockCols), m1.block(1,1,fix<BlockRows>(),fix<BlockCols>)));
+    VERIFY(is_same_block(m1.template block<BlockRows,BlockCols>(1,1,BlockRows,BlockCols), m1.block(1,1,fix<BlockRows>,fix<BlockCols>(BlockCols))));
+  }
+
+  if (rows>2)
+  {
+    // test sub vectors
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.block(0,0,2,1));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.head(2));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.segment(0,2));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.template segment<2>(0));
+    Index i = rows-2;
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.block(i,0,2,1));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.tail(2));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.segment(i,2));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.template segment<2>(i));
+    i = internal::random<Index>(0,rows-2);
+    VERIFY_IS_EQUAL(v1.segment(i,2), v1.template segment<2>(i));
+  }
+
+  // stress some basic stuffs with block matrices
+  VERIFY(numext::real(ones.col(c1).sum()) == RealScalar(rows));
+  VERIFY(numext::real(ones.row(r1).sum()) == RealScalar(cols));
+
+  VERIFY(numext::real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows));
+  VERIFY(numext::real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols));
+  
+  // check that linear acccessors works on blocks
+  m1 = m1_copy;
+  if((MatrixType::Flags&RowMajorBit)==0)
+    VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1));
+  else
+    VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1+r1*cols), m1(r1,c1));
+  
+
+  // now test some block-inside-of-block.
+  
+  // expressions with direct access
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , (m1.block(r2,c2,rows-r2,cols-c2)) );
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , (m1.row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , (m1.col(c1).segment(r1,r2-r1+1)) );
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , (m1.row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_EQUAL( (m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , (m1.row(r1).segment(c1,c2-c1+1)).transpose() );
+
+  // expressions without direct access
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).template block<Dynamic,1>(r1,c1,r2-r1+1,1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).eval().segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).transpose().template block<1,Dynamic>(c1,r1,1,r2-r1+1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)).transpose() );
+  VERIFY_IS_APPROX( (m1+m2).row(r1).eval(), (m1+m2).eval().row(r1) );
+  VERIFY_IS_APPROX( (m1+m2).adjoint().col(r1).eval(), (m1+m2).adjoint().eval().col(r1) );
+  VERIFY_IS_APPROX( (m1+m2).adjoint().row(c1).eval(), (m1+m2).adjoint().eval().row(c1) );
+  VERIFY_IS_APPROX( (m1*1).row(r1).segment(c1,c2-c1+1).eval(), m1.row(r1).eval().segment(c1,c2-c1+1).eval() );
+  VERIFY_IS_APPROX( m1.col(c1).reverse().segment(r1,r2-r1+1).eval(),m1.col(c1).reverse().eval().segment(r1,r2-r1+1).eval() );
+
+  VERIFY_IS_APPROX( (m1*1).topRows(r1),  m1.topRows(r1) );
+  VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().topRows(c1), m1.transpose().topRows(c1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().leftCols(r1), m1.transpose().leftCols(r1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().middleRows(c1,c2-c1+1), m1.transpose().middleRows(c1,c2-c1+1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().middleCols(r1,r2-r1+1), m1.transpose().middleCols(r1,r2-r1+1) );
+
+  // evaluation into plain matrices from expressions with direct access (stress MapBase)
+  DynamicMatrixType dm;
+  DynamicVectorType dv;
+  dm.setZero();
+  dm = m1.block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2);
+  VERIFY_IS_EQUAL(dm, (m1.block(r2,c2,rows-r2,cols-c2)));
+  dm.setZero();
+  dv.setZero();
+  dm = m1.block(r1,c1,r2-r1+1,c2-c1+1).row(0).transpose();
+  dv = m1.row(r1).segment(c1,c2-c1+1);
+  VERIFY_IS_EQUAL(dv, dm);
+  dm.setZero();
+  dv.setZero();
+  dm = m1.col(c1).segment(r1,r2-r1+1);
+  dv = m1.block(r1,c1,r2-r1+1,c2-c1+1).col(0);
+  VERIFY_IS_EQUAL(dv, dm);
+  dm.setZero();
+  dv.setZero();
+  dm = m1.block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0);
+  dv = m1.row(r1).segment(c1,c2-c1+1);
+  VERIFY_IS_EQUAL(dv, dm);
+  dm.setZero();
+  dv.setZero();
+  dm = m1.row(r1).segment(c1,c2-c1+1).transpose();
+  dv = m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0);
+  VERIFY_IS_EQUAL(dv, dm);
+
+  VERIFY_IS_EQUAL( (m1.template block<Dynamic,1>(1,0,0,1)), m1.block(1,0,0,1));
+  VERIFY_IS_EQUAL( (m1.template block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0));
+  VERIFY_IS_EQUAL( ((m1*1).template block<Dynamic,1>(1,0,0,1)), m1.block(1,0,0,1));
+  VERIFY_IS_EQUAL( ((m1*1).template block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0));
+
+  if (rows>=2 && cols>=2)
+  {
+    VERIFY_RAISES_ASSERT( m1 += m1.col(0) );
+    VERIFY_RAISES_ASSERT( m1 -= m1.col(0) );
+    VERIFY_RAISES_ASSERT( m1.array() *= m1.col(0).array() );
+    VERIFY_RAISES_ASSERT( m1.array() /= m1.col(0).array() );
+  }
+
+  VERIFY_IS_EQUAL( m1.template subVector<Horizontal>(r1), m1.row(r1) );
+  VERIFY_IS_APPROX( (m1+m1).template subVector<Horizontal>(r1), (m1+m1).row(r1) );
+  VERIFY_IS_EQUAL( m1.template subVector<Vertical>(c1), m1.col(c1) );
+  VERIFY_IS_APPROX( (m1+m1).template subVector<Vertical>(c1), (m1+m1).col(c1) );
+  VERIFY_IS_EQUAL( m1.template subVectors<Horizontal>(), m1.rows() );
+  VERIFY_IS_EQUAL( m1.template subVectors<Vertical>(), m1.cols() );
+
+  if (rows>=2 || cols>=2) {
+    VERIFY_IS_EQUAL( int(m1.middleCols(0,0).IsRowMajor), int(m1.IsRowMajor) );
+    VERIFY_IS_EQUAL( m1.middleCols(0,0).outerSize(), m1.IsRowMajor ? rows : 0);
+    VERIFY_IS_EQUAL( m1.middleCols(0,0).innerSize(), m1.IsRowMajor ? 0 : rows);
+
+    VERIFY_IS_EQUAL( int(m1.middleRows(0,0).IsRowMajor), int(m1.IsRowMajor) );
+    VERIFY_IS_EQUAL( m1.middleRows(0,0).outerSize(), m1.IsRowMajor ? 0 : cols);
+    VERIFY_IS_EQUAL( m1.middleRows(0,0).innerSize(), m1.IsRowMajor ? cols : 0);
+  }
+}
+
+
+template<typename MatrixType>
+void compare_using_data_and_stride(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  Index size = m.size();
+  Index innerStride = m.innerStride();
+  Index outerStride = m.outerStride();
+  Index rowStride = m.rowStride();
+  Index colStride = m.colStride();
+  const typename MatrixType::Scalar* data = m.data();
+
+  for(int j=0;j<cols;++j)
+    for(int i=0;i<rows;++i)
+      VERIFY(m.coeff(i,j) == data[i*rowStride + j*colStride]);
+
+  if(!MatrixType::IsVectorAtCompileTime)
+  {
+    for(int j=0;j<cols;++j)
+      for(int i=0;i<rows;++i)
+        VERIFY(m.coeff(i,j) == data[(MatrixType::Flags&RowMajorBit)
+                                     ? i*outerStride + j*innerStride
+                                     : j*outerStride + i*innerStride]);
+  }
+
+  if(MatrixType::IsVectorAtCompileTime)
+  {
+    VERIFY(innerStride == int((&m.coeff(1))-(&m.coeff(0))));
+    for (int i=0;i<size;++i)
+      VERIFY(m.coeff(i) == data[i*innerStride]);
+  }
+}
+
+template<typename MatrixType>
+void data_and_stride(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  Index r1 = internal::random<Index>(0,rows-1);
+  Index r2 = internal::random<Index>(r1,rows-1);
+  Index c1 = internal::random<Index>(0,cols-1);
+  Index c2 = internal::random<Index>(c1,cols-1);
+
+  MatrixType m1 = MatrixType::Random(rows, cols);
+  compare_using_data_and_stride(m1.block(r1, c1, r2-r1+1, c2-c1+1));
+  compare_using_data_and_stride(m1.transpose().block(c1, r1, c2-c1+1, r2-r1+1));
+  compare_using_data_and_stride(m1.row(r1));
+  compare_using_data_and_stride(m1.col(c1));
+  compare_using_data_and_stride(m1.row(r1).transpose());
+  compare_using_data_and_stride(m1.col(c1).transpose());
+}
+
+EIGEN_DECLARE_TEST(block)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( block(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( block(Matrix<float, 1, Dynamic>(internal::random(2,50))) );
+    CALL_SUBTEST_1( block(Matrix<float, Dynamic, 1>(internal::random(2,50))) );
+    CALL_SUBTEST_2( block(Matrix4d()) );
+    CALL_SUBTEST_3( block(MatrixXcf(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_4( block(MatrixXi(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_5( block(MatrixXcd(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_6( block(MatrixXf(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_7( block(Matrix<int,Dynamic,Dynamic,RowMajor>(internal::random(2,50), internal::random(2,50))) );
+
+    CALL_SUBTEST_8( block(Matrix<float,Dynamic,4>(3, 4)) );
+
+#ifndef EIGEN_DEFAULT_TO_ROW_MAJOR
+    CALL_SUBTEST_6( data_and_stride(MatrixXf(internal::random(5,50), internal::random(5,50))) );
+    CALL_SUBTEST_7( data_and_stride(Matrix<int,Dynamic,Dynamic,RowMajor>(internal::random(5,50), internal::random(5,50))) );
+#endif
+  }
+}

diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp
new file mode 100644
index 0000000..e83e970
--- /dev/null
+++ b/test/boostmultiprec.cpp

@@ -0,0 +1,207 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#ifdef EIGEN_TEST_MAX_SIZE
+#undef EIGEN_TEST_MAX_SIZE
+#endif
+
+#define EIGEN_TEST_MAX_SIZE 50
+
+#ifdef EIGEN_TEST_PART_1
+#include "cholesky.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+#include "lu.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_3
+#include "qr.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_4
+#include "qr_colpivoting.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_5
+#include "qr_fullpivoting.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_6
+#include "eigensolver_selfadjoint.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_7
+#include "eigensolver_generic.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_8
+#include "eigensolver_generalized_real.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_9
+#include "jacobisvd.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_10
+#include "bdcsvd.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_11
+#include "simplicial_cholesky.cpp"
+#endif
+
+#include <Eigen/Dense>
+
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#undef I
+
+#include <boost/serialization/nvp.hpp>
+#include <boost/multiprecision/cpp_dec_float.hpp>
+#include <boost/multiprecision/number.hpp>
+#include <boost/math/special_functions.hpp>
+#include <boost/math/complex.hpp>
+
+typedef boost::multiprecision::number<boost::multiprecision::cpp_dec_float<100>, boost::multiprecision::et_on> Real;
+
+namespace Eigen {
+  template<> struct NumTraits<Real> : GenericNumTraits<Real> {
+    static inline Real dummy_precision() { return 1e-50; }
+  };
+
+  template<typename T1,typename T2,typename T3,typename T4,typename T5>
+  struct NumTraits<boost::multiprecision::detail::expression<T1,T2,T3,T4,T5> > : NumTraits<Real> {};
+
+  template<>
+  Real test_precision<Real>() { return 1e-50; }
+
+  // needed in C++93 mode where number does not support explicit cast.
+  namespace internal {
+    template<typename NewType>
+    struct cast_impl<Real,NewType> {
+      static inline NewType run(const Real& x) {
+        return x.template convert_to<NewType>();
+      }
+    };
+
+    template<>
+    struct cast_impl<Real,std::complex<Real> > {
+      static inline std::complex<Real>  run(const Real& x) {
+        return std::complex<Real>(x);
+      }
+    };
+  }
+}
+
+namespace boost {
+namespace multiprecision {
+  // to make ADL works as expected:
+  using boost::math::isfinite;
+  using boost::math::isnan;
+  using boost::math::isinf;
+  using boost::math::copysign;
+  using boost::math::hypot;
+
+  // The following is needed for std::complex<Real>:
+  Real fabs(const Real& a) { return abs EIGEN_NOT_A_MACRO (a); }
+  Real fmax(const Real& a, const Real& b) { using std::max; return max(a,b); }
+
+  // some specialization for the unit tests:
+  inline bool test_isMuchSmallerThan(const Real& a, const Real& b) {
+    return internal::isMuchSmallerThan(a, b, test_precision<Real>());
+  }
+
+  inline bool test_isApprox(const Real& a, const Real& b) {
+    return internal::isApprox(a, b, test_precision<Real>());
+  }
+
+  inline bool test_isApproxOrLessThan(const Real& a, const Real& b) {
+    return internal::isApproxOrLessThan(a, b, test_precision<Real>());
+  }
+
+  Real get_test_precision(const Real&) {
+    return test_precision<Real>();
+  }
+
+  Real test_relative_error(const Real &a, const Real &b) {
+    using Eigen::numext::abs2;
+    return sqrt(abs2<Real>(a-b)/Eigen::numext::mini<Real>(abs2(a),abs2(b)));
+  }
+}
+}
+
+namespace Eigen {
+
+}
+
+EIGEN_DECLARE_TEST(boostmultiprec)
+{
+  typedef Matrix<Real,Dynamic,Dynamic> Mat;
+  typedef Matrix<std::complex<Real>,Dynamic,Dynamic> MatC;
+
+  std::cout << "NumTraits<Real>::epsilon()         = " << NumTraits<Real>::epsilon() << std::endl;
+  std::cout << "NumTraits<Real>::dummy_precision() = " << NumTraits<Real>::dummy_precision() << std::endl;
+  std::cout << "NumTraits<Real>::lowest()          = " << NumTraits<Real>::lowest() << std::endl;
+  std::cout << "NumTraits<Real>::highest()         = " << NumTraits<Real>::highest() << std::endl;
+  std::cout << "NumTraits<Real>::digits10()        = " << NumTraits<Real>::digits10() << std::endl;
+
+  // check stream output
+  {
+    Mat A(10,10);
+    A.setRandom();
+    std::stringstream ss;
+    ss << A;
+  }
+  {
+    MatC A(10,10);
+    A.setRandom();
+    std::stringstream ss;
+    ss << A;
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+
+    CALL_SUBTEST_1( cholesky(Mat(s,s)) );
+
+    CALL_SUBTEST_2( lu_non_invertible<Mat>() );
+    CALL_SUBTEST_2( lu_invertible<Mat>() );
+    CALL_SUBTEST_2( lu_non_invertible<MatC>() );
+    CALL_SUBTEST_2( lu_invertible<MatC>() );
+
+    CALL_SUBTEST_3( qr(Mat(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_3( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_4( qr<Mat>() );
+    CALL_SUBTEST_4( cod<Mat>() );
+    CALL_SUBTEST_4( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_5( qr<Mat>() );
+    CALL_SUBTEST_5( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_6( selfadjointeigensolver(Mat(s,s)) );
+
+    CALL_SUBTEST_7( eigensolver(Mat(s,s)) );
+
+    CALL_SUBTEST_8( generalized_eigensolver_real(Mat(s,s)) );
+
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+
+  CALL_SUBTEST_9(( jacobisvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+  CALL_SUBTEST_10(( bdcsvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+
+  CALL_SUBTEST_11(( test_simplicial_cholesky_T<Real,int,ColMajor>() ));
+}

diff --git a/test/bug1213.cpp b/test/bug1213.cpp
new file mode 100644
index 0000000..581760c
--- /dev/null
+++ b/test/bug1213.cpp

@@ -0,0 +1,13 @@
+
+// This anonymous enum is essential to trigger the linking issue
+enum {
+  Foo
+};
+
+#include "bug1213.h"
+
+bool bug1213_1(const Eigen::Vector3f& x)
+{
+  return bug1213_2(x);
+}
+

diff --git a/test/bug1213.h b/test/bug1213.h
new file mode 100644
index 0000000..040e5a4
--- /dev/null
+++ b/test/bug1213.h

@@ -0,0 +1,8 @@
+
+#include <Eigen/Core>
+
+template<typename T, int dim>
+bool bug1213_2(const Eigen::Matrix<T,dim,1>& x);
+
+bool bug1213_1(const Eigen::Vector3f& x);
+

diff --git a/test/bug1213_main.cpp b/test/bug1213_main.cpp
new file mode 100644
index 0000000..4802c00
--- /dev/null
+++ b/test/bug1213_main.cpp

@@ -0,0 +1,18 @@
+
+// This is a regression unit regarding a weird linking issue with gcc.
+
+#include "bug1213.h"
+
+int main()
+{
+  return 0;
+}
+
+
+template<typename T, int dim>
+bool bug1213_2(const Eigen::Matrix<T,dim,1>& )
+{
+  return true;
+}
+
+template bool bug1213_2<float,3>(const Eigen::Vector3f&);

diff --git a/test/cholesky.cpp b/test/cholesky.cpp
new file mode 100644
index 0000000..0b1a7b4
--- /dev/null
+++ b/test/cholesky.cpp

@@ -0,0 +1,532 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+#include <Eigen/Cholesky>
+#include <Eigen/QR>
+#include "solverbase.h"
+
+template<typename MatrixType, int UpLo>
+typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
+  if(m.cols()==0) return typename MatrixType::RealScalar(0);
+  MatrixType symm = m.template selfadjointView<UpLo>();
+  return symm.cwiseAbs().colwise().sum().maxCoeff();
+}
+
+template<typename MatrixType,template <typename,int> class CholType> void test_chol_update(const MatrixType& symm)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  MatrixType symmLo = symm.template triangularView<Lower>();
+  MatrixType symmUp = symm.template triangularView<Upper>();
+  MatrixType symmCpy = symm;
+
+  CholType<MatrixType,Lower> chollo(symmLo);
+  CholType<MatrixType,Upper> cholup(symmUp);
+
+  for (int k=0; k<10; ++k)
+  {
+    VectorType vec = VectorType::Random(symm.rows());
+    RealScalar sigma = internal::random<RealScalar>();
+    symmCpy += sigma * vec * vec.adjoint();
+
+    // we are doing some downdates, so it might be the case that the matrix is not SPD anymore
+    CholType<MatrixType,Lower> chol(symmCpy);
+    if(chol.info()!=Success)
+      break;
+
+    chollo.rankUpdate(vec, sigma);
+    VERIFY_IS_APPROX(symmCpy, chollo.reconstructedMatrix());
+
+    cholup.rankUpdate(vec, sigma);
+    VERIFY_IS_APPROX(symmCpy, cholup.reconstructedMatrix());
+  }
+}
+
+template<typename MatrixType> void cholesky(const MatrixType& m)
+{
+  /* this test covers the following files:
+     LLT.h LDLT.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  MatrixType a0 = MatrixType::Random(rows,cols);
+  VectorType vecB = VectorType::Random(rows), vecX(rows);
+  MatrixType matB = MatrixType::Random(rows,cols), matX(rows,cols);
+  SquareMatrixType symm =  a0 * a0.adjoint();
+  // let's make sure the matrix is not singular or near singular
+  for (int k=0; k<3; ++k)
+  {
+    MatrixType a1 = MatrixType::Random(rows,cols);
+    symm += a1 * a1.adjoint();
+  }
+
+  {
+    STATIC_CHECK(( internal::is_same<typename LLT<MatrixType,Lower>::StorageIndex,int>::value ));
+    STATIC_CHECK(( internal::is_same<typename LLT<MatrixType,Upper>::StorageIndex,int>::value ));
+
+    SquareMatrixType symmUp = symm.template triangularView<Upper>();
+    SquareMatrixType symmLo = symm.template triangularView<Lower>();
+
+    LLT<SquareMatrixType,Lower> chollo(symmLo);
+    VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
+
+    check_solverbase<VectorType, VectorType>(symm, chollo, rows, rows, 1);
+    check_solverbase<MatrixType, MatrixType>(symm, chollo, rows, cols, rows);
+
+    const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols));
+    RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
+                             matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
+    RealScalar rcond_est = chollo.rcond();
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10);
+
+    // test the upper mode
+    LLT<SquareMatrixType,Upper> cholup(symmUp);
+    VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix());
+    vecX = cholup.solve(vecB);
+    VERIFY_IS_APPROX(symm * vecX, vecB);
+    matX = cholup.solve(matB);
+    VERIFY_IS_APPROX(symm * matX, matB);
+
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    const MatrixType symmUp_inverse = cholup.solve(MatrixType::Identity(rows,cols));
+    rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
+                             matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
+    rcond_est = cholup.rcond();
+    VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10);
+
+
+    MatrixType neg = -symmLo;
+    chollo.compute(neg);
+    VERIFY(neg.size()==0 || chollo.info()==NumericalIssue);
+
+    VERIFY_IS_APPROX(MatrixType(chollo.matrixL().transpose().conjugate()), MatrixType(chollo.matrixU()));
+    VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL()));
+    VERIFY_IS_APPROX(MatrixType(cholup.matrixL().transpose().conjugate()), MatrixType(cholup.matrixU()));
+    VERIFY_IS_APPROX(MatrixType(cholup.matrixU().transpose().conjugate()), MatrixType(cholup.matrixL()));
+
+    // test some special use cases of SelfCwiseBinaryOp:
+    MatrixType m1 = MatrixType::Random(rows,cols), m2(rows,cols);
+    m2 = m1;
+    m2 += symmLo.template selfadjointView<Lower>().llt().solve(matB);
+    VERIFY_IS_APPROX(m2, m1 + symmLo.template selfadjointView<Lower>().llt().solve(matB));
+    m2 = m1;
+    m2 -= symmLo.template selfadjointView<Lower>().llt().solve(matB);
+    VERIFY_IS_APPROX(m2, m1 - symmLo.template selfadjointView<Lower>().llt().solve(matB));
+    m2 = m1;
+    m2.noalias() += symmLo.template selfadjointView<Lower>().llt().solve(matB);
+    VERIFY_IS_APPROX(m2, m1 + symmLo.template selfadjointView<Lower>().llt().solve(matB));
+    m2 = m1;
+    m2.noalias() -= symmLo.template selfadjointView<Lower>().llt().solve(matB);
+    VERIFY_IS_APPROX(m2, m1 - symmLo.template selfadjointView<Lower>().llt().solve(matB));
+  }
+
+  // LDLT
+  {
+    STATIC_CHECK(( internal::is_same<typename LDLT<MatrixType,Lower>::StorageIndex,int>::value ));
+    STATIC_CHECK(( internal::is_same<typename LDLT<MatrixType,Upper>::StorageIndex,int>::value ));
+
+    int sign = internal::random<int>()%2 ? 1 : -1;
+
+    if(sign == -1)
+    {
+      symm = -symm; // test a negative matrix
+    }
+
+    SquareMatrixType symmUp = symm.template triangularView<Upper>();
+    SquareMatrixType symmLo = symm.template triangularView<Lower>();
+
+    LDLT<SquareMatrixType,Lower> ldltlo(symmLo);
+    VERIFY(ldltlo.info()==Success);
+    VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
+
+    check_solverbase<VectorType, VectorType>(symm, ldltlo, rows, rows, 1);
+    check_solverbase<MatrixType, MatrixType>(symm, ldltlo, rows, cols, rows);
+
+    const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols));
+    RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
+                             matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
+    RealScalar rcond_est = ldltlo.rcond();
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10);
+
+
+    LDLT<SquareMatrixType,Upper> ldltup(symmUp);
+    VERIFY(ldltup.info()==Success);
+    VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix());
+    vecX = ldltup.solve(vecB);
+    VERIFY_IS_APPROX(symm * vecX, vecB);
+    matX = ldltup.solve(matB);
+    VERIFY_IS_APPROX(symm * matX, matB);
+
+    // Verify that the estimated condition number is within a factor of 10 of the
+    // truth.
+    const MatrixType symmUp_inverse = ldltup.solve(MatrixType::Identity(rows,cols));
+    rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
+                             matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
+    rcond_est = ldltup.rcond();
+    VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10);
+
+    VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU()));
+    VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL()));
+    VERIFY_IS_APPROX(MatrixType(ldltup.matrixL().transpose().conjugate()), MatrixType(ldltup.matrixU()));
+    VERIFY_IS_APPROX(MatrixType(ldltup.matrixU().transpose().conjugate()), MatrixType(ldltup.matrixL()));
+
+    if(MatrixType::RowsAtCompileTime==Dynamic)
+    {
+      // note : each inplace permutation requires a small temporary vector (mask)
+
+      // check inplace solve
+      matX = matB;
+      VERIFY_EVALUATION_COUNT(matX = ldltlo.solve(matX), 0);
+      VERIFY_IS_APPROX(matX, ldltlo.solve(matB).eval());
+
+
+      matX = matB;
+      VERIFY_EVALUATION_COUNT(matX = ldltup.solve(matX), 0);
+      VERIFY_IS_APPROX(matX, ldltup.solve(matB).eval());
+    }
+
+    // restore
+    if(sign == -1)
+      symm = -symm;
+
+    // check matrices coming from linear constraints with Lagrange multipliers
+    if(rows>=3)
+    {
+      SquareMatrixType A = symm;
+      Index c = internal::random<Index>(0,rows-2);
+      A.bottomRightCorner(c,c).setZero();
+      // Make sure a solution exists:
+      vecX.setRandom();
+      vecB = A * vecX;
+      vecX.setZero();
+      ldltlo.compute(A);
+      VERIFY_IS_APPROX(A, ldltlo.reconstructedMatrix());
+      vecX = ldltlo.solve(vecB);
+      VERIFY_IS_APPROX(A * vecX, vecB);
+    }
+
+    // check non-full rank matrices
+    if(rows>=3)
+    {
+      Index r = internal::random<Index>(1,rows-1);
+      Matrix<Scalar,Dynamic,Dynamic> a = Matrix<Scalar,Dynamic,Dynamic>::Random(rows,r);
+      SquareMatrixType A = a * a.adjoint();
+      // Make sure a solution exists:
+      vecX.setRandom();
+      vecB = A * vecX;
+      vecX.setZero();
+      ldltlo.compute(A);
+      VERIFY_IS_APPROX(A, ldltlo.reconstructedMatrix());
+      vecX = ldltlo.solve(vecB);
+      VERIFY_IS_APPROX(A * vecX, vecB);
+    }
+
+    // check matrices with a wide spectrum
+    if(rows>=3)
+    {
+      using std::pow;
+      using std::sqrt;
+      RealScalar s = (std::min)(16,std::numeric_limits<RealScalar>::max_exponent10/8);
+      Matrix<Scalar,Dynamic,Dynamic> a = Matrix<Scalar,Dynamic,Dynamic>::Random(rows,rows);
+      Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(rows);
+      for(Index k=0; k<rows; ++k)
+        d(k) = d(k)*pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+      SquareMatrixType A = a * d.asDiagonal() * a.adjoint();
+      // Make sure a solution exists:
+      vecX.setRandom();
+      vecB = A * vecX;
+      vecX.setZero();
+      ldltlo.compute(A);
+      VERIFY_IS_APPROX(A, ldltlo.reconstructedMatrix());
+      vecX = ldltlo.solve(vecB);
+
+      if(ldltlo.vectorD().real().cwiseAbs().minCoeff()>RealScalar(0))
+      {
+        VERIFY_IS_APPROX(A * vecX,vecB);
+      }
+      else
+      {
+        RealScalar large_tol =  sqrt(test_precision<RealScalar>());
+        VERIFY((A * vecX).isApprox(vecB, large_tol));
+
+        ++g_test_level;
+        VERIFY_IS_APPROX(A * vecX,vecB);
+        --g_test_level;
+      }
+    }
+  }
+
+  // update/downdate
+  CALL_SUBTEST(( test_chol_update<SquareMatrixType,LLT>(symm)  ));
+  CALL_SUBTEST(( test_chol_update<SquareMatrixType,LDLT>(symm) ));
+}
+
+template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
+{
+  // classic test
+  cholesky(m);
+
+  // test mixing real/scalar types
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> RealMatrixType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  RealMatrixType a0 = RealMatrixType::Random(rows,cols);
+  VectorType vecB = VectorType::Random(rows), vecX(rows);
+  MatrixType matB = MatrixType::Random(rows,cols), matX(rows,cols);
+  RealMatrixType symm =  a0 * a0.adjoint();
+  // let's make sure the matrix is not singular or near singular
+  for (int k=0; k<3; ++k)
+  {
+    RealMatrixType a1 = RealMatrixType::Random(rows,cols);
+    symm += a1 * a1.adjoint();
+  }
+
+  {
+    RealMatrixType symmLo = symm.template triangularView<Lower>();
+
+    LLT<RealMatrixType,Lower> chollo(symmLo);
+    VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
+
+    check_solverbase<VectorType, VectorType>(symm, chollo, rows, rows, 1);
+    //check_solverbase<MatrixType, MatrixType>(symm, chollo, rows, cols, rows);
+  }
+
+  // LDLT
+  {
+    int sign = internal::random<int>()%2 ? 1 : -1;
+
+    if(sign == -1)
+    {
+      symm = -symm; // test a negative matrix
+    }
+
+    RealMatrixType symmLo = symm.template triangularView<Lower>();
+
+    LDLT<RealMatrixType,Lower> ldltlo(symmLo);
+    VERIFY(ldltlo.info()==Success);
+    VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
+
+    check_solverbase<VectorType, VectorType>(symm, ldltlo, rows, rows, 1);
+    //check_solverbase<MatrixType, MatrixType>(symm, ldltlo, rows, cols, rows);
+  }
+}
+
+// regression test for bug 241
+template<typename MatrixType> void cholesky_bug241(const MatrixType& m)
+{
+  eigen_assert(m.rows() == 2 && m.cols() == 2);
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  MatrixType matA;
+  matA << 1, 1, 1, 1;
+  VectorType vecB;
+  vecB << 1, 1;
+  VectorType vecX = matA.ldlt().solve(vecB);
+  VERIFY_IS_APPROX(matA * vecX, vecB);
+}
+
+// LDLT is not guaranteed to work for indefinite matrices, but happens to work fine if matrix is diagonal.
+// This test checks that LDLT reports correctly that matrix is indefinite.
+// See http://forum.kde.org/viewtopic.php?f=74&t=106942 and bug 736
+template<typename MatrixType> void cholesky_definiteness(const MatrixType& m)
+{
+  eigen_assert(m.rows() == 2 && m.cols() == 2);
+  MatrixType mat;
+  LDLT<MatrixType> ldlt(2);
+
+  {
+    mat << 1, 0, 0, -1;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
+    VERIFY(!ldlt.isNegative());
+    VERIFY(!ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+  {
+    mat << 1, 2, 2, 1;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
+    VERIFY(!ldlt.isNegative());
+    VERIFY(!ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+  {
+    mat << 0, 0, 0, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
+    VERIFY(ldlt.isNegative());
+    VERIFY(ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+  {
+    mat << 0, 0, 0, 1;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
+    VERIFY(!ldlt.isNegative());
+    VERIFY(ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+  {
+    mat << -1, 0, 0, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
+    VERIFY(ldlt.isNegative());
+    VERIFY(!ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+}
+
+template<typename>
+void cholesky_faillure_cases()
+{
+  MatrixXd mat;
+  LDLT<MatrixXd> ldlt;
+
+  {
+    mat.resize(2,2);
+    mat << 0, 1, 1, 0;
+    ldlt.compute(mat);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+    VERIFY(ldlt.info()==NumericalIssue);
+  }
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
+  {
+    mat.resize(3,3);
+    mat << -1, -3, 3,
+           -3, -8.9999999999999999999, 1,
+            3, 1, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+#endif
+  {
+    mat.resize(3,3);
+    mat <<  1, 2, 3,
+            2, 4, 1,
+            3, 1, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+
+  {
+    mat.resize(8,8);
+    mat <<  0.1, 0, -0.1, 0, 0, 0, 1, 0,
+            0, 4.24667, 0, 2.00333, 0, 0, 0, 0,
+            -0.1, 0, 0.2, 0, -0.1, 0, 0, 0,
+            0, 2.00333, 0, 8.49333, 0, 2.00333, 0, 0,
+            0, 0, -0.1, 0, 0.1, 0, 0, 1,
+            0, 0, 0, 2.00333, 0, 4.24667, 0, 0,
+            1, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 1, 0, 0, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+
+  // bug 1479
+  {
+    mat.resize(4,4);
+    mat <<  1, 2, 0, 1,
+            2, 4, 0, 2,
+            0, 0, 0, 1,
+            1, 2, 1, 1;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+}
+
+template<typename MatrixType> void cholesky_verify_assert()
+{
+  MatrixType tmp;
+
+  LLT<MatrixType> llt;
+  VERIFY_RAISES_ASSERT(llt.matrixL())
+  VERIFY_RAISES_ASSERT(llt.matrixU())
+  VERIFY_RAISES_ASSERT(llt.solve(tmp))
+  VERIFY_RAISES_ASSERT(llt.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(llt.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(llt.solveInPlace(tmp))
+
+  LDLT<MatrixType> ldlt;
+  VERIFY_RAISES_ASSERT(ldlt.matrixL())
+  VERIFY_RAISES_ASSERT(ldlt.transpositionsP())
+  VERIFY_RAISES_ASSERT(ldlt.vectorD())
+  VERIFY_RAISES_ASSERT(ldlt.isPositive())
+  VERIFY_RAISES_ASSERT(ldlt.isNegative())
+  VERIFY_RAISES_ASSERT(ldlt.solve(tmp))
+  VERIFY_RAISES_ASSERT(ldlt.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(ldlt.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(ldlt.solveInPlace(tmp))
+}
+
+EIGEN_DECLARE_TEST(cholesky)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( cholesky(Matrix<double,1,1>()) );
+    CALL_SUBTEST_3( cholesky(Matrix2d()) );
+    CALL_SUBTEST_3( cholesky_bug241(Matrix2d()) );
+    CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) );
+    CALL_SUBTEST_4( cholesky(Matrix3f()) );
+    CALL_SUBTEST_5( cholesky(Matrix4d()) );
+
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+    CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
+    CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+  // empty matrix, regression test for Bug 785:
+  CALL_SUBTEST_2( cholesky(MatrixXd(0,0)) );
+
+  // This does not work yet:
+  // CALL_SUBTEST_2( cholesky(Matrix<double,0,0>()) );
+
+  CALL_SUBTEST_4( cholesky_verify_assert<Matrix3f>() );
+  CALL_SUBTEST_7( cholesky_verify_assert<Matrix3d>() );
+  CALL_SUBTEST_8( cholesky_verify_assert<MatrixXf>() );
+  CALL_SUBTEST_2( cholesky_verify_assert<MatrixXd>() );
+
+  // Test problem size constructors
+  CALL_SUBTEST_9( LLT<MatrixXf>(10) );
+  CALL_SUBTEST_9( LDLT<MatrixXf>(10) );
+
+  CALL_SUBTEST_2( cholesky_faillure_cases<void>() );
+
+  TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries)
+}

diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp
new file mode 100644
index 0000000..89b9cf4
--- /dev/null
+++ b/test/cholmod_support.cpp

@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#include <Eigen/CholmodSupport>
+
+template<typename SparseType> void test_cholmod_ST()
+{
+  CholmodDecomposition<SparseType, Lower> g_chol_colmajor_lower; g_chol_colmajor_lower.setMode(CholmodSupernodalLLt);
+  CholmodDecomposition<SparseType, Upper> g_chol_colmajor_upper; g_chol_colmajor_upper.setMode(CholmodSupernodalLLt);
+  CholmodDecomposition<SparseType, Lower> g_llt_colmajor_lower;  g_llt_colmajor_lower.setMode(CholmodSimplicialLLt);
+  CholmodDecomposition<SparseType, Upper> g_llt_colmajor_upper;  g_llt_colmajor_upper.setMode(CholmodSimplicialLLt);
+  CholmodDecomposition<SparseType, Lower> g_ldlt_colmajor_lower; g_ldlt_colmajor_lower.setMode(CholmodLDLt);
+  CholmodDecomposition<SparseType, Upper> g_ldlt_colmajor_upper; g_ldlt_colmajor_upper.setMode(CholmodLDLt);
+  
+  CholmodSupernodalLLT<SparseType, Lower> chol_colmajor_lower;
+  CholmodSupernodalLLT<SparseType, Upper> chol_colmajor_upper;
+  CholmodSimplicialLLT<SparseType, Lower> llt_colmajor_lower;
+  CholmodSimplicialLLT<SparseType, Upper> llt_colmajor_upper;
+  CholmodSimplicialLDLT<SparseType, Lower> ldlt_colmajor_lower;
+  CholmodSimplicialLDLT<SparseType, Upper> ldlt_colmajor_upper;
+
+  check_sparse_spd_solving(g_chol_colmajor_lower);
+  check_sparse_spd_solving(g_chol_colmajor_upper);
+  check_sparse_spd_solving(g_llt_colmajor_lower);
+  check_sparse_spd_solving(g_llt_colmajor_upper);
+  check_sparse_spd_solving(g_ldlt_colmajor_lower);
+  check_sparse_spd_solving(g_ldlt_colmajor_upper);
+  
+  check_sparse_spd_solving(chol_colmajor_lower);
+  check_sparse_spd_solving(chol_colmajor_upper);
+  check_sparse_spd_solving(llt_colmajor_lower);
+  check_sparse_spd_solving(llt_colmajor_upper);
+  check_sparse_spd_solving(ldlt_colmajor_lower);
+  check_sparse_spd_solving(ldlt_colmajor_upper);
+
+  check_sparse_spd_determinant(chol_colmajor_lower);
+  check_sparse_spd_determinant(chol_colmajor_upper);
+  check_sparse_spd_determinant(llt_colmajor_lower);
+  check_sparse_spd_determinant(llt_colmajor_upper);
+  check_sparse_spd_determinant(ldlt_colmajor_lower);
+  check_sparse_spd_determinant(ldlt_colmajor_upper);
+}
+
+template<typename T, int flags, typename IdxType> void test_cholmod_T()
+{
+    test_cholmod_ST<SparseMatrix<T, flags, IdxType> >();
+}
+
+EIGEN_DECLARE_TEST(cholmod_support)
+{
+  CALL_SUBTEST_11( (test_cholmod_T<double              , ColMajor, int >()) );
+  CALL_SUBTEST_12( (test_cholmod_T<double              , ColMajor, long>()) );
+  CALL_SUBTEST_13( (test_cholmod_T<double              , RowMajor, int >()) );
+  CALL_SUBTEST_14( (test_cholmod_T<double              , RowMajor, long>()) );
+  CALL_SUBTEST_21( (test_cholmod_T<std::complex<double>, ColMajor, int >()) );
+  CALL_SUBTEST_22( (test_cholmod_T<std::complex<double>, ColMajor, long>()) );
+  // TODO complex row-major matrices do not work at the moment:
+  // CALL_SUBTEST_23( (test_cholmod_T<std::complex<double>, RowMajor, int >()) );
+  // CALL_SUBTEST_24( (test_cholmod_T<std::complex<double>, RowMajor, long>()) );
+}

diff --git a/test/commainitializer.cpp b/test/commainitializer.cpp
new file mode 100644
index 0000000..eb275be
--- /dev/null
+++ b/test/commainitializer.cpp

@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+
+template<int M1, int M2, int N1, int N2>
+void test_blocks()
+{
+  Matrix<int, M1+M2, N1+N2> m_fixed;
+  MatrixXi m_dynamic(M1+M2, N1+N2);
+
+  Matrix<int, M1, N1> mat11; mat11.setRandom();
+  Matrix<int, M1, N2> mat12; mat12.setRandom();
+  Matrix<int, M2, N1> mat21; mat21.setRandom();
+  Matrix<int, M2, N2> mat22; mat22.setRandom();
+
+  MatrixXi matx11 = mat11, matx12 = mat12, matx21 = mat21, matx22 = mat22;
+
+  {
+    VERIFY_IS_EQUAL((m_fixed << mat11, mat12, mat21, matx22).finished(), (m_dynamic << mat11, matx12, mat21, matx22).finished());
+    VERIFY_IS_EQUAL((m_fixed.template topLeftCorner<M1,N1>()), mat11);
+    VERIFY_IS_EQUAL((m_fixed.template topRightCorner<M1,N2>()), mat12);
+    VERIFY_IS_EQUAL((m_fixed.template bottomLeftCorner<M2,N1>()), mat21);
+    VERIFY_IS_EQUAL((m_fixed.template bottomRightCorner<M2,N2>()), mat22);
+    VERIFY_IS_EQUAL((m_fixed << mat12, mat11, matx21, mat22).finished(), (m_dynamic << mat12, matx11, matx21, mat22).finished());
+  }
+
+  if(N1 > 0)
+  {
+    if(M1 > 0)
+    {
+      VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat11, mat21, mat22));
+    }
+    if(M2 > 0)
+    {
+      VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat21, mat21, mat22));
+    }
+  }
+  else
+  {
+    // allow insertion of zero-column blocks:
+    VERIFY_IS_EQUAL((m_fixed << mat11, mat12, mat11, mat11, mat21, mat21, mat22).finished(), (m_dynamic << mat12, mat22).finished());
+  }
+  if(M1 != M2)
+  {
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat21, mat12, mat22));
+  }
+}
+
+
+template<int depth, int N=0>
+struct test_block_recursion
+{
+  static void run()
+  {
+    test_block_recursion<depth-1, N>::run();
+    test_block_recursion<depth-1, N + (1 << (depth-1))>::run();
+  }
+};
+
+template<int N>
+struct test_block_recursion<0,N>
+{
+  static void run() {
+    test_blocks<(N>>6)&3, (N>>4)&3, (N>>2)&3, N & 3>();
+  }
+};
+
+void test_basics() {
+  Matrix3d m3;
+  Matrix4d m4;
+
+  VERIFY_RAISES_ASSERT( (m3 << 1, 2, 3, 4, 5, 6, 7, 8) );
+  
+  #ifndef _MSC_VER
+  VERIFY_RAISES_ASSERT( (m3 << 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) );
+  #endif
+
+  double data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  Matrix3d ref = Map<Matrix<double,3,3,RowMajor> >(data);
+
+  m3 = Matrix3d::Random();
+  m3 << 1, 2, 3, 4, 5, 6, 7, 8, 9;
+  VERIFY_IS_APPROX(m3, ref );
+
+  Vector3d vec[3];
+  vec[0] << 1, 4, 7;
+  vec[1] << 2, 5, 8;
+  vec[2] << 3, 6, 9;
+  m3 = Matrix3d::Random();
+  m3 << vec[0], vec[1], vec[2];
+  VERIFY_IS_APPROX(m3, ref);
+
+  vec[0] << 1, 2, 3;
+  vec[1] << 4, 5, 6;
+  vec[2] << 7, 8, 9;
+  m3 = Matrix3d::Random();
+  m3 << vec[0].transpose(),
+        4, 5, 6,
+        vec[2].transpose();
+  VERIFY_IS_APPROX(m3, ref);
+}
+
+EIGEN_DECLARE_TEST(commainitializer)
+{
+
+  CALL_SUBTEST_1(test_basics());
+
+  // recursively test all block-sizes from 0 to 3:
+  CALL_SUBTEST_2(test_block_recursion<8>::run());
+}

diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp
new file mode 100644
index 0000000..b076a12
--- /dev/null
+++ b/test/conjugate_gradient.cpp

@@ -0,0 +1,34 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+
+template<typename T, typename I_> void test_conjugate_gradient_T()
+{
+  typedef SparseMatrix<T,0,I_> SparseMatrixType;
+  ConjugateGradient<SparseMatrixType, Lower      > cg_colmajor_lower_diag;
+  ConjugateGradient<SparseMatrixType, Upper      > cg_colmajor_upper_diag;
+  ConjugateGradient<SparseMatrixType, Lower|Upper> cg_colmajor_loup_diag;
+  ConjugateGradient<SparseMatrixType, Lower, IdentityPreconditioner> cg_colmajor_lower_I;
+  ConjugateGradient<SparseMatrixType, Upper, IdentityPreconditioner> cg_colmajor_upper_I;
+
+  CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_loup_diag)   );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_I)     );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I)     );
+}
+
+EIGEN_DECLARE_TEST(conjugate_gradient)
+{
+  CALL_SUBTEST_1(( test_conjugate_gradient_T<double,int>() ));
+  CALL_SUBTEST_2(( test_conjugate_gradient_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_conjugate_gradient_T<double,long int>() ));
+}

diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp
new file mode 100644
index 0000000..d48eb12
--- /dev/null
+++ b/test/conservative_resize.cpp

@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+#include "AnnoyingScalar.h"
+
+using namespace Eigen;
+
+template <typename Scalar, int Storage>
+void run_matrix_tests()
+{
+  typedef Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Storage> MatrixType;
+
+  MatrixType m, n;
+
+  // boundary cases ...
+  m = n = MatrixType::Random(50,50);
+  m.conservativeResize(1,50);
+  VERIFY_IS_APPROX(m, n.block(0,0,1,50));
+
+  m = n = MatrixType::Random(50,50);
+  m.conservativeResize(50,1);
+  VERIFY_IS_APPROX(m, n.block(0,0,50,1));
+
+  m = n = MatrixType::Random(50,50);
+  m.conservativeResize(50,50);
+  VERIFY_IS_APPROX(m, n.block(0,0,50,50));
+
+  // random shrinking ...
+  for (int i=0; i<25; ++i)
+  {
+    const Index rows = internal::random<Index>(1,50);
+    const Index cols = internal::random<Index>(1,50);
+    m = n = MatrixType::Random(50,50);
+    m.conservativeResize(rows,cols);
+    VERIFY_IS_APPROX(m, n.block(0,0,rows,cols));
+  }
+
+  // random growing with zeroing ...
+  for (int i=0; i<25; ++i)
+  {
+    const Index rows = internal::random<Index>(50,75);
+    const Index cols = internal::random<Index>(50,75);
+    m = n = MatrixType::Random(50,50);
+    m.conservativeResizeLike(MatrixType::Zero(rows,cols));
+    VERIFY_IS_APPROX(m.block(0,0,n.rows(),n.cols()), n);
+    VERIFY( rows<=50 || m.block(50,0,rows-50,cols).sum() == Scalar(0) );
+    VERIFY( cols<=50 || m.block(0,50,rows,cols-50).sum() == Scalar(0) );
+  }
+}
+
+template <typename Scalar>
+void run_vector_tests()
+{
+  typedef Matrix<Scalar, 1, Eigen::Dynamic> VectorType;
+
+  VectorType m, n;
+
+  // boundary cases ...
+  m = n = VectorType::Random(50);
+  m.conservativeResize(1);
+  VERIFY_IS_APPROX(m, n.segment(0,1));
+
+  m = n = VectorType::Random(50);
+  m.conservativeResize(50);
+  VERIFY_IS_APPROX(m, n.segment(0,50));
+  
+  m = n = VectorType::Random(50);
+  m.conservativeResize(m.rows(),1);
+  VERIFY_IS_APPROX(m, n.segment(0,1));
+
+  m = n = VectorType::Random(50);
+  m.conservativeResize(m.rows(),50);
+  VERIFY_IS_APPROX(m, n.segment(0,50));
+
+  // random shrinking ...
+  for (int i=0; i<50; ++i)
+  {
+    const int size = internal::random<int>(1,50);
+    m = n = VectorType::Random(50);
+    m.conservativeResize(size);
+    VERIFY_IS_APPROX(m, n.segment(0,size));
+    
+    m = n = VectorType::Random(50);
+    m.conservativeResize(m.rows(), size);
+    VERIFY_IS_APPROX(m, n.segment(0,size));
+  }
+
+  // random growing with zeroing ...
+  for (int i=0; i<50; ++i)
+  {
+    const int size = internal::random<int>(50,100);
+    m = n = VectorType::Random(50);
+    m.conservativeResizeLike(VectorType::Zero(size));
+    VERIFY_IS_APPROX(m.segment(0,50), n);
+    VERIFY( size<=50 || m.segment(50,size-50).sum() == Scalar(0) );
+    
+    m = n = VectorType::Random(50);
+    m.conservativeResizeLike(Matrix<Scalar,Dynamic,Dynamic>::Zero(1,size));
+    VERIFY_IS_APPROX(m.segment(0,50), n);
+    VERIFY( size<=50 || m.segment(50,size-50).sum() == Scalar(0) );
+  }
+}
+
+// Basic memory leak check with a non-copyable scalar type
+template<int> void noncopyable()
+{
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
+
+  {
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    int n = 50;
+    VectorType v0(n), v1(n);
+    MatrixType m0(n,n), m1(n,n), m2(n,n);
+    v0.setOnes(); v1.setOnes();
+    m0.setOnes(); m1.setOnes(); m2.setOnes();
+    VERIFY(m0==m1);
+    m0.conservativeResize(2*n,2*n);
+    VERIFY(m0.topLeftCorner(n,n) == m1);
+    
+    VERIFY(v0.head(n) == v1);
+    v0.conservativeResize(2*n);
+    VERIFY(v0.head(n) == v1);
+  }
+  VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in noncopyable");
+}
+
+EIGEN_DECLARE_TEST(conservative_resize)
+{
+  for(int i=0; i<g_repeat; ++i)
+  {
+    CALL_SUBTEST_1((run_matrix_tests<int, Eigen::RowMajor>()));
+    CALL_SUBTEST_1((run_matrix_tests<int, Eigen::ColMajor>()));
+    CALL_SUBTEST_2((run_matrix_tests<float, Eigen::RowMajor>()));
+    CALL_SUBTEST_2((run_matrix_tests<float, Eigen::ColMajor>()));
+    CALL_SUBTEST_3((run_matrix_tests<double, Eigen::RowMajor>()));
+    CALL_SUBTEST_3((run_matrix_tests<double, Eigen::ColMajor>()));
+    CALL_SUBTEST_4((run_matrix_tests<std::complex<float>, Eigen::RowMajor>()));
+    CALL_SUBTEST_4((run_matrix_tests<std::complex<float>, Eigen::ColMajor>()));
+    CALL_SUBTEST_5((run_matrix_tests<std::complex<double>, Eigen::RowMajor>()));
+    CALL_SUBTEST_5((run_matrix_tests<std::complex<double>, Eigen::ColMajor>()));
+    CALL_SUBTEST_1((run_matrix_tests<int, Eigen::RowMajor | Eigen::DontAlign>()));
+
+    CALL_SUBTEST_1((run_vector_tests<int>()));
+    CALL_SUBTEST_2((run_vector_tests<float>()));
+    CALL_SUBTEST_3((run_vector_tests<double>()));
+    CALL_SUBTEST_4((run_vector_tests<std::complex<float> >()));
+    CALL_SUBTEST_5((run_vector_tests<std::complex<double> >()));
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    CALL_SUBTEST_6(( run_vector_tests<AnnoyingScalar>() ));
+    CALL_SUBTEST_6(( noncopyable<0>() ));
+  }
+}

diff --git a/test/constructor.cpp b/test/constructor.cpp
new file mode 100644
index 0000000..ffd5e80
--- /dev/null
+++ b/test/constructor.cpp

@@ -0,0 +1,98 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+
+template<typename MatrixType> struct Wrapper
+{
+  MatrixType m_mat;
+  inline Wrapper(const MatrixType &x) : m_mat(x) {}
+  inline operator const MatrixType& () const { return m_mat; }
+  inline operator MatrixType& () { return m_mat; }
+};
+
+enum my_sizes { M = 12, N = 7};
+
+template<typename MatrixType> void ctor_init1(const MatrixType& m)
+{
+  // Check logic in PlainObjectBase::_init1
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m0 = MatrixType::Random(rows,cols);
+
+  VERIFY_EVALUATION_COUNT( MatrixType m1(m0), 1);
+  VERIFY_EVALUATION_COUNT( MatrixType m2(m0+m0), 1);
+  VERIFY_EVALUATION_COUNT( MatrixType m2(m0.block(0,0,rows,cols)) , 1);
+
+  Wrapper<MatrixType> wrapper(m0);
+  VERIFY_EVALUATION_COUNT( MatrixType m3(wrapper) , 1);
+}
+
+
+EIGEN_DECLARE_TEST(constructor)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( ctor_init1(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( ctor_init1(Matrix4d()) );
+    CALL_SUBTEST_1( ctor_init1(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_1( ctor_init1(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  {
+    Matrix<Index,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Matrix<Index,1,1> a(123.0);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Matrix<float,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123.f);
+  }
+  {
+    Array<Index,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Array<Index,1,1> a(123.0);
+    VERIFY_IS_EQUAL(a[0], 123);
+  }
+  {
+    Array<float,1,1> a(123);
+    VERIFY_IS_EQUAL(a[0], 123.f);
+  }
+  {
+    Array<Index,3,3> a(123);
+    VERIFY_IS_EQUAL(a(4), 123);
+  }
+  {
+    Array<Index,3,3> a(123.0);
+    VERIFY_IS_EQUAL(a(4), 123);
+  }
+  {
+    Array<float,3,3> a(123);
+    VERIFY_IS_EQUAL(a(4), 123.f);
+  }
+  {
+    MatrixXi m1(M,N);
+    VERIFY_IS_EQUAL(m1.rows(),M);
+    VERIFY_IS_EQUAL(m1.cols(),N);
+    ArrayXXi a1(M,N);
+    VERIFY_IS_EQUAL(a1.rows(),M);
+    VERIFY_IS_EQUAL(a1.cols(),N);
+    VectorXi v1(M);
+    VERIFY_IS_EQUAL(v1.size(),M);
+    ArrayXi a2(M);
+    VERIFY_IS_EQUAL(a2.size(),M);
+  }
+}

diff --git a/test/corners.cpp b/test/corners.cpp
new file mode 100644
index 0000000..73342a8
--- /dev/null
+++ b/test/corners.cpp

@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#define COMPARE_CORNER(A,B) \
+  VERIFY_IS_EQUAL(matrix.A, matrix.B); \
+  VERIFY_IS_EQUAL(const_matrix.A, const_matrix.B);
+
+template<typename MatrixType> void corners(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  Index r = internal::random<Index>(1,rows);
+  Index c = internal::random<Index>(1,cols);
+
+  MatrixType matrix = MatrixType::Random(rows,cols);
+  const MatrixType const_matrix = MatrixType::Random(rows,cols);
+
+  COMPARE_CORNER(topLeftCorner(r,c), block(0,0,r,c));
+  COMPARE_CORNER(topRightCorner(r,c), block(0,cols-c,r,c));
+  COMPARE_CORNER(bottomLeftCorner(r,c), block(rows-r,0,r,c));
+  COMPARE_CORNER(bottomRightCorner(r,c), block(rows-r,cols-c,r,c));
+
+  Index sr = internal::random<Index>(1,rows) - 1;
+  Index nr = internal::random<Index>(1,rows-sr);
+  Index sc = internal::random<Index>(1,cols) - 1;
+  Index nc = internal::random<Index>(1,cols-sc);
+
+  COMPARE_CORNER(topRows(r), block(0,0,r,cols));
+  COMPARE_CORNER(middleRows(sr,nr), block(sr,0,nr,cols));
+  COMPARE_CORNER(bottomRows(r), block(rows-r,0,r,cols));
+  COMPARE_CORNER(leftCols(c), block(0,0,rows,c));
+  COMPARE_CORNER(middleCols(sc,nc), block(0,sc,rows,nc));
+  COMPARE_CORNER(rightCols(c), block(0,cols-c,rows,c));
+}
+
+template<typename MatrixType, int CRows, int CCols, int SRows, int SCols> void corners_fixedsize()
+{
+  MatrixType matrix = MatrixType::Random();
+  const MatrixType const_matrix = MatrixType::Random();
+
+  enum {
+    rows = MatrixType::RowsAtCompileTime,
+    cols = MatrixType::ColsAtCompileTime,
+    r = CRows,
+    c = CCols,
+	sr = SRows,
+	sc = SCols
+  };
+
+  VERIFY_IS_EQUAL((matrix.template topLeftCorner<r,c>()), (matrix.template block<r,c>(0,0)));
+  VERIFY_IS_EQUAL((matrix.template topRightCorner<r,c>()), (matrix.template block<r,c>(0,cols-c)));
+  VERIFY_IS_EQUAL((matrix.template bottomLeftCorner<r,c>()), (matrix.template block<r,c>(rows-r,0)));
+  VERIFY_IS_EQUAL((matrix.template bottomRightCorner<r,c>()), (matrix.template block<r,c>(rows-r,cols-c)));
+
+  VERIFY_IS_EQUAL((matrix.template topLeftCorner<r,c>()), (matrix.template topLeftCorner<r,Dynamic>(r,c)));
+  VERIFY_IS_EQUAL((matrix.template topRightCorner<r,c>()), (matrix.template topRightCorner<r,Dynamic>(r,c)));
+  VERIFY_IS_EQUAL((matrix.template bottomLeftCorner<r,c>()), (matrix.template bottomLeftCorner<r,Dynamic>(r,c)));
+  VERIFY_IS_EQUAL((matrix.template bottomRightCorner<r,c>()), (matrix.template bottomRightCorner<r,Dynamic>(r,c)));
+
+  VERIFY_IS_EQUAL((matrix.template topLeftCorner<r,c>()), (matrix.template topLeftCorner<Dynamic,c>(r,c)));
+  VERIFY_IS_EQUAL((matrix.template topRightCorner<r,c>()), (matrix.template topRightCorner<Dynamic,c>(r,c)));
+  VERIFY_IS_EQUAL((matrix.template bottomLeftCorner<r,c>()), (matrix.template bottomLeftCorner<Dynamic,c>(r,c)));
+  VERIFY_IS_EQUAL((matrix.template bottomRightCorner<r,c>()), (matrix.template bottomRightCorner<Dynamic,c>(r,c)));
+
+  VERIFY_IS_EQUAL((matrix.template topRows<r>()), (matrix.template block<r,cols>(0,0)));
+  VERIFY_IS_EQUAL((matrix.template middleRows<r>(sr)), (matrix.template block<r,cols>(sr,0)));
+  VERIFY_IS_EQUAL((matrix.template bottomRows<r>()), (matrix.template block<r,cols>(rows-r,0)));
+  VERIFY_IS_EQUAL((matrix.template leftCols<c>()), (matrix.template block<rows,c>(0,0)));
+  VERIFY_IS_EQUAL((matrix.template middleCols<c>(sc)), (matrix.template block<rows,c>(0,sc)));
+  VERIFY_IS_EQUAL((matrix.template rightCols<c>()), (matrix.template block<rows,c>(0,cols-c)));
+
+  VERIFY_IS_EQUAL((const_matrix.template topLeftCorner<r,c>()), (const_matrix.template block<r,c>(0,0)));
+  VERIFY_IS_EQUAL((const_matrix.template topRightCorner<r,c>()), (const_matrix.template block<r,c>(0,cols-c)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomLeftCorner<r,c>()), (const_matrix.template block<r,c>(rows-r,0)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomRightCorner<r,c>()), (const_matrix.template block<r,c>(rows-r,cols-c)));
+
+  VERIFY_IS_EQUAL((const_matrix.template topLeftCorner<r,c>()), (const_matrix.template topLeftCorner<r,Dynamic>(r,c)));
+  VERIFY_IS_EQUAL((const_matrix.template topRightCorner<r,c>()), (const_matrix.template topRightCorner<r,Dynamic>(r,c)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomLeftCorner<r,c>()), (const_matrix.template bottomLeftCorner<r,Dynamic>(r,c)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomRightCorner<r,c>()), (const_matrix.template bottomRightCorner<r,Dynamic>(r,c)));
+
+  VERIFY_IS_EQUAL((const_matrix.template topLeftCorner<r,c>()), (const_matrix.template topLeftCorner<Dynamic,c>(r,c)));
+  VERIFY_IS_EQUAL((const_matrix.template topRightCorner<r,c>()), (const_matrix.template topRightCorner<Dynamic,c>(r,c)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomLeftCorner<r,c>()), (const_matrix.template bottomLeftCorner<Dynamic,c>(r,c)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomRightCorner<r,c>()), (const_matrix.template bottomRightCorner<Dynamic,c>(r,c)));
+
+  VERIFY_IS_EQUAL((const_matrix.template topRows<r>()), (const_matrix.template block<r,cols>(0,0)));
+  VERIFY_IS_EQUAL((const_matrix.template middleRows<r>(sr)), (const_matrix.template block<r,cols>(sr,0)));
+  VERIFY_IS_EQUAL((const_matrix.template bottomRows<r>()), (const_matrix.template block<r,cols>(rows-r,0)));
+  VERIFY_IS_EQUAL((const_matrix.template leftCols<c>()), (const_matrix.template block<rows,c>(0,0)));
+  VERIFY_IS_EQUAL((const_matrix.template middleCols<c>(sc)), (const_matrix.template block<rows,c>(0,sc)));
+  VERIFY_IS_EQUAL((const_matrix.template rightCols<c>()), (const_matrix.template block<rows,c>(0,cols-c)));
+}
+
+EIGEN_DECLARE_TEST(corners)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( corners(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( corners(Matrix4d()) );
+    CALL_SUBTEST_3( corners(Matrix<int,10,12>()) );
+    CALL_SUBTEST_4( corners(MatrixXcf(5, 7)) );
+    CALL_SUBTEST_5( corners(MatrixXf(21, 20)) );
+
+    CALL_SUBTEST_1(( corners_fixedsize<Matrix<float, 1, 1>, 1, 1, 0, 0>() ));
+    CALL_SUBTEST_2(( corners_fixedsize<Matrix4d,2,2,1,1>() ));
+    CALL_SUBTEST_3(( corners_fixedsize<Matrix<int,10,12>,4,7,5,2>() ));
+  }
+}

diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp
new file mode 100644
index 0000000..7390417
--- /dev/null
+++ b/test/ctorleak.cpp

@@ -0,0 +1,81 @@
+#include "main.h"
+
+#include <exception>  // std::exception
+
+struct Foo
+{
+  static Index object_count;
+  static Index object_limit;
+  int dummy;
+
+  Foo() : dummy(0)
+  {
+#ifdef EIGEN_EXCEPTIONS
+    // TODO: Is this the correct way to handle this?
+    if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
+#endif
+	  std::cout << '+';
+    ++Foo::object_count;
+  }
+
+  ~Foo()
+  {
+	  std::cout << '-';
+    --Foo::object_count;
+  }
+
+  class Fail : public std::exception {};
+};
+
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
+
+#undef EIGEN_TEST_MAX_SIZE
+#define EIGEN_TEST_MAX_SIZE 3
+
+EIGEN_DECLARE_TEST(ctorleak)
+{
+  typedef Matrix<Foo, Dynamic, Dynamic> MatrixX;
+  typedef Matrix<Foo, Dynamic, 1> VectorX;
+  
+  Foo::object_count = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
+    Foo::object_limit = rows*cols;
+    {
+    MatrixX r(rows, cols);
+    Foo::object_limit = r.size()+internal::random<Index>(0, rows*cols - 2);
+    std::cout << "object_limit =" << Foo::object_limit << std::endl;
+#ifdef EIGEN_EXCEPTIONS
+    try
+    {
+#endif
+      if(internal::random<bool>()) {
+        std::cout <<       "\nMatrixX m(" << rows << ", " << cols << ");\n";
+        MatrixX m(rows, cols);
+      }
+      else {
+        std::cout <<       "\nMatrixX m(r);\n";
+        MatrixX m(r);
+      }
+#ifdef EIGEN_EXCEPTIONS
+      VERIFY(false);  // not reached if exceptions are enabled
+    }
+    catch (const Foo::Fail&) { /* ignore */ }
+#endif
+    }
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+    {
+      Foo::object_limit = (rows+1)*(cols+1);
+      MatrixX A(rows, cols);
+      VERIFY_IS_EQUAL(Foo::object_count, rows*cols);
+      VectorX v=A.row(0);
+      VERIFY_IS_EQUAL(Foo::object_count, (rows+1)*cols);
+      v = A.col(0);
+      VERIFY_IS_EQUAL(Foo::object_count, rows*(cols+1));
+    }
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+  }
+  std::cout << "\n";
+}

diff --git a/test/denseLM.cpp b/test/denseLM.cpp
new file mode 100644
index 0000000..afb8004
--- /dev/null
+++ b/test/denseLM.cpp

@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+#include "main.h"
+#include <Eigen/LevenbergMarquardt>
+using namespace std;
+using namespace Eigen;
+
+template<typename Scalar>
+struct DenseLM : DenseFunctor<Scalar>
+{
+  typedef DenseFunctor<Scalar> Base;
+  typedef typename Base::JacobianType JacobianType;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  
+  DenseLM(int n, int m) : DenseFunctor<Scalar>(n,m) 
+  { }
+ 
+  VectorType model(const VectorType& uv, VectorType& x)
+  {
+    VectorType y; // Should change to use expression template
+    int m = Base::values(); 
+    int n = Base::inputs();
+    eigen_assert(uv.size()%2 == 0);
+    eigen_assert(uv.size() == n);
+    eigen_assert(x.size() == m);
+    y.setZero(m);
+    int half = n/2;
+    VectorBlock<const VectorType> u(uv, 0, half);
+    VectorBlock<const VectorType> v(uv, half, half);
+    for (int j = 0; j < m; j++)
+    {
+      for (int i = 0; i < half; i++)
+        y(j) += u(i)*std::exp(-(x(j)-i)*(x(j)-i)/(v(i)*v(i)));
+    }
+    return y;
+    
+  }
+  void initPoints(VectorType& uv_ref, VectorType& x)
+  {
+    m_x = x;
+    m_y = this->model(uv_ref, x);
+  }
+  
+  int operator()(const VectorType& uv, VectorType& fvec)
+  {
+    
+    int m = Base::values(); 
+    int n = Base::inputs();
+    eigen_assert(uv.size()%2 == 0);
+    eigen_assert(uv.size() == n);
+    eigen_assert(fvec.size() == m);
+    int half = n/2;
+    VectorBlock<const VectorType> u(uv, 0, half);
+    VectorBlock<const VectorType> v(uv, half, half);
+    for (int j = 0; j < m; j++)
+    {
+      fvec(j) = m_y(j);
+      for (int i = 0; i < half; i++)
+      {
+        fvec(j) -= u(i) *std::exp(-(m_x(j)-i)*(m_x(j)-i)/(v(i)*v(i)));
+      }
+    }
+    
+    return 0;
+  }
+  int df(const VectorType& uv, JacobianType& fjac)
+  {
+    int m = Base::values(); 
+    int n = Base::inputs();
+    eigen_assert(n == uv.size());
+    eigen_assert(fjac.rows() == m);
+    eigen_assert(fjac.cols() == n);
+    int half = n/2;
+    VectorBlock<const VectorType> u(uv, 0, half);
+    VectorBlock<const VectorType> v(uv, half, half);
+    for (int j = 0; j < m; j++)
+    {
+      for (int i = 0; i < half; i++)
+      {
+        fjac.coeffRef(j,i) = -std::exp(-(m_x(j)-i)*(m_x(j)-i)/(v(i)*v(i)));
+        fjac.coeffRef(j,i+half) = -2.*u(i)*(m_x(j)-i)*(m_x(j)-i)/(std::pow(v(i),3)) * std::exp(-(m_x(j)-i)*(m_x(j)-i)/(v(i)*v(i)));
+      }
+    }
+    return 0;
+  }
+  VectorType m_x, m_y; //Data Points
+};
+
+template<typename FunctorType, typename VectorType>
+int test_minimizeLM(FunctorType& functor, VectorType& uv)
+{
+  LevenbergMarquardt<FunctorType> lm(functor);
+  LevenbergMarquardtSpace::Status info; 
+  
+  info = lm.minimize(uv);
+  
+  VERIFY_IS_EQUAL(info, 1);
+  //FIXME Check other parameters
+  return info;
+}
+
+template<typename FunctorType, typename VectorType>
+int test_lmder(FunctorType& functor, VectorType& uv)
+{
+  typedef typename VectorType::Scalar Scalar;
+  LevenbergMarquardtSpace::Status info; 
+  LevenbergMarquardt<FunctorType> lm(functor);
+  info = lm.lmder1(uv);
+  
+  VERIFY_IS_EQUAL(info, 1);
+  //FIXME Check other parameters
+  return info;
+}
+
+template<typename FunctorType, typename VectorType>
+int test_minimizeSteps(FunctorType& functor, VectorType& uv)
+{
+  LevenbergMarquardtSpace::Status info;   
+  LevenbergMarquardt<FunctorType> lm(functor);
+  info = lm.minimizeInit(uv);
+  if (info==LevenbergMarquardtSpace::ImproperInputParameters)
+      return info;
+  do 
+  {
+    info = lm.minimizeOneStep(uv);
+  } while (info==LevenbergMarquardtSpace::Running);
+  
+  VERIFY_IS_EQUAL(info, 1);
+  //FIXME Check other parameters
+  return info;
+}
+
+template<typename T>
+void test_denseLM_T()
+{
+  typedef Matrix<T,Dynamic,1> VectorType;
+  
+  int inputs = 10; 
+  int values = 1000; 
+  DenseLM<T> dense_gaussian(inputs, values);
+  VectorType uv(inputs),uv_ref(inputs);
+  VectorType x(values);
+  
+  // Generate the reference solution 
+  uv_ref << -2, 1, 4 ,8, 6, 1.8, 1.2, 1.1, 1.9 , 3;
+  
+  //Generate the reference data points
+  x.setRandom();
+  x = 10*x;
+  x.array() += 10;
+  dense_gaussian.initPoints(uv_ref, x);
+  
+  // Generate the initial parameters 
+  VectorBlock<VectorType> u(uv, 0, inputs/2); 
+  VectorBlock<VectorType> v(uv, inputs/2, inputs/2);
+  
+  // Solve the optimization problem
+  
+  //Solve in one go
+  u.setOnes(); v.setOnes();
+  test_minimizeLM(dense_gaussian, uv);
+  
+  //Solve until the machine precision
+  u.setOnes(); v.setOnes();
+  test_lmder(dense_gaussian, uv); 
+  
+  // Solve step by step
+  v.setOnes(); u.setOnes();
+  test_minimizeSteps(dense_gaussian, uv);
+  
+}
+
+EIGEN_DECLARE_TEST(denseLM)
+{
+  CALL_SUBTEST_2(test_denseLM_T<double>());
+  
+  // CALL_SUBTEST_2(test_sparseLM_T<std::complex<double>());
+}

diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
new file mode 100644
index 0000000..45c2bd7
--- /dev/null
+++ b/test/dense_storage.cpp

@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "AnnoyingScalar.h"
+#include "SafeScalar.h"
+
+#include <Eigen/Core>
+
+#if EIGEN_HAS_TYPE_TRAITS && EIGEN_HAS_CXX11
+using DenseStorageD3x3 = Eigen::DenseStorage<double, 3, 3, 3, 3>;
+static_assert(std::is_trivially_move_constructible<DenseStorageD3x3>::value, "DenseStorage not trivially_move_constructible");
+static_assert(std::is_trivially_move_assignable<DenseStorageD3x3>::value, "DenseStorage not trivially_move_assignable");
+#if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
+static_assert(std::is_trivially_copy_constructible<DenseStorageD3x3>::value, "DenseStorage not trivially_copy_constructible");
+static_assert(std::is_trivially_copy_assignable<DenseStorageD3x3>::value, "DenseStorage not trivially_copy_assignable");
+static_assert(std::is_trivially_copyable<DenseStorageD3x3>::value, "DenseStorage not trivially_copyable");
+#endif
+#endif
+
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_copy(int rows, int cols)
+{
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
+  
+  const int size = rows*cols;
+  DenseStorageType reference(size, rows, cols);
+  T* raw_reference = reference.data();
+  for (int i=0; i<size; ++i)
+    raw_reference[i] = static_cast<T>(i);
+    
+  DenseStorageType copied_reference(reference);
+  const T* raw_copied_reference = copied_reference.data();
+  for (int i=0; i<size; ++i)
+    VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
+}
+
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_assignment(int rows, int cols)
+{
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
+  
+  const int size = rows*cols;
+  DenseStorageType reference(size, rows, cols);
+  T* raw_reference = reference.data();
+  for (int i=0; i<size; ++i)
+    raw_reference[i] = static_cast<T>(i);
+    
+  DenseStorageType copied_reference;
+  copied_reference = reference;
+  const T* raw_copied_reference = copied_reference.data();
+  for (int i=0; i<size; ++i)
+    VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
+}
+
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_swap(int rows0, int cols0, int rows1, int cols1)
+{
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
+  
+  const int size0 = rows0*cols0;
+  DenseStorageType a(size0, rows0, cols0);
+  for (int i=0; i<size0; ++i) {
+    a.data()[i] = static_cast<T>(i);
+  }
+  
+  const int size1 = rows1*cols1;
+  DenseStorageType b(size1, rows1, cols1);
+  for (int i=0; i<size1; ++i) {
+    b.data()[i] = static_cast<T>(-i);
+  }
+  
+  a.swap(b);
+  
+  for (int i=0; i<size0; ++i) {
+    VERIFY_IS_EQUAL(b.data()[i], static_cast<T>(i));
+  }
+  
+  for (int i=0; i<size1; ++i) {
+    VERIFY_IS_EQUAL(a.data()[i], static_cast<T>(-i));
+  }
+}
+
+template<typename T, int Size, std::size_t Alignment>
+void dense_storage_alignment()
+{
+  #if EIGEN_HAS_ALIGNAS
+  
+  struct alignas(Alignment) Empty1 {};
+  VERIFY_IS_EQUAL(std::alignment_of<Empty1>::value, Alignment);
+
+  struct EIGEN_ALIGN_TO_BOUNDARY(Alignment) Empty2 {};
+  VERIFY_IS_EQUAL(std::alignment_of<Empty2>::value, Alignment);
+
+  struct Nested1 { EIGEN_ALIGN_TO_BOUNDARY(Alignment) T data[Size]; };
+  VERIFY_IS_EQUAL(std::alignment_of<Nested1>::value, Alignment);
+
+  VERIFY_IS_EQUAL( (std::alignment_of<internal::plain_array<T,Size,AutoAlign,Alignment> >::value), Alignment);
+
+  const std::size_t default_alignment = internal::compute_default_alignment<T,Size>::value;
+
+  VERIFY_IS_EQUAL( (std::alignment_of<DenseStorage<T,Size,1,1,AutoAlign> >::value), default_alignment);
+  VERIFY_IS_EQUAL( (std::alignment_of<Matrix<T,Size,1,AutoAlign> >::value), default_alignment);
+  struct Nested2 { Matrix<T,Size,1,AutoAlign> mat; };
+  VERIFY_IS_EQUAL(std::alignment_of<Nested2>::value, default_alignment);
+
+  #endif
+}
+
+template<typename T>
+void dense_storage_tests() {
+  // Dynamic Storage.
+  dense_storage_copy<T,Dynamic,Dynamic,Dynamic>(4, 3);  
+  dense_storage_copy<T,Dynamic,Dynamic,3>(4, 3);
+  dense_storage_copy<T,Dynamic,4,Dynamic>(4, 3);
+  // Fixed Storage.
+  dense_storage_copy<T,12,4,3>(4, 3);
+  dense_storage_copy<T,12,Dynamic,Dynamic>(4, 3);
+  dense_storage_copy<T,12,4,Dynamic>(4, 3);
+  dense_storage_copy<T,12,Dynamic,3>(4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_copy<T,18,Dynamic,Dynamic>(4, 3);
+  dense_storage_copy<T,18,4,Dynamic>(4, 3);
+  dense_storage_copy<T,18,Dynamic,3>(4, 3);
+  
+  // Dynamic Storage.
+  dense_storage_assignment<T,Dynamic,Dynamic,Dynamic>(4, 3);  
+  dense_storage_assignment<T,Dynamic,Dynamic,3>(4, 3);
+  dense_storage_assignment<T,Dynamic,4,Dynamic>(4, 3);
+  // Fixed Storage.
+  dense_storage_assignment<T,12,4,3>(4, 3);
+  dense_storage_assignment<T,12,Dynamic,Dynamic>(4, 3);
+  dense_storage_assignment<T,12,4,Dynamic>(4, 3);
+  dense_storage_assignment<T,12,Dynamic,3>(4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_assignment<T,18,Dynamic,Dynamic>(4, 3);
+  dense_storage_assignment<T,18,4,Dynamic>(4, 3);
+  dense_storage_assignment<T,18,Dynamic,3>(4, 3);
+  
+  // Dynamic Storage.
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(4, 3, 4, 3); 
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(4, 3, 2, 1);  
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(2, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 1, 4, 3);
+  // Fixed Storage.
+  dense_storage_swap<T,12,4,3>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(4, 3, 2, 1);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,12,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,12,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,12,4,Dynamic>(4, 1, 4, 3);
+  dense_storage_swap<T,12,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,12,Dynamic,3>(2, 3, 4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_swap<T,18,Dynamic,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,18,Dynamic,Dynamic>(4, 3, 2, 1);
+  dense_storage_swap<T,18,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,18,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,18,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,18,4,Dynamic>(4, 1, 4, 3);
+  dense_storage_swap<T,18,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,18,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,18,Dynamic,3>(2, 3, 4, 3);
+  
+  dense_storage_alignment<T,16,8>();
+  dense_storage_alignment<T,16,16>();
+  dense_storage_alignment<T,16,32>();
+  dense_storage_alignment<T,16,64>();
+}
+
+EIGEN_DECLARE_TEST(dense_storage)
+{
+  dense_storage_tests<int>();
+  dense_storage_tests<float>();
+  dense_storage_tests<SafeScalar<float> >();
+  dense_storage_tests<AnnoyingScalar>();
+}

diff --git a/test/determinant.cpp b/test/determinant.cpp
new file mode 100644
index 0000000..7dd33c3
--- /dev/null
+++ b/test/determinant.cpp

@@ -0,0 +1,66 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+
+template<typename MatrixType> void determinant(const MatrixType& m)
+{
+  /* this test covers the following files:
+     Determinant.h
+  */
+  Index size = m.rows();
+
+  MatrixType m1(size, size), m2(size, size);
+  m1.setRandom();
+  m2.setRandom();
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar x = internal::random<Scalar>();
+  VERIFY_IS_APPROX(MatrixType::Identity(size, size).determinant(), Scalar(1));
+  VERIFY_IS_APPROX((m1*m2).eval().determinant(), m1.determinant() * m2.determinant());
+  if(size==1) return;
+  Index i = internal::random<Index>(0, size-1);
+  Index j;
+  do {
+    j = internal::random<Index>(0, size-1);
+  } while(j==i);
+  m2 = m1;
+  m2.row(i).swap(m2.row(j));
+  VERIFY_IS_APPROX(m2.determinant(), -m1.determinant());
+  m2 = m1;
+  m2.col(i).swap(m2.col(j));
+  VERIFY_IS_APPROX(m2.determinant(), -m1.determinant());
+  VERIFY_IS_APPROX(m2.determinant(), m2.transpose().determinant());
+  VERIFY_IS_APPROX(numext::conj(m2.determinant()), m2.adjoint().determinant());
+  m2 = m1;
+  m2.row(i) += x*m2.row(j);
+  VERIFY_IS_APPROX(m2.determinant(), m1.determinant());
+  m2 = m1;
+  m2.row(i) *= x;
+  VERIFY_IS_APPROX(m2.determinant(), m1.determinant() * x);
+  
+  // check empty matrix
+  VERIFY_IS_APPROX(m2.block(0,0,0,0).determinant(), Scalar(1));
+}
+
+EIGEN_DECLARE_TEST(determinant)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int s = 0;
+    CALL_SUBTEST_1( determinant(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( determinant(Matrix<double, 2, 2>()) );
+    CALL_SUBTEST_3( determinant(Matrix<double, 3, 3>()) );
+    CALL_SUBTEST_4( determinant(Matrix<double, 4, 4>()) );
+    CALL_SUBTEST_5( determinant(Matrix<std::complex<double>, 10, 10>()) );
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_6( determinant(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+}

diff --git a/test/diagonal.cpp b/test/diagonal.cpp
new file mode 100644
index 0000000..4e8c4b3
--- /dev/null
+++ b/test/diagonal.cpp

@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void diagonal(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols);
+
+  Scalar s1 = internal::random<Scalar>();
+
+  //check diagonal()
+  VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
+  m2.diagonal() = 2 * m1.diagonal();
+  m2.diagonal()[0] *= 3;
+
+  if (rows>2)
+  {
+    enum {
+      N1 = MatrixType::RowsAtCompileTime>2 ?  2 : 0,
+      N2 = MatrixType::RowsAtCompileTime>1 ? -1 : 0
+    };
+
+    // check sub/super diagonal
+    if(MatrixType::SizeAtCompileTime!=Dynamic)
+    {
+      VERIFY(m1.template diagonal<N1>().RowsAtCompileTime == m1.diagonal(N1).size());
+      VERIFY(m1.template diagonal<N2>().RowsAtCompileTime == m1.diagonal(N2).size());
+    }
+
+    m2.template diagonal<N1>() = 2 * m1.template diagonal<N1>();
+    VERIFY_IS_APPROX(m2.template diagonal<N1>(), static_cast<Scalar>(2) * m1.diagonal(N1));
+    m2.template diagonal<N1>()[0] *= 3;
+    VERIFY_IS_APPROX(m2.template diagonal<N1>()[0], static_cast<Scalar>(6) * m1.template diagonal<N1>()[0]);
+
+
+    m2.template diagonal<N2>() = 2 * m1.template diagonal<N2>();
+    m2.template diagonal<N2>()[0] *= 3;
+    VERIFY_IS_APPROX(m2.template diagonal<N2>()[0], static_cast<Scalar>(6) * m1.template diagonal<N2>()[0]);
+
+    m2.diagonal(N1) = 2 * m1.diagonal(N1);
+    VERIFY_IS_APPROX(m2.template diagonal<N1>(), static_cast<Scalar>(2) * m1.diagonal(N1));
+    m2.diagonal(N1)[0] *= 3;
+    VERIFY_IS_APPROX(m2.diagonal(N1)[0], static_cast<Scalar>(6) * m1.diagonal(N1)[0]);
+
+    m2.diagonal(N2) = 2 * m1.diagonal(N2);
+    VERIFY_IS_APPROX(m2.template diagonal<N2>(), static_cast<Scalar>(2) * m1.diagonal(N2));
+    m2.diagonal(N2)[0] *= 3;
+    VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast<Scalar>(6) * m1.diagonal(N2)[0]);
+
+    m2.diagonal(N2).x() = s1;
+    VERIFY_IS_APPROX(m2.diagonal(N2).x(), s1);
+    m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
+    VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
+  }
+
+  VERIFY( m1.diagonal( cols).size()==0 );
+  VERIFY( m1.diagonal(-rows).size()==0 );
+}
+
+template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols);
+
+  if (rows>=2 && cols>=2)
+  {
+    VERIFY_RAISES_ASSERT( m1 += m1.diagonal() );
+    VERIFY_RAISES_ASSERT( m1 -= m1.diagonal() );
+    VERIFY_RAISES_ASSERT( m1.array() *= m1.diagonal().array() );
+    VERIFY_RAISES_ASSERT( m1.array() /= m1.diagonal().array() );
+  }
+
+  VERIFY_RAISES_ASSERT( m1.diagonal(cols+1) );
+  VERIFY_RAISES_ASSERT( m1.diagonal(-(rows+1)) );
+}
+
+EIGEN_DECLARE_TEST(diagonal)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( diagonal(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( diagonal(Matrix<float, 4, 9>()) );
+    CALL_SUBTEST_1( diagonal(Matrix<float, 7, 3>()) );
+    CALL_SUBTEST_2( diagonal(Matrix4d()) );
+    CALL_SUBTEST_2( diagonal(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( diagonal(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( diagonal(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_1( diagonal(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_1( diagonal(Matrix<float,Dynamic,4>(3, 4)) );
+    CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+}

diff --git a/test/diagonal_matrix_variadic_ctor.cpp b/test/diagonal_matrix_variadic_ctor.cpp
new file mode 100644
index 0000000..fbc8f84
--- /dev/null
+++ b/test/diagonal_matrix_variadic_ctor.cpp

@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 David Tellenbach <david.tellenbach@tellnotes.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+template <typename Scalar>
+void assertionTest()
+{
+  typedef DiagonalMatrix<Scalar, 5> DiagMatrix5;
+  typedef DiagonalMatrix<Scalar, 7> DiagMatrix7;
+  typedef DiagonalMatrix<Scalar, Dynamic> DiagMatrixX;
+
+  Scalar raw[6];
+  for (int i = 0; i < 6; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+
+  VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[2], raw[3]}));
+  VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[3]}));
+  VERIFY_RAISES_ASSERT((DiagMatrix7{raw[0], raw[1], raw[2], raw[3]}));
+
+  VERIFY_RAISES_ASSERT((DiagMatrixX {
+    {raw[0], raw[1], raw[2]},
+    {raw[3], raw[4], raw[5]}
+  }));
+}
+
+#define VERIFY_IMPLICIT_CONVERSION_3(DIAGTYPE, V0, V1, V2) \
+  DIAGTYPE d(V0, V1, V2);                                  \
+  DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix();     \
+  VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0);               \
+  VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1);               \
+  VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2);
+
+#define VERIFY_IMPLICIT_CONVERSION_4(DIAGTYPE, V0, V1, V2, V3) \
+  DIAGTYPE d(V0, V1, V2, V3);                                  \
+  DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix();         \
+  VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0);                   \
+  VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1);                   \
+  VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2);                   \
+  VERIFY_IS_APPROX(Dense(3, 3), (Scalar)V3);
+
+#define VERIFY_IMPLICIT_CONVERSION_5(DIAGTYPE, V0, V1, V2, V3, V4) \
+  DIAGTYPE d(V0, V1, V2, V3, V4);                                  \
+  DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix();             \
+  VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0);                       \
+  VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1);                       \
+  VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2);                       \
+  VERIFY_IS_APPROX(Dense(3, 3), (Scalar)V3);                       \
+  VERIFY_IS_APPROX(Dense(4, 4), (Scalar)V4);
+
+template<typename Scalar>
+void constructorTest()
+{
+  typedef DiagonalMatrix<Scalar, 0> DiagonalMatrix0;
+  typedef DiagonalMatrix<Scalar, 3> DiagonalMatrix3;
+  typedef DiagonalMatrix<Scalar, 4> DiagonalMatrix4;
+  typedef DiagonalMatrix<Scalar, Dynamic> DiagonalMatrixX;
+
+  Scalar raw[7];
+  for (int k = 0; k < 7; ++k) raw[k] = internal::random<Scalar>();
+
+  // Fixed-sized matrices
+  {
+    DiagonalMatrix0 a {{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.cols() == 0);
+    typename DiagonalMatrix0::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix3 a {{raw[0], raw[1], raw[2]}};
+    VERIFY(a.rows() == 3);
+    VERIFY(a.cols() == 3);
+    typename DiagonalMatrix3::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix4 a {{raw[0], raw[1], raw[2], raw[3]}};
+    VERIFY(a.rows() == 4);
+    VERIFY(a.cols() == 4);
+    typename DiagonalMatrix4::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+
+  // dynamically sized matrices
+  {
+    DiagonalMatrixX a{{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.rows() == 0);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrixX a{{raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6]}};
+    VERIFY(a.rows() == 7);
+    VERIFY(a.rows() == 7);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+}
+
+template<>
+void constructorTest<float>()
+{
+  typedef float Scalar;
+
+  typedef DiagonalMatrix<Scalar, 0> DiagonalMatrix0;
+  typedef DiagonalMatrix<Scalar, 3> DiagonalMatrix3;
+  typedef DiagonalMatrix<Scalar, 4> DiagonalMatrix4;
+  typedef DiagonalMatrix<Scalar, 5> DiagonalMatrix5;
+  typedef DiagonalMatrix<Scalar, Dynamic> DiagonalMatrixX;
+
+  Scalar raw[7];
+  for (int k = 0; k < 7; ++k) raw[k] = internal::random<Scalar>();
+
+  // Fixed-sized matrices
+  {
+    DiagonalMatrix0 a {{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.cols() == 0);
+    typename DiagonalMatrix0::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix3 a {{raw[0], raw[1], raw[2]}};
+    VERIFY(a.rows() == 3);
+    VERIFY(a.cols() == 3);
+    typename DiagonalMatrix3::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix4 a {{raw[0], raw[1], raw[2], raw[3]}};
+    VERIFY(a.rows() == 4);
+    VERIFY(a.cols() == 4);
+    typename DiagonalMatrix4::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+
+  // dynamically sized matrices
+  {
+    DiagonalMatrixX a{{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.rows() == 0);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrixX a{{raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6]}};
+    VERIFY(a.rows() == 7);
+    VERIFY(a.rows() == 7);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  { VERIFY_IMPLICIT_CONVERSION_3(DiagonalMatrix3, 1.2647, 2.56f, -3); }
+  { VERIFY_IMPLICIT_CONVERSION_4(DiagonalMatrix4, 1.2647, 2.56f, -3, 3.23f); }
+  { VERIFY_IMPLICIT_CONVERSION_5(DiagonalMatrix5, 1.2647, 2.56f, -3, 3.23f, 2); }
+}
+
+EIGEN_DECLARE_TEST(diagonal_matrix_variadic_ctor)
+{
+  CALL_SUBTEST_1(assertionTest<unsigned char>());
+  CALL_SUBTEST_1(assertionTest<float>());
+  CALL_SUBTEST_1(assertionTest<Index>());
+  CALL_SUBTEST_1(assertionTest<int>());
+  CALL_SUBTEST_1(assertionTest<long int>());
+  CALL_SUBTEST_1(assertionTest<std::ptrdiff_t>());
+  CALL_SUBTEST_1(assertionTest<std::complex<double>>());
+
+  CALL_SUBTEST_2(constructorTest<unsigned char>());
+  CALL_SUBTEST_2(constructorTest<float>());
+  CALL_SUBTEST_2(constructorTest<Index>());
+  CALL_SUBTEST_2(constructorTest<int>());
+  CALL_SUBTEST_2(constructorTest<long int>());
+  CALL_SUBTEST_2(constructorTest<std::ptrdiff_t>());
+  CALL_SUBTEST_2(constructorTest<std::complex<double>>());
+}

diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
new file mode 100644
index 0000000..276bead
--- /dev/null
+++ b/test/diagonalmatrices.cpp

@@ -0,0 +1,173 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+using namespace std;
+template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
+  typedef Matrix<Scalar, Rows, 1> VectorType;
+  typedef Matrix<Scalar, 1, Cols> RowVectorType;
+  typedef Matrix<Scalar, Rows, Rows> SquareMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
+  typedef DiagonalMatrix<Scalar, Rows> LeftDiagonalMatrix;
+  typedef DiagonalMatrix<Scalar, Cols> RightDiagonalMatrix;
+  typedef Matrix<Scalar, Rows==Dynamic?Dynamic:2*Rows, Cols==Dynamic?Dynamic:2*Cols> BigMatrix;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols);
+  VectorType v1 = VectorType::Random(rows),
+             v2 = VectorType::Random(rows);
+  RowVectorType rv1 = RowVectorType::Random(cols),
+             rv2 = RowVectorType::Random(cols);
+
+  LeftDiagonalMatrix ldm1(v1), ldm2(v2);
+  RightDiagonalMatrix rdm1(rv1), rdm2(rv2);
+  
+  Scalar s1 = internal::random<Scalar>();
+
+  SquareMatrixType sq_m1 (v1.asDiagonal());
+  VERIFY_IS_APPROX(sq_m1, v1.asDiagonal().toDenseMatrix());
+  sq_m1 = v1.asDiagonal();
+  VERIFY_IS_APPROX(sq_m1, v1.asDiagonal().toDenseMatrix());
+  SquareMatrixType sq_m2 = v1.asDiagonal();
+  VERIFY_IS_APPROX(sq_m1, sq_m2);
+  
+  ldm1 = v1.asDiagonal();
+  LeftDiagonalMatrix ldm3(v1);
+  VERIFY_IS_APPROX(ldm1.diagonal(), ldm3.diagonal());
+  LeftDiagonalMatrix ldm4 = v1.asDiagonal();
+  VERIFY_IS_APPROX(ldm1.diagonal(), ldm4.diagonal());
+  
+  sq_m1.block(0,0,rows,rows) = ldm1;
+  VERIFY_IS_APPROX(sq_m1, ldm1.toDenseMatrix());
+  sq_m1.transpose() = ldm1;
+  VERIFY_IS_APPROX(sq_m1, ldm1.toDenseMatrix());
+  
+  Index i = internal::random<Index>(0, rows-1);
+  Index j = internal::random<Index>(0, cols-1);
+  
+  VERIFY_IS_APPROX( ((ldm1 * m1)(i,j))  , ldm1.diagonal()(i) * m1(i,j) );
+  VERIFY_IS_APPROX( ((ldm1 * (m1+m2))(i,j))  , ldm1.diagonal()(i) * (m1+m2)(i,j) );
+  VERIFY_IS_APPROX( ((m1 * rdm1)(i,j))  , rdm1.diagonal()(j) * m1(i,j) );
+  VERIFY_IS_APPROX( ((v1.asDiagonal() * m1)(i,j))  , v1(i) * m1(i,j) );
+  VERIFY_IS_APPROX( ((m1 * rv1.asDiagonal())(i,j))  , rv1(j) * m1(i,j) );
+  VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * m1)(i,j))  , (v1+v2)(i) * m1(i,j) );
+  VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j))  , (v1+v2)(i) * (m1+m2)(i,j) );
+  VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j))  , (rv1+rv2)(j) * m1(i,j) );
+  VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j))  , (rv1+rv2)(j) * (m1+m2)(i,j) );
+  
+  if(rows>1)
+  {
+    DynMatrixType tmp = m1.topRows(rows/2), res;
+    VERIFY_IS_APPROX( (res = m1.topRows(rows/2) * rv1.asDiagonal()), tmp * rv1.asDiagonal() );
+    VERIFY_IS_APPROX( (res = v1.head(rows/2).asDiagonal()*m1.topRows(rows/2)), v1.head(rows/2).asDiagonal()*tmp );
+  }
+
+  BigMatrix big;
+  big.setZero(2*rows, 2*cols);
+  
+  big.block(i,j,rows,cols) = m1;
+  big.block(i,j,rows,cols) = v1.asDiagonal() * big.block(i,j,rows,cols);
+  
+  VERIFY_IS_APPROX((big.block(i,j,rows,cols)) , v1.asDiagonal() * m1 );
+  
+  big.block(i,j,rows,cols) = m1;
+  big.block(i,j,rows,cols) = big.block(i,j,rows,cols) * rv1.asDiagonal();
+  VERIFY_IS_APPROX((big.block(i,j,rows,cols)) , m1 * rv1.asDiagonal() );
+  
+  
+  // scalar multiple
+  VERIFY_IS_APPROX(LeftDiagonalMatrix(ldm1*s1).diagonal(), ldm1.diagonal() * s1);
+  VERIFY_IS_APPROX(LeftDiagonalMatrix(s1*ldm1).diagonal(), s1 * ldm1.diagonal());
+  
+  VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1);
+  VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1);
+  
+  // Diagonal to dense
+  sq_m1.setRandom();
+  sq_m2 = sq_m1;
+  VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() );
+  VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() );
+  VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() );
+
+  sq_m1.setRandom();
+  sq_m2 = v1.asDiagonal();
+  sq_m2 = sq_m1 * sq_m2;
+  VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
+  VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+
+  sq_m1 = v1.asDiagonal();
+  sq_m2 = v2.asDiagonal();
+  SquareMatrixType sq_m3 = v1.asDiagonal();
+  VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() + v2.asDiagonal(), sq_m1 + sq_m2);
+  VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - v2.asDiagonal(), sq_m1 - sq_m2);
+  VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - 2*v2.asDiagonal() + v1.asDiagonal(), sq_m1 - 2*sq_m2 + sq_m1);
+}
+
+template<typename MatrixType> void as_scalar_product(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> DynVectorType;
+  typedef Matrix<Scalar, 1, Dynamic> DynRowVectorType;
+
+  Index rows = m.rows();
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+
+  VectorType v1 = VectorType::Random(rows);  
+  DynVectorType     dv1  = DynVectorType::Random(depth);
+  DynRowVectorType  drv1 = DynRowVectorType::Random(depth);
+  DynMatrixType     dm1  = dv1;
+  DynMatrixType     drm1 = drv1;
+  
+  Scalar s = v1(0);
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+  VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+  VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
+}
+
+template<int>
+void bug987()
+{
+  Matrix3Xd points = Matrix3Xd::Random(3, 3);
+  Vector2d diag = Vector2d::Random();
+  Matrix2Xd tmp1 = points.topRows<2>(), res1, res2;
+  VERIFY_IS_APPROX( res1 = diag.asDiagonal() * points.topRows<2>(), res2 = diag.asDiagonal() * tmp1 );
+  Matrix2d tmp2 = points.topLeftCorner<2,2>();
+  VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
+}
+
+EIGEN_DECLARE_TEST(diagonalmatrices)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( as_scalar_product(Matrix<float, 1, 1>()) );
+
+    CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) );
+    CALL_SUBTEST_3( diagonalmatrices(Matrix<double,3,3,RowMajor>()) );
+    CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) );
+    CALL_SUBTEST_5( diagonalmatrices(Matrix<float,4,4,RowMajor>()) );
+    CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) );
+    CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
+    CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) );
+  }
+  CALL_SUBTEST_10( bug987<0>() );
+}

diff --git a/test/dontalign.cpp b/test/dontalign.cpp
new file mode 100644
index 0000000..2e4102b
--- /dev/null
+++ b/test/dontalign.cpp

@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined EIGEN_TEST_PART_1 || defined EIGEN_TEST_PART_2 || defined EIGEN_TEST_PART_3 || defined EIGEN_TEST_PART_4
+#define EIGEN_DONT_ALIGN
+#elif defined EIGEN_TEST_PART_5 || defined EIGEN_TEST_PART_6 || defined EIGEN_TEST_PART_7 || defined EIGEN_TEST_PART_8
+#define EIGEN_DONT_ALIGN_STATICALLY
+#endif
+
+#include "main.h"
+#include <Eigen/Dense>
+
+template<typename MatrixType>
+void dontalign(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  SquareMatrixType square = SquareMatrixType::Random(rows,rows);
+  VectorType v = VectorType::Random(rows);
+
+  VERIFY_IS_APPROX(v, square * square.colPivHouseholderQr().solve(v));
+  square = square.inverse().eval();
+  a = square * a;
+  square = square*square;
+  v = square * v;
+  v = a.adjoint() * v;
+  VERIFY(square.determinant() != Scalar(0));
+
+  // bug 219: MapAligned() was giving an assert with EIGEN_DONT_ALIGN, because Map Flags were miscomputed
+  Scalar* array = internal::aligned_new<Scalar>(rows);
+  v = VectorType::MapAligned(array, rows);
+  internal::aligned_delete(array, rows);
+}
+
+EIGEN_DECLARE_TEST(dontalign)
+{
+#if defined EIGEN_TEST_PART_1 || defined EIGEN_TEST_PART_5
+  dontalign(Matrix3d());
+  dontalign(Matrix4f());
+#elif defined EIGEN_TEST_PART_2 || defined EIGEN_TEST_PART_6
+  dontalign(Matrix3cd());
+  dontalign(Matrix4cf());
+#elif defined EIGEN_TEST_PART_3 || defined EIGEN_TEST_PART_7
+  dontalign(Matrix<float, 32, 32>());
+  dontalign(Matrix<std::complex<float>, 32, 32>());
+#elif defined EIGEN_TEST_PART_4 || defined EIGEN_TEST_PART_8
+  dontalign(MatrixXd(32, 32));
+  dontalign(MatrixXcf(32, 32));
+#endif
+}

diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp
new file mode 100644
index 0000000..23c90a7
--- /dev/null
+++ b/test/dynalloc.cpp

@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#if EIGEN_MAX_ALIGN_BYTES>0
+#define ALIGNMENT EIGEN_MAX_ALIGN_BYTES
+#else
+#define ALIGNMENT 1
+#endif
+
+typedef Matrix<float,16,1> Vector16f;
+typedef Matrix<float,8,1> Vector8f;
+
+void check_handmade_aligned_malloc()
+{
+  for(int i = 1; i < 1000; i++)
+  {
+    char *p = (char*)internal::handmade_aligned_malloc(i);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
+    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
+    for(int j = 0; j < i; j++) p[j]=0;
+    internal::handmade_aligned_free(p);
+  }
+}
+
+void check_aligned_malloc()
+{
+  for(int i = ALIGNMENT; i < 1000; i++)
+  {
+    char *p = (char*)internal::aligned_malloc(i);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
+    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
+    for(int j = 0; j < i; j++) p[j]=0;
+    internal::aligned_free(p);
+  }
+}
+
+void check_aligned_new()
+{
+  for(int i = ALIGNMENT; i < 1000; i++)
+  {
+    float *p = internal::aligned_new<float>(i);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
+    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
+    for(int j = 0; j < i; j++) p[j]=0;
+    internal::aligned_delete(p,i);
+  }
+}
+
+void check_aligned_stack_alloc()
+{
+  for(int i = ALIGNMENT; i < 400; i++)
+  {
+    ei_declare_aligned_stack_constructed_variable(float,p,i,0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
+    // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
+    for(int j = 0; j < i; j++) p[j]=0;
+  }
+}
+
+
+// test compilation with both a struct and a class...
+struct MyStruct
+{
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  char dummychar;
+  Vector16f avec;
+};
+
+class MyClassA
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+    char dummychar;
+    Vector16f avec;
+};
+
+template<typename T> void check_dynaligned()
+{
+  // TODO have to be updated once we support multiple alignment values
+  if(T::SizeAtCompileTime % ALIGNMENT == 0)
+  {
+    T* obj = new T;
+    VERIFY(T::NeedsToAlign==1);
+    VERIFY(internal::UIntPtr(obj)%ALIGNMENT==0);
+    delete obj;
+  }
+}
+
+template<typename T> void check_custom_new_delete()
+{
+  {
+    T* t = new T;
+    delete t;
+  }
+  
+  {
+    std::size_t N = internal::random<std::size_t>(1,10);
+    T* t = new T[N];
+    delete[] t;
+  }
+  
+#if EIGEN_MAX_ALIGN_BYTES>0 && (!EIGEN_HAS_CXX17_OVERALIGN)
+  {
+    T* t = static_cast<T *>((T::operator new)(sizeof(T)));
+    (T::operator delete)(t, sizeof(T));
+  }
+  
+  {
+    T* t = static_cast<T *>((T::operator new)(sizeof(T)));
+    (T::operator delete)(t);
+  }
+#endif
+}
+
+EIGEN_DECLARE_TEST(dynalloc)
+{
+  // low level dynamic memory allocation
+  CALL_SUBTEST(check_handmade_aligned_malloc());
+  CALL_SUBTEST(check_aligned_malloc());
+  CALL_SUBTEST(check_aligned_new());
+  CALL_SUBTEST(check_aligned_stack_alloc());
+
+  for (int i=0; i<g_repeat*100; ++i)
+  {
+    CALL_SUBTEST( check_custom_new_delete<Vector4f>() );
+    CALL_SUBTEST( check_custom_new_delete<Vector2f>() );
+    CALL_SUBTEST( check_custom_new_delete<Matrix4f>() );
+    CALL_SUBTEST( check_custom_new_delete<MatrixXi>() );
+  }
+  
+  // check static allocation, who knows ?
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES
+  for (int i=0; i<g_repeat*100; ++i)
+  {
+    CALL_SUBTEST(check_dynaligned<Vector4f>() );
+    CALL_SUBTEST(check_dynaligned<Vector2d>() );
+    CALL_SUBTEST(check_dynaligned<Matrix4f>() );
+    CALL_SUBTEST(check_dynaligned<Vector4d>() );
+    CALL_SUBTEST(check_dynaligned<Vector4i>() );
+    CALL_SUBTEST(check_dynaligned<Vector8f>() );
+    CALL_SUBTEST(check_dynaligned<Vector16f>() );
+  }
+
+  {
+    MyStruct foo0;  VERIFY(internal::UIntPtr(foo0.avec.data())%ALIGNMENT==0);
+    MyClassA fooA;  VERIFY(internal::UIntPtr(fooA.avec.data())%ALIGNMENT==0);
+  }
+  
+  // dynamic allocation, single object
+  for (int i=0; i<g_repeat*100; ++i)
+  {
+    MyStruct *foo0 = new MyStruct();  VERIFY(internal::UIntPtr(foo0->avec.data())%ALIGNMENT==0);
+    MyClassA *fooA = new MyClassA();  VERIFY(internal::UIntPtr(fooA->avec.data())%ALIGNMENT==0);
+    delete foo0;
+    delete fooA;
+  }
+
+  // dynamic allocation, array
+  const int N = 10;
+  for (int i=0; i<g_repeat*100; ++i)
+  {
+    MyStruct *foo0 = new MyStruct[N];  VERIFY(internal::UIntPtr(foo0->avec.data())%ALIGNMENT==0);
+    MyClassA *fooA = new MyClassA[N];  VERIFY(internal::UIntPtr(fooA->avec.data())%ALIGNMENT==0);
+    delete[] foo0;
+    delete[] fooA;
+  }
+  #endif
+  
+}

diff --git a/test/eigen2support.cpp b/test/eigen2support.cpp
new file mode 100644
index 0000000..49d7328
--- /dev/null
+++ b/test/eigen2support.cpp

@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN2_SUPPORT
+
+#include "main.h"
+
+template<typename MatrixType> void eigen2support(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+
+  Scalar  s1 = internal::random<Scalar>(),
+          s2 = internal::random<Scalar>();
+
+  // scalar addition
+  VERIFY_IS_APPROX(m1.cwise() + s1, s1 + m1.cwise());
+  VERIFY_IS_APPROX(m1.cwise() + s1, MatrixType::Constant(rows,cols,s1) + m1);
+  VERIFY_IS_APPROX((m1*Scalar(2)).cwise() - s2, (m1+m1) - MatrixType::Constant(rows,cols,s2) );
+  m3 = m1;
+  m3.cwise() += s2;
+  VERIFY_IS_APPROX(m3, m1.cwise() + s2);
+  m3 = m1;
+  m3.cwise() -= s1;
+  VERIFY_IS_APPROX(m3, m1.cwise() - s1);
+
+  VERIFY_IS_EQUAL((m1.corner(TopLeft,1,1)), (m1.block(0,0,1,1)));
+  VERIFY_IS_EQUAL((m1.template corner<1,1>(TopLeft)), (m1.template block<1,1>(0,0)));
+  VERIFY_IS_EQUAL((m1.col(0).start(1)), (m1.col(0).segment(0,1)));
+  VERIFY_IS_EQUAL((m1.col(0).template start<1>()), (m1.col(0).segment(0,1)));
+  VERIFY_IS_EQUAL((m1.col(0).end(1)), (m1.col(0).segment(rows-1,1)));
+  VERIFY_IS_EQUAL((m1.col(0).template end<1>()), (m1.col(0).segment(rows-1,1)));
+  
+  using std::cos;
+  using numext::real;
+  using numext::abs2;
+  VERIFY_IS_EQUAL(ei_cos(s1), cos(s1));
+  VERIFY_IS_EQUAL(ei_real(s1), real(s1));
+  VERIFY_IS_EQUAL(ei_abs2(s1), abs2(s1));
+
+  m1.minor(0,0);
+}
+
+EIGEN_DECLARE_TEST(eigen2support)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eigen2support(Matrix<double,1,1>()) );
+    CALL_SUBTEST_2( eigen2support(MatrixXd(1,1)) );
+    CALL_SUBTEST_4( eigen2support(Matrix3f()) );
+    CALL_SUBTEST_5( eigen2support(Matrix4d()) );
+    CALL_SUBTEST_2( eigen2support(MatrixXf(200,200)) );
+    CALL_SUBTEST_6( eigen2support(MatrixXcd(100,100)) );
+  }
+}

diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp
new file mode 100644
index 0000000..c5373f4
--- /dev/null
+++ b/test/eigensolver_complex.cpp

@@ -0,0 +1,176 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+#include <Eigen/LU>
+
+template<typename MatrixType> bool find_pivot(typename MatrixType::Scalar tol, MatrixType &diffs, Index col=0)
+{
+  bool match = diffs.diagonal().sum() <= tol;
+  if(match || col==diffs.cols())
+  {
+    return match;
+  }
+  else
+  {
+    Index n = diffs.cols();
+    std::vector<std::pair<Index,Index> > transpositions;
+    for(Index i=col; i<n; ++i)
+    {
+      Index best_index(0);
+      if(diffs.col(col).segment(col,n-i).minCoeff(&best_index) > tol)
+        break;
+      
+      best_index += col;
+      
+      diffs.row(col).swap(diffs.row(best_index));
+      if(find_pivot(tol,diffs,col+1)) return true;
+      diffs.row(col).swap(diffs.row(best_index));
+      
+      // move current pivot to the end
+      diffs.row(n-(i-col)-1).swap(diffs.row(best_index));
+      transpositions.push_back(std::pair<Index,Index>(n-(i-col)-1,best_index));
+    }
+    // restore
+    for(Index k=transpositions.size()-1; k>=0; --k)
+      diffs.row(transpositions[k].first).swap(diffs.row(transpositions[k].second));
+  }
+  return false;
+}
+
+/* Check that two column vectors are approximately equal up to permutations.
+ * Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(),
+ * however this strategy is numerically inacurate because of numerical cancellation issues.
+ */
+template<typename VectorType>
+void verify_is_approx_upto_permutation(const VectorType& vec1, const VectorType& vec2)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  VERIFY(vec1.cols() == 1);
+  VERIFY(vec2.cols() == 1);
+  VERIFY(vec1.rows() == vec2.rows());
+  
+  Index n = vec1.rows();
+  RealScalar tol = test_precision<RealScalar>()*test_precision<RealScalar>()*numext::maxi(vec1.squaredNorm(),vec2.squaredNorm());
+  Matrix<RealScalar,Dynamic,Dynamic> diffs = (vec1.rowwise().replicate(n) - vec2.rowwise().replicate(n).transpose()).cwiseAbs2();
+  
+  VERIFY( find_pivot(tol, diffs) );
+}
+
+
+template<typename MatrixType> void eigensolver(const MatrixType& m)
+{
+  /* this test covers the following files:
+     ComplexEigenSolver.h, and indirectly ComplexSchur.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  MatrixType symmA =  a.adjoint() * a;
+
+  ComplexEigenSolver<MatrixType> ei0(symmA);
+  VERIFY_IS_EQUAL(ei0.info(), Success);
+  VERIFY_IS_APPROX(symmA * ei0.eigenvectors(), ei0.eigenvectors() * ei0.eigenvalues().asDiagonal());
+
+  ComplexEigenSolver<MatrixType> ei1(a);
+  VERIFY_IS_EQUAL(ei1.info(), Success);
+  VERIFY_IS_APPROX(a * ei1.eigenvectors(), ei1.eigenvectors() * ei1.eigenvalues().asDiagonal());
+  // Note: If MatrixType is real then a.eigenvalues() uses EigenSolver and thus
+  // another algorithm so results may differ slightly
+  verify_is_approx_upto_permutation(a.eigenvalues(), ei1.eigenvalues());
+
+  ComplexEigenSolver<MatrixType> ei2;
+  ei2.setMaxIterations(ComplexSchur<MatrixType>::m_maxIterationsPerRow * rows).compute(a);
+  VERIFY_IS_EQUAL(ei2.info(), Success);
+  VERIFY_IS_EQUAL(ei2.eigenvectors(), ei1.eigenvectors());
+  VERIFY_IS_EQUAL(ei2.eigenvalues(), ei1.eigenvalues());
+  if (rows > 2) {
+    ei2.setMaxIterations(1).compute(a);
+    VERIFY_IS_EQUAL(ei2.info(), NoConvergence);
+    VERIFY_IS_EQUAL(ei2.getMaxIterations(), 1);
+  }
+
+  ComplexEigenSolver<MatrixType> eiNoEivecs(a, false);
+  VERIFY_IS_EQUAL(eiNoEivecs.info(), Success);
+  VERIFY_IS_APPROX(ei1.eigenvalues(), eiNoEivecs.eigenvalues());
+
+  // Regression test for issue #66
+  MatrixType z = MatrixType::Zero(rows,cols);
+  ComplexEigenSolver<MatrixType> eiz(z);
+  VERIFY((eiz.eigenvalues().cwiseEqual(0)).all());
+
+  MatrixType id = MatrixType::Identity(rows, cols);
+  VERIFY_IS_APPROX(id.operatorNorm(), RealScalar(1));
+
+  if (rows > 1 && rows < 20)
+  {
+    // Test matrix with NaN
+    a(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
+    ComplexEigenSolver<MatrixType> eiNaN(a);
+    VERIFY_IS_EQUAL(eiNaN.info(), NoConvergence);
+  }
+
+  // regression test for bug 1098
+  {
+    ComplexEigenSolver<MatrixType> eig(a.adjoint() * a);
+    eig.compute(a.adjoint() * a);
+  }
+
+  // regression test for bug 478
+  {
+    a.setZero();
+    ComplexEigenSolver<MatrixType> ei3(a);
+    VERIFY_IS_EQUAL(ei3.info(), Success);
+    VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
+    VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
+  }
+}
+
+template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
+{
+  ComplexEigenSolver<MatrixType> eig;
+  VERIFY_RAISES_ASSERT(eig.eigenvectors());
+  VERIFY_RAISES_ASSERT(eig.eigenvalues());
+
+  MatrixType a = MatrixType::Random(m.rows(),m.cols());
+  eig.compute(a, false);
+  VERIFY_RAISES_ASSERT(eig.eigenvectors());
+}
+
+EIGEN_DECLARE_TEST(eigensolver_complex)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eigensolver(Matrix4cf()) );
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_2( eigensolver(MatrixXcd(s,s)) );
+    CALL_SUBTEST_3( eigensolver(Matrix<std::complex<float>, 1, 1>()) );
+    CALL_SUBTEST_4( eigensolver(Matrix3f()) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+  CALL_SUBTEST_1( eigensolver_verify_assert(Matrix4cf()) );
+  s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+  CALL_SUBTEST_2( eigensolver_verify_assert(MatrixXcd(s,s)) );
+  CALL_SUBTEST_3( eigensolver_verify_assert(Matrix<std::complex<float>, 1, 1>()) );
+  CALL_SUBTEST_4( eigensolver_verify_assert(Matrix3f()) );
+
+  // Test problem size constructors
+  CALL_SUBTEST_5(ComplexEigenSolver<MatrixXf> tmp(s));
+  
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
+}

diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp
new file mode 100644
index 0000000..95ed431
--- /dev/null
+++ b/test/eigensolver_generalized_real.cpp

@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_RUNTIME_NO_MALLOC
+#include "main.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+#include <Eigen/LU>
+
+template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
+{
+  /* this test covers the following files:
+     GeneralizedEigenSolver.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef std::complex<Scalar> ComplexScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  MatrixType b = MatrixType::Random(rows,cols);
+  MatrixType a1 = MatrixType::Random(rows,cols);
+  MatrixType b1 = MatrixType::Random(rows,cols);
+  MatrixType spdA =  a.adjoint() * a + a1.adjoint() * a1;
+  MatrixType spdB =  b.adjoint() * b + b1.adjoint() * b1;
+
+  // lets compare to GeneralizedSelfAdjointEigenSolver
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
+    GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
+
+    VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
+
+    VectorType realEigenvalues = eig.eigenvalues().real();
+    std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
+    VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+
+    // check eigenvectors
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType D = eig.eigenvalues().asDiagonal();
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType V = eig.eigenvectors();
+    VERIFY_IS_APPROX(spdA*V, spdB*V*D);
+  }
+
+  // non symmetric case:
+  {
+    GeneralizedEigenSolver<MatrixType> eig(rows);
+    // TODO enable full-prealocation of required memory, this probably requires an in-place mode for HessenbergDecomposition
+    //Eigen::internal::set_is_malloc_allowed(false);
+    eig.compute(a,b);
+    //Eigen::internal::set_is_malloc_allowed(true);
+    for(Index k=0; k<cols; ++k)
+    {
+      Matrix<ComplexScalar,Dynamic,Dynamic> tmp = (eig.betas()(k)*a).template cast<ComplexScalar>() - eig.alphas()(k)*b;
+      if(tmp.size()>1 && tmp.norm()>(std::numeric_limits<Scalar>::min)())
+        tmp /= tmp.norm();
+      VERIFY_IS_MUCH_SMALLER_THAN( std::abs(tmp.determinant()), Scalar(1) );
+    }
+    // check eigenvectors
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType D = eig.eigenvalues().asDiagonal();
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType V = eig.eigenvectors();
+    VERIFY_IS_APPROX(a*V, b*V*D);
+  }
+
+  // regression test for bug 1098
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> eig1(a.adjoint() * a,b.adjoint() * b);
+    eig1.compute(a.adjoint() * a,b.adjoint() * b);
+    GeneralizedEigenSolver<MatrixType> eig2(a.adjoint() * a,b.adjoint() * b);
+    eig2.compute(a.adjoint() * a,b.adjoint() * b);
+  }
+
+  // check without eigenvectors
+  {
+    GeneralizedEigenSolver<MatrixType> eig1(spdA, spdB, true);
+    GeneralizedEigenSolver<MatrixType> eig2(spdA, spdB, false);
+    VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
+  }
+}
+
+EIGEN_DECLARE_TEST(eigensolver_generalized_real)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int s = 0;
+    CALL_SUBTEST_1( generalized_eigensolver_real(Matrix4f()) );
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(s,s)) );
+
+    // some trivial but implementation-wise special cases
+    CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(1,1)) );
+    CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) );
+    CALL_SUBTEST_3( generalized_eigensolver_real(Matrix<double,1,1>()) );
+    CALL_SUBTEST_4( generalized_eigensolver_real(Matrix2d()) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+}

diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
new file mode 100644
index 0000000..7adb986
--- /dev/null
+++ b/test/eigensolver_generic.cpp

@@ -0,0 +1,247 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+
+template<typename EigType,typename MatType>
+void check_eigensolver_for_given_mat(const EigType &eig, const MatType& a)
+{
+  typedef typename NumTraits<typename MatType::Scalar>::Real RealScalar;
+  typedef Matrix<RealScalar, MatType::RowsAtCompileTime, 1> RealVectorType;
+  typedef typename std::complex<RealScalar> Complex;
+  Index n = a.rows();
+  VERIFY_IS_EQUAL(eig.info(), Success);
+  VERIFY_IS_APPROX(a * eig.pseudoEigenvectors(), eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix());
+  VERIFY_IS_APPROX(a.template cast<Complex>() * eig.eigenvectors(),
+                   eig.eigenvectors() * eig.eigenvalues().asDiagonal());
+  VERIFY_IS_APPROX(eig.eigenvectors().colwise().norm(), RealVectorType::Ones(n).transpose());
+  VERIFY_IS_APPROX(a.eigenvalues(), eig.eigenvalues());
+}
+
+template<typename MatrixType> void eigensolver(const MatrixType& m)
+{
+  /* this test covers the following files:
+     EigenSolver.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename std::complex<RealScalar> Complex;
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  MatrixType a1 = MatrixType::Random(rows,cols);
+  MatrixType symmA =  a.adjoint() * a + a1.adjoint() * a1;
+
+  EigenSolver<MatrixType> ei0(symmA);
+  VERIFY_IS_EQUAL(ei0.info(), Success);
+  VERIFY_IS_APPROX(symmA * ei0.pseudoEigenvectors(), ei0.pseudoEigenvectors() * ei0.pseudoEigenvalueMatrix());
+  VERIFY_IS_APPROX((symmA.template cast<Complex>()) * (ei0.pseudoEigenvectors().template cast<Complex>()),
+    (ei0.pseudoEigenvectors().template cast<Complex>()) * (ei0.eigenvalues().asDiagonal()));
+
+  EigenSolver<MatrixType> ei1(a);
+  CALL_SUBTEST( check_eigensolver_for_given_mat(ei1,a) );
+
+  EigenSolver<MatrixType> ei2;
+  ei2.setMaxIterations(RealSchur<MatrixType>::m_maxIterationsPerRow * rows).compute(a);
+  VERIFY_IS_EQUAL(ei2.info(), Success);
+  VERIFY_IS_EQUAL(ei2.eigenvectors(), ei1.eigenvectors());
+  VERIFY_IS_EQUAL(ei2.eigenvalues(), ei1.eigenvalues());
+  if (rows > 2) {
+    ei2.setMaxIterations(1).compute(a);
+    VERIFY_IS_EQUAL(ei2.info(), NoConvergence);
+    VERIFY_IS_EQUAL(ei2.getMaxIterations(), 1);
+  }
+
+  EigenSolver<MatrixType> eiNoEivecs(a, false);
+  VERIFY_IS_EQUAL(eiNoEivecs.info(), Success);
+  VERIFY_IS_APPROX(ei1.eigenvalues(), eiNoEivecs.eigenvalues());
+  VERIFY_IS_APPROX(ei1.pseudoEigenvalueMatrix(), eiNoEivecs.pseudoEigenvalueMatrix());
+
+  MatrixType id = MatrixType::Identity(rows, cols);
+  VERIFY_IS_APPROX(id.operatorNorm(), RealScalar(1));
+
+  if (rows > 2 && rows < 20)
+  {
+    // Test matrix with NaN
+    a(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
+    EigenSolver<MatrixType> eiNaN(a);
+    VERIFY_IS_NOT_EQUAL(eiNaN.info(), Success);
+  }
+
+  // regression test for bug 1098
+  {
+    EigenSolver<MatrixType> eig(a.adjoint() * a);
+    eig.compute(a.adjoint() * a);
+  }
+
+  // regression test for bug 478
+  {
+    a.setZero();
+    EigenSolver<MatrixType> ei3(a);
+    VERIFY_IS_EQUAL(ei3.info(), Success);
+    VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
+    VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
+  }
+}
+
+template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
+{
+  EigenSolver<MatrixType> eig;
+  VERIFY_RAISES_ASSERT(eig.eigenvectors());
+  VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors());
+  VERIFY_RAISES_ASSERT(eig.pseudoEigenvalueMatrix());
+  VERIFY_RAISES_ASSERT(eig.eigenvalues());
+
+  MatrixType a = MatrixType::Random(m.rows(),m.cols());
+  eig.compute(a, false);
+  VERIFY_RAISES_ASSERT(eig.eigenvectors());
+  VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors());
+}
+
+
+template<typename CoeffType>
+Matrix<typename CoeffType::Scalar,Dynamic,Dynamic>
+make_companion(const CoeffType& coeffs)
+{
+  Index n = coeffs.size()-1;
+  Matrix<typename CoeffType::Scalar,Dynamic,Dynamic> res(n,n);
+  res.setZero();
+	res.row(0) = -coeffs.tail(n) / coeffs(0);
+	res.diagonal(-1).setOnes();
+  return res;
+}
+
+template<int>
+void eigensolver_generic_extra()
+{
+  {
+    // regression test for bug 793
+    MatrixXd a(3,3);
+    a << 0,  0,  1,
+        1,  1, 1,
+        1, 1e+200,  1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    double scale = 1e-200; // scale to avoid overflow during the comparisons
+    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
+    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
+  }
+  {
+    // check a case where all eigenvalues are null.
+    MatrixXd a(2,2);
+    a << 1,  1,
+        -1, -1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
+    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
+    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
+  }
+
+  // regression test for bug 933
+  {
+    {
+      VectorXd coeffs(5); coeffs << 1, -3, -175, -225, 2250;
+      MatrixXd C = make_companion(coeffs);
+      EigenSolver<MatrixXd> eig(C);
+      CALL_SUBTEST( check_eigensolver_for_given_mat(eig,C) );
+    }
+    {
+      // this test is tricky because it requires high accuracy in smallest eigenvalues
+      VectorXd coeffs(5); coeffs << 6.154671e-15, -1.003870e-10, -9.819570e-01, 3.995715e+03, 2.211511e+08;
+      MatrixXd C = make_companion(coeffs);
+      EigenSolver<MatrixXd> eig(C);
+      CALL_SUBTEST( check_eigensolver_for_given_mat(eig,C) );
+      Index n = C.rows();
+      for(Index i=0;i<n;++i)
+      {
+        typedef std::complex<double> Complex;
+        MatrixXcd ac = C.cast<Complex>();
+        ac.diagonal().array() -= eig.eigenvalues()(i);
+        VectorXd sv = ac.jacobiSvd().singularValues();
+        // comparing to sv(0) is not enough here to catch the "bug",
+        // the hard-coded 1.0 is important!
+        VERIFY_IS_MUCH_SMALLER_THAN(sv(n-1), 1.0);
+      }
+    }
+  }
+  // regression test for bug 1557
+  {
+    // this test is interesting because it contains zeros on the diagonal.
+    MatrixXd A_bug1557(3,3);
+    A_bug1557 << 0, 0, 0, 1, 0, 0.5887907064808635127, 0, 1, 0;
+    EigenSolver<MatrixXd> eig(A_bug1557);
+    CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1557) );
+  }
+
+  // regression test for bug 1174
+  {
+    Index n = 12;
+    MatrixXf A_bug1174(n,n);
+    A_bug1174 <<  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0;
+    EigenSolver<MatrixXf> eig(A_bug1174);
+    CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1174) );
+  }
+}
+
+EIGEN_DECLARE_TEST(eigensolver_generic)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eigensolver(Matrix4f()) );
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_2( eigensolver(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+
+    // some trivial but implementation-wise tricky cases
+    CALL_SUBTEST_2( eigensolver(MatrixXd(1,1)) );
+    CALL_SUBTEST_2( eigensolver(MatrixXd(2,2)) );
+    CALL_SUBTEST_3( eigensolver(Matrix<double,1,1>()) );
+    CALL_SUBTEST_4( eigensolver(Matrix2d()) );
+  }
+
+  CALL_SUBTEST_1( eigensolver_verify_assert(Matrix4f()) );
+  s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+  CALL_SUBTEST_2( eigensolver_verify_assert(MatrixXd(s,s)) );
+  CALL_SUBTEST_3( eigensolver_verify_assert(Matrix<double,1,1>()) );
+  CALL_SUBTEST_4( eigensolver_verify_assert(Matrix2d()) );
+
+  // Test problem size constructors
+  CALL_SUBTEST_5(EigenSolver<MatrixXf> tmp(s));
+
+  // regression test for bug 410
+  CALL_SUBTEST_2(
+  {
+     MatrixXd A(1,1);
+     A(0,0) = std::sqrt(-1.); // is Not-a-Number
+     Eigen::EigenSolver<MatrixXd> solver(A);
+     VERIFY_IS_EQUAL(solver.info(), NumericalIssue);
+  }
+  );
+  
+  CALL_SUBTEST_2( eigensolver_generic_extra<0>() );
+  
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
+}

diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
new file mode 100644
index 0000000..0fb2f4d
--- /dev/null
+++ b/test/eigensolver_selfadjoint.cpp

@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "svd_fill.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+#include <Eigen/SparseCore>
+
+
+template<typename MatrixType> void selfadjointeigensolver_essential_check(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar eival_eps = numext::mini<RealScalar>(test_precision<RealScalar>(),  NumTraits<Scalar>::dummy_precision()*20000);
+  
+  SelfAdjointEigenSolver<MatrixType> eiSymm(m);
+  VERIFY_IS_EQUAL(eiSymm.info(), Success);
+
+  RealScalar scaling = m.cwiseAbs().maxCoeff();
+
+  if(scaling<(std::numeric_limits<RealScalar>::min)())
+  {
+    VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+  }
+  else
+  {
+    VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors())/scaling,
+                     (eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal())/scaling);
+  }
+  VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
+  VERIFY_IS_UNITARY(eiSymm.eigenvectors());
+
+  if(m.cols()<=4)
+  {
+    SelfAdjointEigenSolver<MatrixType> eiDirect;
+    eiDirect.computeDirect(m);  
+    VERIFY_IS_EQUAL(eiDirect.info(), Success);
+    if(! eiSymm.eigenvalues().isApprox(eiDirect.eigenvalues(), eival_eps) )
+    {
+      std::cerr << "reference eigenvalues: " << eiSymm.eigenvalues().transpose() << "\n"
+                << "obtained eigenvalues:  " << eiDirect.eigenvalues().transpose() << "\n"
+                << "diff:                  " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).transpose() << "\n"
+                << "error (eps):           " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).norm() / eiSymm.eigenvalues().norm() << "  (" << eival_eps << ")\n";
+    }
+    if(scaling<(std::numeric_limits<RealScalar>::min)())
+    {
+      VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+    }
+    else
+    {
+      VERIFY_IS_APPROX(eiSymm.eigenvalues()/scaling, eiDirect.eigenvalues()/scaling);
+      VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors())/scaling,
+                       (eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal())/scaling);
+      VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues()/scaling, eiDirect.eigenvalues()/scaling);
+    }
+
+    VERIFY_IS_UNITARY(eiDirect.eigenvectors());
+  }
+}
+
+template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
+{
+  /* this test covers the following files:
+     EigenSolver.h, SelfAdjointEigenSolver.h (and indirectly: Tridiagonalization.h)
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  RealScalar largerEps = 10*test_precision<RealScalar>();
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  MatrixType a1 = MatrixType::Random(rows,cols);
+  MatrixType symmA =  a.adjoint() * a + a1.adjoint() * a1;
+  MatrixType symmC = symmA;
+  
+  svd_fill_random(symmA,Symmetric);
+
+  symmA.template triangularView<StrictlyUpper>().setZero();
+  symmC.template triangularView<StrictlyUpper>().setZero();
+
+  MatrixType b = MatrixType::Random(rows,cols);
+  MatrixType b1 = MatrixType::Random(rows,cols);
+  MatrixType symmB = b.adjoint() * b + b1.adjoint() * b1;
+  symmB.template triangularView<StrictlyUpper>().setZero();
+  
+  CALL_SUBTEST( selfadjointeigensolver_essential_check(symmA) );
+
+  SelfAdjointEigenSolver<MatrixType> eiSymm(symmA);
+  // generalized eigen pb
+  GeneralizedSelfAdjointEigenSolver<MatrixType> eiSymmGen(symmC, symmB);
+
+  SelfAdjointEigenSolver<MatrixType> eiSymmNoEivecs(symmA, false);
+  VERIFY_IS_EQUAL(eiSymmNoEivecs.info(), Success);
+  VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiSymmNoEivecs.eigenvalues());
+  
+  // generalized eigen problem Ax = lBx
+  eiSymmGen.compute(symmC, symmB,Ax_lBx);
+  VERIFY_IS_EQUAL(eiSymmGen.info(), Success);
+  VERIFY((symmC.template selfadjointView<Lower>() * eiSymmGen.eigenvectors()).isApprox(
+          symmB.template selfadjointView<Lower>() * (eiSymmGen.eigenvectors() * eiSymmGen.eigenvalues().asDiagonal()), largerEps));
+
+  // generalized eigen problem BAx = lx
+  eiSymmGen.compute(symmC, symmB,BAx_lx);
+  VERIFY_IS_EQUAL(eiSymmGen.info(), Success);
+  VERIFY((symmB.template selfadjointView<Lower>() * (symmC.template selfadjointView<Lower>() * eiSymmGen.eigenvectors())).isApprox(
+         (eiSymmGen.eigenvectors() * eiSymmGen.eigenvalues().asDiagonal()), largerEps));
+
+  // generalized eigen problem ABx = lx
+  eiSymmGen.compute(symmC, symmB,ABx_lx);
+  VERIFY_IS_EQUAL(eiSymmGen.info(), Success);
+  VERIFY((symmC.template selfadjointView<Lower>() * (symmB.template selfadjointView<Lower>() * eiSymmGen.eigenvectors())).isApprox(
+         (eiSymmGen.eigenvectors() * eiSymmGen.eigenvalues().asDiagonal()), largerEps));
+
+
+  eiSymm.compute(symmC);
+  MatrixType sqrtSymmA = eiSymm.operatorSqrt();
+  VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView<Lower>()), sqrtSymmA*sqrtSymmA);
+  VERIFY_IS_APPROX(sqrtSymmA, symmC.template selfadjointView<Lower>()*eiSymm.operatorInverseSqrt());
+
+  MatrixType id = MatrixType::Identity(rows, cols);
+  VERIFY_IS_APPROX(id.template selfadjointView<Lower>().operatorNorm(), RealScalar(1));
+
+  SelfAdjointEigenSolver<MatrixType> eiSymmUninitialized;
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.info());
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.eigenvalues());
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.eigenvectors());
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.operatorSqrt());
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.operatorInverseSqrt());
+
+  eiSymmUninitialized.compute(symmA, false);
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.eigenvectors());
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.operatorSqrt());
+  VERIFY_RAISES_ASSERT(eiSymmUninitialized.operatorInverseSqrt());
+
+  // test Tridiagonalization's methods
+  Tridiagonalization<MatrixType> tridiag(symmC);
+  VERIFY_IS_APPROX(tridiag.diagonal(), tridiag.matrixT().diagonal());
+  VERIFY_IS_APPROX(tridiag.subDiagonal(), tridiag.matrixT().template diagonal<-1>());
+  Matrix<RealScalar,Dynamic,Dynamic> T = tridiag.matrixT();
+  if(rows>1 && cols>1) {
+    // FIXME check that upper and lower part are 0:
+    //VERIFY(T.topRightCorner(rows-2, cols-2).template triangularView<Upper>().isZero());
+  }
+  VERIFY_IS_APPROX(tridiag.diagonal(), T.diagonal());
+  VERIFY_IS_APPROX(tridiag.subDiagonal(), T.template diagonal<1>());
+  VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView<Lower>()), tridiag.matrixQ() * tridiag.matrixT().eval() * MatrixType(tridiag.matrixQ()).adjoint());
+  VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView<Lower>()), tridiag.matrixQ() * tridiag.matrixT() * tridiag.matrixQ().adjoint());
+  
+  // Test computation of eigenvalues from tridiagonal matrix
+  if(rows > 1)
+  {
+    SelfAdjointEigenSolver<MatrixType> eiSymmTridiag;
+    eiSymmTridiag.computeFromTridiagonal(tridiag.matrixT().diagonal(), tridiag.matrixT().diagonal(-1), ComputeEigenvectors);
+    VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiSymmTridiag.eigenvalues());
+    VERIFY_IS_APPROX(tridiag.matrixT(), eiSymmTridiag.eigenvectors().real() * eiSymmTridiag.eigenvalues().asDiagonal() * eiSymmTridiag.eigenvectors().real().transpose());
+  }
+
+  if (rows > 1 && rows < 20)
+  {
+    // Test matrix with NaN
+    symmC(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
+    SelfAdjointEigenSolver<MatrixType> eiSymmNaN(symmC);
+    VERIFY_IS_EQUAL(eiSymmNaN.info(), NoConvergence);
+  }
+
+  // regression test for bug 1098
+  {
+    SelfAdjointEigenSolver<MatrixType> eig(a.adjoint() * a);
+    eig.compute(a.adjoint() * a);
+  }
+
+  // regression test for bug 478
+  {
+    a.setZero();
+    SelfAdjointEigenSolver<MatrixType> ei3(a);
+    VERIFY_IS_EQUAL(ei3.info(), Success);
+    VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
+    VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
+  }
+}
+
+template<int>
+void bug_854()
+{
+  Matrix3d m;
+  m << 850.961, 51.966, 0,
+       51.966, 254.841, 0,
+            0,       0, 0;
+  selfadjointeigensolver_essential_check(m);
+}
+
+template<int>
+void bug_1014()
+{
+  Matrix3d m;
+  m <<        0.11111111111111114658, 0, 0,
+       0,     0.11111111111111109107, 0,
+       0, 0,  0.11111111111111107719;
+  selfadjointeigensolver_essential_check(m);
+}
+
+template<int>
+void bug_1225()
+{
+  Matrix3d m1, m2;
+  m1.setRandom();
+  m1 = m1*m1.transpose();
+  m2 = m1.triangularView<Upper>();
+  SelfAdjointEigenSolver<Matrix3d> eig1(m1);
+  SelfAdjointEigenSolver<Matrix3d> eig2(m2.selfadjointView<Upper>());
+  VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
+}
+
+template<int>
+void bug_1204()
+{
+  SparseMatrix<double> A(2,2);
+  A.setIdentity();
+  SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
+}
+
+EIGEN_DECLARE_TEST(eigensolver_selfadjoint)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat; i++) {
+
+    // trivial test for 1x1 matrices:
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<float, 1, 1>()));
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<double, 1, 1>()));
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<std::complex<double>, 1, 1>()));
+
+    // very important to test 3x3 and 2x2 matrices since we provide special paths for them
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) );
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) );
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2cd()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3cd()) );
+    CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) );
+    CALL_SUBTEST_2( selfadjointeigensolver(Matrix4cd()) );
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) );
+    CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(s,s)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(s,s)) );
+    CALL_SUBTEST_9( selfadjointeigensolver(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+
+    // some trivial but implementation-wise tricky cases
+    CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) );
+    CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(2,2)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(1,1)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(2,2)) );
+    CALL_SUBTEST_6( selfadjointeigensolver(Matrix<double,1,1>()) );
+    CALL_SUBTEST_7( selfadjointeigensolver(Matrix<double,2,2>()) );
+  }
+  
+  CALL_SUBTEST_13( bug_854<0>() );
+  CALL_SUBTEST_13( bug_1014<0>() );
+  CALL_SUBTEST_13( bug_1204<0>() );
+  CALL_SUBTEST_13( bug_1225<0>() );
+
+  // Test problem size constructors
+  s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+  CALL_SUBTEST_8(SelfAdjointEigenSolver<MatrixXf> tmp1(s));
+  CALL_SUBTEST_8(Tridiagonalization<MatrixXf> tmp2(s));
+  
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
+}
+

diff --git a/test/evaluator_common.h b/test/evaluator_common.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/evaluator_common.h


diff --git a/test/evaluators.cpp b/test/evaluators.cpp
new file mode 100644
index 0000000..2810cd2
--- /dev/null
+++ b/test/evaluators.cpp

@@ -0,0 +1,525 @@
+
+#include "main.h"
+
+namespace Eigen {
+
+  template<typename Lhs,typename Rhs>
+  const Product<Lhs,Rhs>
+  prod(const Lhs& lhs, const Rhs& rhs)
+  {
+    return Product<Lhs,Rhs>(lhs,rhs);
+  }
+
+  template<typename Lhs,typename Rhs>
+  const Product<Lhs,Rhs,LazyProduct>
+  lazyprod(const Lhs& lhs, const Rhs& rhs)
+  {
+    return Product<Lhs,Rhs,LazyProduct>(lhs,rhs);
+  }
+  
+  template<typename DstXprType, typename SrcXprType>
+  EIGEN_STRONG_INLINE
+  DstXprType& copy_using_evaluator(const EigenBase<DstXprType> &dst, const SrcXprType &src)
+  {
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+    return dst.const_cast_derived();
+  }
+  
+  template<typename DstXprType, template <typename> class StorageBase, typename SrcXprType>
+  EIGEN_STRONG_INLINE
+  const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst, const SrcXprType &src)
+  {
+    call_assignment(dst, src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+    return dst.expression();
+  }
+  
+  template<typename DstXprType, typename SrcXprType>
+  EIGEN_STRONG_INLINE
+  DstXprType& copy_using_evaluator(const PlainObjectBase<DstXprType> &dst, const SrcXprType &src)
+  {
+    #ifdef EIGEN_NO_AUTOMATIC_RESIZING
+    eigen_assert((dst.size()==0 || (IsVectorAtCompileTime ? (dst.size() == src.size())
+                                                          : (dst.rows() == src.rows() && dst.cols() == src.cols())))
+                && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+  #else
+    dst.const_cast_derived().resizeLike(src.derived());
+  #endif
+    
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+    return dst.const_cast_derived();
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void add_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::add_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void subtract_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::sub_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void multiply_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::mul_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+
+  template<typename DstXprType, typename SrcXprType>
+  void divide_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::div_assign_op<Scalar,typename SrcXprType::Scalar>());
+  }
+  
+  template<typename DstXprType, typename SrcXprType>
+  void swap_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+  {
+    typedef typename DstXprType::Scalar Scalar;
+    call_assignment(dst.const_cast_derived(), src.const_cast_derived(), internal::swap_assign_op<Scalar>());
+  }
+
+  namespace internal {
+    template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+    EIGEN_DEVICE_FUNC void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+    {
+      call_assignment_no_alias(dst.expression(), src, func);
+    }
+
+    template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+    EIGEN_DEVICE_FUNC void call_restricted_packet_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+    {
+      call_restricted_packet_assignment_no_alias(dst.expression(), src, func);
+    }
+  }
+  
+}
+
+template<typename XprType> long get_cost(const XprType& ) { return Eigen::internal::evaluator<XprType>::CoeffReadCost; }
+
+using namespace std;
+
+#define VERIFY_IS_APPROX_EVALUATOR(DEST,EXPR) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (EXPR).eval());
+#define VERIFY_IS_APPROX_EVALUATOR2(DEST,EXPR,REF) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (REF).eval());
+
+EIGEN_DECLARE_TEST(evaluators)
+{
+  // Testing Matrix evaluator and Transpose
+  Vector2d v = Vector2d::Random();
+  const Vector2d v_const(v);
+  Vector2d v2;
+  RowVector2d w;
+
+  VERIFY_IS_APPROX_EVALUATOR(v2, v);
+  VERIFY_IS_APPROX_EVALUATOR(v2, v_const);
+
+  // Testing Transpose
+  VERIFY_IS_APPROX_EVALUATOR(w, v.transpose()); // Transpose as rvalue
+  VERIFY_IS_APPROX_EVALUATOR(w, v_const.transpose());
+
+  copy_using_evaluator(w.transpose(), v); // Transpose as lvalue
+  VERIFY_IS_APPROX(w,v.transpose().eval());
+
+  copy_using_evaluator(w.transpose(), v_const);
+  VERIFY_IS_APPROX(w,v_const.transpose().eval());
+
+  // Testing Array evaluator
+  {
+    ArrayXXf a(2,3);
+    ArrayXXf b(3,2);
+    a << 1,2,3, 4,5,6;
+    const ArrayXXf a_const(a);
+
+    VERIFY_IS_APPROX_EVALUATOR(b, a.transpose());
+
+    VERIFY_IS_APPROX_EVALUATOR(b, a_const.transpose());
+
+    // Testing CwiseNullaryOp evaluator
+    copy_using_evaluator(w, RowVector2d::Random());
+    VERIFY((w.array() >= -1).all() && (w.array() <= 1).all()); // not easy to test ...
+
+    VERIFY_IS_APPROX_EVALUATOR(w, RowVector2d::Zero());
+
+    VERIFY_IS_APPROX_EVALUATOR(w, RowVector2d::Constant(3));
+    
+    // mix CwiseNullaryOp and transpose
+    VERIFY_IS_APPROX_EVALUATOR(w, Vector2d::Zero().transpose());
+  }
+
+  {
+    // test product expressions
+    int s = internal::random<int>(1,100);
+    MatrixXf a(s,s), b(s,s), c(s,s), d(s,s);
+    a.setRandom();
+    b.setRandom();
+    c.setRandom();
+    d.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR(d, (a + b));
+    VERIFY_IS_APPROX_EVALUATOR(d, (a + b).transpose());
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b), a*b);
+    VERIFY_IS_APPROX_EVALUATOR2(d.noalias(), prod(a,b), a*b);
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b) + c, a*b + c);
+    VERIFY_IS_APPROX_EVALUATOR2(d, s * prod(a,b), s * a*b);
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b).transpose(), (a*b).transpose());
+    VERIFY_IS_APPROX_EVALUATOR2(d, prod(a,b) + prod(b,c), a*b + b*c);
+
+    // check that prod works even with aliasing present
+    c = a*a;
+    copy_using_evaluator(a, prod(a,a));
+    VERIFY_IS_APPROX(a,c);
+
+    // check compound assignment of products
+    d = c;
+    add_assign_using_evaluator(c.noalias(), prod(a,b));
+    d.noalias() += a*b;
+    VERIFY_IS_APPROX(c, d);
+
+    d = c;
+    subtract_assign_using_evaluator(c.noalias(), prod(a,b));
+    d.noalias() -= a*b;
+    VERIFY_IS_APPROX(c, d);
+  }
+
+  {
+    // test product with all possible sizes
+    int s = internal::random<int>(1,100);
+    Matrix<float,      1,      1> m11, res11;  m11.setRandom(1,1);
+    Matrix<float,      1,      4> m14, res14;  m14.setRandom(1,4);
+    Matrix<float,      1,Dynamic> m1X, res1X;  m1X.setRandom(1,s);
+    Matrix<float,      4,      1> m41, res41;  m41.setRandom(4,1);
+    Matrix<float,      4,      4> m44, res44;  m44.setRandom(4,4);
+    Matrix<float,      4,Dynamic> m4X, res4X;  m4X.setRandom(4,s);
+    Matrix<float,Dynamic,      1> mX1, resX1;  mX1.setRandom(s,1);
+    Matrix<float,Dynamic,      4> mX4, resX4;  mX4.setRandom(s,4);
+    Matrix<float,Dynamic,Dynamic> mXX, resXX;  mXX.setRandom(s,s);
+
+    VERIFY_IS_APPROX_EVALUATOR2(res11, prod(m11,m11), m11*m11);
+    VERIFY_IS_APPROX_EVALUATOR2(res11, prod(m14,m41), m14*m41);
+    VERIFY_IS_APPROX_EVALUATOR2(res11, prod(m1X,mX1), m1X*mX1);
+    VERIFY_IS_APPROX_EVALUATOR2(res14, prod(m11,m14), m11*m14);
+    VERIFY_IS_APPROX_EVALUATOR2(res14, prod(m14,m44), m14*m44);
+    VERIFY_IS_APPROX_EVALUATOR2(res14, prod(m1X,mX4), m1X*mX4);
+    VERIFY_IS_APPROX_EVALUATOR2(res1X, prod(m11,m1X), m11*m1X);
+    VERIFY_IS_APPROX_EVALUATOR2(res1X, prod(m14,m4X), m14*m4X);
+    VERIFY_IS_APPROX_EVALUATOR2(res1X, prod(m1X,mXX), m1X*mXX);
+    VERIFY_IS_APPROX_EVALUATOR2(res41, prod(m41,m11), m41*m11);
+    VERIFY_IS_APPROX_EVALUATOR2(res41, prod(m44,m41), m44*m41);
+    VERIFY_IS_APPROX_EVALUATOR2(res41, prod(m4X,mX1), m4X*mX1);
+    VERIFY_IS_APPROX_EVALUATOR2(res44, prod(m41,m14), m41*m14);
+    VERIFY_IS_APPROX_EVALUATOR2(res44, prod(m44,m44), m44*m44);
+    VERIFY_IS_APPROX_EVALUATOR2(res44, prod(m4X,mX4), m4X*mX4);
+    VERIFY_IS_APPROX_EVALUATOR2(res4X, prod(m41,m1X), m41*m1X);
+    VERIFY_IS_APPROX_EVALUATOR2(res4X, prod(m44,m4X), m44*m4X);
+    VERIFY_IS_APPROX_EVALUATOR2(res4X, prod(m4X,mXX), m4X*mXX);
+    VERIFY_IS_APPROX_EVALUATOR2(resX1, prod(mX1,m11), mX1*m11);
+    VERIFY_IS_APPROX_EVALUATOR2(resX1, prod(mX4,m41), mX4*m41);
+    VERIFY_IS_APPROX_EVALUATOR2(resX1, prod(mXX,mX1), mXX*mX1);
+    VERIFY_IS_APPROX_EVALUATOR2(resX4, prod(mX1,m14), mX1*m14);
+    VERIFY_IS_APPROX_EVALUATOR2(resX4, prod(mX4,m44), mX4*m44);
+    VERIFY_IS_APPROX_EVALUATOR2(resX4, prod(mXX,mX4), mXX*mX4);
+    VERIFY_IS_APPROX_EVALUATOR2(resXX, prod(mX1,m1X), mX1*m1X);
+    VERIFY_IS_APPROX_EVALUATOR2(resXX, prod(mX4,m4X), mX4*m4X);
+    VERIFY_IS_APPROX_EVALUATOR2(resXX, prod(mXX,mXX), mXX*mXX);
+  }
+
+  {
+    ArrayXXf a(2,3);
+    ArrayXXf b(3,2);
+    a << 1,2,3, 4,5,6;
+    const ArrayXXf a_const(a);
+    
+    // this does not work because Random is eval-before-nested: 
+    // copy_using_evaluator(w, Vector2d::Random().transpose());
+
+    // test CwiseUnaryOp
+    VERIFY_IS_APPROX_EVALUATOR(v2, 3 * v);
+    VERIFY_IS_APPROX_EVALUATOR(w, (3 * v).transpose());
+    VERIFY_IS_APPROX_EVALUATOR(b, (a + 3).transpose());
+    VERIFY_IS_APPROX_EVALUATOR(b, (2 * a_const + 3).transpose());
+
+    // test CwiseBinaryOp
+    VERIFY_IS_APPROX_EVALUATOR(v2, v + Vector2d::Ones());
+    VERIFY_IS_APPROX_EVALUATOR(w, (v + Vector2d::Ones()).transpose().cwiseProduct(RowVector2d::Constant(3)));
+
+    // dynamic matrices and arrays
+    MatrixXd mat1(6,6), mat2(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(mat1, MatrixXd::Identity(6,6));
+    VERIFY_IS_APPROX_EVALUATOR(mat2, mat1);
+    copy_using_evaluator(mat2.transpose(), mat1);
+    VERIFY_IS_APPROX(mat2.transpose(), mat1);
+
+    ArrayXXd arr1(6,6), arr2(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(arr1, ArrayXXd::Constant(6,6, 3.0));
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1);
+    
+    // test automatic resizing
+    mat2.resize(3,3);
+    VERIFY_IS_APPROX_EVALUATOR(mat2, mat1);
+    arr2.resize(9,9);
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1);
+
+    // test direct traversal
+    Matrix3f m3;
+    Array33f a3;
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Identity());  // matrix, nullary
+    // TODO: find a way to test direct traversal with array
+    VERIFY_IS_APPROX_EVALUATOR(m3.transpose(), Matrix3f::Identity().transpose());  // transpose
+    VERIFY_IS_APPROX_EVALUATOR(m3, 2 * Matrix3f::Identity());  // unary
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Identity() + Matrix3f::Zero());  // binary
+    VERIFY_IS_APPROX_EVALUATOR(m3.block(0,0,2,2), Matrix3f::Identity().block(1,1,2,2));  // block
+
+    // test linear traversal
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Zero());  // matrix, nullary
+    VERIFY_IS_APPROX_EVALUATOR(a3, Array33f::Zero());  // array
+    VERIFY_IS_APPROX_EVALUATOR(m3.transpose(), Matrix3f::Zero().transpose());  // transpose
+    VERIFY_IS_APPROX_EVALUATOR(m3, 2 * Matrix3f::Zero());  // unary
+    VERIFY_IS_APPROX_EVALUATOR(m3, Matrix3f::Zero() + m3);  // binary  
+
+    // test inner vectorization
+    Matrix4f m4, m4src = Matrix4f::Random();
+    Array44f a4, a4src = Matrix4f::Random();
+    VERIFY_IS_APPROX_EVALUATOR(m4, m4src);  // matrix
+    VERIFY_IS_APPROX_EVALUATOR(a4, a4src);  // array
+    VERIFY_IS_APPROX_EVALUATOR(m4.transpose(), m4src.transpose());  // transpose
+    // TODO: find out why Matrix4f::Zero() does not allow inner vectorization
+    VERIFY_IS_APPROX_EVALUATOR(m4, 2 * m4src);  // unary
+    VERIFY_IS_APPROX_EVALUATOR(m4, m4src + m4src);  // binary
+
+    // test linear vectorization
+    MatrixXf mX(6,6), mXsrc = MatrixXf::Random(6,6);
+    ArrayXXf aX(6,6), aXsrc = ArrayXXf::Random(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(mX, mXsrc);  // matrix
+    VERIFY_IS_APPROX_EVALUATOR(aX, aXsrc);  // array
+    VERIFY_IS_APPROX_EVALUATOR(mX.transpose(), mXsrc.transpose());  // transpose
+    VERIFY_IS_APPROX_EVALUATOR(mX, MatrixXf::Zero(6,6));  // nullary
+    VERIFY_IS_APPROX_EVALUATOR(mX, 2 * mXsrc);  // unary
+    VERIFY_IS_APPROX_EVALUATOR(mX, mXsrc + mXsrc);  // binary
+
+    // test blocks and slice vectorization
+    VERIFY_IS_APPROX_EVALUATOR(m4, (mXsrc.block<4,4>(1,0)));
+    VERIFY_IS_APPROX_EVALUATOR(aX, ArrayXXf::Constant(10, 10, 3.0).block(2, 3, 6, 6));
+
+    Matrix4f m4ref = m4;
+    copy_using_evaluator(m4.block(1, 1, 2, 3), m3.bottomRows(2));
+    m4ref.block(1, 1, 2, 3) = m3.bottomRows(2);
+    VERIFY_IS_APPROX(m4, m4ref);
+
+    mX.setIdentity(20,20);
+    MatrixXf mXref = MatrixXf::Identity(20,20);
+    mXsrc = MatrixXf::Random(9,12);
+    copy_using_evaluator(mX.block(4, 4, 9, 12), mXsrc);
+    mXref.block(4, 4, 9, 12) = mXsrc;
+    VERIFY_IS_APPROX(mX, mXref);
+
+    // test Map
+    const float raw[3] = {1,2,3};
+    float buffer[3] = {0,0,0};
+    Vector3f v3;
+    Array3f a3f;
+    VERIFY_IS_APPROX_EVALUATOR(v3, Map<const Vector3f>(raw));
+    VERIFY_IS_APPROX_EVALUATOR(a3f, Map<const Array3f>(raw));
+    Vector3f::Map(buffer) = 2*v3;
+    VERIFY(buffer[0] == 2);
+    VERIFY(buffer[1] == 4);
+    VERIFY(buffer[2] == 6);
+
+    // test CwiseUnaryView
+    mat1.setRandom();
+    mat2.setIdentity();
+    MatrixXcd matXcd(6,6), matXcd_ref(6,6);
+    copy_using_evaluator(matXcd.real(), mat1);
+    copy_using_evaluator(matXcd.imag(), mat2);
+    matXcd_ref.real() = mat1;
+    matXcd_ref.imag() = mat2;
+    VERIFY_IS_APPROX(matXcd, matXcd_ref);
+
+    // test Select
+    VERIFY_IS_APPROX_EVALUATOR(aX, (aXsrc > 0).select(aXsrc, -aXsrc));
+
+    // test Replicate
+    mXsrc = MatrixXf::Random(6, 6);
+    VectorXf vX = VectorXf::Random(6);
+    mX.resize(6, 6);
+    VERIFY_IS_APPROX_EVALUATOR(mX, mXsrc.colwise() + vX);
+    matXcd.resize(12, 12);
+    VERIFY_IS_APPROX_EVALUATOR(matXcd, matXcd_ref.replicate(2,2));
+    VERIFY_IS_APPROX_EVALUATOR(matXcd, (matXcd_ref.replicate<2,2>()));
+
+    // test partial reductions
+    VectorXd vec1(6);
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.rowwise().sum());
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.colwise().sum().transpose());
+
+    // test MatrixWrapper and ArrayWrapper
+    mat1.setRandom(6,6);
+    arr1.setRandom(6,6);
+    VERIFY_IS_APPROX_EVALUATOR(mat2, arr1.matrix());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, mat1.array());
+    VERIFY_IS_APPROX_EVALUATOR(mat2, (arr1 + 2).matrix());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, mat1.array() + 2);
+    mat2.array() = arr1 * arr1;
+    VERIFY_IS_APPROX(mat2, (arr1 * arr1).matrix());
+    arr2.matrix() = MatrixXd::Identity(6,6);
+    VERIFY_IS_APPROX(arr2, MatrixXd::Identity(6,6).array());
+
+    // test Reverse
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1.reverse());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1.colwise().reverse());
+    VERIFY_IS_APPROX_EVALUATOR(arr2, arr1.rowwise().reverse());
+    arr2.reverse() = arr1;
+    VERIFY_IS_APPROX(arr2, arr1.reverse());
+    mat2.array() = mat1.array().reverse();
+    VERIFY_IS_APPROX(mat2.array(), mat1.array().reverse());
+
+    // test Diagonal
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.diagonal());
+    vec1.resize(5);
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.diagonal(1));
+    VERIFY_IS_APPROX_EVALUATOR(vec1, mat1.diagonal<-1>());
+    vec1.setRandom();
+
+    mat2 = mat1;
+    copy_using_evaluator(mat1.diagonal(1), vec1);
+    mat2.diagonal(1) = vec1;
+    VERIFY_IS_APPROX(mat1, mat2);
+
+    copy_using_evaluator(mat1.diagonal<-1>(), mat1.diagonal(1));
+    mat2.diagonal<-1>() = mat2.diagonal(1);
+    VERIFY_IS_APPROX(mat1, mat2);
+  }
+  
+  {
+    // test swapping
+    MatrixXd mat1, mat2, mat1ref, mat2ref;
+    mat1ref = mat1 = MatrixXd::Random(6, 6);
+    mat2ref = mat2 = 2 * mat1 + MatrixXd::Identity(6, 6);
+    swap_using_evaluator(mat1, mat2);
+    mat1ref.swap(mat2ref);
+    VERIFY_IS_APPROX(mat1, mat1ref);
+    VERIFY_IS_APPROX(mat2, mat2ref);
+
+    swap_using_evaluator(mat1.block(0, 0, 3, 3), mat2.block(3, 3, 3, 3));
+    mat1ref.block(0, 0, 3, 3).swap(mat2ref.block(3, 3, 3, 3));
+    VERIFY_IS_APPROX(mat1, mat1ref);
+    VERIFY_IS_APPROX(mat2, mat2ref);
+
+    swap_using_evaluator(mat1.row(2), mat2.col(3).transpose());
+    mat1.row(2).swap(mat2.col(3).transpose());
+    VERIFY_IS_APPROX(mat1, mat1ref);
+    VERIFY_IS_APPROX(mat2, mat2ref);
+  }
+
+  {
+    // test compound assignment
+    const Matrix4d mat_const = Matrix4d::Random(); 
+    Matrix4d mat, mat_ref;
+    mat = mat_ref = Matrix4d::Identity();
+    add_assign_using_evaluator(mat, mat_const);
+    mat_ref += mat_const;
+    VERIFY_IS_APPROX(mat, mat_ref);
+
+    subtract_assign_using_evaluator(mat.row(1), 2*mat.row(2));
+    mat_ref.row(1) -= 2*mat_ref.row(2);
+    VERIFY_IS_APPROX(mat, mat_ref);
+
+    const ArrayXXf arr_const = ArrayXXf::Random(5,3); 
+    ArrayXXf arr, arr_ref;
+    arr = arr_ref = ArrayXXf::Constant(5, 3, 0.5);
+    multiply_assign_using_evaluator(arr, arr_const);
+    arr_ref *= arr_const;
+    VERIFY_IS_APPROX(arr, arr_ref);
+
+    divide_assign_using_evaluator(arr.row(1), arr.row(2) + 1);
+    arr_ref.row(1) /= (arr_ref.row(2) + 1);
+    VERIFY_IS_APPROX(arr, arr_ref);
+  }
+  
+  {
+    // test triangular shapes
+    MatrixXd A = MatrixXd::Random(6,6), B(6,6), C(6,6), D(6,6);
+    A.setRandom();B.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR2(B, A.triangularView<Upper>(), MatrixXd(A.triangularView<Upper>()));
+    
+    A.setRandom();B.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR2(B, A.triangularView<UnitLower>(), MatrixXd(A.triangularView<UnitLower>()));
+    
+    A.setRandom();B.setRandom();
+    VERIFY_IS_APPROX_EVALUATOR2(B, A.triangularView<UnitUpper>(), MatrixXd(A.triangularView<UnitUpper>()));
+    
+    A.setRandom();B.setRandom();
+    C = B; C.triangularView<Upper>() = A;
+    copy_using_evaluator(B.triangularView<Upper>(), A);
+    VERIFY(B.isApprox(C) && "copy_using_evaluator(B.triangularView<Upper>(), A)");
+    
+    A.setRandom();B.setRandom();
+    C = B; C.triangularView<Lower>() = A.triangularView<Lower>();
+    copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Lower>());
+    VERIFY(B.isApprox(C) && "copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Lower>())");
+    
+    
+    A.setRandom();B.setRandom();
+    C = B; C.triangularView<Lower>() = A.triangularView<Upper>().transpose();
+    copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Upper>().transpose());
+    VERIFY(B.isApprox(C) && "copy_using_evaluator(B.triangularView<Lower>(), A.triangularView<Lower>().transpose())");
+    
+    
+    A.setRandom();B.setRandom(); C = B; D = A;
+    C.triangularView<Upper>().swap(D.triangularView<Upper>());
+    swap_using_evaluator(B.triangularView<Upper>(), A.triangularView<Upper>());
+    VERIFY(B.isApprox(C) && "swap_using_evaluator(B.triangularView<Upper>(), A.triangularView<Upper>())");
+    
+    
+    VERIFY_IS_APPROX_EVALUATOR2(B, prod(A.triangularView<Upper>(),A), MatrixXd(A.triangularView<Upper>()*A));
+    
+    VERIFY_IS_APPROX_EVALUATOR2(B, prod(A.selfadjointView<Upper>(),A), MatrixXd(A.selfadjointView<Upper>()*A));
+  }
+
+  {
+    // test diagonal shapes
+    VectorXd d = VectorXd::Random(6);
+    MatrixXd A = MatrixXd::Random(6,6), B(6,6);
+    A.setRandom();B.setRandom();
+    
+    VERIFY_IS_APPROX_EVALUATOR2(B, lazyprod(d.asDiagonal(),A), MatrixXd(d.asDiagonal()*A));
+    VERIFY_IS_APPROX_EVALUATOR2(B, lazyprod(A,d.asDiagonal()), MatrixXd(A*d.asDiagonal()));
+  }
+
+  {
+    // test CoeffReadCost
+    Matrix4d a, b;
+    VERIFY_IS_EQUAL( get_cost(a), 1 );
+    VERIFY_IS_EQUAL( get_cost(a+b), 3);
+    VERIFY_IS_EQUAL( get_cost(2*a+b), 4);
+    VERIFY_IS_EQUAL( get_cost(a*b), 1);
+    VERIFY_IS_EQUAL( get_cost(a.lazyProduct(b)), 15);
+    VERIFY_IS_EQUAL( get_cost(a*(a*b)), 1);
+    VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a*b)), 15);
+    VERIFY_IS_EQUAL( get_cost(a*(a+b)), 1);
+    VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a+b)), 15);
+  }
+
+  // regression test for PR 544 and bug 1622 (introduced in #71609c4)
+  {
+    // test restricted_packet_assignment with an unaligned destination
+    const size_t M = 2;
+    const size_t K = 2;
+    const size_t N = 5;
+    float *destMem = new float[(M*N) + 1];
+    float *dest = (internal::UIntPtr(destMem)%EIGEN_MAX_ALIGN_BYTES) == 0 ? destMem+1 : destMem;
+
+    const Matrix<float, Dynamic, Dynamic, RowMajor> a = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(M, K);
+    const Matrix<float, Dynamic, Dynamic, RowMajor> b = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(K, N);
+    
+    Map<Matrix<float, Dynamic, Dynamic, RowMajor> > z(dest, M, N);;
+    Product<Matrix<float, Dynamic, Dynamic, RowMajor>, Matrix<float, Dynamic, Dynamic, RowMajor>, LazyProduct> tmp(a,b);
+    internal::call_restricted_packet_assignment(z.noalias(), tmp.derived(), internal::assign_op<float, float>());
+    
+    VERIFY_IS_APPROX(z, a*b);
+    delete[] destMem;
+  }
+}

diff --git a/test/exceptions.cpp b/test/exceptions.cpp
new file mode 100644
index 0000000..3d93060
--- /dev/null
+++ b/test/exceptions.cpp

@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+// Various sanity tests with exceptions and non trivially copyable scalar type.
+//  - no memory leak when a custom scalar type trow an exceptions
+//  - todo: complete the list of tests!
+
+#define EIGEN_STACK_ALLOCATION_LIMIT 100000000
+
+#include "main.h"
+#include "AnnoyingScalar.h"
+
+#define CHECK_MEMLEAK(OP) {                                 \
+    AnnoyingScalar::countdown = 100;                        \
+    int before = AnnoyingScalar::instances;                 \
+    bool exception_thrown = false;                          \
+    try { OP; }                                             \
+    catch (my_exception) {                                  \
+      exception_thrown = true;                              \
+      VERIFY(AnnoyingScalar::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
+    } \
+    VERIFY( (AnnoyingScalar::dont_throw) || (exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)) ); \
+  }
+
+EIGEN_DECLARE_TEST(exceptions)
+{
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
+  
+  {
+    AnnoyingScalar::dont_throw = false;
+    int n = 50;
+    VectorType v0(n), v1(n);
+    MatrixType m0(n,n), m1(n,n), m2(n,n);
+    v0.setOnes(); v1.setOnes();
+    m0.setOnes(); m1.setOnes(); m2.setOnes();
+    CHECK_MEMLEAK(v0 = m0 * m1 * v1);
+    CHECK_MEMLEAK(m2 = m0 * m1 * m2);
+    CHECK_MEMLEAK((v0+v1).dot(v0+v1));
+  }
+  VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP));
+}

diff --git a/test/fastmath.cpp b/test/fastmath.cpp
new file mode 100644
index 0000000..00a1a59
--- /dev/null
+++ b/test/fastmath.cpp

@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+void check(bool b, bool ref)
+{
+  std::cout << b;
+  if(b==ref)
+    std::cout << " OK  ";
+  else
+    std::cout << " BAD ";
+}
+
+#if EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1800
+namespace std {
+  template<typename T> bool (isfinite)(T x) { return _finite(x); }
+  template<typename T> bool (isnan)(T x) { return _isnan(x); }
+  template<typename T> bool (isinf)(T x) { return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF; }
+}
+#endif
+
+template<typename T>
+void check_inf_nan(bool dryrun) {
+  Matrix<T,Dynamic,1> m(10);
+  m.setRandom();
+  m(3) = std::numeric_limits<T>::quiet_NaN();
+
+  if(dryrun)
+  {
+    std::cout << "std::isfinite(" << m(3) << ") = "; check((std::isfinite)(m(3)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(3)), false); std::cout << "\n";
+    std::cout << "std::isinf(" << m(3) << ")    = "; check((std::isinf)(m(3)),false);    std::cout << "  ; numext::isinf    = "; check((numext::isinf)(m(3)), false); std::cout << "\n";
+    std::cout << "std::isnan(" << m(3) << ")    = "; check((std::isnan)(m(3)),true);     std::cout << "  ; numext::isnan    = "; check((numext::isnan)(m(3)), true); std::cout << "\n";
+    std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n";
+    std::cout << "hasNaN:    "; check(m.hasNaN(), 1);    std::cout << "\n";
+    std::cout << "\n";
+  }
+  else
+  {
+    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !(numext::isfinite)(m(3)) ); g_test_level=0;
+    if( (std::isinf)   (m(3))) g_test_level=1;  VERIFY( !(numext::isinf)(m(3)) );    g_test_level=0;
+    if(!(std::isnan)   (m(3))) g_test_level=1;  VERIFY(  (numext::isnan)(m(3)) );    g_test_level=0;
+    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !m.allFinite() );            g_test_level=0;
+    if(!(std::isnan)   (m(3))) g_test_level=1;  VERIFY(  m.hasNaN() );               g_test_level=0;
+  }
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
+  if(dryrun)
+  {
+    std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
+    std::cout << "std::isinf(" << m(4) << ")    = "; check((std::isinf)(m(4)),true);     std::cout << "  ; numext::isinf    = "; check((numext::isinf)(m(4)), true); std::cout << "\n";
+    std::cout << "std::isnan(" << m(4) << ")    = "; check((std::isnan)(m(4)),false);    std::cout << "  ; numext::isnan    = "; check((numext::isnan)(m(4)), false); std::cout << "\n";
+    std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n";
+    std::cout << "hasNaN:    "; check(m.hasNaN(), 1);    std::cout << "\n";
+    std::cout << "\n";
+  }
+  else
+  {
+    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !(numext::isfinite)(m(4)) );  g_test_level=0;
+    if(!(std::isinf)   (m(3))) g_test_level=1;  VERIFY(  (numext::isinf)(m(4)) );     g_test_level=0;
+    if( (std::isnan)   (m(3))) g_test_level=1;  VERIFY( !(numext::isnan)(m(4)) );     g_test_level=0;
+    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !m.allFinite() );             g_test_level=0;
+    if(!(std::isnan)   (m(3))) g_test_level=1;  VERIFY(  m.hasNaN() );                g_test_level=0;
+  }
+  m(3) = 0;
+  if(dryrun)
+  {
+    std::cout << "std::isfinite(" << m(3) << ") = "; check((std::isfinite)(m(3)),true); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(3)), true); std::cout << "\n";
+    std::cout << "std::isinf(" << m(3) << ")    = "; check((std::isinf)(m(3)),false);   std::cout << "  ; numext::isinf    = "; check((numext::isinf)(m(3)), false); std::cout << "\n";
+    std::cout << "std::isnan(" << m(3) << ")    = "; check((std::isnan)(m(3)),false);   std::cout << "  ; numext::isnan    = "; check((numext::isnan)(m(3)), false); std::cout << "\n";
+    std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n";
+    std::cout << "hasNaN:    "; check(m.hasNaN(), 0);    std::cout << "\n";
+    std::cout << "\n\n";
+  }
+  else
+  {
+    if(!(std::isfinite)(m(3))) g_test_level=1;  VERIFY(  (numext::isfinite)(m(3)) );  g_test_level=0;
+    if( (std::isinf)   (m(3))) g_test_level=1;  VERIFY( !(numext::isinf)(m(3)) );     g_test_level=0;
+    if( (std::isnan)   (m(3))) g_test_level=1;  VERIFY( !(numext::isnan)(m(3)) );     g_test_level=0;
+    if( (std::isfinite)(m(3))) g_test_level=1;  VERIFY( !m.allFinite() );             g_test_level=0;
+    if( (std::isnan)   (m(3))) g_test_level=1;  VERIFY( !m.hasNaN() );                g_test_level=0;
+  }
+}
+
+EIGEN_DECLARE_TEST(fastmath) {
+  std::cout << "*** float *** \n\n"; check_inf_nan<float>(true);
+  std::cout << "*** double ***\n\n"; check_inf_nan<double>(true);
+  std::cout << "*** long double *** \n\n"; check_inf_nan<long double>(true);
+
+  check_inf_nan<float>(false);
+  check_inf_nan<double>(false);
+  check_inf_nan<long double>(false);
+}

diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp
new file mode 100644
index 0000000..ed99450
--- /dev/null
+++ b/test/first_aligned.cpp

@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename Scalar>
+void test_first_aligned_helper(Scalar *array, int size)
+{
+  const int packet_size = sizeof(Scalar) * internal::packet_traits<Scalar>::size;
+  VERIFY(((size_t(array) + sizeof(Scalar) * internal::first_default_aligned(array, size)) % packet_size) == 0);
+}
+
+template<typename Scalar>
+void test_none_aligned_helper(Scalar *array, int size)
+{
+  EIGEN_UNUSED_VARIABLE(array);
+  EIGEN_UNUSED_VARIABLE(size);
+  VERIFY(internal::packet_traits<Scalar>::size == 1 || internal::first_default_aligned(array, size) == size);
+}
+
+struct some_non_vectorizable_type { float x; };
+
+EIGEN_DECLARE_TEST(first_aligned)
+{
+  EIGEN_ALIGN16 float array_float[100];
+  test_first_aligned_helper(array_float, 50);
+  test_first_aligned_helper(array_float+1, 50);
+  test_first_aligned_helper(array_float+2, 50);
+  test_first_aligned_helper(array_float+3, 50);
+  test_first_aligned_helper(array_float+4, 50);
+  test_first_aligned_helper(array_float+5, 50);
+  
+  EIGEN_ALIGN16 double array_double[100];
+  test_first_aligned_helper(array_double, 50);
+  test_first_aligned_helper(array_double+1, 50);
+  test_first_aligned_helper(array_double+2, 50);
+  
+  double *array_double_plus_4_bytes = (double*)(internal::UIntPtr(array_double)+4);
+  test_none_aligned_helper(array_double_plus_4_bytes, 50);
+  test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
+  
+  some_non_vectorizable_type array_nonvec[100];
+  test_first_aligned_helper(array_nonvec, 100);
+  test_none_aligned_helper(array_nonvec, 100);
+}

diff --git a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp
new file mode 100644
index 0000000..7b1684f
--- /dev/null
+++ b/test/geo_alignedbox.cpp

@@ -0,0 +1,531 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+
+using namespace std;
+
+// NOTE the following workaround was needed on some 32 bits builds to kill extra precision of x87 registers.
+// It seems that it is not needed anymore, but let's keep it here, just in case...
+
+template<typename T> EIGEN_DONT_INLINE
+void kill_extra_precision(T& /* x */) {
+  // This one worked but triggered a warning:
+  /* eigen_assert((void*)(&x) != (void*)0); */
+  // An alternative could be:
+  /* volatile T tmp = x; */
+  /* x = tmp; */
+}
+
+
+template<typename BoxType> void alignedbox(const BoxType& box)
+{
+  /* this test covers the following files:
+     AlignedBox.h
+  */
+  typedef typename BoxType::Scalar Scalar;
+  typedef NumTraits<Scalar> ScalarTraits;
+  typedef typename ScalarTraits::Real RealScalar;
+  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
+
+  const Index dim = box.dim();
+
+  VectorType p0 = VectorType::Random(dim);
+  VectorType p1 = VectorType::Random(dim);
+  while( p1 == p0 ){
+      p1 =  VectorType::Random(dim); }
+  RealScalar s1 = internal::random<RealScalar>(0,1);
+
+  BoxType b0(dim);
+  BoxType b1(VectorType::Random(dim),VectorType::Random(dim));
+  BoxType b2;
+
+  kill_extra_precision(b1);
+  kill_extra_precision(p0);
+  kill_extra_precision(p1);
+
+  b0.extend(p0);
+  b0.extend(p1);
+  VERIFY(b0.contains(p0*s1+(Scalar(1)-s1)*p1));
+  VERIFY(b0.contains(b0.center()));
+  VERIFY_IS_APPROX(b0.center(),(p0+p1)/Scalar(2));
+
+  (b2 = b0).extend(b1);
+  VERIFY(b2.contains(b0));
+  VERIFY(b2.contains(b1));
+  VERIFY_IS_APPROX(b2.clamp(b0), b0);
+
+  // intersection
+  BoxType box1(VectorType::Random(dim));
+  box1.extend(VectorType::Random(dim));
+  BoxType box2(VectorType::Random(dim));
+  box2.extend(VectorType::Random(dim));
+
+  VERIFY(box1.intersects(box2) == !box1.intersection(box2).isEmpty());
+
+  // alignment -- make sure there is no memory alignment assertion
+  BoxType *bp0 = new BoxType(dim);
+  BoxType *bp1 = new BoxType(dim);
+  bp0->extend(*bp1);
+  delete bp0;
+  delete bp1;
+
+  // sampling
+  for( int i=0; i<10; ++i )
+  {
+      VectorType r = b0.sample();
+      VERIFY(b0.contains(r));
+  }
+
+}
+
+template<typename BoxType> void alignedboxTranslatable(const BoxType& box)
+{
+  typedef typename BoxType::Scalar Scalar;
+  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Isometry> IsometryTransform;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Affine> AffineTransform;
+
+  alignedbox(box);
+
+  const VectorType Ones = VectorType::Ones();
+  const VectorType UnitX = VectorType::UnitX();
+  const Index dim = box.dim();
+
+  // box((-1, -1, -1), (1, 1, 1))
+  BoxType a(-Ones, Ones);
+
+  VERIFY_IS_APPROX(a.sizes(), Ones * Scalar(2));
+
+  BoxType b = a;
+  VectorType translate = Ones;
+  translate[0] = Scalar(2);
+  b.translate(translate);
+  // translate by (2, 1, 1) -> box((1, 0, 0), (3, 2, 2))
+
+  VERIFY_IS_APPROX(b.sizes(), Ones * Scalar(2));
+  VERIFY_IS_APPROX((b.min)(), UnitX);
+  VERIFY_IS_APPROX((b.max)(), Ones * Scalar(2) + UnitX);
+
+  // Test transform
+
+  IsometryTransform tf = IsometryTransform::Identity();
+  tf.translation() = -translate;
+
+  BoxType c = b.transformed(tf);
+  // translate by (-2, -1, -1) -> box((-1, -1, -1), (1, 1, 1))
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), (a.min)());
+  VERIFY_IS_APPROX((c.max)(), (a.max)());
+
+  c.transform(tf);
+  // translate by (-2, -1, -1) -> box((-3, -2, -2), (-1, 0, 0))
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), Ones * Scalar(-2) - UnitX);
+  VERIFY_IS_APPROX((c.max)(), -UnitX);
+
+  // Scaling
+
+  AffineTransform atf = AffineTransform::Identity();
+  atf.scale(Scalar(3));
+  c.transform(atf);
+  // scale by 3 -> box((-9, -6, -6), (-3, 0, 0))
+  VERIFY_IS_APPROX(c.sizes(), Scalar(3) * a.sizes());
+  VERIFY_IS_APPROX((c.min)(), Ones * Scalar(-6) - UnitX * Scalar(3));
+  VERIFY_IS_APPROX((c.max)(), UnitX * Scalar(-3));
+
+  atf = AffineTransform::Identity();
+  atf.scale(Scalar(-3));
+  c.transform(atf);
+  // scale by -3 -> box((27, 18, 18), (9, 0, 0))
+  VERIFY_IS_APPROX(c.sizes(), Scalar(9) * a.sizes());
+  VERIFY_IS_APPROX((c.min)(), UnitX * Scalar(9));
+  VERIFY_IS_APPROX((c.max)(), Ones * Scalar(18) + UnitX * Scalar(9));
+
+  // Check identity transform within numerical precision.
+  BoxType transformedC = c.transformed(IsometryTransform::Identity());
+  VERIFY_IS_APPROX(transformedC, c);
+
+  for (size_t i = 0; i < 10; ++i)
+  {
+    VectorType minCorner;
+    VectorType maxCorner;
+    for (Index d = 0; d < dim; ++d)
+    {
+      minCorner[d] = internal::random<Scalar>(-10,10);
+      maxCorner[d] = minCorner[d] + internal::random<Scalar>(0, 10);
+    }
+
+    c = BoxType(minCorner, maxCorner);
+
+    translate = VectorType::Random();
+    c.translate(translate);
+
+    VERIFY_IS_APPROX((c.min)(), minCorner + translate);
+    VERIFY_IS_APPROX((c.max)(), maxCorner + translate);
+  }
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate2D(Scalar angle) {
+  return Rotation2D<Scalar>(angle);
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate2DIntegral(typename NumTraits<Scalar>::NonInteger angle) {
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  return Rotation2D<NonInteger>(angle).toRotationMatrix().
+      template cast<Scalar>();
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate3DZAxis(Scalar angle) {
+  return AngleAxis<Scalar>(angle, Matrix<Scalar, 3, 1>(0, 0, 1));
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate3DZAxisIntegral(typename NumTraits<Scalar>::NonInteger angle) {
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  return AngleAxis<NonInteger>(angle, Matrix<NonInteger, 3, 1>(0, 0, 1)).
+      toRotationMatrix().template cast<Scalar>();
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate4DZWAxis(Scalar angle) {
+  Rotation result = Matrix<Scalar, 4, 4>::Identity();
+  result.block(0, 0, 3, 3) = rotate3DZAxis<Scalar, AngleAxisd>(angle).toRotationMatrix();
+  return result;
+}
+
+template <typename MatrixType>
+MatrixType randomRotationMatrix()
+{
+  // algorithm from
+  // https://www.isprs-ann-photogramm-remote-sens-spatial-inf-sci.net/III-7/103/2016/isprs-annals-III-7-103-2016.pdf
+  const MatrixType rand = MatrixType::Random();
+  const MatrixType q = rand.householderQr().householderQ();
+  const JacobiSVD<MatrixType> svd = q.jacobiSvd(ComputeFullU | ComputeFullV);
+  const typename MatrixType::Scalar det = (svd.matrixU() * svd.matrixV().transpose()).determinant();
+  MatrixType diag = rand.Identity();
+  diag(MatrixType::RowsAtCompileTime - 1, MatrixType::ColsAtCompileTime - 1) = det;
+  const MatrixType rotation = svd.matrixU() * diag * svd.matrixV().transpose();
+  return rotation;
+}
+
+template <typename Scalar, int Dim>
+Matrix<Scalar, Dim, (1<<Dim)> boxGetCorners(const Matrix<Scalar, Dim, 1>& min_, const Matrix<Scalar, Dim, 1>& max_)
+{
+  Matrix<Scalar, Dim, (1<<Dim) > result;
+  for(Index i=0; i<(1<<Dim); ++i)
+  {
+    for(Index j=0; j<Dim; ++j)
+      result(j,i) = (i & (1<<j)) ? min_(j) : max_(j);
+  }
+  return result;
+}
+
+template<typename BoxType, typename Rotation> void alignedboxRotatable(
+    const BoxType& box,
+    Rotation (*rotate)(typename NumTraits<typename BoxType::Scalar>::NonInteger /*_angle*/))
+{
+  alignedboxTranslatable(box);
+
+  typedef typename BoxType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Isometry> IsometryTransform;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Affine> AffineTransform;
+
+  const VectorType Zero = VectorType::Zero();
+  const VectorType Ones = VectorType::Ones();
+  const VectorType UnitX = VectorType::UnitX();
+  const VectorType UnitY = VectorType::UnitY();
+  // this is vector (0, 0, -1, -1, -1, ...), i.e. with zeros at first and second dimensions
+  const VectorType UnitZ = Ones - UnitX - UnitY;
+
+  // in this kind of comments the 3D case values will be illustrated
+  // box((-1, -1, -1), (1, 1, 1))
+  BoxType a(-Ones, Ones);
+
+  // to allow templating this test for both 2D and 3D cases, we always set all
+  // but the first coordinate to the same value; so basically 3D case works as
+  // if you were looking at the scene from top
+
+  VectorType minPoint = -2 * Ones;
+  minPoint[0] = -3;
+  VectorType maxPoint = Zero;
+  maxPoint[0] = -1;
+  BoxType c(minPoint, maxPoint);
+  // box((-3, -2, -2), (-1, 0, 0))
+
+  IsometryTransform tf2 = IsometryTransform::Identity();
+  // for some weird reason the following statement has to be put separate from
+  // the following rotate call, otherwise precision problems arise...
+  Rotation rot = rotate(NonInteger(EIGEN_PI));
+  tf2.rotate(rot);
+
+  c.transform(tf2);
+  // rotate by 180 deg around origin -> box((1, 0, -2), (3, 2, 0))
+
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), UnitX - UnitZ * Scalar(2));
+  VERIFY_IS_APPROX((c.max)(), UnitX * Scalar(3) + UnitY * Scalar(2));
+
+  rot = rotate(NonInteger(EIGEN_PI / 2));
+  tf2.setIdentity();
+  tf2.rotate(rot);
+
+  c.transform(tf2);
+  // rotate by 90 deg around origin ->  box((-2, 1, -2), (0, 3, 0))
+
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), Ones * Scalar(-2) + UnitY * Scalar(3));
+  VERIFY_IS_APPROX((c.max)(), UnitY * Scalar(3));
+
+  // box((-1, -1, -1), (1, 1, 1))
+  AffineTransform atf = AffineTransform::Identity();
+  atf.linearExt()(0, 1) = Scalar(1);
+  c = BoxType(-Ones, Ones);
+  c.transform(atf);
+  // 45 deg shear in x direction -> box((-2, -1, -1), (2, 1, 1))
+
+  VERIFY_IS_APPROX(c.sizes(), Ones * Scalar(2) + UnitX * Scalar(2));
+  VERIFY_IS_APPROX((c.min)(), -Ones - UnitX);
+  VERIFY_IS_APPROX((c.max)(), Ones + UnitX);
+}
+
+template<typename BoxType, typename Rotation> void alignedboxNonIntegralRotatable(
+    const BoxType& box,
+    Rotation (*rotate)(typename NumTraits<typename BoxType::Scalar>::NonInteger /*_angle*/))
+{
+  alignedboxRotatable(box, rotate);
+
+  typedef typename BoxType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  enum { Dim = BoxType::AmbientDimAtCompileTime };
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+  typedef Matrix<Scalar, Dim, (1 << Dim)> CornersType;
+  typedef Transform<Scalar, Dim, Isometry> IsometryTransform;
+  typedef Transform<Scalar, Dim, Affine> AffineTransform;
+
+  const Index dim = box.dim();
+  const VectorType Zero = VectorType::Zero();
+  const VectorType Ones = VectorType::Ones();
+
+  VectorType minPoint = -2 * Ones;
+  minPoint[1] = 1;
+  VectorType maxPoint = Zero;
+  maxPoint[1] = 3;
+  BoxType c(minPoint, maxPoint);
+  // ((-2, 1, -2), (0, 3, 0))
+
+  VectorType cornerBL = (c.min)();
+  VectorType cornerTR = (c.max)();
+  VectorType cornerBR = (c.min)(); cornerBR[0] = cornerTR[0];
+  VectorType cornerTL = (c.max)(); cornerTL[0] = cornerBL[0];
+
+  NonInteger angle = NonInteger(EIGEN_PI/3);
+  Rotation rot = rotate(angle);
+  IsometryTransform tf2;
+  tf2.setIdentity();
+  tf2.rotate(rot);
+
+  c.transform(tf2);
+  // rotate by 60 deg ->  box((-3.59, -1.23, -2), (-0.86, 1.5, 0))
+
+  cornerBL = tf2 * cornerBL;
+  cornerBR = tf2 * cornerBR;
+  cornerTL = tf2 * cornerTL;
+  cornerTR = tf2 * cornerTR;
+
+  VectorType minCorner = Ones * Scalar(-2);
+  VectorType maxCorner = Zero;
+  minCorner[0] = (min)((min)(cornerBL[0], cornerBR[0]), (min)(cornerTL[0], cornerTR[0]));
+  maxCorner[0] = (max)((max)(cornerBL[0], cornerBR[0]), (max)(cornerTL[0], cornerTR[0]));
+  minCorner[1] = (min)((min)(cornerBL[1], cornerBR[1]), (min)(cornerTL[1], cornerTR[1]));
+  maxCorner[1] = (max)((max)(cornerBL[1], cornerBR[1]), (max)(cornerTL[1], cornerTR[1]));
+
+  for (Index d = 2; d < dim; ++d)
+    VERIFY_IS_APPROX(c.sizes()[d], Scalar(2));
+
+  VERIFY_IS_APPROX((c.min)(), minCorner);
+  VERIFY_IS_APPROX((c.max)(), maxCorner);
+
+  VectorType minCornerValue = Ones * Scalar(-2);
+  VectorType maxCornerValue = Zero;
+  minCornerValue[0] = Scalar(Scalar(-sqrt(2*2 + 3*3)) * Scalar(cos(Scalar(atan(2.0/3.0)) - angle/2)));
+  minCornerValue[1] = Scalar(Scalar(-sqrt(1*1 + 2*2)) * Scalar(sin(Scalar(atan(2.0/1.0)) - angle/2)));
+  maxCornerValue[0] = Scalar(-sin(angle));
+  maxCornerValue[1] = Scalar(3 * cos(angle));
+  VERIFY_IS_APPROX((c.min)(), minCornerValue);
+  VERIFY_IS_APPROX((c.max)(), maxCornerValue);
+
+  // randomized test - translate and rotate the box and compare to a box made of transformed vertices
+  for (size_t i = 0; i < 10; ++i)
+  {
+    for (Index d = 0; d < dim; ++d)
+    {
+      minCorner[d] = internal::random<Scalar>(-10,10);
+      maxCorner[d] = minCorner[d] + internal::random<Scalar>(0, 10);
+    }
+
+    c = BoxType(minCorner, maxCorner);
+
+    CornersType corners = boxGetCorners(minCorner, maxCorner);
+
+    typename AffineTransform::LinearMatrixType rotation =
+        randomRotationMatrix<typename AffineTransform::LinearMatrixType>();
+
+    tf2.setIdentity();
+    tf2.rotate(rotation);
+    tf2.translate(VectorType::Random());
+
+    c.transform(tf2);
+    corners = tf2 * corners;
+
+    minCorner = corners.rowwise().minCoeff();
+    maxCorner = corners.rowwise().maxCoeff();
+
+    VERIFY_IS_APPROX((c.min)(), minCorner);
+    VERIFY_IS_APPROX((c.max)(), maxCorner);
+  }
+
+  // randomized test - transform the box with a random affine matrix and compare to a box made of transformed vertices
+  for (size_t i = 0; i < 10; ++i)
+  {
+    for (Index d = 0; d < dim; ++d)
+    {
+      minCorner[d] = internal::random<Scalar>(-10,10);
+      maxCorner[d] = minCorner[d] + internal::random<Scalar>(0, 10);
+    }
+
+    c = BoxType(minCorner, maxCorner);
+
+    CornersType corners = boxGetCorners(minCorner, maxCorner);
+
+    AffineTransform atf = AffineTransform::Identity();
+    atf.linearExt() = AffineTransform::LinearPart::Random();
+    atf.translate(VectorType::Random());
+
+    c.transform(atf);
+    corners = atf * corners;
+
+    minCorner = corners.rowwise().minCoeff();
+    maxCorner = corners.rowwise().maxCoeff();
+
+    VERIFY_IS_APPROX((c.min)(), minCorner);
+    VERIFY_IS_APPROX((c.max)(), maxCorner);
+  }
+}
+
+template<typename BoxType>
+void alignedboxCastTests(const BoxType& box)
+{
+  // casting
+  typedef typename BoxType::Scalar Scalar;
+  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
+
+  const Index dim = box.dim();
+
+  VectorType p0 = VectorType::Random(dim);
+  VectorType p1 = VectorType::Random(dim);
+
+  BoxType b0(dim);
+
+  b0.extend(p0);
+  b0.extend(p1);
+
+  const int Dim = BoxType::AmbientDimAtCompileTime;
+  typedef typename GetDifferentType<Scalar>::type OtherScalar;
+  AlignedBox<OtherScalar,Dim> hp1f = b0.template cast<OtherScalar>();
+  VERIFY_IS_APPROX(hp1f.template cast<Scalar>(),b0);
+  AlignedBox<Scalar,Dim> hp1d = b0.template cast<Scalar>();
+  VERIFY_IS_APPROX(hp1d.template cast<Scalar>(),b0);
+}
+
+
+void specificTest1()
+{
+    Vector2f m; m << -1.0f, -2.0f;
+    Vector2f M; M <<  1.0f,  5.0f;
+
+    typedef AlignedBox2f  BoxType;
+    BoxType box( m, M );
+
+    Vector2f sides = M-m;
+    VERIFY_IS_APPROX(sides, box.sizes() );
+    VERIFY_IS_APPROX(sides[1], box.sizes()[1] );
+    VERIFY_IS_APPROX(sides[1], box.sizes().maxCoeff() );
+    VERIFY_IS_APPROX(sides[0], box.sizes().minCoeff() );
+
+    VERIFY_IS_APPROX( 14.0f, box.volume() );
+    VERIFY_IS_APPROX( 53.0f, box.diagonal().squaredNorm() );
+    VERIFY_IS_APPROX( std::sqrt( 53.0f ), box.diagonal().norm() );
+
+    VERIFY_IS_APPROX( m, box.corner( BoxType::BottomLeft ) );
+    VERIFY_IS_APPROX( M, box.corner( BoxType::TopRight ) );
+    Vector2f bottomRight; bottomRight << M[0], m[1];
+    Vector2f topLeft; topLeft << m[0], M[1];
+    VERIFY_IS_APPROX( bottomRight, box.corner( BoxType::BottomRight ) );
+    VERIFY_IS_APPROX( topLeft, box.corner( BoxType::TopLeft ) );
+}
+
+
+void specificTest2()
+{
+    Vector3i m; m << -1, -2, 0;
+    Vector3i M; M <<  1,  5, 3;
+
+    typedef AlignedBox3i  BoxType;
+    BoxType box( m, M );
+
+    Vector3i sides = M-m;
+    VERIFY_IS_APPROX(sides, box.sizes() );
+    VERIFY_IS_APPROX(sides[1], box.sizes()[1] );
+    VERIFY_IS_APPROX(sides[1], box.sizes().maxCoeff() );
+    VERIFY_IS_APPROX(sides[0], box.sizes().minCoeff() );
+
+    VERIFY_IS_APPROX( 42, box.volume() );
+    VERIFY_IS_APPROX( 62, box.diagonal().squaredNorm() );
+
+    VERIFY_IS_APPROX( m, box.corner( BoxType::BottomLeftFloor ) );
+    VERIFY_IS_APPROX( M, box.corner( BoxType::TopRightCeil ) );
+    Vector3i bottomRightFloor; bottomRightFloor << M[0], m[1], m[2];
+    Vector3i topLeftFloor; topLeftFloor << m[0], M[1], m[2];
+    VERIFY_IS_APPROX( bottomRightFloor, box.corner( BoxType::BottomRightFloor ) );
+    VERIFY_IS_APPROX( topLeftFloor, box.corner( BoxType::TopLeftFloor ) );
+}
+
+
+EIGEN_DECLARE_TEST(geo_alignedbox)
+{
+  for(int i = 0; i < g_repeat; i++)
+  {
+    CALL_SUBTEST_1( (alignedboxNonIntegralRotatable<AlignedBox2f, Rotation2Df>(AlignedBox2f(), &rotate2D)) );
+    CALL_SUBTEST_2( alignedboxCastTests(AlignedBox2f()) );
+
+    CALL_SUBTEST_3( (alignedboxNonIntegralRotatable<AlignedBox3f, AngleAxisf>(AlignedBox3f(), &rotate3DZAxis)) );
+    CALL_SUBTEST_4( alignedboxCastTests(AlignedBox3f()) );
+
+    CALL_SUBTEST_5( (alignedboxNonIntegralRotatable<AlignedBox4d, Matrix4d>(AlignedBox4d(), &rotate4DZWAxis)) );
+    CALL_SUBTEST_6( alignedboxCastTests(AlignedBox4d()) );
+
+    CALL_SUBTEST_7( alignedboxTranslatable(AlignedBox1d()) );
+    CALL_SUBTEST_8( alignedboxCastTests(AlignedBox1d()) );
+
+    CALL_SUBTEST_9( alignedboxTranslatable(AlignedBox1i()) );
+    CALL_SUBTEST_10( (alignedboxRotatable<AlignedBox2i, Matrix2i>(AlignedBox2i(), &rotate2DIntegral<int, Matrix2i>)) );
+    CALL_SUBTEST_11( (alignedboxRotatable<AlignedBox3i, Matrix3i>(AlignedBox3i(), &rotate3DZAxisIntegral<int, Matrix3i>)) );
+
+    CALL_SUBTEST_14( alignedbox(AlignedBox<double,Dynamic>(4)) );
+  }
+  CALL_SUBTEST_12( specificTest1() );
+  CALL_SUBTEST_13( specificTest2() );
+}

diff --git a/test/geo_eulerangles.cpp b/test/geo_eulerangles.cpp
new file mode 100644
index 0000000..693c627
--- /dev/null
+++ b/test/geo_eulerangles.cpp

@@ -0,0 +1,112 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+#include <Eigen/LU>
+#include <Eigen/SVD>
+
+
+template<typename Scalar>
+void verify_euler(const Matrix<Scalar,3,1>& ea, int i, int j, int k)
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef AngleAxis<Scalar> AngleAxisx;
+  using std::abs;
+  Matrix3 m(AngleAxisx(ea[0], Vector3::Unit(i)) * AngleAxisx(ea[1], Vector3::Unit(j)) * AngleAxisx(ea[2], Vector3::Unit(k)));
+  Vector3 eabis = m.eulerAngles(i, j, k);
+  Matrix3 mbis(AngleAxisx(eabis[0], Vector3::Unit(i)) * AngleAxisx(eabis[1], Vector3::Unit(j)) * AngleAxisx(eabis[2], Vector3::Unit(k))); 
+  VERIFY_IS_APPROX(m,  mbis); 
+  /* If I==K, and ea[1]==0, then there no unique solution. */ 
+  /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ 
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+    VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+  
+  // approx_or_less_than does not work for 0
+  VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[0], Scalar(EIGEN_PI));
+  VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[1]);
+  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[1], Scalar(EIGEN_PI));
+  VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[2]);
+  VERIFY_IS_APPROX_OR_LESS_THAN(eabis[2], Scalar(EIGEN_PI));
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler(ea, 0,1,2);
+  verify_euler(ea, 0,1,0);
+  verify_euler(ea, 0,2,1);
+  verify_euler(ea, 0,2,0);
+
+  verify_euler(ea, 1,2,0);
+  verify_euler(ea, 1,2,1);
+  verify_euler(ea, 1,0,2);
+  verify_euler(ea, 1,0,1);
+
+  verify_euler(ea, 2,0,1);
+  verify_euler(ea, 2,0,2);
+  verify_euler(ea, 2,1,0);
+  verify_euler(ea, 2,1,2);
+}
+
+template<typename Scalar> void eulerangles()
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Array<Scalar,3,1> Array3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisx;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Quaternionx q1;
+  q1 = AngleAxisx(a, Vector3::Random().normalized());
+  Matrix3 m;
+  m = q1;
+  
+  Vector3 ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with purely random Quaternion:
+  q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+  m = q1;
+  ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+  check_all_var(ea);
+  
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[1] = 0;
+  check_all_var(ea);
+  
+  ea.head(2).setZero();
+  check_all_var(ea);
+  
+  ea.setZero();
+  check_all_var(ea);
+}
+
+EIGEN_DECLARE_TEST(geo_eulerangles)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eulerangles<float>() );
+    CALL_SUBTEST_2( eulerangles<double>() );
+  }
+}

diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
new file mode 100644
index 0000000..9aebe62
--- /dev/null
+++ b/test/geo_homogeneous.cpp

@@ -0,0 +1,125 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+
+template<typename Scalar,int Size> void homogeneous(void)
+{
+  /* this test covers the following files:
+     Homogeneous.h
+  */
+
+  typedef Matrix<Scalar,Size,Size> MatrixType;
+  typedef Matrix<Scalar,Size,1, ColMajor> VectorType;
+
+  typedef Matrix<Scalar,Size+1,Size> HMatrixType;
+  typedef Matrix<Scalar,Size+1,1> HVectorType;
+
+  typedef Matrix<Scalar,Size,Size+1>   T1MatrixType;
+  typedef Matrix<Scalar,Size+1,Size+1> T2MatrixType;
+  typedef Matrix<Scalar,Size+1,Size> T3MatrixType;
+
+  VectorType v0 = VectorType::Random(),
+             ones = VectorType::Ones();
+
+  HVectorType hv0 = HVectorType::Random();
+
+  MatrixType m0 = MatrixType::Random();
+
+  HMatrixType hm0 = HMatrixType::Random();
+
+  hv0 << v0, 1;
+  VERIFY_IS_APPROX(v0.homogeneous(), hv0);
+  VERIFY_IS_APPROX(v0, hv0.hnormalized());
+  
+  VERIFY_IS_APPROX(v0.homogeneous().sum(), hv0.sum());
+  VERIFY_IS_APPROX(v0.homogeneous().minCoeff(), hv0.minCoeff());
+  VERIFY_IS_APPROX(v0.homogeneous().maxCoeff(), hv0.maxCoeff());
+
+  hm0 << m0, ones.transpose();
+  VERIFY_IS_APPROX(m0.colwise().homogeneous(), hm0);
+  VERIFY_IS_APPROX(m0, hm0.colwise().hnormalized());
+  hm0.row(Size-1).setRandom();
+  for(int j=0; j<Size; ++j)
+    m0.col(j) = hm0.col(j).head(Size) / hm0(Size,j);
+  VERIFY_IS_APPROX(m0, hm0.colwise().hnormalized());
+
+  T1MatrixType t1 = T1MatrixType::Random();
+  VERIFY_IS_APPROX(t1 * (v0.homogeneous().eval()), t1 * v0.homogeneous());
+  VERIFY_IS_APPROX(t1 * (m0.colwise().homogeneous().eval()), t1 * m0.colwise().homogeneous());
+
+  T2MatrixType t2 = T2MatrixType::Random();
+  VERIFY_IS_APPROX(t2 * (v0.homogeneous().eval()), t2 * v0.homogeneous());
+  VERIFY_IS_APPROX(t2 * (m0.colwise().homogeneous().eval()), t2 * m0.colwise().homogeneous());
+  VERIFY_IS_APPROX(t2 * (v0.homogeneous().asDiagonal()), t2 * hv0.asDiagonal());
+  VERIFY_IS_APPROX((v0.homogeneous().asDiagonal()) * t2, hv0.asDiagonal() * t2);
+
+  VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t2,
+                    v0.transpose().rowwise().homogeneous() * t2);
+  VERIFY_IS_APPROX((m0.transpose().rowwise().homogeneous().eval()) * t2,
+                    m0.transpose().rowwise().homogeneous() * t2);
+
+  T3MatrixType t3 = T3MatrixType::Random();
+  VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t3,
+                    v0.transpose().rowwise().homogeneous() * t3);
+  VERIFY_IS_APPROX((m0.transpose().rowwise().homogeneous().eval()) * t3,
+                    m0.transpose().rowwise().homogeneous() * t3);
+
+  // test product with a Transform object
+  Transform<Scalar, Size, Affine> aff;
+  Transform<Scalar, Size, AffineCompact> caff;
+  Transform<Scalar, Size, Projective> proj;
+  Matrix<Scalar, Size, Dynamic>   pts;
+  Matrix<Scalar, Size+1, Dynamic> pts1, pts2;
+
+  aff.affine().setRandom();
+  proj = caff = aff;
+  pts.setRandom(Size,internal::random<int>(1,20));
+  
+  pts1 = pts.colwise().homogeneous();
+  VERIFY_IS_APPROX(aff  * pts.colwise().homogeneous(), (aff  * pts1).colwise().hnormalized());
+  VERIFY_IS_APPROX(caff * pts.colwise().homogeneous(), (caff * pts1).colwise().hnormalized());
+  VERIFY_IS_APPROX(proj * pts.colwise().homogeneous(), (proj * pts1));
+
+  VERIFY_IS_APPROX((aff  * pts1).colwise().hnormalized(),  aff  * pts);
+  VERIFY_IS_APPROX((caff * pts1).colwise().hnormalized(), caff * pts);
+  
+  pts2 = pts1;
+  pts2.row(Size).setRandom();
+  VERIFY_IS_APPROX((aff  * pts2).colwise().hnormalized(), aff  * pts2.colwise().hnormalized());
+  VERIFY_IS_APPROX((caff * pts2).colwise().hnormalized(), caff * pts2.colwise().hnormalized());
+  VERIFY_IS_APPROX((proj * pts2).colwise().hnormalized(), (proj * pts2.colwise().hnormalized().colwise().homogeneous()).colwise().hnormalized());
+  
+  // Test combination of homogeneous
+  
+  VERIFY_IS_APPROX( (t2 * v0.homogeneous()).hnormalized(),
+                       (t2.template topLeftCorner<Size,Size>() * v0 + t2.template topRightCorner<Size,1>())
+                     / ((t2.template bottomLeftCorner<1,Size>()*v0).value() + t2(Size,Size)) );
+  
+  VERIFY_IS_APPROX( (t2 * pts.colwise().homogeneous()).colwise().hnormalized(),
+                    (Matrix<Scalar, Size+1, Dynamic>(t2 * pts1).colwise().hnormalized()) );
+  
+  VERIFY_IS_APPROX( (t2 .lazyProduct( v0.homogeneous() )).hnormalized(), (t2 * v0.homogeneous()).hnormalized() );
+  VERIFY_IS_APPROX( (t2 .lazyProduct  ( pts.colwise().homogeneous() )).colwise().hnormalized(), (t2 * pts1).colwise().hnormalized() );
+  
+  VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
+  VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
+
+  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
+}
+
+EIGEN_DECLARE_TEST(geo_homogeneous)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( homogeneous<float,1>() ));
+    CALL_SUBTEST_2(( homogeneous<double,3>() ));
+    CALL_SUBTEST_3(( homogeneous<double,8>() ));
+  }
+}

diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
new file mode 100644
index 0000000..44b2f2a
--- /dev/null
+++ b/test/geo_hyperplane.cpp

@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+#include <Eigen/LU>
+#include <Eigen/QR>
+
+template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
+{
+  /* this test covers the following files:
+     Hyperplane.h
+  */
+  using std::abs;
+  const Index dim = _plane.dim();
+  enum { Options = HyperplaneType::Options };
+  typedef typename HyperplaneType::Scalar Scalar;
+  typedef typename HyperplaneType::RealScalar RealScalar;
+  typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime,
+                         HyperplaneType::AmbientDimAtCompileTime> MatrixType;
+
+  VectorType p0 = VectorType::Random(dim);
+  VectorType p1 = VectorType::Random(dim);
+
+  VectorType n0 = VectorType::Random(dim).normalized();
+  VectorType n1 = VectorType::Random(dim).normalized();
+
+  HyperplaneType pl0(n0, p0);
+  HyperplaneType pl1(n1, p1);
+  HyperplaneType pl2 = pl1;
+
+  Scalar s0 = internal::random<Scalar>();
+  Scalar s1 = internal::random<Scalar>();
+
+  VERIFY_IS_APPROX( n1.dot(n1), Scalar(1) );
+
+  VERIFY_IS_MUCH_SMALLER_THAN( pl0.absDistance(p0), Scalar(1) );
+  if(numext::abs2(s0)>RealScalar(1e-6))
+    VERIFY_IS_APPROX( pl1.signedDistance(p1 + n1 * s0), s0);
+  else
+    VERIFY_IS_MUCH_SMALLER_THAN( abs(pl1.signedDistance(p1 + n1 * s0) - s0), Scalar(1) );
+  VERIFY_IS_MUCH_SMALLER_THAN( pl1.signedDistance(pl1.projection(p0)), Scalar(1) );
+  VERIFY_IS_MUCH_SMALLER_THAN( pl1.absDistance(p1 +  pl1.normal().unitOrthogonal() * s1), Scalar(1) );
+
+  // transform
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+    MatrixType rot = MatrixType::Random(dim,dim).householderQr().householderQ();
+    DiagonalMatrix<Scalar,HyperplaneType::AmbientDimAtCompileTime> scaling(VectorType::Random());
+    Translation<Scalar,HyperplaneType::AmbientDimAtCompileTime> translation(VectorType::Random());
+    
+    while(scaling.diagonal().cwiseAbs().minCoeff()<RealScalar(1e-4)) scaling.diagonal() = VectorType::Random();
+
+    pl2 = pl1;
+    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot).absDistance(rot * p1), Scalar(1) );
+    pl2 = pl1;
+    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot,Isometry).absDistance(rot * p1), Scalar(1) );
+    pl2 = pl1;
+    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling).absDistance((rot*scaling) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
+    pl2 = pl1;
+    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling*translation)
+                                  .absDistance((rot*scaling*translation) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
+    pl2 = pl1;
+    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*translation,Isometry)
+                                 .absDistance((rot*translation) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
+  }
+
+  // casting
+  const int Dim = HyperplaneType::AmbientDimAtCompileTime;
+  typedef typename GetDifferentType<Scalar>::type OtherScalar;
+  Hyperplane<OtherScalar,Dim,Options> hp1f = pl1.template cast<OtherScalar>();
+  VERIFY_IS_APPROX(hp1f.template cast<Scalar>(),pl1);
+  Hyperplane<Scalar,Dim,Options> hp1d = pl1.template cast<Scalar>();
+  VERIFY_IS_APPROX(hp1d.template cast<Scalar>(),pl1);
+}
+
+template<typename Scalar> void lines()
+{
+  using std::abs;
+  typedef Hyperplane<Scalar, 2> HLine;
+  typedef ParametrizedLine<Scalar, 2> PLine;
+  typedef Matrix<Scalar,2,1> Vector;
+  typedef Matrix<Scalar,3,1> CoeffsType;
+
+  for(int i = 0; i < 10; i++)
+  {
+    Vector center = Vector::Random();
+    Vector u = Vector::Random();
+    Vector v = Vector::Random();
+    Scalar a = internal::random<Scalar>();
+    while (abs(a-1) < Scalar(1e-4)) a = internal::random<Scalar>();
+    while (u.norm() < Scalar(1e-4)) u = Vector::Random();
+    while (v.norm() < Scalar(1e-4)) v = Vector::Random();
+
+    HLine line_u = HLine::Through(center + u, center + a*u);
+    HLine line_v = HLine::Through(center + v, center + a*v);
+
+    // the line equations should be normalized so that a^2+b^2=1
+    VERIFY_IS_APPROX(line_u.normal().norm(), Scalar(1));
+    VERIFY_IS_APPROX(line_v.normal().norm(), Scalar(1));
+
+    Vector result = line_u.intersection(line_v);
+
+    // the lines should intersect at the point we called "center"
+    if(abs(a-1) > Scalar(1e-2) && abs(v.normalized().dot(u.normalized()))<Scalar(0.9))
+      VERIFY_IS_APPROX(result, center);
+
+    // check conversions between two types of lines
+    PLine pl(line_u); // gcc 3.3 will crash if we don't name this variable.
+    HLine line_u2(pl);
+    CoeffsType converted_coeffs = line_u2.coeffs();
+    if(line_u2.normal().dot(line_u.normal())<Scalar(0))
+      converted_coeffs = -line_u2.coeffs();
+    VERIFY(line_u.coeffs().isApprox(converted_coeffs));
+  }
+}
+
+template<typename Scalar> void planes()
+{
+  using std::abs;
+  typedef Hyperplane<Scalar, 3> Plane;
+  typedef Matrix<Scalar,3,1> Vector;
+
+  for(int i = 0; i < 10; i++)
+  {
+    Vector v0 = Vector::Random();
+    Vector v1(v0), v2(v0);
+    if(internal::random<double>(0,1)>0.25)
+      v1 += Vector::Random();
+    if(internal::random<double>(0,1)>0.25)
+      v2 += v1 * std::pow(internal::random<Scalar>(0,1),internal::random<int>(1,16));
+    if(internal::random<double>(0,1)>0.25)
+      v2 += Vector::Random() * std::pow(internal::random<Scalar>(0,1),internal::random<int>(1,16));
+
+    Plane p0 = Plane::Through(v0, v1, v2);
+
+    VERIFY_IS_APPROX(p0.normal().norm(), Scalar(1));
+    VERIFY_IS_MUCH_SMALLER_THAN(p0.absDistance(v0), Scalar(1));
+    VERIFY_IS_MUCH_SMALLER_THAN(p0.absDistance(v1), Scalar(1));
+    VERIFY_IS_MUCH_SMALLER_THAN(p0.absDistance(v2), Scalar(1));
+  }
+}
+
+template<typename Scalar> void hyperplane_alignment()
+{
+  typedef Hyperplane<Scalar,3,AutoAlign> Plane3a;
+  typedef Hyperplane<Scalar,3,DontAlign> Plane3u;
+
+  EIGEN_ALIGN_MAX Scalar array1[4];
+  EIGEN_ALIGN_MAX Scalar array2[4];
+  EIGEN_ALIGN_MAX Scalar array3[4+1];
+  Scalar* array3u = array3+1;
+
+  Plane3a *p1 = ::new(reinterpret_cast<void*>(array1)) Plane3a;
+  Plane3u *p2 = ::new(reinterpret_cast<void*>(array2)) Plane3u;
+  Plane3u *p3 = ::new(reinterpret_cast<void*>(array3u)) Plane3u;
+  
+  p1->coeffs().setRandom();
+  *p2 = *p1;
+  *p3 = *p1;
+
+  VERIFY_IS_APPROX(p1->coeffs(), p2->coeffs());
+  VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs());
+}
+
+
+EIGEN_DECLARE_TEST(geo_hyperplane)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( hyperplane(Hyperplane<float,2>()) );
+    CALL_SUBTEST_2( hyperplane(Hyperplane<float,3>()) );
+    CALL_SUBTEST_2( hyperplane(Hyperplane<float,3,DontAlign>()) );
+    CALL_SUBTEST_2( hyperplane_alignment<float>() );
+    CALL_SUBTEST_3( hyperplane(Hyperplane<double,4>()) );
+    CALL_SUBTEST_4( hyperplane(Hyperplane<std::complex<double>,5>()) );
+    CALL_SUBTEST_1( lines<float>() );
+    CALL_SUBTEST_3( lines<double>() );
+    CALL_SUBTEST_2( planes<float>() );
+    CALL_SUBTEST_5( planes<double>() );
+  }
+}

diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp
new file mode 100644
index 0000000..b7b6607
--- /dev/null
+++ b/test/geo_orthomethods.cpp

@@ -0,0 +1,133 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+#include <Eigen/LU>
+#include <Eigen/SVD>
+
+/* this test covers the following files:
+   Geometry/OrthoMethods.h
+*/
+
+template<typename Scalar> void orthomethods_3()
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+
+  typedef Matrix<Scalar,4,1> Vector4;
+
+  Vector3 v0 = Vector3::Random(),
+          v1 = Vector3::Random(),
+          v2 = Vector3::Random();
+
+  // cross product
+  VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(v2).dot(v1), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(v1.dot(v1.cross(v2)), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(v2).dot(v2), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(v2.dot(v1.cross(v2)), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(v1.cross(Vector3::Random()).dot(v1), Scalar(1));
+  Matrix3 mat3;
+  mat3 << v0.normalized(),
+         (v0.cross(v1)).normalized(),
+         (v0.cross(v1).cross(v0)).normalized();
+  VERIFY(mat3.isUnitary());
+  
+  mat3.setRandom();
+  VERIFY_IS_APPROX(v0.cross(mat3*v1), -(mat3*v1).cross(v0));
+  VERIFY_IS_APPROX(v0.cross(mat3.lazyProduct(v1)), -(mat3.lazyProduct(v1)).cross(v0));
+
+  // colwise/rowwise cross product
+  mat3.setRandom();
+  Vector3 vec3 = Vector3::Random();
+  Matrix3 mcross;
+  int i = internal::random<int>(0,2);
+  mcross = mat3.colwise().cross(vec3);
+  VERIFY_IS_APPROX(mcross.col(i), mat3.col(i).cross(vec3));
+  
+  VERIFY_IS_MUCH_SMALLER_THAN((mat3.adjoint() * mat3.colwise().cross(vec3)).diagonal().cwiseAbs().sum(), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN((mat3.adjoint() * mat3.colwise().cross(Vector3::Random())).diagonal().cwiseAbs().sum(), Scalar(1));
+  
+  VERIFY_IS_MUCH_SMALLER_THAN((vec3.adjoint() * mat3.colwise().cross(vec3)).cwiseAbs().sum(), Scalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN((vec3.adjoint() * Matrix3::Random().colwise().cross(vec3)).cwiseAbs().sum(), Scalar(1));
+  
+  mcross = mat3.rowwise().cross(vec3);
+  VERIFY_IS_APPROX(mcross.row(i), mat3.row(i).cross(vec3));
+
+  // cross3
+  Vector4 v40 = Vector4::Random(),
+          v41 = Vector4::Random(),
+          v42 = Vector4::Random();
+  v40.w() = v41.w() = v42.w() = 0;
+  v42.template head<3>() = v40.template head<3>().cross(v41.template head<3>());
+  VERIFY_IS_APPROX(v40.cross3(v41), v42);
+  VERIFY_IS_MUCH_SMALLER_THAN(v40.cross3(Vector4::Random()).dot(v40), Scalar(1));
+  
+  // check mixed product
+  typedef Matrix<RealScalar, 3, 1> RealVector3;
+  RealVector3 rv1 = RealVector3::Random();
+  VERIFY_IS_APPROX(v1.cross(rv1.template cast<Scalar>()), v1.cross(rv1));
+  VERIFY_IS_APPROX(rv1.template cast<Scalar>().cross(v1), rv1.cross(v1));
+}
+
+template<typename Scalar, int Size> void orthomethods(int size=Size)
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,Size,1> VectorType;
+  typedef Matrix<Scalar,3,Size> Matrix3N;
+  typedef Matrix<Scalar,Size,3> MatrixN3;
+  typedef Matrix<Scalar,3,1> Vector3;
+
+  VectorType v0 = VectorType::Random(size);
+
+  // unitOrthogonal
+  VERIFY_IS_MUCH_SMALLER_THAN(v0.unitOrthogonal().dot(v0), Scalar(1));
+  VERIFY_IS_APPROX(v0.unitOrthogonal().norm(), RealScalar(1));
+
+  if (size>=3)
+  {
+    v0.template head<2>().setZero();
+    v0.tail(size-2).setRandom();
+
+    VERIFY_IS_MUCH_SMALLER_THAN(v0.unitOrthogonal().dot(v0), Scalar(1));
+    VERIFY_IS_APPROX(v0.unitOrthogonal().norm(), RealScalar(1));
+  }
+
+  // colwise/rowwise cross product
+  Vector3 vec3 = Vector3::Random();
+  int i = internal::random<int>(0,size-1);
+
+  Matrix3N mat3N(3,size), mcross3N(3,size);
+  mat3N.setRandom();
+  mcross3N = mat3N.colwise().cross(vec3);
+  VERIFY_IS_APPROX(mcross3N.col(i), mat3N.col(i).cross(vec3));
+
+  MatrixN3 matN3(size,3), mcrossN3(size,3);
+  matN3.setRandom();
+  mcrossN3 = matN3.rowwise().cross(vec3);
+  VERIFY_IS_APPROX(mcrossN3.row(i), matN3.row(i).cross(vec3));
+}
+
+EIGEN_DECLARE_TEST(geo_orthomethods)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( orthomethods_3<float>() );
+    CALL_SUBTEST_2( orthomethods_3<double>() );
+    CALL_SUBTEST_4( orthomethods_3<std::complex<double> >() );
+    CALL_SUBTEST_1( (orthomethods<float,2>()) );
+    CALL_SUBTEST_2( (orthomethods<double,2>()) );
+    CALL_SUBTEST_1( (orthomethods<float,3>()) );
+    CALL_SUBTEST_2( (orthomethods<double,3>()) );
+    CALL_SUBTEST_3( (orthomethods<float,7>()) );
+    CALL_SUBTEST_4( (orthomethods<std::complex<double>,8>()) );
+    CALL_SUBTEST_5( (orthomethods<float,Dynamic>(36)) );
+    CALL_SUBTEST_6( (orthomethods<double,Dynamic>(35)) );
+  }
+}

diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
new file mode 100644
index 0000000..e4b194a
--- /dev/null
+++ b/test/geo_parametrizedline.cpp

@@ -0,0 +1,125 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+#include <Eigen/LU>
+#include <Eigen/QR>
+
+template<typename LineType> void parametrizedline(const LineType& _line)
+{
+  /* this test covers the following files:
+     ParametrizedLine.h
+  */
+  using std::abs;
+  const Index dim = _line.dim();
+  typedef typename LineType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, LineType::AmbientDimAtCompileTime, 1> VectorType;
+  typedef Hyperplane<Scalar,LineType::AmbientDimAtCompileTime> HyperplaneType;
+  typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime,
+                         HyperplaneType::AmbientDimAtCompileTime> MatrixType;
+
+  VectorType p0 = VectorType::Random(dim);
+  VectorType p1 = VectorType::Random(dim);
+
+  VectorType d0 = VectorType::Random(dim).normalized();
+
+  LineType l0(p0, d0);
+
+  Scalar s0 = internal::random<Scalar>();
+  Scalar s1 = abs(internal::random<Scalar>());
+
+  VERIFY_IS_MUCH_SMALLER_THAN( l0.distance(p0), RealScalar(1) );
+  VERIFY_IS_MUCH_SMALLER_THAN( l0.distance(p0+s0*d0), RealScalar(1) );
+  VERIFY_IS_APPROX( (l0.projection(p1)-p1).norm(), l0.distance(p1) );
+  VERIFY_IS_MUCH_SMALLER_THAN( l0.distance(l0.projection(p1)), RealScalar(1) );
+  VERIFY_IS_APPROX( Scalar(l0.distance((p0+s0*d0) + d0.unitOrthogonal() * s1)), s1 );
+
+  // casting
+  const int Dim = LineType::AmbientDimAtCompileTime;
+  typedef typename GetDifferentType<Scalar>::type OtherScalar;
+  ParametrizedLine<OtherScalar,Dim> hp1f = l0.template cast<OtherScalar>();
+  VERIFY_IS_APPROX(hp1f.template cast<Scalar>(),l0);
+  ParametrizedLine<Scalar,Dim> hp1d = l0.template cast<Scalar>();
+  VERIFY_IS_APPROX(hp1d.template cast<Scalar>(),l0);
+
+  // intersections
+  VectorType p2 = VectorType::Random(dim);
+  VectorType n2 = VectorType::Random(dim).normalized();
+  HyperplaneType hp(p2,n2);
+  Scalar t = l0.intersectionParameter(hp);
+  VectorType pi = l0.pointAt(t);
+  VERIFY_IS_MUCH_SMALLER_THAN(hp.signedDistance(pi), RealScalar(1));
+  VERIFY_IS_MUCH_SMALLER_THAN(l0.distance(pi), RealScalar(1));
+  VERIFY_IS_APPROX(l0.intersectionPoint(hp), pi);
+
+  // transform
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+    MatrixType rot = MatrixType::Random(dim,dim).householderQr().householderQ();
+    DiagonalMatrix<Scalar,LineType::AmbientDimAtCompileTime> scaling(VectorType::Random());
+    Translation<Scalar,LineType::AmbientDimAtCompileTime> translation(VectorType::Random());
+
+    while(scaling.diagonal().cwiseAbs().minCoeff()<RealScalar(1e-4)) scaling.diagonal() = VectorType::Random();
+
+    LineType l1 = l0;
+    VectorType p3 = l0.pointAt(Scalar(1));
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot).distance(rot * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot,Isometry).distance(rot * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot*scaling).distance((rot*scaling) * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot*scaling*translation)
+                                   .distance((rot*scaling*translation) * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot*translation,Isometry)
+                                   .distance((rot*translation) * p3), Scalar(1) );
+  }
+
+}
+
+template<typename Scalar> void parametrizedline_alignment()
+{
+  typedef ParametrizedLine<Scalar,4,AutoAlign> Line4a;
+  typedef ParametrizedLine<Scalar,4,DontAlign> Line4u;
+
+  EIGEN_ALIGN_MAX Scalar array1[16];
+  EIGEN_ALIGN_MAX Scalar array2[16];
+  EIGEN_ALIGN_MAX Scalar array3[16+1];
+  Scalar* array3u = array3+1;
+
+  Line4a *p1 = ::new(reinterpret_cast<void*>(array1)) Line4a;
+  Line4u *p2 = ::new(reinterpret_cast<void*>(array2)) Line4u;
+  Line4u *p3 = ::new(reinterpret_cast<void*>(array3u)) Line4u;
+  
+  p1->origin().setRandom();
+  p1->direction().setRandom();
+  *p2 = *p1;
+  *p3 = *p1;
+
+  VERIFY_IS_APPROX(p1->origin(), p2->origin());
+  VERIFY_IS_APPROX(p1->origin(), p3->origin());
+  VERIFY_IS_APPROX(p1->direction(), p2->direction());
+  VERIFY_IS_APPROX(p1->direction(), p3->direction());
+}
+
+EIGEN_DECLARE_TEST(geo_parametrizedline)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( parametrizedline(ParametrizedLine<float,2>()) );
+    CALL_SUBTEST_2( parametrizedline(ParametrizedLine<float,3>()) );
+    CALL_SUBTEST_2( parametrizedline_alignment<float>() );
+    CALL_SUBTEST_3( parametrizedline(ParametrizedLine<double,4>()) );
+    CALL_SUBTEST_3( parametrizedline_alignment<double>() );
+    CALL_SUBTEST_4( parametrizedline(ParametrizedLine<std::complex<double>,5>()) );
+  }
+}

diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
new file mode 100644
index 0000000..c561fc8
--- /dev/null
+++ b/test/geo_quaternion.cpp

@@ -0,0 +1,332 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Mathieu Gautier <mathieu.gautier@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+#include <Eigen/LU>
+#include <Eigen/SVD>
+#include "AnnoyingScalar.h"
+
+template<typename T> T bounded_acos(T v)
+{
+  using std::acos;
+  using std::min;
+  using std::max;
+  return acos((max)(T(-1),(min)(v,T(1))));
+}
+
+template<typename QuatType> void check_slerp(const QuatType& q0, const QuatType& q1)
+{
+  using std::abs;
+  typedef typename QuatType::Scalar Scalar;
+  typedef AngleAxis<Scalar> AA;
+
+  Scalar largeEps = test_precision<Scalar>();
+
+  Scalar theta_tot = AA(q1*q0.inverse()).angle();
+  if(theta_tot>Scalar(EIGEN_PI))
+    theta_tot = Scalar(2.)*Scalar(EIGEN_PI)-theta_tot;
+  for(Scalar t=0; t<=Scalar(1.001); t+=Scalar(0.1))
+  {
+    QuatType q = q0.slerp(t,q1);
+    Scalar theta = AA(q*q0.inverse()).angle();
+    VERIFY(abs(q.norm() - 1) < largeEps);
+    if(theta_tot==0)  VERIFY(theta_tot==0);
+    else              VERIFY(abs(theta - t * theta_tot) < largeEps);
+  }
+}
+
+template<typename Scalar, int Options> void quaternion(void)
+{
+  /* this test covers the following files:
+     Quaternion.h
+  */
+  using std::abs;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Quaternion<Scalar,Options> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisx;
+
+  Scalar largeEps = test_precision<Scalar>();
+  if (internal::is_same<Scalar,float>::value)
+    largeEps = Scalar(1e-3);
+
+  Scalar eps = internal::random<Scalar>() * Scalar(1e-2);
+
+  Vector3 v0 = Vector3::Random(),
+          v1 = Vector3::Random(),
+          v2 = Vector3::Random(),
+          v3 = Vector3::Random();
+
+  Scalar  a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI)),
+          b = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+
+  // Quaternion: Identity(), setIdentity();
+  Quaternionx q1, q2;
+  q2.setIdentity();
+  VERIFY_IS_APPROX(Quaternionx(Quaternionx::Identity()).coeffs(), q2.coeffs());
+  q1.coeffs().setRandom();
+  VERIFY_IS_APPROX(q1.coeffs(), (q1*q2).coeffs());
+
+#ifndef EIGEN_NO_IO
+  // Printing
+  std::ostringstream ss;
+  ss << q2;
+  VERIFY(ss.str() == "0i + 0j + 0k + 1");
+#endif
+
+  // concatenation
+  q1 *= q2;
+
+  q1 = AngleAxisx(a, v0.normalized());
+  q2 = AngleAxisx(a, v1.normalized());
+
+  // angular distance
+  Scalar refangle = abs(AngleAxisx(q1.inverse()*q2).angle());
+  if (refangle>Scalar(EIGEN_PI))
+    refangle = Scalar(2)*Scalar(EIGEN_PI) - refangle;
+
+  if((q1.coeffs()-q2.coeffs()).norm() > Scalar(10)*largeEps)
+  {
+    VERIFY_IS_MUCH_SMALLER_THAN(abs(q1.angularDistance(q2) - refangle), Scalar(1));
+  }
+
+  // rotation matrix conversion
+  VERIFY_IS_APPROX(q1 * v2, q1.toRotationMatrix() * v2);
+  VERIFY_IS_APPROX(q1 * q2 * v2,
+    q1.toRotationMatrix() * q2.toRotationMatrix() * v2);
+
+  VERIFY(  (q2*q1).isApprox(q1*q2, largeEps)
+        || !(q2 * q1 * v2).isApprox(q1.toRotationMatrix() * q2.toRotationMatrix() * v2));
+
+  q2 = q1.toRotationMatrix();
+  VERIFY_IS_APPROX(q1*v1,q2*v1);
+
+  Matrix3 rot1(q1);
+  VERIFY_IS_APPROX(q1*v1,rot1*v1);
+  Quaternionx q3(rot1.transpose()*rot1);
+  VERIFY_IS_APPROX(q3*v1,v1);
+
+
+  // angle-axis conversion
+  AngleAxisx aa = AngleAxisx(q1);
+  VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
+
+  // Do not execute the test if the rotation angle is almost zero, or
+  // the rotation axis and v1 are almost parallel.
+  if (abs(aa.angle()) > Scalar(5)*test_precision<Scalar>()
+      && (aa.axis() - v1.normalized()).norm() < Scalar(1.99)
+      && (aa.axis() + v1.normalized()).norm() < Scalar(1.99))
+  {
+    VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1);
+  }
+
+  // from two vector creation
+  VERIFY_IS_APPROX( v2.normalized(),(q2.setFromTwoVectors(v1, v2)*v1).normalized());
+  VERIFY_IS_APPROX( v1.normalized(),(q2.setFromTwoVectors(v1, v1)*v1).normalized());
+  VERIFY_IS_APPROX(-v1.normalized(),(q2.setFromTwoVectors(v1,-v1)*v1).normalized());
+  if (internal::is_same<Scalar,double>::value)
+  {
+    v3 = (v1.array()+eps).matrix();
+    VERIFY_IS_APPROX( v3.normalized(),(q2.setFromTwoVectors(v1, v3)*v1).normalized());
+    VERIFY_IS_APPROX(-v3.normalized(),(q2.setFromTwoVectors(v1,-v3)*v1).normalized());
+  }
+
+  // from two vector creation static function
+  VERIFY_IS_APPROX( v2.normalized(),(Quaternionx::FromTwoVectors(v1, v2)*v1).normalized());
+  VERIFY_IS_APPROX( v1.normalized(),(Quaternionx::FromTwoVectors(v1, v1)*v1).normalized());
+  VERIFY_IS_APPROX(-v1.normalized(),(Quaternionx::FromTwoVectors(v1,-v1)*v1).normalized());
+  if (internal::is_same<Scalar,double>::value)
+  {
+    v3 = (v1.array()+eps).matrix();
+    VERIFY_IS_APPROX( v3.normalized(),(Quaternionx::FromTwoVectors(v1, v3)*v1).normalized());
+    VERIFY_IS_APPROX(-v3.normalized(),(Quaternionx::FromTwoVectors(v1,-v3)*v1).normalized());
+  }
+
+  // inverse and conjugate
+  VERIFY_IS_APPROX(q1 * (q1.inverse() * v1), v1);
+  VERIFY_IS_APPROX(q1 * (q1.conjugate() * v1), v1);
+
+  // test casting
+  Quaternion<float> q1f = q1.template cast<float>();
+  VERIFY_IS_APPROX(q1f.template cast<Scalar>(),q1);
+  Quaternion<double> q1d = q1.template cast<double>();
+  VERIFY_IS_APPROX(q1d.template cast<Scalar>(),q1);
+
+  // test bug 369 - improper alignment.
+  Quaternionx *q = new Quaternionx;
+  delete q;
+
+  q1 = Quaternionx::UnitRandom();
+  q2 = Quaternionx::UnitRandom();
+  check_slerp(q1,q2);
+
+  q1 = AngleAxisx(b, v1.normalized());
+  q2 = AngleAxisx(b+Scalar(EIGEN_PI), v1.normalized());
+  check_slerp(q1,q2);
+
+  q1 = AngleAxisx(b,  v1.normalized());
+  q2 = AngleAxisx(-b, -v1.normalized());
+  check_slerp(q1,q2);
+
+  q1 = Quaternionx::UnitRandom();
+  q2.coeffs() = -q1.coeffs();
+  check_slerp(q1,q2);
+}
+
+template<typename Scalar> void mapQuaternion(void){
+  typedef Map<Quaternion<Scalar>, Aligned> MQuaternionA;
+  typedef Map<const Quaternion<Scalar>, Aligned> MCQuaternionA;
+  typedef Map<Quaternion<Scalar> > MQuaternionUA;
+  typedef Map<const Quaternion<Scalar> > MCQuaternionUA;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef AngleAxis<Scalar> AngleAxisx;
+  
+  Vector3 v0 = Vector3::Random(),
+          v1 = Vector3::Random();
+  Scalar  a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+
+  EIGEN_ALIGN_MAX Scalar array1[4];
+  EIGEN_ALIGN_MAX Scalar array2[4];
+  EIGEN_ALIGN_MAX Scalar array3[4+1];
+  Scalar* array3unaligned = array3+1;
+  
+  MQuaternionA    mq1(array1);
+  MCQuaternionA   mcq1(array1);
+  MQuaternionA    mq2(array2);
+  MQuaternionUA   mq3(array3unaligned);
+  MCQuaternionUA  mcq3(array3unaligned);
+
+//  std::cerr << array1 << " " << array2 << " " << array3 << "\n";
+  mq1 = AngleAxisx(a, v0.normalized());
+  mq2 = mq1;
+  mq3 = mq1;
+
+  Quaternionx q1 = mq1;
+  Quaternionx q2 = mq2;
+  Quaternionx q3 = mq3;
+  Quaternionx q4 = MCQuaternionUA(array3unaligned);
+
+  VERIFY_IS_APPROX(q1.coeffs(), q2.coeffs());
+  VERIFY_IS_APPROX(q1.coeffs(), q3.coeffs());
+  VERIFY_IS_APPROX(q4.coeffs(), q3.coeffs());
+    
+  VERIFY_IS_APPROX(mq1 * (mq1.inverse() * v1), v1);
+  VERIFY_IS_APPROX(mq1 * (mq1.conjugate() * v1), v1);
+  
+  VERIFY_IS_APPROX(mcq1 * (mcq1.inverse() * v1), v1);
+  VERIFY_IS_APPROX(mcq1 * (mcq1.conjugate() * v1), v1);
+  
+  VERIFY_IS_APPROX(mq3 * (mq3.inverse() * v1), v1);
+  VERIFY_IS_APPROX(mq3 * (mq3.conjugate() * v1), v1);
+  
+  VERIFY_IS_APPROX(mcq3 * (mcq3.inverse() * v1), v1);
+  VERIFY_IS_APPROX(mcq3 * (mcq3.conjugate() * v1), v1);
+  
+  VERIFY_IS_APPROX(mq1*mq2, q1*q2);
+  VERIFY_IS_APPROX(mq3*mq2, q3*q2);
+  VERIFY_IS_APPROX(mcq1*mq2, q1*q2);
+  VERIFY_IS_APPROX(mcq3*mq2, q3*q2);
+
+  // Bug 1461, compilation issue with Map<const Quat>::w(), and other reference/constness checks:
+  VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum());
+  VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum());
+  mq3.w() = 1;
+  const Quaternionx& cq3(q3);
+  VERIFY( &cq3.x() == &q3.x() );
+  const MQuaternionUA& cmq3(mq3);
+  VERIFY( &cmq3.x() == &mq3.x() );
+  // FIXME the following should be ok. The problem is that currently the LValueBit flag
+  // is used to determine whether we can return a coeff by reference or not, which is not enough for Map<const ...>.
+  //const MCQuaternionUA& cmcq3(mcq3);
+  //VERIFY( &cmcq3.x() == &mcq3.x() );
+
+  // test cast
+  {
+    Quaternion<float> q1f = mq1.template cast<float>();
+    VERIFY_IS_APPROX(q1f.template cast<Scalar>(),mq1);
+    Quaternion<double> q1d = mq1.template cast<double>();
+    VERIFY_IS_APPROX(q1d.template cast<Scalar>(),mq1);
+  }
+}
+
+template<typename Scalar> void quaternionAlignment(void){
+  typedef Quaternion<Scalar,AutoAlign> QuaternionA;
+  typedef Quaternion<Scalar,DontAlign> QuaternionUA;
+
+  EIGEN_ALIGN_MAX Scalar array1[4];
+  EIGEN_ALIGN_MAX Scalar array2[4];
+  EIGEN_ALIGN_MAX Scalar array3[4+1];
+  Scalar* arrayunaligned = array3+1;
+
+  QuaternionA *q1 = ::new(reinterpret_cast<void*>(array1)) QuaternionA;
+  QuaternionUA *q2 = ::new(reinterpret_cast<void*>(array2)) QuaternionUA;
+  QuaternionUA *q3 = ::new(reinterpret_cast<void*>(arrayunaligned)) QuaternionUA;
+
+  q1->coeffs().setRandom();
+  *q2 = *q1;
+  *q3 = *q1;
+
+  VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs());
+  VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs());
+}
+
+template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
+{
+  // there's a lot that we can't test here while still having this test compile!
+  // the only possible approach would be to run a script trying to compile stuff and checking that it fails.
+  // CMake can help with that.
+
+  // verify that map-to-const don't have LvalueBit
+  typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
+  VERIFY( !(internal::traits<Map<ConstPlainObjectType> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Map<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) );
+  VERIFY( !(Map<ConstPlainObjectType>::Flags & LvalueBit) );
+  VERIFY( !(Map<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
+}
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+
+// Regression for bug 1573
+struct MovableClass {
+  // The following line is a workaround for gcc 4.7 and 4.8 (see bug 1573 comments).
+  static_assert(std::is_nothrow_move_constructible<Quaternionf>::value,"");
+  MovableClass() = default;
+  MovableClass(const MovableClass&) = default;
+  MovableClass(MovableClass&&) noexcept = default;
+  MovableClass& operator=(const MovableClass&) = default;
+  MovableClass& operator=(MovableClass&&) = default;
+  Quaternionf m_quat;
+};
+
+#endif
+
+EIGEN_DECLARE_TEST(geo_quaternion)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( quaternion<float,AutoAlign>() ));
+    CALL_SUBTEST_1( check_const_correctness(Quaternionf()) );
+    CALL_SUBTEST_1(( quaternion<float,DontAlign>() ));
+    CALL_SUBTEST_1(( quaternionAlignment<float>() ));
+    CALL_SUBTEST_1( mapQuaternion<float>() );
+
+    CALL_SUBTEST_2(( quaternion<double,AutoAlign>() ));
+    CALL_SUBTEST_2( check_const_correctness(Quaterniond()) );
+    CALL_SUBTEST_2(( quaternion<double,DontAlign>() ));
+    CALL_SUBTEST_2(( quaternionAlignment<double>() ));
+    CALL_SUBTEST_2( mapQuaternion<double>() );
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    CALL_SUBTEST_3(( quaternion<AnnoyingScalar,AutoAlign>() ));
+  }
+}

diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
new file mode 100644
index 0000000..72c6eda
--- /dev/null
+++ b/test/geo_transformations.cpp

@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Geometry>
+#include <Eigen/LU>
+#include <Eigen/SVD>
+
+template<typename T>
+Matrix<T,2,1> angleToVec(T a)
+{
+  return Matrix<T,2,1>(std::cos(a), std::sin(a));
+}
+
+// This permits to workaround a bug in clang/llvm code generation.
+template<typename T>
+EIGEN_DONT_INLINE
+void dont_over_optimize(T& x) { volatile typename T::Scalar tmp = x(0); x(0) = tmp; }
+
+template<typename Scalar, int Mode, int Options> void non_projective_only()
+{
+    /* this test covers the following files:
+     Cross.h Quaternion.h, Transform.cpp
+  */
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisx;
+  typedef Transform<Scalar,3,Mode,Options> Transform3;
+  typedef DiagonalMatrix<Scalar,3> AlignedScaling3;
+  typedef Translation<Scalar,3> Translation3;
+
+  Vector3 v0 = Vector3::Random(),
+          v1 = Vector3::Random();
+
+  Transform3 t0, t1, t2;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+
+  Quaternionx q1, q2;
+
+  q1 = AngleAxisx(a, v0.normalized());
+
+  t0 = Transform3::Identity();
+  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
+
+  t0.linear() = q1.toRotationMatrix();
+
+  v0 << 50, 2, 1;
+  t0.scale(v0);
+
+  VERIFY_IS_APPROX( (t0 * Vector3(1,0,0)).template head<3>().norm(), v0.x());
+
+  t0.setIdentity();
+  t1.setIdentity();
+  v1 << 1, 2, 3;
+  t0.linear() = q1.toRotationMatrix();
+  t0.pretranslate(v0);
+  t0.scale(v1);
+  t1.linear() = q1.conjugate().toRotationMatrix();
+  t1.prescale(v1.cwiseInverse());
+  t1.translate(-v0);
+
+  VERIFY((t0 * t1).matrix().isIdentity(test_precision<Scalar>()));
+
+  t1.fromPositionOrientationScale(v0, q1, v1);
+  VERIFY_IS_APPROX(t1.matrix(), t0.matrix());
+  VERIFY_IS_APPROX(t1*v1, t0*v1);
+
+  // translation * vector
+  t0.setIdentity();
+  t0.translate(v0);
+  VERIFY_IS_APPROX((t0 * v1).template head<3>(), Translation3(v0) * v1);
+
+  // AlignedScaling * vector
+  t0.setIdentity();
+  t0.scale(v0);
+  VERIFY_IS_APPROX((t0 * v1).template head<3>(), AlignedScaling3(v0) * v1);
+}
+
+template<typename Scalar, int Mode, int Options> void transformations()
+{
+  /* this test covers the following files:
+     Cross.h Quaternion.h, Transform.cpp
+  */
+  using std::cos;
+  using std::abs;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,4,4> Matrix4;
+  typedef Matrix<Scalar,2,1> Vector2;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,4,1> Vector4;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisx;
+  typedef Transform<Scalar,2,Mode,Options> Transform2;
+  typedef Transform<Scalar,3,Mode,Options> Transform3;
+  typedef typename Transform3::MatrixType MatrixType;
+  typedef DiagonalMatrix<Scalar,3> AlignedScaling3;
+  typedef Translation<Scalar,2> Translation2;
+  typedef Translation<Scalar,3> Translation3;
+
+  Vector3 v0 = Vector3::Random(),
+          v1 = Vector3::Random();
+  Matrix3 matrot1, m;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Scalar s0 = internal::random<Scalar>(), s1 = internal::random<Scalar>();
+  
+  while(v0.norm() < test_precision<Scalar>()) v0 = Vector3::Random();
+  while(v1.norm() < test_precision<Scalar>()) v1 = Vector3::Random();
+
+  VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0);
+  VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(EIGEN_PI), v0.unitOrthogonal()) * v0);
+  if(abs(cos(a)) > test_precision<Scalar>())
+  {
+    VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0));
+  }
+  m = AngleAxisx(a, v0.normalized()).toRotationMatrix().adjoint();
+  VERIFY_IS_APPROX(Matrix3::Identity(), m * AngleAxisx(a, v0.normalized()));
+  VERIFY_IS_APPROX(Matrix3::Identity(), AngleAxisx(a, v0.normalized()) * m);
+
+  Quaternionx q1, q2;
+  q1 = AngleAxisx(a, v0.normalized());
+  q2 = AngleAxisx(a, v1.normalized());
+
+  // rotation matrix conversion
+  matrot1 = AngleAxisx(Scalar(0.1), Vector3::UnitX())
+          * AngleAxisx(Scalar(0.2), Vector3::UnitY())
+          * AngleAxisx(Scalar(0.3), Vector3::UnitZ());
+  VERIFY_IS_APPROX(matrot1 * v1,
+       AngleAxisx(Scalar(0.1), Vector3(1,0,0)).toRotationMatrix()
+    * (AngleAxisx(Scalar(0.2), Vector3(0,1,0)).toRotationMatrix()
+    * (AngleAxisx(Scalar(0.3), Vector3(0,0,1)).toRotationMatrix() * v1)));
+
+  // angle-axis conversion
+  AngleAxisx aa = AngleAxisx(q1);
+  VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
+  
+  // The following test is stable only if 2*angle != angle and v1 is not colinear with axis
+  if( (abs(aa.angle()) > test_precision<Scalar>()) && (abs(aa.axis().dot(v1.normalized()))<(Scalar(1)-Scalar(4)*test_precision<Scalar>())) )
+  {
+    VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) );
+  }
+
+  aa.fromRotationMatrix(aa.toRotationMatrix());
+  VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1);
+  // The following test is stable only if 2*angle != angle and v1 is not colinear with axis
+  if( (abs(aa.angle()) > test_precision<Scalar>()) && (abs(aa.axis().dot(v1.normalized()))<(Scalar(1)-Scalar(4)*test_precision<Scalar>())) )
+  {
+    VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) );
+  }
+
+  // AngleAxis
+  VERIFY_IS_APPROX(AngleAxisx(a,v1.normalized()).toRotationMatrix(),
+    Quaternionx(AngleAxisx(a,v1.normalized())).toRotationMatrix());
+
+  AngleAxisx aa1;
+  m = q1.toRotationMatrix();
+  aa1 = m;
+  VERIFY_IS_APPROX(AngleAxisx(m).toRotationMatrix(),
+    Quaternionx(m).toRotationMatrix());
+
+  // Transform
+  // TODO complete the tests !
+  a = 0;
+  while (abs(a)<Scalar(0.1))
+    a = internal::random<Scalar>(-Scalar(0.4)*Scalar(EIGEN_PI), Scalar(0.4)*Scalar(EIGEN_PI));
+  q1 = AngleAxisx(a, v0.normalized());
+  Transform3 t0, t1, t2;
+
+  // first test setIdentity() and Identity()
+  t0.setIdentity();
+  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
+  t0.matrix().setZero();
+  t0 = Transform3::Identity();
+  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
+
+  t0.setIdentity();
+  t1.setIdentity();
+  v1 << 1, 2, 3;
+  t0.linear() = q1.toRotationMatrix();
+  t0.pretranslate(v0);
+  t0.scale(v1);
+  t1.linear() = q1.conjugate().toRotationMatrix();
+  t1.prescale(v1.cwiseInverse());
+  t1.translate(-v0);
+
+  VERIFY((t0 * t1).matrix().isIdentity(test_precision<Scalar>()));
+
+  t1.fromPositionOrientationScale(v0, q1, v1);
+  VERIFY_IS_APPROX(t1.matrix(), t0.matrix());
+
+  t0.setIdentity(); t0.scale(v0).rotate(q1.toRotationMatrix());
+  t1.setIdentity(); t1.scale(v0).rotate(q1);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  t0.setIdentity(); t0.scale(v0).rotate(AngleAxisx(q1));
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  VERIFY_IS_APPROX(t0.scale(a).matrix(), t1.scale(Vector3::Constant(a)).matrix());
+  VERIFY_IS_APPROX(t0.prescale(a).matrix(), t1.prescale(Vector3::Constant(a)).matrix());
+
+  // More transform constructors, operator=, operator*=
+
+  Matrix3 mat3 = Matrix3::Random();
+  Matrix4 mat4;
+  mat4 << mat3 , Vector3::Zero() , Vector4::Zero().transpose();
+  Transform3 tmat3(mat3), tmat4(mat4);
+  if(Mode!=int(AffineCompact))
+    tmat4.matrix()(3,3) = Scalar(1);
+  VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix());
+
+  Scalar a3 = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Vector3 v3 = Vector3::Random().normalized();
+  AngleAxisx aa3(a3, v3);
+  Transform3 t3(aa3);
+  Transform3 t4;
+  t4 = aa3;
+  VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
+  t4.rotate(AngleAxisx(-a3,v3));
+  VERIFY_IS_APPROX(t4.matrix(), MatrixType::Identity());
+  t4 *= aa3;
+  VERIFY_IS_APPROX(t3.matrix(), t4.matrix());
+
+  do {
+    v3 = Vector3::Random();
+    dont_over_optimize(v3);
+  } while (v3.cwiseAbs().minCoeff()<NumTraits<Scalar>::epsilon());
+  Translation3 tv3(v3);
+  Transform3 t5(tv3);
+  t4 = tv3;
+  VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
+  t4.translate((-v3).eval());
+  VERIFY_IS_APPROX(t4.matrix(), MatrixType::Identity());
+  t4 *= tv3;
+  VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
+
+  AlignedScaling3 sv3(v3);
+  Transform3 t6(sv3);
+  t4 = sv3;
+  VERIFY_IS_APPROX(t6.matrix(), t4.matrix());
+  t4.scale(v3.cwiseInverse());
+  VERIFY_IS_APPROX(t4.matrix(), MatrixType::Identity());
+  t4 *= sv3;
+  VERIFY_IS_APPROX(t6.matrix(), t4.matrix());
+
+  // matrix * transform
+  VERIFY_IS_APPROX((t3.matrix()*t4).matrix(), (t3*t4).matrix());
+
+  // chained Transform product
+  VERIFY_IS_APPROX(((t3*t4)*t5).matrix(), (t3*(t4*t5)).matrix());
+
+  // check that Transform product doesn't have aliasing problems
+  t5 = t4;
+  t5 = t5*t5;
+  VERIFY_IS_APPROX(t5, t4*t4);
+
+  // 2D transformation
+  Transform2 t20, t21;
+  Vector2 v20 = Vector2::Random();
+  Vector2 v21 = Vector2::Random();
+  for (int k=0; k<2; ++k)
+    if (abs(v21[k])<Scalar(1e-3)) v21[k] = Scalar(1e-3);
+  t21.setIdentity();
+  t21.linear() = Rotation2D<Scalar>(a).toRotationMatrix();
+  VERIFY_IS_APPROX(t20.fromPositionOrientationScale(v20,a,v21).matrix(),
+    t21.pretranslate(v20).scale(v21).matrix());
+
+  t21.setIdentity();
+  t21.linear() = Rotation2D<Scalar>(-a).toRotationMatrix();
+  VERIFY( (t20.fromPositionOrientationScale(v20,a,v21)
+        * (t21.prescale(v21.cwiseInverse()).translate(-v20))).matrix().isIdentity(test_precision<Scalar>()) );
+
+  // Transform - new API
+  // 3D
+  t0.setIdentity();
+  t0.rotate(q1).scale(v0).translate(v0);
+  // mat * aligned scaling and mat * translation
+  t1 = (Matrix3(q1) * AlignedScaling3(v0)) * Translation3(v0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = (Matrix3(q1) * Eigen::Scaling(v0)) * Translation3(v0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = (q1 * Eigen::Scaling(v0)) * Translation3(v0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  // mat * transformation and aligned scaling * translation
+  t1 = Matrix3(q1) * (AlignedScaling3(v0) * Translation3(v0));
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+
+  t0.setIdentity();
+  t0.scale(s0).translate(v0);
+  t1 = Eigen::Scaling(s0) * Translation3(v0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t0.prescale(s0);
+  t1 = Eigen::Scaling(s0) * t1;
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  
+  t0 = t3;
+  t0.scale(s0);
+  t1 = t3 * Eigen::Scaling(s0,s0,s0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t0.prescale(s0);
+  t1 = Eigen::Scaling(s0,s0,s0) * t1;
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  t0 = t3;
+  t0.scale(s0);
+  t1 = t3 * Eigen::Scaling(s0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t0.prescale(s0);
+  t1 = Eigen::Scaling(s0) * t1;
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  t0.setIdentity();
+  t0.prerotate(q1).prescale(v0).pretranslate(v0);
+  // translation * aligned scaling and transformation * mat
+  t1 = (Translation3(v0) * AlignedScaling3(v0)) * Transform3(q1);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  // scaling * mat and translation * mat
+  t1 = Translation3(v0) * (AlignedScaling3(v0) * Transform3(q1));
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  t0.setIdentity();
+  t0.scale(v0).translate(v0).rotate(q1);
+  // translation * mat and aligned scaling * transformation
+  t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  // transformation * aligned scaling
+  t0.scale(v0);
+  t1 *= AlignedScaling3(v0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
+  t1 = t1 * v0.asDiagonal();
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  // transformation * translation
+  t0.translate(v0);
+  t1 = t1 * Translation3(v0);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  // translation * transformation
+  t0.pretranslate(v0);
+  t1 = Translation3(v0) * t1;
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // transform * quaternion
+  t0.rotate(q1);
+  t1 = t1 * q1;
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // translation * quaternion
+  t0.translate(v1).rotate(q1);
+  t1 = t1 * (Translation3(v1) * q1);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // aligned scaling * quaternion
+  t0.scale(v1).rotate(q1);
+  t1 = t1 * (AlignedScaling3(v1) * q1);
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // quaternion * transform
+  t0.prerotate(q1);
+  t1 = q1 * t1;
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // quaternion * translation
+  t0.rotate(q1).translate(v1);
+  t1 = t1 * (q1 * Translation3(v1));
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // quaternion * aligned scaling
+  t0.rotate(q1).scale(v1);
+  t1 = t1 * (q1 * AlignedScaling3(v1));
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+
+  // test transform inversion
+  t0.setIdentity();
+  t0.translate(v0);
+  do {
+    t0.linear().setRandom();
+  } while(t0.linear().jacobiSvd().singularValues()(2)<test_precision<Scalar>());
+  Matrix4 t044 = Matrix4::Zero();
+  t044(3,3) = 1;
+  t044.block(0,0,t0.matrix().rows(),4) = t0.matrix();
+  VERIFY_IS_APPROX(t0.inverse(Affine).matrix(), t044.inverse().block(0,0,t0.matrix().rows(),4));
+  t0.setIdentity();
+  t0.translate(v0).rotate(q1);
+  t044 = Matrix4::Zero();
+  t044(3,3) = 1;
+  t044.block(0,0,t0.matrix().rows(),4) = t0.matrix();
+  VERIFY_IS_APPROX(t0.inverse(Isometry).matrix(), t044.inverse().block(0,0,t0.matrix().rows(),4));
+
+  Matrix3 mat_rotation, mat_scaling;
+  t0.setIdentity();
+  t0.translate(v0).rotate(q1).scale(v1);
+  t0.computeRotationScaling(&mat_rotation, &mat_scaling);
+  VERIFY_IS_APPROX(t0.linear(), mat_rotation * mat_scaling);
+  VERIFY_IS_APPROX(mat_rotation*mat_rotation.adjoint(), Matrix3::Identity());
+  VERIFY_IS_APPROX(mat_rotation.determinant(), Scalar(1));
+  t0.computeScalingRotation(&mat_scaling, &mat_rotation);
+  VERIFY_IS_APPROX(t0.linear(), mat_scaling * mat_rotation);
+  VERIFY_IS_APPROX(mat_rotation*mat_rotation.adjoint(), Matrix3::Identity());
+  VERIFY_IS_APPROX(mat_rotation.determinant(), Scalar(1));
+
+  // test casting
+  Transform<float,3,Mode> t1f = t1.template cast<float>();
+  VERIFY_IS_APPROX(t1f.template cast<Scalar>(),t1);
+  Transform<double,3,Mode> t1d = t1.template cast<double>();
+  VERIFY_IS_APPROX(t1d.template cast<Scalar>(),t1);
+
+  Translation3 tr1(v0);
+  Translation<float,3> tr1f = tr1.template cast<float>();
+  VERIFY_IS_APPROX(tr1f.template cast<Scalar>(),tr1);
+  Translation<double,3> tr1d = tr1.template cast<double>();
+  VERIFY_IS_APPROX(tr1d.template cast<Scalar>(),tr1);
+
+  AngleAxis<float> aa1f = aa1.template cast<float>();
+  VERIFY_IS_APPROX(aa1f.template cast<Scalar>(),aa1);
+  AngleAxis<double> aa1d = aa1.template cast<double>();
+  VERIFY_IS_APPROX(aa1d.template cast<Scalar>(),aa1);
+
+  Rotation2D<Scalar> r2d1(internal::random<Scalar>());
+  Rotation2D<float> r2d1f = r2d1.template cast<float>();
+  VERIFY_IS_APPROX(r2d1f.template cast<Scalar>(),r2d1);
+  Rotation2D<double> r2d1d = r2d1.template cast<double>();
+  VERIFY_IS_APPROX(r2d1d.template cast<Scalar>(),r2d1);
+  
+  for(int k=0; k<100; ++k)
+  {
+    Scalar angle = internal::random<Scalar>(-100,100);
+    Rotation2D<Scalar> rot2(angle);
+    VERIFY( rot2.smallestPositiveAngle() >= 0 );
+    VERIFY( rot2.smallestPositiveAngle() <= Scalar(2)*Scalar(EIGEN_PI) );
+    VERIFY_IS_APPROX( angleToVec(rot2.smallestPositiveAngle()), angleToVec(rot2.angle()) );
+    
+    VERIFY( rot2.smallestAngle() >= -Scalar(EIGEN_PI) );
+    VERIFY( rot2.smallestAngle() <=  Scalar(EIGEN_PI) );
+    VERIFY_IS_APPROX( angleToVec(rot2.smallestAngle()), angleToVec(rot2.angle()) );
+
+    Matrix<Scalar,2,2> rot2_as_mat(rot2);
+    Rotation2D<Scalar> rot3(rot2_as_mat);
+    VERIFY_IS_APPROX( angleToVec(rot2.smallestAngle()),  angleToVec(rot3.angle()) );
+  }
+
+  s0 = internal::random<Scalar>(-100,100);
+  s1 = internal::random<Scalar>(-100,100);
+  Rotation2D<Scalar> R0(s0), R1(s1);
+  
+  t20 = Translation2(v20) * (R0 * Eigen::Scaling(s0));
+  t21 = Translation2(v20) * R0 * Eigen::Scaling(s0);
+  VERIFY_IS_APPROX(t20,t21);
+  
+  t20 = Translation2(v20) * (R0 * R0.inverse() * Eigen::Scaling(s0));
+  t21 = Translation2(v20) * Eigen::Scaling(s0);
+  VERIFY_IS_APPROX(t20,t21);
+  
+  VERIFY_IS_APPROX(s0, (R0.slerp(0, R1)).angle());
+  VERIFY_IS_APPROX( angleToVec(R1.smallestPositiveAngle()), angleToVec((R0.slerp(1, R1)).smallestPositiveAngle()) );
+  VERIFY_IS_APPROX(R0.smallestPositiveAngle(), (R0.slerp(0.5, R0)).smallestPositiveAngle());
+
+  if(std::cos(s0)>0)
+    VERIFY_IS_MUCH_SMALLER_THAN((R0.slerp(0.5, R0.inverse())).smallestAngle(), Scalar(1));
+  else
+    VERIFY_IS_APPROX(Scalar(EIGEN_PI), (R0.slerp(0.5, R0.inverse())).smallestPositiveAngle());
+  
+  // Check path length
+  Scalar l = 0;
+  int path_steps = 100;
+  for(int k=0; k<path_steps; ++k)
+  {
+    Scalar a1 = R0.slerp(Scalar(k)/Scalar(path_steps), R1).angle();
+    Scalar a2 = R0.slerp(Scalar(k+1)/Scalar(path_steps), R1).angle();
+    l += std::abs(a2-a1);
+  }
+  VERIFY(l<=Scalar(EIGEN_PI)*(Scalar(1)+NumTraits<Scalar>::epsilon()*Scalar(path_steps/2)));
+  
+  // check basic features
+  {
+    Rotation2D<Scalar> r1;           // default ctor
+    r1 = Rotation2D<Scalar>(s0);     // copy assignment
+    VERIFY_IS_APPROX(r1.angle(),s0);
+    Rotation2D<Scalar> r2(r1);       // copy ctor
+    VERIFY_IS_APPROX(r2.angle(),s0);
+  }
+
+  {
+    Transform3 t32(Matrix4::Random()), t33, t34;
+    t34 = t33 = t32;
+    t32.scale(v0);
+    t33*=AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+    t33 = t34 * AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+  }
+
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
+  VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
+  VERIFY_IS_APPROX( q*(p*h).hnormalized(),  ((q*p)*h).hnormalized() );
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
+  VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
+  VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
+
+  transform_associativity_left(a1, a2,p, q, v, h);
+}
+
+template<typename Scalar, int Dim, int Options,typename RotationType>
+void transform_associativity(const RotationType& R)
+{
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar,Dim+1,1> HVectorType;
+  typedef Matrix<Scalar,Dim,Dim> LinearType;
+  typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
+  typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
+  typedef Transform<Scalar,Dim,Affine,Options> AffineType;
+  typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
+  typedef DiagonalMatrix<Scalar,Dim> ScalingType;
+  typedef Translation<Scalar,Dim> TranslationType;
+
+  AffineCompactType A1c; A1c.matrix().setRandom();
+  AffineCompactType A2c; A2c.matrix().setRandom();
+  AffineType A1(A1c);
+  AffineType A2(A2c);
+  ProjectiveType P1; P1.matrix().setRandom();
+  VectorType v1 = VectorType::Random();
+  VectorType v2 = VectorType::Random();
+  HVectorType h1 = HVectorType::Random();
+  Scalar s1 = internal::random<Scalar>();
+  LinearType L = LinearType::Random();
+  MatrixType M = MatrixType::Random();
+
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
+
+  VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 );
+  VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
+  VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
+
+  VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 );
+  VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
+  VERIFY_IS_APPROX( M*(P1*h1),  ((M*P1)*h1) );
+}
+
+template<typename Scalar> void transform_alignment()
+{
+  typedef Transform<Scalar,3,Projective,AutoAlign> Projective3a;
+  typedef Transform<Scalar,3,Projective,DontAlign> Projective3u;
+
+  EIGEN_ALIGN_MAX Scalar array1[16];
+  EIGEN_ALIGN_MAX Scalar array2[16];
+  EIGEN_ALIGN_MAX Scalar array3[16+1];
+  Scalar* array3u = array3+1;
+
+  Projective3a *p1 = ::new(reinterpret_cast<void*>(array1)) Projective3a;
+  Projective3u *p2 = ::new(reinterpret_cast<void*>(array2)) Projective3u;
+  Projective3u *p3 = ::new(reinterpret_cast<void*>(array3u)) Projective3u;
+  
+  p1->matrix().setRandom();
+  *p2 = *p1;
+  *p3 = *p1;
+
+  VERIFY_IS_APPROX(p1->matrix(), p2->matrix());
+  VERIFY_IS_APPROX(p1->matrix(), p3->matrix());
+  
+  VERIFY_IS_APPROX( (*p1) * (*p1), (*p2)*(*p3));
+}
+
+template<typename Scalar, int Dim, int Options> void transform_products()
+{
+  typedef Matrix<Scalar,Dim+1,Dim+1> Mat;
+  typedef Transform<Scalar,Dim,Projective,Options> Proj;
+  typedef Transform<Scalar,Dim,Affine,Options> Aff;
+  typedef Transform<Scalar,Dim,AffineCompact,Options> AffC;
+
+  Proj p; p.matrix().setRandom();
+  Aff a; a.linear().setRandom(); a.translation().setRandom();
+  AffC ac = a;
+
+  Mat p_m(p.matrix()), a_m(a.matrix());
+
+  VERIFY_IS_APPROX((p*p).matrix(), p_m*p_m);
+  VERIFY_IS_APPROX((a*a).matrix(), a_m*a_m);
+  VERIFY_IS_APPROX((p*a).matrix(), p_m*a_m);
+  VERIFY_IS_APPROX((a*p).matrix(), a_m*p_m);
+  VERIFY_IS_APPROX((ac*a).matrix(), a_m*a_m);
+  VERIFY_IS_APPROX((a*ac).matrix(), a_m*a_m);
+  VERIFY_IS_APPROX((p*ac).matrix(), p_m*a_m);
+  VERIFY_IS_APPROX((ac*p).matrix(), a_m*p_m);
+}
+
+template<typename Scalar, int Mode, int Options> void transformations_no_scale()
+{
+     /* this test covers the following files:
+     Cross.h Quaternion.h, Transform.h
+  */
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,4,1> Vector4;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisx;
+  typedef Transform<Scalar,3,Mode,Options> Transform3;
+  typedef Translation<Scalar,3> Translation3;
+  typedef Matrix<Scalar,4,4> Matrix4;
+
+  Vector3 v0 = Vector3::Random(),
+          v1 = Vector3::Random();
+
+  Transform3 t0, t1, t2;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+
+  Quaternionx q1, q2;
+
+  q1 = AngleAxisx(a, v0.normalized());
+
+  t0 = Transform3::Identity();
+  VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity());
+
+  t0.setIdentity();
+  t1.setIdentity();
+  v1 = Vector3::Ones();
+  t0.linear() = q1.toRotationMatrix();
+  t0.pretranslate(v0);
+  t1.linear() = q1.conjugate().toRotationMatrix();
+  t1.translate(-v0);
+
+  VERIFY((t0 * t1).matrix().isIdentity(test_precision<Scalar>()));
+
+  t1.fromPositionOrientationScale(v0, q1, v1);
+  VERIFY_IS_APPROX(t1.matrix(), t0.matrix());
+  VERIFY_IS_APPROX(t1*v1, t0*v1);
+
+  // translation * vector
+  t0.setIdentity();
+  t0.translate(v0);
+  VERIFY_IS_APPROX((t0 * v1).template head<3>(), Translation3(v0) * v1);
+
+  // Conversion to matrix.
+  Transform3 t3;
+  t3.linear() = q1.toRotationMatrix();
+  t3.translation() = v1;
+  Matrix4 m3 = t3.matrix();
+  VERIFY((m3 * m3.inverse()).isIdentity(test_precision<Scalar>()));
+  // Verify implicit last row is initialized.
+  VERIFY_IS_APPROX(Vector4(m3.row(3)), Vector4(0.0, 0.0, 0.0, 1.0));
+
+  VERIFY_IS_APPROX(t3.rotation(), t3.linear());
+  if(Mode==Isometry)
+    VERIFY(t3.rotation().data()==t3.linear().data());
+}
+
+template<typename Scalar, int Mode, int Options> void transformations_computed_scaling_continuity()
+{
+  typedef Matrix<Scalar, 3, 1> Vector3;
+  typedef Transform<Scalar, 3, Mode, Options> Transform3;
+  typedef Matrix<Scalar, 3, 3> Matrix3;
+
+  // Given: two transforms that differ by '2*eps'.
+  Scalar eps(1e-3);
+  Vector3 v0 = Vector3::Random().normalized(),
+    v1 = Vector3::Random().normalized(),
+    v3 = Vector3::Random().normalized();
+  Transform3 t0, t1;
+  // The interesting case is when their determinants have different signs.
+  Matrix3 rank2 = 50 * v0 * v0.adjoint() + 20 * v1 * v1.adjoint();
+  t0.linear() = rank2 + eps * v3 * v3.adjoint();
+  t1.linear() = rank2 - eps * v3 * v3.adjoint();
+
+  // When: computing the rotation-scaling parts
+  Matrix3 r0, s0, r1, s1;
+  t0.computeRotationScaling(&r0, &s0);
+  t1.computeRotationScaling(&r1, &s1);
+
+  // Then: the scaling parts should differ by no more than '2*eps'.
+  const Scalar c(2.1); // 2 + room for rounding errors
+  VERIFY((s0 - s1).norm() < c * eps);
+}
+
+EIGEN_DECLARE_TEST(geo_transformations)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( transformations<double,Affine,AutoAlign>() ));
+    CALL_SUBTEST_1(( non_projective_only<double,Affine,AutoAlign>() ));
+    CALL_SUBTEST_1(( transformations_computed_scaling_continuity<double,Affine,AutoAlign>() ));   
+    
+    CALL_SUBTEST_2(( transformations<float,AffineCompact,AutoAlign>() ));
+    CALL_SUBTEST_2(( non_projective_only<float,AffineCompact,AutoAlign>() ));
+    CALL_SUBTEST_2(( transform_alignment<float>() ));
+    
+    CALL_SUBTEST_3(( transformations<double,Projective,AutoAlign>() ));
+    CALL_SUBTEST_3(( transformations<double,Projective,DontAlign>() ));
+    CALL_SUBTEST_3(( transform_alignment<double>() ));
+
+    CALL_SUBTEST_4(( transformations<float,Affine,RowMajor|AutoAlign>() ));
+    CALL_SUBTEST_4(( non_projective_only<float,Affine,RowMajor>() ));
+    
+    CALL_SUBTEST_5(( transformations<double,AffineCompact,RowMajor|AutoAlign>() ));
+    CALL_SUBTEST_5(( non_projective_only<double,AffineCompact,RowMajor>() ));
+
+    CALL_SUBTEST_6(( transformations<double,Projective,RowMajor|AutoAlign>() ));
+    CALL_SUBTEST_6(( transformations<double,Projective,RowMajor|DontAlign>() ));
+
+
+    CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
+    CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
+
+    CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
+    CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
+
+    CALL_SUBTEST_9(( transformations_no_scale<double,Affine,AutoAlign>() ));
+    CALL_SUBTEST_9(( transformations_no_scale<double,Isometry,AutoAlign>() ));
+  }
+}

diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
new file mode 100644
index 0000000..e424a93
--- /dev/null
+++ b/test/gpu_basic.cu

@@ -0,0 +1,465 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// workaround issue between gcc >= 4.7 and cuda 5.5
+#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+  #undef _GLIBCXX_ATOMIC_BUILTINS
+  #undef _GLIBCXX_USE_INT128
+#endif
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include "main.h"
+#include "gpu_common.h"
+
+// Check that dense modules can be properly parsed by nvcc
+#include <Eigen/Dense>
+
+// struct Foo{
+//   EIGEN_DEVICE_FUNC
+//   void operator()(int i, const float* mats, float* vecs) const {
+//     using namespace Eigen;
+//   //   Matrix3f M(data);
+//   //   Vector3f x(data+9);
+//   //   Map<Vector3f>(data+9) = M.inverse() * x;
+//     Matrix3f M(mats+i/16);
+//     Vector3f x(vecs+i*3);
+//   //   using std::min;
+//   //   using std::sqrt;
+//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() *  x) / x.x();
+//     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
+//   }
+// };
+
+template<typename T>
+struct coeff_wise {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T x1(in+i);
+    T x2(in+i+1);
+    T x3(in+i+2);
+    Map<T> res(out+i*T::MaxSizeAtCompileTime);
+    
+    res.array() += (in[0] * x1 + x2).array() * x3.array();
+  }
+};
+
+template<typename T>
+struct complex_sqrt {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_special_inputs = 18;
+    
+    if (i == 0) {
+      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+      SpecialInputs special_in;
+      special_in.setZero();
+      int idx = 0;
+      special_in[idx++] = ComplexType(0, 0);
+      special_in[idx++] = ComplexType(-0, 0);
+      special_in[idx++] = ComplexType(0, -0);
+      special_in[idx++] = ComplexType(-0, -0);
+      // GCC's fallback sqrt implementation fails for inf inputs.
+      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+      // clang includes the GCC header (which temporarily disables
+      // _GLIBCXX_USE_C99_COMPLEX)
+      #if !defined(_GLIBCXX_COMPLEX) || \
+        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+      const ValueType inf = std::numeric_limits<ValueType>::infinity();
+      special_in[idx++] = ComplexType(1.0, inf);
+      special_in[idx++] = ComplexType(nan, inf);
+      special_in[idx++] = ComplexType(1.0, -inf);
+      special_in[idx++] = ComplexType(nan, -inf);
+      special_in[idx++] = ComplexType(-inf, 1.0);
+      special_in[idx++] = ComplexType(inf, 1.0);
+      special_in[idx++] = ComplexType(-inf, -1.0);
+      special_in[idx++] = ComplexType(inf, -1.0);
+      special_in[idx++] = ComplexType(-inf, nan);
+      special_in[idx++] = ComplexType(inf, nan);
+      #endif
+      special_in[idx++] = ComplexType(1.0, nan);
+      special_in[idx++] = ComplexType(nan, 1.0);
+      special_in[idx++] = ComplexType(nan, -1.0);
+      special_in[idx++] = ComplexType(nan, nan);
+      
+      Map<SpecialInputs> special_out(out);
+      special_out = special_in.cwiseSqrt();
+    }
+    
+    T x1(in + i);
+    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+    res = x1.cwiseSqrt();
+  }
+};
+
+template<typename T>
+struct complex_operators {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_scalar_operators = 24;
+    const int num_vector_operators = 23;  // no unary + operator.
+    int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);
+    
+    // Scalar operators.
+    const ComplexType a = in[i];
+    const ComplexType b = in[i + 1];
+    
+    out[out_idx++] = +a;
+    out[out_idx++] = -a;
+    
+    out[out_idx++] = a + b;
+    out[out_idx++] = a + numext::real(b);
+    out[out_idx++] = numext::real(a) + b;
+    out[out_idx++] = a - b;
+    out[out_idx++] = a - numext::real(b);
+    out[out_idx++] = numext::real(a) - b;
+    out[out_idx++] = a * b;
+    out[out_idx++] = a * numext::real(b);
+    out[out_idx++] = numext::real(a) * b;
+    out[out_idx++] = a / b;
+    out[out_idx++] = a / numext::real(b);
+    out[out_idx++] = numext::real(a) / b;
+    
+#if !defined(EIGEN_COMP_MSVC)
+    out[out_idx] = a; out[out_idx++] += b;
+    out[out_idx] = a; out[out_idx++] -= b;
+    out[out_idx] = a; out[out_idx++] *= b;
+    out[out_idx] = a; out[out_idx++] /= b;
+#endif
+    
+    const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
+    const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
+    out[out_idx++] = (a == b ? true_value : false_value);
+    out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
+    out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
+    out[out_idx++] = (a != b ? true_value : false_value);
+    out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
+    out[out_idx++] = (numext::real(a) != b ? true_value : false_value);
+    
+    // Vector versions.
+    T x1(in + i);
+    T x2(in + i + 1);
+    const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
+    const int size = T::MaxSizeAtCompileTime;
+    int block_idx = 0;
+    
+    Map<VectorX<ComplexType>> res(out + out_idx, res_size);
+    res.segment(block_idx, size) = -x1;
+    block_idx += size;
+    
+    res.segment(block_idx, size) = x1 + x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 + x2.real();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real() + x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 - x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 - x2.real();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real() - x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() * x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() * x2.real().array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real().array() * x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() / x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() / x2.real().array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real().array() / x2.array();
+    block_idx += size;
+    
+#if !defined(EIGEN_COMP_MSVC)
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
+    block_idx += size;
+#endif
+
+    const T true_vector = T::Constant(true_value);
+    const T false_vector = T::Constant(false_value);
+    res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
+    block_idx += size;
+    // Mixing types in equality comparison does not work.
+    // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
+    // block_idx += size;
+    res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
+    block_idx += size;
+    // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
+    // block_idx += size;
+  }
+};
+
+template<typename T>
+struct replicate {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T x1(in+i);
+    int step   = x1.size() * 4;
+    int stride = 3 * step;
+    
+    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
+    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
+    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
+    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
+  }
+};
+
+template<typename T>
+struct alloc_new_delete {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    int offset = 2*i*T::MaxSizeAtCompileTime;
+    T* x = new T(in + offset);
+    Eigen::Map<T> u(out + offset);
+    u = *x;
+    delete x;
+    
+    offset += T::MaxSizeAtCompileTime;
+    T* y = new T[1];
+    y[0] = T(in + offset);
+    Eigen::Map<T> v(out + offset);
+    v = y[0];    
+    delete[] y;
+  }
+};
+
+template<typename T>
+struct redux {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    int N = 10;
+    T x1(in+i);
+    out[i*N+0] = x1.minCoeff();
+    out[i*N+1] = x1.maxCoeff();
+    out[i*N+2] = x1.sum();
+    out[i*N+3] = x1.prod();
+    out[i*N+4] = x1.matrix().squaredNorm();
+    out[i*N+5] = x1.matrix().norm();
+    out[i*N+6] = x1.colwise().sum().maxCoeff();
+    out[i*N+7] = x1.rowwise().maxCoeff().sum();
+    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
+  }
+};
+
+template<typename T1, typename T2>
+struct prod_test {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
+    T1 x1(in+i);
+    T2 x2(in+i+1);
+    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
+    res += in[i] * x1 * x2;
+  }
+};
+
+template<typename T1, typename T2>
+struct diagonal {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+  {
+    using namespace Eigen;
+    T1 x1(in+i);
+    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
+    res += x1.diagonal();
+  }
+};
+
+template<typename T>
+struct eigenvalues_direct {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+    T M(in+i);
+    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+    T A = M*M.adjoint();
+    SelfAdjointEigenSolver<T> eig;
+    eig.computeDirect(A);
+    res = eig.eigenvalues();
+  }
+};
+
+template<typename T>
+struct eigenvalues {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+    T M(in+i);
+    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+    T A = M*M.adjoint();
+    SelfAdjointEigenSolver<T> eig;
+    eig.compute(A);
+    res = eig.eigenvalues();
+  }
+};
+
+template<typename T>
+struct matrix_inverse {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T M(in+i);
+    Map<T> res(out+i*T::MaxSizeAtCompileTime);
+    res = M.inverse();
+  }
+};
+
+template<typename T>
+struct numeric_limits_test {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    EIGEN_UNUSED_VARIABLE(in)
+    int out_idx = i * 5;
+    out[out_idx++] = numext::numeric_limits<float>::epsilon();
+    out[out_idx++] = (numext::numeric_limits<float>::max)();
+    out[out_idx++] = (numext::numeric_limits<float>::min)();
+    out[out_idx++] = numext::numeric_limits<float>::infinity();
+    out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
+  }
+};
+
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  if (a.rows() != b.rows()) {
+    return false;
+  }
+  if (a.cols() != b.cols()) {
+    return false;
+  }
+  for (Index r = 0; r < a.rows(); ++r) {
+    for (Index c = 0; c < a.cols(); ++c) {
+      if (a(r, c) != b(r, c)
+          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) 
+          && !test_isApprox(a(r, c), b(r, c))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Output out_ref, out_gpu;
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  out_ref = out_gpu = out;
+  #else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+  #endif
+  run_on_cpu (ker, n, in,  out_ref);
+  run_on_gpu(ker, n, in, out_gpu);
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  verifyIsApproxWithInfsNans(out_ref, out_gpu);
+  #endif
+}
+
+EIGEN_DECLARE_TEST(gpu_basic)
+{
+  ei_test_init_gpu();
+  
+  int nthreads = 100;
+  Eigen::VectorXf in, out;
+  Eigen::VectorXcf cfin, cfout;
+  
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  int data_size = nthreads * 512;
+  in.setRandom(data_size);
+  out.setConstant(data_size, -1);
+  cfin.setRandom(data_size);
+  cfout.setConstant(data_size, -1);
+  #endif
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );
+
+#if !defined(EIGEN_USE_HIP)
+  // FIXME
+  // These subtests result in a compile failure on the HIP platform
+  //
+  //  eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
+  //           base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
+  //           (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
+  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );
+
+  // HIP does not support new/delete on device.
+  CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) );
+#endif
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
+
+  // Test std::complex.
+  CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
+  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
+  // numeric_limits
+  CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
+
+#if defined(__NVCC__)
+  // FIXME
+  // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
+  typedef Matrix<float,6,6> Matrix6f;
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
+#endif
+}

diff --git a/test/gpu_common.h b/test/gpu_common.h
new file mode 100644
index 0000000..c37eaa1
--- /dev/null
+++ b/test/gpu_common.h

@@ -0,0 +1,176 @@
+#ifndef EIGEN_TEST_GPU_COMMON_H
+#define EIGEN_TEST_GPU_COMMON_H
+
+#ifdef EIGEN_USE_HIP
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+#else
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+  #include <cuda_runtime_api.h>
+#endif
+
+#include <iostream>
+
+#define EIGEN_USE_GPU
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
+dim3 threadIdx, blockDim, blockIdx;
+#endif
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  for(int i=0; i<n; i++)
+    ker(i, in.data(), out.data());
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+__global__
+EIGEN_HIP_LAUNCH_BOUNDS_1024
+void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
+{
+  int i = threadIdx.x + blockIdx.x*blockDim.x;
+  if(i<n) {
+    ker(i, in, out);
+  }
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  typename Input::Scalar*  d_in;
+  typename Output::Scalar* d_out;
+  std::ptrdiff_t in_bytes  = in.size()  * sizeof(typename Input::Scalar);
+  std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
+  
+  gpuMalloc((void**)(&d_in),  in_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+  
+  gpuMemcpy(d_in,  in.data(),  in_bytes,  gpuMemcpyHostToDevice);
+  gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice);
+  
+  // Simple and non-optimal 1D mapping assuming n is not too large
+  // That's only for unit testing!
+  dim3 Blocks(128);
+  dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
+
+  gpuDeviceSynchronize();
+  
+#ifdef EIGEN_USE_HIP
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel<Kernel,
+				     typename std::decay<decltype(*d_in)>::type,
+				     typename std::decay<decltype(*d_out)>::type>), 
+		     dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
+#else
+  run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
+#endif
+  // Pre-launch errors.
+  gpuError_t err = gpuGetLastError();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+  
+  // Kernel execution errors.
+  err = gpuDeviceSynchronize();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+  
+  
+  // check inputs have not been modified
+  gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()),  d_in,  in_bytes,  gpuMemcpyDeviceToHost);
+  gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
+  
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Input  in_ref,  in_gpu;
+  Output out_ref, out_gpu;
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  in_ref = in_gpu = in;
+  out_ref = out_gpu = out;
+  #else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+  #endif
+  run_on_cpu (ker, n, in_ref,  out_ref);
+  run_on_gpu(ker, n, in_gpu, out_gpu);
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  VERIFY_IS_APPROX(in_ref, in_gpu);
+  VERIFY_IS_APPROX(out_ref, out_gpu);
+  #endif
+}
+
+struct compile_time_device_info {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const int* /*in*/, int* info) const
+  {
+    if (i == 0) {
+      EIGEN_UNUSED_VARIABLE(info)
+      #if defined(__CUDA_ARCH__)
+      info[0] = int(__CUDA_ARCH__ +0);
+      #endif
+      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+      info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+      #endif
+    }
+  }
+};
+
+void ei_test_init_gpu()
+{
+  int device = 0;
+  gpuDeviceProp_t deviceProp;
+  gpuGetDeviceProperties(&deviceProp, device);
+
+  ArrayXi dummy(1), info(10);
+  info = -1;
+  run_on_gpu(compile_time_device_info(),10,dummy,info);
+
+
+  std::cout << "GPU compile-time info:\n";
+  
+  #ifdef EIGEN_CUDACC
+  std::cout << "  EIGEN_CUDACC:                 " << int(EIGEN_CUDACC) << "\n";
+  #endif
+  
+  #ifdef EIGEN_CUDA_SDK_VER
+  std::cout << "  EIGEN_CUDA_SDK_VER:             " << int(EIGEN_CUDA_SDK_VER) << "\n";
+  #endif
+
+  #ifdef EIGEN_COMP_NVCC
+  std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << "\n";
+  #endif
+  
+  #ifdef EIGEN_HIPCC
+  std::cout << "  EIGEN_HIPCC:                 " << int(EIGEN_HIPCC) << "\n";
+  #endif
+
+  std::cout << "  EIGEN_CUDA_ARCH:             " << info[0] << "\n";  
+  std::cout << "  EIGEN_HIP_DEVICE_COMPILE:    " << info[1] << "\n";
+
+  std::cout << "GPU device info:\n";
+  std::cout << "  name:                        " << deviceProp.name << "\n";
+  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << "\n";
+  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << "\n";
+  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
+  std::cout << "  warpSize:                    " << deviceProp.warpSize << "\n";
+  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << "\n";
+  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << "\n";
+  std::cout << "  clockRate:                   " << deviceProp.clockRate << "\n";
+  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << "\n";
+  std::cout << "  computeMode:                 " << deviceProp.computeMode << "\n";
+}
+
+#endif // EIGEN_TEST_GPU_COMMON_H

diff --git a/test/half_float.cpp b/test/half_float.cpp
new file mode 100644
index 0000000..729de1b
--- /dev/null
+++ b/test/half_float.cpp

@@ -0,0 +1,349 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#include "main.h"
+
+#include <Eigen/src/Core/arch/Default/Half.h>
+
+#define VERIFY_HALF_BITS_EQUAL(h, bits) \
+  VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
+
+// Make sure it's possible to forward declare Eigen::half
+namespace Eigen {
+struct half;
+}
+
+using Eigen::half;
+
+void test_conversion()
+{
+  using Eigen::half_impl::__half_raw;
+
+  // Round-trip bit-cast with uint16.
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(1.0f))),
+    half(1.0f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(0.5f))),
+    half(0.5f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(-0.33333f))),
+    half(-0.33333f));
+   VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(0.0f))),
+    half(0.0f));
+
+  // Conversion from float.
+  VERIFY_HALF_BITS_EQUAL(half(1.0f), 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(0.5f), 0x3800);
+  VERIFY_HALF_BITS_EQUAL(half(0.33333f), 0x3555);
+  VERIFY_HALF_BITS_EQUAL(half(0.0f), 0x0000);
+  VERIFY_HALF_BITS_EQUAL(half(-0.0f), 0x8000);
+  VERIFY_HALF_BITS_EQUAL(half(65504.0f), 0x7bff);
+  VERIFY_HALF_BITS_EQUAL(half(65536.0f), 0x7c00);  // Becomes infinity.
+
+  // Denormals.
+  VERIFY_HALF_BITS_EQUAL(half(-5.96046e-08f), 0x8001);
+  VERIFY_HALF_BITS_EQUAL(half(5.96046e-08f), 0x0001);
+  VERIFY_HALF_BITS_EQUAL(half(1.19209e-07f), 0x0002);
+
+  // Verify round-to-nearest-even behavior.
+  float val1 = float(half(__half_raw(0x3c00)));
+  float val2 = float(half(__half_raw(0x3c01)));
+  float val3 = float(half(__half_raw(0x3c02)));
+  VERIFY_HALF_BITS_EQUAL(half(0.5f * (val1 + val2)), 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(0.5f * (val2 + val3)), 0x3c02);
+
+  // Conversion from int.
+  VERIFY_HALF_BITS_EQUAL(half(-1), 0xbc00);
+  VERIFY_HALF_BITS_EQUAL(half(0), 0x0000);
+  VERIFY_HALF_BITS_EQUAL(half(1), 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(2), 0x4000);
+  VERIFY_HALF_BITS_EQUAL(half(3), 0x4200);
+
+  // Conversion from bool.
+  VERIFY_HALF_BITS_EQUAL(half(false), 0x0000);
+  VERIFY_HALF_BITS_EQUAL(half(true), 0x3c00);
+
+  // Conversion to float.
+  VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f);
+  VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f);
+
+  // Denormals.
+  VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f);
+
+  // NaNs and infinities.
+  VERIFY(!(numext::isinf)(float(half(65504.0f))));  // Largest finite number.
+  VERIFY(!(numext::isnan)(float(half(0.0f))));
+  VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
+  VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
+  VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
+  VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
+
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
+  VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
+  VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
+#endif
+
+  // Exactly same checks as above, just directly on the half representation.
+  VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
+  VERIFY(!(numext::isnan)(half(__half_raw(0x0000))));
+  VERIFY((numext::isinf)(half(__half_raw(0xfc00))));
+  VERIFY((numext::isnan)(half(__half_raw(0xfc01))));
+  VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
+  VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
+
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(half(0.0 / 0.0)));
+  VERIFY((numext::isinf)(half(1.0 / 0.0)));
+  VERIFY((numext::isinf)(half(-1.0 / 0.0)));
+#endif
+
+  // Conversion to bool
+  VERIFY(!static_cast<bool>(half(0.0)));
+  VERIFY(!static_cast<bool>(half(-0.0)));
+  VERIFY(static_cast<bool>(half(__half_raw(0x7bff))));
+  VERIFY(static_cast<bool>(half(-0.33333)));
+  VERIFY(static_cast<bool>(half(1.0)));
+  VERIFY(static_cast<bool>(half(-1.0)));
+  VERIFY(static_cast<bool>(half(-5.96046e-08f)));
+}
+
+void test_numtraits()
+{
+  std::cout << "epsilon       = " << NumTraits<half>::epsilon() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::epsilon()) << ")" << std::endl;
+  std::cout << "highest       = " << NumTraits<half>::highest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::highest()) << ")" << std::endl;
+  std::cout << "lowest        = " << NumTraits<half>::lowest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::lowest()) << ")" << std::endl;
+  std::cout << "min           = " << (std::numeric_limits<half>::min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(half((std::numeric_limits<half>::min)())) << ")" << std::endl;
+  std::cout << "denorm min    = " << (std::numeric_limits<half>::denorm_min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(half((std::numeric_limits<half>::denorm_min)())) << ")" << std::endl;
+  std::cout << "infinity      = " << NumTraits<half>::infinity() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::infinity()) << ")" << std::endl;
+  std::cout << "quiet nan     = " << NumTraits<half>::quiet_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::quiet_NaN()) << ")" << std::endl;
+  std::cout << "signaling nan = " << std::numeric_limits<half>::signaling_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()) << ")" << std::endl;
+
+  VERIFY(NumTraits<half>::IsSigned);
+
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::infinity()),
+    numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::infinity())) );
+  // There is no guarantee that casting a 32-bit NaN to 16-bit has a precise
+  // bit pattern.  We test that it is in fact a NaN, then test the signaling
+  // bit (msb of significand is 1 for quiet, 0 for signaling).
+  const numext::uint16_t HALF_QUIET_BIT = 0x0200;
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<half>::quiet_NaN())
+    && (numext::isnan)(half(std::numeric_limits<float>::quiet_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::quiet_NaN()) & HALF_QUIET_BIT) > 0)
+    && ((numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::quiet_NaN())) & HALF_QUIET_BIT) > 0) );
+  // After a cast to half, a signaling NaN may become non-signaling
+  // (e.g. in the case of casting float to native __fp16). Thus, we check that
+  // both are NaN, and that only the `numeric_limits` version is signaling.
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<half>::signaling_NaN())
+    && (numext::isnan)(half(std::numeric_limits<float>::signaling_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()) & HALF_QUIET_BIT) == 0) );
+
+  VERIFY( (std::numeric_limits<half>::min)() > half(0.f) );
+  VERIFY( (std::numeric_limits<half>::denorm_min)() > half(0.f) );
+  VERIFY( (std::numeric_limits<half>::min)()/half(2) > half(0.f) );
+  VERIFY_IS_EQUAL( (std::numeric_limits<half>::denorm_min)()/half(2), half(0.f) );
+}
+
+void test_arithmetic()
+{
+  VERIFY_IS_EQUAL(float(half(2) + half(2)), 4);
+  VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0);
+  VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f);
+  VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f);
+  VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f);
+  VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f);
+  VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f);
+  
+  half x(3);
+  half y = ++x;
+  VERIFY_IS_EQUAL(x, half(4));
+  VERIFY_IS_EQUAL(y, half(4));
+  y = --x;
+  VERIFY_IS_EQUAL(x, half(3));
+  VERIFY_IS_EQUAL(y, half(3));
+  y = x++;
+  VERIFY_IS_EQUAL(x, half(4));
+  VERIFY_IS_EQUAL(y, half(3));
+  y = x--;
+  VERIFY_IS_EQUAL(x, half(3));
+  VERIFY_IS_EQUAL(y, half(4));
+}
+
+void test_comparison()
+{
+  VERIFY(half(1.0f) > half(0.5f));
+  VERIFY(half(0.5f) < half(1.0f));
+  VERIFY(!(half(1.0f) < half(0.5f)));
+  VERIFY(!(half(0.5f) > half(1.0f)));
+
+  VERIFY(!(half(4.0f) > half(4.0f)));
+  VERIFY(!(half(4.0f) < half(4.0f)));
+
+  VERIFY(!(half(0.0f) < half(-0.0f)));
+  VERIFY(!(half(-0.0f) < half(0.0f)));
+  VERIFY(!(half(0.0f) > half(-0.0f)));
+  VERIFY(!(half(-0.0f) > half(0.0f)));
+
+  VERIFY(half(0.2f) > half(-1.0f));
+  VERIFY(half(-1.0f) < half(0.2f));
+  VERIFY(half(-16.0f) < half(-15.0f));
+
+  VERIFY(half(1.0f) == half(1.0f));
+  VERIFY(half(1.0f) != half(2.0f));
+
+  // Comparisons with NaNs and infinities.
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
+  VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
+
+  VERIFY(!(half(1.0) == half(0.0 / 0.0)));
+  VERIFY(!(half(1.0) < half(0.0 / 0.0)));
+  VERIFY(!(half(1.0) > half(0.0 / 0.0)));
+  VERIFY(half(1.0) != half(0.0 / 0.0));
+
+  VERIFY(half(1.0) < half(1.0 / 0.0));
+  VERIFY(half(1.0) > half(-1.0 / 0.0));
+#endif
+}
+
+void test_basic_functions()
+{
+  VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(abs(half(-3.5f))), 3.5f);
+
+  VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(float(floor(half(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f);
+  VERIFY_IS_EQUAL(float(floor(half(-3.5f))), -4.0f);
+
+  VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(float(ceil(half(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f);
+  VERIFY_IS_EQUAL(float(ceil(half(-3.5f))), -3.0f);
+
+  VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(sqrt(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f);
+  VERIFY_IS_APPROX(float(sqrt(half(4.0f))), 2.0f);
+
+  VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(pow(half(0.0f), half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f);
+  VERIFY_IS_APPROX(float(pow(half(2.0f), half(2.0f))), 4.0f);
+
+  VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f);
+  VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f);
+  VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
+  VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
+
+  VERIFY_IS_EQUAL(float(numext::expm1(half(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(expm1(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::expm1(half(2.0f))), 6.3890561f);
+  VERIFY_IS_APPROX(float(expm1(half(2.0f))), 6.3890561f);
+
+  VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(log(half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
+  VERIFY_IS_APPROX(float(log(half(10.0f))), 2.30273f);
+
+  VERIFY_IS_EQUAL(float(numext::log1p(half(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(log1p(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::log1p(half(10.0f))), 2.3978953f);
+  VERIFY_IS_APPROX(float(log1p(half(10.0f))), 2.3978953f);
+  
+  VERIFY_IS_APPROX(numext::fmod(half(5.3f), half(2.0f)), half(1.3f));
+  VERIFY_IS_APPROX(fmod(half(5.3f), half(2.0f)), half(1.3f));
+  VERIFY_IS_APPROX(numext::fmod(half(-18.5f), half(-4.2f)), half(-1.7f));
+  VERIFY_IS_APPROX(fmod(half(-18.5f), half(-4.2f)), half(-1.7f));
+}
+
+void test_trigonometric_functions()
+{
+  VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
+  VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f)));
+  VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
+  // VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
+  VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f)));
+  //  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI)));
+  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
+  VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f)));
+  //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
+  //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
+  //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
+}
+
+void test_array()
+{
+  typedef Array<half,1,Dynamic> ArrayXh;
+  Index size = internal::random<Index>(1,10);
+  Index i = internal::random<Index>(0,size-1);
+  ArrayXh a1 = ArrayXh::Random(size), a2 = ArrayXh::Random(size);
+  VERIFY_IS_APPROX( a1+a1, half(2)*a1 );
+  VERIFY( (a1.abs() >= half(0)).all() );
+  VERIFY_IS_APPROX( (a1*a1).sqrt(), a1.abs() );
+
+  VERIFY( ((a1.min)(a2) <= (a1.max)(a2)).all() );
+  a1(i) = half(-10.);
+  VERIFY_IS_EQUAL( a1.minCoeff(), half(-10.) );
+  a1(i) = half(10.);
+  VERIFY_IS_EQUAL( a1.maxCoeff(), half(10.) );
+
+  std::stringstream ss;
+  ss << a1;
+}
+
+void test_product()
+{
+  typedef Matrix<half,Dynamic,Dynamic> MatrixXh;
+  Index rows  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index cols  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  MatrixXh Ah = MatrixXh::Random(rows,depth);
+  MatrixXh Bh = MatrixXh::Random(depth,cols);
+  MatrixXh Ch = MatrixXh::Random(rows,cols);
+  MatrixXf Af = Ah.cast<float>();
+  MatrixXf Bf = Bh.cast<float>();
+  MatrixXf Cf = Ch.cast<float>();
+  VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<half>());
+}
+
+EIGEN_DECLARE_TEST(half_float)
+{
+  CALL_SUBTEST(test_numtraits());
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(test_conversion());
+    CALL_SUBTEST(test_arithmetic());
+    CALL_SUBTEST(test_comparison());
+    CALL_SUBTEST(test_basic_functions());
+    CALL_SUBTEST(test_trigonometric_functions());
+    CALL_SUBTEST(test_array());
+    CALL_SUBTEST(test_product());
+  }
+}

diff --git a/test/hessenberg.cpp b/test/hessenberg.cpp
new file mode 100644
index 0000000..0e1b009
--- /dev/null
+++ b/test/hessenberg.cpp

@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Eigenvalues>
+
+template<typename Scalar,int Size> void hessenberg(int size = Size)
+{
+  typedef Matrix<Scalar,Size,Size> MatrixType;
+
+  // Test basic functionality: A = U H U* and H is Hessenberg
+  for(int counter = 0; counter < g_repeat; ++counter) {
+    MatrixType m = MatrixType::Random(size,size);
+    HessenbergDecomposition<MatrixType> hess(m);
+    MatrixType Q = hess.matrixQ();
+    MatrixType H = hess.matrixH();
+    VERIFY_IS_APPROX(m, Q * H * Q.adjoint());
+    for(int row = 2; row < size; ++row) {
+      for(int col = 0; col < row-1; ++col) {
+	VERIFY(H(row,col) == (typename MatrixType::Scalar)0);
+      }
+    }
+  }
+
+  // Test whether compute() and constructor returns same result
+  MatrixType A = MatrixType::Random(size, size);
+  HessenbergDecomposition<MatrixType> cs1;
+  cs1.compute(A);
+  HessenbergDecomposition<MatrixType> cs2(A);
+  VERIFY_IS_EQUAL(cs1.matrixH().eval(), cs2.matrixH().eval());
+  MatrixType cs1Q = cs1.matrixQ();
+  MatrixType cs2Q = cs2.matrixQ();  
+  VERIFY_IS_EQUAL(cs1Q, cs2Q);
+
+  // Test assertions for when used uninitialized
+  HessenbergDecomposition<MatrixType> hessUninitialized;
+  VERIFY_RAISES_ASSERT( hessUninitialized.matrixH() );
+  VERIFY_RAISES_ASSERT( hessUninitialized.matrixQ() );
+  VERIFY_RAISES_ASSERT( hessUninitialized.householderCoefficients() );
+  VERIFY_RAISES_ASSERT( hessUninitialized.packedMatrix() );
+
+  // TODO: Add tests for packedMatrix() and householderCoefficients()
+}
+
+EIGEN_DECLARE_TEST(hessenberg)
+{
+  CALL_SUBTEST_1(( hessenberg<std::complex<double>,1>() ));
+  CALL_SUBTEST_2(( hessenberg<std::complex<double>,2>() ));
+  CALL_SUBTEST_3(( hessenberg<std::complex<float>,4>() ));
+  CALL_SUBTEST_4(( hessenberg<float,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+  CALL_SUBTEST_5(( hessenberg<std::complex<double>,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+
+  // Test problem size constructors
+  CALL_SUBTEST_6(HessenbergDecomposition<MatrixXf>(10));
+}

diff --git a/test/householder.cpp b/test/householder.cpp
new file mode 100644
index 0000000..cad8138
--- /dev/null
+++ b/test/householder.cpp

@@ -0,0 +1,148 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/QR>
+
+template<typename MatrixType> void householder(const MatrixType& m)
+{
+  static bool even = true;
+  even = !even;
+  /* this test covers the following files:
+     Householder.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, internal::decrement_size<MatrixType::RowsAtCompileTime>::ret, 1> EssentialVectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  typedef Matrix<Scalar, Dynamic, MatrixType::ColsAtCompileTime> HBlockMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> HCoeffsVectorType;
+
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::RowsAtCompileTime> TMatrixType;
+  
+  Matrix<Scalar, EIGEN_SIZE_MAX(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime), 1> _tmp((std::max)(rows,cols));
+  Scalar* tmp = &_tmp.coeffRef(0,0);
+
+  Scalar beta;
+  RealScalar alpha;
+  EssentialVectorType essential;
+
+  VectorType v1 = VectorType::Random(rows), v2;
+  v2 = v1;
+  v1.makeHouseholder(essential, beta, alpha);
+  v1.applyHouseholderOnTheLeft(essential,beta,tmp);
+  VERIFY_IS_APPROX(v1.norm(), v2.norm());
+  if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(v1.tail(rows-1).norm(), v1.norm());
+  v1 = VectorType::Random(rows);
+  v2 = v1;
+  v1.applyHouseholderOnTheLeft(essential,beta,tmp);
+  VERIFY_IS_APPROX(v1.norm(), v2.norm());
+
+  // reconstruct householder matrix:
+  SquareMatrixType id, H1, H2;
+  id.setIdentity(rows, rows);
+  H1 = H2 = id;
+  VectorType vv(rows);
+  vv << Scalar(1), essential;
+  H1.applyHouseholderOnTheLeft(essential, beta, tmp);
+  H2.applyHouseholderOnTheRight(essential, beta, tmp);
+  VERIFY_IS_APPROX(H1, H2);
+  VERIFY_IS_APPROX(H1, id - beta * vv*vv.adjoint());
+
+  MatrixType m1(rows, cols),
+             m2(rows, cols);
+
+  v1 = VectorType::Random(rows);
+  if(even) v1.tail(rows-1).setZero();
+  m1.colwise() = v1;
+  m2 = m1;
+  m1.col(0).makeHouseholder(essential, beta, alpha);
+  m1.applyHouseholderOnTheLeft(essential,beta,tmp);
+  VERIFY_IS_APPROX(m1.norm(), m2.norm());
+  if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(m1.block(1,0,rows-1,cols).norm(), m1.norm());
+  VERIFY_IS_MUCH_SMALLER_THAN(numext::imag(m1(0,0)), numext::real(m1(0,0)));
+  VERIFY_IS_APPROX(numext::real(m1(0,0)), alpha);
+
+  v1 = VectorType::Random(rows);
+  if(even) v1.tail(rows-1).setZero();
+  SquareMatrixType m3(rows,rows), m4(rows,rows);
+  m3.rowwise() = v1.transpose();
+  m4 = m3;
+  m3.row(0).makeHouseholder(essential, beta, alpha);
+  m3.applyHouseholderOnTheRight(essential.conjugate(),beta,tmp);
+  VERIFY_IS_APPROX(m3.norm(), m4.norm());
+  if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(m3.block(0,1,rows,rows-1).norm(), m3.norm());
+  VERIFY_IS_MUCH_SMALLER_THAN(numext::imag(m3(0,0)), numext::real(m3(0,0)));
+  VERIFY_IS_APPROX(numext::real(m3(0,0)), alpha);
+
+  // test householder sequence on the left with a shift
+
+  Index shift = internal::random<Index>(0, std::max<Index>(rows-2,0));
+  Index brows = rows - shift;
+  m1.setRandom(rows, cols);
+  HBlockMatrixType hbm = m1.block(shift,0,brows,cols);
+  HouseholderQR<HBlockMatrixType> qr(hbm);
+  m2 = m1;
+  m2.block(shift,0,brows,cols) = qr.matrixQR();
+  HCoeffsVectorType hc = qr.hCoeffs().conjugate();
+  HouseholderSequence<MatrixType, HCoeffsVectorType> hseq(m2, hc);
+  hseq.setLength(hc.size()).setShift(shift);
+  VERIFY(hseq.length() == hc.size());
+  VERIFY(hseq.shift() == shift);
+  
+  MatrixType m5 = m2;
+  m5.block(shift,0,brows,cols).template triangularView<StrictlyLower>().setZero();
+  VERIFY_IS_APPROX(hseq * m5, m1); // test applying hseq directly
+  m3 = hseq;
+  VERIFY_IS_APPROX(m3 * m5, m1); // test evaluating hseq to a dense matrix, then applying
+  
+  SquareMatrixType hseq_mat = hseq;
+  SquareMatrixType hseq_mat_conj = hseq.conjugate();
+  SquareMatrixType hseq_mat_adj = hseq.adjoint();
+  SquareMatrixType hseq_mat_trans = hseq.transpose();
+  SquareMatrixType m6 = SquareMatrixType::Random(rows, rows);
+  VERIFY_IS_APPROX(hseq_mat.adjoint(),    hseq_mat_adj);
+  VERIFY_IS_APPROX(hseq_mat.conjugate(),  hseq_mat_conj);
+  VERIFY_IS_APPROX(hseq_mat.transpose(),  hseq_mat_trans);
+  VERIFY_IS_APPROX(hseq * m6,             hseq_mat * m6);
+  VERIFY_IS_APPROX(hseq.adjoint() * m6,   hseq_mat_adj * m6);
+  VERIFY_IS_APPROX(hseq.conjugate() * m6, hseq_mat_conj * m6);
+  VERIFY_IS_APPROX(hseq.transpose() * m6, hseq_mat_trans * m6);
+  VERIFY_IS_APPROX(m6 * hseq,             m6 * hseq_mat);
+  VERIFY_IS_APPROX(m6 * hseq.adjoint(),   m6 * hseq_mat_adj);
+  VERIFY_IS_APPROX(m6 * hseq.conjugate(), m6 * hseq_mat_conj);
+  VERIFY_IS_APPROX(m6 * hseq.transpose(), m6 * hseq_mat_trans);
+
+  // test householder sequence on the right with a shift
+
+  TMatrixType tm2 = m2.transpose();
+  HouseholderSequence<TMatrixType, HCoeffsVectorType, OnTheRight> rhseq(tm2, hc);
+  rhseq.setLength(hc.size()).setShift(shift);
+  VERIFY_IS_APPROX(rhseq * m5, m1); // test applying rhseq directly
+  m3 = rhseq;
+  VERIFY_IS_APPROX(m3 * m5, m1); // test evaluating rhseq to a dense matrix, then applying
+}
+
+EIGEN_DECLARE_TEST(householder)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( householder(Matrix<double,2,2>()) );
+    CALL_SUBTEST_2( householder(Matrix<float,2,3>()) );
+    CALL_SUBTEST_3( householder(Matrix<double,3,5>()) );
+    CALL_SUBTEST_4( householder(Matrix<float,4,4>()) );
+    CALL_SUBTEST_5( householder(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( householder(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( householder(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_8( householder(Matrix<double,1,1>()) );
+  }
+}

diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp
new file mode 100644
index 0000000..ecc17f5
--- /dev/null
+++ b/test/incomplete_cholesky.cpp

@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// #define EIGEN_DONT_VECTORIZE
+// #define EIGEN_MAX_ALIGN_BYTES 0
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+#include <unsupported/Eigen/IterativeSolvers>
+
+template<typename T, typename I_> void test_incomplete_cholesky_T()
+{
+  typedef SparseMatrix<T,0,I_> SparseMatrixType;
+  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, AMDOrdering<I_> > >        cg_illt_lower_amd;
+  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, NaturalOrdering<I_> > >    cg_illt_lower_nat;
+  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, AMDOrdering<I_> > >        cg_illt_upper_amd;
+  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, NaturalOrdering<I_> > >    cg_illt_upper_nat;
+  ConjugateGradient<SparseMatrixType, Upper|Lower, IncompleteCholesky<T, Lower, AMDOrdering<I_> > >  cg_illt_uplo_amd;
+  
+
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_nat) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_amd) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_nat) );
+  CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) );
+}
+
+template<int>
+void bug1150()
+{
+  // regression for bug 1150
+  for(int N = 1; N<20; ++N)
+  {
+    Eigen::MatrixXd b( N, N );
+    b.setOnes();
+
+    Eigen::SparseMatrix<double> m( N, N );
+    m.reserve(Eigen::VectorXi::Constant(N,4));
+    for( int i = 0; i < N; ++i )
+    {
+        m.insert( i, i ) = 1;
+        m.coeffRef( i, i / 2 ) = 2;
+        m.coeffRef( i, i / 3 ) = 2;
+        m.coeffRef( i, i / 4 ) = 2;
+    }
+
+    Eigen::SparseMatrix<double> A;
+    A = m * m.transpose();
+
+    Eigen::ConjugateGradient<Eigen::SparseMatrix<double>,
+        Eigen::Lower | Eigen::Upper,
+        Eigen::IncompleteCholesky<double> > solver( A );
+    VERIFY(solver.preconditioner().info() == Eigen::Success);
+    VERIFY(solver.info() == Eigen::Success);
+  }
+}
+
+EIGEN_DECLARE_TEST(incomplete_cholesky)
+{
+  CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
+  CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
+
+  CALL_SUBTEST_1(( bug1150<0>() ));
+}

diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp
new file mode 100644
index 0000000..72c54af
--- /dev/null
+++ b/test/indexed_view.cpp

@@ -0,0 +1,473 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_2
+// Make sure we also check c++11 max implementation
+#define EIGEN_MAX_CPP_VER 11
+#endif
+
+#ifdef EIGEN_TEST_PART_3
+// Make sure we also check c++98 max implementation
+#define EIGEN_MAX_CPP_VER 03
+
+// We need to disable this warning when compiling with c++11 while limiting Eigen to c++98
+// Ideally we would rather configure the compiler to build in c++98 mode but this needs
+// to be done at the CMakeLists.txt level.
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+  #pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+
+#if defined(__GNUC__) && (__GNUC__ >=9)
+  #pragma GCC diagnostic ignored "-Wdeprecated-copy"
+#endif
+#if defined(__clang__) && (__clang_major__ >= 10)
+  #pragma clang diagnostic ignored "-Wdeprecated-copy"
+#endif
+
+#endif
+
+#include <valarray>
+#include <vector>
+#include "main.h"
+
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
+typedef std::pair<Index,Index> IndexPair;
+
+int encode(Index i, Index j) {
+  return int(i*100 + j);
+}
+
+IndexPair decode(Index ij) {
+  return IndexPair(ij / 100, ij % 100);
+}
+
+template<typename T>
+bool match(const T& xpr, std::string ref, std::string str_xpr = "") {
+  EIGEN_UNUSED_VARIABLE(str_xpr);
+  std::stringstream str;
+  str << xpr;
+  if(!(str.str() == ref))
+    std::cout << str_xpr << "\n" << xpr << "\n\n";
+  return str.str() == ref;
+}
+
+#define MATCH(X,R) match(X, R, #X)
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_eq(const T1& a, const T2& b)
+{
+  return (a == b).all();
+}
+
+template<typename T1,typename T2>
+bool is_same_seq(const T1& a, const T2& b)
+{
+  bool ok = a.first()==b.first() && a.size() == b.size() && Index(a.incrObject())==Index(b.incrObject());;
+  if(!ok)
+  {
+    std::cerr << "seqN(" << a.first() << ", " << a.size() << ", " << Index(a.incrObject()) << ") != ";
+    std::cerr << "seqN(" << b.first() << ", " << b.size() << ", " << Index(b.incrObject()) << ")\n";
+  }
+  return ok;
+}
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_seq_type(const T1& a, const T2& b)
+{
+  return is_same_seq(a,b);
+}
+
+
+
+#define VERIFY_EQ_INT(A,B) VERIFY_IS_APPROX(int(A),int(B))
+
+// C++03 does not allow local or unnamed enums as index
+enum DummyEnum { XX=0, YY=1 };
+
+void check_indexed_view()
+{
+  Index n = 10;
+
+  ArrayXd a = ArrayXd::LinSpaced(n,0,n-1);
+  Array<double,1,Dynamic> b = a.transpose();
+
+  #if EIGEN_COMP_CXXVER>=14
+  ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ref(encode));
+  #else
+  ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ptr_fun(&encode));
+  #endif
+
+  for(Index i=0; i<n; ++i)
+    for(Index j=0; j<n; ++j)
+      VERIFY( decode(A(i,j)) == IndexPair(i,j) );
+
+  Array4i eii(4); eii << 3, 1, 6, 5;
+  std::valarray<int> vali(4); Map<ArrayXi>(&vali[0],4) = eii;
+  std::vector<int> veci(4); Map<ArrayXi>(veci.data(),4) = eii;
+
+  VERIFY( MATCH( A(3, seq(9,3,-1)),
+    "309  308  307  306  305  304  303")
+  );
+
+  VERIFY( MATCH( A(seqN(2,5), seq(9,3,-1)),
+    "209  208  207  206  205  204  203\n"
+    "309  308  307  306  305  304  303\n"
+    "409  408  407  406  405  404  403\n"
+    "509  508  507  506  505  504  503\n"
+    "609  608  607  606  605  604  603")
+  );
+
+  VERIFY( MATCH( A(seqN(2,5), 5),
+    "205\n"
+    "305\n"
+    "405\n"
+    "505\n"
+    "605")
+  );
+
+  VERIFY( MATCH( A(seqN(last,5,-1), seq(2,last)),
+    "902  903  904  905  906  907  908  909\n"
+    "802  803  804  805  806  807  808  809\n"
+    "702  703  704  705  706  707  708  709\n"
+    "602  603  604  605  606  607  608  609\n"
+    "502  503  504  505  506  507  508  509")
+  );
+
+  VERIFY( MATCH( A(eii, veci),
+    "303  301  306  305\n"
+    "103  101  106  105\n"
+    "603  601  606  605\n"
+    "503  501  506  505")
+  );
+
+  VERIFY( MATCH( A(eii, all),
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "100  101  102  103  104  105  106  107  108  109\n"
+    "600  601  602  603  604  605  606  607  608  609\n"
+    "500  501  502  503  504  505  506  507  508  509")
+  );
+
+  // take row number 3, and repeat it 5 times
+  VERIFY( MATCH( A(seqN(3,5,0), all),
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309")
+  );
+
+  VERIFY( MATCH( a(seqN(3,3),0), "3\n4\n5" ) );
+  VERIFY( MATCH( a(seq(3,5)), "3\n4\n5" ) );
+  VERIFY( MATCH( a(seqN(3,3,1)), "3\n4\n5" ) );
+  VERIFY( MATCH( a(seqN(5,3,-1)), "5\n4\n3" ) );
+
+  VERIFY( MATCH( b(0,seqN(3,3)), "3  4  5" ) );
+  VERIFY( MATCH( b(seq(3,5)), "3  4  5" ) );
+  VERIFY( MATCH( b(seqN(3,3,1)), "3  4  5" ) );
+  VERIFY( MATCH( b(seqN(5,3,-1)), "5  4  3" ) );
+
+  VERIFY( MATCH( b(all), "0  1  2  3  4  5  6  7  8  9" ) );
+  VERIFY( MATCH( b(eii), "3  1  6  5" ) );
+
+  Array44i B;
+  B.setRandom();
+  VERIFY( (A(seqN(2,5), 5)).ColsAtCompileTime == 1);
+  VERIFY( (A(seqN(2,5), 5)).RowsAtCompileTime == Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,5), 5)).InnerStrideAtCompileTime , A.InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(seqN(2,5), 5)).OuterStrideAtCompileTime , A.col(5).OuterStrideAtCompileTime);
+
+  VERIFY_EQ_INT( (A(5,seqN(2,5))).InnerStrideAtCompileTime , A.row(5).InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(5,seqN(2,5))).OuterStrideAtCompileTime , A.row(5).OuterStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(1,seqN(1,2))).InnerStrideAtCompileTime , B.row(1).InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(1,seqN(1,2))).OuterStrideAtCompileTime , B.row(1).OuterStrideAtCompileTime);
+
+  VERIFY_EQ_INT( (A(seqN(2,5), seq(1,3))).InnerStrideAtCompileTime , A.InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(seqN(2,5), seq(1,3))).OuterStrideAtCompileTime , A.OuterStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(seqN(1,2), seq(1,3))).InnerStrideAtCompileTime , B.InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(seqN(1,2), seq(1,3))).OuterStrideAtCompileTime , B.OuterStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(seqN(2,5,2), seq(1,3,2))).InnerStrideAtCompileTime , Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,5,2), seq(1,3,2))).OuterStrideAtCompileTime , Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,5,fix<2>), seq(1,3,fix<3>))).InnerStrideAtCompileTime , 2);
+  VERIFY_EQ_INT( (A(seqN(2,5,fix<2>), seq(1,3,fix<3>))).OuterStrideAtCompileTime , Dynamic);
+  VERIFY_EQ_INT( (B(seqN(1,2,fix<2>), seq(1,3,fix<3>))).InnerStrideAtCompileTime , 2);
+  VERIFY_EQ_INT( (B(seqN(1,2,fix<2>), seq(1,3,fix<3>))).OuterStrideAtCompileTime , 3*4);
+
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>), seqN(1,fix<3>))).RowsAtCompileTime, 5);
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>), seqN(1,fix<3>))).ColsAtCompileTime, 3);
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>(5)), seqN(1,fix<3>(3)))).RowsAtCompileTime, 5);
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>(5)), seqN(1,fix<3>(3)))).ColsAtCompileTime, 3);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).RowsAtCompileTime, Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).ColsAtCompileTime, Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).rows(), 5);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).cols(), 3);
+
+  VERIFY( is_same_seq_type( seqN(2,5,fix<-1>), seqN(2,5,fix<-1>(-1)) ) );
+  VERIFY( is_same_seq_type( seqN(2,5), seqN(2,5,fix<1>(1)) ) );
+  VERIFY( is_same_seq_type( seqN(2,5,3), seqN(2,5,fix<DynamicIndex>(3)) ) );
+  VERIFY( is_same_seq_type( seq(2,7,fix<3>), seqN(2,2,fix<3>) ) );
+  VERIFY( is_same_seq_type( seqN(2,fix<Dynamic>(5),3), seqN(2,5,fix<DynamicIndex>(3)) ) );
+  VERIFY( is_same_seq_type( seqN(2,fix<5>(5),fix<-2>), seqN(2,fix<5>,fix<-2>()) ) );
+
+  VERIFY( is_same_seq_type( seq(2,fix<5>), seqN(2,4) ) );
+#if EIGEN_HAS_CXX11
+  VERIFY( is_same_seq_type( seq(fix<2>,fix<5>), seqN(fix<2>,fix<4>) ) );
+  VERIFY( is_same_seq( seqN(2,std::integral_constant<int,5>(),std::integral_constant<int,-2>()), seqN(2,fix<5>,fix<-2>()) ) );
+  VERIFY( is_same_seq( seq(std::integral_constant<int,1>(),std::integral_constant<int,5>(),std::integral_constant<int,2>()),
+                       seq(fix<1>,fix<5>,fix<2>()) ) );
+  VERIFY( is_same_seq_type( seqN(2,std::integral_constant<int,5>(),std::integral_constant<int,-2>()), seqN(2,fix<5>,fix<-2>()) ) );
+  VERIFY( is_same_seq_type( seq(std::integral_constant<int,1>(),std::integral_constant<int,5>(),std::integral_constant<int,2>()),
+                            seq(fix<1>,fix<5>,fix<2>()) ) );
+
+  VERIFY( is_same_seq_type( seqN(2,std::integral_constant<int,5>()), seqN(2,fix<5>) ) );
+  VERIFY( is_same_seq_type( seq(std::integral_constant<int,1>(),std::integral_constant<int,5>()), seq(fix<1>,fix<5>) ) );
+#else
+  // sorry, no compile-time size recovery in c++98/03
+  VERIFY( is_same_seq( seq(fix<2>,fix<5>), seqN(fix<2>,fix<4>) ) );
+#endif
+
+  VERIFY( (A(seqN(2,fix<5>), 5)).RowsAtCompileTime == 5);
+  VERIFY( (A(4, all)).ColsAtCompileTime == Dynamic);
+  VERIFY( (A(4, all)).RowsAtCompileTime == 1);
+  VERIFY( (B(1, all)).ColsAtCompileTime == 4);
+  VERIFY( (B(1, all)).RowsAtCompileTime == 1);
+  VERIFY( (B(all,1)).ColsAtCompileTime == 1);
+  VERIFY( (B(all,1)).RowsAtCompileTime == 4);
+
+  VERIFY(int( (A(all, eii)).ColsAtCompileTime) == int(eii.SizeAtCompileTime));
+  VERIFY_EQ_INT( (A(eii, eii)).Flags&DirectAccessBit, (unsigned int)(0));
+  VERIFY_EQ_INT( (A(eii, eii)).InnerStrideAtCompileTime, 0);
+  VERIFY_EQ_INT( (A(eii, eii)).OuterStrideAtCompileTime, 0);
+
+  VERIFY_IS_APPROX( A(seq(n-1,2,-2), seqN(n-1-6,3,-1)), A(seq(last,2,fix<-2>), seqN(last-6,3,fix<-1>)) );
+
+  VERIFY_IS_APPROX( A(seq(n-1,2,-2), seqN(n-1-6,4)), A(seq(last,2,-2), seqN(last-6,4)) );
+  VERIFY_IS_APPROX( A(seq(n-1-6,n-1-2), seqN(n-1-6,4)), A(seq(last-6,last-2), seqN(6+last-6-6,4)) );
+  VERIFY_IS_APPROX( A(seq((n-1)/2,(n)/2+3), seqN(2,4)), A(seq(last/2,(last+1)/2+3), seqN(last+2-last,4)) );
+  VERIFY_IS_APPROX( A(seq(n-2,2,-2), seqN(n-8,4)), A(seq(lastp1-2,2,-2), seqN(lastp1-8,4)) );
+
+  // Check all combinations of seq:
+  VERIFY_IS_APPROX( A(seq(1,n-1-2,2), seq(1,n-1-2,2)), A(seq(1,last-2,2), seq(1,last-2,fix<2>)) );
+  VERIFY_IS_APPROX( A(seq(n-1-5,n-1-2,2), seq(n-1-5,n-1-2,2)), A(seq(last-5,last-2,2), seq(last-5,last-2,fix<2>)) );
+  VERIFY_IS_APPROX( A(seq(n-1-5,7,2), seq(n-1-5,7,2)), A(seq(last-5,7,2), seq(last-5,7,fix<2>)) );
+  VERIFY_IS_APPROX( A(seq(1,n-1-2), seq(n-1-5,7)), A(seq(1,last-2), seq(last-5,7)) );
+  VERIFY_IS_APPROX( A(seq(n-1-5,n-1-2), seq(n-1-5,n-1-2)), A(seq(last-5,last-2), seq(last-5,last-2)) );
+
+  VERIFY_IS_APPROX( A.col(A.cols()-1), A(all,last) );
+  VERIFY_IS_APPROX( A(A.rows()-2, A.cols()/2), A(last-1, lastp1/2) );
+  VERIFY_IS_APPROX( a(a.size()-2), a(last-1) );
+  VERIFY_IS_APPROX( a(a.size()/2), a((last+1)/2) );
+
+  // Check fall-back to Block
+  {
+    VERIFY( is_same_eq(A.col(0), A(all,0)) );
+    VERIFY( is_same_eq(A.row(0), A(0,all)) );
+    VERIFY( is_same_eq(A.block(0,0,2,2), A(seqN(0,2),seq(0,1))) );
+    VERIFY( is_same_eq(A.middleRows(2,4), A(seqN(2,4),all)) );
+    VERIFY( is_same_eq(A.middleCols(2,4), A(all,seqN(2,4))) );
+
+    VERIFY( is_same_eq(A.col(A.cols()-1), A(all,last)) );
+
+    const ArrayXXi& cA(A);
+    VERIFY( is_same_eq(cA.col(0), cA(all,0)) );
+    VERIFY( is_same_eq(cA.row(0), cA(0,all)) );
+    VERIFY( is_same_eq(cA.block(0,0,2,2), cA(seqN(0,2),seq(0,1))) );
+    VERIFY( is_same_eq(cA.middleRows(2,4), cA(seqN(2,4),all)) );
+    VERIFY( is_same_eq(cA.middleCols(2,4), cA(all,seqN(2,4))) );
+
+    VERIFY( is_same_eq(a.head(4), a(seq(0,3))) );
+    VERIFY( is_same_eq(a.tail(4), a(seqN(last-3,4))) );
+    VERIFY( is_same_eq(a.tail(4), a(seq(lastp1-4,last))) );
+    VERIFY( is_same_eq(a.segment<4>(3), a(seqN(3,fix<4>))) );
+  }
+
+  ArrayXXi A1=A, A2 = ArrayXXi::Random(4,4);
+  ArrayXi range25(4); range25 << 3,2,4,5;
+  A1(seqN(3,4),seq(2,5)) = A2;
+  VERIFY_IS_APPROX( A1.block(3,2,4,4), A2 );
+  A1 = A;
+  A2.setOnes();
+  A1(seq(6,3,-1),range25) = A2;
+  VERIFY_IS_APPROX( A1.block(3,2,4,4), A2 );
+
+  // check reverse
+  {
+    VERIFY( is_same_seq_type( seq(3,7).reverse(), seqN(7,5,fix<-1>)  ) );
+    VERIFY( is_same_seq_type( seq(7,3,fix<-2>).reverse(), seqN(3,3,fix<2>)  ) );
+    VERIFY_IS_APPROX( a(seqN(2,last/2).reverse()), a(seqN(2+(last/2-1)*1,last/2,fix<-1>)) );
+    VERIFY_IS_APPROX( a(seqN(last/2,fix<4>).reverse()),a(seqN(last/2,fix<4>)).reverse() );
+    VERIFY_IS_APPROX( A(seq(last-5,last-1,2).reverse(), seqN(last-3,3,fix<-2>).reverse()),
+                      A(seq(last-5,last-1,2), seqN(last-3,3,fix<-2>)).reverse() );
+  }
+
+#if EIGEN_HAS_CXX11
+  // check lastN
+  VERIFY_IS_APPROX( a(lastN(3)), a.tail(3) );
+  VERIFY( MATCH( a(lastN(3)), "7\n8\n9" ) );
+  VERIFY_IS_APPROX( a(lastN(fix<3>())), a.tail<3>() );
+  VERIFY( MATCH( a(lastN(3,2)), "5\n7\n9" ) );
+  VERIFY( MATCH( a(lastN(3,fix<2>())), "5\n7\n9" ) );
+  VERIFY( a(lastN(fix<3>())).SizeAtCompileTime == 3 );
+
+  VERIFY( (A(all, std::array<int,4>{{1,3,2,4}})).ColsAtCompileTime == 4);
+
+  VERIFY_IS_APPROX( (A(std::array<int,3>{{1,3,5}}, std::array<int,4>{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) );
+
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+  VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array<int,4>{{3, 1, 6, 5}}, all) );
+  VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array<int,4>{{3, 1, 6, 5}}) );
+  VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array<int,3>{{1,3,5}},std::array<int,4>{{3, 1, 6, 5}}) );
+
+  VERIFY_IS_EQUAL( A({1,3,5},{3, 1, 6, 5}).RowsAtCompileTime, 3 );
+  VERIFY_IS_EQUAL( A({1,3,5},{3, 1, 6, 5}).ColsAtCompileTime, 4 );
+
+  VERIFY_IS_APPROX( a({3, 1, 6, 5}), a(std::array<int,4>{{3, 1, 6, 5}}) );
+  VERIFY_IS_EQUAL( a({1,3,5}).SizeAtCompileTime, 3 );
+
+  VERIFY_IS_APPROX( b({3, 1, 6, 5}), b(std::array<int,4>{{3, 1, 6, 5}}) );
+  VERIFY_IS_EQUAL( b({1,3,5}).SizeAtCompileTime, 3 );
+#endif
+
+#endif
+
+  // check mat(i,j) with weird types for i and j
+  {
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime-1, 1), A(3,1) );
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime, 1), A(4,1) );
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime-1, B.ColsAtCompileTime-1), A(3,3) );
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime, B.ColsAtCompileTime), A(4,4) );
+    const Index I_ = 3, J_ = 4;
+    VERIFY_IS_APPROX( A(I_,J_), A(3,4) );
+  }
+
+  // check extended block API
+  {
+    VERIFY( is_same_eq( A.block<3,4>(1,1), A.block(1,1,fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.block<3,4>(1,1,3,4), A.block(1,1,fix<3>(),fix<4>(4))) );
+    VERIFY( is_same_eq( A.block<3,Dynamic>(1,1,3,4), A.block(1,1,fix<3>,4)) );
+    VERIFY( is_same_eq( A.block<Dynamic,4>(1,1,3,4), A.block(1,1,fix<Dynamic>(3),fix<4>)) );
+    VERIFY( is_same_eq( A.block(1,1,3,4), A.block(1,1,fix<Dynamic>(3),fix<Dynamic>(4))) );
+
+    VERIFY( is_same_eq( A.topLeftCorner<3,4>(), A.topLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.bottomLeftCorner<3,4>(), A.bottomLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.bottomRightCorner<3,4>(), A.bottomRightCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.topRightCorner<3,4>(), A.topRightCorner(fix<3>,fix<4>)) );
+
+    VERIFY( is_same_eq( A.leftCols<3>(), A.leftCols(fix<3>)) );
+    VERIFY( is_same_eq( A.rightCols<3>(), A.rightCols(fix<3>)) );
+    VERIFY( is_same_eq( A.middleCols<3>(1), A.middleCols(1,fix<3>)) );
+
+    VERIFY( is_same_eq( A.topRows<3>(), A.topRows(fix<3>)) );
+    VERIFY( is_same_eq( A.bottomRows<3>(), A.bottomRows(fix<3>)) );
+    VERIFY( is_same_eq( A.middleRows<3>(1), A.middleRows(1,fix<3>)) );
+
+    VERIFY( is_same_eq( a.segment<3>(1), a.segment(1,fix<3>)) );
+    VERIFY( is_same_eq( a.head<3>(), a.head(fix<3>)) );
+    VERIFY( is_same_eq( a.tail<3>(), a.tail(fix<3>)) );
+
+    const ArrayXXi& cA(A);
+    VERIFY( is_same_eq( cA.block<Dynamic,4>(1,1,3,4), cA.block(1,1,fix<Dynamic>(3),fix<4>)) );
+
+    VERIFY( is_same_eq( cA.topLeftCorner<3,4>(), cA.topLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( cA.bottomLeftCorner<3,4>(), cA.bottomLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( cA.bottomRightCorner<3,4>(), cA.bottomRightCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( cA.topRightCorner<3,4>(), cA.topRightCorner(fix<3>,fix<4>)) );
+
+    VERIFY( is_same_eq( cA.leftCols<3>(), cA.leftCols(fix<3>)) );
+    VERIFY( is_same_eq( cA.rightCols<3>(), cA.rightCols(fix<3>)) );
+    VERIFY( is_same_eq( cA.middleCols<3>(1), cA.middleCols(1,fix<3>)) );
+
+    VERIFY( is_same_eq( cA.topRows<3>(), cA.topRows(fix<3>)) );
+    VERIFY( is_same_eq( cA.bottomRows<3>(), cA.bottomRows(fix<3>)) );
+    VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) );
+  }
+
+  // Check compilation of enums as index type:
+  a(XX) = 1;
+  A(XX,YY) = 1;
+  // Anonymous enums only work with C++11
+#if EIGEN_HAS_CXX11
+  enum { X=0, Y=1 };
+  a(X) = 1;
+  A(X,Y) = 1;
+  A(XX,Y) = 1;
+  A(X,YY) = 1;
+#endif
+
+  // Check compilation of varying integer types as index types:
+  Index i = n/2;
+  short i_short(i);
+  std::size_t i_sizet(i);
+  VERIFY_IS_EQUAL( a(i), a.coeff(i_short) );
+  VERIFY_IS_EQUAL( a(i), a.coeff(i_sizet) );
+
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i_short) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_short) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_sizet) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i_short) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(5, i_sizet) );
+
+  // Regression test for Max{Rows,Cols}AtCompileTime
+  {
+    Matrix3i A3 = Matrix3i::Random();
+    ArrayXi ind(5); ind << 1,1,1,1,1;
+    VERIFY_IS_EQUAL( A3(ind,ind).eval(), MatrixXi::Constant(5,5,A3(1,1)) );
+  }
+
+  // Regression for bug 1736
+  {
+    VERIFY_IS_APPROX(A(all, eii).col(0).eval(), A.col(eii(0)));
+    A(all, eii).col(0) = A.col(eii(0));
+  }
+
+  // bug 1815: IndexedView should allow linear access
+  {
+    VERIFY( MATCH( b(eii)(0), "3" ) );
+    VERIFY( MATCH( a(eii)(0), "3" ) );
+    VERIFY( MATCH( A(1,eii)(0), "103"));
+    VERIFY( MATCH( A(eii,1)(0), "301"));
+    VERIFY( MATCH( A(1,all)(1), "101"));
+    VERIFY( MATCH( A(all,1)(1), "101"));
+  }
+
+#if EIGEN_HAS_CXX11
+  //Bug IndexView with a single static row should be RowMajor:
+  {
+    // A(1, seq(0,2,1)).cwiseAbs().colwise().replicate(2).eval();
+    STATIC_CHECK(( (internal::evaluator<decltype( A(1,seq(0,2,1)) )>::Flags & RowMajorBit) == RowMajorBit ));
+  }
+#endif
+
+}
+
+EIGEN_DECLARE_TEST(indexed_view)
+{
+//   for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_indexed_view() );
+    CALL_SUBTEST_2( check_indexed_view() );
+    CALL_SUBTEST_3( check_indexed_view() );
+//   }
+
+  // static checks of some internals:
+  STATIC_CHECK(( internal::is_valid_index_type<int>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<unsigned int>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<short>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<std::ptrdiff_t>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<std::size_t>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<int,int>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<int,std::ptrdiff_t>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<std::ptrdiff_t,int>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<std::size_t,int>::value ));
+}

diff --git a/test/initializer_list_construction.cpp b/test/initializer_list_construction.cpp
new file mode 100644
index 0000000..7a9c49e
--- /dev/null
+++ b/test/initializer_list_construction.cpp

@@ -0,0 +1,385 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 David Tellenbach <david.tellenbach@tellnotes.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+template<typename Scalar, bool is_integer = NumTraits<Scalar>::IsInteger>
+struct TestMethodDispatching {
+  static void run() {}
+};
+
+template<typename Scalar>
+struct TestMethodDispatching<Scalar, 1> {
+  static void run()
+  {
+    {
+      Matrix<Scalar, Dynamic, Dynamic> m {3, 4};
+      Array<Scalar, Dynamic, Dynamic> a {3, 4};
+      VERIFY(m.rows() == 3);
+      VERIFY(m.cols() == 4);
+      VERIFY(a.rows() == 3);
+      VERIFY(a.cols() == 4);
+    }
+    {
+      Matrix<Scalar, 1, 2> m {3, 4};
+      Array<Scalar, 1, 2> a {3, 4};
+      VERIFY(m(0) == 3);
+      VERIFY(m(1) == 4);
+      VERIFY(a(0) == 3);
+      VERIFY(a(1) == 4);
+    }
+    {
+      Matrix<Scalar, 2, 1> m {3, 4};
+      Array<Scalar, 2, 1> a {3, 4};
+      VERIFY(m(0) == 3);
+      VERIFY(m(1) == 4);
+      VERIFY(a(0) == 3);
+      VERIFY(a(1) == 4);
+    }
+  }
+};
+
+template<typename Vec4, typename Vec5> void fixedsizeVariadicVectorConstruction2()
+{
+  {
+    Vec4 ref = Vec4::Random();
+    Vec4 v{ ref[0], ref[1], ref[2], ref[3] };
+    VERIFY_IS_APPROX(v, ref);
+    VERIFY_IS_APPROX(v, (Vec4( ref[0], ref[1], ref[2], ref[3] )));
+    VERIFY_IS_APPROX(v, (Vec4({ref[0], ref[1], ref[2], ref[3]})));
+
+    Vec4 v2 = { ref[0], ref[1], ref[2], ref[3] };
+    VERIFY_IS_APPROX(v2, ref);
+  }
+  {
+    Vec5 ref = Vec5::Random();
+    Vec5 v{ ref[0], ref[1], ref[2], ref[3], ref[4] };
+    VERIFY_IS_APPROX(v, ref);
+    VERIFY_IS_APPROX(v, (Vec5( ref[0], ref[1], ref[2], ref[3], ref[4] )));
+    VERIFY_IS_APPROX(v, (Vec5({ref[0], ref[1], ref[2], ref[3], ref[4]})));
+
+    Vec5 v2 = { ref[0], ref[1], ref[2], ref[3], ref[4] };
+    VERIFY_IS_APPROX(v2, ref);
+  }
+}
+
+#define CHECK_MIXSCALAR_V5_APPROX(V, A0, A1, A2, A3, A4) { \
+  VERIFY_IS_APPROX(V[0], Scalar(A0) ); \
+  VERIFY_IS_APPROX(V[1], Scalar(A1) ); \
+  VERIFY_IS_APPROX(V[2], Scalar(A2) ); \
+  VERIFY_IS_APPROX(V[3], Scalar(A3) ); \
+  VERIFY_IS_APPROX(V[4], Scalar(A4) ); \
+}
+
+#define CHECK_MIXSCALAR_V5(VEC5, A0, A1, A2, A3, A4) { \
+  typedef VEC5::Scalar Scalar; \
+  VEC5 v = { A0 , A1 , A2 , A3 , A4 }; \
+  CHECK_MIXSCALAR_V5_APPROX(v, A0 , A1 , A2 , A3 , A4); \
+}
+
+template<int> void fixedsizeVariadicVectorConstruction3()
+{
+  typedef Matrix<double,5,1> Vec5;
+  typedef Array<float,5,1> Arr5;
+  CHECK_MIXSCALAR_V5(Vec5, 1, 2., -3, 4.121, 5.53252);
+  CHECK_MIXSCALAR_V5(Arr5, 1, 2., 3.12f, 4.121, 5.53252);
+}
+
+template<typename Scalar> void fixedsizeVariadicVectorConstruction()
+{
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Matrix<Scalar,4,1>, Matrix<Scalar,5,1> >() ));
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Matrix<Scalar,1,4>, Matrix<Scalar,1,5> >() ));
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Array<Scalar,4,1>,  Array<Scalar,5,1>  >() ));
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Array<Scalar,1,4>,  Array<Scalar,1,5>  >() ));
+}
+
+
+template<typename Scalar> void initializerListVectorConstruction()
+{
+  Scalar raw[4];
+  for(int k = 0; k < 4; ++k) {
+    raw[k] = internal::random<Scalar>();
+  }
+  {
+    Matrix<Scalar, 4, 1> m { {raw[0]}, {raw[1]},{raw[2]},{raw[3]} };
+    Array<Scalar, 4, 1> a { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} };
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar,4,1>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} })));
+    VERIFY((a == (Array<Scalar,4,1>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))).all());
+  }
+  {
+    Matrix<Scalar, 1, 4> m { {raw[0], raw[1], raw[2], raw[3]} };
+    Array<Scalar, 1, 4> a { {raw[0], raw[1], raw[2], raw[3]} };
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, 1, 4>({{raw[0],raw[1],raw[2],raw[3]}})));
+    VERIFY((a == (Array<Scalar, 1, 4>({{raw[0],raw[1],raw[2],raw[3]}}))).all());
+  }
+  {
+    Matrix<Scalar, 4, Dynamic> m { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} };
+    Array<Scalar, 4, Dynamic> a { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} };
+    for(int k=0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k=0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, 4, Dynamic>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} })));
+    VERIFY((a == (Array<Scalar, 4, Dynamic>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))).all());
+  }
+  {
+    Matrix<Scalar, Dynamic, 4> m {{raw[0],raw[1],raw[2],raw[3]}};
+    Array<Scalar, Dynamic, 4> a {{raw[0],raw[1],raw[2],raw[3]}};
+    for(int k=0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k=0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, Dynamic, 4>({{raw[0],raw[1],raw[2],raw[3]}})));
+    VERIFY((a == (Array<Scalar, Dynamic, 4>({{raw[0],raw[1],raw[2],raw[3]}}))).all());
+  }
+}
+
+template<typename Scalar> void initializerListMatrixConstruction()
+{
+  const Index RowsAtCompileTime = 5;
+  const Index ColsAtCompileTime = 4;
+  const Index SizeAtCompileTime = RowsAtCompileTime * ColsAtCompileTime;
+
+  Scalar raw[SizeAtCompileTime];
+  for (int i = 0; i < SizeAtCompileTime; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+  {
+    Matrix<Scalar, Dynamic, Dynamic> m {};
+    VERIFY(m.cols() == 0);
+    VERIFY(m.rows() == 0);
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, Dynamic, Dynamic>()));
+  }
+  {
+    Matrix<Scalar, 5, 4> m {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    Matrix<Scalar, 5, 4> m2;
+    m2 << raw[0], raw[1], raw[2], raw[3],
+          raw[4], raw[5], raw[6], raw[7],
+          raw[8], raw[9], raw[10], raw[11],
+          raw[12], raw[13], raw[14], raw[15],
+          raw[16], raw[17], raw[18], raw[19];
+
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+    VERIFY_IS_EQUAL(m, m2);
+  }
+  {
+    Matrix<Scalar, Dynamic, Dynamic> m{
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    VERIFY(m.cols() == 4);
+    VERIFY(m.rows() == 5);
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+
+    Matrix<Scalar, Dynamic, Dynamic> m2(RowsAtCompileTime, ColsAtCompileTime);
+    k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        m2(i, j) = raw[k];
+        ++k;
+      }
+    }
+    VERIFY_IS_EQUAL(m, m2);
+  }
+}
+
+template<typename Scalar> void initializerListArrayConstruction()
+{
+  const Index RowsAtCompileTime = 5;
+  const Index ColsAtCompileTime = 4;
+  const Index SizeAtCompileTime = RowsAtCompileTime * ColsAtCompileTime;
+
+  Scalar raw[SizeAtCompileTime];
+  for (int i = 0; i < SizeAtCompileTime; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+  {
+    Array<Scalar, Dynamic, Dynamic> a {};
+    VERIFY(a.cols() == 0);
+    VERIFY(a.rows() == 0);
+  }
+  {
+    Array<Scalar, 5, 4> m {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    Array<Scalar, 5, 4> m2;
+    m2 << raw[0], raw[1], raw[2], raw[3],
+          raw[4], raw[5], raw[6], raw[7],
+          raw[8], raw[9], raw[10], raw[11],
+          raw[12], raw[13], raw[14], raw[15],
+          raw[16], raw[17], raw[18], raw[19];
+
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+    VERIFY_IS_APPROX(m, m2);
+  }
+  {
+    Array<Scalar, Dynamic, Dynamic> m {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    VERIFY(m.cols() == 4);
+    VERIFY(m.rows() == 5);
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+
+    Array<Scalar, Dynamic, Dynamic> m2(RowsAtCompileTime, ColsAtCompileTime);
+    k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        m2(i, j) = raw[k];
+        ++k;
+      }
+    }
+    VERIFY_IS_APPROX(m, m2);
+  }
+}
+
+template<typename Scalar> void dynamicVectorConstruction()
+{
+  const Index size = 4;
+  Scalar raw[size];
+  for (int i = 0; i < size; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+
+  typedef Matrix<Scalar, Dynamic, 1>  VectorX;
+
+  {
+    VectorX v {{raw[0], raw[1], raw[2], raw[3]}};
+    for (int i = 0; i < size; ++i) {
+      VERIFY(v(i) == raw[i]);
+    }
+    VERIFY(v.rows() == size);
+    VERIFY(v.cols() == 1);
+    VERIFY_IS_EQUAL(v, (VectorX {{raw[0], raw[1], raw[2], raw[3]}}));
+  }
+
+  {
+    VERIFY_RAISES_ASSERT((VectorX {raw[0], raw[1], raw[2], raw[3]}));
+  }
+  {
+    VERIFY_RAISES_ASSERT((VectorX  {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[0], raw[1], raw[2], raw[3]},
+    }));
+  }
+}
+
+EIGEN_DECLARE_TEST(initializer_list_construction)
+{
+  CALL_SUBTEST_1(initializerListVectorConstruction<unsigned char>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<float>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<double>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<int>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<long int>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<std::complex<double>>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<std::complex<float>>());
+
+  CALL_SUBTEST_2(initializerListMatrixConstruction<unsigned char>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<float>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<double>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<int>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<long int>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<std::complex<double>>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<std::complex<float>>());
+
+  CALL_SUBTEST_3(initializerListArrayConstruction<unsigned char>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<float>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<double>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<int>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<long int>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<std::complex<double>>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<std::complex<float>>());
+
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<unsigned char>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<float>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<double>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<int>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<long int>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<std::complex<double>>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<std::complex<float>>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction3<0>());
+
+  CALL_SUBTEST_5(TestMethodDispatching<int>::run());
+  CALL_SUBTEST_5(TestMethodDispatching<long int>::run());
+
+  CALL_SUBTEST_6(dynamicVectorConstruction<unsigned char>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<float>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<double>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<int>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<long int>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<std::complex<double>>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<std::complex<float>>());
+}

diff --git a/test/inplace_decomposition.cpp b/test/inplace_decomposition.cpp
new file mode 100644
index 0000000..e3aa995
--- /dev/null
+++ b/test/inplace_decomposition.cpp

@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+#include <Eigen/Cholesky>
+#include <Eigen/QR>
+
+// This file test inplace decomposition through Ref<>, as supported by Cholesky, LU, and QR decompositions.
+
+template<typename DecType,typename MatrixType> void inplace(bool square = false, bool SPD = false)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RhsType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ResType;
+
+  Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(2,EIGEN_TEST_MAX_SIZE/2) : Index(MatrixType::RowsAtCompileTime);
+  Index cols = MatrixType::ColsAtCompileTime==Dynamic ? (square?rows:internal::random<Index>(2,rows))    : Index(MatrixType::ColsAtCompileTime);
+
+  MatrixType A = MatrixType::Random(rows,cols);
+  RhsType b = RhsType::Random(rows);
+  ResType x(cols);
+
+  if(SPD)
+  {
+    assert(square);
+    A.topRows(cols) = A.topRows(cols).adjoint() * A.topRows(cols);
+    A.diagonal().array() += 1e-3;
+  }
+
+  MatrixType A0 = A;
+  MatrixType A1 = A;
+
+  DecType dec(A);
+
+  // Check that the content of A has been modified
+  VERIFY_IS_NOT_APPROX( A, A0 );
+
+  // Check that the decomposition is correct:
+  if(rows==cols)
+  {
+    VERIFY_IS_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+
+  // Check that modifying A breaks the current dec:
+  A.setRandom();
+  if(rows==cols)
+  {
+    VERIFY_IS_NOT_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_NOT_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+
+  // Check that calling compute(A1) does not modify A1:
+  A = A0;
+  dec.compute(A1);
+  VERIFY_IS_EQUAL(A0,A1);
+  VERIFY_IS_NOT_APPROX( A, A0 );
+  if(rows==cols)
+  {
+    VERIFY_IS_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+}
+
+
+EIGEN_DECLARE_TEST(inplace_decomposition)
+{
+  EIGEN_UNUSED typedef Matrix<double,4,3> Matrix43d;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( inplace<LLT<Ref<MatrixXd> >, MatrixXd>(true,true) ));
+    CALL_SUBTEST_1(( inplace<LLT<Ref<Matrix4d> >, Matrix4d>(true,true) ));
+
+    CALL_SUBTEST_2(( inplace<LDLT<Ref<MatrixXd> >, MatrixXd>(true,true) ));
+    CALL_SUBTEST_2(( inplace<LDLT<Ref<Matrix4d> >, Matrix4d>(true,true) ));
+
+    CALL_SUBTEST_3(( inplace<PartialPivLU<Ref<MatrixXd> >, MatrixXd>(true,false) ));
+    CALL_SUBTEST_3(( inplace<PartialPivLU<Ref<Matrix4d> >, Matrix4d>(true,false) ));
+
+    CALL_SUBTEST_4(( inplace<FullPivLU<Ref<MatrixXd> >, MatrixXd>(true,false) ));
+    CALL_SUBTEST_4(( inplace<FullPivLU<Ref<Matrix4d> >, Matrix4d>(true,false) ));
+
+    CALL_SUBTEST_5(( inplace<HouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_5(( inplace<HouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_6(( inplace<ColPivHouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_6(( inplace<ColPivHouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_7(( inplace<FullPivHouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_7(( inplace<FullPivHouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_8(( inplace<CompleteOrthogonalDecomposition<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_8(( inplace<CompleteOrthogonalDecomposition<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+  }
+}

diff --git a/test/integer_types.cpp b/test/integer_types.cpp
new file mode 100644
index 0000000..31f4100
--- /dev/null
+++ b/test/integer_types.cpp

@@ -0,0 +1,173 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+#undef VERIFY_IS_APPROX
+#define VERIFY_IS_APPROX(a, b) VERIFY((a)==(b));
+#undef VERIFY_IS_NOT_APPROX
+#define VERIFY_IS_NOT_APPROX(a, b) VERIFY((a)!=(b));
+
+template<typename MatrixType> void signed_integer_type_tests(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  enum { is_signed = (Scalar(-1) > Scalar(0)) ? 0 : 1 };
+  VERIFY(is_signed == 1);
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             mzero = MatrixType::Zero(rows, cols);
+
+  do {
+    m1 = MatrixType::Random(rows, cols);
+  } while(m1 == mzero || m1 == m2);
+
+  // check linear structure
+
+  Scalar s1;
+  do {
+    s1 = internal::random<Scalar>();
+  } while(s1 == 0);
+
+  VERIFY_IS_EQUAL(-(-m1),                  m1);
+  VERIFY_IS_EQUAL(-m2+m1+m2,               m1);
+  VERIFY_IS_EQUAL((-m1+m2)*s1,             -s1*m1+s1*m2);
+}
+
+template<typename MatrixType> void integer_type_tests(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  VERIFY(NumTraits<Scalar>::IsInteger);
+  enum { is_signed = (Scalar(-1) > Scalar(0)) ? 0 : 1 };
+  VERIFY(int(NumTraits<Scalar>::IsSigned) == is_signed);
+
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  // this test relies a lot on Random.h, and there's not much more that we can do
+  // to test it, hence I consider that we will have tested Random.h
+  MatrixType m1(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             mzero = MatrixType::Zero(rows, cols);
+
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  SquareMatrixType identity = SquareMatrixType::Identity(rows, rows),
+                   square = SquareMatrixType::Random(rows, rows);
+  VectorType v1(rows),
+             v2 = VectorType::Random(rows),
+             vzero = VectorType::Zero(rows);
+
+  do {
+    m1 = MatrixType::Random(rows, cols);
+  } while(m1 == mzero || m1 == m2);
+
+  do {
+    v1 = VectorType::Random(rows);
+  } while(v1 == vzero || v1 == v2);
+
+  VERIFY_IS_APPROX(               v1,    v1);
+  VERIFY_IS_NOT_APPROX(           v1,    2*v1);
+  VERIFY_IS_APPROX(               vzero, v1-v1);
+  VERIFY_IS_APPROX(               m1,    m1);
+  VERIFY_IS_NOT_APPROX(           m1,    2*m1);
+  VERIFY_IS_APPROX(               mzero, m1-m1);
+
+  VERIFY_IS_APPROX(m3 = m1,m1);
+  MatrixType m4;
+  VERIFY_IS_APPROX(m4 = m1,m1);
+
+  m3.real() = m1.real();
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(m3).real(), static_cast<const MatrixType&>(m1).real());
+  VERIFY_IS_APPROX(static_cast<const MatrixType&>(m3).real(), m1.real());
+
+  // check == / != operators
+  VERIFY(m1==m1);
+  VERIFY(m1!=m2);
+  VERIFY(!(m1==m2));
+  VERIFY(!(m1!=m1));
+  m1 = m2;
+  VERIFY(m1==m2);
+  VERIFY(!(m1!=m2));
+
+  // check linear structure
+
+  Scalar s1;
+  do {
+    s1 = internal::random<Scalar>();
+  } while(s1 == 0);
+
+  VERIFY_IS_EQUAL(m1+m1,                   2*m1);
+  VERIFY_IS_EQUAL(m1+m2-m1,                m2);
+  VERIFY_IS_EQUAL(m1*s1,                   s1*m1);
+  VERIFY_IS_EQUAL((m1+m2)*s1,              s1*m1+s1*m2);
+  m3 = m2; m3 += m1;
+  VERIFY_IS_EQUAL(m3,                      m1+m2);
+  m3 = m2; m3 -= m1;
+  VERIFY_IS_EQUAL(m3,                      m2-m1);
+  m3 = m2; m3 *= s1;
+  VERIFY_IS_EQUAL(m3,                      s1*m2);
+
+  // check matrix product.
+
+  VERIFY_IS_APPROX(identity * m1, m1);
+  VERIFY_IS_APPROX(square * (m1 + m2), square * m1 + square * m2);
+  VERIFY_IS_APPROX((m1 + m2).transpose() * square, m1.transpose() * square + m2.transpose() * square);
+  VERIFY_IS_APPROX((m1 * m2.transpose()) * m1, m1 * (m2.transpose() * m1));
+}
+
+template<int>
+void integer_types_extra()
+{
+  VERIFY_IS_EQUAL(int(internal::scalar_div_cost<int>::value), 8);
+  VERIFY_IS_EQUAL(int(internal::scalar_div_cost<unsigned int>::value), 8);
+  if(sizeof(long)>sizeof(int)) {
+    VERIFY(int(internal::scalar_div_cost<long>::value) > int(internal::scalar_div_cost<int>::value));
+    VERIFY(int(internal::scalar_div_cost<unsigned long>::value) > int(internal::scalar_div_cost<int>::value));
+  }
+}
+
+EIGEN_DECLARE_TEST(integer_types)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( integer_type_tests(Matrix<unsigned int, 1, 1>()) );
+    CALL_SUBTEST_1( integer_type_tests(Matrix<unsigned long, 3, 4>()) );
+
+    CALL_SUBTEST_2( integer_type_tests(Matrix<long, 2, 2>()) );
+    CALL_SUBTEST_2( signed_integer_type_tests(Matrix<long, 2, 2>()) );
+
+    CALL_SUBTEST_3( integer_type_tests(Matrix<char, 2, Dynamic>(2, 10)) );
+    CALL_SUBTEST_3( signed_integer_type_tests(Matrix<signed char, 2, Dynamic>(2, 10)) );
+
+    CALL_SUBTEST_4( integer_type_tests(Matrix<unsigned char, 3, 3>()) );
+    CALL_SUBTEST_4( integer_type_tests(Matrix<unsigned char, Dynamic, Dynamic>(20, 20)) );
+
+    CALL_SUBTEST_5( integer_type_tests(Matrix<short, Dynamic, 4>(7, 4)) );
+    CALL_SUBTEST_5( signed_integer_type_tests(Matrix<short, Dynamic, 4>(7, 4)) );
+
+    CALL_SUBTEST_6( integer_type_tests(Matrix<unsigned short, 4, 4>()) );
+
+#if EIGEN_HAS_CXX11
+    CALL_SUBTEST_7( integer_type_tests(Matrix<long long, 11, 13>()) );
+    CALL_SUBTEST_7( signed_integer_type_tests(Matrix<long long, 11, 13>()) );
+
+    CALL_SUBTEST_8( integer_type_tests(Matrix<unsigned long long, Dynamic, 5>(1, 5)) );
+#endif
+  }
+  CALL_SUBTEST_9( integer_types_extra<0>() );
+}

diff --git a/test/inverse.cpp b/test/inverse.cpp
new file mode 100644
index 0000000..9cedfa1
--- /dev/null
+++ b/test/inverse.cpp

@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+
+template<typename MatrixType>
+void inverse_for_fixed_size(const MatrixType&, typename internal::enable_if<MatrixType::SizeAtCompileTime==Dynamic>::type* = 0)
+{
+}
+
+template<typename MatrixType>
+void inverse_for_fixed_size(const MatrixType& m1, typename internal::enable_if<MatrixType::SizeAtCompileTime!=Dynamic>::type* = 0)
+{
+  using std::abs;
+
+  MatrixType m2, identity = MatrixType::Identity();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
+  
+  //computeInverseAndDetWithCheck tests
+  //First: an invertible matrix
+  bool invertible;
+  Scalar det;
+
+  m2.setZero();
+  m1.computeInverseAndDetWithCheck(m2, det, invertible);
+  VERIFY(invertible);
+  VERIFY_IS_APPROX(identity, m1*m2);
+  VERIFY_IS_APPROX(det, m1.determinant());
+
+  m2.setZero();
+  m1.computeInverseWithCheck(m2, invertible);
+  VERIFY(invertible);
+  VERIFY_IS_APPROX(identity, m1*m2);
+
+  //Second: a rank one matrix (not invertible, except for 1x1 matrices)
+  VectorType v3 = VectorType::Random();
+  MatrixType m3 = v3*v3.transpose(), m4;
+  m3.computeInverseAndDetWithCheck(m4, det, invertible);
+  VERIFY( m1.rows()==1 ? invertible : !invertible );
+  VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1));
+  m3.computeInverseWithCheck(m4, invertible);
+  VERIFY( m1.rows()==1 ? invertible : !invertible );
+  
+  // check with submatrices
+  {
+    Matrix<Scalar, MatrixType::RowsAtCompileTime+1, MatrixType::RowsAtCompileTime+1, MatrixType::Options> m5;
+    m5.setRandom();
+    m5.topLeftCorner(m1.rows(),m1.rows()) = m1;
+    m2 = m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>().inverse();
+    VERIFY_IS_APPROX( (m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>()), m2.inverse() );
+  }
+}
+
+template<typename MatrixType> void inverse(const MatrixType& m)
+{
+  /* this test covers the following files:
+     Inverse.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+
+  MatrixType m1(rows, cols),
+             m2(rows, cols),
+             identity = MatrixType::Identity(rows, rows);
+  createRandomPIMatrixOfRank(rows,rows,rows,m1);
+  m2 = m1.inverse();
+  VERIFY_IS_APPROX(m1, m2.inverse() );
+
+  VERIFY_IS_APPROX((Scalar(2)*m2).inverse(), m2.inverse()*Scalar(0.5));
+
+  VERIFY_IS_APPROX(identity, m1.inverse() * m1 );
+  VERIFY_IS_APPROX(identity, m1 * m1.inverse() );
+
+  VERIFY_IS_APPROX(m1, m1.inverse().inverse() );
+
+  // since for the general case we implement separately row-major and col-major, test that
+  VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose()));
+
+  inverse_for_fixed_size(m1);
+
+  // check in-place inversion
+  if(MatrixType::RowsAtCompileTime>=2 && MatrixType::RowsAtCompileTime<=4)
+  {
+    // in-place is forbidden
+    VERIFY_RAISES_ASSERT(m1 = m1.inverse());
+  }
+  else
+  {
+    m2 = m1.inverse();
+    m1 = m1.inverse();
+    VERIFY_IS_APPROX(m1,m2);
+  }
+}
+
+template<typename Scalar>
+void inverse_zerosized()
+{
+  Matrix<Scalar,Dynamic,Dynamic> A(0,0);
+  {
+    Matrix<Scalar,0,1> b, x;
+    x = A.inverse() * b;
+  }
+  {
+    Matrix<Scalar,Dynamic,Dynamic> b(0,1), x;
+    x = A.inverse() * b;
+    VERIFY_IS_EQUAL(x.rows(), 0);
+    VERIFY_IS_EQUAL(x.cols(), 1);
+  }
+}
+
+EIGEN_DECLARE_TEST(inverse)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( inverse(Matrix<double,1,1>()) );
+    CALL_SUBTEST_2( inverse(Matrix2d()) );
+    CALL_SUBTEST_3( inverse(Matrix3f()) );
+    CALL_SUBTEST_4( inverse(Matrix4f()) );
+    CALL_SUBTEST_4( inverse(Matrix<float,4,4,DontAlign>()) );
+    
+    s = internal::random<int>(50,320); 
+    CALL_SUBTEST_5( inverse(MatrixXf(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    CALL_SUBTEST_5( inverse_zerosized<float>() );
+    CALL_SUBTEST_5( inverse(MatrixXf(0, 0)) );
+    CALL_SUBTEST_5( inverse(MatrixXf(1, 1)) );
+    
+    s = internal::random<int>(25,100);
+    CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    CALL_SUBTEST_7( inverse(Matrix4d()) );
+    CALL_SUBTEST_7( inverse(Matrix<double,4,4,DontAlign>()) );
+
+    CALL_SUBTEST_8( inverse(Matrix4cd()) );
+  }
+}

diff --git a/test/io.cpp b/test/io.cpp
new file mode 100644
index 0000000..aa14e76
--- /dev/null
+++ b/test/io.cpp

@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Joel Holdsworth <joel.holdsworth@vcatechnology.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#include "main.h"
+
+template<typename Scalar>
+struct check_ostream_impl
+{
+  static void run()
+  {
+    const Array<Scalar,1,1> array(123);
+    std::ostringstream ss;
+    ss << array;
+    VERIFY(ss.str() == "123");
+
+    check_ostream_impl< std::complex<Scalar> >::run();
+  };
+};
+
+template<>
+struct check_ostream_impl<bool>
+{
+  static void run()
+  {
+    const Array<bool,1,2> array(1, 0);
+    std::ostringstream ss;
+    ss << array;
+    VERIFY(ss.str() == "1  0");
+  };
+};
+
+template<typename Scalar>
+struct check_ostream_impl< std::complex<Scalar> >
+{
+  static void run()
+  {
+    const Array<std::complex<Scalar>,1,1> array(std::complex<Scalar>(12, 34));
+    std::ostringstream ss;
+    ss << array;
+    VERIFY(ss.str() == "(12,34)");
+  };
+};
+
+template<typename Scalar>
+static void check_ostream()
+{
+  check_ostream_impl<Scalar>::run();
+}
+
+EIGEN_DECLARE_TEST(rand)
+{
+  CALL_SUBTEST(check_ostream<bool>());
+  CALL_SUBTEST(check_ostream<float>());
+  CALL_SUBTEST(check_ostream<double>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int8_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint8_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int16_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint16_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int32_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint32_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int64_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint64_t>());
+}

diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
new file mode 100644
index 0000000..23dd806
--- /dev/null
+++ b/test/is_same_dense.cpp

@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+using internal::is_same_dense;
+
+EIGEN_DECLARE_TEST(is_same_dense)
+{
+  typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
+  typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd;
+  ColMatrixXd m1(10,10);
+  ColMatrixXcd m2(10,10);
+  Ref<ColMatrixXd> ref_m1(m1);
+  Ref<ColMatrixXd,0, Stride<Dynamic,Dynamic> >  ref_m2_real(m2.real());
+  Ref<const ColMatrixXd> const_ref_m1(m1);
+
+  VERIFY(is_same_dense(m1,m1));
+  VERIFY(is_same_dense(m1,ref_m1));
+  VERIFY(is_same_dense(const_ref_m1,m1));
+  VERIFY(is_same_dense(const_ref_m1,ref_m1));
+  
+  VERIFY(is_same_dense(m1.block(0,0,m1.rows(),m1.cols()),m1));
+  VERIFY(!is_same_dense(m1.row(0),m1.col(0)));
+  
+  Ref<const ColMatrixXd> const_ref_m1_row(m1.row(1));
+  VERIFY(!is_same_dense(m1.row(1),const_ref_m1_row));
+  
+  Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
+  VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+
+
+  VERIFY(!is_same_dense(m1, ref_m2_real));
+  VERIFY(!is_same_dense(m2, ref_m2_real));
+}

diff --git a/test/jacobi.cpp b/test/jacobi.cpp
new file mode 100644
index 0000000..5604797
--- /dev/null
+++ b/test/jacobi.cpp

@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/SVD>
+
+template<typename MatrixType, typename JacobiScalar>
+void jacobi(const MatrixType& m = MatrixType())
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<JacobiScalar, 2, 1> JacobiVector;
+
+  const MatrixType a(MatrixType::Random(rows, cols));
+
+  JacobiVector v = JacobiVector::Random().normalized();
+  JacobiScalar c = v.x(), s = v.y();
+  JacobiRotation<JacobiScalar> rot(c, s);
+
+  {
+    Index p = internal::random<Index>(0, rows-1);
+    Index q;
+    do {
+      q = internal::random<Index>(0, rows-1);
+    } while (q == p);
+
+    MatrixType b = a;
+    b.applyOnTheLeft(p, q, rot);
+    VERIFY_IS_APPROX(b.row(p), c * a.row(p) + numext::conj(s) * a.row(q));
+    VERIFY_IS_APPROX(b.row(q), -s * a.row(p) + numext::conj(c) * a.row(q));
+  }
+
+  {
+    Index p = internal::random<Index>(0, cols-1);
+    Index q;
+    do {
+      q = internal::random<Index>(0, cols-1);
+    } while (q == p);
+
+    MatrixType b = a;
+    b.applyOnTheRight(p, q, rot);
+    VERIFY_IS_APPROX(b.col(p), c * a.col(p) - s * a.col(q));
+    VERIFY_IS_APPROX(b.col(q), numext::conj(s) * a.col(p) + numext::conj(c) * a.col(q));
+  }
+}
+
+EIGEN_DECLARE_TEST(jacobi)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( jacobi<Matrix3f, float>() ));
+    CALL_SUBTEST_2(( jacobi<Matrix4d, double>() ));
+    CALL_SUBTEST_3(( jacobi<Matrix4cf, float>() ));
+    CALL_SUBTEST_3(( jacobi<Matrix4cf, std::complex<float> >() ));
+
+    int r = internal::random<int>(2, internal::random<int>(1,EIGEN_TEST_MAX_SIZE)/2),
+        c = internal::random<int>(2, internal::random<int>(1,EIGEN_TEST_MAX_SIZE)/2);
+    CALL_SUBTEST_4(( jacobi<MatrixXf, float>(MatrixXf(r,c)) ));
+    CALL_SUBTEST_5(( jacobi<MatrixXcd, double>(MatrixXcd(r,c)) ));
+    CALL_SUBTEST_5(( jacobi<MatrixXcd, std::complex<double> >(MatrixXcd(r,c)) ));
+    // complex<float> is really important to test as it is the only way to cover conjugation issues in certain unaligned paths
+    CALL_SUBTEST_6(( jacobi<MatrixXcf, float>(MatrixXcf(r,c)) ));
+    CALL_SUBTEST_6(( jacobi<MatrixXcf, std::complex<float> >(MatrixXcf(r,c)) ));
+    
+    TEST_SET_BUT_UNUSED_VARIABLE(r);
+    TEST_SET_BUT_UNUSED_VARIABLE(c);
+  }
+}

diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
new file mode 100644
index 0000000..5b15c5a
--- /dev/null
+++ b/test/jacobisvd.cpp

@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// discard stack allocation as that too bypasses malloc
+#define EIGEN_STACK_ALLOCATION_LIMIT 0
+#define EIGEN_RUNTIME_NO_MALLOC
+#include "main.h"
+#include <Eigen/SVD>
+
+#define SVD_DEFAULT(M) JacobiSVD<M>
+#define SVD_FOR_MIN_NORM(M) JacobiSVD<M,ColPivHouseholderQRPreconditioner>
+#include "svd_common.h"
+
+// Check all variants of JacobiSVD
+template<typename MatrixType>
+void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
+{
+  MatrixType m = a;
+  if(pickrandom)
+    svd_fill_random(m);
+
+  CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> >(m, true)  )); // check full only
+  CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>  >(m, false) ));
+  CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, HouseholderQRPreconditioner>        >(m, false) ));
+  if(m.rows()==m.cols())
+    CALL_SUBTEST(( svd_test_all_computation_options<JacobiSVD<MatrixType, NoQRPreconditioner>               >(m, false) ));
+}
+
+template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
+{
+  svd_verify_assert<JacobiSVD<MatrixType> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> >(m, true);
+  svd_verify_assert<JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, HouseholderQRPreconditioner> >(m);
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+
+  MatrixType a = MatrixType::Zero(rows, cols);
+  a.setZero();
+
+  if (ColsAtCompileTime == Dynamic)
+  {
+    JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> svd_fullqr;
+    VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeFullU|ComputeThinV))
+    VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeThinV))
+    VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeFullV))
+  }
+}
+
+template<typename MatrixType>
+void jacobisvd_method()
+{
+  enum { Size = MatrixType::RowsAtCompileTime };
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<RealScalar, Size, 1> RealVecType;
+  MatrixType m = MatrixType::Identity();
+  VERIFY_IS_APPROX(m.jacobiSvd().singularValues(), RealVecType::Ones());
+  VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixU());
+  VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixV());
+  VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m);
+  VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m);
+  VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
+}
+
+namespace Foo {
+// older compiler require a default constructor for Bar
+// cf: https://stackoverflow.com/questions/7411515/
+class Bar {public: Bar() {}};
+bool operator<(const Bar&, const Bar&) { return true; }
+}
+// regression test for a very strange MSVC issue for which simply
+// including SVDBase.h messes up with std::max and custom scalar type
+void msvc_workaround()
+{
+  const Foo::Bar a;
+  const Foo::Bar b;
+  std::max EIGEN_NOT_A_MACRO (a,b);
+}
+
+EIGEN_DECLARE_TEST(jacobisvd)
+{
+  CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) ));
+  CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) ));
+  CALL_SUBTEST_7(( jacobisvd_verify_assert(MatrixXf(10,12)) ));
+  CALL_SUBTEST_8(( jacobisvd_verify_assert(MatrixXcd(7,5)) ));
+  
+  CALL_SUBTEST_11(svd_all_trivial_2x2(jacobisvd<Matrix2cd>));
+  CALL_SUBTEST_12(svd_all_trivial_2x2(jacobisvd<Matrix2d>));
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_3(( jacobisvd<Matrix3f>() ));
+    CALL_SUBTEST_4(( jacobisvd<Matrix4d>() ));
+    CALL_SUBTEST_5(( jacobisvd<Matrix<float,3,5> >() ));
+    CALL_SUBTEST_6(( jacobisvd<Matrix<double,Dynamic,2> >(Matrix<double,Dynamic,2>(10,2)) ));
+
+    int r = internal::random<int>(1, 30),
+        c = internal::random<int>(1, 30);
+    
+    TEST_SET_BUT_UNUSED_VARIABLE(r)
+    TEST_SET_BUT_UNUSED_VARIABLE(c)
+    
+    CALL_SUBTEST_10(( jacobisvd<MatrixXd>(MatrixXd(r,c)) ));
+    CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(r,c)) ));
+    CALL_SUBTEST_8(( jacobisvd<MatrixXcd>(MatrixXcd(r,c)) ));
+    (void) r;
+    (void) c;
+
+    // Test on inf/nan matrix
+    CALL_SUBTEST_7(  (svd_inf_nan<JacobiSVD<MatrixXf>, MatrixXf>()) );
+    CALL_SUBTEST_10( (svd_inf_nan<JacobiSVD<MatrixXd>, MatrixXd>()) );
+
+    // bug1395 test compile-time vectors as input
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,6,1>()) ));
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,6>()) ));
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,Dynamic,1>(r)) ));
+    CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,Dynamic>(c)) ));
+  }
+
+  CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+  CALL_SUBTEST_8(( jacobisvd<MatrixXcd>(MatrixXcd(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3))) ));
+
+  // test matrixbase method
+  CALL_SUBTEST_1(( jacobisvd_method<Matrix2cd>() ));
+  CALL_SUBTEST_3(( jacobisvd_method<Matrix3f>() ));
+
+  // Test problem size constructors
+  CALL_SUBTEST_7( JacobiSVD<MatrixXf>(10,10) );
+
+  // Check that preallocation avoids subsequent mallocs
+  CALL_SUBTEST_9( svd_preallocate<void>() );
+
+  CALL_SUBTEST_2( svd_underoverflow<void>() );
+
+  msvc_workaround();
+}

diff --git a/test/klu_support.cpp b/test/klu_support.cpp
new file mode 100644
index 0000000..f806ad5
--- /dev/null
+++ b/test/klu_support.cpp

@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#include <Eigen/KLUSupport>
+
+template<typename T> void test_klu_support_T()
+{
+  KLU<SparseMatrix<T, ColMajor> > klu_colmajor;
+  KLU<SparseMatrix<T, RowMajor> > klu_rowmajor;
+  
+  check_sparse_square_solving(klu_colmajor);
+  check_sparse_square_solving(klu_rowmajor);
+  
+  //check_sparse_square_determinant(umfpack_colmajor);
+  //check_sparse_square_determinant(umfpack_rowmajor);
+}
+
+EIGEN_DECLARE_TEST(klu_support)
+{
+  CALL_SUBTEST_1(test_klu_support_T<double>());
+  CALL_SUBTEST_2(test_klu_support_T<std::complex<double> >());
+}
+

diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
new file mode 100644
index 0000000..46ee516
--- /dev/null
+++ b/test/linearstructure.cpp

@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+static bool g_called;
+#define EIGEN_SCALAR_BINARY_OP_PLUGIN { g_called |= (!internal::is_same<LhsScalar,RhsScalar>::value); }
+
+#include "main.h"
+
+template<typename MatrixType> void linearStructure(const MatrixType& m)
+{
+  using std::abs;
+  /* this test covers the following files:
+     CwiseUnaryOp.h, CwiseBinaryOp.h, SelfCwiseBinaryOp.h 
+  */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  // this test relies a lot on Random.h, and there's not much more that we can do
+  // to test it, hence I consider that we will have tested Random.h
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+
+  Scalar s1 = internal::random<Scalar>();
+  while (abs(s1)<RealScalar(1e-3)) s1 = internal::random<Scalar>();
+
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  VERIFY_IS_APPROX(-(-m1),                  m1);
+  VERIFY_IS_APPROX(m1+m1,                   2*m1);
+  VERIFY_IS_APPROX(m1+m2-m1,                m2);
+  VERIFY_IS_APPROX(-m2+m1+m2,               m1);
+  VERIFY_IS_APPROX(m1*s1,                   s1*m1);
+  VERIFY_IS_APPROX((m1+m2)*s1,              s1*m1+s1*m2);
+  VERIFY_IS_APPROX((-m1+m2)*s1,             -s1*m1+s1*m2);
+  m3 = m2; m3 += m1;
+  VERIFY_IS_APPROX(m3,                      m1+m2);
+  m3 = m2; m3 -= m1;
+  VERIFY_IS_APPROX(m3,                      m2-m1);
+  m3 = m2; m3 *= s1;
+  VERIFY_IS_APPROX(m3,                      s1*m2);
+  if(!NumTraits<Scalar>::IsInteger)
+  {
+    m3 = m2; m3 /= s1;
+    VERIFY_IS_APPROX(m3,                    m2/s1);
+  }
+
+  // again, test operator() to check const-qualification
+  VERIFY_IS_APPROX((-m1)(r,c), -(m1(r,c)));
+  VERIFY_IS_APPROX((m1-m2)(r,c), (m1(r,c))-(m2(r,c)));
+  VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c)));
+  VERIFY_IS_APPROX((s1*m1)(r,c), s1*(m1(r,c)));
+  VERIFY_IS_APPROX((m1*s1)(r,c), (m1(r,c))*s1);
+  if(!NumTraits<Scalar>::IsInteger)
+    VERIFY_IS_APPROX((m1/s1)(r,c), (m1(r,c))/s1);
+
+  // use .block to disable vectorization and compare to the vectorized version
+  VERIFY_IS_APPROX(m1+m1.block(0,0,rows,cols), m1+m1);
+  VERIFY_IS_APPROX(m1.cwiseProduct(m1.block(0,0,rows,cols)), m1.cwiseProduct(m1));
+  VERIFY_IS_APPROX(m1 - m1.block(0,0,rows,cols), m1 - m1);
+  VERIFY_IS_APPROX(m1.block(0,0,rows,cols) * s1, m1 * s1);
+}
+
+// Make sure that complex * real and real * complex are properly optimized
+template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::RowsAtCompileTime, DenseIndex cols = MatrixType::ColsAtCompileTime)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  
+  RealScalar s = internal::random<RealScalar>();
+  MatrixType m1 = MatrixType::Random(rows, cols);
+  
+  g_called = false;
+  VERIFY_IS_APPROX(s*m1, Scalar(s)*m1);
+  VERIFY(g_called && "real * matrix<complex> not properly optimized");
+  
+  g_called = false;
+  VERIFY_IS_APPROX(m1*s, m1*Scalar(s));
+  VERIFY(g_called && "matrix<complex> * real not properly optimized");
+  
+  g_called = false;
+  VERIFY_IS_APPROX(m1/s, m1/Scalar(s));
+  VERIFY(g_called && "matrix<complex> / real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(s+m1.array(), Scalar(s)+m1.array());
+  VERIFY(g_called && "real + matrix<complex> not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1.array()+s, m1.array()+Scalar(s));
+  VERIFY(g_called && "matrix<complex> + real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(s-m1.array(), Scalar(s)-m1.array());
+  VERIFY(g_called && "real - matrix<complex> not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1.array()-s, m1.array()-Scalar(s));
+  VERIFY(g_called && "matrix<complex> - real not properly optimized");
+}
+
+template<int>
+void linearstructure_overflow()
+{
+  // make sure that /=scalar and /scalar do not overflow
+  // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not
+  Matrix4d m2, m3;
+  m3 = m2 =  Matrix4d::Random()*1e-20;
+  m2 = m2 / 4.9e-320;
+  VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
+  m3 /= 4.9e-320;
+  VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
+}
+
+EIGEN_DECLARE_TEST(linearstructure)
+{
+  g_called = true;
+  VERIFY(g_called); // avoid `unneeded-internal-declaration` warning.
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( linearStructure(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( linearStructure(Matrix2f()) );
+    CALL_SUBTEST_3( linearStructure(Vector3d()) );
+    CALL_SUBTEST_4( linearStructure(Matrix4d()) );
+    CALL_SUBTEST_5( linearStructure(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_6( linearStructure(MatrixXf (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( linearStructure(MatrixXi (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_8( linearStructure(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_9( linearStructure(ArrayXXf (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_10( linearStructure(ArrayXXcf (internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    
+    CALL_SUBTEST_11( real_complex<Matrix4cd>() );
+    CALL_SUBTEST_11( real_complex<MatrixXcf>(10,10) );
+    CALL_SUBTEST_11( real_complex<ArrayXXcf>(10,10) );
+  }
+  CALL_SUBTEST_4( linearstructure_overflow<0>() );
+}

diff --git a/test/lscg.cpp b/test/lscg.cpp
new file mode 100644
index 0000000..feb2347
--- /dev/null
+++ b/test/lscg.cpp

@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+
+template<typename T> void test_lscg_T()
+{
+  LeastSquaresConjugateGradient<SparseMatrix<T> > lscg_colmajor_diag;
+  LeastSquaresConjugateGradient<SparseMatrix<T>, IdentityPreconditioner> lscg_colmajor_I;
+  LeastSquaresConjugateGradient<SparseMatrix<T,RowMajor> > lscg_rowmajor_diag;
+  LeastSquaresConjugateGradient<SparseMatrix<T,RowMajor>, IdentityPreconditioner> lscg_rowmajor_I;
+
+  CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I)     );
+  
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I)     );
+
+  CALL_SUBTEST( check_sparse_square_solving(lscg_rowmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(lscg_rowmajor_I)     );
+
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_diag)  );
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_I)     );
+}
+
+EIGEN_DECLARE_TEST(lscg)
+{
+  CALL_SUBTEST_1(test_lscg_T<double>());
+  CALL_SUBTEST_2(test_lscg_T<std::complex<double> >());
+}

diff --git a/test/lu.cpp b/test/lu.cpp
new file mode 100644
index 0000000..1bbadcb
--- /dev/null
+++ b/test/lu.cpp

@@ -0,0 +1,252 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+#include "solverbase.h"
+using namespace std;
+
+template<typename MatrixType>
+typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
+  return m.cwiseAbs().colwise().sum().maxCoeff();
+}
+
+template<typename MatrixType> void lu_non_invertible()
+{
+  STATIC_CHECK(( internal::is_same<typename FullPivLU<MatrixType>::StorageIndex,int>::value ));
+
+  typedef typename MatrixType::RealScalar RealScalar;
+  /* this test covers the following files:
+     LU.h
+  */
+  Index rows, cols, cols2;
+  if(MatrixType::RowsAtCompileTime==Dynamic)
+  {
+    rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
+  }
+  else
+  {
+    rows = MatrixType::RowsAtCompileTime;
+  }
+  if(MatrixType::ColsAtCompileTime==Dynamic)
+  {
+    cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
+    cols2 = internal::random<int>(2,EIGEN_TEST_MAX_SIZE);
+  }
+  else
+  {
+    cols2 = cols = MatrixType::ColsAtCompileTime;
+  }
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+  typedef typename internal::kernel_retval_base<FullPivLU<MatrixType> >::ReturnType KernelMatrixType;
+  typedef typename internal::image_retval_base<FullPivLU<MatrixType> >::ReturnType ImageMatrixType;
+  typedef Matrix<typename MatrixType::Scalar, ColsAtCompileTime, ColsAtCompileTime>
+          CMatrixType;
+  typedef Matrix<typename MatrixType::Scalar, RowsAtCompileTime, RowsAtCompileTime>
+          RMatrixType;
+
+  Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
+
+  // The image of the zero matrix should consist of a single (zero) column vector
+  VERIFY((MatrixType::Zero(rows,cols).fullPivLu().image(MatrixType::Zero(rows,cols)).cols() == 1));
+
+  // The kernel of the zero matrix is the entire space, and thus is an invertible matrix of dimensions cols.
+  KernelMatrixType kernel = MatrixType::Zero(rows,cols).fullPivLu().kernel();
+  VERIFY((kernel.fullPivLu().isInvertible()));
+
+  MatrixType m1(rows, cols), m3(rows, cols2);
+  CMatrixType m2(cols, cols2);
+  createRandomPIMatrixOfRank(rank, rows, cols, m1);
+
+  FullPivLU<MatrixType> lu;
+
+  // The special value 0.01 below works well in tests. Keep in mind that we're only computing the rank
+  // of singular values are either 0 or 1.
+  // So it's not clear at all that the epsilon should play any role there.
+  lu.setThreshold(RealScalar(0.01));
+  lu.compute(m1);
+
+  MatrixType u(rows,cols);
+  u = lu.matrixLU().template triangularView<Upper>();
+  RMatrixType l = RMatrixType::Identity(rows,rows);
+  l.block(0,0,rows,(std::min)(rows,cols)).template triangularView<StrictlyLower>()
+    = lu.matrixLU().block(0,0,rows,(std::min)(rows,cols));
+
+  VERIFY_IS_APPROX(lu.permutationP() * m1 * lu.permutationQ(), l*u);
+
+  KernelMatrixType m1kernel = lu.kernel();
+  ImageMatrixType m1image = lu.image(m1);
+
+  VERIFY_IS_APPROX(m1, lu.reconstructedMatrix());
+  VERIFY(rank == lu.rank());
+  VERIFY(cols - lu.rank() == lu.dimensionOfKernel());
+  VERIFY(!lu.isInjective());
+  VERIFY(!lu.isInvertible());
+  VERIFY(!lu.isSurjective());
+  VERIFY_IS_MUCH_SMALLER_THAN((m1 * m1kernel), m1);
+  VERIFY(m1image.fullPivLu().rank() == rank);
+  VERIFY_IS_APPROX(m1 * m1.adjoint() * m1image, m1image);
+
+  check_solverbase<CMatrixType, MatrixType>(m1, lu, rows, cols, cols2);
+
+  m2 = CMatrixType::Random(cols,cols2);
+  m3 = m1*m2;
+  m2 = CMatrixType::Random(cols,cols2);
+  // test that the code, which does resize(), may be applied to an xpr
+  m2.block(0,0,m2.rows(),m2.cols()) = lu.solve(m3);
+  VERIFY_IS_APPROX(m3, m1*m2);
+}
+
+template<typename MatrixType> void lu_invertible()
+{
+  /* this test covers the following files:
+     FullPivLU.h
+  */
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  Index size = MatrixType::RowsAtCompileTime;
+  if( size==Dynamic)
+    size = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+
+  MatrixType m1(size, size), m2(size, size), m3(size, size);
+  FullPivLU<MatrixType> lu;
+  lu.setThreshold(RealScalar(0.01));
+  do {
+    m1 = MatrixType::Random(size,size);
+    lu.compute(m1);
+  } while(!lu.isInvertible());
+
+  VERIFY_IS_APPROX(m1, lu.reconstructedMatrix());
+  VERIFY(0 == lu.dimensionOfKernel());
+  VERIFY(lu.kernel().cols() == 1); // the kernel() should consist of a single (zero) column vector
+  VERIFY(size == lu.rank());
+  VERIFY(lu.isInjective());
+  VERIFY(lu.isSurjective());
+  VERIFY(lu.isInvertible());
+  VERIFY(lu.image(m1).fullPivLu().isInvertible());
+
+  check_solverbase<MatrixType, MatrixType>(m1, lu, size, size, size);
+
+  MatrixType m1_inverse = lu.inverse();
+  m3 = MatrixType::Random(size,size);
+  m2 = lu.solve(m3);
+  VERIFY_IS_APPROX(m2, m1_inverse*m3);
+
+  RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
+  const RealScalar rcond_est = lu.rcond();
+  // Verify that the estimated condition number is within a factor of 10 of the
+  // truth.
+  VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+  // Regression test for Bug 302
+  MatrixType m4 = MatrixType::Random(size,size);
+  VERIFY_IS_APPROX(lu.solve(m3*m4), lu.solve(m3)*m4);
+}
+
+template<typename MatrixType> void lu_partial_piv(Index size = MatrixType::ColsAtCompileTime)
+{
+  /* this test covers the following files:
+     PartialPivLU.h
+  */
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+
+  MatrixType m1(size, size), m2(size, size), m3(size, size);
+  m1.setRandom();
+  PartialPivLU<MatrixType> plu(m1);
+
+  STATIC_CHECK(( internal::is_same<typename PartialPivLU<MatrixType>::StorageIndex,int>::value ));
+
+  VERIFY_IS_APPROX(m1, plu.reconstructedMatrix());
+
+  check_solverbase<MatrixType, MatrixType>(m1, plu, size, size, size);
+
+  MatrixType m1_inverse = plu.inverse();
+  m3 = MatrixType::Random(size,size);
+  m2 = plu.solve(m3);
+  VERIFY_IS_APPROX(m2, m1_inverse*m3);
+
+  RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
+  const RealScalar rcond_est = plu.rcond();
+  // Verify that the estimate is within a factor of 10 of the truth.
+  VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+}
+
+template<typename MatrixType> void lu_verify_assert()
+{
+  MatrixType tmp;
+
+  FullPivLU<MatrixType> lu;
+  VERIFY_RAISES_ASSERT(lu.matrixLU())
+  VERIFY_RAISES_ASSERT(lu.permutationP())
+  VERIFY_RAISES_ASSERT(lu.permutationQ())
+  VERIFY_RAISES_ASSERT(lu.kernel())
+  VERIFY_RAISES_ASSERT(lu.image(tmp))
+  VERIFY_RAISES_ASSERT(lu.solve(tmp))
+  VERIFY_RAISES_ASSERT(lu.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(lu.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(lu.determinant())
+  VERIFY_RAISES_ASSERT(lu.rank())
+  VERIFY_RAISES_ASSERT(lu.dimensionOfKernel())
+  VERIFY_RAISES_ASSERT(lu.isInjective())
+  VERIFY_RAISES_ASSERT(lu.isSurjective())
+  VERIFY_RAISES_ASSERT(lu.isInvertible())
+  VERIFY_RAISES_ASSERT(lu.inverse())
+
+  PartialPivLU<MatrixType> plu;
+  VERIFY_RAISES_ASSERT(plu.matrixLU())
+  VERIFY_RAISES_ASSERT(plu.permutationP())
+  VERIFY_RAISES_ASSERT(plu.solve(tmp))
+  VERIFY_RAISES_ASSERT(plu.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(plu.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(plu.determinant())
+  VERIFY_RAISES_ASSERT(plu.inverse())
+}
+
+EIGEN_DECLARE_TEST(lu)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( lu_non_invertible<Matrix3f>() );
+    CALL_SUBTEST_1( lu_invertible<Matrix3f>() );
+    CALL_SUBTEST_1( lu_verify_assert<Matrix3f>() );
+    CALL_SUBTEST_1( lu_partial_piv<Matrix3f>() );
+
+    CALL_SUBTEST_2( (lu_non_invertible<Matrix<double, 4, 6> >()) );
+    CALL_SUBTEST_2( (lu_verify_assert<Matrix<double, 4, 6> >()) );
+    CALL_SUBTEST_2( lu_partial_piv<Matrix2d>() );
+    CALL_SUBTEST_2( lu_partial_piv<Matrix4d>() );
+    CALL_SUBTEST_2( (lu_partial_piv<Matrix<double,6,6> >()) );
+
+    CALL_SUBTEST_3( lu_non_invertible<MatrixXf>() );
+    CALL_SUBTEST_3( lu_invertible<MatrixXf>() );
+    CALL_SUBTEST_3( lu_verify_assert<MatrixXf>() );
+
+    CALL_SUBTEST_4( lu_non_invertible<MatrixXd>() );
+    CALL_SUBTEST_4( lu_invertible<MatrixXd>() );
+    CALL_SUBTEST_4( lu_partial_piv<MatrixXd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) );
+    CALL_SUBTEST_4( lu_verify_assert<MatrixXd>() );
+
+    CALL_SUBTEST_5( lu_non_invertible<MatrixXcf>() );
+    CALL_SUBTEST_5( lu_invertible<MatrixXcf>() );
+    CALL_SUBTEST_5( lu_verify_assert<MatrixXcf>() );
+
+    CALL_SUBTEST_6( lu_non_invertible<MatrixXcd>() );
+    CALL_SUBTEST_6( lu_invertible<MatrixXcd>() );
+    CALL_SUBTEST_6( lu_partial_piv<MatrixXcd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) );
+    CALL_SUBTEST_6( lu_verify_assert<MatrixXcd>() );
+
+    CALL_SUBTEST_7(( lu_non_invertible<Matrix<float,Dynamic,16> >() ));
+
+    // Test problem size constructors
+    CALL_SUBTEST_9( PartialPivLU<MatrixXf>(10) );
+    CALL_SUBTEST_9( FullPivLU<MatrixXf>(10, 20); );
+  }
+}

diff --git a/test/main.h b/test/main.h
new file mode 100644
index 0000000..07f3794
--- /dev/null
+++ b/test/main.h

@@ -0,0 +1,857 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <cstdlib>
+#include <cerrno>
+#include <ctime>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <sstream>
+#include <vector>
+#include <typeinfo>
+#include <functional>
+
+// The following includes of STL headers have to be done _before_ the
+// definition of macros min() and max().  The reason is that many STL
+// implementations will not work properly as the min and max symbols collide
+// with the STL functions std:min() and std::max().  The STL headers may check
+// for the macro definition of min/max and issue a warning or undefine the
+// macros.
+//
+// Still, Windows defines min() and max() in windef.h as part of the regular
+// Windows system interfaces and many other Windows APIs depend on these
+// macros being available.  To prevent the macro expansion of min/max and to
+// make Eigen compatible with the Windows environment all function calls of
+// std::min() and std::max() have to be written with parenthesis around the
+// function name.
+//
+// All STL headers used by Eigen should be included here.  Because main.h is
+// included before any Eigen header and because the STL headers are guarded
+// against multiple inclusions, no STL header will see our own min/max macro
+// definitions.
+#include <limits>
+#include <algorithm>
+// Disable ICC's std::complex operator specializations so we can use our own.
+#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
+#include <complex>
+#include <deque>
+#include <queue>
+#include <cassert>
+#include <list>
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
+#include <random>
+#include <chrono>
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+#endif
+
+// Same for cuda_fp16.h
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+  #define EIGEN_CUDA_SDK_VER 0
+#endif
+#if EIGEN_CUDA_SDK_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
+// To test that all calls from Eigen code to std::min() and std::max() are
+// protected by parenthesis against macro expansion, the min()/max() macros
+// are defined here and any not-parenthesized min/max call will cause a
+// compiler error.
+#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL)
+  //
+  // HIP header files include the following files
+  //  <thread>
+  //  <regex>
+  //  <unordered_map>
+  // which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail
+  //
+  // Including those header files before the following macro definition for "min" / "max", only partially resolves the issue
+  // This is because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other
+  // headers.
+  //
+  // So instead choosing to simply disable this check for HIP
+  //
+  #define min(A,B) please_protect_your_min_with_parentheses
+  #define max(A,B) please_protect_your_max_with_parentheses
+  #define isnan(X) please_protect_your_isnan_with_parentheses
+  #define isinf(X) please_protect_your_isinf_with_parentheses
+  #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#endif
+
+
+// test possible conflicts
+struct real {};
+struct imag {};
+
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI please_use_EIGEN_PI_instead_of_M_PI
+
+#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
+// B0 is defined in POSIX header termios.h
+#define B0 FORBIDDEN_IDENTIFIER
+// `I` may be defined by complex.h:
+#define I  FORBIDDEN_IDENTIFIER
+
+// Unit tests calling Eigen's blas library must preserve the default blocking size
+// to avoid troubles.
+#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#define EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+#endif
+
+// shuts down ICC's remark #593: variable "XXX" was set but never used
+#define TEST_SET_BUT_UNUSED_VARIABLE(X) EIGEN_UNUSED_VARIABLE(X)
+
+#ifdef TEST_ENABLE_TEMPORARY_TRACKING
+
+static long int nb_temporaries;
+static long int nb_temporaries_on_assert = -1;
+
+inline void on_temporary_creation(long int size) {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  if(size!=0) nb_temporaries++;
+  if(nb_temporaries_on_assert>0) assert(nb_temporaries<nb_temporaries_on_assert);
+}
+
+#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); }
+
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    XPR; \
+    if(nb_temporaries!=(N)) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
+    VERIFY( (#XPR) && nb_temporaries==(N) ); \
+  }
+
+#endif
+
+#include "split_test_helper.h"
+
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+// On windows CE, NDEBUG is automatically defined <assert.h> if NDEBUG is not defined.
+#ifndef DEBUG
+#define DEBUG
+#endif
+
+// bounds integer values for AltiVec
+#if defined(__ALTIVEC__) || defined(__VSX__)
+#define EIGEN_MAKING_DOCS
+#endif
+
+#define DEFAULT_REPEAT 10
+
+namespace Eigen
+{
+  static std::vector<std::string> g_test_stack;
+  // level == 0 <=> abort if test fail
+  // level >= 1 <=> warning message to std::cerr if test fail
+  static int g_test_level = 0;
+  static int g_repeat = 1;
+  static unsigned int g_seed = 0;
+  static bool g_has_set_repeat = false, g_has_set_seed = false;
+
+  class EigenTest
+  {
+  public:
+    EigenTest() : m_func(0) {}
+    EigenTest(const char* a_name, void (*func)(void))
+      : m_name(a_name), m_func(func)
+    {
+      get_registered_tests().push_back(this);
+    }
+    const std::string& name() const { return m_name; }
+    void operator()() const { m_func(); }
+
+    static const std::vector<EigenTest*>& all() { return get_registered_tests(); }
+  protected:
+    static std::vector<EigenTest*>& get_registered_tests()
+    {
+      static std::vector<EigenTest*>* ms_registered_tests = new std::vector<EigenTest*>();
+      return *ms_registered_tests;
+    }
+    std::string m_name;
+    void (*m_func)(void);
+  };
+
+  // Declare and register a test, e.g.:
+  //    EIGEN_DECLARE_TEST(mytest) { ... }
+  // will create a function:
+  //    void test_mytest() { ... }
+  // that will be automatically called.
+  #define EIGEN_DECLARE_TEST(X) \
+    void EIGEN_CAT(test_,X) (); \
+    static EigenTest EIGEN_CAT(test_handler_,X) (EIGEN_MAKESTRING(X), & EIGEN_CAT(test_,X)); \
+    void EIGEN_CAT(test_,X) ()
+}
+
+#define TRACK std::cerr << __FILE__ << " " << __LINE__ << std::endl
+// #define TRACK while()
+
+#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, "  ", "\n", "", "", "", "")
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+#ifndef EIGEN_NO_ASSERTION_CHECKING
+
+  namespace Eigen
+  {
+    static const bool should_raise_an_assert = false;
+
+    // Used to avoid to raise two exceptions at a time in which
+    // case the exception is not properly caught.
+    // This may happen when a second exceptions is triggered in a destructor.
+    static bool no_more_assert = false;
+    static bool report_on_cerr_on_assert_failure = true;
+
+    struct eigen_assert_exception
+    {
+      eigen_assert_exception(void) {}
+      ~eigen_assert_exception() { Eigen::no_more_assert = false; }
+    };
+
+    struct eigen_static_assert_exception
+    {
+      eigen_static_assert_exception(void) {}
+      ~eigen_static_assert_exception() { Eigen::no_more_assert = false; }
+    };
+  }
+  // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while
+  // one should have been, then the list of executed assertions is printed out.
+  //
+  // EIGEN_DEBUG_ASSERTS is not enabled by default as it
+  // significantly increases the compilation time
+  // and might even introduce side effects that would hide
+  // some memory errors.
+  #ifdef EIGEN_DEBUG_ASSERTS
+
+    namespace Eigen
+    {
+      namespace internal
+      {
+        static bool push_assert = false;
+      }
+      static std::vector<std::string> eigen_assert_list;
+    }
+    #define eigen_assert(a)                       \
+      if( (!(a)) && (!no_more_assert) )     \
+      { \
+        if(report_on_cerr_on_assert_failure) \
+          std::cerr <<  #a << " " __FILE__ << "(" << __LINE__ << ")\n"; \
+        Eigen::no_more_assert = true;       \
+        EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
+      }                                     \
+      else if (Eigen::internal::push_assert)       \
+      {                                     \
+        eigen_assert_list.push_back(std::string(EIGEN_MAKESTRING(__FILE__) " (" EIGEN_MAKESTRING(__LINE__) ") : " #a) ); \
+      }
+
+    #ifdef EIGEN_EXCEPTIONS
+    #define VERIFY_RAISES_ASSERT(a)                                                   \
+      {                                                                               \
+        Eigen::no_more_assert = false;                                                \
+        Eigen::eigen_assert_list.clear();                                             \
+        Eigen::internal::push_assert = true;                                          \
+        Eigen::report_on_cerr_on_assert_failure = false;                              \
+        try {                                                                         \
+          a;                                                                          \
+          std::cerr << "One of the following asserts should have been triggered:\n";  \
+          for (uint ai=0 ; ai<eigen_assert_list.size() ; ++ai)                        \
+            std::cerr << "  " << eigen_assert_list[ai] << "\n";                       \
+          VERIFY(Eigen::should_raise_an_assert && # a);                               \
+        } catch (Eigen::eigen_assert_exception) {                                     \
+          Eigen::internal::push_assert = false; VERIFY(true);                         \
+        }                                                                             \
+        Eigen::report_on_cerr_on_assert_failure = true;                               \
+        Eigen::internal::push_assert = false;                                         \
+      }
+    #endif //EIGEN_EXCEPTIONS
+
+  #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
+    // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
+    #define eigen_assert(a) \
+      if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
+      {                                       \
+        Eigen::no_more_assert = true;         \
+        if(report_on_cerr_on_assert_failure)  \
+          eigen_plain_assert(a);              \
+        else                                  \
+          EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
+      }
+
+    #ifdef EIGEN_EXCEPTIONS
+      #define VERIFY_RAISES_ASSERT(a) {                           \
+        Eigen::no_more_assert = false;                            \
+        Eigen::report_on_cerr_on_assert_failure = false;          \
+        try {                                                     \
+          a;                                                      \
+          VERIFY(Eigen::should_raise_an_assert && # a);           \
+        }                                                         \
+        catch (Eigen::eigen_assert_exception&) { VERIFY(true); }  \
+        Eigen::report_on_cerr_on_assert_failure = true;           \
+      }
+    #endif // EIGEN_EXCEPTIONS
+  #endif // EIGEN_DEBUG_ASSERTS
+
+  #if defined(TEST_CHECK_STATIC_ASSERTIONS) && defined(EIGEN_EXCEPTIONS)
+    #define EIGEN_STATIC_ASSERT(a,MSG) \
+      if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
+      {                                       \
+        Eigen::no_more_assert = true;         \
+        if(report_on_cerr_on_assert_failure)  \
+          eigen_plain_assert((a) && #MSG);      \
+        else                                  \
+          EIGEN_THROW_X(Eigen::eigen_static_assert_exception()); \
+      }
+    #define VERIFY_RAISES_STATIC_ASSERT(a) {                    \
+      Eigen::no_more_assert = false;                            \
+      Eigen::report_on_cerr_on_assert_failure = false;          \
+      try {                                                     \
+        a;                                                      \
+        VERIFY(Eigen::should_raise_an_assert && # a);           \
+      }                                                         \
+      catch (Eigen::eigen_static_assert_exception&) { VERIFY(true); }  \
+      Eigen::report_on_cerr_on_assert_failure = true;           \
+    }
+  #endif // TEST_CHECK_STATIC_ASSERTIONS
+
+#ifndef VERIFY_RAISES_ASSERT
+  #define VERIFY_RAISES_ASSERT(a) \
+    std::cout << "Can't VERIFY_RAISES_ASSERT( " #a " ) with exceptions disabled\n";
+#endif
+#ifndef VERIFY_RAISES_STATIC_ASSERT
+  #define VERIFY_RAISES_STATIC_ASSERT(a) \
+    std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
+#endif
+
+  #if !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY)
+  #define EIGEN_USE_CUSTOM_ASSERT
+  #endif
+
+#else // EIGEN_NO_ASSERTION_CHECKING
+
+  #define VERIFY_RAISES_ASSERT(a) {}
+  #define VERIFY_RAISES_STATIC_ASSERT(a) {}
+
+#endif // EIGEN_NO_ASSERTION_CHECKING
+
+#define EIGEN_INTERNAL_DEBUGGING
+#include <Eigen/QR> // required for createRandomPIMatrixOfRank
+
+inline void verify_impl(bool condition, const char *testname, const char *file, int line, const char *condition_as_string)
+{
+  if (!condition)
+  {
+    if(Eigen::g_test_level>0)
+      std::cerr << "WARNING: ";
+    std::cerr << "Test " << testname << " failed in " << file << " (" << line << ")"
+      << std::endl << "    " << condition_as_string << std::endl;
+    std::cerr << "Stack:\n";
+    const int test_stack_size = static_cast<int>(Eigen::g_test_stack.size());
+    for(int i=test_stack_size-1; i>=0; --i)
+      std::cerr << "  - " << Eigen::g_test_stack[i] << "\n";
+    std::cerr << "\n";
+    if(Eigen::g_test_level==0)
+      abort();
+  }
+}
+
+#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a))
+
+#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a >= b))
+#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a <= b))
+
+
+#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true))
+#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(test_is_equal(a, b, false))
+#define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b))
+#define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b))
+#define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b))
+#define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_isMuchSmallerThan(a, b))
+#define VERIFY_IS_APPROX_OR_LESS_THAN(a, b) VERIFY(test_isApproxOrLessThan(a, b))
+#define VERIFY_IS_NOT_APPROX_OR_LESS_THAN(a, b) VERIFY(!test_isApproxOrLessThan(a, b))
+
+#define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a))
+
+#define STATIC_CHECK(COND) EIGEN_STATIC_ASSERT( (COND) , EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT )
+
+#define CALL_SUBTEST(FUNC) do { \
+    g_test_stack.push_back(EIGEN_MAKESTRING(FUNC)); \
+    FUNC; \
+    g_test_stack.pop_back(); \
+  } while (0)
+
+
+namespace Eigen {
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_type(const T1&, const T2&)
+{
+  return true;
+}
+
+template<typename T> inline typename NumTraits<T>::Real test_precision() { return NumTraits<T>::dummy_precision(); }
+template<> inline float test_precision<float>() { return 1e-3f; }
+template<> inline double test_precision<double>() { return 1e-6; }
+template<> inline long double test_precision<long double>() { return 1e-6l; }
+template<> inline float test_precision<std::complex<float> >() { return test_precision<float>(); }
+template<> inline double test_precision<std::complex<double> >() { return test_precision<double>(); }
+template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
+
+#define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE)                             \
+  inline bool test_isApprox(TYPE a, TYPE b)                               \
+  { return internal::isApprox(a, b, test_precision<TYPE>()); }            \
+  inline bool test_isMuchSmallerThan(TYPE a, TYPE b)                      \
+  { return internal::isMuchSmallerThan(a, b, test_precision<TYPE>()); }   \
+  inline bool test_isApproxOrLessThan(TYPE a, TYPE b)                     \
+  { return internal::isApproxOrLessThan(a, b, test_precision<TYPE>()); }
+
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(short)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned short)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(int)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned int)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(long)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long)
+#if EIGEN_HAS_CXX11
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(long long)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long long)
+#endif
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(float)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(double)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(half)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(bfloat16)
+
+#undef EIGEN_TEST_SCALAR_TEST_OVERLOAD
+
+#ifndef EIGEN_TEST_NO_COMPLEX
+inline bool test_isApprox(const std::complex<float>& a, const std::complex<float>& b)
+{ return internal::isApprox(a, b, test_precision<std::complex<float> >()); }
+inline bool test_isMuchSmallerThan(const std::complex<float>& a, const std::complex<float>& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<float> >()); }
+
+inline bool test_isApprox(const std::complex<double>& a, const std::complex<double>& b)
+{ return internal::isApprox(a, b, test_precision<std::complex<double> >()); }
+inline bool test_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
+
+#ifndef EIGEN_TEST_NO_LONGDOUBLE
+inline bool test_isApprox(const std::complex<long double>& a, const std::complex<long double>& b)
+{ return internal::isApprox(a, b, test_precision<std::complex<long double> >()); }
+inline bool test_isMuchSmallerThan(const std::complex<long double>& a, const std::complex<long double>& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<long double> >()); }
+#endif
+#endif
+
+#ifndef EIGEN_TEST_NO_LONGDOUBLE
+inline bool test_isApprox(const long double& a, const long double& b)
+{
+    bool ret = internal::isApprox(a, b, test_precision<long double>());
+    if (!ret) std::cerr
+        << std::endl << "    actual   = " << a
+        << std::endl << "    expected = " << b << std::endl << std::endl;
+    return ret;
+}
+
+inline bool test_isMuchSmallerThan(const long double& a, const long double& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<long double>()); }
+inline bool test_isApproxOrLessThan(const long double& a, const long double& b)
+{ return internal::isApproxOrLessThan(a, b, test_precision<long double>()); }
+#endif // EIGEN_TEST_NO_LONGDOUBLE
+
+// test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox.
+template<typename T1,typename T2>
+typename NumTraits<typename T1::RealScalar>::NonInteger test_relative_error(const EigenBase<T1> &a, const EigenBase<T2> &b)
+{
+  using std::sqrt;
+  typedef typename NumTraits<typename T1::RealScalar>::NonInteger RealScalar;
+  typename internal::nested_eval<T1,2>::type ea(a.derived());
+  typename internal::nested_eval<T2,2>::type eb(b.derived());
+  return sqrt(RealScalar((ea-eb).cwiseAbs2().sum()) / RealScalar((std::min)(eb.cwiseAbs2().sum(),ea.cwiseAbs2().sum())));
+}
+
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const T1 &a, const T2 &b, const typename T1::Coefficients* = 0)
+{
+  return test_relative_error(a.coeffs(), b.coeffs());
+}
+
+template<typename T1,typename T2>
+typename T1::Scalar test_relative_error(const T1 &a, const T2 &b, const typename T1::MatrixType* = 0)
+{
+  return test_relative_error(a.matrix(), b.matrix());
+}
+
+template<typename S, int D>
+S test_relative_error(const Translation<S,D> &a, const Translation<S,D> &b)
+{
+  return test_relative_error(a.vector(), b.vector());
+}
+
+template <typename S, int D, int O>
+S test_relative_error(const ParametrizedLine<S,D,O> &a, const ParametrizedLine<S,D,O> &b)
+{
+  return (std::max)(test_relative_error(a.origin(), b.origin()), test_relative_error(a.origin(), b.origin()));
+}
+
+template <typename S, int D>
+S test_relative_error(const AlignedBox<S,D> &a, const AlignedBox<S,D> &b)
+{
+  return (std::max)(test_relative_error((a.min)(), (b.min)()), test_relative_error((a.max)(), (b.max)()));
+}
+
+template<typename Derived> class SparseMatrixBase;
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const MatrixBase<T1> &a, const SparseMatrixBase<T2> &b)
+{
+  return test_relative_error(a,b.toDense());
+}
+
+template<typename Derived> class SparseMatrixBase;
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const SparseMatrixBase<T1> &a, const MatrixBase<T2> &b)
+{
+  return test_relative_error(a.toDense(),b);
+}
+
+template<typename Derived> class SparseMatrixBase;
+template<typename T1,typename T2>
+typename T1::RealScalar test_relative_error(const SparseMatrixBase<T1> &a, const SparseMatrixBase<T2> &b)
+{
+  return test_relative_error(a.toDense(),b.toDense());
+}
+
+template<typename T1,typename T2>
+typename NumTraits<typename NumTraits<T1>::Real>::NonInteger test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
+{
+  typedef typename NumTraits<typename NumTraits<T1>::Real>::NonInteger RealScalar;
+  return numext::sqrt(RealScalar(numext::abs2(a-b))/(numext::mini)(RealScalar(numext::abs2(a)),RealScalar(numext::abs2(b))));
+}
+
+template<typename T>
+T test_relative_error(const Rotation2D<T> &a, const Rotation2D<T> &b)
+{
+  return test_relative_error(a.angle(), b.angle());
+}
+
+template<typename T>
+T test_relative_error(const AngleAxis<T> &a, const AngleAxis<T> &b)
+{
+  return (std::max)(test_relative_error(a.angle(), b.angle()), test_relative_error(a.axis(), b.axis()));
+}
+
+template<typename Type1, typename Type2>
+inline bool test_isApprox(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  return a.isApprox(b, test_precision<typename Type1::Scalar>());
+}
+
+// get_test_precision is a small wrapper to test_precision allowing to return the scalar precision for either scalars or expressions
+template<typename T>
+typename NumTraits<typename T::Scalar>::Real get_test_precision(const T&, const typename T::Scalar* = 0)
+{
+  return test_precision<typename NumTraits<typename T::Scalar>::Real>();
+}
+
+template<typename T>
+typename NumTraits<T>::Real get_test_precision(const T&,typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T>::Real>::value, T>::type* = 0)
+{
+  return test_precision<typename NumTraits<T>::Real>();
+}
+
+// verifyIsApprox is a wrapper to test_isApprox that outputs the relative difference magnitude if the test fails.
+template<typename Type1, typename Type2>
+inline bool verifyIsApprox(const Type1& a, const Type2& b)
+{
+  bool ret = test_isApprox(a,b);
+  if(!ret)
+  {
+    std::cerr << "Difference too large wrt tolerance " << get_test_precision(a)  << ", relative error is: " << test_relative_error(a,b) << std::endl;
+  }
+  return ret;
+}
+
+// The idea behind this function is to compare the two scalars a and b where
+// the scalar ref is a hint about the expected order of magnitude of a and b.
+// WARNING: the scalar a and b must be positive
+// Therefore, if for some reason a and b are very small compared to ref,
+// we won't issue a false negative.
+// This test could be: abs(a-b) <= eps * ref
+// However, it seems that simply comparing a+ref and b+ref is more sensitive to true error.
+template<typename Scalar,typename ScalarRef>
+inline bool test_isApproxWithRef(const Scalar& a, const Scalar& b, const ScalarRef& ref)
+{
+  return test_isApprox(a+ref, b+ref);
+}
+
+template<typename Derived1, typename Derived2>
+inline bool test_isMuchSmallerThan(const MatrixBase<Derived1>& m1,
+                                   const MatrixBase<Derived2>& m2)
+{
+  return m1.isMuchSmallerThan(m2, test_precision<typename internal::traits<Derived1>::Scalar>());
+}
+
+template<typename Derived>
+inline bool test_isMuchSmallerThan(const MatrixBase<Derived>& m,
+                                   const typename NumTraits<typename internal::traits<Derived>::Scalar>::Real& s)
+{
+  return m.isMuchSmallerThan(s, test_precision<typename internal::traits<Derived>::Scalar>());
+}
+
+template<typename Derived>
+inline bool test_isUnitary(const MatrixBase<Derived>& m)
+{
+  return m.isUnitary(test_precision<typename internal::traits<Derived>::Scalar>());
+}
+
+// Forward declaration to avoid ICC warning
+template<typename T, typename U>
+bool test_is_equal(const T& actual, const U& expected, bool expect_equal=true);
+
+template<typename T, typename U>
+bool test_is_equal(const T& actual, const U& expected, bool expect_equal)
+{
+    if ((actual==expected) == expect_equal)
+        return true;
+    // false:
+    std::cerr
+        << "\n    actual   = " << actual
+        << "\n    expected " << (expect_equal ? "= " : "!=") << expected << "\n\n";
+    return false;
+}
+
+/** Creates a random Partial Isometry matrix of given rank.
+  *
+  * A partial isometry is a matrix all of whose singular values are either 0 or 1.
+  * This is very useful to test rank-revealing algorithms.
+  */
+// Forward declaration to avoid ICC warning
+template<typename MatrixType>
+void createRandomPIMatrixOfRank(Index desired_rank, Index rows, Index cols, MatrixType& m);
+template<typename MatrixType>
+void createRandomPIMatrixOfRank(Index desired_rank, Index rows, Index cols, MatrixType& m)
+{
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
+
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<Scalar, Rows, Rows> MatrixAType;
+  typedef Matrix<Scalar, Cols, Cols> MatrixBType;
+
+  if(desired_rank == 0)
+  {
+    m.setZero(rows,cols);
+    return;
+  }
+
+  if(desired_rank == 1)
+  {
+    // here we normalize the vectors to get a partial isometry
+    m = VectorType::Random(rows).normalized() * VectorType::Random(cols).normalized().transpose();
+    return;
+  }
+
+  MatrixAType a = MatrixAType::Random(rows,rows);
+  MatrixType d = MatrixType::Identity(rows,cols);
+  MatrixBType  b = MatrixBType::Random(cols,cols);
+
+  // set the diagonal such that only desired_rank non-zero entries reamain
+  const Index diag_size = (std::min)(d.rows(),d.cols());
+  if(diag_size != desired_rank)
+    d.diagonal().segment(desired_rank, diag_size-desired_rank) = VectorType::Zero(diag_size-desired_rank);
+
+  HouseholderQR<MatrixAType> qra(a);
+  HouseholderQR<MatrixBType> qrb(b);
+  m = qra.householderQ() * d * qrb.householderQ();
+}
+
+// Forward declaration to avoid ICC warning
+template<typename PermutationVectorType>
+void randomPermutationVector(PermutationVectorType& v, Index size);
+template<typename PermutationVectorType>
+void randomPermutationVector(PermutationVectorType& v, Index size)
+{
+  typedef typename PermutationVectorType::Scalar Scalar;
+  v.resize(size);
+  for(Index i = 0; i < size; ++i) v(i) = Scalar(i);
+  if(size == 1) return;
+  for(Index n = 0; n < 3 * size; ++n)
+  {
+    Index i = internal::random<Index>(0, size-1);
+    Index j;
+    do j = internal::random<Index>(0, size-1); while(j==i);
+    std::swap(v(i), v(j));
+  }
+}
+
+template<typename T> bool isNotNaN(const T& x)
+{
+  return x==x;
+}
+
+template<typename T> bool isPlusInf(const T& x)
+{
+  return x > NumTraits<T>::highest();
+}
+
+template<typename T> bool isMinusInf(const T& x)
+{
+  return x < NumTraits<T>::lowest();
+}
+
+} // end namespace Eigen
+
+template<typename T> struct GetDifferentType;
+
+template<> struct GetDifferentType<float> { typedef double type; };
+template<> struct GetDifferentType<double> { typedef float type; };
+template<typename T> struct GetDifferentType<std::complex<T> >
+{ typedef std::complex<typename GetDifferentType<T>::type> type; };
+
+// Forward declaration to avoid ICC warning
+template<typename T> std::string type_name();
+template<typename T> std::string type_name()                    { return "other"; }
+template<> std::string type_name<float>()                       { return "float"; }
+template<> std::string type_name<double>()                      { return "double"; }
+template<> std::string type_name<long double>()                 { return "long double"; }
+template<> std::string type_name<int>()                         { return "int"; }
+template<> std::string type_name<std::complex<float> >()        { return "complex<float>"; }
+template<> std::string type_name<std::complex<double> >()       { return "complex<double>"; }
+template<> std::string type_name<std::complex<long double> >()  { return "complex<long double>"; }
+template<> std::string type_name<std::complex<int> >()          { return "complex<int>"; }
+
+using namespace Eigen;
+
+inline void set_repeat_from_string(const char *str)
+{
+  errno = 0;
+  g_repeat = int(strtoul(str, 0, 10));
+  if(errno || g_repeat <= 0)
+  {
+    std::cout << "Invalid repeat value " << str << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  g_has_set_repeat = true;
+}
+
+inline void set_seed_from_string(const char *str)
+{
+  errno = 0;
+  g_seed = int(strtoul(str, 0, 10));
+  if(errno || g_seed == 0)
+  {
+    std::cout << "Invalid seed value " << str << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  g_has_set_seed = true;
+}
+
+int main(int argc, char *argv[])
+{
+    g_has_set_repeat = false;
+    g_has_set_seed = false;
+    bool need_help = false;
+
+    for(int i = 1; i < argc; i++)
+    {
+      if(argv[i][0] == 'r')
+      {
+        if(g_has_set_repeat)
+        {
+          std::cout << "Argument " << argv[i] << " conflicting with a former argument" << std::endl;
+          return 1;
+        }
+        set_repeat_from_string(argv[i]+1);
+      }
+      else if(argv[i][0] == 's')
+      {
+        if(g_has_set_seed)
+        {
+          std::cout << "Argument " << argv[i] << " conflicting with a former argument" << std::endl;
+          return 1;
+        }
+         set_seed_from_string(argv[i]+1);
+      }
+      else
+      {
+        need_help = true;
+      }
+    }
+
+    if(need_help)
+    {
+      std::cout << "This test application takes the following optional arguments:" << std::endl;
+      std::cout << "  rN     Repeat each test N times (default: " << DEFAULT_REPEAT << ")" << std::endl;
+      std::cout << "  sN     Use N as seed for random numbers (default: based on current time)" << std::endl;
+      std::cout << std::endl;
+      std::cout << "If defined, the environment variables EIGEN_REPEAT and EIGEN_SEED" << std::endl;
+      std::cout << "will be used as default values for these parameters." << std::endl;
+      return 1;
+    }
+
+    char *env_EIGEN_REPEAT = getenv("EIGEN_REPEAT");
+    if(!g_has_set_repeat && env_EIGEN_REPEAT)
+      set_repeat_from_string(env_EIGEN_REPEAT);
+    char *env_EIGEN_SEED = getenv("EIGEN_SEED");
+    if(!g_has_set_seed && env_EIGEN_SEED)
+      set_seed_from_string(env_EIGEN_SEED);
+
+    if(!g_has_set_seed) g_seed = (unsigned int) time(NULL);
+    if(!g_has_set_repeat) g_repeat = DEFAULT_REPEAT;
+
+    std::cout << "Initializing random number generator with seed " << g_seed << std::endl;
+    std::stringstream ss;
+    ss << "Seed: " << g_seed;
+    g_test_stack.push_back(ss.str());
+    srand(g_seed);
+    std::cout << "Repeating each test " << g_repeat << " times" << std::endl;
+
+    VERIFY(EigenTest::all().size()>0);
+
+    for(std::size_t i=0; i<EigenTest::all().size(); ++i)
+    {
+      const EigenTest& current_test = *EigenTest::all()[i];
+      Eigen::g_test_stack.push_back(current_test.name());
+      current_test();
+      Eigen::g_test_stack.pop_back();
+    }
+
+    return 0;
+}
+
+// These warning are disabled here such that they are still ON when parsing Eigen's header files.
+#if defined __INTEL_COMPILER
+  // remark #383: value copied to temporary, reference to temporary used
+  //  -> this warning is raised even for legal usage as: g_test_stack.push_back("foo"); where g_test_stack is a std::vector<std::string>
+  // remark #1418: external function definition with no prior declaration
+  //  -> this warning is raised for all our test functions. Declaring them static would fix the issue.
+  // warning #279: controlling expression is constant
+  // remark #1572: floating-point equality and inequality comparisons are unreliable
+  #pragma warning disable 279 383 1418 1572
+#endif
+
+#ifdef _MSC_VER
+  // 4503 - decorated name length exceeded, name was truncated
+  #pragma warning( disable : 4503)
+#endif

diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
new file mode 100644
index 0000000..0ea136a
--- /dev/null
+++ b/test/mapped_matrix.cpp

@@ -0,0 +1,207 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NO_STATIC_ASSERT
+#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
+#endif
+
+#include "main.h"
+
+#define EIGEN_TESTMAP_MAX_SIZE 256
+
+template<typename VectorType> void map_class_vector(const VectorType& m)
+{
+  typedef typename VectorType::Scalar Scalar;
+
+  Index size = m.size();
+
+  Scalar* array1 = internal::aligned_new<Scalar>(size);
+  Scalar* array2 = internal::aligned_new<Scalar>(size);
+  Scalar* array3 = new Scalar[size+1];
+  Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
+  Scalar  array4[EIGEN_TESTMAP_MAX_SIZE];
+
+  Map<VectorType, AlignedMax>(array1, size) = VectorType::Random(size);
+  Map<VectorType, AlignedMax>(array2, size) = Map<VectorType,AlignedMax>(array1, size);
+  Map<VectorType>(array3unaligned, size) = Map<VectorType>(array1, size);
+  Map<VectorType>(array4, size)          = Map<VectorType,AlignedMax>(array1, size);
+  VectorType ma1 = Map<VectorType, AlignedMax>(array1, size);
+  VectorType ma2 = Map<VectorType, AlignedMax>(array2, size);
+  VectorType ma3 = Map<VectorType>(array3unaligned, size);
+  VectorType ma4 = Map<VectorType>(array4, size);
+  VERIFY_IS_EQUAL(ma1, ma2);
+  VERIFY_IS_EQUAL(ma1, ma3);
+  VERIFY_IS_EQUAL(ma1, ma4);
+  #ifdef EIGEN_VECTORIZE
+  if(internal::packet_traits<Scalar>::Vectorizable && size>=AlignedMax)
+    VERIFY_RAISES_ASSERT((Map<VectorType,AlignedMax>(array3unaligned, size)))
+  #endif
+
+  internal::aligned_delete(array1, size);
+  internal::aligned_delete(array2, size);
+  delete[] array3;
+}
+
+template<typename MatrixType> void map_class_matrix(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = m.rows(), cols = m.cols(), size = rows*cols;
+  Scalar s1 = internal::random<Scalar>();
+
+  // array1 and array2 -> aligned heap allocation
+  Scalar* array1 = internal::aligned_new<Scalar>(size);
+  for(int i = 0; i < size; i++) array1[i] = Scalar(1);
+  Scalar* array2 = internal::aligned_new<Scalar>(size);
+  for(int i = 0; i < size; i++) array2[i] = Scalar(1);
+  // array3unaligned -> unaligned pointer to heap
+  Scalar* array3 = new Scalar[size+1];
+  Index sizep1 = size + 1; // <- without this temporary MSVC 2103 generates bad code
+  for(Index i = 0; i < sizep1; i++) array3[i] = Scalar(1);
+  Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
+  Scalar array4[256];
+  if(size<=256)
+    for(int i = 0; i < size; i++) array4[i] = Scalar(1);
+  
+  Map<MatrixType> map1(array1, rows, cols);
+  Map<MatrixType, AlignedMax> map2(array2, rows, cols);
+  Map<MatrixType> map3(array3unaligned, rows, cols);
+  Map<MatrixType> map4(array4, rows, cols);
+  
+  VERIFY_IS_EQUAL(map1, MatrixType::Ones(rows,cols));
+  VERIFY_IS_EQUAL(map2, MatrixType::Ones(rows,cols));
+  VERIFY_IS_EQUAL(map3, MatrixType::Ones(rows,cols));
+  map1 = MatrixType::Random(rows,cols);
+  map2 = map1;
+  map3 = map1;
+  MatrixType ma1 = map1;
+  MatrixType ma2 = map2;
+  MatrixType ma3 = map3;
+  VERIFY_IS_EQUAL(map1, map2);
+  VERIFY_IS_EQUAL(map1, map3);
+  VERIFY_IS_EQUAL(ma1, ma2);
+  VERIFY_IS_EQUAL(ma1, ma3);
+  VERIFY_IS_EQUAL(ma1, map3);
+  
+  VERIFY_IS_APPROX(s1*map1, s1*map2);
+  VERIFY_IS_APPROX(s1*ma1, s1*ma2);
+  VERIFY_IS_EQUAL(s1*ma1, s1*ma3);
+  VERIFY_IS_APPROX(s1*map1, s1*map3);
+  
+  map2 *= s1;
+  map3 *= s1;
+  VERIFY_IS_APPROX(s1*map1, map2);
+  VERIFY_IS_APPROX(s1*map1, map3);
+  
+  if(size<=256)
+  {
+    VERIFY_IS_EQUAL(map4, MatrixType::Ones(rows,cols));
+    map4 = map1;
+    MatrixType ma4 = map4;
+    VERIFY_IS_EQUAL(map1, map4);
+    VERIFY_IS_EQUAL(ma1, map4);
+    VERIFY_IS_EQUAL(ma1, ma4);
+    VERIFY_IS_APPROX(s1*map1, s1*map4);
+    
+    map4 *= s1;
+    VERIFY_IS_APPROX(s1*map1, map4);
+  }
+
+  internal::aligned_delete(array1, size);
+  internal::aligned_delete(array2, size);
+  delete[] array3;
+}
+
+template<typename VectorType> void map_static_methods(const VectorType& m)
+{
+  typedef typename VectorType::Scalar Scalar;
+
+  Index size = m.size();
+
+  Scalar* array1 = internal::aligned_new<Scalar>(size);
+  Scalar* array2 = internal::aligned_new<Scalar>(size);
+  Scalar* array3 = new Scalar[size+1];
+  Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
+
+  VectorType::MapAligned(array1, size) = VectorType::Random(size);
+  VectorType::Map(array2, size) = VectorType::Map(array1, size);
+  VectorType::Map(array3unaligned, size) = VectorType::Map(array1, size);
+  VectorType ma1 = VectorType::Map(array1, size);
+  VectorType ma2 = VectorType::MapAligned(array2, size);
+  VectorType ma3 = VectorType::Map(array3unaligned, size);
+  VERIFY_IS_EQUAL(ma1, ma2);
+  VERIFY_IS_EQUAL(ma1, ma3);
+
+  internal::aligned_delete(array1, size);
+  internal::aligned_delete(array2, size);
+  delete[] array3;
+}
+
+template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
+{
+  // there's a lot that we can't test here while still having this test compile!
+  // the only possible approach would be to run a script trying to compile stuff and checking that it fails.
+  // CMake can help with that.
+
+  // verify that map-to-const don't have LvalueBit
+  typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
+  VERIFY( !(internal::traits<Map<ConstPlainObjectType> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Map<ConstPlainObjectType, AlignedMax> >::Flags & LvalueBit) );
+  VERIFY( !(Map<ConstPlainObjectType>::Flags & LvalueBit) );
+  VERIFY( !(Map<ConstPlainObjectType, AlignedMax>::Flags & LvalueBit) );
+}
+
+template<typename Scalar>
+void map_not_aligned_on_scalar()
+{
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+  Index size = 11;
+  Scalar* array1 = internal::aligned_new<Scalar>((size+1)*(size+1)+1);
+  Scalar* array2 = reinterpret_cast<Scalar*>(sizeof(Scalar)/2+std::size_t(array1));
+  Map<MatrixType,0,OuterStride<> > map2(array2, size, size, OuterStride<>(size+1));
+  MatrixType m2 = MatrixType::Random(size,size);
+  map2 = m2;
+  VERIFY_IS_EQUAL(m2, map2);
+  
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  Map<VectorType> map3(array2, size);
+  MatrixType v3 = VectorType::Random(size);
+  map3 = v3;
+  VERIFY_IS_EQUAL(v3, map3);
+  
+  internal::aligned_delete(array1, (size+1)*(size+1)+1);
+}
+
+EIGEN_DECLARE_TEST(mapped_matrix)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( map_class_vector(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( check_const_correctness(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( map_class_vector(Vector4d()) );
+    CALL_SUBTEST_2( map_class_vector(VectorXd(13)) );
+    CALL_SUBTEST_2( check_const_correctness(Matrix4d()) );
+    CALL_SUBTEST_3( map_class_vector(RowVector4f()) );
+    CALL_SUBTEST_4( map_class_vector(VectorXcf(8)) );
+    CALL_SUBTEST_5( map_class_vector(VectorXi(12)) );
+    CALL_SUBTEST_5( check_const_correctness(VectorXi(12)) );
+
+    CALL_SUBTEST_1( map_class_matrix(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( map_class_matrix(Matrix4d()) );
+    CALL_SUBTEST_11( map_class_matrix(Matrix<float,3,5>()) );
+    CALL_SUBTEST_4( map_class_matrix(MatrixXcf(internal::random<int>(1,10),internal::random<int>(1,10))) );
+    CALL_SUBTEST_5( map_class_matrix(MatrixXi(internal::random<int>(1,10),internal::random<int>(1,10))) );
+
+    CALL_SUBTEST_6( map_static_methods(Matrix<double, 1, 1>()) );
+    CALL_SUBTEST_7( map_static_methods(Vector3f()) );
+    CALL_SUBTEST_8( map_static_methods(RowVector3d()) );
+    CALL_SUBTEST_9( map_static_methods(VectorXcd(8)) );
+    CALL_SUBTEST_10( map_static_methods(VectorXf(12)) );
+    CALL_SUBTEST_11( map_not_aligned_on_scalar<double>() );
+  }
+}

diff --git a/test/mapstaticmethods.cpp b/test/mapstaticmethods.cpp
new file mode 100644
index 0000000..d0128ba
--- /dev/null
+++ b/test/mapstaticmethods.cpp

@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+// GCC<=4.8 has spurious shadow warnings, because `ptr` re-appears inside template instantiations
+// workaround: put these in an anonymous namespace
+namespace {
+float *ptr;
+const float *const_ptr;
+}
+
+template<typename PlainObjectType,
+         bool IsDynamicSize = PlainObjectType::SizeAtCompileTime == Dynamic,
+         bool IsVector = PlainObjectType::IsVectorAtCompileTime
+>
+struct mapstaticmethods_impl {};
+
+template<typename PlainObjectType, bool IsVector>
+struct mapstaticmethods_impl<PlainObjectType, false, IsVector>
+{
+  static void run(const PlainObjectType& m)
+  {
+    mapstaticmethods_impl<PlainObjectType, true, IsVector>::run(m);
+
+    int i = internal::random<int>(2,5), j = internal::random<int>(2,5);
+
+    PlainObjectType::Map(ptr).setZero();
+    PlainObjectType::MapAligned(ptr).setZero();
+    PlainObjectType::Map(const_ptr).sum();
+    PlainObjectType::MapAligned(const_ptr).sum();
+
+    PlainObjectType::Map(ptr, InnerStride<>(i)).setZero();
+    PlainObjectType::MapAligned(ptr, InnerStride<>(i)).setZero();
+    PlainObjectType::Map(const_ptr, InnerStride<>(i)).sum();
+    PlainObjectType::MapAligned(const_ptr, InnerStride<>(i)).sum();
+
+    PlainObjectType::Map(ptr, InnerStride<2>()).setZero();
+    PlainObjectType::MapAligned(ptr, InnerStride<3>()).setZero();
+    PlainObjectType::Map(const_ptr, InnerStride<4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, InnerStride<5>()).sum();
+
+    PlainObjectType::Map(ptr, OuterStride<>(i)).setZero();
+    PlainObjectType::MapAligned(ptr, OuterStride<>(i)).setZero();
+    PlainObjectType::Map(const_ptr, OuterStride<>(i)).sum();
+    PlainObjectType::MapAligned(const_ptr, OuterStride<>(i)).sum();
+
+    PlainObjectType::Map(ptr, OuterStride<2>()).setZero();
+    PlainObjectType::MapAligned(ptr, OuterStride<3>()).setZero();
+    PlainObjectType::Map(const_ptr, OuterStride<4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, OuterStride<5>()).sum();
+
+    PlainObjectType::Map(ptr, Stride<Dynamic, Dynamic>(i,j)).setZero();
+    PlainObjectType::MapAligned(ptr, Stride<2,Dynamic>(2,i)).setZero();
+    PlainObjectType::Map(const_ptr, Stride<Dynamic,3>(i,3)).sum();
+    PlainObjectType::MapAligned(const_ptr, Stride<Dynamic, Dynamic>(i,j)).sum();
+
+    PlainObjectType::Map(ptr, Stride<2,3>()).setZero();
+    PlainObjectType::MapAligned(ptr, Stride<3,4>()).setZero();
+    PlainObjectType::Map(const_ptr, Stride<2,4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, Stride<5,3>()).sum();
+  }
+};
+
+template<typename PlainObjectType>
+struct mapstaticmethods_impl<PlainObjectType, true, false>
+{
+  static void run(const PlainObjectType& m)
+  {
+    Index rows = m.rows(), cols = m.cols();
+
+    int i = internal::random<int>(2,5), j = internal::random<int>(2,5);
+
+    PlainObjectType::Map(ptr, rows, cols).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols).sum();
+
+    PlainObjectType::Map(ptr, rows, cols, InnerStride<>(i)).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols, InnerStride<>(i)).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols, InnerStride<>(i)).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols, InnerStride<>(i)).sum();
+
+    PlainObjectType::Map(ptr, rows, cols, InnerStride<2>()).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols, InnerStride<3>()).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols, InnerStride<4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols, InnerStride<5>()).sum();
+
+    PlainObjectType::Map(ptr, rows, cols, OuterStride<>(i)).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols, OuterStride<>(i)).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols, OuterStride<>(i)).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols, OuterStride<>(i)).sum();
+
+    PlainObjectType::Map(ptr, rows, cols, OuterStride<2>()).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols, OuterStride<3>()).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols, OuterStride<4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols, OuterStride<5>()).sum();
+
+    PlainObjectType::Map(ptr, rows, cols, Stride<Dynamic, Dynamic>(i,j)).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols, Stride<2,Dynamic>(2,i)).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols, Stride<Dynamic,3>(i,3)).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols, Stride<Dynamic, Dynamic>(i,j)).sum();
+
+    PlainObjectType::Map(ptr, rows, cols, Stride<2,3>()).setZero();
+    PlainObjectType::MapAligned(ptr, rows, cols, Stride<3,4>()).setZero();
+    PlainObjectType::Map(const_ptr, rows, cols, Stride<2,4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, rows, cols, Stride<5,3>()).sum();
+  }
+};
+
+template<typename PlainObjectType>
+struct mapstaticmethods_impl<PlainObjectType, true, true>
+{
+  static void run(const PlainObjectType& v)
+  {
+    Index size = v.size();
+
+    int i = internal::random<int>(2,5);
+
+    PlainObjectType::Map(ptr, size).setZero();
+    PlainObjectType::MapAligned(ptr, size).setZero();
+    PlainObjectType::Map(const_ptr, size).sum();
+    PlainObjectType::MapAligned(const_ptr, size).sum();
+
+    PlainObjectType::Map(ptr, size, InnerStride<>(i)).setZero();
+    PlainObjectType::MapAligned(ptr, size, InnerStride<>(i)).setZero();
+    PlainObjectType::Map(const_ptr, size, InnerStride<>(i)).sum();
+    PlainObjectType::MapAligned(const_ptr, size, InnerStride<>(i)).sum();
+
+    PlainObjectType::Map(ptr, size, InnerStride<2>()).setZero();
+    PlainObjectType::MapAligned(ptr, size, InnerStride<3>()).setZero();
+    PlainObjectType::Map(const_ptr, size, InnerStride<4>()).sum();
+    PlainObjectType::MapAligned(const_ptr, size, InnerStride<5>()).sum();
+  }
+};
+
+template<typename PlainObjectType>
+void mapstaticmethods(const PlainObjectType& m)
+{
+  mapstaticmethods_impl<PlainObjectType>::run(m);
+  VERIFY(true); // just to avoid 'unused function' warning
+}
+
+EIGEN_DECLARE_TEST(mapstaticmethods)
+{
+  ptr = internal::aligned_new<float>(1000);
+  for(int i = 0; i < 1000; i++) ptr[i] = float(i);
+
+  const_ptr = ptr;
+
+  CALL_SUBTEST_1(( mapstaticmethods(Matrix<float, 1, 1>()) ));
+  CALL_SUBTEST_1(( mapstaticmethods(Vector2f()) ));
+  CALL_SUBTEST_2(( mapstaticmethods(Vector3f()) ));
+  CALL_SUBTEST_2(( mapstaticmethods(Matrix2f()) ));
+  CALL_SUBTEST_3(( mapstaticmethods(Matrix4f()) ));
+  CALL_SUBTEST_3(( mapstaticmethods(Array4f()) ));
+  CALL_SUBTEST_4(( mapstaticmethods(Array3f()) ));
+  CALL_SUBTEST_4(( mapstaticmethods(Array33f()) ));
+  CALL_SUBTEST_5(( mapstaticmethods(Array44f()) ));
+  CALL_SUBTEST_5(( mapstaticmethods(VectorXf(1)) ));
+  CALL_SUBTEST_5(( mapstaticmethods(VectorXf(8)) ));
+  CALL_SUBTEST_6(( mapstaticmethods(MatrixXf(1,1)) ));
+  CALL_SUBTEST_6(( mapstaticmethods(MatrixXf(5,7)) ));
+  CALL_SUBTEST_7(( mapstaticmethods(ArrayXf(1)) ));
+  CALL_SUBTEST_7(( mapstaticmethods(ArrayXf(5)) ));
+  CALL_SUBTEST_8(( mapstaticmethods(ArrayXXf(1,1)) ));
+  CALL_SUBTEST_8(( mapstaticmethods(ArrayXXf(8,6)) ));
+
+  internal::aligned_delete(ptr, 1000);
+}
+

diff --git a/test/mapstride.cpp b/test/mapstride.cpp
new file mode 100644
index 0000000..fde73f2
--- /dev/null
+++ b/test/mapstride.cpp

@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<int Alignment,typename VectorType> void map_class_vector(const VectorType& m)
+{
+  typedef typename VectorType::Scalar Scalar;
+
+  Index size = m.size();
+
+  VectorType v = VectorType::Random(size);
+
+  Index arraysize = 3*size;
+  
+  Scalar* a_array = internal::aligned_new<Scalar>(arraysize+1);
+  Scalar* array = a_array;
+  if(Alignment!=Aligned)
+    array = (Scalar*)(internal::IntPtr(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+
+  {
+    Map<VectorType, Alignment, InnerStride<3> > map(array, size);
+    map = v;
+    for(int i = 0; i < size; ++i)
+    {
+      VERIFY(array[3*i] == v[i]);
+      VERIFY(map[i] == v[i]);
+    }
+  }
+
+  {
+    Map<VectorType, Unaligned, InnerStride<Dynamic> > map(array, size, InnerStride<Dynamic>(2));
+    map = v;
+    for(int i = 0; i < size; ++i)
+    {
+      VERIFY(array[2*i] == v[i]);
+      VERIFY(map[i] == v[i]);
+    }
+  }
+
+  internal::aligned_delete(a_array, arraysize+1);
+}
+
+template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixType& _m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = _m.rows(), cols = _m.cols();
+
+  MatrixType m = MatrixType::Random(rows,cols);
+  Scalar s1 = internal::random<Scalar>();
+
+  Index arraysize = 4*(rows+4)*(cols+4);
+
+  Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
+  Scalar* array1 = a_array1;
+  if(Alignment!=Aligned)
+    array1 = (Scalar*)(internal::IntPtr(a_array1) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+
+  Scalar a_array2[256];
+  Scalar* array2 = a_array2;
+  if(Alignment!=Aligned)
+    array2 = (Scalar*)(internal::IntPtr(a_array2) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+  else
+    array2 = (Scalar*)(((internal::UIntPtr(a_array2)+EIGEN_MAX_ALIGN_BYTES-1)/EIGEN_MAX_ALIGN_BYTES)*EIGEN_MAX_ALIGN_BYTES);
+  Index maxsize2 = a_array2 - array2 + 256;
+  
+  // test no inner stride and some dynamic outer stride
+  for(int k=0; k<2; ++k)
+  {
+    if(k==1 && (m.innerSize()+1)*m.outerSize() > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+    
+    Map<MatrixType, Alignment, OuterStride<Dynamic> > map(array, rows, cols, OuterStride<Dynamic>(m.innerSize()+1));
+    map = m;
+    VERIFY(map.outerStride() == map.innerSize()+1);
+    for(int i = 0; i < m.outerSize(); ++i)
+      for(int j = 0; j < m.innerSize(); ++j)
+      {
+        VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j));
+        VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
+      }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
+  }
+
+  // test no inner stride and an outer stride of +4. This is quite important as for fixed-size matrices,
+  // this allows to hit the special case where it's vectorizable.
+  for(int k=0; k<2; ++k)
+  {
+    if(k==1 && (m.innerSize()+4)*m.outerSize() > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+    
+    enum {
+      InnerSize = MatrixType::InnerSizeAtCompileTime,
+      OuterStrideAtCompileTime = InnerSize==Dynamic ? Dynamic : InnerSize+4
+    };
+    Map<MatrixType, Alignment, OuterStride<OuterStrideAtCompileTime> >
+      map(array, rows, cols, OuterStride<OuterStrideAtCompileTime>(m.innerSize()+4));
+    map = m;
+    VERIFY(map.outerStride() == map.innerSize()+4);
+    for(int i = 0; i < m.outerSize(); ++i)
+      for(int j = 0; j < m.innerSize(); ++j)
+      {
+        VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j));
+        VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
+      }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
+  }
+
+  // test both inner stride and outer stride
+  for(int k=0; k<2; ++k)
+  {
+    if(k==1 && (2*m.innerSize()+1)*(m.outerSize()*2) > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+    
+    Map<MatrixType, Alignment, Stride<Dynamic,Dynamic> > map(array, rows, cols, Stride<Dynamic,Dynamic>(2*m.innerSize()+1, 2));
+    map = m;
+    VERIFY(map.outerStride() == 2*map.innerSize()+1);
+    VERIFY(map.innerStride() == 2);
+    for(int i = 0; i < m.outerSize(); ++i)
+      for(int j = 0; j < m.innerSize(); ++j)
+      {
+        VERIFY(array[map.outerStride()*i+map.innerStride()*j] == m.coeffByOuterInner(i,j));
+        VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
+      }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
+  }
+
+  // test inner stride and no outer stride
+  for(int k=0; k<2; ++k)
+  {
+    if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+
+    Map<MatrixType, Alignment, InnerStride<Dynamic> > map(array, rows, cols, InnerStride<Dynamic>(2));
+    map = m;
+    VERIFY(map.outerStride() == map.innerSize()*2);
+    for(int i = 0; i < m.outerSize(); ++i)
+      for(int j = 0; j < m.innerSize(); ++j)
+      {
+        VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j));
+        VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
+      }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
+  }
+
+  // test negative strides
+  {
+    Matrix<Scalar,Dynamic,1>::Map(a_array1, arraysize+1).setRandom();
+    Index outerstride = m.innerSize()+4;
+    Scalar* array = array1;
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, OuterStride<> > map2(array+(m.outerSize()-1)*outerstride, rows, cols, OuterStride<>(-outerstride));
+      if(MatrixType::IsRowMajor)  VERIFY_IS_APPROX(map1.colwise().reverse(), map2);
+      else                        VERIFY_IS_APPROX(map1.rowwise().reverse(), map2);
+    }
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, Stride<Dynamic,Dynamic> > map2(array+(m.outerSize()-1)*outerstride+m.innerSize()-1, rows, cols, Stride<Dynamic,Dynamic>(-outerstride,-1));
+      VERIFY_IS_APPROX(map1.reverse(), map2);
+    }
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, Stride<Dynamic,-1> > map2(array+(m.outerSize()-1)*outerstride+m.innerSize()-1, rows, cols, Stride<Dynamic,-1>(-outerstride,-1));
+      VERIFY_IS_APPROX(map1.reverse(), map2);
+    }
+  }
+
+  internal::aligned_delete(a_array1, arraysize+1);
+}
+
+// Additional tests for inner-stride but no outer-stride
+template<int>
+void bug1453()
+{
+  const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
+  typedef Matrix<int,2,3,ColMajor> ColMatrix23i;
+  typedef Matrix<int,3,2,ColMajor> ColMatrix32i;
+  typedef Matrix<int,2,3,RowMajor> RowMatrix23i;
+  typedef Matrix<int,3,2,RowMajor> RowMatrix32i;
+
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+
+  VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+  VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+
+  VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+  VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+}
+
+EIGEN_DECLARE_TEST(mapstride)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int maxn = 3;
+    CALL_SUBTEST_1( map_class_vector<Aligned>(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( map_class_vector<Unaligned>(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( map_class_vector<Aligned>(Vector4d()) );
+    CALL_SUBTEST_2( map_class_vector<Unaligned>(Vector4d()) );
+    CALL_SUBTEST_3( map_class_vector<Aligned>(RowVector4f()) );
+    CALL_SUBTEST_3( map_class_vector<Unaligned>(RowVector4f()) );
+    CALL_SUBTEST_4( map_class_vector<Aligned>(VectorXcf(internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_4( map_class_vector<Unaligned>(VectorXcf(internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_5( map_class_vector<Aligned>(VectorXi(internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_5( map_class_vector<Unaligned>(VectorXi(internal::random<int>(1,maxn))) );
+
+    CALL_SUBTEST_1( map_class_matrix<Aligned>(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( map_class_matrix<Unaligned>(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( map_class_matrix<Aligned>(Matrix4d()) );
+    CALL_SUBTEST_2( map_class_matrix<Unaligned>(Matrix4d()) );
+    CALL_SUBTEST_3( map_class_matrix<Aligned>(Matrix<float,3,5>()) );
+    CALL_SUBTEST_3( map_class_matrix<Unaligned>(Matrix<float,3,5>()) );
+    CALL_SUBTEST_3( map_class_matrix<Aligned>(Matrix<float,4,8>()) );
+    CALL_SUBTEST_3( map_class_matrix<Unaligned>(Matrix<float,4,8>()) );
+    CALL_SUBTEST_4( map_class_matrix<Aligned>(MatrixXcf(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_4( map_class_matrix<Unaligned>(MatrixXcf(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_5( map_class_matrix<Aligned>(MatrixXi(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_5( map_class_matrix<Unaligned>(MatrixXi(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_6( map_class_matrix<Aligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+    CALL_SUBTEST_6( map_class_matrix<Unaligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+
+    CALL_SUBTEST_5( bug1453<0>() );
+    
+    TEST_SET_BUT_UNUSED_VARIABLE(maxn);
+  }
+}

diff --git a/test/meta.cpp b/test/meta.cpp
new file mode 100644
index 0000000..7a8b93c
--- /dev/null
+++ b/test/meta.cpp

@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename From, typename To>
+bool check_is_convertible(const From&, const To&)
+{
+  return internal::is_convertible<From,To>::value;
+}
+
+struct FooReturnType {
+  typedef int ReturnType;
+};
+
+struct MyInterface {
+  virtual void func() = 0;
+  virtual ~MyInterface() {}
+};
+struct MyImpl : public MyInterface {
+  void func() {}
+};
+
+EIGEN_DECLARE_TEST(meta)
+{
+  VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
+  VERIFY(( internal::is_same<float,float>::value));
+  VERIFY((!internal::is_same<float,double>::value));
+  VERIFY((!internal::is_same<float,float&>::value));
+  VERIFY((!internal::is_same<float,const float&>::value));
+
+  VERIFY(( internal::is_same<float,internal::remove_all<const float&>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_all<const float*>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_all<const float*&>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_all<float**>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_all<float**&>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_all<float* const *&>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_all<float* const>::type >::value));
+
+  // test add_const
+  VERIFY(( internal::is_same< internal::add_const<float>::type, const float >::value));
+  VERIFY(( internal::is_same< internal::add_const<float*>::type, float* const>::value));
+  VERIFY(( internal::is_same< internal::add_const<float const*>::type, float const* const>::value));
+  VERIFY(( internal::is_same< internal::add_const<float&>::type, float& >::value));
+
+  // test remove_const
+  VERIFY(( internal::is_same< internal::remove_const<float const* const>::type, float const* >::value));
+  VERIFY(( internal::is_same< internal::remove_const<float const*>::type, float const* >::value));
+  VERIFY(( internal::is_same< internal::remove_const<float* const>::type, float* >::value));
+
+  // test add_const_on_value_type
+  VERIFY(( internal::is_same< internal::add_const_on_value_type<float&>::type, float const& >::value));
+  VERIFY(( internal::is_same< internal::add_const_on_value_type<float*>::type, float const* >::value));
+
+  VERIFY(( internal::is_same< internal::add_const_on_value_type<float>::type, const float >::value));
+  VERIFY(( internal::is_same< internal::add_const_on_value_type<const float>::type, const float >::value));
+
+  VERIFY(( internal::is_same< internal::add_const_on_value_type<const float* const>::type, const float* const>::value));
+  VERIFY(( internal::is_same< internal::add_const_on_value_type<float* const>::type, const float* const>::value));
+
+  VERIFY(( internal::is_same<float,internal::remove_reference<float&>::type >::value));
+  VERIFY(( internal::is_same<const float,internal::remove_reference<const float&>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_pointer<float*>::type >::value));
+  VERIFY(( internal::is_same<const float,internal::remove_pointer<const float*>::type >::value));
+  VERIFY(( internal::is_same<float,internal::remove_pointer<float* const >::type >::value));
+
+
+  // is_convertible
+  STATIC_CHECK(( internal::is_convertible<float,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<int,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<int, short>::value ));
+  STATIC_CHECK(( internal::is_convertible<short, int>::value ));
+  STATIC_CHECK(( internal::is_convertible<double,int>::value ));
+  STATIC_CHECK(( internal::is_convertible<double,std::complex<double> >::value ));
+  STATIC_CHECK((!internal::is_convertible<std::complex<double>,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<Array33f,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,const Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<const Matrix3f&,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<const Matrix3f&,const Matrix3f&>::value ));
+  STATIC_CHECK((!internal::is_convertible<const Matrix3f&,Matrix3f&>::value ));
+  STATIC_CHECK((!internal::is_convertible<const Matrix3f,Matrix3f&>::value ));
+  STATIC_CHECK(!( internal::is_convertible<Matrix3f,Matrix3f&>::value ));
+
+  STATIC_CHECK(!( internal::is_convertible<int,int&>::value ));
+  STATIC_CHECK(( internal::is_convertible<const int,const int& >::value ));
+
+  //STATIC_CHECK((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not even compile because the conversion is prevented by a static assertion
+  STATIC_CHECK((!internal::is_convertible<Array33f,int>::value ));
+  STATIC_CHECK((!internal::is_convertible<MatrixXf,float>::value ));
+  {
+    float f = 0.0f;
+    MatrixXf A, B;
+    VectorXf a, b;
+    VERIFY(( check_is_convertible(a.dot(b), f) ));
+    VERIFY(( check_is_convertible(a.transpose()*b, f) ));
+    VERIFY((!check_is_convertible(A*B, f) ));
+    VERIFY(( check_is_convertible(A*B, A) ));
+  }
+
+  #if (EIGEN_COMP_GNUC  && EIGEN_COMP_GNUC  <=  99) \
+   || (EIGEN_COMP_CLANG && EIGEN_COMP_CLANG <= 909) \
+   || (EIGEN_COMP_MSVC  && EIGEN_COMP_MSVC  <=1914)
+  // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1752,
+  // basically, a fix in the c++ standard breaks our c++98 implementation
+  // of is_convertible for abstract classes.
+  // So the following tests are expected to fail with recent compilers.
+
+  STATIC_CHECK(( !internal::is_convertible<MyInterface, MyImpl>::value ));
+  #if (!EIGEN_COMP_GNUC_STRICT) || (EIGEN_GNUC_AT_LEAST(4,8))
+  // GCC prior to 4.8 fails to compile this test:
+  // error: cannot allocate an object of abstract type 'MyInterface'
+  // In other word, it does not obey SFINAE.
+  // Nevertheless, we don't really care about supporting abstract type as scalar type!
+  STATIC_CHECK(( !internal::is_convertible<MyImpl, MyInterface>::value ));
+  #endif
+  STATIC_CHECK((  internal::is_convertible<MyImpl, const MyInterface&>::value ));
+
+  #endif
+
+  {
+    int i = 0;
+    VERIFY(( check_is_convertible(fix<3>(), i) ));
+    VERIFY((!check_is_convertible(i, fix<DynamicIndex>()) ));
+  }
+
+
+  VERIFY((  internal::has_ReturnType<FooReturnType>::value ));
+  VERIFY((  internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
+  VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
+  VERIFY(( !internal::has_ReturnType<int>::value ));
+
+  VERIFY(internal::meta_sqrt<1>::ret == 1);
+  #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
+  VERIFY_META_SQRT(2);
+  VERIFY_META_SQRT(3);
+  VERIFY_META_SQRT(4);
+  VERIFY_META_SQRT(5);
+  VERIFY_META_SQRT(6);
+  VERIFY_META_SQRT(8);
+  VERIFY_META_SQRT(9);
+  VERIFY_META_SQRT(15);
+  VERIFY_META_SQRT(16);
+  VERIFY_META_SQRT(17);
+  VERIFY_META_SQRT(255);
+  VERIFY_META_SQRT(256);
+  VERIFY_META_SQRT(257);
+  VERIFY_META_SQRT(1023);
+  VERIFY_META_SQRT(1024);
+  VERIFY_META_SQRT(1025);
+}

diff --git a/test/metis_support.cpp b/test/metis_support.cpp
new file mode 100644
index 0000000..b490dac
--- /dev/null
+++ b/test/metis_support.cpp

@@ -0,0 +1,25 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+#include <Eigen/SparseLU>
+#include <Eigen/MetisSupport>
+#include <unsupported/Eigen/SparseExtra>
+
+template<typename T> void test_metis_T()
+{
+  SparseLU<SparseMatrix<T, ColMajor>, MetisOrdering<int> > sparselu_metis;
+  
+  check_sparse_square_solving(sparselu_metis); 
+}
+
+EIGEN_DECLARE_TEST(metis_support)
+{
+  CALL_SUBTEST_1(test_metis_T<double>());
+}

diff --git a/test/miscmatrices.cpp b/test/miscmatrices.cpp
new file mode 100644
index 0000000..e71712f
--- /dev/null
+++ b/test/miscmatrices.cpp

@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void miscMatrices(const MatrixType& m)
+{
+  /* this test covers the following files:
+     DiagonalMatrix.h Ones.h
+  */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  Index r = internal::random<Index>(0, rows-1), r2 = internal::random<Index>(0, rows-1), c = internal::random<Index>(0, cols-1);
+  VERIFY_IS_APPROX(MatrixType::Ones(rows,cols)(r,c), static_cast<Scalar>(1));
+  MatrixType m1 = MatrixType::Ones(rows,cols);
+  VERIFY_IS_APPROX(m1(r,c), static_cast<Scalar>(1));
+  VectorType v1 = VectorType::Random(rows);
+  v1[0];
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
+  square(v1.asDiagonal());
+  if(r==r2) VERIFY_IS_APPROX(square(r,r2), v1[r]);
+  else VERIFY_IS_MUCH_SMALLER_THAN(square(r,r2), static_cast<Scalar>(1));
+  square = MatrixType::Zero(rows, rows);
+  square.diagonal() = VectorType::Ones(rows);
+  VERIFY_IS_APPROX(square, MatrixType::Identity(rows, rows));
+}
+
+EIGEN_DECLARE_TEST(miscmatrices)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( miscMatrices(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( miscMatrices(Matrix4d()) );
+    CALL_SUBTEST_3( miscMatrices(MatrixXcf(3, 3)) );
+    CALL_SUBTEST_4( miscMatrices(MatrixXi(8, 12)) );
+    CALL_SUBTEST_5( miscMatrices(MatrixXcd(20, 20)) );
+  }
+}

diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
new file mode 100644
index 0000000..d450dbf
--- /dev/null
+++ b/test/mixingtypes.cpp

@@ -0,0 +1,329 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_TEST_PART_7)
+
+#ifndef EIGEN_NO_STATIC_ASSERT
+#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
+#endif
+
+// ignore double-promotion diagnostic for clang and gcc, if we check for static assertion anyway:
+// TODO do the same for MSVC?
+#if defined(__clang__)
+#  if (__clang_major__ * 100 + __clang_minor__) >= 308
+#    pragma clang diagnostic ignored "-Wdouble-promotion"
+#  endif
+#elif defined(__GNUC__)
+  // TODO is there a minimal GCC version for this? At least g++-4.7 seems to be fine with this.
+#  pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#endif
+
+
+
+#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3)
+
+#ifndef EIGEN_DONT_VECTORIZE
+#define EIGEN_DONT_VECTORIZE
+#endif
+
+#endif
+
+static bool g_called;
+#define EIGEN_SCALAR_BINARY_OP_PLUGIN { g_called |= (!internal::is_same<LhsScalar,RhsScalar>::value); }
+
+#include "main.h"
+
+using namespace std;
+
+#define VERIFY_MIX_SCALAR(XPR,REF) \
+  g_called = false; \
+  VERIFY_IS_APPROX(XPR,REF); \
+  VERIFY( g_called && #XPR" not properly optimized");
+
+template<int SizeAtCompileType>
+void raise_assertion(Index size = SizeAtCompileType)
+{
+  // VERIFY_RAISES_ASSERT(mf+md); // does not even compile
+  Matrix<float, SizeAtCompileType, 1> vf; vf.setRandom(size);
+  Matrix<double, SizeAtCompileType, 1> vd; vd.setRandom(size);
+  VERIFY_RAISES_ASSERT(vf=vd);
+  VERIFY_RAISES_ASSERT(vf+=vd);
+  VERIFY_RAISES_ASSERT(vf-=vd);
+  VERIFY_RAISES_ASSERT(vd=vf);
+  VERIFY_RAISES_ASSERT(vd+=vf);
+  VERIFY_RAISES_ASSERT(vd-=vf);
+
+  //   vd.asDiagonal() * mf;    // does not even compile
+  //   vcd.asDiagonal() * mf;   // does not even compile
+
+#if 0 // we get other compilation errors here than just static asserts
+  VERIFY_RAISES_ASSERT(vd.dot(vf));
+#endif
+}
+
+
+template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
+{
+  typedef std::complex<float>   CF;
+  typedef std::complex<double>  CD;
+  typedef Matrix<float, SizeAtCompileType, SizeAtCompileType> Mat_f;
+  typedef Matrix<double, SizeAtCompileType, SizeAtCompileType> Mat_d;
+  typedef Matrix<std::complex<float>, SizeAtCompileType, SizeAtCompileType> Mat_cf;
+  typedef Matrix<std::complex<double>, SizeAtCompileType, SizeAtCompileType> Mat_cd;
+  typedef Matrix<float, SizeAtCompileType, 1> Vec_f;
+  typedef Matrix<double, SizeAtCompileType, 1> Vec_d;
+  typedef Matrix<std::complex<float>, SizeAtCompileType, 1> Vec_cf;
+  typedef Matrix<std::complex<double>, SizeAtCompileType, 1> Vec_cd;
+
+  Mat_f mf    = Mat_f::Random(size,size);
+  Mat_d md    = mf.template cast<double>();
+  //Mat_d rd    = md;
+  Mat_cf mcf  = Mat_cf::Random(size,size);
+  Mat_cd mcd  = mcf.template cast<complex<double> >();
+  Mat_cd rcd = mcd;
+  Vec_f vf    = Vec_f::Random(size,1);
+  Vec_d vd    = vf.template cast<double>();
+  Vec_cf vcf  = Vec_cf::Random(size,1);
+  Vec_cd vcd  = vcf.template cast<complex<double> >();
+  float           sf  = internal::random<float>();
+  double          sd  = internal::random<double>();
+  complex<float>  scf = internal::random<complex<float> >();
+  complex<double> scd = internal::random<complex<double> >();
+
+  mf+mf;
+
+  float  epsf = std::sqrt(std::numeric_limits<float> ::min EIGEN_EMPTY ());
+  double epsd = std::sqrt(std::numeric_limits<double>::min EIGEN_EMPTY ());
+
+  while(std::abs(sf )<epsf) sf  = internal::random<float>();
+  while(std::abs(sd )<epsd) sd  = internal::random<double>();
+  while(std::abs(scf)<epsf) scf = internal::random<CF>();
+  while(std::abs(scd)<epsd) scd = internal::random<CD>();
+
+  // check scalar products
+  VERIFY_MIX_SCALAR(vcf * sf , vcf * complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd * vcd , complex<double>(sd) * vcd);
+  VERIFY_MIX_SCALAR(vf * scf , vf.template cast<complex<float> >() * scf);
+  VERIFY_MIX_SCALAR(scd * vd , scd * vd.template cast<complex<double> >());
+
+  VERIFY_MIX_SCALAR(vcf * 2 , vcf * complex<float>(2));
+  VERIFY_MIX_SCALAR(vcf * 2.1 , vcf * complex<float>(2.1));
+  VERIFY_MIX_SCALAR(2 * vcf, vcf * complex<float>(2));
+  VERIFY_MIX_SCALAR(2.1 * vcf , vcf * complex<float>(2.1));
+
+  // check scalar quotients
+  VERIFY_MIX_SCALAR(vcf / sf , vcf / complex<float>(sf));
+  VERIFY_MIX_SCALAR(vf / scf , vf.template cast<complex<float> >() / scf);
+  VERIFY_MIX_SCALAR(vf.array()  / scf, vf.template cast<complex<float> >().array() / scf);
+  VERIFY_MIX_SCALAR(scd / vd.array() , scd / vd.template cast<complex<double> >().array());
+
+  // check scalar increment
+  VERIFY_MIX_SCALAR(vcf.array() + sf , vcf.array() + complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd  + vcd.array(), complex<double>(sd) + vcd.array());
+  VERIFY_MIX_SCALAR(vf.array()  + scf, vf.template cast<complex<float> >().array() + scf);
+  VERIFY_MIX_SCALAR(scd + vd.array() , scd + vd.template cast<complex<double> >().array());
+
+  // check scalar subtractions
+  VERIFY_MIX_SCALAR(vcf.array() - sf , vcf.array() - complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd  - vcd.array(), complex<double>(sd) - vcd.array());
+  VERIFY_MIX_SCALAR(vf.array()  - scf, vf.template cast<complex<float> >().array() - scf);
+  VERIFY_MIX_SCALAR(scd - vd.array() , scd - vd.template cast<complex<double> >().array());
+
+  // check scalar powers
+  VERIFY_MIX_SCALAR( pow(vcf.array(), sf),        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  VERIFY_MIX_SCALAR( vcf.array().pow(sf) ,        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  VERIFY_MIX_SCALAR( pow(sd, vcd.array()),        Eigen::pow(complex<double>(sd), vcd.array()) );
+  VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast<complex<float> >().array(), scf) );
+  VERIFY_MIX_SCALAR( vf.array().pow(scf) ,        Eigen::pow(vf.template cast<complex<float> >().array(), scf) );
+  VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast<complex<double> >().array()) );
+
+  // check dot product
+  vf.dot(vf);
+  VERIFY_IS_APPROX(vcf.dot(vf), vcf.dot(vf.template cast<complex<float> >()));
+
+  // check diagonal product
+  VERIFY_IS_APPROX(vf.asDiagonal() * mcf, vf.template cast<complex<float> >().asDiagonal() * mcf);
+  VERIFY_IS_APPROX(vcd.asDiagonal() * md, vcd.asDiagonal() * md.template cast<complex<double> >());
+  VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast<complex<float> >().asDiagonal());
+  VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast<complex<double> >() * vcd.asDiagonal());
+
+  // check inner product
+  VERIFY_IS_APPROX((vf.transpose() * vcf).value(), (vf.template cast<complex<float> >().transpose() * vcf).value());
+
+  // check outer product
+  VERIFY_IS_APPROX((vf * vcf.transpose()).eval(), (vf.template cast<complex<float> >() * vcf.transpose()).eval());
+
+  // coeff wise product
+
+  VERIFY_IS_APPROX((vf * vcf.transpose()).eval(), (vf.template cast<complex<float> >() * vcf.transpose()).eval());
+
+  Mat_cd mcd2 = mcd;
+  VERIFY_IS_APPROX(mcd.array() *= md.array(), mcd2.array() *= md.array().template cast<std::complex<double> >());
+  
+  // check matrix-matrix products
+  VERIFY_IS_APPROX(sd*md*mcd, (sd*md).template cast<CD>().eval()*mcd);
+  VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast<CD>());
+  VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast<CD>().eval()*mcd);
+  VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast<CD>());
+
+  VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast<CF>()*mcf);
+  VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast<CF>());
+  VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast<CF>()*mcf);
+  VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast<CF>());
+
+  VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast<CD>().eval().adjoint()*mcd);
+  VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast<CD>());
+  VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast<CD>().eval().adjoint()*mcd.adjoint());
+  VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast<CD>().adjoint());
+  VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast<CD>().eval()*mcd.adjoint());
+  VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast<CD>().adjoint());
+
+  VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast<CF>().eval().adjoint()*mcf);
+  VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast<CF>());
+  VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast<CF>().eval().adjoint()*mcf.adjoint());
+  VERIFY_IS_APPROX(sf*mcf.adjoint()*mf.adjoint(), sf*mcf.adjoint()*mf.template cast<CF>().adjoint());
+  VERIFY_IS_APPROX(sf*mf*mcf.adjoint(), (sf*mf).template cast<CF>().eval()*mcf.adjoint());
+  VERIFY_IS_APPROX(sf*mcf*mf.adjoint(), sf*mcf*mf.template cast<CF>().adjoint());
+
+  VERIFY_IS_APPROX(sf*mf*vcf, (sf*mf).template cast<CF>().eval()*vcf);
+  VERIFY_IS_APPROX(scf*mf*vcf,(scf*mf.template cast<CF>()).eval()*vcf);
+  VERIFY_IS_APPROX(sf*mcf*vf, sf*mcf*vf.template cast<CF>());
+  VERIFY_IS_APPROX(scf*mcf*vf,scf*mcf*vf.template cast<CF>());
+
+  VERIFY_IS_APPROX(sf*vcf.adjoint()*mf,  sf*vcf.adjoint()*mf.template cast<CF>().eval());
+  VERIFY_IS_APPROX(scf*vcf.adjoint()*mf, scf*vcf.adjoint()*mf.template cast<CF>().eval());
+  VERIFY_IS_APPROX(sf*vf.adjoint()*mcf,  sf*vf.adjoint().template cast<CF>().eval()*mcf);
+  VERIFY_IS_APPROX(scf*vf.adjoint()*mcf, scf*vf.adjoint().template cast<CF>().eval()*mcf);
+
+  VERIFY_IS_APPROX(sd*md*vcd, (sd*md).template cast<CD>().eval()*vcd);
+  VERIFY_IS_APPROX(scd*md*vcd,(scd*md.template cast<CD>()).eval()*vcd);
+  VERIFY_IS_APPROX(sd*mcd*vd, sd*mcd*vd.template cast<CD>().eval());
+  VERIFY_IS_APPROX(scd*mcd*vd,scd*mcd*vd.template cast<CD>().eval());
+
+  VERIFY_IS_APPROX(sd*vcd.adjoint()*md,  sd*vcd.adjoint()*md.template cast<CD>().eval());
+  VERIFY_IS_APPROX(scd*vcd.adjoint()*md, scd*vcd.adjoint()*md.template cast<CD>().eval());
+  VERIFY_IS_APPROX(sd*vd.adjoint()*mcd,  sd*vd.adjoint().template cast<CD>().eval()*mcd);
+  VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
+
+  VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
+  VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
+  VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(),  sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>());
+  VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>());
+  VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
+  VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
+  VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>());
+  VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>());
+
+  // Not supported yet: trmm
+//   VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(),  sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());
+//   VERIFY_IS_APPROX(scd*mcd*md.template triangularView<Upper>(), scd*mcd*md.template cast<CD>().eval().template triangularView<Upper>());
+//   VERIFY_IS_APPROX(sd*md*mcd.template triangularView<Lower>(),  sd*md.template cast<CD>().eval()*mcd.template triangularView<Lower>());
+//   VERIFY_IS_APPROX(scd*md*mcd.template triangularView<Upper>(), scd*md.template cast<CD>().eval()*mcd.template triangularView<Upper>());
+
+  // Not supported yet: symv
+//   VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Lower>());
+//   VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Lower>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Lower>());
+//   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+
+  // Not supported yet: symm
+//   VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(),  sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Upper>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Upper>(),  sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+//   VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+
+  rcd.setZero();
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * mcd * md),
+                   Mat_cd((sd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * md * mcd),
+                   Mat_cd((sd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * mcd * md),
+                   Mat_cd((scd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
+  VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * md * mcd),
+                   Mat_cd((scd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
+
+
+  VERIFY_IS_APPROX( md.array()  * mcd.array(), md.template cast<CD>().eval().array() * mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() * md.array(),  mcd.array() * md.template cast<CD>().eval().array() );
+
+  VERIFY_IS_APPROX( md.array()  + mcd.array(), md.template cast<CD>().eval().array() + mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() + md.array(),  mcd.array() + md.template cast<CD>().eval().array() );
+
+  VERIFY_IS_APPROX( md.array()  - mcd.array(), md.template cast<CD>().eval().array() - mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() - md.array(),  mcd.array() - md.template cast<CD>().eval().array() );
+
+  if(mcd.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( md.array() / mcd.array(), md.template cast<CD>().eval().array() / mcd.array() );
+  }
+  if(md.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( mcd.array() / md.array(), mcd.array() / md.template cast<CD>().eval().array() );
+  }
+
+  if(md.array().abs().minCoeff()>epsd || mcd.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( md.array().pow(mcd.array()), md.template cast<CD>().eval().array().pow(mcd.array()) );
+    VERIFY_IS_APPROX( mcd.array().pow(md.array()),  mcd.array().pow(md.template cast<CD>().eval().array()) );
+
+    VERIFY_IS_APPROX( pow(md.array(),mcd.array()), md.template cast<CD>().eval().array().pow(mcd.array()) );
+    VERIFY_IS_APPROX( pow(mcd.array(),md.array()),  mcd.array().pow(md.template cast<CD>().eval().array()) );
+  }
+
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd = md, md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd += md, mcd + md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd -= md, mcd - md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.array() *= md.array(), mcd.array() * md.template cast<CD>().eval().array() );
+  rcd = mcd;
+  if(md.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( rcd.array() /= md.array(), mcd.array() / md.template cast<CD>().eval().array() );
+  }
+
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += md + mcd*md, mcd + (md.template cast<CD>().eval()) + mcd*(md.template cast<CD>().eval()));
+
+  VERIFY_IS_APPROX( rcd.noalias()  = md*md,       ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += md*md, mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() -= md*md, mcd - ((md*md).eval().template cast<CD>()) );
+
+  VERIFY_IS_APPROX( rcd.noalias()  = mcd + md*md,       mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += mcd + md*md, mcd + mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md,           - ((md*md).eval().template cast<CD>()) );
+}
+
+EIGEN_DECLARE_TEST(mixingtypes)
+{
+  g_called = false; // Silence -Wunneeded-internal-declaration.
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(mixingtypes<3>());
+    CALL_SUBTEST_2(mixingtypes<4>());
+    CALL_SUBTEST_3(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+
+    CALL_SUBTEST_4(mixingtypes<3>());
+    CALL_SUBTEST_5(mixingtypes<4>());
+    CALL_SUBTEST_6(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+    CALL_SUBTEST_7(raise_assertion<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+  }
+  CALL_SUBTEST_7(raise_assertion<0>());
+  CALL_SUBTEST_7(raise_assertion<3>());
+  CALL_SUBTEST_7(raise_assertion<4>());
+  CALL_SUBTEST_7(raise_assertion<Dynamic>(0));
+}

diff --git a/test/mpl2only.cpp b/test/mpl2only.cpp
new file mode 100644
index 0000000..296350d
--- /dev/null
+++ b/test/mpl2only.cpp

@@ -0,0 +1,24 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MPL2_ONLY
+#define EIGEN_MPL2_ONLY
+#endif
+#include <Eigen/Dense>
+#include <Eigen/SparseCore>
+#include <Eigen/SparseLU>
+#include <Eigen/SparseQR>
+#include <Eigen/Sparse>
+#include <Eigen/IterativeLinearSolvers>
+#include <Eigen/Eigen>
+
+int main()
+{
+  return 0;
+}

diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp
new file mode 100644
index 0000000..3a86bea
--- /dev/null
+++ b/test/nestbyvalue.cpp

@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+
+typedef NestByValue<MatrixXd> CpyMatrixXd;
+typedef CwiseBinaryOp<internal::scalar_sum_op<double,double>,const CpyMatrixXd,const CpyMatrixXd> XprType;
+
+XprType get_xpr_with_temps(const MatrixXd& a)
+{
+  MatrixXd t1 = a.rowwise().reverse();
+  MatrixXd t2 = a+a;
+  return t1.nestByValue() + t2.nestByValue();
+}
+
+EIGEN_DECLARE_TEST(nestbyvalue)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    Index rows = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+    Index cols = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+    MatrixXd a = MatrixXd::Random(rows,cols);
+    nb_temporaries = 0;
+    XprType x = get_xpr_with_temps(a);
+    VERIFY_IS_EQUAL(nb_temporaries,6);
+    MatrixXd b = x;
+    VERIFY_IS_EQUAL(nb_temporaries,6+1);
+    VERIFY_IS_APPROX(b, a.rowwise().reverse().eval() + (a+a).eval());
+  }
+}

diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp
new file mode 100644
index 0000000..4b5fc21
--- /dev/null
+++ b/test/nesting_ops.cpp

@@ -0,0 +1,107 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+
+template <int N, typename XprType>
+void use_n_times(const XprType &xpr)
+{
+  typename internal::nested_eval<XprType,N>::type mat(xpr);
+  typename XprType::PlainObject res(mat.rows(), mat.cols());
+  nb_temporaries--; // remove res
+  res.setZero();
+  for(int i=0; i<N; ++i)
+    res += mat;
+}
+
+template <int N, typename ReferenceType, typename XprType>
+bool verify_eval_type(const XprType &, const ReferenceType&)
+{
+  typedef typename internal::nested_eval<XprType,N>::type EvalType;
+  return internal::is_same<typename internal::remove_all<EvalType>::type, typename internal::remove_all<ReferenceType>::type>::value;
+}
+
+template <typename MatrixType> void run_nesting_ops_1(const MatrixType& _m)
+{
+  typename internal::nested_eval<MatrixType,2>::type m(_m);
+
+  // Make really sure that we are in debug mode!
+  VERIFY_RAISES_ASSERT(eigen_assert(false));
+
+  // The only intention of these tests is to ensure that this code does
+  // not trigger any asserts or segmentation faults... more to come.
+  VERIFY_IS_APPROX( (m.transpose() * m).diagonal().sum(), (m.transpose() * m).diagonal().sum() );
+  VERIFY_IS_APPROX( (m.transpose() * m).diagonal().array().abs().sum(), (m.transpose() * m).diagonal().array().abs().sum() );
+
+  VERIFY_IS_APPROX( (m.transpose() * m).array().abs().sum(), (m.transpose() * m).array().abs().sum() );
+}
+
+template <typename MatrixType> void run_nesting_ops_2(const MatrixType& _m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  Index rows = _m.rows();
+  Index cols = _m.cols();
+  MatrixType m1 = MatrixType::Random(rows,cols);
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime,ColMajor> m2;
+
+  if((MatrixType::SizeAtCompileTime==Dynamic))
+  {
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(m1 + m1*m1), 1 );
+    VERIFY_EVALUATION_COUNT( use_n_times<10>(m1 + m1*m1), 1 );
+
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(m1.template triangularView<Lower>().solve(m1.col(0))), 1 );
+    VERIFY_EVALUATION_COUNT( use_n_times<10>(m1.template triangularView<Lower>().solve(m1.col(0))), 1 );
+
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(Scalar(2)*m1.template triangularView<Lower>().solve(m1.col(0))), 2 ); // FIXME could be one by applying the scaling in-place on the solve result
+    VERIFY_EVALUATION_COUNT( use_n_times<1>(m1.col(0)+m1.template triangularView<Lower>().solve(m1.col(0))), 2 ); // FIXME could be one by adding m1.col() inplace
+    VERIFY_EVALUATION_COUNT( use_n_times<10>(m1.col(0)+m1.template triangularView<Lower>().solve(m1.col(0))), 2 );
+  }
+
+  {
+    VERIFY( verify_eval_type<10>(m1, m1) );
+    if(!NumTraits<Scalar>::IsComplex)
+    {
+      VERIFY( verify_eval_type<3>(2*m1, 2*m1) );
+      VERIFY( verify_eval_type<4>(2*m1, m1) );
+    }
+    else
+    {
+      VERIFY( verify_eval_type<2>(2*m1, 2*m1) );
+      VERIFY( verify_eval_type<3>(2*m1, m1) );
+    }
+    VERIFY( verify_eval_type<2>(m1+m1, m1+m1) );
+    VERIFY( verify_eval_type<3>(m1+m1, m1) );
+    VERIFY( verify_eval_type<1>(m1*m1.transpose(), m2) );
+    VERIFY( verify_eval_type<1>(m1*(m1+m1).transpose(), m2) );
+    VERIFY( verify_eval_type<2>(m1*m1.transpose(), m2) );
+    VERIFY( verify_eval_type<1>(m1+m1*m1, m1) );
+
+    VERIFY( verify_eval_type<1>(m1.template triangularView<Lower>().solve(m1), m1) );
+    VERIFY( verify_eval_type<1>(m1+m1.template triangularView<Lower>().solve(m1), m1) );
+  }
+}
+
+
+EIGEN_DECLARE_TEST(nesting_ops)
+{
+  CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25)));
+  CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25)));
+  CALL_SUBTEST_3(run_nesting_ops_1(Matrix4f::Random()));
+  CALL_SUBTEST_4(run_nesting_ops_1(Matrix2d::Random()));
+
+  Index s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+  CALL_SUBTEST_1( run_nesting_ops_2(MatrixXf(s,s)) );
+  CALL_SUBTEST_2( run_nesting_ops_2(MatrixXcd(s,s)) );
+  CALL_SUBTEST_3( run_nesting_ops_2(Matrix4f()) );
+  CALL_SUBTEST_4( run_nesting_ops_2(Matrix2d()) );
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
+}

diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
new file mode 100644
index 0000000..cb4c073
--- /dev/null
+++ b/test/nomalloc.cpp

@@ -0,0 +1,228 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// discard stack allocation as that too bypasses malloc
+#define EIGEN_STACK_ALLOCATION_LIMIT 0
+// heap allocation will raise an assert if enabled at runtime
+#define EIGEN_RUNTIME_NO_MALLOC
+
+#include "main.h"
+#include <Eigen/Cholesky>
+#include <Eigen/Eigenvalues>
+#include <Eigen/LU>
+#include <Eigen/QR>
+#include <Eigen/SVD>
+
+template<typename MatrixType> void nomalloc(const MatrixType& m)
+{
+  /* this test check no dynamic memory allocation are issued with fixed-size matrices
+  */
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+
+  Scalar s1 = internal::random<Scalar>();
+
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  VERIFY_IS_APPROX((m1+m2)*s1,              s1*m1+s1*m2);
+  VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c)));
+  VERIFY_IS_APPROX(m1.cwiseProduct(m1.block(0,0,rows,cols)), (m1.array()*m1.array()).matrix());
+  VERIFY_IS_APPROX((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2));
+  
+  m2.col(0).noalias() = m1 * m1.col(0);
+  m2.col(0).noalias() -= m1.adjoint() * m1.col(0);
+  m2.col(0).noalias() -= m1 * m1.row(0).adjoint();
+  m2.col(0).noalias() -= m1.adjoint() * m1.row(0).adjoint();
+
+  m2.row(0).noalias() = m1.row(0) * m1;
+  m2.row(0).noalias() -= m1.row(0) * m1.adjoint();
+  m2.row(0).noalias() -= m1.col(0).adjoint() * m1;
+  m2.row(0).noalias() -= m1.col(0).adjoint() * m1.adjoint();
+  VERIFY_IS_APPROX(m2,m2);
+  
+  m2.col(0).noalias() = m1.template triangularView<Upper>() * m1.col(0);
+  m2.col(0).noalias() -= m1.adjoint().template triangularView<Upper>() * m1.col(0);
+  m2.col(0).noalias() -= m1.template triangularView<Upper>() * m1.row(0).adjoint();
+  m2.col(0).noalias() -= m1.adjoint().template triangularView<Upper>() * m1.row(0).adjoint();
+
+  m2.row(0).noalias() = m1.row(0) * m1.template triangularView<Upper>();
+  m2.row(0).noalias() -= m1.row(0) * m1.adjoint().template triangularView<Upper>();
+  m2.row(0).noalias() -= m1.col(0).adjoint() * m1.template triangularView<Upper>();
+  m2.row(0).noalias() -= m1.col(0).adjoint() * m1.adjoint().template triangularView<Upper>();
+  VERIFY_IS_APPROX(m2,m2);
+  
+  m2.col(0).noalias() = m1.template selfadjointView<Upper>() * m1.col(0);
+  m2.col(0).noalias() -= m1.adjoint().template selfadjointView<Upper>() * m1.col(0);
+  m2.col(0).noalias() -= m1.template selfadjointView<Upper>() * m1.row(0).adjoint();
+  m2.col(0).noalias() -= m1.adjoint().template selfadjointView<Upper>() * m1.row(0).adjoint();
+
+  m2.row(0).noalias() = m1.row(0) * m1.template selfadjointView<Upper>();
+  m2.row(0).noalias() -= m1.row(0) * m1.adjoint().template selfadjointView<Upper>();
+  m2.row(0).noalias() -= m1.col(0).adjoint() * m1.template selfadjointView<Upper>();
+  m2.row(0).noalias() -= m1.col(0).adjoint() * m1.adjoint().template selfadjointView<Upper>();
+  VERIFY_IS_APPROX(m2,m2);
+  
+  m2.template selfadjointView<Lower>().rankUpdate(m1.col(0),-1);
+  m2.template selfadjointView<Upper>().rankUpdate(m1.row(0),-1);
+  m2.template selfadjointView<Lower>().rankUpdate(m1.col(0), m1.col(0)); // rank-2
+
+  // The following fancy matrix-matrix products are not safe yet regarding static allocation
+  m2.template selfadjointView<Lower>().rankUpdate(m1);
+  m2 += m2.template triangularView<Upper>() * m1;
+  m2.template triangularView<Upper>() = m2 * m2;
+  m1 += m1.template selfadjointView<Lower>() * m2;
+  VERIFY_IS_APPROX(m2,m2);
+}
+
+template<typename Scalar>
+void ctms_decompositions()
+{
+  const int maxSize = 16;
+  const int size    = 12;
+
+  typedef Eigen::Matrix<Scalar,
+                        Eigen::Dynamic, Eigen::Dynamic,
+                        0,
+                        maxSize, maxSize> Matrix;
+
+  typedef Eigen::Matrix<Scalar,
+                        Eigen::Dynamic, 1,
+                        0,
+                        maxSize, 1> Vector;
+
+  typedef Eigen::Matrix<std::complex<Scalar>,
+                        Eigen::Dynamic, Eigen::Dynamic,
+                        0,
+                        maxSize, maxSize> ComplexMatrix;
+
+  const Matrix A(Matrix::Random(size, size)), B(Matrix::Random(size, size));
+  Matrix X(size,size);
+  const ComplexMatrix complexA(ComplexMatrix::Random(size, size));
+  const Matrix saA = A.adjoint() * A;
+  const Vector b(Vector::Random(size));
+  Vector x(size);
+
+  // Cholesky module
+  Eigen::LLT<Matrix>  LLT;  LLT.compute(A);
+  X = LLT.solve(B);
+  x = LLT.solve(b);
+  Eigen::LDLT<Matrix> LDLT; LDLT.compute(A);
+  X = LDLT.solve(B);
+  x = LDLT.solve(b);
+
+  // Eigenvalues module
+  Eigen::HessenbergDecomposition<ComplexMatrix> hessDecomp;        hessDecomp.compute(complexA);
+  Eigen::ComplexSchur<ComplexMatrix>            cSchur(size);      cSchur.compute(complexA);
+  Eigen::ComplexEigenSolver<ComplexMatrix>      cEigSolver;        cEigSolver.compute(complexA);
+  Eigen::EigenSolver<Matrix>                    eigSolver;         eigSolver.compute(A);
+  Eigen::SelfAdjointEigenSolver<Matrix>         saEigSolver(size); saEigSolver.compute(saA);
+  Eigen::Tridiagonalization<Matrix>             tridiag;           tridiag.compute(saA);
+
+  // LU module
+  Eigen::PartialPivLU<Matrix> ppLU; ppLU.compute(A);
+  X = ppLU.solve(B);
+  x = ppLU.solve(b);
+  Eigen::FullPivLU<Matrix>    fpLU; fpLU.compute(A);
+  X = fpLU.solve(B);
+  x = fpLU.solve(b);
+
+  // QR module
+  Eigen::HouseholderQR<Matrix>        hQR;  hQR.compute(A);
+  X = hQR.solve(B);
+  x = hQR.solve(b);
+  Eigen::ColPivHouseholderQR<Matrix>  cpQR; cpQR.compute(A);
+  X = cpQR.solve(B);
+  x = cpQR.solve(b);
+  Eigen::FullPivHouseholderQR<Matrix> fpQR; fpQR.compute(A);
+  // FIXME X = fpQR.solve(B);
+  x = fpQR.solve(b);
+
+  // SVD module
+  Eigen::JacobiSVD<Matrix> jSVD; jSVD.compute(A, ComputeFullU | ComputeFullV);
+}
+
+void test_zerosized() {
+  // default constructors:
+  Eigen::MatrixXd A;
+  Eigen::VectorXd v;
+  // explicit zero-sized:
+  Eigen::ArrayXXd A0(0,0);
+  Eigen::ArrayXd v0(0);
+
+  // assigning empty objects to each other:
+  A=A0;
+  v=v0;
+}
+
+template<typename MatrixType> void test_reference(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  enum { Flag          =  MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
+  enum { TransposeFlag = !MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
+  Index rows = m.rows(), cols=m.cols();
+  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Flag         > MatrixX;
+  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, TransposeFlag> MatrixXT;
+  // Dynamic reference:
+  typedef Eigen::Ref<const MatrixX  > Ref;
+  typedef Eigen::Ref<const MatrixXT > RefT;
+
+  Ref r1(m);
+  Ref r2(m.block(rows/3, cols/4, rows/2, cols/2));
+  RefT r3(m.transpose());
+  RefT r4(m.topLeftCorner(rows/2, cols/2).transpose());
+
+  VERIFY_RAISES_ASSERT(RefT r5(m));
+  VERIFY_RAISES_ASSERT(Ref r6(m.transpose()));
+  VERIFY_RAISES_ASSERT(Ref r7(Scalar(2) * m));
+
+  // Copy constructors shall also never malloc
+  Ref r8 = r1;
+  RefT r9 = r3;
+
+  // Initializing from a compatible Ref shall also never malloc
+  Eigen::Ref<const MatrixX, Unaligned, Stride<Dynamic, Dynamic> > r10=r8, r11=m;
+
+  // Initializing from an incompatible Ref will malloc:
+  typedef Eigen::Ref<const MatrixX, Aligned> RefAligned;
+  VERIFY_RAISES_ASSERT(RefAligned r12=r10);
+  VERIFY_RAISES_ASSERT(Ref r13=r10); // r10 has more dynamic strides
+
+}
+
+EIGEN_DECLARE_TEST(nomalloc)
+{
+  // create some dynamic objects
+  Eigen::MatrixXd M1 = MatrixXd::Random(3,3);
+  Ref<const MatrixXd> R1 = 2.0*M1; // Ref requires temporary
+
+  // from here on prohibit malloc:
+  Eigen::internal::set_is_malloc_allowed(false);
+
+  // check that our operator new is indeed called:
+  VERIFY_RAISES_ASSERT(MatrixXd dummy(MatrixXd::Random(3,3)));
+  CALL_SUBTEST_1(nomalloc(Matrix<float, 1, 1>()) );
+  CALL_SUBTEST_2(nomalloc(Matrix4d()) );
+  CALL_SUBTEST_3(nomalloc(Matrix<float,32,32>()) );
+  
+  // Check decomposition modules with dynamic matrices that have a known compile-time max size (ctms)
+  CALL_SUBTEST_4(ctms_decompositions<float>());
+
+  CALL_SUBTEST_5(test_zerosized());
+
+  CALL_SUBTEST_6(test_reference(Matrix<float,32,32>()));
+  CALL_SUBTEST_7(test_reference(R1));
+  CALL_SUBTEST_8(Ref<MatrixXd> R2 = M1.topRows<2>(); test_reference(R2));
+}

diff --git a/test/nullary.cpp b/test/nullary.cpp
new file mode 100644
index 0000000..9b25ea4
--- /dev/null
+++ b/test/nullary.cpp

@@ -0,0 +1,341 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType>
+bool equalsIdentity(const MatrixType& A)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar zero = static_cast<Scalar>(0);
+
+  bool offDiagOK = true;
+  for (Index i = 0; i < A.rows(); ++i) {
+    for (Index j = i+1; j < A.cols(); ++j) {
+      offDiagOK = offDiagOK && (A(i,j) == zero);
+    }
+  }
+  for (Index i = 0; i < A.rows(); ++i) {
+    for (Index j = 0; j < (std::min)(i, A.cols()); ++j) {
+      offDiagOK = offDiagOK && (A(i,j) == zero);
+    }
+  }
+
+  bool diagOK = (A.diagonal().array() == 1).all();
+  return offDiagOK && diagOK;
+
+}
+
+template<typename VectorType>
+void check_extremity_accuracy(const VectorType &v, const typename VectorType::Scalar &low, const typename VectorType::Scalar &high)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename VectorType::RealScalar RealScalar;
+
+  RealScalar prec = internal::is_same<RealScalar,float>::value ? NumTraits<RealScalar>::dummy_precision()*10 : NumTraits<RealScalar>::dummy_precision()/10;
+  Index size = v.size();
+
+  if(size<20)
+    return;
+
+  for (int i=0; i<size; ++i)
+  {
+    if(i<5 || i>size-6)
+    {
+      Scalar ref = (low*RealScalar(size-i-1))/RealScalar(size-1) + (high*RealScalar(i))/RealScalar(size-1);
+      if(std::abs(ref)>1)
+      {
+        if(!internal::isApprox(v(i), ref, prec))
+          std::cout << v(i) << " != " << ref << "  ; relative error: " << std::abs((v(i)-ref)/ref) << "  ; required precision: " << prec << "  ; range: " << low << "," << high << "  ; i: " << i << "\n";
+        VERIFY(internal::isApprox(v(i), (low*RealScalar(size-i-1))/RealScalar(size-1) + (high*RealScalar(i))/RealScalar(size-1), prec));
+      }
+    }
+  }
+}
+
+template<typename VectorType>
+void testVectorType(const VectorType& base)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename VectorType::RealScalar RealScalar;
+
+  const Index size = base.size();
+  
+  Scalar high = internal::random<Scalar>(-500,500);
+  Scalar low = (size == 1 ? high : internal::random<Scalar>(-500,500));
+  if (numext::real(low)>numext::real(high)) std::swap(low,high);
+
+  // check low==high
+  if(internal::random<float>(0.f,1.f)<0.05f)
+    low = high;
+  // check abs(low) >> abs(high)
+  else if(size>2 && std::numeric_limits<RealScalar>::max_exponent10>0 && internal::random<float>(0.f,1.f)<0.1f)
+    low = -internal::random<Scalar>(1,2) * RealScalar(std::pow(RealScalar(10),std::numeric_limits<RealScalar>::max_exponent10/2));
+
+  const Scalar step = ((size == 1) ? 1 : (high-low)/RealScalar(size-1));
+
+  // check whether the result yields what we expect it to do
+  VectorType m(base);
+  m.setLinSpaced(size,low,high);
+
+  if(!NumTraits<Scalar>::IsInteger)
+  {
+    VectorType n(size);
+    for (int i=0; i<size; ++i)
+      n(i) = low+RealScalar(i)*step;
+    VERIFY_IS_APPROX(m,n);
+
+    CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
+  }
+
+  RealScalar range_length = numext::real(high-low);
+  if((!NumTraits<Scalar>::IsInteger) || (range_length>=size && (Index(range_length)%(size-1))==0) || (Index(range_length+1)<size && (size%Index(range_length+1))==0))
+  {
+    VectorType n(size);
+    if((!NumTraits<Scalar>::IsInteger) || (range_length>=size))
+      for (int i=0; i<size; ++i)
+        n(i) = size==1 ? low : (low + ((high-low)*Scalar(i))/RealScalar(size-1));
+    else
+      for (int i=0; i<size; ++i)
+        n(i) = size==1 ? low : low + Scalar((double(range_length+1)*double(i))/double(size));
+    VERIFY_IS_APPROX(m,n);
+
+    // random access version
+    m = VectorType::LinSpaced(size,low,high);
+    VERIFY_IS_APPROX(m,n);
+    VERIFY( internal::isApprox(m(m.size()-1),high) );
+    VERIFY( size==1 || internal::isApprox(m(0),low) );
+    VERIFY_IS_EQUAL(m(m.size()-1) , high);
+    if(!NumTraits<Scalar>::IsInteger)
+      CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
+  }
+
+  VERIFY( numext::real(m(m.size()-1)) <= numext::real(high) );
+  VERIFY( (m.array().real() <= numext::real(high)).all() );
+  VERIFY( (m.array().real() >= numext::real(low)).all() );
+
+
+  VERIFY( numext::real(m(m.size()-1)) >= numext::real(low) );
+  if(size>=1)
+  {
+    VERIFY( internal::isApprox(m(0),low) );
+    VERIFY_IS_EQUAL(m(0) , low);
+  }
+
+  // check whether everything works with row and col major vectors
+  Matrix<Scalar,Dynamic,1> row_vector(size);
+  Matrix<Scalar,1,Dynamic> col_vector(size);
+  row_vector.setLinSpaced(size,low,high);
+  col_vector.setLinSpaced(size,low,high);
+  // when using the extended precision (e.g., FPU) the relative error might exceed 1 bit
+  // when computing the squared sum in isApprox, thus the 2x factor.
+  VERIFY( row_vector.isApprox(col_vector.transpose(), RealScalar(2)*NumTraits<Scalar>::epsilon()));
+
+  Matrix<Scalar,Dynamic,1> size_changer(size+50);
+  size_changer.setLinSpaced(size,low,high);
+  VERIFY( size_changer.size() == size );
+
+  typedef Matrix<Scalar,1,1> ScalarMatrix;
+  ScalarMatrix scalar;
+  scalar.setLinSpaced(1,low,high);
+  VERIFY_IS_APPROX( scalar, ScalarMatrix::Constant(high) );
+  VERIFY_IS_APPROX( ScalarMatrix::LinSpaced(1,low,high), ScalarMatrix::Constant(high) );
+
+  // regression test for bug 526 (linear vectorized transversal)
+  if (size > 1 && (!NumTraits<Scalar>::IsInteger)) {
+    m.tail(size-1).setLinSpaced(low, high);
+    VERIFY_IS_APPROX(m(size-1), high);
+  }
+
+  // regression test for bug 1383 (LinSpaced with empty size/range)
+  {
+    Index n0 = VectorType::SizeAtCompileTime==Dynamic ? 0 : VectorType::SizeAtCompileTime;
+    low = internal::random<Scalar>();
+    m = VectorType::LinSpaced(n0,low,low-RealScalar(1));
+    VERIFY(m.size()==n0);
+
+    if(VectorType::SizeAtCompileTime==Dynamic)
+    {
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,0,Scalar(n0-1)).sum(),Scalar(0));
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-RealScalar(1)).sum(),Scalar(0));
+    }
+
+    m.setLinSpaced(n0,0,Scalar(n0-1));
+    VERIFY(m.size()==n0);
+    m.setLinSpaced(n0,low,low-RealScalar(1));
+    VERIFY(m.size()==n0);
+
+    // empty range only:
+    VERIFY_IS_APPROX(VectorType::LinSpaced(size,low,low),VectorType::Constant(size,low));
+    m.setLinSpaced(size,low,low);
+    VERIFY_IS_APPROX(m,VectorType::Constant(size,low));
+
+    if(NumTraits<Scalar>::IsInteger)
+    {
+      VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,low+Scalar(size-1)), VectorType::LinSpaced(size,low+Scalar(size-1),low).reverse() );
+
+      if(VectorType::SizeAtCompileTime==Dynamic)
+      {
+        // Check negative multiplicator path:
+        for(Index k=1; k<5; ++k)
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,low+Scalar((size-1)*k)), VectorType::LinSpaced(size,low+Scalar((size-1)*k),low).reverse() );
+        // Check negative divisor path:
+        for(Index k=1; k<5; ++k)
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,low+Scalar(size-1)), VectorType::LinSpaced(size*k,low+Scalar(size-1),low).reverse() );
+      }
+    }
+  }
+
+  // test setUnit()
+  if(m.size()>0)
+  {
+    for(Index k=0; k<10; ++k)
+    {
+      Index i = internal::random<Index>(0,m.size()-1);
+      m.setUnit(i);
+      VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) );
+    }
+    if(VectorType::SizeAtCompileTime==Dynamic)
+    {
+      Index i = internal::random<Index>(0,2*m.size()-1);
+      m.setUnit(2*m.size(),i);
+      VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) );
+    }
+  }
+
+}
+
+template<typename MatrixType>
+void testMatrixType(const MatrixType& m)
+{
+  using std::abs;
+  const Index rows = m.rows();
+  const Index cols = m.cols();
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Scalar s1;
+  do {
+    s1 = internal::random<Scalar>();
+  } while(abs(s1)<RealScalar(1e-5) && (!NumTraits<Scalar>::IsInteger));
+
+  MatrixType A;
+  A.setIdentity(rows, cols);
+  VERIFY(equalsIdentity(A));
+  VERIFY(equalsIdentity(MatrixType::Identity(rows, cols)));
+
+
+  A = MatrixType::Constant(rows,cols,s1);
+  Index i = internal::random<Index>(0,rows-1);
+  Index j = internal::random<Index>(0,cols-1);
+  VERIFY_IS_APPROX( MatrixType::Constant(rows,cols,s1)(i,j), s1 );
+  VERIFY_IS_APPROX( MatrixType::Constant(rows,cols,s1).coeff(i,j), s1 );
+  VERIFY_IS_APPROX( A(i,j), s1 );
+}
+
+template<int>
+void bug79()
+{
+  // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
+  VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
+}
+
+template<int>
+void bug1630()
+{
+  Array4d x4 = Array4d::LinSpaced(0.0, 1.0);
+  Array3d x3(Array4d::LinSpaced(0.0, 1.0).head(3));
+  VERIFY_IS_APPROX(x4.head(3), x3);
+}
+
+template<int>
+void nullary_overflow()
+{
+  // Check possible overflow issue
+  int n = 60000;
+  ArrayXi a1(n), a2(n);
+  a1.setLinSpaced(n, 0, n-1);
+  for(int i=0; i<n; ++i)
+    a2(i) = i;
+  VERIFY_IS_APPROX(a1,a2);
+}
+
+template<int>
+void nullary_internal_logic()
+{
+  // check some internal logic
+  VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<double> >::ret ));
+
+  VERIFY(( !internal::has_nullary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY(( !internal::has_unary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY((  internal::has_binary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY(( !internal::functor_has_linear_access<internal::scalar_identity_op<double> >::ret ));
+
+  VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<float> >::value ));
+  VERIFY((  internal::has_unary_operator<internal::linspaced_op<float> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::linspaced_op<float> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<float> >::ret ));
+
+  // Regression unit test for a weird MSVC bug.
+  // Search "nullary_wrapper_workaround_msvc" in CoreEvaluators.h for the details.
+  // See also traits<Ref>::match.
+  {
+    MatrixXf A = MatrixXf::Random(3,3);
+    Ref<const MatrixXf> R = 2.0*A;
+    VERIFY_IS_APPROX(R, A+A);
+
+    Ref<const MatrixXf> R1 = MatrixXf::Random(3,3)+A;
+
+    VectorXi V = VectorXi::Random(3);
+    Ref<const VectorXi> R2 = VectorXi::LinSpaced(3,1,3)+V;
+    VERIFY_IS_APPROX(R2, V+Vector3i(1,2,3));
+
+    VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<float> >::ret ));
+
+    VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<int> >::value ));
+    VERIFY((  internal::has_unary_operator<internal::linspaced_op<int> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::linspaced_op<int> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<int> >::ret ));
+  }
+}
+
+EIGEN_DECLARE_TEST(nullary)
+{
+  CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
+  CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+  CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+  
+  for(int i = 0; i < g_repeat*10; i++) {
+    CALL_SUBTEST_3( testVectorType(VectorXcd(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_5( testVectorType(Vector4d()) );  // regression test for bug 232
+    CALL_SUBTEST_6( testVectorType(Vector3d()) );
+    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_8( testVectorType(Vector3f()) );
+    CALL_SUBTEST_8( testVectorType(Vector4f()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
+
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
+    CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
+  }
+
+  CALL_SUBTEST_6( bug79<0>() );
+  CALL_SUBTEST_6( bug1630<0>() );
+  CALL_SUBTEST_9( nullary_overflow<0>() );
+  CALL_SUBTEST_10( nullary_internal_logic<0>() );
+}

diff --git a/test/num_dimensions.cpp b/test/num_dimensions.cpp
new file mode 100644
index 0000000..7ad7ef6
--- /dev/null
+++ b/test/num_dimensions.cpp

@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/SparseCore>
+
+template<int ExpectedDim,typename Xpr>
+void check_dim(const Xpr& ) {
+  STATIC_CHECK( Xpr::NumDimensions == ExpectedDim );
+}
+
+#if EIGEN_HAS_CXX11
+template<template <typename,int,int> class Object>
+void map_num_dimensions()
+{
+  typedef Object<double, 1, 1> ArrayScalarType;
+  typedef Object<double, 2, 1> ArrayVectorType;
+  typedef Object<double, 1, 2> TransposeArrayVectorType;
+  typedef Object<double, 2, 2> ArrayType;
+  typedef Object<double, Eigen::Dynamic, 1> DynamicArrayVectorType;
+  typedef Object<double, 1, Eigen::Dynamic> DynamicTransposeArrayVectorType;
+  typedef Object<double, Eigen::Dynamic, Eigen::Dynamic> DynamicArrayType;
+
+  STATIC_CHECK(ArrayScalarType::NumDimensions == 0);
+  STATIC_CHECK(ArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(TransposeArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(ArrayType::NumDimensions == 2);
+  STATIC_CHECK(DynamicArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(DynamicTransposeArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(DynamicArrayType::NumDimensions == 2);
+
+  typedef Eigen::Map<ArrayScalarType> ArrayScalarMap;
+  typedef Eigen::Map<ArrayVectorType> ArrayVectorMap;
+  typedef Eigen::Map<TransposeArrayVectorType> TransposeArrayVectorMap;
+  typedef Eigen::Map<ArrayType> ArrayMap;
+  typedef Eigen::Map<DynamicArrayVectorType> DynamicArrayVectorMap;
+  typedef Eigen::Map<DynamicTransposeArrayVectorType> DynamicTransposeArrayVectorMap;
+  typedef Eigen::Map<DynamicArrayType> DynamicArrayMap;
+
+  STATIC_CHECK(ArrayScalarMap::NumDimensions == 0);
+  STATIC_CHECK(ArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(TransposeArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(ArrayMap::NumDimensions == 2);
+  STATIC_CHECK(DynamicArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(DynamicTransposeArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(DynamicArrayMap::NumDimensions == 2);
+}
+
+template<typename Scalar, int Rows, int Cols>
+using TArray = Array<Scalar,Rows,Cols>;
+
+template<typename Scalar, int Rows, int Cols>
+using TMatrix = Matrix<Scalar,Rows,Cols>;
+
+#endif
+
+EIGEN_DECLARE_TEST(num_dimensions)
+{
+  int n = 10;
+  ArrayXXd A(n,n);
+  CALL_SUBTEST( check_dim<2>(A) );
+  CALL_SUBTEST( check_dim<2>(A.block(1,1,2,2)) );
+  CALL_SUBTEST( check_dim<1>(A.col(1)) );
+  CALL_SUBTEST( check_dim<1>(A.row(1)) );
+
+  MatrixXd M(n,n);
+  CALL_SUBTEST( check_dim<0>(M.row(1)*M.col(1)) );
+
+  SparseMatrix<double> S(n,n);
+  CALL_SUBTEST( check_dim<2>(S) );
+  CALL_SUBTEST( check_dim<2>(S.block(1,1,2,2)) );
+  CALL_SUBTEST( check_dim<1>(S.col(1)) );
+  CALL_SUBTEST( check_dim<1>(S.row(1)) );
+
+  SparseVector<double> s(n);
+  CALL_SUBTEST( check_dim<1>(s) );
+  CALL_SUBTEST( check_dim<1>(s.head(2)) );
+  
+
+  #if EIGEN_HAS_CXX11
+  CALL_SUBTEST( map_num_dimensions<TArray>() );
+  CALL_SUBTEST( map_num_dimensions<TMatrix>() );
+  #endif
+}

diff --git a/test/numext.cpp b/test/numext.cpp
new file mode 100644
index 0000000..8a2fde5
--- /dev/null
+++ b/test/numext.cpp

@@ -0,0 +1,275 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename T, typename U>
+bool check_if_equal_or_nans(const T& actual, const U& expected) {
+  return ((actual == expected) || ((numext::isnan)(actual) && (numext::isnan)(expected)));
+}
+
+template<typename T, typename U>
+bool check_if_equal_or_nans(const std::complex<T>& actual, const std::complex<U>& expected) {
+  return check_if_equal_or_nans(numext::real(actual), numext::real(expected))
+         && check_if_equal_or_nans(numext::imag(actual), numext::imag(expected));
+}
+
+template<typename T, typename U>
+bool test_is_equal_or_nans(const T& actual, const U& expected)
+{
+    if (check_if_equal_or_nans(actual, expected)) {
+      return true;
+    }
+
+    // false:
+    std::cerr
+        << "\n    actual   = " << actual
+        << "\n    expected = " << expected << "\n\n";
+    return false;
+}
+
+#define VERIFY_IS_EQUAL_OR_NANS(a, b) VERIFY(test_is_equal_or_nans(a, b))
+
+template<typename T>
+void check_abs() {
+  typedef typename NumTraits<T>::Real Real;
+  Real zero(0);
+
+  if(NumTraits<T>::IsSigned)
+    VERIFY_IS_EQUAL(numext::abs(-T(1)), T(1));
+  VERIFY_IS_EQUAL(numext::abs(T(0)), T(0));
+  VERIFY_IS_EQUAL(numext::abs(T(1)), T(1));
+
+  for(int k=0; k<100; ++k)
+  {
+    T x = internal::random<T>();
+    if(!internal::is_same<T,bool>::value)
+      x = x/Real(2);
+    if(NumTraits<T>::IsSigned)
+    {
+      VERIFY_IS_EQUAL(numext::abs(x), numext::abs(-x));
+      VERIFY( numext::abs(-x) >= zero );
+    }
+    VERIFY( numext::abs(x) >= zero );
+    VERIFY_IS_APPROX( numext::abs2(x), numext::abs2(numext::abs(x)) );
+  }
+}
+
+template<typename T>
+void check_arg() {
+  typedef typename NumTraits<T>::Real Real;
+  VERIFY_IS_EQUAL(numext::abs(T(0)), T(0));
+  VERIFY_IS_EQUAL(numext::abs(T(1)), T(1));
+
+  for(int k=0; k<100; ++k)
+  {
+    T x = internal::random<T>();
+    Real y = numext::arg(x);
+    VERIFY_IS_APPROX( y, std::arg(x) );
+  }
+}
+
+template<typename T>
+struct check_sqrt_impl {
+  static void run() {
+    for (int i=0; i<1000; ++i) {
+      const T x = numext::abs(internal::random<T>());
+      const T sqrtx = numext::sqrt(x);
+      VERIFY_IS_APPROX(sqrtx*sqrtx, x);
+    }
+
+    // Corner cases.
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+    VERIFY_IS_EQUAL(numext::sqrt(zero), zero);
+    VERIFY_IS_EQUAL(numext::sqrt(inf), inf);
+    VERIFY((numext::isnan)(numext::sqrt(nan)));
+    VERIFY((numext::isnan)(numext::sqrt(-one)));
+  }
+};
+
+template<typename T>
+struct check_sqrt_impl<std::complex<T>  > {
+  static void run() {
+    typedef typename std::complex<T> ComplexT;
+
+    for (int i=0; i<1000; ++i) {
+      const ComplexT x = internal::random<ComplexT>();
+      const ComplexT sqrtx = numext::sqrt(x);
+      VERIFY_IS_APPROX(sqrtx*sqrtx, x);
+    }
+
+    // Corner cases.
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    // Set of corner cases from https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+    const int kNumCorners = 20;
+    const ComplexT corners[kNumCorners][2] = {
+      {ComplexT(zero, zero), ComplexT(zero, zero)},
+      {ComplexT(-zero, zero), ComplexT(zero, zero)},
+      {ComplexT(zero, -zero), ComplexT(zero, zero)},
+      {ComplexT(-zero, -zero), ComplexT(zero, zero)},
+      {ComplexT(one, inf), ComplexT(inf, inf)},
+      {ComplexT(nan, inf), ComplexT(inf, inf)},
+      {ComplexT(one, -inf), ComplexT(inf, -inf)},
+      {ComplexT(nan, -inf), ComplexT(inf, -inf)},
+      {ComplexT(-inf, one), ComplexT(zero, inf)},
+      {ComplexT(inf, one), ComplexT(inf, zero)},
+      {ComplexT(-inf, -one), ComplexT(zero, -inf)},
+      {ComplexT(inf, -one), ComplexT(inf, -zero)},
+      {ComplexT(-inf, nan), ComplexT(nan, inf)},
+      {ComplexT(inf, nan), ComplexT(inf, nan)},
+      {ComplexT(zero, nan), ComplexT(nan, nan)},
+      {ComplexT(one, nan), ComplexT(nan, nan)},
+      {ComplexT(nan, zero), ComplexT(nan, nan)},
+      {ComplexT(nan, one), ComplexT(nan, nan)},
+      {ComplexT(nan, -one), ComplexT(nan, nan)},
+      {ComplexT(nan, nan), ComplexT(nan, nan)},
+    };
+
+    for (int i=0; i<kNumCorners; ++i) {
+      const ComplexT& x = corners[i][0];
+      const ComplexT sqrtx = corners[i][1];
+      VERIFY_IS_EQUAL_OR_NANS(numext::sqrt(x), sqrtx);
+    }
+  }
+};
+
+template<typename T>
+void check_sqrt() {
+  check_sqrt_impl<T>::run();
+}
+
+template<typename T>
+struct check_rsqrt_impl {
+  static void run() {
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    for (int i=0; i<1000; ++i) {
+      const T x = numext::abs(internal::random<T>());
+      const T rsqrtx = numext::rsqrt(x);
+      const T invx = one / x;
+      VERIFY_IS_APPROX(rsqrtx*rsqrtx, invx);
+    }
+
+    // Corner cases.
+    VERIFY_IS_EQUAL(numext::rsqrt(zero), inf);
+    VERIFY_IS_EQUAL(numext::rsqrt(inf), zero);
+    VERIFY((numext::isnan)(numext::rsqrt(nan)));
+    VERIFY((numext::isnan)(numext::rsqrt(-one)));
+  }
+};
+
+template<typename T>
+struct check_rsqrt_impl<std::complex<T> > {
+  static void run() {
+    typedef typename std::complex<T> ComplexT;
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    for (int i=0; i<1000; ++i) {
+      const ComplexT x = internal::random<ComplexT>();
+      const ComplexT invx = ComplexT(one, zero) / x;
+      const ComplexT rsqrtx = numext::rsqrt(x);
+      VERIFY_IS_APPROX(rsqrtx*rsqrtx, invx);
+    }
+
+    // GCC and MSVC differ in their treatment of 1/(0 + 0i)
+    //   GCC/clang = (inf, nan)
+    //   MSVC = (nan, nan)
+    // and 1 / (x + inf i)
+    //   GCC/clang = (0, 0)
+    //   MSVC = (nan, nan)
+    #if (EIGEN_COMP_GNUC)
+    {
+      const int kNumCorners = 20;
+      const ComplexT corners[kNumCorners][2] = {
+        // Only consistent across GCC, clang
+        {ComplexT(zero, zero), ComplexT(zero, zero)},
+        {ComplexT(-zero, zero), ComplexT(zero, zero)},
+        {ComplexT(zero, -zero), ComplexT(zero, zero)},
+        {ComplexT(-zero, -zero), ComplexT(zero, zero)},
+        {ComplexT(one, inf), ComplexT(inf, inf)},
+        {ComplexT(nan, inf), ComplexT(inf, inf)},
+        {ComplexT(one, -inf), ComplexT(inf, -inf)},
+        {ComplexT(nan, -inf), ComplexT(inf, -inf)},
+        // Consistent across GCC, clang, MSVC
+        {ComplexT(-inf, one), ComplexT(zero, inf)},
+        {ComplexT(inf, one), ComplexT(inf, zero)},
+        {ComplexT(-inf, -one), ComplexT(zero, -inf)},
+        {ComplexT(inf, -one), ComplexT(inf, -zero)},
+        {ComplexT(-inf, nan), ComplexT(nan, inf)},
+        {ComplexT(inf, nan), ComplexT(inf, nan)},
+        {ComplexT(zero, nan), ComplexT(nan, nan)},
+        {ComplexT(one, nan), ComplexT(nan, nan)},
+        {ComplexT(nan, zero), ComplexT(nan, nan)},
+        {ComplexT(nan, one), ComplexT(nan, nan)},
+        {ComplexT(nan, -one), ComplexT(nan, nan)},
+        {ComplexT(nan, nan), ComplexT(nan, nan)},
+      };
+
+      for (int i=0; i<kNumCorners; ++i) {
+        const ComplexT& x = corners[i][0];
+        const ComplexT rsqrtx = ComplexT(one, zero) / corners[i][1];
+        VERIFY_IS_EQUAL_OR_NANS(numext::rsqrt(x), rsqrtx);
+      }
+    }
+    #endif
+  }
+};
+
+template<typename T>
+void check_rsqrt() {
+  check_rsqrt_impl<T>::run();
+}
+
+EIGEN_DECLARE_TEST(numext) {
+  for(int k=0; k<g_repeat; ++k)
+  {
+    CALL_SUBTEST( check_abs<bool>() );
+    CALL_SUBTEST( check_abs<signed char>() );
+    CALL_SUBTEST( check_abs<unsigned char>() );
+    CALL_SUBTEST( check_abs<short>() );
+    CALL_SUBTEST( check_abs<unsigned short>() );
+    CALL_SUBTEST( check_abs<int>() );
+    CALL_SUBTEST( check_abs<unsigned int>() );
+    CALL_SUBTEST( check_abs<long>() );
+    CALL_SUBTEST( check_abs<unsigned long>() );
+    CALL_SUBTEST( check_abs<half>() );
+    CALL_SUBTEST( check_abs<bfloat16>() );
+    CALL_SUBTEST( check_abs<float>() );
+    CALL_SUBTEST( check_abs<double>() );
+    CALL_SUBTEST( check_abs<long double>() );
+    CALL_SUBTEST( check_abs<std::complex<float> >() );
+    CALL_SUBTEST( check_abs<std::complex<double> >() );
+
+    CALL_SUBTEST( check_arg<std::complex<float> >() );
+    CALL_SUBTEST( check_arg<std::complex<double> >() );
+
+    CALL_SUBTEST( check_sqrt<float>() );
+    CALL_SUBTEST( check_sqrt<double>() );
+    CALL_SUBTEST( check_sqrt<std::complex<float> >() );
+    CALL_SUBTEST( check_sqrt<std::complex<double> >() );
+    
+    CALL_SUBTEST( check_rsqrt<float>() );
+    CALL_SUBTEST( check_rsqrt<double>() );
+    CALL_SUBTEST( check_rsqrt<std::complex<float> >() );
+    CALL_SUBTEST( check_rsqrt<std::complex<double> >() );
+  }
+}

diff --git a/test/packetmath.cpp b/test/packetmath.cpp
new file mode 100644
index 0000000..121ec72
--- /dev/null
+++ b/test/packetmath.cpp

@@ -0,0 +1,1302 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "packetmath_test_shared.h"
+#include "random_without_cast_overflow.h"
+
+template <typename T>
+inline T REF_ADD(const T& a, const T& b) {
+  return a + b;
+}
+template <typename T>
+inline T REF_SUB(const T& a, const T& b) {
+  return a - b;
+}
+template <typename T>
+inline T REF_MUL(const T& a, const T& b) {
+  return a * b;
+}
+template <typename T>
+inline T REF_DIV(const T& a, const T& b) {
+  return a / b;
+}
+template <typename T>
+inline T REF_ABS_DIFF(const T& a, const T& b) {
+  return a > b ? a - b : b - a;
+}
+
+// Specializations for bool.
+template <>
+inline bool REF_ADD(const bool& a, const bool& b) {
+  return a || b;
+}
+template <>
+inline bool REF_SUB(const bool& a, const bool& b) {
+  return a ^ b;
+}
+template <>
+inline bool REF_MUL(const bool& a, const bool& b) {
+  return a && b;
+}
+
+template <typename T>
+inline T REF_FREXP(const T& x, T& exp) {
+  int iexp;
+  EIGEN_USING_STD(frexp)
+  const T out = static_cast<T>(frexp(x, &iexp));
+  exp = static_cast<T>(iexp);
+  return out;
+}
+
+template <typename T>
+inline T REF_LDEXP(const T& x, const T& exp) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<T>(ldexp(x, static_cast<int>(exp)));
+}
+
+// Uses pcast to cast from one array to another.
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct pcast_array;
+
+template <typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct pcast_array<SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
+  typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
+  static void cast(const SrcScalar* src, size_t size, TgtScalar* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    size_t i;
+    for (i = 0; i < size && i + SrcPacketSize <= size; i += TgtPacketSize) {
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(internal::ploadu<SrcPacket>(src + i)));
+    }
+    // Leftovers that cannot be loaded into a packet.
+    for (; i < size; ++i) {
+      dst[i] = static_cast<TgtScalar>(src[i]);
+    }
+  }
+};
+
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_array<SrcPacket, TgtPacket, 2, 1> {
+  static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
+                   typename internal::unpacket_traits<TgtPacket>::type* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    for (size_t i = 0; i < size; i += TgtPacketSize) {
+      SrcPacket a = internal::ploadu<SrcPacket>(src + i);
+      SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b));
+    }
+  }
+};
+
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_array<SrcPacket, TgtPacket, 4, 1> {
+  static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
+                   typename internal::unpacket_traits<TgtPacket>::type* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    for (size_t i = 0; i < size; i += TgtPacketSize) {
+      SrcPacket a = internal::ploadu<SrcPacket>(src + i);
+      SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
+      SrcPacket c = internal::ploadu<SrcPacket>(src + i + 2 * SrcPacketSize);
+      SrcPacket d = internal::ploadu<SrcPacket>(src + i + 3 * SrcPacketSize);
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b, c, d));
+    }
+  }
+};
+
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_array<SrcPacket, TgtPacket, 8, 1> {
+  static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
+                   typename internal::unpacket_traits<TgtPacket>::type* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    for (size_t i = 0; i < size; i += TgtPacketSize) {
+      SrcPacket a = internal::ploadu<SrcPacket>(src + i);
+      SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
+      SrcPacket c = internal::ploadu<SrcPacket>(src + i + 2 * SrcPacketSize);
+      SrcPacket d = internal::ploadu<SrcPacket>(src + i + 3 * SrcPacketSize);
+      SrcPacket e = internal::ploadu<SrcPacket>(src + i + 4 * SrcPacketSize);
+      SrcPacket f = internal::ploadu<SrcPacket>(src + i + 5 * SrcPacketSize);
+      SrcPacket g = internal::ploadu<SrcPacket>(src + i + 6 * SrcPacketSize);
+      SrcPacket h = internal::ploadu<SrcPacket>(src + i + 7 * SrcPacketSize);
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b, c, d, e, f, g, h));
+    }
+  }
+};
+
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio, bool CanCast = false>
+struct test_cast_helper;
+
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, false> {
+  static void run() {}
+};
+
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, true> {
+  static void run() {
+    typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
+    typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    static const int BlockSize = SrcPacketSize * SrcCoeffRatio;
+    eigen_assert(BlockSize == TgtPacketSize * TgtCoeffRatio && "Packet sizes and cast ratios are mismatched.");
+
+    static const int DataSize = 10 * BlockSize;
+    EIGEN_ALIGN_MAX SrcScalar data1[DataSize];
+    EIGEN_ALIGN_MAX TgtScalar data2[DataSize];
+    EIGEN_ALIGN_MAX TgtScalar ref[DataSize];
+
+    // Construct a packet of scalars that will not overflow when casting
+    for (int i = 0; i < DataSize; ++i) {
+      data1[i] = internal::random_without_cast_overflow<SrcScalar, TgtScalar>::value();
+    }
+
+    for (int i = 0; i < DataSize; ++i) {
+      ref[i] = static_cast<const TgtScalar>(data1[i]);
+    }
+
+    pcast_array<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio>::cast(data1, DataSize, data2);
+
+    VERIFY(test::areApprox(ref, data2, DataSize) && "internal::pcast<>");
+  }
+};
+
+template <typename SrcPacket, typename TgtPacket>
+struct test_cast {
+  static void run() {
+    typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
+    typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
+    typedef typename internal::type_casting_traits<SrcScalar, TgtScalar> TypeCastingTraits;
+    static const int SrcCoeffRatio = TypeCastingTraits::SrcCoeffRatio;
+    static const int TgtCoeffRatio = TypeCastingTraits::TgtCoeffRatio;
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    static const bool HasCast =
+        internal::unpacket_traits<SrcPacket>::vectorizable && internal::unpacket_traits<TgtPacket>::vectorizable &&
+        TypeCastingTraits::VectorizedCast && (SrcPacketSize * SrcCoeffRatio == TgtPacketSize * TgtCoeffRatio);
+    test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, HasCast>::run();
+  }
+};
+
+template <typename SrcPacket, typename TgtScalar,
+          typename TgtPacket = typename internal::packet_traits<TgtScalar>::type,
+          bool Vectorized = internal::packet_traits<TgtScalar>::Vectorizable,
+          bool HasHalf = !internal::is_same<typename internal::unpacket_traits<TgtPacket>::half, TgtPacket>::value>
+struct test_cast_runner;
+
+template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
+struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, true, false> {
+  static void run() { test_cast<SrcPacket, TgtPacket>::run(); }
+};
+
+template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
+struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, true, true> {
+  static void run() {
+    test_cast<SrcPacket, TgtPacket>::run();
+    test_cast_runner<SrcPacket, TgtScalar, typename internal::unpacket_traits<TgtPacket>::half>::run();
+  }
+};
+
+template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
+struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, false, false> {
+  static void run() {}
+};
+
+template <typename Scalar, typename Packet, typename EnableIf = void>
+struct packetmath_pcast_ops_runner {
+  static void run() {
+    test_cast_runner<Packet, float>::run();
+    test_cast_runner<Packet, double>::run();
+    test_cast_runner<Packet, int8_t>::run();
+    test_cast_runner<Packet, uint8_t>::run();
+    test_cast_runner<Packet, int16_t>::run();
+    test_cast_runner<Packet, uint16_t>::run();
+    test_cast_runner<Packet, int32_t>::run();
+    test_cast_runner<Packet, uint32_t>::run();
+    test_cast_runner<Packet, int64_t>::run();
+    test_cast_runner<Packet, uint64_t>::run();
+    test_cast_runner<Packet, bool>::run();
+    test_cast_runner<Packet, std::complex<float> >::run();
+    test_cast_runner<Packet, std::complex<double> >::run();
+    test_cast_runner<Packet, half>::run();
+    test_cast_runner<Packet, bfloat16>::run();
+  }
+};
+
+// Only some types support cast from std::complex<>.
+template <typename Scalar, typename Packet>
+struct packetmath_pcast_ops_runner<Scalar, Packet, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> {
+  static void run() {
+    test_cast_runner<Packet, std::complex<float> >::run();
+    test_cast_runner<Packet, std::complex<double> >::run();
+    test_cast_runner<Packet, half>::run();
+    test_cast_runner<Packet, bfloat16>::run();
+  }
+};
+
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>();
+  }
+  CHECK_CWISE1(internal::ptrue, internal::ptrue);
+  CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(i);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
+
+  //Test (-0) == (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops_real() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+
+  //Test (-0) <=/< (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops_notcomplex() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+
+  //Test (-0) <=/< (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+}
+
+// Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path
+// (for some compilers) compute the bitwise and with 0x1 of the results to keep the value in [0,1].
+template<>
+void packetmath_boolean_mask_ops<bool, internal::packet_traits<bool>::type>() {}
+template<>
+void packetmath_boolean_mask_ops_notcomplex<bool, internal::packet_traits<bool>::type>() {}
+
+template <typename Scalar, typename Packet>
+void packetmath_minus_zero_add() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = Scalar(-0.0);
+  }
+  CHECK_CWISE2_IF(internal::packet_traits<Scalar>::HasAdd, REF_ADD, internal::padd);
+}
+
+// Ensure optimization barrier compiles and doesn't modify contents.
+// Only applies to raw types, so will not work for std::complex, Eigen::half
+// or Eigen::bfloat16. For those you would need to refer to an underlying
+// storage element.
+template<typename Packet, typename EnableIf = void>
+struct eigen_optimization_barrier_test {
+  static void run() {}
+};
+
+template<typename Packet>
+struct eigen_optimization_barrier_test<Packet, typename internal::enable_if<
+    !NumTraits<Packet>::IsComplex &&
+    !internal::is_same<Packet, Eigen::half>::value &&
+    !internal::is_same<Packet, Eigen::bfloat16>::value
+  >::type> {
+  static void run() {
+    typedef typename internal::unpacket_traits<Packet>::type Scalar;
+    Scalar s = internal::random<Scalar>();
+    Packet barrier = internal::pset1<Packet>(s);
+    EIGEN_OPTIMIZATION_BARRIER(barrier);
+    eigen_assert(s == internal::pfirst(barrier) && "EIGEN_OPTIMIZATION_BARRIER");
+  }
+};
+
+template <typename Scalar, typename Packet>
+void packetmath() {
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  if (g_first_pass)
+    std::cerr << "=== Testing packet of type '" << typeid(Packet).name() << "' and scalar type '"
+              << typeid(Scalar).name() << "' and size '" << PacketSize << "' ===\n";
+
+  const int max_size = PacketSize > 4 ? PacketSize : 4;
+  const int size = PacketSize * max_size;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar data3[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+  RealScalar refvalue = RealScalar(0);
+
+  eigen_optimization_barrier_test<Packet>::run();
+  eigen_optimization_barrier_test<Scalar>::run();
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    data2[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    refvalue = (std::max)(refvalue, numext::abs(data1[i]));
+  }
+
+  internal::pstore(data2, internal::pload<Packet>(data1));
+  VERIFY(test::areApprox(data1, data2, PacketSize) && "aligned load/store");
+
+  for (int offset = 0; offset < PacketSize; ++offset) {
+    internal::pstore(data2, internal::ploadu<Packet>(data1 + offset));
+    VERIFY(test::areApprox(data1 + offset, data2, PacketSize) && "internal::ploadu");
+  }
+
+  for (int offset = 0; offset < PacketSize; ++offset) {
+    internal::pstoreu(data2 + offset, internal::pload<Packet>(data1));
+    VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
+  }
+
+  if (internal::unpacket_traits<Packet>::masked_load_available) {
+    test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
+    unsigned long long max_umask = (0x1ull << PacketSize);
+
+    for (int offset = 0; offset < PacketSize; ++offset) {
+      for (unsigned long long umask = 0; umask < max_umask; ++umask) {
+        h.store(data2, h.load(data1 + offset, umask));
+        for (int k = 0; k < PacketSize; ++k) data3[k] = ((umask & (0x1ull << k)) >> k) ? data1[k + offset] : Scalar(0);
+        VERIFY(test::areApprox(data3, data2, PacketSize) && "internal::ploadu masked");
+      }
+    }
+  }
+
+  if (internal::unpacket_traits<Packet>::masked_store_available) {
+    test::packet_helper<internal::unpacket_traits<Packet>::masked_store_available, Packet> h;
+    unsigned long long max_umask = (0x1ull << PacketSize);
+
+    for (int offset = 0; offset < PacketSize; ++offset) {
+      for (unsigned long long umask = 0; umask < max_umask; ++umask) {
+        internal::pstore(data2, internal::pset1<Packet>(Scalar(0)));
+        h.store(data2, h.loadu(data1 + offset), umask);
+        for (int k = 0; k < PacketSize; ++k) data3[k] = ((umask & (0x1ull << k)) >> k) ? data1[k + offset] : Scalar(0);
+        VERIFY(test::areApprox(data3, data2, PacketSize) && "internal::pstoreu masked");
+      }
+    }
+  }
+
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
+
+  CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd);
+  CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub);
+  CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul);
+  CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
+
+  if (PacketTraits::HasNegate) CHECK_CWISE1(internal::negate, internal::pnegate);
+  CHECK_CWISE1(numext::conj, internal::pconj);
+
+  for (int offset = 0; offset < 3; ++offset) {
+    for (int i = 0; i < PacketSize; ++i) ref[i] = data1[offset];
+    internal::pstore(data2, internal::pset1<Packet>(data1[offset]));
+    VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pset1");
+  }
+
+  {
+    for (int i = 0; i < PacketSize * 4; ++i) ref[i] = data1[i / PacketSize];
+    Packet A0, A1, A2, A3;
+    internal::pbroadcast4<Packet>(data1, A0, A1, A2, A3);
+    internal::pstore(data2 + 0 * PacketSize, A0);
+    internal::pstore(data2 + 1 * PacketSize, A1);
+    internal::pstore(data2 + 2 * PacketSize, A2);
+    internal::pstore(data2 + 3 * PacketSize, A3);
+    VERIFY(test::areApprox(ref, data2, 4 * PacketSize) && "internal::pbroadcast4");
+  }
+
+  {
+    for (int i = 0; i < PacketSize * 2; ++i) ref[i] = data1[i / PacketSize];
+    Packet A0, A1;
+    internal::pbroadcast2<Packet>(data1, A0, A1);
+    internal::pstore(data2 + 0 * PacketSize, A0);
+    internal::pstore(data2 + 1 * PacketSize, A1);
+    VERIFY(test::areApprox(ref, data2, 2 * PacketSize) && "internal::pbroadcast2");
+  }
+
+  VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload<Packet>(data1))) && "internal::pfirst");
+
+  if (PacketSize > 1) {
+    // apply different offsets to check that ploaddup is robust to unaligned inputs
+    for (int offset = 0; offset < 4; ++offset) {
+      for (int i = 0; i < PacketSize / 2; ++i) ref[2 * i + 0] = ref[2 * i + 1] = data1[offset + i];
+      internal::pstore(data2, internal::ploaddup<Packet>(data1 + offset));
+      VERIFY(test::areApprox(ref, data2, PacketSize) && "ploaddup");
+    }
+  }
+
+  if (PacketSize > 2) {
+    // apply different offsets to check that ploadquad is robust to unaligned inputs
+    for (int offset = 0; offset < 4; ++offset) {
+      for (int i = 0; i < PacketSize / 4; ++i)
+        ref[4 * i + 0] = ref[4 * i + 1] = ref[4 * i + 2] = ref[4 * i + 3] = data1[offset + i];
+      internal::pstore(data2, internal::ploadquad<Packet>(data1 + offset));
+      VERIFY(test::areApprox(ref, data2, PacketSize) && "ploadquad");
+    }
+  }
+
+  ref[0] = Scalar(0);
+  for (int i = 0; i < PacketSize; ++i) ref[0] += data1[i];
+  VERIFY(test::isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");
+
+  if (!internal::is_same<Packet, typename internal::unpacket_traits<Packet>::half>::value) {
+    int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize;
+    for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0);
+    for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i];
+    internal::pstore(data2, internal::predux_half_dowto4(internal::pload<Packet>(data1)));
+    VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4");
+  }
+
+  ref[0] = Scalar(1);
+  for (int i = 0; i < PacketSize; ++i) ref[0] = REF_MUL(ref[0], data1[i]);
+  VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
+
+  for (int i = 0; i < PacketSize; ++i) ref[i] = data1[PacketSize - i - 1];
+  internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
+  VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::preverse");
+
+  internal::PacketBlock<Packet> kernel;
+  for (int i = 0; i < PacketSize; ++i) {
+    kernel.packet[i] = internal::pload<Packet>(data1 + i * PacketSize);
+  }
+  ptranspose(kernel);
+  for (int i = 0; i < PacketSize; ++i) {
+    internal::pstore(data2, kernel.packet[i]);
+    for (int j = 0; j < PacketSize; ++j) {
+      VERIFY(test::isApproxAbs(data2[j], data1[i + j * PacketSize], refvalue) && "ptranspose");
+    }
+  }
+
+  // GeneralBlockPanelKernel also checks PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize>;
+  if (PacketSize > 4 && PacketSize % 4 == 0) {
+    internal::PacketBlock<Packet, PacketSize%4==0?4:PacketSize> kernel2;
+    for (int i = 0; i < 4; ++i) {
+      kernel2.packet[i] = internal::pload<Packet>(data1 + i * PacketSize);
+    }
+    ptranspose(kernel2);
+    int data_counter = 0;
+    for (int i = 0; i < PacketSize; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        data2[data_counter++] = data1[j*PacketSize + i];
+      }
+    }
+    for (int i = 0; i < 4; ++i) {
+      internal::pstore(data3, kernel2.packet[i]);
+      for (int j = 0; j < PacketSize; ++j) {
+        VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose");
+      }
+    }
+  }
+
+  if (PacketTraits::HasBlend) {
+    Packet thenPacket = internal::pload<Packet>(data1);
+    Packet elsePacket = internal::pload<Packet>(data2);
+    EIGEN_ALIGN_MAX internal::Selector<PacketSize> selector;
+    for (int i = 0; i < PacketSize; ++i) {
+      selector.select[i] = i;
+    }
+
+    Packet blend = internal::pblend(selector, thenPacket, elsePacket);
+    EIGEN_ALIGN_MAX Scalar result[size];
+    internal::pstore(result, blend);
+    for (int i = 0; i < PacketSize; ++i) {
+      VERIFY(test::isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue));
+    }
+  }
+
+  {
+    for (int i = 0; i < PacketSize; ++i) {
+      // "if" mask
+      unsigned char v = internal::random<bool>() ? 0xff : 0;
+      char* bytes = (char*)(data1 + i);
+      for (int k = 0; k < int(sizeof(Scalar)); ++k) {
+        bytes[k] = v;
+      }
+      // "then" packet
+      data1[i + PacketSize] = internal::random<Scalar>();
+      // "else" packet
+      data1[i + 2 * PacketSize] = internal::random<Scalar>();
+    }
+    CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
+  }
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>();
+  }
+  CHECK_CWISE1(internal::pzero, internal::pzero);
+  CHECK_CWISE2_IF(true, internal::por, internal::por);
+  CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
+  CHECK_CWISE2_IF(true, internal::pand, internal::pand);
+
+  packetmath_boolean_mask_ops<Scalar, Packet>();
+  packetmath_pcast_ops_runner<Scalar, Packet>::run();
+  packetmath_minus_zero_add<Scalar, Packet>();
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = numext::abs(internal::random<Scalar>());
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
+  CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
+}
+
+// Notice that this definition works for complex types as well.
+// c++11 has std::log2 for real, but not for complex types.
+template <typename Scalar>
+Scalar log2(Scalar x) {
+  return Scalar(EIGEN_LOG2E) * std::log(x);
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_real() {
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  const int size = PacketSize * 4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
+    data2[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
+  }
+
+  if (internal::random<float>(0, 1) < 0.1f) data1[internal::random<int>(0, PacketSize)] = Scalar(0);
+
+  CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
+  CHECK_CWISE1_IF(PacketTraits::HasLog, log2, internal::plog2);
+  CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-3, 3)));
+    data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-3, 3)));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin);
+  CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
+  CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
+
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
+
+  packetmath_boolean_mask_ops_real<Scalar,Packet>();
+  
+  // Rounding edge cases.
+  if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
+    typedef typename internal::make_integer<Scalar>::type IntType;
+    // Start with values that cannot fit inside an integer, work down to less than one.
+    Scalar val = numext::mini(
+        Scalar(2) * static_cast<Scalar>(NumTraits<IntType>::highest()),
+        NumTraits<Scalar>::highest());
+    std::vector<Scalar> values;
+    while (val > Scalar(0.25)) {
+      // Cover both even and odd, positive and negative cases.
+      values.push_back(val);
+      values.push_back(val + Scalar(0.3));
+      values.push_back(val + Scalar(0.5));
+      values.push_back(val + Scalar(0.8));
+      values.push_back(val + Scalar(1));
+      values.push_back(val + Scalar(1.3));
+      values.push_back(val + Scalar(1.5));
+      values.push_back(val + Scalar(1.8));
+      values.push_back(-val);
+      values.push_back(-val - Scalar(0.3));
+      values.push_back(-val - Scalar(0.5));
+      values.push_back(-val - Scalar(0.8));
+      values.push_back(-val - Scalar(1));
+      values.push_back(-val - Scalar(1.3));
+      values.push_back(-val - Scalar(1.5));
+      values.push_back(-val - Scalar(1.8));
+      values.push_back(Scalar(-1.5) + val);  // Bug 1785.
+      val = val / Scalar(2);
+    }
+    values.push_back(NumTraits<Scalar>::infinity());
+    values.push_back(-NumTraits<Scalar>::infinity());
+    values.push_back(NumTraits<Scalar>::quiet_NaN());
+    
+    for (size_t k=0; k<values.size(); ++k) {
+      data1[0] = values[k];
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
+    }
+  }
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1));
+    data2[i] = Scalar(internal::random<double>(-1, 1));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
+  CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-87, 88));
+    data2[i] = Scalar(internal::random<double>(-87, 88));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp);
+  
+  CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+  if (PacketTraits::HasExp) {
+    // Check denormals:
+    for (int j=0; j<3; ++j) {
+      data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
+      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+      data1[0] = -data1[0];
+      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+    }
+    
+    // zero
+    data1[0] = Scalar(0);
+    CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+    
+    // inf and NaN only compare output fraction, not exponent.
+    test::packet_helper<PacketTraits::HasExp,Packet> h;
+    Packet pout;
+    Scalar sout;
+    Scalar special[] = { NumTraits<Scalar>::infinity(), 
+                        -NumTraits<Scalar>::infinity(),
+                         NumTraits<Scalar>::quiet_NaN()};
+    for (int i=0; i<3; ++i) {
+      data1[0] = special[i];
+      ref[0] = Scalar(REF_FREXP(data1[0], ref[PacketSize]));
+      h.store(data2, internal::pfrexp(h.load(data1), h.forward_reference(pout, sout)));
+      VERIFY(test::areApprox(ref, data2, 1) && "internal::pfrexp");
+    }
+  }
+  
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1));
+    data2[i] = Scalar(internal::random<double>(-1, 1));
+  }
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i+PacketSize] = Scalar(internal::random<int>(-4, 4));
+    data2[i+PacketSize] = Scalar(internal::random<double>(-4, 4));
+  }
+  CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+  if (PacketTraits::HasExp) {
+    data1[0] = Scalar(-1);
+    // underflow to zero
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent()-55);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // overflow to inf
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent()+10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // NaN stays NaN
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    VERIFY((numext::isnan)(data2[0]));
+    // inf stays inf
+    data1[0] = NumTraits<Scalar>::infinity();
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent()-10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // zero stays zero
+    data1[0] = Scalar(0);
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent()+10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // Small number big exponent.
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::min_exponent()-1));
+    data1[PacketSize] = Scalar(-NumTraits<Scalar>::min_exponent()
+                               +NumTraits<Scalar>::max_exponent());
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // Big number small exponent.
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::max_exponent()-1));
+    data1[PacketSize] = Scalar(+NumTraits<Scalar>::min_exponent()
+                               -NumTraits<Scalar>::max_exponent());
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+  }
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
+    data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
+  }
+  data1[0] = Scalar(1e-20);
+  CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh);
+  if (PacketTraits::HasExp && PacketSize >= 2) {
+    const Scalar small = NumTraits<Scalar>::epsilon();
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    data1[1] = small;
+    test::packet_helper<PacketTraits::HasExp, Packet> h;
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+    // TODO(rmlarsen): Re-enable for bfloat16.
+    if (!internal::is_same<Scalar, bfloat16>::value) {
+      VERIFY_IS_APPROX(std::exp(small), data2[1]);
+    }
+
+    data1[0] = -small;
+    data1[1] = Scalar(0);
+    h.store(data2, internal::pexp(h.load(data1)));
+    // TODO(rmlarsen): Re-enable for bfloat16.
+    if (!internal::is_same<Scalar, bfloat16>::value) {
+      VERIFY_IS_APPROX(std::exp(-small), data2[0]);
+    }
+    VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]);
+
+    data1[0] = (std::numeric_limits<Scalar>::min)();
+    data1[1] = -(std::numeric_limits<Scalar>::min)();
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_APPROX(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
+    VERIFY_IS_APPROX(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
+
+    data1[0] = std::numeric_limits<Scalar>::denorm_min();
+    data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_APPROX(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+    VERIFY_IS_APPROX(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
+  }
+
+  if (PacketTraits::HasTanh) {
+    // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasTanh, Packet> h;
+    h.store(data2, internal::ptanh(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+
+  if (PacketTraits::HasExp) {
+    internal::scalar_logistic_op<Scalar> logistic;
+    for (int i = 0; i < size; ++i) {
+      data1[i] = Scalar(internal::random<double>(-20, 20));
+    }
+
+    test::packet_helper<PacketTraits::HasExp, Packet> h;
+    h.store(data2, logistic.packetOp(h.load(data1)));
+    for (int i = 0; i < PacketSize; ++i) {
+      VERIFY_IS_APPROX(data2[i], logistic(data1[i]));
+    }
+  }
+
+#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
+  data1[0] = NumTraits<Scalar>::infinity();
+  data1[1] = Scalar(-1);
+  CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p);
+  data1[0] = NumTraits<Scalar>::infinity();
+  data1[1] = -NumTraits<Scalar>::infinity();
+  CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1);
+#endif
+
+  if (PacketSize >= 2) {
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    data1[1] = NumTraits<Scalar>::epsilon();
+    if (PacketTraits::HasLog) {
+      test::packet_helper<PacketTraits::HasLog, Packet> h;
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      // TODO(cantonios): Re-enable for bfloat16.
+      if (!internal::is_same<Scalar, bfloat16>::value) {
+        VERIFY_IS_APPROX(std::log(data1[1]), data2[1]);
+      }
+
+      data1[0] = -NumTraits<Scalar>::epsilon();
+      data1[1] = Scalar(0);
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
+
+      data1[0] = (std::numeric_limits<Scalar>::min)();
+      data1[1] = -(std::numeric_limits<Scalar>::min)();
+      h.store(data2, internal::plog(h.load(data1)));
+      // TODO(cantonios): Re-enable for bfloat16.
+      if (!internal::is_same<Scalar, bfloat16>::value) {
+        VERIFY_IS_APPROX(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+      }
+      VERIFY((numext::isnan)(data2[1]));
+
+      // Note: 32-bit arm always flushes denorms to zero.
+#if !EIGEN_ARCH_ARM
+      if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
+        data1[0] = std::numeric_limits<Scalar>::denorm_min();
+        data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+        h.store(data2, internal::plog(h.load(data1)));
+        // TODO(rmlarsen): Reenable.
+        //        VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+        VERIFY((numext::isnan)(data2[1]));
+      }
+#endif
+
+      data1[0] = Scalar(-1.0f);
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+
+      data1[0] = NumTraits<Scalar>::infinity();
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isinf)(data2[0]));
+    }
+    if (PacketTraits::HasLog1p) {
+      test::packet_helper<PacketTraits::HasLog1p, Packet> h;
+      data1[0] = Scalar(-2);
+      data1[1] = -NumTraits<Scalar>::infinity();
+      h.store(data2, internal::plog1p(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+    }
+    if (PacketTraits::HasSqrt) {
+      test::packet_helper<PacketTraits::HasSqrt, Packet> h;
+      data1[0] = Scalar(-1.0f);
+      if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
+        data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+      } else {
+        data1[1] = -NumTraits<Scalar>::epsilon();
+      }
+      h.store(data2, internal::psqrt(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+    }
+    // TODO(rmlarsen): Re-enable for half and bfloat16.
+    if (PacketTraits::HasCos
+        && !internal::is_same<Scalar, half>::value
+        && !internal::is_same<Scalar, bfloat16>::value) {
+      test::packet_helper<PacketTraits::HasCos, Packet> h;
+      for (Scalar k = Scalar(1); k < Scalar(10000) / NumTraits<Scalar>::epsilon(); k *= Scalar(2)) {
+        for (int k1 = 0; k1 <= 1; ++k1) {
+          data1[0] = Scalar((2 * double(k) + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
+          data1[1] = Scalar((2 * double(k) + 2 + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
+          h.store(data2, internal::pcos(h.load(data1)));
+          h.store(data2 + PacketSize, internal::psin(h.load(data1)));
+          VERIFY(data2[0] <= Scalar(1.) && data2[0] >= Scalar(-1.));
+          VERIFY(data2[1] <= Scalar(1.) && data2[1] >= Scalar(-1.));
+          VERIFY(data2[PacketSize + 0] <= Scalar(1.) && data2[PacketSize + 0] >= Scalar(-1.));
+          VERIFY(data2[PacketSize + 1] <= Scalar(1.) && data2[PacketSize + 1] >= Scalar(-1.));
+
+          VERIFY_IS_APPROX(data2[0], std::cos(data1[0]));
+          VERIFY_IS_APPROX(data2[1], std::cos(data1[1]));
+          VERIFY_IS_APPROX(data2[PacketSize + 0], std::sin(data1[0]));
+          VERIFY_IS_APPROX(data2[PacketSize + 1], std::sin(data1[1]));
+
+          VERIFY_IS_APPROX(numext::abs2(data2[0]) + numext::abs2(data2[PacketSize + 0]), Scalar(1));
+          VERIFY_IS_APPROX(numext::abs2(data2[1]) + numext::abs2(data2[PacketSize + 1]), Scalar(1));
+        }
+      }
+
+      data1[0] = NumTraits<Scalar>::infinity();
+      data1[1] = -NumTraits<Scalar>::infinity();
+      h.store(data2, internal::psin(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+
+      h.store(data2, internal::pcos(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+
+      data1[0] = NumTraits<Scalar>::quiet_NaN();
+      h.store(data2, internal::psin(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      h.store(data2, internal::pcos(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+
+      data1[0] = -Scalar(0.);
+      h.store(data2, internal::psin(h.load(data1)));
+      VERIFY(internal::biteq(data2[0], data1[0]));
+      h.store(data2, internal::pcos(h.load(data1)));
+      VERIFY_IS_EQUAL(data2[0], Scalar(1));
+    }
+  }
+}
+
+#define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = SCALAR(REFOP(static_cast<REFTYPE>(data1[i]))); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+template <typename Scalar>
+Scalar propagate_nan_max(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return a;
+  if ((numext::isnan)(b)) return b;
+  return (numext::maxi)(a,b);
+}
+
+template <typename Scalar>
+Scalar propagate_nan_min(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return a;
+  if ((numext::isnan)(b)) return b;
+  return (numext::mini)(a,b);
+}
+
+template <typename Scalar>
+Scalar propagate_number_max(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return b;
+  if ((numext::isnan)(b)) return a;
+  return (numext::maxi)(a,b);
+}
+
+template <typename Scalar>
+Scalar propagate_number_min(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return b;
+  if ((numext::isnan)(b)) return a;
+  return (numext::mini)(a,b);
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_notcomplex() {
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
+
+  Array<Scalar, Dynamic, 1>::Map(data1, PacketSize * 4).setRandom();
+
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin);
+  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax);
+
+  CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin);
+  CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax);
+
+  CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, internal::pmin<PropagateNumbers>);
+  CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
+  CHECK_CWISE1(numext::abs, internal::pabs);
+  CHECK_CWISE2_IF(PacketTraits::HasAbsDiff, REF_ABS_DIFF, internal::pabsdiff);
+
+  ref[0] = data1[0];
+  for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin(ref[0], data1[i]);
+  VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload<Packet>(data1))) && "internal::predux_min");
+  ref[0] = data1[0];
+  for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax(ref[0], data1[i]);
+  VERIFY(internal::isApprox(ref[0], internal::predux_max(internal::pload<Packet>(data1))) && "internal::predux_max");
+
+  for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0] + Scalar(i);
+  internal::pstore(data2, internal::plset<Packet>(data1[0]));
+  VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::plset");
+
+  {
+    unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
+    // predux_all - not needed yet
+    // for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
+    // VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
+    // for(int k=0; k<PacketSize; ++k)
+    // {
+    //   for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
+    //   VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
+    //   for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
+    // }
+
+    // predux_any
+    for (unsigned int i = 0; i < PacketSize * sizeof(Scalar); ++i) data1_bits[i] = 0x0;
+    VERIFY((!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
+    for (int k = 0; k < PacketSize; ++k) {
+      for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0xff;
+      VERIFY(internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
+      for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0x00;
+    }
+  }
+
+
+  // Test NaN propagation.
+  if (!NumTraits<Scalar>::IsInteger) {
+    // Test reductions with no NaNs.
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin<PropagateNumbers>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))) && "internal::predux_min<PropagateNumbers>");
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin<PropagateNaN>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))) && "internal::predux_min<PropagateNaN>");
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax<PropagateNumbers>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))) && "internal::predux_max<PropagateNumbers>");
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax<PropagateNaN>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))) && "internal::predux_max<PropagateNumbers>");
+    // A single NaN.
+    const size_t index = std::numeric_limits<size_t>::quiet_NaN() % PacketSize;
+    data1[index] = NumTraits<Scalar>::quiet_NaN();
+    VERIFY(PacketSize==1 || !(numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
+    VERIFY(PacketSize==1 || !(numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
+    // All NaNs.
+    for (int i = 0; i < 4 * PacketSize; ++i) data1[i] = NumTraits<Scalar>::quiet_NaN();
+    VERIFY((numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
+
+    // Test NaN propagation for coefficient-wise min and max.
+    for (int i = 0; i < PacketSize; ++i) {
+      data1[i] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
+      data1[i + PacketSize] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
+    }
+    // Note: NaN propagation is implementation defined for pmin/pmax, so we do not test it here.
+    CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, (internal::pmin<PropagateNumbers>));
+    CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
+    CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin<PropagateNaN>));
+    CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax<PropagateNaN>);
+  }
+
+  packetmath_boolean_mask_ops_notcomplex<Scalar, Packet>();
+}
+
+template <typename Scalar, typename Packet, bool ConjLhs, bool ConjRhs>
+void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  internal::conj_if<ConjLhs> cj0;
+  internal::conj_if<ConjRhs> cj1;
+  internal::conj_helper<Scalar, Scalar, ConjLhs, ConjRhs> cj;
+  internal::conj_helper<Packet, Packet, ConjLhs, ConjRhs> pcj;
+
+  for (int i = 0; i < PacketSize; ++i) {
+    ref[i] = cj0(data1[i]) * cj1(data2[i]);
+    VERIFY(internal::isApprox(ref[i], cj.pmul(data1[i], data2[i])) && "conj_helper pmul");
+  }
+  internal::pstore(pval, pcj.pmul(internal::pload<Packet>(data1), internal::pload<Packet>(data2)));
+  VERIFY(test::areApprox(ref, pval, PacketSize) && "conj_helper pmul");
+
+  for (int i = 0; i < PacketSize; ++i) {
+    Scalar tmp = ref[i];
+    ref[i] += cj0(data1[i]) * cj1(data2[i]);
+    VERIFY(internal::isApprox(ref[i], cj.pmadd(data1[i], data2[i], tmp)) && "conj_helper pmadd");
+  }
+  internal::pstore(
+      pval, pcj.pmadd(internal::pload<Packet>(data1), internal::pload<Packet>(data2), internal::pload<Packet>(pval)));
+  VERIFY(test::areApprox(ref, pval, PacketSize) && "conj_helper pmadd");
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_complex() {
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename Scalar::value_type RealScalar;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  const int size = PacketSize * 4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar pval[PacketSize * 4];
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>() * Scalar(1e2);
+    data2[i] = internal::random<Scalar>() * Scalar(1e2);
+  }
+
+  test_conj_helper<Scalar, Packet, false, false>(data1, data2, ref, pval);
+  test_conj_helper<Scalar, Packet, false, true>(data1, data2, ref, pval);
+  test_conj_helper<Scalar, Packet, true, false>(data1, data2, ref, pval);
+  test_conj_helper<Scalar, Packet, true, true>(data1, data2, ref, pval);
+
+  // Test pcplxflip.
+  {
+    for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i]));
+    internal::pstore(pval, internal::pcplxflip(internal::pload<Packet>(data1)));
+    VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip");
+  }
+
+  if (PacketTraits::HasSqrt) {
+    for (int i = 0; i < size; ++i) {
+      data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
+    }
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, size);
+
+    // Test misc. corner cases.
+    const RealScalar zero = RealScalar(0);
+    const RealScalar one = RealScalar(1);
+    const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
+    const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN();
+    data1[0] = Scalar(zero, zero);
+    data1[1] = Scalar(-zero, zero);
+    data1[2] = Scalar(one, zero);
+    data1[3] = Scalar(zero, one);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(-one, zero);
+    data1[1] = Scalar(zero, -one);
+    data1[2] = Scalar(one, one);
+    data1[3] = Scalar(-one, -one);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(inf, zero);
+    data1[1] = Scalar(zero, inf);
+    data1[2] = Scalar(-inf, zero);
+    data1[3] = Scalar(zero, -inf);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(inf, inf);
+    data1[1] = Scalar(-inf, inf);
+    data1[2] = Scalar(inf, -inf);
+    data1[3] = Scalar(-inf, -inf);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(nan, zero);
+    data1[1] = Scalar(zero, nan);
+    data1[2] = Scalar(nan, one);
+    data1[3] = Scalar(one, nan);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(nan, nan);
+    data1[1] = Scalar(inf, nan);
+    data1[2] = Scalar(nan, inf);
+    data1[3] = Scalar(-inf, nan);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+  }
+}
+
+template <typename Scalar, typename Packet>
+void packetmath_scatter_gather() {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize];
+  RealScalar refvalue = RealScalar(0);
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+  }
+
+  int stride = internal::random<int>(1, 20);
+
+  // Buffer of zeros.
+  EIGEN_ALIGN_MAX Scalar buffer[PacketSize * 20] = {};
+
+  Packet packet = internal::pload<Packet>(data1);
+  internal::pscatter<Scalar, Packet>(buffer, packet, stride);
+
+  for (int i = 0; i < PacketSize * 20; ++i) {
+    if ((i % stride) == 0 && i < stride * PacketSize) {
+      VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter");
+    } else {
+      VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter");
+    }
+  }
+
+  for (int i = 0; i < PacketSize * 7; ++i) {
+    buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+  }
+  packet = internal::pgather<Scalar, Packet>(buffer, 7);
+  internal::pstore(data1, packet);
+  for (int i = 0; i < PacketSize; ++i) {
+    VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
+  }
+}
+
+namespace Eigen {
+namespace test {
+
+template <typename Scalar, typename PacketType>
+struct runall<Scalar, PacketType, false, false> {  // i.e. float or double
+  static void run() {
+    packetmath<Scalar, PacketType>();
+    packetmath_scatter_gather<Scalar, PacketType>();
+    packetmath_notcomplex<Scalar, PacketType>();
+    packetmath_real<Scalar, PacketType>();
+  }
+};
+
+template <typename Scalar, typename PacketType>
+struct runall<Scalar, PacketType, false, true> {  // i.e. int
+  static void run() {
+    packetmath<Scalar, PacketType>();
+    packetmath_scatter_gather<Scalar, PacketType>();
+    packetmath_notcomplex<Scalar, PacketType>();
+  }
+};
+
+template <typename Scalar, typename PacketType>
+struct runall<Scalar, PacketType, true, false> {  // i.e. complex
+  static void run() {
+    packetmath<Scalar, PacketType>();
+    packetmath_scatter_gather<Scalar, PacketType>();
+    packetmath_complex<Scalar, PacketType>();
+  }
+};
+
+}  // namespace test
+}  // namespace Eigen
+
+EIGEN_DECLARE_TEST(packetmath) {
+  g_first_pass = true;
+  for (int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(test::runner<float>::run());
+    CALL_SUBTEST_2(test::runner<double>::run());
+    CALL_SUBTEST_3(test::runner<int8_t>::run());
+    CALL_SUBTEST_4(test::runner<uint8_t>::run());
+    CALL_SUBTEST_5(test::runner<int16_t>::run());
+    CALL_SUBTEST_6(test::runner<uint16_t>::run());
+    CALL_SUBTEST_7(test::runner<int32_t>::run());
+    CALL_SUBTEST_8(test::runner<uint32_t>::run());
+    CALL_SUBTEST_9(test::runner<int64_t>::run());
+    CALL_SUBTEST_10(test::runner<uint64_t>::run());
+    CALL_SUBTEST_11(test::runner<std::complex<float> >::run());
+    CALL_SUBTEST_12(test::runner<std::complex<double> >::run());
+    CALL_SUBTEST_13(test::runner<half>::run());
+    CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
+    CALL_SUBTEST_15(test::runner<bfloat16>::run());
+    g_first_pass = false;
+  }
+}

diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
new file mode 100644
index 0000000..8624fe2
--- /dev/null
+++ b/test/packetmath_test_shared.h

@@ -0,0 +1,275 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <typeinfo>
+
+#if defined __GNUC__ && __GNUC__>=6
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+// using namespace Eigen;
+
+bool g_first_pass = true;
+
+namespace Eigen {
+namespace internal {
+
+template<typename T> T negate(const T& x) { return -x; }
+
+template<typename T>
+Map<const Array<unsigned char,sizeof(T),1> >
+bits(const T& x) {
+  return Map<const Array<unsigned char,sizeof(T),1> >(reinterpret_cast<const unsigned char *>(&x));
+}
+
+// The following implement bitwise operations on floating point types
+template<typename T,typename Bits,typename Func>
+T apply_bit_op(Bits a, Bits b, Func f) {
+  Array<unsigned char,sizeof(T),1> data;
+  T res;
+  for(Index i = 0; i < data.size(); ++i)
+    data[i] = f(a[i], b[i]);
+  // Note: The reinterpret_cast works around GCC's class-memaccess warnings:
+  std::memcpy(reinterpret_cast<unsigned char*>(&res), data.data(), sizeof(T));
+  return res;
+}
+
+#define EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,T)             \
+  template<> T EIGEN_CAT(p,OP)(const T& a,const T& b) { \
+    return apply_bit_op<T>(bits(a),bits(b),FUNC);     \
+  }
+
+#define EIGEN_TEST_MAKE_BITWISE(OP,FUNC)                  \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,float)                 \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,double)                \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,half)                  \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,bfloat16)              \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex<float>)   \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex<double>)
+
+EIGEN_TEST_MAKE_BITWISE(xor,std::bit_xor<unsigned char>())
+EIGEN_TEST_MAKE_BITWISE(and,std::bit_and<unsigned char>())
+EIGEN_TEST_MAKE_BITWISE(or, std::bit_or<unsigned char>())
+struct bit_andnot{
+  template<typename T> T
+  operator()(T a, T b) const { return a & (~b); }
+};
+EIGEN_TEST_MAKE_BITWISE(andnot, bit_andnot())
+template<typename T>
+bool biteq(T a, T b) {
+  return (bits(a) == bits(b)).all();
+}
+
+}
+
+namespace test {
+
+// NOTE: we disable inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
+template<typename Scalar> EIGEN_DONT_INLINE
+bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scalar>::Real& refvalue)
+{
+  return internal::isMuchSmallerThan(a-b, refvalue);
+}
+
+template<typename Scalar>
+inline void print_mismatch(const Scalar* ref, const Scalar* vec, int size) {
+  std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(ref,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(vec,size) << "]\n";
+}
+
+template<typename Scalar> bool areApproxAbs(const Scalar* a, const Scalar* b, int size, const typename NumTraits<Scalar>::Real& refvalue)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if (!isApproxAbs(a[i],b[i],refvalue))
+    {
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int size)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if ( a[i]!=b[i] && !internal::isApprox(a[i],b[i]) 
+         && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
+    {
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+template<typename Scalar> bool areEqual(const Scalar* a, const Scalar* b, int size)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if ( (a[i] != b[i]) && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
+    {
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+#define CHECK_CWISE1(REFOP, POP) { \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = REFOP(data1[i]); \
+  internal::pstore(data2, POP(internal::pload<Packet>(data1))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+// Checks component-wise for input of size N. All of data1, data2, and ref
+// should have size at least ceil(N/PacketSize)*PacketSize to avoid memory
+// access errors.
+#define CHECK_CWISE1_N(REFOP, POP, N) { \
+  for (int i=0; i<N; ++i) \
+    ref[i] = REFOP(data1[i]); \
+  for (int j=0; j<N; j+=PacketSize) \
+    internal::pstore(data2 + j, POP(internal::pload<Packet>(data1 + j))); \
+  VERIFY(test::areApprox(ref, data2, N) && #POP); \
+}
+
+template<bool Cond,typename Packet>
+struct packet_helper
+{
+  template<typename T>
+  inline Packet load(const T* from) const { return internal::pload<Packet>(from); }
+
+  template<typename T>
+  inline Packet loadu(const T* from) const { return internal::ploadu<Packet>(from); }
+
+  template<typename T>
+  inline Packet load(const T* from, unsigned long long umask) const { return internal::ploadu<Packet>(from, umask); }
+
+  template<typename T>
+  inline void store(T* to, const Packet& x) const { internal::pstore(to,x); }
+
+  template<typename T>
+  inline void store(T* to, const Packet& x, unsigned long long umask) const { internal::pstoreu(to, x, umask); }
+
+  template<typename T>
+  inline Packet& forward_reference(Packet& packet, T& /*scalar*/) const { return packet; }
+};
+
+template<typename Packet>
+struct packet_helper<false,Packet>
+{
+  template<typename T>
+  inline T load(const T* from) const { return *from; }
+
+  template<typename T>
+  inline T loadu(const T* from) const { return *from; }
+
+  template<typename T>
+  inline T load(const T* from, unsigned long long) const { return *from; }
+
+  template<typename T>
+  inline void store(T* to, const T& x) const { *to = x; }
+
+  template<typename T>
+  inline void store(T* to, const T& x, unsigned long long) const { *to = x; }
+
+  template<typename T>
+  inline T& forward_reference(Packet& /*packet*/, T& scalar) const { return scalar; }
+};
+
+#define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i])); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+#define CHECK_CWISE1_EXACT_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i])); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areEqual(ref, data2, PacketSize) && #POP); \
+}
+
+#define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i], data1[i+PacketSize]));     \
+  h.store(data2, POP(h.load(data1),h.load(data1+PacketSize))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+// One input, one output by reference.
+#define CHECK_CWISE1_BYREF1_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i], ref[i+PacketSize]));     \
+  Packet pout; \
+  Scalar sout; \
+  h.store(data2, POP(h.load(data1), h.forward_reference(pout, sout))); \
+  h.store(data2+PacketSize, h.forward_reference(pout, sout)); \
+  VERIFY(test::areApprox(ref, data2, 2 * PacketSize) && #POP); \
+}
+
+#define CHECK_CWISE3_IF(COND, REFOP, POP) if (COND) {                      \
+  test::packet_helper<COND, Packet> h;                                     \
+  for (int i = 0; i < PacketSize; ++i)                                     \
+    ref[i] = Scalar(REFOP(data1[i], data1[i + PacketSize],                 \
+                          data1[i + 2 * PacketSize]));                     \
+  h.store(data2, POP(h.load(data1), h.load(data1 + PacketSize),            \
+                     h.load(data1 + 2 * PacketSize)));                     \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP);                 \
+}
+
+// Specialize the runall struct in your test file by defining run().
+template<
+  typename Scalar,
+  typename PacketType,
+  bool IsComplex = NumTraits<Scalar>::IsComplex,
+  bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct runall;
+
+template<
+  typename Scalar,
+  typename PacketType = typename internal::packet_traits<Scalar>::type,
+  bool Vectorized = internal::packet_traits<Scalar>::Vectorizable,
+  bool HasHalf = !internal::is_same<typename internal::unpacket_traits<PacketType>::half,PacketType>::value >
+struct runner;
+
+template<typename Scalar,typename PacketType>
+struct runner<Scalar,PacketType,true,true>
+{
+  static void run() {
+    runall<Scalar,PacketType>::run();
+    runner<Scalar,typename internal::unpacket_traits<PacketType>::half>::run();
+  }
+};
+
+template<typename Scalar,typename PacketType>
+struct runner<Scalar,PacketType,true,false>
+{
+  static void run() {
+    runall<Scalar,PacketType>::run();
+  }
+};
+
+template<typename Scalar,typename PacketType>
+struct runner<Scalar,PacketType,false,false>
+{
+  static void run() {
+    runall<Scalar,PacketType>::run();
+  }
+};
+
+}
+}

diff --git a/test/pardiso_support.cpp b/test/pardiso_support.cpp
new file mode 100644
index 0000000..9c16ded
--- /dev/null
+++ b/test/pardiso_support.cpp

@@ -0,0 +1,29 @@
+/* 
+   Intel Copyright (C) ....
+*/
+
+#include "sparse_solver.h"
+#include <Eigen/PardisoSupport>
+
+template<typename T> void test_pardiso_T()
+{
+  PardisoLLT < SparseMatrix<T, RowMajor>, Lower> pardiso_llt_lower;
+  PardisoLLT < SparseMatrix<T, RowMajor>, Upper> pardiso_llt_upper;
+  PardisoLDLT < SparseMatrix<T, RowMajor>, Lower> pardiso_ldlt_lower;
+  PardisoLDLT < SparseMatrix<T, RowMajor>, Upper> pardiso_ldlt_upper;
+  PardisoLU  < SparseMatrix<T, RowMajor> > pardiso_lu;
+
+  check_sparse_spd_solving(pardiso_llt_lower);
+  check_sparse_spd_solving(pardiso_llt_upper);
+  check_sparse_spd_solving(pardiso_ldlt_lower);
+  check_sparse_spd_solving(pardiso_ldlt_upper);
+  check_sparse_square_solving(pardiso_lu);
+}
+
+EIGEN_DECLARE_TEST(pardiso_support)
+{
+  CALL_SUBTEST_1(test_pardiso_T<float>());
+  CALL_SUBTEST_2(test_pardiso_T<double>());
+  CALL_SUBTEST_3(test_pardiso_T< std::complex<float> >());
+  CALL_SUBTEST_4(test_pardiso_T< std::complex<double> >());
+}

diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp
new file mode 100644
index 0000000..9b64417
--- /dev/null
+++ b/test/pastix_support.cpp

@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+#include <Eigen/PaStiXSupport>
+#include <unsupported/Eigen/SparseExtra>
+
+
+template<typename T> void test_pastix_T()
+{
+  PastixLLT< SparseMatrix<T, ColMajor>, Eigen::Lower > pastix_llt_lower;
+  PastixLDLT< SparseMatrix<T, ColMajor>, Eigen::Lower > pastix_ldlt_lower;
+  PastixLLT< SparseMatrix<T, ColMajor>, Eigen::Upper > pastix_llt_upper;
+  PastixLDLT< SparseMatrix<T, ColMajor>, Eigen::Upper > pastix_ldlt_upper;
+  PastixLU< SparseMatrix<T, ColMajor> > pastix_lu;
+
+  check_sparse_spd_solving(pastix_llt_lower);
+  check_sparse_spd_solving(pastix_ldlt_lower);
+  check_sparse_spd_solving(pastix_llt_upper);
+  check_sparse_spd_solving(pastix_ldlt_upper);
+  check_sparse_square_solving(pastix_lu);
+
+  // Some compilation check:
+  pastix_llt_lower.iparm();
+  pastix_llt_lower.dparm();
+  pastix_ldlt_lower.iparm();
+  pastix_ldlt_lower.dparm();
+  pastix_lu.iparm();
+  pastix_lu.dparm();
+}
+
+// There is no support for selfadjoint matrices with PaStiX. 
+// Complex symmetric matrices should pass though
+template<typename T> void test_pastix_T_LU()
+{
+  PastixLU< SparseMatrix<T, ColMajor> > pastix_lu;
+  check_sparse_square_solving(pastix_lu);
+}
+
+EIGEN_DECLARE_TEST(pastix_support)
+{
+  CALL_SUBTEST_1(test_pastix_T<float>());
+  CALL_SUBTEST_2(test_pastix_T<double>());
+  CALL_SUBTEST_3( (test_pastix_T_LU<std::complex<float> >()) );
+  CALL_SUBTEST_4(test_pastix_T_LU<std::complex<double> >());
+} 

diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp
new file mode 100644
index 0000000..d4b68b2
--- /dev/null
+++ b/test/permutationmatrices.cpp

@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+  
+#include "main.h"
+
+using namespace std;
+template<typename MatrixType> void permutationmatrices(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime,
+         Options = MatrixType::Options };
+  typedef PermutationMatrix<Rows> LeftPermutationType;
+  typedef Transpositions<Rows> LeftTranspositionsType;
+  typedef Matrix<int, Rows, 1> LeftPermutationVectorType;
+  typedef Map<LeftPermutationType> MapLeftPerm;
+  typedef PermutationMatrix<Cols> RightPermutationType;
+  typedef Transpositions<Cols> RightTranspositionsType;
+  typedef Matrix<int, Cols, 1> RightPermutationVectorType;
+  typedef Map<RightPermutationType> MapRightPerm;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m_original = MatrixType::Random(rows,cols);
+  LeftPermutationVectorType lv;
+  randomPermutationVector(lv, rows);
+  LeftPermutationType lp(lv);
+  RightPermutationVectorType rv;
+  randomPermutationVector(rv, cols);
+  RightPermutationType rp(rv);
+  LeftTranspositionsType lt(lv);
+  RightTranspositionsType rt(rv);
+  MatrixType m_permuted = MatrixType::Random(rows,cols);
+  
+  VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original"
+
+  for (int i=0; i<rows; i++)
+    for (int j=0; j<cols; j++)
+        VERIFY_IS_APPROX(m_permuted(lv(i),j), m_original(i,rv(j)));
+
+  Matrix<Scalar,Rows,Rows> lm(lp);
+  Matrix<Scalar,Cols,Cols> rm(rp);
+
+  VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);
+  
+  m_permuted = m_original;
+  VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, 1);
+  VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);
+
+  LeftPermutationType lpi;
+  lpi = lp.inverse();
+  VERIFY_IS_APPROX(lpi*m_permuted,lp.inverse()*m_permuted);
+
+  VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original);
+  VERIFY_IS_APPROX(lv.asPermutation().inverse()*m_permuted*rv.asPermutation().inverse(), m_original);
+  VERIFY_IS_APPROX(MapLeftPerm(lv.data(),lv.size()).inverse()*m_permuted*MapRightPerm(rv.data(),rv.size()).inverse(), m_original);
+  
+  VERIFY((lp*lp.inverse()).toDenseMatrix().isIdentity());
+  VERIFY((lv.asPermutation()*lv.asPermutation().inverse()).toDenseMatrix().isIdentity());
+  VERIFY((MapLeftPerm(lv.data(),lv.size())*MapLeftPerm(lv.data(),lv.size()).inverse()).toDenseMatrix().isIdentity());
+
+  LeftPermutationVectorType lv2;
+  randomPermutationVector(lv2, rows);
+  LeftPermutationType lp2(lv2);
+  Matrix<Scalar,Rows,Rows> lm2(lp2);
+  VERIFY_IS_APPROX((lp*lp2).toDenseMatrix().template cast<Scalar>(), lm*lm2);
+  VERIFY_IS_APPROX((lv.asPermutation()*lv2.asPermutation()).toDenseMatrix().template cast<Scalar>(), lm*lm2);
+  VERIFY_IS_APPROX((MapLeftPerm(lv.data(),lv.size())*MapLeftPerm(lv2.data(),lv2.size())).toDenseMatrix().template cast<Scalar>(), lm*lm2);
+
+  LeftPermutationType identityp;
+  identityp.setIdentity(rows);
+  VERIFY_IS_APPROX(m_original, identityp*m_original);
+  
+  // check inplace permutations
+  m_permuted = m_original;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, 1); // 1 temp to allocate the mask
+  VERIFY_IS_APPROX(m_permuted, lp.inverse()*m_original);
+  
+  m_permuted = m_original;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), 1); // 1 temp to allocate the mask
+  VERIFY_IS_APPROX(m_permuted, m_original*rp.inverse());
+  
+  m_permuted = m_original;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, 1); // 1 temp to allocate the mask
+  VERIFY_IS_APPROX(m_permuted, lp*m_original);
+  
+  m_permuted = m_original;
+  VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, 1); // 1 temp to allocate the mask
+  VERIFY_IS_APPROX(m_permuted, m_original*rp);
+
+  if(rows>1 && cols>1)
+  {
+    lp2 = lp;
+    Index i = internal::random<Index>(0, rows-1);
+    Index j;
+    do j = internal::random<Index>(0, rows-1); while(j==i);
+    lp2.applyTranspositionOnTheLeft(i, j);
+    lm = lp;
+    lm.row(i).swap(lm.row(j));
+    VERIFY_IS_APPROX(lm, lp2.toDenseMatrix().template cast<Scalar>());
+
+    RightPermutationType rp2 = rp;
+    i = internal::random<Index>(0, cols-1);
+    do j = internal::random<Index>(0, cols-1); while(j==i);
+    rp2.applyTranspositionOnTheRight(i, j);
+    rm = rp;
+    rm.col(i).swap(rm.col(j));
+    VERIFY_IS_APPROX(rm, rp2.toDenseMatrix().template cast<Scalar>());
+  }
+
+  {
+    // simple compilation check
+    Matrix<Scalar, Cols, Cols> A = rp;
+    Matrix<Scalar, Cols, Cols> B = rp.transpose();
+    VERIFY_IS_APPROX(A, B.transpose());
+  }
+
+  m_permuted = m_original;
+  lp = lt;
+  rp = rt;
+  VERIFY_EVALUATION_COUNT(m_permuted = lt * m_permuted * rt, 1);
+  VERIFY_IS_APPROX(m_permuted, lp*m_original*rp.transpose());
+  
+  VERIFY_IS_APPROX(lt.inverse()*m_permuted*rt.inverse(), m_original);
+
+  // Check inplace transpositions
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = lt * m_permuted, lp * m_original);
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = lt.inverse() * m_permuted, lp.inverse() * m_original);
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = m_permuted * rt, m_original * rt);
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = m_permuted * rt.inverse(), m_original * rt.inverse());
+}
+
+template<typename T>
+void bug890()
+{
+  typedef Matrix<T, Dynamic, Dynamic> MatrixType;
+  typedef Matrix<T, Dynamic, 1> VectorType;
+  typedef Stride<Dynamic,Dynamic> S;
+  typedef Map<MatrixType, Aligned, S> MapType;
+  typedef PermutationMatrix<Dynamic> Perm;
+  
+  VectorType v1(2), v2(2), op(4), rhs(2);
+  v1 << 666,667;
+  op << 1,0,0,1;
+  rhs << 42,42;
+  
+  Perm P(2);
+  P.indices() << 1, 0;
+
+  MapType(v1.data(),2,1,S(1,1)) = P * MapType(rhs.data(),2,1,S(1,1));
+  VERIFY_IS_APPROX(v1, (P * rhs).eval());
+
+  MapType(v1.data(),2,1,S(1,1)) = P.inverse() * MapType(rhs.data(),2,1,S(1,1));
+  VERIFY_IS_APPROX(v1, (P.inverse() * rhs).eval());
+}
+
+EIGEN_DECLARE_TEST(permutationmatrices)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( permutationmatrices(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( permutationmatrices(Matrix3f()) );
+    CALL_SUBTEST_3( permutationmatrices(Matrix<double,3,3,RowMajor>()) );
+    CALL_SUBTEST_4( permutationmatrices(Matrix4d()) );
+    CALL_SUBTEST_5( permutationmatrices(Matrix<double,40,60>()) );
+    CALL_SUBTEST_6( permutationmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( permutationmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  CALL_SUBTEST_5( bug890<double>() );
+}

diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
new file mode 100644
index 0000000..86f0571
--- /dev/null
+++ b/test/prec_inverse_4x4.cpp

@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+#include <algorithm>
+
+template<typename MatrixType> void inverse_permutation_4x4()
+{
+  typedef typename MatrixType::Scalar Scalar;
+  Vector4i indices(0,1,2,3);
+  for(int i = 0; i < 24; ++i)
+  {
+    MatrixType m = PermutationMatrix<4>(indices);
+    MatrixType inv = m.inverse();
+    double error = double( (m*inv-MatrixType::Identity()).norm() / NumTraits<Scalar>::epsilon() );
+    EIGEN_DEBUG_VAR(error)
+    VERIFY(error == 0.0);
+    std::next_permutation(indices.data(),indices.data()+4);
+  }
+}
+
+template<typename MatrixType> void inverse_general_4x4(int repeat)
+{
+  using std::abs;
+  typedef typename MatrixType::Scalar Scalar;
+  double error_sum = 0., error_max = 0.;
+  for(int i = 0; i < repeat; ++i)
+  {
+    MatrixType m;
+    bool is_invertible;
+    do {
+      m = MatrixType::Random();
+      is_invertible = Eigen::FullPivLU<MatrixType>(m).isInvertible();
+    } while(!is_invertible);
+    MatrixType inv = m.inverse();
+    double error = double( (m*inv-MatrixType::Identity()).norm());
+    error_sum += error;
+    error_max = (std::max)(error_max, error);
+  }
+  std::cerr << "inverse_general_4x4, Scalar = " << type_name<Scalar>() << std::endl;
+  double error_avg = error_sum / repeat;
+  EIGEN_DEBUG_VAR(error_avg);
+  EIGEN_DEBUG_VAR(error_max);
+   // FIXME that 1.25 used to be a 1.0 until the NumTraits changes on 28 April 2010, what's going wrong??
+   // FIXME that 1.25 used to be 1.2 until we tested gcc 4.1 on 30 June 2010 and got 1.21.
+  VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.25));
+  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
+
+  {
+    int s = 5;//internal::random<int>(4,10);
+    int i = 0;//internal::random<int>(0,s-4);
+    int j = 0;//internal::random<int>(0,s-4);
+    Matrix<Scalar,5,5> mat(s,s);
+    mat.setRandom();
+    MatrixType submat = mat.template block<4,4>(i,j);
+    MatrixType mat_inv = mat.template block<4,4>(i,j).inverse();
+    VERIFY_IS_APPROX(mat_inv, submat.inverse());
+    mat.template block<4,4>(i,j) = submat.inverse();
+    VERIFY_IS_APPROX(mat_inv, (mat.template block<4,4>(i,j)));
+  }
+}
+
+EIGEN_DECLARE_TEST(prec_inverse_4x4)
+{
+  CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
+  CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
+  CALL_SUBTEST_1(( inverse_general_4x4<Matrix<float,4,4,RowMajor> >(200000 * g_repeat) ));
+
+  CALL_SUBTEST_2((inverse_permutation_4x4<Matrix<double,4,4,RowMajor> >()));
+  CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,ColMajor> >(200000 * g_repeat) ));
+  CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,RowMajor> >(200000 * g_repeat) ));
+
+  CALL_SUBTEST_3((inverse_permutation_4x4<Matrix4cf>()));
+  CALL_SUBTEST_3((inverse_general_4x4<Matrix4cf>(50000 * g_repeat)));
+}

diff --git a/test/product.h b/test/product.h
new file mode 100644
index 0000000..c6c78fb
--- /dev/null
+++ b/test/product.h

@@ -0,0 +1,259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/QR>
+
+template<typename Derived1, typename Derived2>
+bool areNotApprox(const MatrixBase<Derived1>& m1, const MatrixBase<Derived2>& m2, typename Derived1::RealScalar epsilon = NumTraits<typename Derived1::RealScalar>::dummy_precision())
+{
+  return !((m1-m2).cwiseAbs2().maxCoeff() < epsilon * epsilon
+                          * (std::max)(m1.cwiseAbs2().maxCoeff(), m2.cwiseAbs2().maxCoeff()));
+}
+
+template<typename MatrixType> void product(const MatrixType& m)
+{
+  /* this test covers the following files:
+     Identity.h Product.h
+  */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RowVectorType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ColVectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> RowSquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> ColSquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
+                         MatrixType::Flags&RowMajorBit?ColMajor:RowMajor> OtherMajorMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  // this test relies a lot on Random.h, and there's not much more that we can do
+  // to test it, hence I consider that we will have tested Random.h
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+  RowSquareMatrixType
+             identity = RowSquareMatrixType::Identity(rows, rows),
+             square = RowSquareMatrixType::Random(rows, rows),
+             res = RowSquareMatrixType::Random(rows, rows);
+  ColSquareMatrixType
+             square2 = ColSquareMatrixType::Random(cols, cols),
+             res2 = ColSquareMatrixType::Random(cols, cols);
+  RowVectorType v1 = RowVectorType::Random(rows);
+  ColVectorType vc2 = ColVectorType::Random(cols), vcres(cols);
+  OtherMajorMatrixType tm1 = m1;
+
+  Scalar s1 = internal::random<Scalar>();
+
+  Index r  = internal::random<Index>(0, rows-1),
+        c  = internal::random<Index>(0, cols-1),
+        c2 = internal::random<Index>(0, cols-1);
+
+  // begin testing Product.h: only associativity for now
+  // (we use Transpose.h but this doesn't count as a test for it)
+  VERIFY_IS_APPROX((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2));
+  m3 = m1;
+  m3 *= m1.transpose() * m2;
+  VERIFY_IS_APPROX(m3,                      m1 * (m1.transpose()*m2));
+  VERIFY_IS_APPROX(m3,                      m1 * (m1.transpose()*m2));
+
+  // continue testing Product.h: distributivity
+  VERIFY_IS_APPROX(square*(m1 + m2),        square*m1+square*m2);
+  VERIFY_IS_APPROX(square*(m1 - m2),        square*m1-square*m2);
+
+  // continue testing Product.h: compatibility with ScalarMultiple.h
+  VERIFY_IS_APPROX(s1*(square*m1),          (s1*square)*m1);
+  VERIFY_IS_APPROX(s1*(square*m1),          square*(m1*s1));
+
+  // test Product.h together with Identity.h
+  VERIFY_IS_APPROX(v1,                      identity*v1);
+  VERIFY_IS_APPROX(v1.transpose(),          v1.transpose() * identity);
+  // again, test operator() to check const-qualification
+  VERIFY_IS_APPROX(MatrixType::Identity(rows, cols)(r,c), static_cast<Scalar>(r==c));
+
+  if (rows!=cols)
+     VERIFY_RAISES_ASSERT(m3 = m1*m1);
+
+  // test the previous tests were not screwed up because operator* returns 0
+  // (we use the more accurate default epsilon)
+  if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
+  {
+    VERIFY(areNotApprox(m1.transpose()*m2,m2.transpose()*m1));
+  }
+
+  // test optimized operator+= path
+  res = square;
+  res.noalias() += m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
+  if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
+  {
+    VERIFY(areNotApprox(res,square + m2 * m1.transpose()));
+  }
+  vcres = vc2;
+  vcres.noalias() += m1.transpose() * v1;
+  VERIFY_IS_APPROX(vcres, vc2 + m1.transpose() * v1);
+
+  // test optimized operator-= path
+  res = square;
+  res.noalias() -= m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - (m1 * m2.transpose()));
+  if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
+  {
+    VERIFY(areNotApprox(res,square - m2 * m1.transpose()));
+  }
+  vcres = vc2;
+  vcres.noalias() -= m1.transpose() * v1;
+  VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1);
+
+  // test scaled products
+  res = square;
+  res.noalias() = s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose()));
+  res = square;
+  res.noalias() += s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose()));
+  res = square;
+  res.noalias() -= s1 * m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose()));
+
+  // test d ?= a+b*c rules
+  res.noalias() = square + m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
+  res.noalias() += square + m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, 2*(square + m1 * m2.transpose()));
+  res.noalias() -= square + m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
+
+  // test d ?= a-b*c rules
+  res.noalias() = square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
+  res.noalias() += square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, 2*(square - m1 * m2.transpose()));
+  res.noalias() -= square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
+
+
+  tm1 = m1;
+  VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1);
+  VERIFY_IS_APPROX(v1.transpose() * tm1, v1.transpose() * m1);
+
+  // test submatrix and matrix/vector product
+  for (int i=0; i<rows; ++i)
+    res.row(i) = m1.row(i) * m2.transpose();
+  VERIFY_IS_APPROX(res, m1 * m2.transpose());
+  // the other way round:
+  for (int i=0; i<rows; ++i)
+    res.col(i) = m1 * m2.transpose().col(i);
+  VERIFY_IS_APPROX(res, m1 * m2.transpose());
+
+  res2 = square2;
+  res2.noalias() += m1.transpose() * m2;
+  VERIFY_IS_APPROX(res2, square2 + m1.transpose() * m2);
+  if (!NumTraits<Scalar>::IsInteger && (std::min)(rows,cols)>1)
+  {
+    VERIFY(areNotApprox(res2,square2 + m2.transpose() * m1));
+  }
+
+  VERIFY_IS_APPROX(res.col(r).noalias() = square.adjoint() * square.col(r), (square.adjoint() * square.col(r)).eval());
+  VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval());
+
+  // vector at runtime (see bug 1166)
+  {
+    RowSquareMatrixType ref(square);
+    ColSquareMatrixType ref2(square2);
+    ref = res = square;
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.col(0).transpose() * square.transpose(),            (ref.row(0) = m1.col(0).transpose() * square.transpose()));
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.block(0,0,rows,1).transpose() * square.transpose(), (ref.row(0) = m1.col(0).transpose() * square.transpose()));
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.col(0).transpose() * square,                        (ref.row(0) = m1.col(0).transpose() * square));
+    VERIFY_IS_APPROX(res.block(0,0,1,rows).noalias() = m1.block(0,0,rows,1).transpose() * square,             (ref.row(0) = m1.col(0).transpose() * square));
+    ref2 = res2 = square2;
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.row(0) * square2.transpose(),                      (ref2.row(0) = m1.row(0) * square2.transpose()));
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2.transpose(),           (ref2.row(0) = m1.row(0) * square2.transpose()));
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.row(0) * square2,                                  (ref2.row(0) = m1.row(0) * square2));
+    VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2,                       (ref2.row(0) = m1.row(0) * square2));
+  }
+
+  // vector.block() (see bug 1283)
+  {
+    RowVectorType w1(rows);
+    VERIFY_IS_APPROX(square * v1.block(0,0,rows,1), square * v1);
+    VERIFY_IS_APPROX(w1.noalias() = square * v1.block(0,0,rows,1), square * v1);
+    VERIFY_IS_APPROX(w1.block(0,0,rows,1).noalias() = square * v1.block(0,0,rows,1), square * v1);
+
+    Matrix<Scalar,1,MatrixType::ColsAtCompileTime> w2(cols);
+    VERIFY_IS_APPROX(vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+
+    vc2 = square2.block(0,0,1,cols).transpose();
+    VERIFY_IS_APPROX(square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+
+    vc2 = square2.block(0,0,cols,1);
+    VERIFY_IS_APPROX(square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+  }
+
+  // inner product
+  {
+    Scalar x = square2.row(c) * square2.col(c2);
+    VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
+  }
+
+  // outer product
+  {
+    VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose());
+    VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols));
+    VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols));
+  }
+
+  // Aliasing
+  {
+    ColVectorType x(cols); x.setRandom();
+    ColVectorType z(x);
+    ColVectorType y(cols); y.setZero();
+    ColSquareMatrixType A(cols,cols); A.setRandom();
+    // CwiseBinaryOp
+    VERIFY_IS_APPROX(x = y + A*x, A*z);
+    x = z;
+    VERIFY_IS_APPROX(x = y - A*x, A*(-z));
+    x = z;
+    // CwiseUnaryOp
+    VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z);
+  }
+
+  // regression for blas_trais
+  {
+    VERIFY_IS_APPROX(square * (square*square).transpose(), square * square.transpose() * square.transpose());
+    VERIFY_IS_APPROX(square * (-(square*square)), -square * square * square);
+    VERIFY_IS_APPROX(square * (s1*(square*square)), s1 * square * square * square);
+    VERIFY_IS_APPROX(square * (square*square).conjugate(), square * square.conjugate() * square.conjugate());
+  }
+
+  // destination with a non-default inner-stride
+  // see bug 1741
+  if(!MatrixType::IsRowMajor)
+  {
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+    MatrixX buffer(2*rows,2*rows);
+    Map<RowSquareMatrixType,0,Stride<Dynamic,2> > map1(buffer.data(),rows,rows,Stride<Dynamic,2>(2*rows,2));
+    buffer.setZero();
+    VERIFY_IS_APPROX(map1 = m1 * m2.transpose(), (m1 * m2.transpose()).eval());
+    buffer.setZero();
+    VERIFY_IS_APPROX(map1.noalias() = m1 * m2.transpose(), (m1 * m2.transpose()).eval());
+    buffer.setZero();
+    VERIFY_IS_APPROX(map1.noalias() += m1 * m2.transpose(), (m1 * m2.transpose()).eval());
+  }
+
+}

diff --git a/test/product_extra.cpp b/test/product_extra.cpp
new file mode 100644
index 0000000..15c6989
--- /dev/null
+++ b/test/product_extra.cpp

@@ -0,0 +1,390 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void product_extra(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, 1, Dynamic> RowVectorType;
+  typedef Matrix<Scalar, Dynamic, 1> ColVectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic,
+                         MatrixType::Flags&RowMajorBit> OtherMajorMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             mzero = MatrixType::Zero(rows, cols),
+             identity = MatrixType::Identity(rows, rows),
+             square = MatrixType::Random(rows, rows),
+             res = MatrixType::Random(rows, rows),
+             square2 = MatrixType::Random(cols, cols),
+             res2 = MatrixType::Random(cols, cols);
+  RowVectorType v1 = RowVectorType::Random(rows), vrres(rows);
+  ColVectorType vc2 = ColVectorType::Random(cols), vcres(cols);
+  OtherMajorMatrixType tm1 = m1;
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>(),
+         s3 = internal::random<Scalar>();
+
+  VERIFY_IS_APPROX(m3.noalias() = m1 * m2.adjoint(),                 m1 * m2.adjoint().eval());
+  VERIFY_IS_APPROX(m3.noalias() = m1.adjoint() * square.adjoint(),   m1.adjoint().eval() * square.adjoint().eval());
+  VERIFY_IS_APPROX(m3.noalias() = m1.adjoint() * m2,                 m1.adjoint().eval() * m2);
+  VERIFY_IS_APPROX(m3.noalias() = (s1 * m1.adjoint()) * m2,          (s1 * m1.adjoint()).eval() * m2);
+  VERIFY_IS_APPROX(m3.noalias() = ((s1 * m1).adjoint()) * m2,        (numext::conj(s1) * m1.adjoint()).eval() * m2);
+  VERIFY_IS_APPROX(m3.noalias() = (- m1.adjoint() * s1) * (s3 * m2), (- m1.adjoint()  * s1).eval() * (s3 * m2).eval());
+  VERIFY_IS_APPROX(m3.noalias() = (s2 * m1.adjoint() * s1) * m2,     (s2 * m1.adjoint()  * s1).eval() * m2);
+  VERIFY_IS_APPROX(m3.noalias() = (-m1*s2) * s1*m2.adjoint(),        (-m1*s2).eval() * (s1*m2.adjoint()).eval());
+
+  // a very tricky case where a scale factor has to be automatically conjugated:
+  VERIFY_IS_APPROX( m1.adjoint() * (s1*m2).conjugate(), (m1.adjoint()).eval() * ((s1*m2).conjugate()).eval());
+
+
+  // test all possible conjugate combinations for the four matrix-vector product cases:
+
+  VERIFY_IS_APPROX((-m1.conjugate() * s2) * (s1 * vc2),
+                   (-m1.conjugate()*s2).eval() * (s1 * vc2).eval());
+  VERIFY_IS_APPROX((-m1 * s2) * (s1 * vc2.conjugate()),
+                   (-m1*s2).eval() * (s1 * vc2.conjugate()).eval());
+  VERIFY_IS_APPROX((-m1.conjugate() * s2) * (s1 * vc2.conjugate()),
+                   (-m1.conjugate()*s2).eval() * (s1 * vc2.conjugate()).eval());
+
+  VERIFY_IS_APPROX((s1 * vc2.transpose()) * (-m1.adjoint() * s2),
+                   (s1 * vc2.transpose()).eval() * (-m1.adjoint()*s2).eval());
+  VERIFY_IS_APPROX((s1 * vc2.adjoint()) * (-m1.transpose() * s2),
+                   (s1 * vc2.adjoint()).eval() * (-m1.transpose()*s2).eval());
+  VERIFY_IS_APPROX((s1 * vc2.adjoint()) * (-m1.adjoint() * s2),
+                   (s1 * vc2.adjoint()).eval() * (-m1.adjoint()*s2).eval());
+
+  VERIFY_IS_APPROX((-m1.adjoint() * s2) * (s1 * v1.transpose()),
+                   (-m1.adjoint()*s2).eval() * (s1 * v1.transpose()).eval());
+  VERIFY_IS_APPROX((-m1.transpose() * s2) * (s1 * v1.adjoint()),
+                   (-m1.transpose()*s2).eval() * (s1 * v1.adjoint()).eval());
+  VERIFY_IS_APPROX((-m1.adjoint() * s2) * (s1 * v1.adjoint()),
+                   (-m1.adjoint()*s2).eval() * (s1 * v1.adjoint()).eval());
+
+  VERIFY_IS_APPROX((s1 * v1) * (-m1.conjugate() * s2),
+                   (s1 * v1).eval() * (-m1.conjugate()*s2).eval());
+  VERIFY_IS_APPROX((s1 * v1.conjugate()) * (-m1 * s2),
+                   (s1 * v1.conjugate()).eval() * (-m1*s2).eval());
+  VERIFY_IS_APPROX((s1 * v1.conjugate()) * (-m1.conjugate() * s2),
+                   (s1 * v1.conjugate()).eval() * (-m1.conjugate()*s2).eval());
+
+  VERIFY_IS_APPROX((-m1.adjoint() * s2) * (s1 * v1.adjoint()),
+                   (-m1.adjoint()*s2).eval() * (s1 * v1.adjoint()).eval());
+
+  // test the vector-matrix product with non aligned starts
+  Index i = internal::random<Index>(0,m1.rows()-2);
+  Index j = internal::random<Index>(0,m1.cols()-2);
+  Index r = internal::random<Index>(1,m1.rows()-i);
+  Index c = internal::random<Index>(1,m1.cols()-j);
+  Index i2 = internal::random<Index>(0,m1.rows()-1);
+  Index j2 = internal::random<Index>(0,m1.cols()-1);
+
+  VERIFY_IS_APPROX(m1.col(j2).adjoint() * m1.block(0,j,m1.rows(),c), m1.col(j2).adjoint().eval() * m1.block(0,j,m1.rows(),c).eval());
+  VERIFY_IS_APPROX(m1.block(i,0,r,m1.cols()) * m1.row(i2).adjoint(), m1.block(i,0,r,m1.cols()).eval() * m1.row(i2).adjoint().eval());
+
+  // test negative strides
+  {
+    Map<MatrixType,Unaligned,Stride<Dynamic,Dynamic> > map1(&m1(rows-1,cols-1), rows, cols, Stride<Dynamic,Dynamic>(-m1.outerStride(),-1));
+    Map<MatrixType,Unaligned,Stride<Dynamic,Dynamic> > map2(&m2(rows-1,cols-1), rows, cols, Stride<Dynamic,Dynamic>(-m2.outerStride(),-1));
+    Map<RowVectorType,Unaligned,InnerStride<-1> > mapv1(&v1(v1.size()-1), v1.size(), InnerStride<-1>(-1));
+    Map<ColVectorType,Unaligned,InnerStride<-1> > mapvc2(&vc2(vc2.size()-1), vc2.size(), InnerStride<-1>(-1));
+    VERIFY_IS_APPROX(MatrixType(map1), m1.reverse());
+    VERIFY_IS_APPROX(MatrixType(map2), m2.reverse());
+    VERIFY_IS_APPROX(m3.noalias() = MatrixType(map1) * MatrixType(map2).adjoint(), m1.reverse() * m2.reverse().adjoint());
+    VERIFY_IS_APPROX(m3.noalias() = map1 * map2.adjoint(), m1.reverse() * m2.reverse().adjoint());
+    VERIFY_IS_APPROX(map1 * vc2, m1.reverse() * vc2);
+    VERIFY_IS_APPROX(m1 * mapvc2, m1 * mapvc2);
+    VERIFY_IS_APPROX(map1.adjoint() * v1.transpose(), m1.adjoint().reverse() * v1.transpose());
+    VERIFY_IS_APPROX(m1.adjoint() * mapv1.transpose(), m1.adjoint() * v1.reverse().transpose());
+  }
+  
+  // regression test
+  MatrixType tmp = m1 * m1.adjoint() * s1;
+  VERIFY_IS_APPROX(tmp, m1 * m1.adjoint() * s1);
+
+  // regression test for bug 1343, assignment to arrays
+  Array<Scalar,Dynamic,1> a1 = m1 * vc2;
+  VERIFY_IS_APPROX(a1.matrix(),m1*vc2);
+  Array<Scalar,Dynamic,1> a2 = s1 * (m1 * vc2);
+  VERIFY_IS_APPROX(a2.matrix(),s1*m1*vc2);
+  Array<Scalar,1,Dynamic> a3 = v1 * m1;
+  VERIFY_IS_APPROX(a3.matrix(),v1*m1);
+  Array<Scalar,Dynamic,Dynamic> a4 = m1 * m2.adjoint();
+  VERIFY_IS_APPROX(a4.matrix(),m1*m2.adjoint());
+}
+
+// Regression test for bug reported at http://forum.kde.org/viewtopic.php?f=74&t=96947
+void mat_mat_scalar_scalar_product()
+{
+  Eigen::Matrix2Xd dNdxy(2, 3);
+  dNdxy << -0.5, 0.5, 0,
+           -0.3, 0, 0.3;
+  double det = 6.0, wt = 0.5;
+  VERIFY_IS_APPROX(dNdxy.transpose()*dNdxy*det*wt, det*wt*dNdxy.transpose()*dNdxy);
+}
+
+template <typename MatrixType> 
+void zero_sized_objects(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  const int PacketSize  = internal::packet_traits<Scalar>::size;
+  const int PacketSize1 = PacketSize>1 ?  PacketSize-1 : 1;
+  Index rows = m.rows();
+  Index cols = m.cols();
+  
+  {
+    MatrixType res, a(rows,0), b(0,cols);
+    VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(rows,cols) );
+    VERIFY_IS_APPROX( (res=a*a.transpose()), MatrixType::Zero(rows,rows) );
+    VERIFY_IS_APPROX( (res=b.transpose()*b), MatrixType::Zero(cols,cols) );
+    VERIFY_IS_APPROX( (res=b.transpose()*a.transpose()), MatrixType::Zero(cols,rows) );
+  }
+  
+  {
+    MatrixType res, a(rows,cols), b(cols,0);
+    res = a*b;
+    VERIFY(res.rows()==rows && res.cols()==0);
+    b.resize(0,rows);
+    res = b*a;
+    VERIFY(res.rows()==0 && res.cols()==cols);
+  }
+  
+  {
+    Matrix<Scalar,PacketSize,0> a;
+    Matrix<Scalar,0,1> b;
+    Matrix<Scalar,PacketSize,1> res;
+    VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) );
+    VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) );
+  }
+  
+  {
+    Matrix<Scalar,PacketSize1,0> a;
+    Matrix<Scalar,0,1> b;
+    Matrix<Scalar,PacketSize1,1> res;
+    VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) );
+    VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) );
+  }
+  
+  {
+    Matrix<Scalar,PacketSize,Dynamic> a(PacketSize,0);
+    Matrix<Scalar,Dynamic,1> b(0,1);
+    Matrix<Scalar,PacketSize,1> res;
+    VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) );
+    VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) );
+  }
+  
+  {
+    Matrix<Scalar,PacketSize1,Dynamic> a(PacketSize1,0);
+    Matrix<Scalar,Dynamic,1> b(0,1);
+    Matrix<Scalar,PacketSize1,1> res;
+    VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) );
+    VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) );
+  }
+}
+
+template<int>
+void bug_127()
+{
+  // Bug 127
+  //
+  // a product of the form lhs*rhs with
+  //
+  // lhs:
+  // rows = 1, cols = 4
+  // RowsAtCompileTime = 1, ColsAtCompileTime = -1
+  // MaxRowsAtCompileTime = 1, MaxColsAtCompileTime = 5
+  //
+  // rhs:
+  // rows = 4, cols = 0
+  // RowsAtCompileTime = -1, ColsAtCompileTime = -1
+  // MaxRowsAtCompileTime = 5, MaxColsAtCompileTime = 1
+  //
+  // was failing on a runtime assertion, because it had been mis-compiled as a dot product because Product.h was using the
+  // max-sizes to detect size 1 indicating vectors, and that didn't account for 0-sized object with max-size 1.
+
+  Matrix<float,1,Dynamic,RowMajor,1,5> a(1,4);
+  Matrix<float,Dynamic,Dynamic,ColMajor,5,1> b(4,0);
+  a*b;
+}
+
+template<int> void bug_817()
+{
+  ArrayXXf B = ArrayXXf::Random(10,10), C;
+  VectorXf x = VectorXf::Random(10);
+  C = (x.transpose()*B.matrix());
+  B = (x.transpose()*B.matrix());
+  VERIFY_IS_APPROX(B,C);
+}
+
+template<int>
+void unaligned_objects()
+{
+  // Regression test for the bug reported here:
+  // http://forum.kde.org/viewtopic.php?f=74&t=107541
+  // Recall the matrix*vector kernel avoid unaligned loads by loading two packets and then reassemble then.
+  // There was a mistake in the computation of the valid range for fully unaligned objects: in some rare cases,
+  // memory was read outside the allocated matrix memory. Though the values were not used, this might raise segfault.
+  for(int m=450;m<460;++m)
+  {
+    for(int n=8;n<12;++n)
+    {
+      MatrixXf M(m, n);
+      VectorXf v1(n), r1(500);
+      RowVectorXf v2(m), r2(16);
+
+      M.setRandom();
+      v1.setRandom();
+      v2.setRandom();
+      for(int o=0; o<4; ++o)
+      {
+        r1.segment(o,m).noalias() = M * v1;
+        VERIFY_IS_APPROX(r1.segment(o,m), M * MatrixXf(v1));
+        r2.segment(o,n).noalias() = v2 * M;
+        VERIFY_IS_APPROX(r2.segment(o,n), MatrixXf(v2) * M);
+      }
+    }
+  }
+}
+
+template<typename T>
+EIGEN_DONT_INLINE
+Index test_compute_block_size(Index m, Index n, Index k)
+{
+  Index mc(m), nc(n), kc(k);
+  internal::computeProductBlockingSizes<T,T>(kc, mc, nc);
+  return kc+mc+nc;
+}
+
+template<typename T>
+Index compute_block_size()
+{
+  Index ret = 0;
+  ret += test_compute_block_size<T>(0,1,1);
+  ret += test_compute_block_size<T>(1,0,1);
+  ret += test_compute_block_size<T>(1,1,0);
+  ret += test_compute_block_size<T>(0,0,1);
+  ret += test_compute_block_size<T>(0,1,0);
+  ret += test_compute_block_size<T>(1,0,0);
+  ret += test_compute_block_size<T>(0,0,0);
+  return ret;
+}
+
+template<typename>
+void aliasing_with_resize()
+{
+  Index m = internal::random<Index>(10,50);
+  Index n = internal::random<Index>(10,50);
+  MatrixXd A, B, C(m,n), D(m,m);
+  VectorXd a, b, c(n);
+  C.setRandom();
+  D.setRandom();
+  c.setRandom();
+  double s = internal::random<double>(1,10);
+
+  A = C;
+  B = A * A.transpose();
+  A = A * A.transpose();
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = (A * A.transpose())/s;
+  A = (A * A.transpose())/s;
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = (A * A.transpose()) + D;
+  A = (A * A.transpose()) + D;
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = D + (A * A.transpose());
+  A = D + (A * A.transpose());
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  B = s * (A * A.transpose());
+  A = s * (A * A.transpose());
+  VERIFY_IS_APPROX(A,B);
+
+  A = C;
+  a = c;
+  b = (A * a)/s;
+  a = (A * a)/s;
+  VERIFY_IS_APPROX(a,b);
+}
+
+template<int>
+void bug_1308()
+{
+  int n = 10;
+  MatrixXd r(n,n);
+  VectorXd v = VectorXd::Random(n);
+  r = v * RowVectorXd::Ones(n);
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
+  r = VectorXd::Ones(n) * v.transpose();
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+
+  Matrix4d ones44 = Matrix4d::Ones();
+  Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+  typedef Matrix<double,4,4,RowMajor> RMatrix4d;
+  RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+//   RowVector4d r4;
+  m44.setOnes();
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
+}
+
+EIGEN_DECLARE_TEST(product_extra)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( product_extra(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product_extra(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( mat_mat_scalar_scalar_product() );
+    CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+  CALL_SUBTEST_5( bug_127<0>() );
+  CALL_SUBTEST_5( bug_817<0>() );
+  CALL_SUBTEST_5( bug_1308<0>() );
+  CALL_SUBTEST_6( unaligned_objects<0>() );
+  CALL_SUBTEST_7( compute_block_size<float>() );
+  CALL_SUBTEST_7( compute_block_size<double>() );
+  CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
+  CALL_SUBTEST_8( aliasing_with_resize<void>() );
+
+}

diff --git a/test/product_large.cpp b/test/product_large.cpp
new file mode 100644
index 0000000..3d0204b
--- /dev/null
+++ b/test/product_large.cpp

@@ -0,0 +1,131 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "product.h"
+#include <Eigen/LU>
+
+template<typename T>
+void test_aliasing()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  typedef Matrix<T,Dynamic,Dynamic> MatrixType;
+  typedef Matrix<T,Dynamic,1> VectorType;
+  VectorType x(cols); x.setRandom();
+  VectorType z(x);
+  VectorType y(rows); y.setZero();
+  MatrixType A(rows,cols); A.setRandom();
+  // CwiseBinaryOp
+  VERIFY_IS_APPROX(x = y + A*x, A*z);     // OK because "y + A*x" is marked as "assume-aliasing"
+  x = z;
+  // CwiseUnaryOp
+  VERIFY_IS_APPROX(x = T(1.)*(A*x), A*z); // OK because 1*(A*x) is replaced by (1*A*x) which is a Product<> expression
+  x = z;
+  // VERIFY_IS_APPROX(x = y-A*x, -A*z);   // Not OK in 3.3 because x is resized before A*x gets evaluated
+  x = z;
+}
+
+template<int>
+void product_large_regressions()
+{
+  {
+    // test a specific issue in DiagonalProduct
+    int N = 1000000;
+    VectorXf v = VectorXf::Ones(N);
+    MatrixXf m = MatrixXf::Ones(N,3);
+    m = (v+v).asDiagonal() * m;
+    VERIFY_IS_APPROX(m, MatrixXf::Constant(N,3,2));
+  }
+
+  {
+    // test deferred resizing in Matrix::operator=
+    MatrixXf a = MatrixXf::Random(10,4), b = MatrixXf::Random(4,10), c = a;
+    VERIFY_IS_APPROX((a = a * b), (c * b).eval());
+  }
+
+  {
+    // check the functions to setup blocking sizes compile and do not segfault
+    // FIXME check they do what they are supposed to do !!
+    std::ptrdiff_t l1 = internal::random<int>(10000,20000);
+    std::ptrdiff_t l2 = internal::random<int>(100000,200000);
+    std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
+    setCpuCacheSizes(l1,l2,l3);
+    VERIFY(l1==l1CacheSize());
+    VERIFY(l2==l2CacheSize());
+    std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
+    std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
+    std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
+    // only makes sure it compiles fine
+    internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1);
+  }
+
+  {
+    // test regression in row-vector by matrix (bad Map type)
+    MatrixXf mat1(10,32); mat1.setRandom();
+    MatrixXf mat2(32,32); mat2.setRandom();
+    MatrixXf r1 = mat1.row(2)*mat2.transpose();
+    VERIFY_IS_APPROX(r1, (mat1.row(2)*mat2.transpose()).eval());
+
+    MatrixXf r2 = mat1.row(2)*mat2;
+    VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval());
+  }
+
+  {
+    Eigen::MatrixXd A(10,10), B, C;
+    A.setRandom();
+    C = A;
+    for(int k=0; k<79; ++k)
+      C = C * A;
+    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
+                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
+    VERIFY_IS_APPROX(B,C);
+  }
+}
+
+template<int>
+void bug_1622() {
+  typedef Matrix<double, 2, -1, 0, 2, -1> Mat2X;
+  Mat2X x(2,2); x.setRandom();
+  MatrixXd y(2,2); y.setRandom();
+  const Mat2X K1 = x * y.inverse();
+  const Matrix2d K2 = x * y.inverse();
+  VERIFY_IS_APPROX(K1,K2);
+}
+
+EIGEN_DECLARE_TEST(product_large)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );
+
+    CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+
+    CALL_SUBTEST_1( test_aliasing<float>() );
+
+    CALL_SUBTEST_6( bug_1622<1>() );
+
+    CALL_SUBTEST_7( product(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_8( product(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( product(Matrix<std::complex<float>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_10( product(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+
+  CALL_SUBTEST_6( product_large_regressions<0>() );
+
+  // Regression test for bug 714:
+#if defined EIGEN_HAS_OPENMP
+  omp_set_dynamic(1);
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+#endif
+}

diff --git a/test/product_mmtr.cpp b/test/product_mmtr.cpp
new file mode 100644
index 0000000..8f8c5fe
--- /dev/null
+++ b/test/product_mmtr.cpp

@@ -0,0 +1,106 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#define CHECK_MMTR(DEST, TRI, OP) {                   \
+    ref3 = DEST;                                      \
+    ref2 = ref1 = DEST;                               \
+    DEST.template triangularView<TRI>() OP;           \
+    ref1 OP;                                          \
+    ref2.template triangularView<TRI>()               \
+      = ref1.template triangularView<TRI>();          \
+    VERIFY_IS_APPROX(DEST,ref2);                      \
+    \
+    DEST = ref3;                                      \
+    ref3 = ref2;                                      \
+    ref3.diagonal() = DEST.diagonal();                \
+    DEST.template triangularView<TRI|ZeroDiag>() OP;  \
+    VERIFY_IS_APPROX(DEST,ref3);                      \
+  }
+
+template<typename Scalar> void mmtr(int size)
+{
+  typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> MatrixColMaj;
+  typedef Matrix<Scalar,Dynamic,Dynamic,RowMajor> MatrixRowMaj;
+
+  DenseIndex othersize = internal::random<DenseIndex>(1,200);
+  
+  MatrixColMaj matc = MatrixColMaj::Zero(size, size);
+  MatrixRowMaj matr = MatrixRowMaj::Zero(size, size);
+  MatrixColMaj ref1(size, size), ref2(size, size), ref3(size,size);
+  
+  MatrixColMaj soc(size,othersize); soc.setRandom();
+  MatrixColMaj osc(othersize,size); osc.setRandom();
+  MatrixRowMaj sor(size,othersize); sor.setRandom();
+  MatrixRowMaj osr(othersize,size); osr.setRandom();
+  MatrixColMaj sqc(size,size); sqc.setRandom();
+  MatrixRowMaj sqr(size,size); sqr.setRandom();
+  
+  Scalar s = internal::random<Scalar>();
+  
+  CHECK_MMTR(matc, Lower, = s*soc*sor.adjoint());
+  CHECK_MMTR(matc, Upper, = s*(soc*soc.adjoint()));
+  CHECK_MMTR(matr, Lower, = s*soc*soc.adjoint());
+  CHECK_MMTR(matr, Upper, = soc*(s*sor.adjoint()));
+  
+  CHECK_MMTR(matc, Lower, += s*soc*soc.adjoint());
+  CHECK_MMTR(matc, Upper, += s*(soc*sor.transpose()));
+  CHECK_MMTR(matr, Lower, += s*sor*soc.adjoint());
+  CHECK_MMTR(matr, Upper, += soc*(s*soc.adjoint()));
+  
+  CHECK_MMTR(matc, Lower, -= s*soc*soc.adjoint());
+  CHECK_MMTR(matc, Upper, -= s*(osc.transpose()*osc.conjugate()));
+  CHECK_MMTR(matr, Lower, -= s*soc*soc.adjoint());
+  CHECK_MMTR(matr, Upper, -= soc*(s*soc.adjoint()));
+  
+  CHECK_MMTR(matc, Lower, -= s*sqr*sqc.template triangularView<Upper>());
+  CHECK_MMTR(matc, Upper, = s*sqc*sqr.template triangularView<Upper>());
+  CHECK_MMTR(matc, Lower, += s*sqr*sqc.template triangularView<Lower>());
+  CHECK_MMTR(matc, Upper, = s*sqc*sqc.template triangularView<Lower>());
+  
+  CHECK_MMTR(matc, Lower, = (s*sqr).template triangularView<Upper>()*sqc);
+  CHECK_MMTR(matc, Upper, -= (s*sqc).template triangularView<Upper>()*sqc);
+  CHECK_MMTR(matc, Lower, = (s*sqr).template triangularView<Lower>()*sqc);
+  CHECK_MMTR(matc, Upper, += (s*sqc).template triangularView<Lower>()*sqc);
+
+  // check aliasing
+  ref2 = ref1 = matc;
+  ref1 = sqc.adjoint() * matc * sqc;
+  ref2.template triangularView<Upper>() = ref1.template triangularView<Upper>();
+  matc.template triangularView<Upper>() = sqc.adjoint() * matc * sqc;
+  VERIFY_IS_APPROX(matc, ref2);
+
+  ref2 = ref1 = matc;
+  ref1 = sqc * matc * sqc.adjoint();
+  ref2.template triangularView<Lower>() = ref1.template triangularView<Lower>();
+  matc.template triangularView<Lower>() = sqc * matc * sqc.adjoint();
+  VERIFY_IS_APPROX(matc, ref2);
+
+  // destination with a non-default inner-stride
+  // see bug 1741
+  {
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+    MatrixX buffer(2*size,2*size);
+    Map<MatrixColMaj,0,Stride<Dynamic,Dynamic> > map1(buffer.data(),size,size,Stride<Dynamic,Dynamic>(2*size,2));
+    buffer.setZero();
+    CHECK_MMTR(map1, Lower, = s*soc*sor.adjoint());
+  }
+}
+
+EIGEN_DECLARE_TEST(product_mmtr)
+{
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    CALL_SUBTEST_1((mmtr<float>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_2((mmtr<double>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_3((mmtr<std::complex<float> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))));
+    CALL_SUBTEST_4((mmtr<std::complex<double> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))));
+  }
+}

diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp
new file mode 100644
index 0000000..20cb7c0
--- /dev/null
+++ b/test/product_notemporary.cpp

@@ -0,0 +1,209 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+
+template<typename Dst, typename Lhs, typename Rhs>
+void check_scalar_multiple3(Dst &dst, const Lhs& A, const Rhs& B)
+{
+  VERIFY_EVALUATION_COUNT( (dst.noalias()  = A * B), 0);
+  VERIFY_IS_APPROX( dst, (A.eval() * B.eval()).eval() );
+  VERIFY_EVALUATION_COUNT( (dst.noalias() += A * B), 0);
+  VERIFY_IS_APPROX( dst, 2*(A.eval() * B.eval()).eval() );
+  VERIFY_EVALUATION_COUNT( (dst.noalias() -= A * B), 0);
+  VERIFY_IS_APPROX( dst, (A.eval() * B.eval()).eval() );
+}
+
+template<typename Dst, typename Lhs, typename Rhs, typename S2>
+void check_scalar_multiple2(Dst &dst, const Lhs& A, const Rhs& B, S2 s2)
+{
+  CALL_SUBTEST( check_scalar_multiple3(dst, A,    B) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A,   -B) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A, s2*B) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A, B*s2) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A, (B*s2).conjugate()) );
+}
+
+template<typename Dst, typename Lhs, typename Rhs, typename S1, typename S2>
+void check_scalar_multiple1(Dst &dst, const Lhs& A, const Rhs& B, S1 s1, S2 s2)
+{
+  CALL_SUBTEST( check_scalar_multiple2(dst,    A, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst,   -A, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst, s1*A, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst, A*s1, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst, (A*s1).conjugate(), B, s2) );
+}
+
+template<typename MatrixType> void product_notemporary(const MatrixType& m)
+{
+  /* This test checks the number of temporaries created
+   * during the evaluation of a complex expression */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, 1, Dynamic> RowVectorType;
+  typedef Matrix<Scalar, Dynamic, 1> ColVectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> ColMajorMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, RowMajor> RowMajorMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ColMajorMatrixType m1 = MatrixType::Random(rows, cols),
+                     m2 = MatrixType::Random(rows, cols),
+                     m3(rows, cols);
+  RowVectorType rv1 = RowVectorType::Random(rows), rvres(rows);
+  ColVectorType cv1 = ColVectorType::Random(cols), cvres(cols);
+  RowMajorMatrixType rm3(rows, cols);
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>(),
+         s3 = internal::random<Scalar>();
+
+  Index c0 = internal::random<Index>(4,cols-8),
+        c1 = internal::random<Index>(8,cols-c0),
+        r0 = internal::random<Index>(4,cols-8),
+        r1 = internal::random<Index>(8,rows-r0);
+
+  VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()), 1);
+  VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()).transpose(), 1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m1 * m2.adjoint(), 0);
+
+  VERIFY_EVALUATION_COUNT( m3 = s1 * (m1 * m2.transpose()), 1);
+//   VERIFY_EVALUATION_COUNT( m3 = m3 + s1 * (m1 * m2.transpose()), 1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0);
+
+  VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1);
+  VERIFY_EVALUATION_COUNT( m3 = m3 - (m1 * m2.adjoint()), 1);
+
+  VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() =  m3 - m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() += m3 - m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 - m1 * m2.transpose(), 0);
+
+  VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * m2.adjoint(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * (m1*s3+m2*s2).adjoint(), 1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (s1 * m1).adjoint() * s2 * m2, 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() += s1 * (-m1*s3).adjoint() * (s2 * m2 * s3), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= s1 * (m1.transpose() * m2), 0);
+
+  VERIFY_EVALUATION_COUNT(( m3.block(r0,r0,r1,r1).noalias() += -m1.block(r0,c0,r1,c1) * (s2*m2.block(r0,c0,r1,c1)).adjoint() ), 0);
+  VERIFY_EVALUATION_COUNT(( m3.block(r0,r0,r1,r1).noalias() -= s1 * m1.block(r0,c0,r1,c1) * m2.block(c0,r0,c1,r1) ), 0);
+
+  // NOTE this is because the Block expression is not handled yet by our expression analyser
+  VERIFY_EVALUATION_COUNT(( m3.block(r0,r0,r1,r1).noalias() = s1 * m1.block(r0,c0,r1,c1) * (s1*m2).block(c0,r0,c1,r1) ), 1);
+
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= (s1 * m1).template triangularView<Lower>() * m2, 0);
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template triangularView<Upper>() * (m2+m2), 1);
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template triangularView<UnitUpper>() * m2.adjoint(), 0);
+
+  VERIFY_EVALUATION_COUNT( m3.template triangularView<Upper>() = (m1 * m2.adjoint()), 0);
+  VERIFY_EVALUATION_COUNT( m3.template triangularView<Upper>() -= (m1 * m2.adjoint()), 0);
+
+  // NOTE this is because the blas_traits require innerstride==1 to avoid a temporary, but that doesn't seem to be actually needed for the triangular products
+  VERIFY_EVALUATION_COUNT( rm3.col(c0).noalias() = (s1 * m1.adjoint()).template triangularView<UnitUpper>() * (s2*m2.row(c0)).adjoint(), 1);
+
+  VERIFY_EVALUATION_COUNT( m1.template triangularView<Lower>().solveInPlace(m3), 0);
+  VERIFY_EVALUATION_COUNT( m1.adjoint().template triangularView<Lower>().solveInPlace(m3.transpose()), 0);
+
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= (s1 * m1).adjoint().template selfadjointView<Lower>() * (-m2*s3).adjoint(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = s2 * m2.adjoint() * (s1 * m1.adjoint()).template selfadjointView<Upper>(), 0);
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template selfadjointView<Lower>() * m2.adjoint(), 0);
+
+  // NOTE this is because the blas_traits require innerstride==1 to avoid a temporary, but that doesn't seem to be actually needed for the triangular products
+  VERIFY_EVALUATION_COUNT( m3.col(c0).noalias() = (s1 * m1).adjoint().template selfadjointView<Lower>() * (-m2.row(c0)*s3).adjoint(), 1);
+  VERIFY_EVALUATION_COUNT( m3.col(c0).noalias() -= (s1 * m1).adjoint().template selfadjointView<Upper>() * (-m2.row(c0)*s3).adjoint(), 1);
+
+  VERIFY_EVALUATION_COUNT( m3.block(r0,c0,r1,c1).noalias() += m1.block(r0,r0,r1,r1).template selfadjointView<Upper>() * (s1*m2.block(r0,c0,r1,c1)), 0);
+  VERIFY_EVALUATION_COUNT( m3.block(r0,c0,r1,c1).noalias() = m1.block(r0,r0,r1,r1).template selfadjointView<Upper>() * m2.block(r0,c0,r1,c1), 0);
+
+  VERIFY_EVALUATION_COUNT( m3.template selfadjointView<Lower>().rankUpdate(m2.adjoint()), 0);
+
+  // Here we will get 1 temporary for each resize operation of the lhs operator; resize(r1,c1) would lead to zero temporaries
+  m3.resize(1,1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m1.block(r0,r0,r1,r1).template selfadjointView<Lower>() * m2.block(r0,c0,r1,c1), 1);
+  m3.resize(1,1);
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m1.block(r0,r0,r1,r1).template triangularView<UnitUpper>()  * m2.block(r0,c0,r1,c1), 1);
+
+  // Zero temporaries for lazy products ...
+  m3.setRandom(rows,cols);
+  VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) /  (m3.transpose().lazyProduct(m3)).diagonal().sum(), 0 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m1.conjugate().lazyProduct(m2.conjugate()), 0);
+
+  // ... and even no temporary for even deeply (>=2) nested products
+  VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) /  (m3.transpose() * m3).diagonal().sum(), 0 );
+  VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) /  (m3.transpose() * m3).diagonal().array().abs().sum(), 0 );
+
+  // Zero temporaries for ... CoeffBasedProductMode
+  VERIFY_EVALUATION_COUNT( m3.col(0).template head<5>() * m3.col(0).transpose() + m3.col(0).template head<5>() * m3.col(0).transpose(), 0 );
+
+  // Check matrix * vectors
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = m1 * cv1, 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * cv1, 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * m2.col(0), 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * rv1.adjoint(), 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * m2.row(0).transpose(), 0 );
+
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (m1+m1) * cv1, 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * cv1, 0 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (m1+m1) * (m1*cv1), 1 );
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );
+
+  // Check outer products
+  #ifdef EIGEN_ALLOCA
+  bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
+  #else
+  bool temp_via_alloca = false;
+  #endif
+  m3 = cv1 * rv1;
+  VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
+  rm3 = cv1 * rv1;
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() += (m1*cv1) * (rv1 * m1), 2 );
+
+  // Check nested products
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = m1.adjoint() * m1 * cv1, 1 );
+  VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 );
+
+  // exhaustively check all scalar multiple combinations:
+  {
+    // Generic path:
+    check_scalar_multiple1(m3, m1, m2, s1, s2);
+    // Force fall back to coeff-based:
+    typename ColMajorMatrixType::BlockXpr m3_blck = m3.block(r0,r0,1,1);
+    check_scalar_multiple1(m3_blck, m1.block(r0,c0,1,1), m2.block(c0,r0,1,1), s1, s2);
+  }
+}
+
+EIGEN_DECLARE_TEST(product_notemporary)
+{
+  int s;
+  for(int i = 0; i < g_repeat; i++) {
+    s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE);
+    CALL_SUBTEST_1( product_notemporary(MatrixXf(s, s)) );
+    CALL_SUBTEST_2( product_notemporary(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE/2);
+    CALL_SUBTEST_3( product_notemporary(MatrixXcf(s,s)) );
+    CALL_SUBTEST_4( product_notemporary(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+}

diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp
new file mode 100644
index 0000000..bdccd04
--- /dev/null
+++ b/test/product_selfadjoint.cpp

@@ -0,0 +1,86 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, 1, MatrixType::RowsAtCompileTime> RowVectorType;
+
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, Dynamic, RowMajor> RhsMatrixType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3;
+  VectorType v1 = VectorType::Random(rows),
+             v2 = VectorType::Random(rows),
+             v3(rows);
+  RowVectorType r1 = RowVectorType::Random(rows),
+                r2 = RowVectorType::Random(rows);
+  RhsMatrixType m4 = RhsMatrixType::Random(rows,10);
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>(),
+         s3 = internal::random<Scalar>();
+
+  m1 = (m1.adjoint() + m1).eval();
+
+  // rank2 update
+  m2 = m1.template triangularView<Lower>();
+  m2.template selfadjointView<Lower>().rankUpdate(v1,v2);
+  VERIFY_IS_APPROX(m2, (m1 + v1 * v2.adjoint()+ v2 * v1.adjoint()).template triangularView<Lower>().toDenseMatrix());
+
+  m2 = m1.template triangularView<Upper>();
+  m2.template selfadjointView<Upper>().rankUpdate(-v1,s2*v2,s3);
+  VERIFY_IS_APPROX(m2, (m1 + (s3*(-v1)*(s2*v2).adjoint()+numext::conj(s3)*(s2*v2)*(-v1).adjoint())).template triangularView<Upper>().toDenseMatrix());
+
+  m2 = m1.template triangularView<Upper>();
+  m2.template selfadjointView<Upper>().rankUpdate(-s2*r1.adjoint(),r2.adjoint()*s3,s1);
+  VERIFY_IS_APPROX(m2, (m1 + s1*(-s2*r1.adjoint())*(r2.adjoint()*s3).adjoint() + numext::conj(s1)*(r2.adjoint()*s3) * (-s2*r1.adjoint()).adjoint()).template triangularView<Upper>().toDenseMatrix());
+
+  if (rows>1)
+  {
+    m2 = m1.template triangularView<Lower>();
+    m2.block(1,1,rows-1,cols-1).template selfadjointView<Lower>().rankUpdate(v1.tail(rows-1),v2.head(cols-1));
+    m3 = m1;
+    m3.block(1,1,rows-1,cols-1) += v1.tail(rows-1) * v2.head(cols-1).adjoint()+ v2.head(cols-1) * v1.tail(rows-1).adjoint();
+    VERIFY_IS_APPROX(m2, m3.template triangularView<Lower>().toDenseMatrix());
+  }
+}
+
+EIGEN_DECLARE_TEST(product_selfadjoint)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat ; i++) {
+    CALL_SUBTEST_1( product_selfadjoint(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( product_selfadjoint(Matrix<float, 2, 2>()) );
+    CALL_SUBTEST_3( product_selfadjoint(Matrix3d()) );
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
+    CALL_SUBTEST_4( product_selfadjoint(MatrixXcf(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
+    CALL_SUBTEST_5( product_selfadjoint(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+    CALL_SUBTEST_6( product_selfadjoint(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+    CALL_SUBTEST_7( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+}

diff --git a/test/product_small.cpp b/test/product_small.cpp
new file mode 100644
index 0000000..1d6df6e
--- /dev/null
+++ b/test/product_small.cpp

@@ -0,0 +1,323 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+#include "product.h"
+#include <Eigen/LU>
+
+// regression test for bug 447
+template<int>
+void product1x1()
+{
+  Matrix<float,1,3> matAstatic;
+  Matrix<float,3,1> matBstatic;
+  matAstatic.setRandom();
+  matBstatic.setRandom();
+  VERIFY_IS_APPROX( (matAstatic * matBstatic).coeff(0,0), 
+                    matAstatic.cwiseProduct(matBstatic.transpose()).sum() );
+
+  MatrixXf matAdynamic(1,3);
+  MatrixXf matBdynamic(3,1);
+  matAdynamic.setRandom();
+  matBdynamic.setRandom();
+  VERIFY_IS_APPROX( (matAdynamic * matBdynamic).coeff(0,0), 
+                    matAdynamic.cwiseProduct(matBdynamic.transpose()).sum() );
+}
+
+template<typename TC, typename TA, typename TB>
+const TC& ref_prod(TC &C, const TA &A, const TB &B)
+{
+  for(Index i=0;i<C.rows();++i)
+    for(Index j=0;j<C.cols();++j)
+      for(Index k=0;k<A.cols();++k)
+        C.coeffRef(i,j) += A.coeff(i,k) * B.coeff(k,j);
+  return C;
+}
+
+template<typename T, int Rows, int Cols, int Depth, int OC, int OA, int OB>
+typename internal::enable_if<! ( (Rows ==1&&Depth!=1&&OA==ColMajor)
+                              || (Depth==1&&Rows !=1&&OA==RowMajor)
+                              || (Cols ==1&&Depth!=1&&OB==RowMajor)
+                              || (Depth==1&&Cols !=1&&OB==ColMajor)
+                              || (Rows ==1&&Cols !=1&&OC==ColMajor)
+                              || (Cols ==1&&Rows !=1&&OC==RowMajor)),void>::type
+test_lazy_single(int rows, int cols, int depth)
+{
+  Matrix<T,Rows,Depth,OA> A(rows,depth); A.setRandom();
+  Matrix<T,Depth,Cols,OB> B(depth,cols); B.setRandom();
+  Matrix<T,Rows,Cols,OC>  C(rows,cols);  C.setRandom();
+  Matrix<T,Rows,Cols,OC>  D(C);
+  VERIFY_IS_APPROX(C+=A.lazyProduct(B), ref_prod(D,A,B));
+}
+
+void test_dynamic_bool()
+{
+  int rows = internal::random<int>(1,64);
+  int cols = internal::random<int>(1,64);
+  int depth = internal::random<int>(1,65);
+
+  typedef Matrix<bool,Dynamic,Dynamic> MatrixX;
+  MatrixX A(rows,depth); A.setRandom();
+  MatrixX B(depth,cols); B.setRandom();
+  MatrixX C(rows,cols);  C.setRandom();
+  MatrixX D(C);
+  for(Index i=0;i<C.rows();++i)
+    for(Index j=0;j<C.cols();++j)
+      for(Index k=0;k<A.cols();++k)
+       D.coeffRef(i,j) |= A.coeff(i,k) & B.coeff(k,j);
+  C += A * B;
+  VERIFY_IS_EQUAL(C, D);
+
+  MatrixX E = B.transpose();
+  for(Index i=0;i<B.rows();++i)
+    for(Index j=0;j<B.cols();++j)
+      VERIFY_IS_EQUAL(B(i,j), E(j,i));
+}
+
+template<typename T, int Rows, int Cols, int Depth, int OC, int OA, int OB>
+typename internal::enable_if<  ( (Rows ==1&&Depth!=1&&OA==ColMajor)
+                              || (Depth==1&&Rows !=1&&OA==RowMajor)
+                              || (Cols ==1&&Depth!=1&&OB==RowMajor)
+                              || (Depth==1&&Cols !=1&&OB==ColMajor)
+                              || (Rows ==1&&Cols !=1&&OC==ColMajor)
+                              || (Cols ==1&&Rows !=1&&OC==RowMajor)),void>::type
+test_lazy_single(int, int, int)
+{
+}
+
+template<typename T, int Rows, int Cols, int Depth>
+void test_lazy_all_layout(int rows=Rows, int cols=Cols, int depth=Depth)
+{
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,ColMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,ColMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,RowMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,RowMajor,ColMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,ColMajor,RowMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,ColMajor,RowMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,RowMajor,RowMajor>(rows,cols,depth) ));
+  CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,RowMajor,RowMajor>(rows,cols,depth) ));
+}  
+
+template<typename T>
+void test_lazy_l1()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  int depth = internal::random<int>(1,12);
+
+  // Inner
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,3>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,9>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,1,-1>(1,1,depth) ));
+
+  // Outer
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,1,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,2,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,2,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,3,3,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,4,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,8,1>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,-1,1>(4,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,7,-1,1>(7,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,8,1>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,3,1>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,-1,1>(rows,cols) ));
+}
+
+template<typename T>
+void test_lazy_l2()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  int depth = internal::random<int>(1,12);
+
+  // mat-vec
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,1,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,5,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,6,1,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,1,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,1,4>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,1,-1>(4,1,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,1,-1>(rows,1,depth) ));
+
+  // vec-mat
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,2,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,2,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,5,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,4,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,8,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,-1, 4>(1,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1, 4,-1>(1,4,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,1,-1,-1>(1,cols,depth) ));
+}
+
+template<typename T>
+void test_lazy_l3()
+{
+  int rows = internal::random<int>(1,12);
+  int cols = internal::random<int>(1,12);
+  int depth = internal::random<int>(1,12);
+  // mat-mat
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,4,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,3,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,8,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,5,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,2,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,7,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,6,8,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,3,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,6,4>(rows) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,3,-1>(4,3,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,-1,6,-1>(rows,6,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,2,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,5,2,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,4,2>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,4,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,6,5,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,4,5>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,3,4,6>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,2,6,4>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,7,8,8>() ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,8,-1, 4>(8,cols) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,3, 4,-1>(3,4,depth) ));
+  CALL_SUBTEST(( test_lazy_all_layout<T,4,-1,-1>(4,cols,depth) ));
+}
+
+template<typename T,int N,int M,int K>
+void test_linear_but_not_vectorizable()
+{
+  // Check tricky cases for which the result of the product is a vector and thus must exhibit the LinearBit flag,
+  // but is not vectorizable along the linear dimension.
+  Index n = N==Dynamic ? internal::random<Index>(1,32) : N;
+  Index m = M==Dynamic ? internal::random<Index>(1,32) : M;
+  Index k = K==Dynamic ? internal::random<Index>(1,32) : K;
+
+  {
+    Matrix<T,N,M+1> A; A.setRandom(n,m+1);
+    Matrix<T,M*2,K> B; B.setRandom(m*2,k);
+    Matrix<T,1,K> C;
+    Matrix<T,1,K> R;
+
+    C.noalias() = A.template topLeftCorner<1,M>() * (B.template topRows<M>()+B.template bottomRows<M>());
+    R.noalias() = A.template topLeftCorner<1,M>() * (B.template topRows<M>()+B.template bottomRows<M>()).eval();
+    VERIFY_IS_APPROX(C,R);
+  }
+
+  {
+    Matrix<T,M+1,N,RowMajor> A; A.setRandom(m+1,n);
+    Matrix<T,K,M*2,RowMajor> B; B.setRandom(k,m*2);
+    Matrix<T,K,1> C;
+    Matrix<T,K,1> R;
+
+    C.noalias() = (B.template leftCols<M>()+B.template rightCols<M>())        * A.template topLeftCorner<M,1>();
+    R.noalias() = (B.template leftCols<M>()+B.template rightCols<M>()).eval() * A.template topLeftCorner<M,1>();
+    VERIFY_IS_APPROX(C,R);
+  }
+}
+
+template<int Rows>
+void bug_1311()
+{
+  Matrix< double, Rows, 2 > A;  A.setRandom();
+  Vector2d b = Vector2d::Random() ;
+  Matrix<double,Rows,1> res;
+  res.noalias() = 1. * (A * b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = 1.*A * b;
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+}
+
+template<int>
+void product_small_regressions()
+{
+  {
+    // test compilation of (outer_product) * vector
+    Vector3f v = Vector3f::Random();
+    VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
+  }
+  
+  {
+    // regression test for pull-request #93
+    Eigen::Matrix<double, 1, 1> A;  A.setRandom();
+    Eigen::Matrix<double, 18, 1> B; B.setRandom();
+    Eigen::Matrix<double, 1, 18> C; C.setRandom();
+    VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
+    VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+  }
+
+  {
+    Eigen::Matrix<double, 10, 10> A, B, C;
+    A.setRandom();
+    C = A;
+    for(int k=0; k<79; ++k)
+      C = C * A;
+    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
+                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
+    VERIFY_IS_APPROX(B,C);
+  }
+}
+
+EIGEN_DECLARE_TEST(product_small)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
+    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
+    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
+    CALL_SUBTEST_3( product(Matrix3d()) );
+    CALL_SUBTEST_4( product(Matrix4d()) );
+    CALL_SUBTEST_5( product(Matrix4f()) );
+    CALL_SUBTEST_6( product1x1<0>() );
+
+    CALL_SUBTEST_11( test_lazy_l1<float>() );
+    CALL_SUBTEST_12( test_lazy_l2<float>() );
+    CALL_SUBTEST_13( test_lazy_l3<float>() );
+
+    CALL_SUBTEST_21( test_lazy_l1<double>() );
+    CALL_SUBTEST_22( test_lazy_l2<double>() );
+    CALL_SUBTEST_23( test_lazy_l3<double>() );
+
+    CALL_SUBTEST_31( test_lazy_l1<std::complex<float> >() );
+    CALL_SUBTEST_32( test_lazy_l2<std::complex<float> >() );
+    CALL_SUBTEST_33( test_lazy_l3<std::complex<float> >() );
+
+    CALL_SUBTEST_41( test_lazy_l1<std::complex<double> >() );
+    CALL_SUBTEST_42( test_lazy_l2<std::complex<double> >() );
+    CALL_SUBTEST_43( test_lazy_l3<std::complex<double> >() );
+
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));
+
+    CALL_SUBTEST_6( bug_1311<3>() );
+    CALL_SUBTEST_6( bug_1311<5>() );
+
+    CALL_SUBTEST_9( test_dynamic_bool() );
+  }
+
+  CALL_SUBTEST_6( product_small_regressions<0>() );
+}

diff --git a/test/product_symm.cpp b/test/product_symm.cpp
new file mode 100644
index 0000000..ea8d4d5
--- /dev/null
+++ b/test/product_symm.cpp

@@ -0,0 +1,125 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename Scalar, int Size, int OtherSize> void symm(int size = Size, int othersize = OtherSize)
+{
+  typedef Matrix<Scalar, Size, Size> MatrixType;
+  typedef Matrix<Scalar, Size, OtherSize> Rhs1;
+  typedef Matrix<Scalar, OtherSize, Size> Rhs2;
+  enum { order = OtherSize==1 ? 0 : RowMajor };
+  typedef Matrix<Scalar, Size, OtherSize,order> Rhs3;
+
+  Index rows = size;
+  Index cols = size;
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols), m3;
+
+  m1 = (m1+m1.adjoint()).eval();
+
+  Rhs1 rhs1 = Rhs1::Random(cols, othersize), rhs12(cols, othersize), rhs13(cols, othersize);
+  Rhs2 rhs2 = Rhs2::Random(othersize, rows), rhs22(othersize, rows), rhs23(othersize, rows);
+  Rhs3 rhs3 = Rhs3::Random(cols, othersize), rhs32(cols, othersize), rhs33(cols, othersize);
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>();
+
+  m2 = m1.template triangularView<Lower>();
+  m3 = m2.template selfadjointView<Lower>();
+  VERIFY_IS_EQUAL(m1, m3);
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>() * (s2*rhs1),
+                   rhs13 = (s1*m1) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).transpose().template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 = (s1*m1.transpose()) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().transpose() * (s2*rhs1),
+                   rhs13 = (s1*m1.transpose()) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).conjugate().template selfadjointView<Lower>() * (s2*rhs1),
+                   rhs13 = (s1*m1).conjugate() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().conjugate() * (s2*rhs1),
+                   rhs13 = (s1*m1).conjugate() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).adjoint().template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 = (s1*m1).adjoint() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().adjoint() * (s2*rhs1),
+                   rhs13 = (s1*m1).adjoint() * (s2*rhs1));
+
+  m2 = m1.template triangularView<Upper>(); rhs12.setRandom(); rhs13 = rhs12;
+  m3 = m2.template selfadjointView<Upper>();
+  VERIFY_IS_EQUAL(m1, m3);
+  VERIFY_IS_APPROX(rhs12 += (s1*m2).template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 += (s1*m1) * (s2*rhs1));
+
+  m2 = m1.template triangularView<Lower>();
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>() * (s2*rhs2.adjoint()),
+                   rhs13 = (s1*m1) * (s2*rhs2.adjoint()));
+
+  m2 = m1.template triangularView<Upper>();
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Upper>() * (s2*rhs2.adjoint()),
+                   rhs13 = (s1*m1) * (s2*rhs2.adjoint()));
+
+  m2 = m1.template triangularView<Upper>();
+  VERIFY_IS_APPROX(rhs12 = (s1*m2.adjoint()).template selfadjointView<Lower>() * (s2*rhs2.adjoint()),
+                   rhs13 = (s1*m1.adjoint()) * (s2*rhs2.adjoint()));
+
+  // test row major = <...>
+  m2 = m1.template triangularView<Lower>(); rhs32.setRandom(); rhs13 = rhs32;
+  VERIFY_IS_APPROX(rhs32.noalias() -= (s1*m2).template selfadjointView<Lower>() * (s2*rhs3),
+                   rhs13 -= (s1*m1) * (s2 * rhs3));
+
+  m2 = m1.template triangularView<Upper>();
+  VERIFY_IS_APPROX(rhs32.noalias() = (s1*m2.adjoint()).template selfadjointView<Lower>() * (s2*rhs3).conjugate(),
+                   rhs13 = (s1*m1.adjoint()) * (s2*rhs3).conjugate());
+
+
+  m2 = m1.template triangularView<Upper>(); rhs13 = rhs12;
+  VERIFY_IS_APPROX(rhs12.noalias() += s1 * ((m2.adjoint()).template selfadjointView<Lower>() * (s2*rhs3).conjugate()),
+                   rhs13 += (s1*m1.adjoint()) * (s2*rhs3).conjugate());
+
+  m2 = m1.template triangularView<Lower>();
+  VERIFY_IS_APPROX(rhs22 = (rhs2) * (m2).template selfadjointView<Lower>(), rhs23 = (rhs2) * (m1));
+  VERIFY_IS_APPROX(rhs22 = (s2*rhs2) * (s1*m2).template selfadjointView<Lower>(), rhs23 = (s2*rhs2) * (s1*m1));
+
+  // destination with a non-default inner-stride
+  // see bug 1741
+  {
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+    MatrixX buffer(2*cols,2*othersize);
+    Map<Rhs1,0,Stride<Dynamic,2> > map1(buffer.data(),cols,othersize,Stride<Dynamic,2>(2*rows,2));
+    buffer.setZero();
+    VERIFY_IS_APPROX( map1.noalias()  = (s1*m2).template selfadjointView<Lower>() * (s2*rhs1),
+                      rhs13 = (s1*m1) * (s2*rhs1));
+
+    Map<Rhs2,0,Stride<Dynamic,2> > map2(buffer.data(),rhs22.rows(),rhs22.cols(),Stride<Dynamic,2>(2*rhs22.outerStride(),2));
+    buffer.setZero();
+    VERIFY_IS_APPROX(map2 = (rhs2) * (m2).template selfadjointView<Lower>(), rhs23 = (rhs2) * (m1));
+  }
+}
+
+EIGEN_DECLARE_TEST(product_symm)
+{
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    CALL_SUBTEST_1(( symm<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+    CALL_SUBTEST_2(( symm<double,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+    CALL_SUBTEST_3(( symm<std::complex<float>,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2),internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2)) ));
+    CALL_SUBTEST_4(( symm<std::complex<double>,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2),internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2)) ));
+
+    CALL_SUBTEST_5(( symm<float,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+    CALL_SUBTEST_6(( symm<double,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+    CALL_SUBTEST_7(( symm<std::complex<float>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+    CALL_SUBTEST_8(( symm<std::complex<double>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) ));
+  }
+}

diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp
new file mode 100644
index 0000000..8becd37
--- /dev/null
+++ b/test/product_syrk.cpp

@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void syrk(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime, RowMajor> RMatrixType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic> Rhs1;
+  typedef Matrix<Scalar, Dynamic, MatrixType::RowsAtCompileTime> Rhs2;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic,RowMajor> Rhs3;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3 = MatrixType::Random(rows, cols);
+  RMatrixType rm2 = MatrixType::Random(rows, cols);
+
+  Rhs1 rhs1 = Rhs1::Random(internal::random<int>(1,320), cols); Rhs1 rhs11 = Rhs1::Random(rhs1.rows(), cols);
+  Rhs2 rhs2 = Rhs2::Random(rows, internal::random<int>(1,320)); Rhs2 rhs22 = Rhs2::Random(rows, rhs2.cols());
+  Rhs3 rhs3 = Rhs3::Random(internal::random<int>(1,320), rows);
+
+  Scalar s1 = internal::random<Scalar>();
+  
+  Index c = internal::random<Index>(0,cols-1);
+
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Lower>().rankUpdate(rhs2,s1)._expression()),
+                   ((s1 * rhs2 * rhs2.adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+  m2.setZero();
+  VERIFY_IS_APPROX(((m2.template triangularView<Lower>() += s1 * rhs2  * rhs22.adjoint()).nestedExpression()),
+                   ((s1 * rhs2 * rhs22.adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+
+  
+  m2.setZero();
+  VERIFY_IS_APPROX(m2.template selfadjointView<Upper>().rankUpdate(rhs2,s1)._expression(),
+                   (s1 * rhs2 * rhs2.adjoint()).eval().template triangularView<Upper>().toDenseMatrix());
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template triangularView<Upper>() += s1 * rhs22 * rhs2.adjoint()).nestedExpression(),
+                   (s1 * rhs22 * rhs2.adjoint()).eval().template triangularView<Upper>().toDenseMatrix());
+
+  
+  m2.setZero();
+  VERIFY_IS_APPROX(m2.template selfadjointView<Lower>().rankUpdate(rhs1.adjoint(),s1)._expression(),
+                   (s1 * rhs1.adjoint() * rhs1).eval().template triangularView<Lower>().toDenseMatrix());
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template triangularView<Lower>() += s1 * rhs11.adjoint() * rhs1).nestedExpression(),
+                   (s1 * rhs11.adjoint() * rhs1).eval().template triangularView<Lower>().toDenseMatrix());
+  
+  
+  m2.setZero();
+  VERIFY_IS_APPROX(m2.template selfadjointView<Upper>().rankUpdate(rhs1.adjoint(),s1)._expression(),
+                   (s1 * rhs1.adjoint() * rhs1).eval().template triangularView<Upper>().toDenseMatrix());
+  VERIFY_IS_APPROX((m2.template triangularView<Upper>() = s1 * rhs1.adjoint() * rhs11).nestedExpression(),
+                   (s1 * rhs1.adjoint() * rhs11).eval().template triangularView<Upper>().toDenseMatrix());
+
+  
+  m2.setZero();
+  VERIFY_IS_APPROX(m2.template selfadjointView<Lower>().rankUpdate(rhs3.adjoint(),s1)._expression(),
+                   (s1 * rhs3.adjoint() * rhs3).eval().template triangularView<Lower>().toDenseMatrix());
+
+  m2.setZero();
+  VERIFY_IS_APPROX(m2.template selfadjointView<Upper>().rankUpdate(rhs3.adjoint(),s1)._expression(),
+                   (s1 * rhs3.adjoint() * rhs3).eval().template triangularView<Upper>().toDenseMatrix());
+                   
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Lower>().rankUpdate(m1.col(c),s1)._expression()),
+                   ((s1 * m1.col(c) * m1.col(c).adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+                   
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Upper>().rankUpdate(m1.col(c),s1)._expression()),
+                   ((s1 * m1.col(c) * m1.col(c).adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
+  rm2.setZero();
+  VERIFY_IS_APPROX((rm2.template selfadjointView<Upper>().rankUpdate(m1.col(c),s1)._expression()),
+                   ((s1 * m1.col(c) * m1.col(c).adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template triangularView<Upper>() += s1 * m3.col(c) * m1.col(c).adjoint()).nestedExpression(),
+                   ((s1 * m3.col(c) * m1.col(c).adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
+  rm2.setZero();
+  VERIFY_IS_APPROX((rm2.template triangularView<Upper>() += s1 * m1.col(c) * m3.col(c).adjoint()).nestedExpression(),
+                   ((s1 * m1.col(c) * m3.col(c).adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
+  
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Lower>().rankUpdate(m1.col(c).conjugate(),s1)._expression()),
+                   ((s1 * m1.col(c).conjugate() * m1.col(c).conjugate().adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+                   
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Upper>().rankUpdate(m1.col(c).conjugate(),s1)._expression()),
+                   ((s1 * m1.col(c).conjugate() * m1.col(c).conjugate().adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
+  
+  
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Lower>().rankUpdate(m1.row(c),s1)._expression()),
+                   ((s1 * m1.row(c).transpose() * m1.row(c).transpose().adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+  rm2.setZero();
+  VERIFY_IS_APPROX((rm2.template selfadjointView<Lower>().rankUpdate(m1.row(c),s1)._expression()),
+                   ((s1 * m1.row(c).transpose() * m1.row(c).transpose().adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template triangularView<Lower>() += s1 * m3.row(c).transpose() * m1.row(c).transpose().adjoint()).nestedExpression(),
+                   ((s1 * m3.row(c).transpose() * m1.row(c).transpose().adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+  rm2.setZero();
+  VERIFY_IS_APPROX((rm2.template triangularView<Lower>() += s1 * m3.row(c).transpose() * m1.row(c).transpose().adjoint()).nestedExpression(),
+                   ((s1 * m3.row(c).transpose() * m1.row(c).transpose().adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+  
+  
+  m2.setZero();
+  VERIFY_IS_APPROX((m2.template selfadjointView<Upper>().rankUpdate(m1.row(c).adjoint(),s1)._expression()),
+                   ((s1 * m1.row(c).adjoint() * m1.row(c).adjoint().adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
+
+  // destination with a non-default inner-stride
+  // see bug 1741
+  {
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+    MatrixX buffer(2*rows,2*cols);
+    Map<MatrixType,0,Stride<Dynamic,2> > map1(buffer.data(),rows,cols,Stride<Dynamic,2>(2*rows,2));
+    buffer.setZero();
+    VERIFY_IS_APPROX((map1.template selfadjointView<Lower>().rankUpdate(rhs2,s1)._expression()),
+                      ((s1 * rhs2 * rhs2.adjoint()).eval().template triangularView<Lower>().toDenseMatrix()));
+  }
+}
+
+EIGEN_DECLARE_TEST(product_syrk)
+{
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    int s;
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+    CALL_SUBTEST_1( syrk(MatrixXf(s, s)) );
+    CALL_SUBTEST_2( syrk(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
+    CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) );
+    CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+}

diff --git a/test/product_trmm.cpp b/test/product_trmm.cpp
new file mode 100644
index 0000000..2bb4b9e
--- /dev/null
+++ b/test/product_trmm.cpp

@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename T>
+int get_random_size()
+{
+  const int factor = NumTraits<T>::ReadCost;
+  const int max_test_size = EIGEN_TEST_MAX_SIZE>2*factor ? EIGEN_TEST_MAX_SIZE/factor : EIGEN_TEST_MAX_SIZE;
+  return internal::random<int>(1,max_test_size);
+}
+
+template<typename Scalar, int Mode, int TriOrder, int OtherOrder, int ResOrder, int OtherCols>
+void trmm(int rows=get_random_size<Scalar>(),
+          int cols=get_random_size<Scalar>(),
+          int otherCols = OtherCols==Dynamic?get_random_size<Scalar>():OtherCols)
+{
+  typedef Matrix<Scalar,Dynamic,Dynamic,TriOrder> TriMatrix;
+  typedef Matrix<Scalar,Dynamic,OtherCols,OtherCols==1?ColMajor:OtherOrder> OnTheRight;
+  typedef Matrix<Scalar,OtherCols,Dynamic,OtherCols==1?RowMajor:OtherOrder> OnTheLeft;
+  
+  typedef Matrix<Scalar,Dynamic,OtherCols,OtherCols==1?ColMajor:ResOrder> ResXS;
+  typedef Matrix<Scalar,OtherCols,Dynamic,OtherCols==1?RowMajor:ResOrder> ResSX;
+
+  TriMatrix  mat(rows,cols), tri(rows,cols), triTr(cols,rows), s1tri(rows,cols), s1triTr(cols,rows);
+  
+  OnTheRight  ge_right(cols,otherCols);
+  OnTheLeft   ge_left(otherCols,rows);
+  ResSX       ge_sx, ge_sx_save;
+  ResXS       ge_xs, ge_xs_save;
+
+  Scalar s1 = internal::random<Scalar>(),
+         s2 = internal::random<Scalar>();
+
+  mat.setRandom();
+  tri = mat.template triangularView<Mode>();
+  triTr = mat.transpose().template triangularView<Mode>();
+  s1tri = (s1*mat).template triangularView<Mode>();
+  s1triTr = (s1*mat).transpose().template triangularView<Mode>();
+  ge_right.setRandom();
+  ge_left.setRandom();
+
+  VERIFY_IS_APPROX( ge_xs = mat.template triangularView<Mode>() * ge_right, tri * ge_right);
+  VERIFY_IS_APPROX( ge_sx = ge_left * mat.template triangularView<Mode>(), ge_left * tri);
+  
+  VERIFY_IS_APPROX( ge_xs.noalias() = mat.template triangularView<Mode>() * ge_right, tri * ge_right);
+  VERIFY_IS_APPROX( ge_sx.noalias() = ge_left * mat.template triangularView<Mode>(), ge_left * tri);
+
+  if((Mode&UnitDiag)==0)
+    VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView<Mode>() * (s2*ge_left.transpose()), s1*triTr.conjugate() * (s2*ge_left.transpose()));
+  
+  VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.transpose()).template triangularView<Mode>() * (s2*ge_left.transpose()), s1triTr * (s2*ge_left.transpose()));
+  VERIFY_IS_APPROX( ge_sx.noalias() = (s2*ge_left) * (s1*mat).template triangularView<Mode>(), (s2*ge_left)*s1tri);
+
+  VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.transpose() * mat.adjoint().template triangularView<Mode>(), ge_right.transpose() * triTr.conjugate());
+  VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.adjoint() * mat.adjoint().template triangularView<Mode>(), ge_right.adjoint() * triTr.conjugate());
+  
+  ge_xs_save = ge_xs;
+  if((Mode&UnitDiag)==0)
+    VERIFY_IS_APPROX( (ge_xs_save + s1*triTr.conjugate() * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.adjoint()).template triangularView<Mode>() * (s2*ge_left.adjoint()) );
+  ge_xs_save = ge_xs;
+  VERIFY_IS_APPROX( (ge_xs_save + s1triTr * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.transpose()).template triangularView<Mode>() * (s2*ge_left.adjoint()) );
+  ge_sx.setRandom();
+  ge_sx_save = ge_sx;
+  if((Mode&UnitDiag)==0)
+    VERIFY_IS_APPROX( ge_sx_save - (ge_right.adjoint() * (-s1 * triTr).conjugate()).eval(), ge_sx.noalias() -= (ge_right.adjoint() * (-s1 * mat).adjoint().template triangularView<Mode>()).eval());
+  
+  if((Mode&UnitDiag)==0)
+    VERIFY_IS_APPROX( ge_xs = (s1*mat).adjoint().template triangularView<Mode>() * ge_left.adjoint(), numext::conj(s1) * triTr.conjugate() * ge_left.adjoint());
+  VERIFY_IS_APPROX( ge_xs = (s1*mat).transpose().template triangularView<Mode>() * ge_left.adjoint(), s1triTr * ge_left.adjoint());
+
+  // TODO check with sub-matrix expressions ?
+
+  // destination with a non-default inner-stride
+  // see bug 1741
+  {
+    VERIFY_IS_APPROX( ge_xs.noalias() = mat.template triangularView<Mode>() * ge_right, tri * ge_right);
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+    MatrixX buffer(2*ge_xs.rows(),2*ge_xs.cols());
+    Map<ResXS,0,Stride<Dynamic,2> > map1(buffer.data(),ge_xs.rows(),ge_xs.cols(),Stride<Dynamic,2>(2*ge_xs.outerStride(),2));
+    buffer.setZero();
+    VERIFY_IS_APPROX( map1.noalias() = mat.template triangularView<Mode>() * ge_right, tri * ge_right);
+  }
+}
+
+template<typename Scalar, int Mode, int TriOrder>
+void trmv(int rows=get_random_size<Scalar>(), int cols=get_random_size<Scalar>())
+{
+  trmm<Scalar,Mode,TriOrder,ColMajor,ColMajor,1>(rows,cols,1);
+}
+
+template<typename Scalar, int Mode, int TriOrder, int OtherOrder, int ResOrder>
+void trmm(int rows=get_random_size<Scalar>(), int cols=get_random_size<Scalar>(), int otherCols = get_random_size<Scalar>())
+{
+  trmm<Scalar,Mode,TriOrder,OtherOrder,ResOrder,Dynamic>(rows,cols,otherCols);
+}
+
+#define CALL_ALL_ORDERS(NB,SCALAR,MODE)                                             \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, ColMajor,ColMajor,ColMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, ColMajor,ColMajor,RowMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, ColMajor,RowMajor,ColMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, ColMajor,RowMajor,RowMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, RowMajor,ColMajor,ColMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, RowMajor,ColMajor,RowMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, RowMajor,RowMajor,ColMajor>()));  \
+  EIGEN_CAT(CALL_SUBTEST_,NB)((trmm<SCALAR, MODE, RowMajor,RowMajor,RowMajor>()));  \
+  \
+  EIGEN_CAT(CALL_SUBTEST_1,NB)((trmv<SCALAR, MODE, ColMajor>()));                   \
+  EIGEN_CAT(CALL_SUBTEST_1,NB)((trmv<SCALAR, MODE, RowMajor>()));
+
+  
+#define CALL_ALL(NB,SCALAR)                 \
+  CALL_ALL_ORDERS(EIGEN_CAT(1,NB),SCALAR,Upper)          \
+  CALL_ALL_ORDERS(EIGEN_CAT(2,NB),SCALAR,UnitUpper)      \
+  CALL_ALL_ORDERS(EIGEN_CAT(3,NB),SCALAR,StrictlyUpper)  \
+  CALL_ALL_ORDERS(EIGEN_CAT(1,NB),SCALAR,Lower)          \
+  CALL_ALL_ORDERS(EIGEN_CAT(2,NB),SCALAR,UnitLower)      \
+  CALL_ALL_ORDERS(EIGEN_CAT(3,NB),SCALAR,StrictlyLower)
+  
+
+EIGEN_DECLARE_TEST(product_trmm)
+{
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    CALL_ALL(1,float);                //  EIGEN_SUFFIXES;11;111;21;121;31;131
+    CALL_ALL(2,double);               //  EIGEN_SUFFIXES;12;112;22;122;32;132
+    CALL_ALL(3,std::complex<float>);  //  EIGEN_SUFFIXES;13;113;23;123;33;133
+    CALL_ALL(4,std::complex<double>); //  EIGEN_SUFFIXES;14;114;24;124;34;134
+  }
+}

diff --git a/test/product_trmv.cpp b/test/product_trmv.cpp
new file mode 100644
index 0000000..5eb1b5a
--- /dev/null
+++ b/test/product_trmv.cpp

@@ -0,0 +1,90 @@
+// This file is triangularView of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void trmv(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  RealScalar largerEps = 10*test_precision<RealScalar>();
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m3(rows, cols);
+  VectorType v1 = VectorType::Random(rows);
+
+  Scalar s1 = internal::random<Scalar>();
+
+  m1 = MatrixType::Random(rows, cols);
+
+  // check with a column-major matrix
+  m3 = m1.template triangularView<Eigen::Lower>();
+  VERIFY((m3 * v1).isApprox(m1.template triangularView<Eigen::Lower>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::Upper>();
+  VERIFY((m3 * v1).isApprox(m1.template triangularView<Eigen::Upper>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::UnitLower>();
+  VERIFY((m3 * v1).isApprox(m1.template triangularView<Eigen::UnitLower>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::UnitUpper>();
+  VERIFY((m3 * v1).isApprox(m1.template triangularView<Eigen::UnitUpper>() * v1, largerEps));
+
+  // check conjugated and scalar multiple expressions (col-major)
+  m3 = m1.template triangularView<Eigen::Lower>();
+  VERIFY(((s1*m3).conjugate() * v1).isApprox((s1*m1).conjugate().template triangularView<Eigen::Lower>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::Upper>();
+  VERIFY((m3.conjugate() * v1.conjugate()).isApprox(m1.conjugate().template triangularView<Eigen::Upper>() * v1.conjugate(), largerEps));
+
+  // check with a row-major matrix
+  m3 = m1.template triangularView<Eigen::Upper>();
+  VERIFY((m3.transpose() * v1).isApprox(m1.transpose().template triangularView<Eigen::Lower>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::Lower>();
+  VERIFY((m3.transpose() * v1).isApprox(m1.transpose().template triangularView<Eigen::Upper>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::UnitUpper>();
+  VERIFY((m3.transpose() * v1).isApprox(m1.transpose().template triangularView<Eigen::UnitLower>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::UnitLower>();
+  VERIFY((m3.transpose() * v1).isApprox(m1.transpose().template triangularView<Eigen::UnitUpper>() * v1, largerEps));
+
+  // check conjugated and scalar multiple expressions (row-major)
+  m3 = m1.template triangularView<Eigen::Upper>();
+  VERIFY((m3.adjoint() * v1).isApprox(m1.adjoint().template triangularView<Eigen::Lower>() * v1, largerEps));
+  m3 = m1.template triangularView<Eigen::Lower>();
+  VERIFY((m3.adjoint() * (s1*v1.conjugate())).isApprox(m1.adjoint().template triangularView<Eigen::Upper>() * (s1*v1.conjugate()), largerEps));
+  m3 = m1.template triangularView<Eigen::UnitUpper>();
+
+  // check transposed cases:
+  m3 = m1.template triangularView<Eigen::Lower>();
+  VERIFY((v1.transpose() * m3).isApprox(v1.transpose() * m1.template triangularView<Eigen::Lower>(), largerEps));
+  VERIFY((v1.adjoint() * m3).isApprox(v1.adjoint() * m1.template triangularView<Eigen::Lower>(), largerEps));
+  VERIFY((v1.adjoint() * m3.adjoint()).isApprox(v1.adjoint() * m1.template triangularView<Eigen::Lower>().adjoint(), largerEps));
+
+  // TODO check with sub-matrices
+}
+
+EIGEN_DECLARE_TEST(product_trmv)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat ; i++) {
+    CALL_SUBTEST_1( trmv(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( trmv(Matrix<float, 2, 2>()) );
+    CALL_SUBTEST_3( trmv(Matrix3d()) );
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
+    CALL_SUBTEST_4( trmv(MatrixXcf(s,s)) );
+    CALL_SUBTEST_5( trmv(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+    CALL_SUBTEST_6( trmv(Matrix<float,Dynamic,Dynamic,RowMajor>(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+}

diff --git a/test/product_trsolve.cpp b/test/product_trsolve.cpp
new file mode 100644
index 0000000..c59748c
--- /dev/null
+++ b/test/product_trsolve.cpp

@@ -0,0 +1,127 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#define VERIFY_TRSM(TRI,XB) { \
+    (XB).setRandom(); ref = (XB); \
+    (TRI).solveInPlace(XB); \
+    VERIFY_IS_APPROX((TRI).toDenseMatrix() * (XB), ref); \
+    (XB).setRandom(); ref = (XB); \
+    (XB) = (TRI).solve(XB); \
+    VERIFY_IS_APPROX((TRI).toDenseMatrix() * (XB), ref); \
+  }
+
+#define VERIFY_TRSM_ONTHERIGHT(TRI,XB) { \
+    (XB).setRandom(); ref = (XB); \
+    (TRI).transpose().template solveInPlace<OnTheRight>(XB.transpose()); \
+    VERIFY_IS_APPROX((XB).transpose() * (TRI).transpose().toDenseMatrix(), ref.transpose()); \
+    (XB).setRandom(); ref = (XB); \
+    (XB).transpose() = (TRI).transpose().template solve<OnTheRight>(XB.transpose()); \
+    VERIFY_IS_APPROX((XB).transpose() * (TRI).transpose().toDenseMatrix(), ref.transpose()); \
+  }
+
+template<typename Scalar,int Size, int Cols> void trsolve(int size=Size,int cols=Cols)
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Matrix<Scalar,Size,Size,ColMajor> cmLhs(size,size);
+  Matrix<Scalar,Size,Size,RowMajor> rmLhs(size,size);
+
+  enum {  colmajor = Size==1 ? RowMajor : ColMajor,
+          rowmajor = Cols==1 ? ColMajor : RowMajor };
+  Matrix<Scalar,Size,Cols,colmajor> cmRhs(size,cols);
+  Matrix<Scalar,Size,Cols,rowmajor> rmRhs(size,cols);
+  Matrix<Scalar,Dynamic,Dynamic,colmajor> ref(size,cols);
+
+  cmLhs.setRandom(); cmLhs *= static_cast<RealScalar>(0.1); cmLhs.diagonal().array() += static_cast<RealScalar>(1);
+  rmLhs.setRandom(); rmLhs *= static_cast<RealScalar>(0.1); rmLhs.diagonal().array() += static_cast<RealScalar>(1);
+
+  VERIFY_TRSM(cmLhs.conjugate().template triangularView<Lower>(), cmRhs);
+  VERIFY_TRSM(cmLhs.adjoint()  .template triangularView<Lower>(), cmRhs);
+  VERIFY_TRSM(cmLhs            .template triangularView<Upper>(), cmRhs);
+  VERIFY_TRSM(cmLhs            .template triangularView<Lower>(), rmRhs);
+  VERIFY_TRSM(cmLhs.conjugate().template triangularView<Upper>(), rmRhs);
+  VERIFY_TRSM(cmLhs.adjoint()  .template triangularView<Upper>(), rmRhs);
+
+  VERIFY_TRSM(cmLhs.conjugate().template triangularView<UnitLower>(), cmRhs);
+  VERIFY_TRSM(cmLhs            .template triangularView<UnitUpper>(), rmRhs);
+
+  VERIFY_TRSM(rmLhs            .template triangularView<Lower>(), cmRhs);
+  VERIFY_TRSM(rmLhs.conjugate().template triangularView<UnitUpper>(), rmRhs);
+
+
+  VERIFY_TRSM_ONTHERIGHT(cmLhs.conjugate().template triangularView<Lower>(), cmRhs);
+  VERIFY_TRSM_ONTHERIGHT(cmLhs            .template triangularView<Upper>(), cmRhs);
+  VERIFY_TRSM_ONTHERIGHT(cmLhs            .template triangularView<Lower>(), rmRhs);
+  VERIFY_TRSM_ONTHERIGHT(cmLhs.conjugate().template triangularView<Upper>(), rmRhs);
+
+  VERIFY_TRSM_ONTHERIGHT(cmLhs.conjugate().template triangularView<UnitLower>(), cmRhs);
+  VERIFY_TRSM_ONTHERIGHT(cmLhs            .template triangularView<UnitUpper>(), rmRhs);
+
+  VERIFY_TRSM_ONTHERIGHT(rmLhs            .template triangularView<Lower>(), cmRhs);
+  VERIFY_TRSM_ONTHERIGHT(rmLhs.conjugate().template triangularView<UnitUpper>(), rmRhs);
+
+  int c = internal::random<int>(0,cols-1);
+  VERIFY_TRSM(rmLhs.template triangularView<Lower>(), rmRhs.col(c));
+  VERIFY_TRSM(cmLhs.template triangularView<Lower>(), rmRhs.col(c));
+
+  // destination with a non-default inner-stride
+  // see bug 1741
+  {
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+    MatrixX buffer(2*cmRhs.rows(),2*cmRhs.cols());
+    Map<Matrix<Scalar,Size,Cols,colmajor>,0,Stride<Dynamic,2> > map1(buffer.data(),cmRhs.rows(),cmRhs.cols(),Stride<Dynamic,2>(2*cmRhs.outerStride(),2));
+    Map<Matrix<Scalar,Size,Cols,rowmajor>,0,Stride<Dynamic,2> > map2(buffer.data(),rmRhs.rows(),rmRhs.cols(),Stride<Dynamic,2>(2*rmRhs.outerStride(),2));
+    buffer.setZero();
+    VERIFY_TRSM(cmLhs.conjugate().template triangularView<Lower>(), map1);
+    buffer.setZero();
+    VERIFY_TRSM(cmLhs            .template triangularView<Lower>(), map2);
+  }
+
+  if(Size==Dynamic)
+  {
+    cmLhs.resize(0,0);
+    cmRhs.resize(0,cmRhs.cols());
+    Matrix<Scalar,Size,Cols,colmajor> res = cmLhs.template triangularView<Lower>().solve(cmRhs);
+    VERIFY_IS_EQUAL(res.rows(),0);
+    VERIFY_IS_EQUAL(res.cols(),cmRhs.cols());
+    res = cmRhs;
+    cmLhs.template triangularView<Lower>().solveInPlace(res);
+    VERIFY_IS_EQUAL(res.rows(),0);
+    VERIFY_IS_EQUAL(res.cols(),cmRhs.cols());
+  }
+}
+
+EIGEN_DECLARE_TEST(product_trsolve)
+{
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    // matrices
+    CALL_SUBTEST_1((trsolve<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_2((trsolve<double,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_3((trsolve<std::complex<float>,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2),internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))));
+    CALL_SUBTEST_4((trsolve<std::complex<double>,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2),internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))));
+
+    // vectors
+    CALL_SUBTEST_5((trsolve<float,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_6((trsolve<double,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_7((trsolve<std::complex<float>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_8((trsolve<std::complex<double>,Dynamic,1>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    
+    // meta-unrollers
+    CALL_SUBTEST_9((trsolve<float,4,1>()));
+    CALL_SUBTEST_10((trsolve<double,4,1>()));
+    CALL_SUBTEST_11((trsolve<std::complex<float>,4,1>()));
+    CALL_SUBTEST_12((trsolve<float,1,1>()));
+    CALL_SUBTEST_13((trsolve<float,1,2>()));
+    CALL_SUBTEST_14((trsolve<float,3,1>()));
+    
+  }
+}

diff --git a/test/qr.cpp b/test/qr.cpp
new file mode 100644
index 0000000..c38e343
--- /dev/null
+++ b/test/qr.cpp

@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/QR>
+#include "solverbase.h"
+
+template<typename MatrixType> void qr(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> MatrixQType;
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  HouseholderQR<MatrixType> qrOfA(a);
+
+  MatrixQType q = qrOfA.householderQ();
+  VERIFY_IS_UNITARY(q);
+
+  MatrixType r = qrOfA.matrixQR().template triangularView<Upper>();
+  VERIFY_IS_APPROX(a, qrOfA.householderQ() * r);
+}
+
+template<typename MatrixType, int Cols2> void qr_fixedsize()
+{
+  enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
+  typedef typename MatrixType::Scalar Scalar;
+  Matrix<Scalar,Rows,Cols> m1 = Matrix<Scalar,Rows,Cols>::Random();
+  HouseholderQR<Matrix<Scalar,Rows,Cols> > qr(m1);
+
+  Matrix<Scalar,Rows,Cols> r = qr.matrixQR();
+  // FIXME need better way to construct trapezoid
+  for(int i = 0; i < Rows; i++) for(int j = 0; j < Cols; j++) if(i>j) r(i,j) = Scalar(0);
+
+  VERIFY_IS_APPROX(m1, qr.householderQ() * r);
+
+  check_solverbase<Matrix<Scalar,Cols,Cols2>, Matrix<Scalar,Rows,Cols2> >(m1, qr, Rows, Cols, Cols2);
+}
+
+template<typename MatrixType> void qr_invertible()
+{
+  using std::log;
+  using std::abs;
+  using std::pow;
+  using std::max;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  typedef typename MatrixType::Scalar Scalar;
+
+  STATIC_CHECK(( internal::is_same<typename HouseholderQR<MatrixType>::StorageIndex,int>::value ));
+
+  int size = internal::random<int>(10,50);
+
+  MatrixType m1(size, size), m2(size, size), m3(size, size);
+  m1 = MatrixType::Random(size,size);
+
+  if (internal::is_same<RealScalar,float>::value)
+  {
+    // let's build a matrix more stable to inverse
+    MatrixType a = MatrixType::Random(size,size*4);
+    m1 += a * a.adjoint();
+  }
+
+  HouseholderQR<MatrixType> qr(m1);
+
+  check_solverbase<MatrixType, MatrixType>(m1, qr, size, size, size);
+
+  // now construct a matrix with prescribed determinant
+  m1.setZero();
+  for(int i = 0; i < size; i++) m1(i,i) = internal::random<Scalar>();
+  RealScalar absdet = abs(m1.diagonal().prod());
+  m3 = qr.householderQ(); // get a unitary
+  m1 = m3 * m1 * m3;
+  qr.compute(m1);
+  VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
+  // This test is tricky if the determinant becomes too small.
+  // Since we generate random numbers with magnitude range [0,1], the average determinant is 0.5^size
+  VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), numext::maxi(RealScalar(pow(0.5,size)),numext::maxi<RealScalar>(abs(absdet),abs(qr.absDeterminant()))) );
+  
+}
+
+template<typename MatrixType> void qr_verify_assert()
+{
+  MatrixType tmp;
+
+  HouseholderQR<MatrixType> qr;
+  VERIFY_RAISES_ASSERT(qr.matrixQR())
+  VERIFY_RAISES_ASSERT(qr.solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.householderQ())
+  VERIFY_RAISES_ASSERT(qr.absDeterminant())
+  VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
+}
+
+EIGEN_DECLARE_TEST(qr)
+{
+  for(int i = 0; i < g_repeat; i++) {
+   CALL_SUBTEST_1( qr(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+   CALL_SUBTEST_2( qr(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2),internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+   CALL_SUBTEST_3(( qr_fixedsize<Matrix<float,3,4>, 2 >() ));
+   CALL_SUBTEST_4(( qr_fixedsize<Matrix<double,6,2>, 4 >() ));
+   CALL_SUBTEST_5(( qr_fixedsize<Matrix<double,2,5>, 7 >() ));
+   CALL_SUBTEST_11( qr(Matrix<float,1,1>()) );
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( qr_invertible<MatrixXf>() );
+    CALL_SUBTEST_6( qr_invertible<MatrixXd>() );
+    CALL_SUBTEST_7( qr_invertible<MatrixXcf>() );
+    CALL_SUBTEST_8( qr_invertible<MatrixXcd>() );
+  }
+
+  CALL_SUBTEST_9(qr_verify_assert<Matrix3f>());
+  CALL_SUBTEST_10(qr_verify_assert<Matrix3d>());
+  CALL_SUBTEST_1(qr_verify_assert<MatrixXf>());
+  CALL_SUBTEST_6(qr_verify_assert<MatrixXd>());
+  CALL_SUBTEST_7(qr_verify_assert<MatrixXcf>());
+  CALL_SUBTEST_8(qr_verify_assert<MatrixXcd>());
+
+  // Test problem size constructors
+  CALL_SUBTEST_12(HouseholderQR<MatrixXf>(10, 20));
+}

diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
new file mode 100644
index 0000000..06f1643
--- /dev/null
+++ b/test/qr_colpivoting.cpp

@@ -0,0 +1,368 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/QR>
+#include <Eigen/SVD>
+#include "solverbase.h"
+
+template <typename MatrixType>
+void cod() {
+  STATIC_CHECK(( internal::is_same<typename CompleteOrthogonalDecomposition<MatrixType>::StorageIndex,int>::value ));
+
+  Index rows = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
+  Index cols = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
+  Index cols2 = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
+  Index rank = internal::random<Index>(1, (std::min)(rows, cols) - 1);
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime,
+                 MatrixType::RowsAtCompileTime>
+      MatrixQType;
+  MatrixType matrix;
+  createRandomPIMatrixOfRank(rank, rows, cols, matrix);
+  CompleteOrthogonalDecomposition<MatrixType> cod(matrix);
+  VERIFY(rank == cod.rank());
+  VERIFY(cols - cod.rank() == cod.dimensionOfKernel());
+  VERIFY(!cod.isInjective());
+  VERIFY(!cod.isInvertible());
+  VERIFY(!cod.isSurjective());
+
+  MatrixQType q = cod.householderQ();
+  VERIFY_IS_UNITARY(q);
+
+  MatrixType z = cod.matrixZ();
+  VERIFY_IS_UNITARY(z);
+
+  MatrixType t;
+  t.setZero(rows, cols);
+  t.topLeftCorner(rank, rank) =
+      cod.matrixT().topLeftCorner(rank, rank).template triangularView<Upper>();
+
+  MatrixType c = q * t * z * cod.colsPermutation().inverse();
+  VERIFY_IS_APPROX(matrix, c);
+
+  check_solverbase<MatrixType, MatrixType>(matrix, cod, rows, cols, cols2);
+
+  // Verify that we get the same minimum-norm solution as the SVD.
+  MatrixType exact_solution = MatrixType::Random(cols, cols2);
+  MatrixType rhs = matrix * exact_solution;
+  MatrixType cod_solution = cod.solve(rhs);
+  JacobiSVD<MatrixType> svd(matrix, ComputeThinU | ComputeThinV);
+  MatrixType svd_solution = svd.solve(rhs);
+  VERIFY_IS_APPROX(cod_solution, svd_solution);
+
+  MatrixType pinv = cod.pseudoInverse();
+  VERIFY_IS_APPROX(cod_solution, pinv * rhs);
+}
+
+template <typename MatrixType, int Cols2>
+void cod_fixedsize() {
+  enum {
+    Rows = MatrixType::RowsAtCompileTime,
+    Cols = MatrixType::ColsAtCompileTime
+  };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef CompleteOrthogonalDecomposition<Matrix<Scalar, Rows, Cols> > COD;
+  int rank = internal::random<int>(1, (std::min)(int(Rows), int(Cols)) - 1);
+  Matrix<Scalar, Rows, Cols> matrix;
+  createRandomPIMatrixOfRank(rank, Rows, Cols, matrix);
+  COD cod(matrix);
+  VERIFY(rank == cod.rank());
+  VERIFY(Cols - cod.rank() == cod.dimensionOfKernel());
+  VERIFY(cod.isInjective() == (rank == Rows));
+  VERIFY(cod.isSurjective() == (rank == Cols));
+  VERIFY(cod.isInvertible() == (cod.isInjective() && cod.isSurjective()));
+
+  check_solverbase<Matrix<Scalar, Cols, Cols2>, Matrix<Scalar, Rows, Cols2> >(matrix, cod, Rows, Cols, Cols2);
+
+  // Verify that we get the same minimum-norm solution as the SVD.
+  Matrix<Scalar, Cols, Cols2> exact_solution;
+  exact_solution.setRandom(Cols, Cols2);
+  Matrix<Scalar, Rows, Cols2> rhs = matrix * exact_solution;
+  Matrix<Scalar, Cols, Cols2> cod_solution = cod.solve(rhs);
+  JacobiSVD<MatrixType> svd(matrix, ComputeFullU | ComputeFullV);
+  Matrix<Scalar, Cols, Cols2> svd_solution = svd.solve(rhs);
+  VERIFY_IS_APPROX(cod_solution, svd_solution);
+
+  typename Inverse<COD>::PlainObject pinv = cod.pseudoInverse();
+  VERIFY_IS_APPROX(cod_solution, pinv * rhs);
+}
+
+template<typename MatrixType> void qr()
+{
+  using std::sqrt;
+
+  STATIC_CHECK(( internal::is_same<typename ColPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
+
+  Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
+  Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> MatrixQType;
+  MatrixType m1;
+  createRandomPIMatrixOfRank(rank,rows,cols,m1);
+  ColPivHouseholderQR<MatrixType> qr(m1);
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel());
+  VERIFY(!qr.isInjective());
+  VERIFY(!qr.isInvertible());
+  VERIFY(!qr.isSurjective());
+
+  MatrixQType q = qr.householderQ();
+  VERIFY_IS_UNITARY(q);
+
+  MatrixType r = qr.matrixQR().template triangularView<Upper>();
+  MatrixType c = q * r * qr.colsPermutation().inverse();
+  VERIFY_IS_APPROX(m1, c);
+
+  // Verify that the absolute value of the diagonal elements in R are
+  // non-increasing until they reach the singularity threshold.
+  RealScalar threshold =
+      sqrt(RealScalar(rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) {
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
+    if (x < threshold && y < threshold) continue;
+    if (!test_isApproxOrLessThan(y, x)) {
+      for (Index j = 0; j < (std::min)(rows, cols); ++j) {
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
+      }
+      std::cout << "Failure at i=" << i << ", rank=" << rank
+                << ", threshold=" << threshold << std::endl;
+    }
+    VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
+  }
+
+  check_solverbase<MatrixType, MatrixType>(m1, qr, rows, cols, cols2);
+
+  {
+    MatrixType m2, m3;
+    Index size = rows;
+    do {
+      m1 = MatrixType::Random(size,size);
+      qr.compute(m1);
+    } while(!qr.isInvertible());
+    MatrixType m1_inv = qr.inverse();
+    m3 = m1 * MatrixType::Random(size,cols2);
+    m2 = qr.solve(m3);
+    VERIFY_IS_APPROX(m2, m1_inv*m3);
+  }
+}
+
+template<typename MatrixType, int Cols2> void qr_fixedsize()
+{
+  using std::sqrt;
+  using std::abs;
+  enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  int rank = internal::random<int>(1, (std::min)(int(Rows), int(Cols))-1);
+  Matrix<Scalar,Rows,Cols> m1;
+  createRandomPIMatrixOfRank(rank,Rows,Cols,m1);
+  ColPivHouseholderQR<Matrix<Scalar,Rows,Cols> > qr(m1);
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(Cols - qr.rank(), qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(qr.isInjective(), (rank == Rows));
+  VERIFY_IS_EQUAL(qr.isSurjective(), (rank == Cols));
+  VERIFY_IS_EQUAL(qr.isInvertible(), (qr.isInjective() && qr.isSurjective()));
+
+  Matrix<Scalar,Rows,Cols> r = qr.matrixQR().template triangularView<Upper>();
+  Matrix<Scalar,Rows,Cols> c = qr.householderQ() * r * qr.colsPermutation().inverse();
+  VERIFY_IS_APPROX(m1, c);
+
+  check_solverbase<Matrix<Scalar,Cols,Cols2>, Matrix<Scalar,Rows,Cols2> >(m1, qr, Rows, Cols, Cols2);
+
+  // Verify that the absolute value of the diagonal elements in R are
+  // non-increasing until they reache the singularity threshold.
+  RealScalar threshold =
+      sqrt(RealScalar(Rows)) * (std::abs)(r(0, 0)) * NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < (std::min)(int(Rows), int(Cols)) - 1; ++i) {
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
+    if (x < threshold && y < threshold) continue;
+    if (!test_isApproxOrLessThan(y, x)) {
+      for (Index j = 0; j < (std::min)(int(Rows), int(Cols)); ++j) {
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
+      }
+      std::cout << "Failure at i=" << i << ", rank=" << rank
+                << ", threshold=" << threshold << std::endl;
+    }
+    VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
+  }
+}
+
+// This test is meant to verify that pivots are chosen such that
+// even for a graded matrix, the diagonal of R falls of roughly
+// monotonically until it reaches the threshold for singularity.
+// We use the so-called Kahan matrix, which is a famous counter-example
+// for rank-revealing QR. See
+// http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+// page 3 for more detail.
+template<typename MatrixType> void qr_kahan_matrix()
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Index rows = 300, cols = rows;
+
+  MatrixType m1;
+  m1.setZero(rows,cols);
+  RealScalar s = std::pow(NumTraits<RealScalar>::epsilon(), 1.0 / rows);
+  RealScalar c = std::sqrt(1 - s*s);
+  RealScalar pow_s_i(1.0); // pow(s,i)
+  for (Index i = 0; i < rows; ++i) {
+    m1(i, i) = pow_s_i;
+    m1.row(i).tail(rows - i - 1) = -pow_s_i * c * MatrixType::Ones(1, rows - i - 1);
+    pow_s_i *= s;
+  }
+  m1 = (m1 + m1.transpose()).eval();
+  ColPivHouseholderQR<MatrixType> qr(m1);
+  MatrixType r = qr.matrixQR().template triangularView<Upper>();
+
+  RealScalar threshold =
+      std::sqrt(RealScalar(rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
+  for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) {
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
+    if (x < threshold && y < threshold) continue;
+    if (!test_isApproxOrLessThan(y, x)) {
+      for (Index j = 0; j < (std::min)(rows, cols); ++j) {
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
+      }
+      std::cout << "Failure at i=" << i << ", rank=" << qr.rank()
+                << ", threshold=" << threshold << std::endl;
+    }
+    VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
+  }
+}
+
+template<typename MatrixType> void qr_invertible()
+{
+  using std::log;
+  using std::abs;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  typedef typename MatrixType::Scalar Scalar;
+
+  int size = internal::random<int>(10,50);
+
+  MatrixType m1(size, size), m2(size, size), m3(size, size);
+  m1 = MatrixType::Random(size,size);
+
+  if (internal::is_same<RealScalar,float>::value)
+  {
+    // let's build a matrix more stable to inverse
+    MatrixType a = MatrixType::Random(size,size*2);
+    m1 += a * a.adjoint();
+  }
+
+  ColPivHouseholderQR<MatrixType> qr(m1);
+
+  check_solverbase<MatrixType, MatrixType>(m1, qr, size, size, size);
+
+  // now construct a matrix with prescribed determinant
+  m1.setZero();
+  for(int i = 0; i < size; i++) m1(i,i) = internal::random<Scalar>();
+  RealScalar absdet = abs(m1.diagonal().prod());
+  m3 = qr.householderQ(); // get a unitary
+  m1 = m3 * m1 * m3;
+  qr.compute(m1);
+  VERIFY_IS_APPROX(absdet, qr.absDeterminant());
+  VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
+}
+
+template<typename MatrixType> void qr_verify_assert()
+{
+  MatrixType tmp;
+
+  ColPivHouseholderQR<MatrixType> qr;
+  VERIFY_RAISES_ASSERT(qr.matrixQR())
+  VERIFY_RAISES_ASSERT(qr.solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.householderQ())
+  VERIFY_RAISES_ASSERT(qr.dimensionOfKernel())
+  VERIFY_RAISES_ASSERT(qr.isInjective())
+  VERIFY_RAISES_ASSERT(qr.isSurjective())
+  VERIFY_RAISES_ASSERT(qr.isInvertible())
+  VERIFY_RAISES_ASSERT(qr.inverse())
+  VERIFY_RAISES_ASSERT(qr.absDeterminant())
+  VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
+}
+
+template<typename MatrixType> void cod_verify_assert()
+{
+  MatrixType tmp;
+
+  CompleteOrthogonalDecomposition<MatrixType> cod;
+  VERIFY_RAISES_ASSERT(cod.matrixQTZ())
+  VERIFY_RAISES_ASSERT(cod.solve(tmp))
+  VERIFY_RAISES_ASSERT(cod.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(cod.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(cod.householderQ())
+  VERIFY_RAISES_ASSERT(cod.dimensionOfKernel())
+  VERIFY_RAISES_ASSERT(cod.isInjective())
+  VERIFY_RAISES_ASSERT(cod.isSurjective())
+  VERIFY_RAISES_ASSERT(cod.isInvertible())
+  VERIFY_RAISES_ASSERT(cod.pseudoInverse())
+  VERIFY_RAISES_ASSERT(cod.absDeterminant())
+  VERIFY_RAISES_ASSERT(cod.logAbsDeterminant())
+}
+
+EIGEN_DECLARE_TEST(qr_colpivoting)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( qr<MatrixXf>() );
+    CALL_SUBTEST_2( qr<MatrixXd>() );
+    CALL_SUBTEST_3( qr<MatrixXcd>() );
+    CALL_SUBTEST_4(( qr_fixedsize<Matrix<float,3,5>, 4 >() ));
+    CALL_SUBTEST_5(( qr_fixedsize<Matrix<double,6,2>, 3 >() ));
+    CALL_SUBTEST_5(( qr_fixedsize<Matrix<double,1,1>, 1 >() ));
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( cod<MatrixXf>() );
+    CALL_SUBTEST_2( cod<MatrixXd>() );
+    CALL_SUBTEST_3( cod<MatrixXcd>() );
+    CALL_SUBTEST_4(( cod_fixedsize<Matrix<float,3,5>, 4 >() ));
+    CALL_SUBTEST_5(( cod_fixedsize<Matrix<double,6,2>, 3 >() ));
+    CALL_SUBTEST_5(( cod_fixedsize<Matrix<double,1,1>, 1 >() ));
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( qr_invertible<MatrixXf>() );
+    CALL_SUBTEST_2( qr_invertible<MatrixXd>() );
+    CALL_SUBTEST_6( qr_invertible<MatrixXcf>() );
+    CALL_SUBTEST_3( qr_invertible<MatrixXcd>() );
+  }
+
+  CALL_SUBTEST_7(qr_verify_assert<Matrix3f>());
+  CALL_SUBTEST_8(qr_verify_assert<Matrix3d>());
+  CALL_SUBTEST_1(qr_verify_assert<MatrixXf>());
+  CALL_SUBTEST_2(qr_verify_assert<MatrixXd>());
+  CALL_SUBTEST_6(qr_verify_assert<MatrixXcf>());
+  CALL_SUBTEST_3(qr_verify_assert<MatrixXcd>());
+
+  CALL_SUBTEST_7(cod_verify_assert<Matrix3f>());
+  CALL_SUBTEST_8(cod_verify_assert<Matrix3d>());
+  CALL_SUBTEST_1(cod_verify_assert<MatrixXf>());
+  CALL_SUBTEST_2(cod_verify_assert<MatrixXd>());
+  CALL_SUBTEST_6(cod_verify_assert<MatrixXcf>());
+  CALL_SUBTEST_3(cod_verify_assert<MatrixXcd>());
+
+  // Test problem size constructors
+  CALL_SUBTEST_9(ColPivHouseholderQR<MatrixXf>(10, 20));
+
+  CALL_SUBTEST_1( qr_kahan_matrix<MatrixXf>() );
+  CALL_SUBTEST_2( qr_kahan_matrix<MatrixXd>() );
+}

diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
new file mode 100644
index 0000000..f2d8cb3
--- /dev/null
+++ b/test/qr_fullpivoting.cpp

@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/QR>
+#include "solverbase.h"
+
+template<typename MatrixType> void qr()
+{
+  STATIC_CHECK(( internal::is_same<typename FullPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
+
+  static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime;
+  Index max_size = EIGEN_TEST_MAX_SIZE;
+  Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
+  Index rows  = Rows == Dynamic ? internal::random<Index>(min_size,max_size) : Rows,
+        cols  = Cols == Dynamic ? internal::random<Index>(min_size,max_size) : Cols,
+        cols2 = Cols == Dynamic ? internal::random<Index>(min_size,max_size) : Cols,
+        rank  = internal::random<Index>(1, (std::min)(rows, cols)-1);
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> MatrixQType;
+  MatrixType m1;
+  createRandomPIMatrixOfRank(rank,rows,cols,m1);
+  FullPivHouseholderQR<MatrixType> qr(m1);
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel());
+  VERIFY(!qr.isInjective());
+  VERIFY(!qr.isInvertible());
+  VERIFY(!qr.isSurjective());
+
+  MatrixType r = qr.matrixQR();
+  
+  MatrixQType q = qr.matrixQ();
+  VERIFY_IS_UNITARY(q);
+  
+  // FIXME need better way to construct trapezoid
+  for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) if(i>j) r(i,j) = Scalar(0);
+
+  MatrixType c = qr.matrixQ() * r * qr.colsPermutation().inverse();
+
+  VERIFY_IS_APPROX(m1, c);
+  
+  // stress the ReturnByValue mechanism
+  MatrixType tmp;
+  VERIFY_IS_APPROX(tmp.noalias() = qr.matrixQ() * r, (qr.matrixQ() * r).eval());
+  
+  check_solverbase<MatrixType, MatrixType>(m1, qr, rows, cols, cols2);
+
+  {
+    MatrixType m2, m3;
+    Index size = rows;
+    do {
+      m1 = MatrixType::Random(size,size);
+      qr.compute(m1);
+    } while(!qr.isInvertible());
+    MatrixType m1_inv = qr.inverse();
+    m3 = m1 * MatrixType::Random(size,cols2);
+    m2 = qr.solve(m3);
+    VERIFY_IS_APPROX(m2, m1_inv*m3);
+  }
+}
+
+template<typename MatrixType> void qr_invertible()
+{
+  using std::log;
+  using std::abs;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index max_size = numext::mini(50,EIGEN_TEST_MAX_SIZE);
+  Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
+  Index size = internal::random<Index>(min_size,max_size);
+
+  MatrixType m1(size, size), m2(size, size), m3(size, size);
+  m1 = MatrixType::Random(size,size);
+
+  if (internal::is_same<RealScalar,float>::value)
+  {
+    // let's build a matrix more stable to inverse
+    MatrixType a = MatrixType::Random(size,size*2);
+    m1 += a * a.adjoint();
+  }
+
+  FullPivHouseholderQR<MatrixType> qr(m1);
+  VERIFY(qr.isInjective());
+  VERIFY(qr.isInvertible());
+  VERIFY(qr.isSurjective());
+
+  check_solverbase<MatrixType, MatrixType>(m1, qr, size, size, size);
+
+  // now construct a matrix with prescribed determinant
+  m1.setZero();
+  for(int i = 0; i < size; i++) m1(i,i) = internal::random<Scalar>();
+  RealScalar absdet = abs(m1.diagonal().prod());
+  m3 = qr.matrixQ(); // get a unitary
+  m1 = m3 * m1 * m3;
+  qr.compute(m1);
+  VERIFY_IS_APPROX(absdet, qr.absDeterminant());
+  VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
+}
+
+template<typename MatrixType> void qr_verify_assert()
+{
+  MatrixType tmp;
+
+  FullPivHouseholderQR<MatrixType> qr;
+  VERIFY_RAISES_ASSERT(qr.matrixQR())
+  VERIFY_RAISES_ASSERT(qr.solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.matrixQ())
+  VERIFY_RAISES_ASSERT(qr.dimensionOfKernel())
+  VERIFY_RAISES_ASSERT(qr.isInjective())
+  VERIFY_RAISES_ASSERT(qr.isSurjective())
+  VERIFY_RAISES_ASSERT(qr.isInvertible())
+  VERIFY_RAISES_ASSERT(qr.inverse())
+  VERIFY_RAISES_ASSERT(qr.absDeterminant())
+  VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
+}
+
+EIGEN_DECLARE_TEST(qr_fullpivoting)
+{
+  for(int i = 0; i < 1; i++) {
+    CALL_SUBTEST_5( qr<Matrix3f>() );
+    CALL_SUBTEST_6( qr<Matrix3d>() );
+    CALL_SUBTEST_8( qr<Matrix2f>() );
+    CALL_SUBTEST_1( qr<MatrixXf>() );
+    CALL_SUBTEST_2( qr<MatrixXd>() );
+    CALL_SUBTEST_3( qr<MatrixXcd>() );
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( qr_invertible<MatrixXf>() );
+    CALL_SUBTEST_2( qr_invertible<MatrixXd>() );
+    CALL_SUBTEST_4( qr_invertible<MatrixXcf>() );
+    CALL_SUBTEST_3( qr_invertible<MatrixXcd>() );
+  }
+
+  CALL_SUBTEST_5(qr_verify_assert<Matrix3f>());
+  CALL_SUBTEST_6(qr_verify_assert<Matrix3d>());
+  CALL_SUBTEST_1(qr_verify_assert<MatrixXf>());
+  CALL_SUBTEST_2(qr_verify_assert<MatrixXd>());
+  CALL_SUBTEST_4(qr_verify_assert<MatrixXcf>());
+  CALL_SUBTEST_3(qr_verify_assert<MatrixXcd>());
+
+  // Test problem size constructors
+  CALL_SUBTEST_7(FullPivHouseholderQR<MatrixXf>(10, 20));
+  CALL_SUBTEST_7((FullPivHouseholderQR<Matrix<float,10,20> >(10,20)));
+  CALL_SUBTEST_7((FullPivHouseholderQR<Matrix<float,10,20> >(Matrix<float,10,20>::Random())));
+  CALL_SUBTEST_7((FullPivHouseholderQR<Matrix<float,20,10> >(20,10)));
+  CALL_SUBTEST_7((FullPivHouseholderQR<Matrix<float,20,10> >(Matrix<float,20,10>::Random())));
+}

diff --git a/test/qtvector.cpp b/test/qtvector.cpp
new file mode 100644
index 0000000..4ec79b1
--- /dev/null
+++ b/test/qtvector.cpp

@@ -0,0 +1,156 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_WORK_AROUND_QT_BUG_CALLING_WRONG_OPERATOR_NEW_FIXED_IN_QT_4_5
+
+#include "main.h"
+#include <QtCore/QVector>
+#include <Eigen/Geometry>
+#include <Eigen/QtAlignedMalloc>
+
+template<typename MatrixType>
+void check_qtvector_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  QVector<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], y);
+  }
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.fill(y,22);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i]==w[(i-23)%w.size()]);
+  }
+}
+
+template<typename TransformType>
+void check_qtvector_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random());
+  QVector<TransformType> v(10), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.fill(y,22);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(TransformType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; int(i)<v.size(); ++i)
+  {
+    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_qtvector_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
+  QVector<QuaternionType> v(10), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.fill(y,22);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(QuaternionType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; int(i)<v.size(); ++i)
+  {
+    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
+  }
+}
+
+EIGEN_DECLARE_TEST(qtvector)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST(check_qtvector_matrix(Vector2f()));
+  CALL_SUBTEST(check_qtvector_matrix(Matrix3f()));
+  CALL_SUBTEST(check_qtvector_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST(check_qtvector_matrix(Matrix2f()));
+  CALL_SUBTEST(check_qtvector_matrix(Vector4f()));
+  CALL_SUBTEST(check_qtvector_matrix(Matrix4f()));
+  CALL_SUBTEST(check_qtvector_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST(check_qtvector_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST(check_qtvector_matrix(VectorXd(20)));
+  CALL_SUBTEST(check_qtvector_matrix(RowVectorXf(20)));
+  CALL_SUBTEST(check_qtvector_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST(check_qtvector_transform(Affine2f()));
+  CALL_SUBTEST(check_qtvector_transform(Affine3f()));
+  CALL_SUBTEST(check_qtvector_transform(Affine3d()));
+  //CALL_SUBTEST(check_qtvector_transform(Transform4d()));
+
+  // some Quaternion
+  CALL_SUBTEST(check_qtvector_quaternion(Quaternionf()));
+  CALL_SUBTEST(check_qtvector_quaternion(Quaternionf()));
+}

diff --git a/test/rand.cpp b/test/rand.cpp
new file mode 100644
index 0000000..984c01f
--- /dev/null
+++ b/test/rand.cpp

@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+typedef long long int64;
+
+template<typename Scalar> Scalar check_in_range(Scalar x, Scalar y)
+{
+  Scalar r = internal::random<Scalar>(x,y);
+  VERIFY(r>=x);
+  if(y>=x)
+  {
+    VERIFY(r<=y);
+  }
+  return r;
+}
+
+template<typename Scalar> void check_all_in_range(Scalar x, Scalar y)
+{
+  Array<int,1,Dynamic> mask(y-x+1);
+  mask.fill(0);
+  long n = (y-x+1)*32;
+  for(long k=0; k<n; ++k)
+  {
+    mask( check_in_range(x,y)-x )++;
+  }
+  for(Index i=0; i<mask.size(); ++i)
+    if(mask(i)==0)
+      std::cout << "WARNING: value " << x+i << " not reached." << std::endl;
+  VERIFY( (mask>0).all() );
+}
+
+template<typename Scalar> void check_histogram(Scalar x, Scalar y, int bins)
+{
+  Array<int,1,Dynamic> hist(bins);
+  hist.fill(0);
+  int f = 100000;
+  int n = bins*f;
+  int64 range = int64(y)-int64(x);
+  int divisor = int((range+1)/bins);
+  assert(((range+1)%bins)==0);
+  for(int k=0; k<n; ++k)
+  {
+    Scalar r = check_in_range(x,y);
+    hist( int((int64(r)-int64(x))/divisor) )++;
+  }
+  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.03).all() );
+}
+
+EIGEN_DECLARE_TEST(rand)
+{
+  long long_ref = NumTraits<long>::highest()/10;
+  signed char char_offset = (std::min)(g_repeat,64);
+  signed char short_offset = (std::min)(g_repeat,16000);
+
+  for(int i = 0; i < g_repeat*10000; i++) {
+    CALL_SUBTEST(check_in_range<float>(10,11));
+    CALL_SUBTEST(check_in_range<float>(1.24234523,1.24234523));
+    CALL_SUBTEST(check_in_range<float>(-1,1));
+    CALL_SUBTEST(check_in_range<float>(-1432.2352,-1432.2352));
+
+    CALL_SUBTEST(check_in_range<double>(10,11));
+    CALL_SUBTEST(check_in_range<double>(1.24234523,1.24234523));
+    CALL_SUBTEST(check_in_range<double>(-1,1));
+    CALL_SUBTEST(check_in_range<double>(-1432.2352,-1432.2352));
+
+    CALL_SUBTEST(check_in_range<int>(0,-1));
+    CALL_SUBTEST(check_in_range<short>(0,-1));
+    CALL_SUBTEST(check_in_range<long>(0,-1));
+    CALL_SUBTEST(check_in_range<int>(-673456,673456));
+    CALL_SUBTEST(check_in_range<int>(-RAND_MAX+10,RAND_MAX-10));
+    CALL_SUBTEST(check_in_range<short>(-24345,24345));
+    CALL_SUBTEST(check_in_range<long>(-long_ref,long_ref));
+  }
+
+  CALL_SUBTEST(check_all_in_range<signed char>(11,11));
+  CALL_SUBTEST(check_all_in_range<signed char>(11,11+char_offset));
+  CALL_SUBTEST(check_all_in_range<signed char>(-5,5));
+  CALL_SUBTEST(check_all_in_range<signed char>(-11-char_offset,-11));
+  CALL_SUBTEST(check_all_in_range<signed char>(-126,-126+char_offset));
+  CALL_SUBTEST(check_all_in_range<signed char>(126-char_offset,126));
+  CALL_SUBTEST(check_all_in_range<signed char>(-126,126));
+
+  CALL_SUBTEST(check_all_in_range<short>(11,11));
+  CALL_SUBTEST(check_all_in_range<short>(11,11+short_offset));
+  CALL_SUBTEST(check_all_in_range<short>(-5,5));
+  CALL_SUBTEST(check_all_in_range<short>(-11-short_offset,-11));
+  CALL_SUBTEST(check_all_in_range<short>(-24345,-24345+short_offset));
+  CALL_SUBTEST(check_all_in_range<short>(24345,24345+short_offset));
+
+  CALL_SUBTEST(check_all_in_range<int>(11,11));
+  CALL_SUBTEST(check_all_in_range<int>(11,11+g_repeat));
+  CALL_SUBTEST(check_all_in_range<int>(-5,5));
+  CALL_SUBTEST(check_all_in_range<int>(-11-g_repeat,-11));
+  CALL_SUBTEST(check_all_in_range<int>(-673456,-673456+g_repeat));
+  CALL_SUBTEST(check_all_in_range<int>(673456,673456+g_repeat));
+
+  CALL_SUBTEST(check_all_in_range<long>(11,11));
+  CALL_SUBTEST(check_all_in_range<long>(11,11+g_repeat));
+  CALL_SUBTEST(check_all_in_range<long>(-5,5));
+  CALL_SUBTEST(check_all_in_range<long>(-11-g_repeat,-11));
+  CALL_SUBTEST(check_all_in_range<long>(-long_ref,-long_ref+g_repeat));
+  CALL_SUBTEST(check_all_in_range<long>( long_ref, long_ref+g_repeat));
+
+  CALL_SUBTEST(check_histogram<int>(-5,5,11));
+  int bins = 100;
+  CALL_SUBTEST(check_histogram<int>(-3333,-3333+bins*(3333/bins)-1,bins));
+  bins = 1000;
+  CALL_SUBTEST(check_histogram<int>(-RAND_MAX+10,-RAND_MAX+10+bins*(RAND_MAX/bins)-1,bins));
+  CALL_SUBTEST(check_histogram<int>(-RAND_MAX+10,-int64(RAND_MAX)+10+bins*(2*int64(RAND_MAX)/bins)-1,bins));
+}

diff --git a/test/random_without_cast_overflow.h b/test/random_without_cast_overflow.h
new file mode 100644
index 0000000..0003451
--- /dev/null
+++ b/test/random_without_cast_overflow.h

@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 C. Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Utilities for generating random numbers without overflows, which might
+// otherwise result in undefined behavior.
+
+namespace Eigen {
+namespace internal {
+
+// Default implementation assuming SrcScalar fits into TgtScalar.
+template <typename SrcScalar, typename TgtScalar, typename EnableIf = void>
+struct random_without_cast_overflow {
+  static SrcScalar value() { return internal::random<SrcScalar>(); }
+};
+
+// Signed to unsigned integer widening cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger &&
+                                 !NumTraits<TgtScalar>::IsSigned &&
+                                 (std::numeric_limits<SrcScalar>::digits < std::numeric_limits<TgtScalar>::digits ||
+                                  (std::numeric_limits<SrcScalar>::digits == std::numeric_limits<TgtScalar>::digits &&
+                                   NumTraits<SrcScalar>::IsSigned))>::type> {
+  static SrcScalar value() {
+    SrcScalar a = internal::random<SrcScalar>();
+    return a < SrcScalar(0) ? -(a + 1) : a;
+  }
+};
+
+// Integer to unsigned narrowing cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger && !NumTraits<SrcScalar>::IsSigned &&
+        (std::numeric_limits<SrcScalar>::digits > std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() {
+    TgtScalar b = internal::random<TgtScalar>();
+    return static_cast<SrcScalar>(b < TgtScalar(0) ? -(b + 1) : b);
+  }
+};
+
+// Integer to signed narrowing cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger && NumTraits<SrcScalar>::IsSigned &&
+        (std::numeric_limits<SrcScalar>::digits > std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() { return static_cast<SrcScalar>(internal::random<TgtScalar>()); }
+};
+
+// Unsigned to signed integer narrowing cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger &&
+                                 !NumTraits<SrcScalar>::IsSigned && NumTraits<TgtScalar>::IsSigned &&
+                                 (std::numeric_limits<SrcScalar>::digits ==
+                                  std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() { return internal::random<SrcScalar>() / 2; }
+};
+
+// Floating-point to integer, full precision.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        !NumTraits<SrcScalar>::IsInteger && !NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsInteger &&
+        (std::numeric_limits<TgtScalar>::digits <= std::numeric_limits<SrcScalar>::digits)>::type> {
+  static SrcScalar value() { return static_cast<SrcScalar>(internal::random<TgtScalar>()); }
+};
+
+// Floating-point to integer, narrowing precision.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        !NumTraits<SrcScalar>::IsInteger && !NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsInteger &&
+        (std::numeric_limits<TgtScalar>::digits > std::numeric_limits<SrcScalar>::digits)>::type> {
+  static SrcScalar value() {
+    // NOTE: internal::random<T>() is limited by RAND_MAX, so random<int64_t> is always within that range.
+    // This prevents us from simply shifting bits, which would result in only 0 or -1.
+    // Instead, keep least-significant K bits and sign.
+    static const TgtScalar KeepMask = (static_cast<TgtScalar>(1) << std::numeric_limits<SrcScalar>::digits) - 1;
+    const TgtScalar a = internal::random<TgtScalar>();
+    return static_cast<SrcScalar>(a > TgtScalar(0) ? (a & KeepMask) : -(a & KeepMask));
+  }
+};
+
+// Integer to floating-point, re-use above logic.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsInteger && !NumTraits<TgtScalar>::IsInteger &&
+                                 !NumTraits<TgtScalar>::IsComplex>::type> {
+  static SrcScalar value() {
+    return static_cast<SrcScalar>(random_without_cast_overflow<TgtScalar, SrcScalar>::value());
+  }
+};
+
+// Floating-point narrowing conversion.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<!NumTraits<SrcScalar>::IsInteger && !NumTraits<SrcScalar>::IsComplex &&
+                                 !NumTraits<TgtScalar>::IsInteger && !NumTraits<TgtScalar>::IsComplex &&
+                                 (std::numeric_limits<SrcScalar>::digits >
+                                  std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() { return static_cast<SrcScalar>(internal::random<TgtScalar>()); }
+};
+
+// Complex to non-complex.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsComplex && !NumTraits<TgtScalar>::IsComplex>::type> {
+  typedef typename NumTraits<SrcScalar>::Real SrcReal;
+  static SrcScalar value() { return SrcScalar(random_without_cast_overflow<SrcReal, TgtScalar>::value(), 0); }
+};
+
+// Non-complex to complex.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<!NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsComplex>::type> {
+  typedef typename NumTraits<TgtScalar>::Real TgtReal;
+  static SrcScalar value() { return random_without_cast_overflow<SrcScalar, TgtReal>::value(); }
+};
+
+// Complex to complex.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsComplex>::type> {
+  typedef typename NumTraits<SrcScalar>::Real SrcReal;
+  typedef typename NumTraits<TgtScalar>::Real TgtReal;
+  static SrcScalar value() {
+    return SrcScalar(random_without_cast_overflow<SrcReal, TgtReal>::value(),
+                     random_without_cast_overflow<SrcReal, TgtReal>::value());
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen

diff --git a/test/real_qz.cpp b/test/real_qz.cpp
new file mode 100644
index 0000000..1cf7aba
--- /dev/null
+++ b/test/real_qz.cpp

@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Alexey Korepanov <kaikaikai@yandex.ru>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_RUNTIME_NO_MALLOC
+#include "main.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+
+template<typename MatrixType> void real_qz(const MatrixType& m)
+{
+  /* this test covers the following files:
+     RealQZ.h
+  */
+  using std::abs;
+  typedef typename MatrixType::Scalar Scalar;
+  
+  Index dim = m.cols();
+  
+  MatrixType A = MatrixType::Random(dim,dim),
+             B = MatrixType::Random(dim,dim);
+
+
+  // Regression test for bug 985: Randomly set rows or columns to zero
+  Index k=internal::random<Index>(0, dim-1);
+  switch(internal::random<int>(0,10)) {
+  case 0:
+    A.row(k).setZero(); break;
+  case 1:
+    A.col(k).setZero(); break;
+  case 2:
+    B.row(k).setZero(); break;
+  case 3:
+    B.col(k).setZero(); break;
+  default:
+    break;
+  }
+
+  RealQZ<MatrixType> qz(dim);
+  // TODO enable full-prealocation of required memory, this probably requires an in-place mode for HessenbergDecomposition
+  //Eigen::internal::set_is_malloc_allowed(false);
+  qz.compute(A,B);
+  //Eigen::internal::set_is_malloc_allowed(true);
+  
+  VERIFY_IS_EQUAL(qz.info(), Success);
+  // check for zeros
+  bool all_zeros = true;
+  for (Index i=0; i<A.cols(); i++)
+    for (Index j=0; j<i; j++) {
+      if (abs(qz.matrixT()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: T(" << i << "," << j << ") = " << qz.matrixT()(i,j) << std::endl;
+        all_zeros = false;
+      }
+      if (j<i-1 && abs(qz.matrixS()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j) << std::endl;
+        all_zeros = false;
+      }
+      if (j==i-1 && j>0 && abs(qz.matrixS()(i,j))!=Scalar(0.0) && abs(qz.matrixS()(i-1,j-1))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j)  << " && S(" << i-1 << "," << j-1 << ") = " << qz.matrixS()(i-1,j-1) << std::endl;
+        all_zeros = false;
+      }
+    }
+  VERIFY_IS_EQUAL(all_zeros, true);
+  VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixS()*qz.matrixZ(), A);
+  VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixT()*qz.matrixZ(), B);
+  VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixQ().adjoint(), MatrixType::Identity(dim,dim));
+  VERIFY_IS_APPROX(qz.matrixZ()*qz.matrixZ().adjoint(), MatrixType::Identity(dim,dim));
+}
+
+EIGEN_DECLARE_TEST(real_qz)
+{
+  int s = 0;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( real_qz(Matrix4f()) );
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_2( real_qz(MatrixXd(s,s)) );
+
+    // some trivial but implementation-wise tricky cases
+    CALL_SUBTEST_2( real_qz(MatrixXd(1,1)) );
+    CALL_SUBTEST_2( real_qz(MatrixXd(2,2)) );
+    CALL_SUBTEST_3( real_qz(Matrix<double,1,1>()) );
+    CALL_SUBTEST_4( real_qz(Matrix2d()) );
+  }
+  
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
+}

diff --git a/test/redux.cpp b/test/redux.cpp
new file mode 100644
index 0000000..fdbab77
--- /dev/null
+++ b/test/redux.cpp

@@ -0,0 +1,183 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+// ^^ see bug 1449
+
+#include "main.h"
+
+template<typename MatrixType> void matrixRedux(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols);
+
+  // The entries of m1 are uniformly distributed in [0,1], so m1.prod() is very small. This may lead to test
+  // failures if we underflow into denormals. Thus, we scale so that entries are close to 1.
+  MatrixType m1_for_prod = MatrixType::Ones(rows, cols) + RealScalar(0.2) * m1;
+
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> m2(rows,rows);
+  m2.setRandom();
+
+  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
+  VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), Scalar(float(rows*cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
+  Scalar s(0), p(1), minc(numext::real(m1.coeff(0))), maxc(numext::real(m1.coeff(0)));
+  for(int j = 0; j < cols; j++)
+  for(int i = 0; i < rows; i++)
+  {
+    s += m1(i,j);
+    p *= m1_for_prod(i,j);
+    minc = (std::min)(numext::real(minc), numext::real(m1(i,j)));
+    maxc = (std::max)(numext::real(maxc), numext::real(m1(i,j)));
+  }
+  const Scalar mean = s/Scalar(RealScalar(rows*cols));
+
+  VERIFY_IS_APPROX(m1.sum(), s);
+  VERIFY_IS_APPROX(m1.mean(), mean);
+  VERIFY_IS_APPROX(m1_for_prod.prod(), p);
+  VERIFY_IS_APPROX(m1.real().minCoeff(), numext::real(minc));
+  VERIFY_IS_APPROX(m1.real().maxCoeff(), numext::real(maxc));
+  
+  // test that partial reduction works if nested expressions is forced to evaluate early
+  VERIFY_IS_APPROX((m1.matrix() * m1.matrix().transpose())       .cwiseProduct(m2.matrix()).rowwise().sum().sum(), 
+                   (m1.matrix() * m1.matrix().transpose()).eval().cwiseProduct(m2.matrix()).rowwise().sum().sum());
+
+  // test slice vectorization assuming assign is ok
+  Index r0 = internal::random<Index>(0,rows-1);
+  Index c0 = internal::random<Index>(0,cols-1);
+  Index r1 = internal::random<Index>(r0+1,rows)-r0;
+  Index c1 = internal::random<Index>(c0+1,cols)-c0;
+  VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).sum(), m1.block(r0,c0,r1,c1).eval().sum());
+  VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).mean(), m1.block(r0,c0,r1,c1).eval().mean());
+  VERIFY_IS_APPROX(m1_for_prod.block(r0,c0,r1,c1).prod(), m1_for_prod.block(r0,c0,r1,c1).eval().prod());
+  VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).real().minCoeff(), m1.block(r0,c0,r1,c1).real().eval().minCoeff());
+  VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).real().maxCoeff(), m1.block(r0,c0,r1,c1).real().eval().maxCoeff());
+
+  // regression for bug 1090
+  const int R1 = MatrixType::RowsAtCompileTime>=2 ? MatrixType::RowsAtCompileTime/2 : 6;
+  const int C1 = MatrixType::ColsAtCompileTime>=2 ? MatrixType::ColsAtCompileTime/2 : 6;
+  if(R1<=rows-r0 && C1<=cols-c0)
+  {
+    VERIFY_IS_APPROX( (m1.template block<R1,C1>(r0,c0).sum()), m1.block(r0,c0,R1,C1).sum() );
+  }
+  
+  // test empty objects
+  VERIFY_IS_APPROX(m1.block(r0,c0,0,0).sum(),   Scalar(0));
+  VERIFY_IS_APPROX(m1.block(r0,c0,0,0).prod(),  Scalar(1));
+
+  // test nesting complex expression
+  VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1) );
+  VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(),(MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1));
+}
+
+template<typename VectorType> void vectorRedux(const VectorType& w)
+{
+  using std::abs;
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index size = w.size();
+
+  VectorType v = VectorType::Random(size);
+  VectorType v_for_prod = VectorType::Ones(size) + Scalar(0.2) * v; // see comment above declaration of m1_for_prod
+
+  for(int i = 1; i < size; i++)
+  {
+    Scalar s(0), p(1);
+    RealScalar minc(numext::real(v.coeff(0))), maxc(numext::real(v.coeff(0)));
+    for(int j = 0; j < i; j++)
+    {
+      s += v[j];
+      p *= v_for_prod[j];
+      minc = (std::min)(minc, numext::real(v[j]));
+      maxc = (std::max)(maxc, numext::real(v[j]));
+    }
+    VERIFY_IS_MUCH_SMALLER_THAN(abs(s - v.head(i).sum()), Scalar(1));
+    VERIFY_IS_APPROX(p, v_for_prod.head(i).prod());
+    VERIFY_IS_APPROX(minc, v.real().head(i).minCoeff());
+    VERIFY_IS_APPROX(maxc, v.real().head(i).maxCoeff());
+  }
+
+  for(int i = 0; i < size-1; i++)
+  {
+    Scalar s(0), p(1);
+    RealScalar minc(numext::real(v.coeff(i))), maxc(numext::real(v.coeff(i)));
+    for(int j = i; j < size; j++)
+    {
+      s += v[j];
+      p *= v_for_prod[j];
+      minc = (std::min)(minc, numext::real(v[j]));
+      maxc = (std::max)(maxc, numext::real(v[j]));
+    }
+    VERIFY_IS_MUCH_SMALLER_THAN(abs(s - v.tail(size-i).sum()), Scalar(1));
+    VERIFY_IS_APPROX(p, v_for_prod.tail(size-i).prod());
+    VERIFY_IS_APPROX(minc, v.real().tail(size-i).minCoeff());
+    VERIFY_IS_APPROX(maxc, v.real().tail(size-i).maxCoeff());
+  }
+
+  for(int i = 0; i < size/2; i++)
+  {
+    Scalar s(0), p(1);
+    RealScalar minc(numext::real(v.coeff(i))), maxc(numext::real(v.coeff(i)));
+    for(int j = i; j < size-i; j++)
+    {
+      s += v[j];
+      p *= v_for_prod[j];
+      minc = (std::min)(minc, numext::real(v[j]));
+      maxc = (std::max)(maxc, numext::real(v[j]));
+    }
+    VERIFY_IS_MUCH_SMALLER_THAN(abs(s - v.segment(i, size-2*i).sum()), Scalar(1));
+    VERIFY_IS_APPROX(p, v_for_prod.segment(i, size-2*i).prod());
+    VERIFY_IS_APPROX(minc, v.real().segment(i, size-2*i).minCoeff());
+    VERIFY_IS_APPROX(maxc, v.real().segment(i, size-2*i).maxCoeff());
+  }
+  
+  // test empty objects
+  VERIFY_IS_APPROX(v.head(0).sum(),   Scalar(0));
+  VERIFY_IS_APPROX(v.tail(0).prod(),  Scalar(1));
+  VERIFY_RAISES_ASSERT(v.head(0).mean());
+  VERIFY_RAISES_ASSERT(v.head(0).minCoeff());
+  VERIFY_RAISES_ASSERT(v.head(0).maxCoeff());
+}
+
+EIGEN_DECLARE_TEST(redux)
+{
+  // the max size cannot be too large, otherwise reduxion operations obviously generate large errors.
+  int maxsize = (std::min)(100,EIGEN_TEST_MAX_SIZE);
+  TEST_SET_BUT_UNUSED_VARIABLE(maxsize);
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( matrixRedux(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( matrixRedux(Array<float, 1, 1>()) );
+    CALL_SUBTEST_2( matrixRedux(Matrix2f()) );
+    CALL_SUBTEST_2( matrixRedux(Array2f()) );
+    CALL_SUBTEST_2( matrixRedux(Array22f()) );
+    CALL_SUBTEST_3( matrixRedux(Matrix4d()) );
+    CALL_SUBTEST_3( matrixRedux(Array4d()) );
+    CALL_SUBTEST_3( matrixRedux(Array44d()) );
+    CALL_SUBTEST_4( matrixRedux(MatrixXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_4( matrixRedux(ArrayXXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_5( matrixRedux(MatrixXd (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_5( matrixRedux(ArrayXXd (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_6( matrixRedux(MatrixXi (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_6( matrixRedux(ArrayXXi (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_7( vectorRedux(Vector4f()) );
+    CALL_SUBTEST_7( vectorRedux(Array4f()) );
+    CALL_SUBTEST_5( vectorRedux(VectorXd(internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_5( vectorRedux(ArrayXd(internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_8( vectorRedux(VectorXf(internal::random<int>(1,maxsize))) );
+    CALL_SUBTEST_8( vectorRedux(ArrayXf(internal::random<int>(1,maxsize))) );
+  }
+}

diff --git a/test/ref.cpp b/test/ref.cpp
new file mode 100644
index 0000000..63eb65e
--- /dev/null
+++ b/test/ref.cpp

@@ -0,0 +1,360 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+#define TEST_CHECK_STATIC_ASSERTIONS
+#include "main.h"
+
+// test Ref.h
+
+// Deal with i387 extended precision
+#if EIGEN_ARCH_i386 && !(EIGEN_ARCH_x86_64)
+
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(4,4)
+#pragma GCC optimize ("-ffloat-store")
+#else
+#undef VERIFY_IS_EQUAL
+#define VERIFY_IS_EQUAL(X,Y) VERIFY_IS_APPROX(X,Y)
+#endif
+
+#endif
+
+template<typename MatrixType> void ref_matrix(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic,MatrixType::Options> DynMatrixType;
+  typedef Matrix<RealScalar,Dynamic,Dynamic,MatrixType::Options> RealDynMatrixType;
+  
+  typedef Ref<MatrixType> RefMat;
+  typedef Ref<DynMatrixType> RefDynMat;
+  typedef Ref<const DynMatrixType> ConstRefDynMat;
+  typedef Ref<RealDynMatrixType , 0, Stride<Dynamic,Dynamic> > RefRealMatWithStride;
+
+  Index rows = m.rows(), cols = m.cols();
+  
+  MatrixType  m1 = MatrixType::Random(rows, cols),
+              m2 = m1;
+  
+  Index i = internal::random<Index>(0,rows-1);
+  Index j = internal::random<Index>(0,cols-1);
+  Index brows = internal::random<Index>(1,rows-i);
+  Index bcols = internal::random<Index>(1,cols-j);
+  
+  RefMat rm0 = m1;
+  VERIFY_IS_EQUAL(rm0, m1);
+  RefDynMat rm1 = m1;
+  VERIFY_IS_EQUAL(rm1, m1);
+  RefDynMat rm2 = m1.block(i,j,brows,bcols);
+  VERIFY_IS_EQUAL(rm2, m1.block(i,j,brows,bcols));
+  rm2.setOnes();
+  m2.block(i,j,brows,bcols).setOnes();
+  VERIFY_IS_EQUAL(m1, m2);
+  
+  m2.block(i,j,brows,bcols).setRandom();
+  rm2 = m2.block(i,j,brows,bcols);
+  VERIFY_IS_EQUAL(m1, m2);
+  
+  ConstRefDynMat rm3 = m1.block(i,j,brows,bcols);
+  m1.block(i,j,brows,bcols) *= 2;
+  m2.block(i,j,brows,bcols) *= 2;
+  VERIFY_IS_EQUAL(rm3, m2.block(i,j,brows,bcols));
+  RefRealMatWithStride rm4 = m1.real();
+  VERIFY_IS_EQUAL(rm4, m2.real());
+  rm4.array() += 1;
+  m2.real().array() += 1;
+  VERIFY_IS_EQUAL(m1, m2);
+}
+
+template<typename VectorType> void ref_vector(const VectorType& m)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename VectorType::RealScalar RealScalar;
+  typedef Matrix<Scalar,Dynamic,1,VectorType::Options> DynMatrixType;
+  typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> MatrixType;
+  typedef Matrix<RealScalar,Dynamic,1,VectorType::Options> RealDynMatrixType;
+  
+  typedef Ref<VectorType> RefMat;
+  typedef Ref<DynMatrixType> RefDynMat;
+  typedef Ref<const DynMatrixType> ConstRefDynMat;
+  typedef Ref<RealDynMatrixType , 0, InnerStride<> > RefRealMatWithStride;
+  typedef Ref<DynMatrixType , 0, InnerStride<> > RefMatWithStride;
+
+  Index size = m.size();
+  
+  VectorType  v1 = VectorType::Random(size),
+              v2 = v1;
+  MatrixType mat1 = MatrixType::Random(size,size),
+             mat2 = mat1,
+             mat3 = MatrixType::Random(size,size);
+  
+  Index i = internal::random<Index>(0,size-1);
+  Index bsize = internal::random<Index>(1,size-i);
+  
+  { RefMat    rm0 = v1;                   VERIFY_IS_EQUAL(rm0, v1); }
+  { RefMat    rm0 = v1.block(0,0,size,1); VERIFY_IS_EQUAL(rm0, v1); }
+  { RefDynMat rv1 = v1;                   VERIFY_IS_EQUAL(rv1, v1); }
+  { RefDynMat rv1 = v1.block(0,0,size,1); VERIFY_IS_EQUAL(rv1, v1); }
+  { VERIFY_RAISES_ASSERT( RefMat    rm0 = v1.block(0, 0, size, 0); EIGEN_UNUSED_VARIABLE(rm0); ); }
+  if(VectorType::SizeAtCompileTime!=1)
+  { VERIFY_RAISES_ASSERT( RefDynMat rv1 = v1.block(0, 0, size, 0); EIGEN_UNUSED_VARIABLE(rv1); ); }
+
+  RefDynMat rv2 = v1.segment(i,bsize);
+  VERIFY_IS_EQUAL(rv2, v1.segment(i,bsize));
+  rv2.setOnes();
+  v2.segment(i,bsize).setOnes();
+  VERIFY_IS_EQUAL(v1, v2);
+  
+  v2.segment(i,bsize).setRandom();
+  rv2 = v2.segment(i,bsize);
+  VERIFY_IS_EQUAL(v1, v2);
+  
+  ConstRefDynMat rm3 = v1.segment(i,bsize);
+  v1.segment(i,bsize) *= 2;
+  v2.segment(i,bsize) *= 2;
+  VERIFY_IS_EQUAL(rm3, v2.segment(i,bsize));
+  
+  RefRealMatWithStride rm4 = v1.real();
+  VERIFY_IS_EQUAL(rm4, v2.real());
+  rm4.array() += 1;
+  v2.real().array() += 1;
+  VERIFY_IS_EQUAL(v1, v2);
+  
+  RefMatWithStride rm5 = mat1.row(i).transpose();
+  VERIFY_IS_EQUAL(rm5, mat1.row(i).transpose());
+  rm5.array() += 1;
+  mat2.row(i).array() += 1;
+  VERIFY_IS_EQUAL(mat1, mat2);
+  rm5.noalias() = rm4.transpose() * mat3;
+  mat2.row(i) = v2.real().transpose() * mat3;
+  VERIFY_IS_APPROX(mat1, mat2);
+}
+
+template<typename Scalar, int Rows, int Cols>
+void ref_vector_fixed_sizes()
+{
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMajorMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMajorMatrixType;
+  typedef Matrix<Scalar,1,Cols> RowVectorType;
+  typedef Matrix<Scalar,Rows,1> ColVectorType;
+  typedef Matrix<Scalar,Cols,1> RowVectorTransposeType;
+  typedef Matrix<Scalar,1,Rows> ColVectorTransposeType;
+  typedef Stride<Dynamic, Dynamic> DynamicStride;
+
+  RowMajorMatrixType mr = RowMajorMatrixType::Random();
+  ColMajorMatrixType mc = ColMajorMatrixType::Random();
+
+  Index i = internal::random<Index>(0,Rows-1);
+  Index j = internal::random<Index>(0,Cols-1);
+
+  // Reference ith row.
+  Ref<RowVectorType, 0, DynamicStride> mr_ri = mr.row(i);
+  VERIFY_IS_EQUAL(mr_ri, mr.row(i));
+  Ref<RowVectorType, 0, DynamicStride> mc_ri = mc.row(i);
+  VERIFY_IS_EQUAL(mc_ri, mc.row(i));
+
+  // Reference jth col.
+  Ref<ColVectorType, 0, DynamicStride> mr_cj = mr.col(j);
+  VERIFY_IS_EQUAL(mr_cj, mr.col(j));
+  Ref<ColVectorType, 0, DynamicStride> mc_cj = mc.col(j);
+  VERIFY_IS_EQUAL(mc_cj, mc.col(j));
+
+  // Reference the transpose of row i.
+  Ref<RowVectorTransposeType, 0, DynamicStride> mr_rit = mr.row(i);
+  VERIFY_IS_EQUAL(mr_rit, mr.row(i).transpose());
+  Ref<RowVectorTransposeType, 0, DynamicStride> mc_rit = mc.row(i);
+  VERIFY_IS_EQUAL(mc_rit, mc.row(i).transpose());
+
+  // Reference the transpose of col j.
+  Ref<ColVectorTransposeType, 0, DynamicStride> mr_cjt = mr.col(j);
+  VERIFY_IS_EQUAL(mr_cjt, mr.col(j).transpose());
+  Ref<ColVectorTransposeType, 0, DynamicStride> mc_cjt = mc.col(j);
+  VERIFY_IS_EQUAL(mc_cjt, mc.col(j).transpose());
+  
+  // Const references without strides.
+  Ref<const RowVectorType> cmr_ri = mr.row(i);
+  VERIFY_IS_EQUAL(cmr_ri, mr.row(i));
+  Ref<const RowVectorType> cmc_ri = mc.row(i);
+  VERIFY_IS_EQUAL(cmc_ri, mc.row(i));
+
+  Ref<const ColVectorType> cmr_cj = mr.col(j);
+  VERIFY_IS_EQUAL(cmr_cj, mr.col(j));
+  Ref<const ColVectorType> cmc_cj = mc.col(j);
+  VERIFY_IS_EQUAL(cmc_cj, mc.col(j));
+
+  Ref<const RowVectorTransposeType> cmr_rit = mr.row(i);
+  VERIFY_IS_EQUAL(cmr_rit, mr.row(i).transpose());
+  Ref<const RowVectorTransposeType> cmc_rit = mc.row(i);
+  VERIFY_IS_EQUAL(cmc_rit, mc.row(i).transpose());
+
+  Ref<const ColVectorTransposeType> cmr_cjt = mr.col(j);
+  VERIFY_IS_EQUAL(cmr_cjt, mr.col(j).transpose());
+  Ref<const ColVectorTransposeType> cmc_cjt = mc.col(j);
+  VERIFY_IS_EQUAL(cmc_cjt, mc.col(j).transpose());
+}
+
+template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
+{
+  // verify that ref-to-const don't have LvalueBit
+  typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
+  VERIFY( !(internal::traits<Ref<ConstPlainObjectType> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Ref<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) );
+  VERIFY( !(Ref<ConstPlainObjectType>::Flags & LvalueBit) );
+  VERIFY( !(Ref<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
+}
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_1(Ref<VectorXf> a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_2(const Ref<const VectorXf>& a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_3(Ref<VectorXf,0,InnerStride<> > a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_4(const Ref<const VectorXf,0,InnerStride<> >& a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_5(Ref<MatrixXf,0,OuterStride<> > a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_6(const Ref<const MatrixXf,0,OuterStride<> >& a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_7(Ref<Matrix<float,Dynamic,3> > a, const B &b) { VERIFY_IS_EQUAL(a,b); }
+
+void call_ref()
+{
+  VectorXcf ca  = VectorXcf::Random(10);
+  VectorXf a    = VectorXf::Random(10);
+  RowVectorXf b = RowVectorXf::Random(10);
+  MatrixXf A    = MatrixXf::Random(10,10);
+  RowVector3f c = RowVector3f::Random();
+  const VectorXf& ac(a);
+  VectorBlock<VectorXf> ab(a,0,3);
+  const VectorBlock<VectorXf> abc(a,0,3);
+  
+
+  VERIFY_EVALUATION_COUNT( call_ref_1(a,a), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_1(b,b.transpose()), 0);
+//   call_ref_1(ac,a<c);           // does not compile because ac is const
+  VERIFY_EVALUATION_COUNT( call_ref_1(ab,ab), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_1(a.head(4),a.head(4)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_1(abc,abc), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_1(A.col(3),A.col(3)), 0);
+//   call_ref_1(A.row(3),A.row(3));    // does not compile because innerstride!=1
+  VERIFY_EVALUATION_COUNT( call_ref_3(A.row(3),A.row(3).transpose()), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(A.row(3),A.row(3).transpose()), 0);
+//   call_ref_1(a+a, a+a);          // does not compile for obvious reason
+
+  MatrixXf tmp = A*A.col(1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A*A.col(1), tmp), 1);     // evaluated into a temp
+  VERIFY_EVALUATION_COUNT( call_ref_2(ac.head(5),ac.head(5)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(ac,ac), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(a,a), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(ab,ab), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(a.head(4),a.head(4)), 0);
+  tmp = a+a;
+  VERIFY_EVALUATION_COUNT( call_ref_2(a+a,tmp), 1);            // evaluated into a temp
+  VERIFY_EVALUATION_COUNT( call_ref_2(ca.imag(),ca.imag()), 1);      // evaluated into a temp
+
+  VERIFY_EVALUATION_COUNT( call_ref_4(ac.head(5),ac.head(5)), 0);
+  tmp = a+a;
+  VERIFY_EVALUATION_COUNT( call_ref_4(a+a,tmp), 1);           // evaluated into a temp
+  VERIFY_EVALUATION_COUNT( call_ref_4(ca.imag(),ca.imag()), 0);
+
+  VERIFY_EVALUATION_COUNT( call_ref_5(a,a), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(a.head(3),a.head(3)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(A,A), 0);
+//   call_ref_5(A.transpose(),A.transpose());   // does not compile because storage order does not match
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.block(1,1,2,2),A.block(1,1,2,2)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(b,b), 0);             // storage order do not match, but this is a degenerate case that should work
+  VERIFY_EVALUATION_COUNT( call_ref_5(a.row(3),a.row(3)), 0);
+
+  VERIFY_EVALUATION_COUNT( call_ref_6(a,a), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_6(a.head(3),a.head(3)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_6(A.row(3),A.row(3)), 1);           // evaluated into a temp thouth it could be avoided by viewing it as a 1xn matrix
+  tmp = A+A;
+  VERIFY_EVALUATION_COUNT( call_ref_6(A+A,tmp), 1);                // evaluated into a temp
+  VERIFY_EVALUATION_COUNT( call_ref_6(A,A), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_6(A.transpose(),A.transpose()), 1);      // evaluated into a temp because the storage orders do not match
+  VERIFY_EVALUATION_COUNT( call_ref_6(A.block(1,1,2,2),A.block(1,1,2,2)), 0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0);
+}
+
+typedef Matrix<double,Dynamic,Dynamic,RowMajor> RowMatrixXd;
+int test_ref_overload_fun1(Ref<MatrixXd> )       { return 1; }
+int test_ref_overload_fun1(Ref<RowMatrixXd> )    { return 2; }
+int test_ref_overload_fun1(Ref<MatrixXf> )       { return 3; }
+
+int test_ref_overload_fun2(Ref<const MatrixXd> ) { return 4; }
+int test_ref_overload_fun2(Ref<const MatrixXf> ) { return 5; }
+
+void test_ref_ambiguous(const Ref<const ArrayXd> &A, Ref<ArrayXd> B)
+{
+  B = A;
+  B = A - A;
+}
+
+// See also bug 969
+void test_ref_overloads()
+{
+  MatrixXd Ad, Bd;
+  RowMatrixXd rAd, rBd;
+  VERIFY( test_ref_overload_fun1(Ad)==1 );
+  VERIFY( test_ref_overload_fun1(rAd)==2 );
+  
+  MatrixXf Af, Bf;
+  VERIFY( test_ref_overload_fun2(Ad)==4 );
+  VERIFY( test_ref_overload_fun2(Ad+Bd)==4 );
+  VERIFY( test_ref_overload_fun2(Af+Bf)==5 );
+  
+  ArrayXd A, B;
+  test_ref_ambiguous(A, B);
+}
+
+void test_ref_fixed_size_assert()
+{
+  Vector4f v4 = Vector4f::Random();
+  VectorXf vx = VectorXf::Random(10);
+  VERIFY_RAISES_STATIC_ASSERT( Ref<Vector3f> y = v4; (void)y; );
+  VERIFY_RAISES_STATIC_ASSERT( Ref<Vector3f> y = vx.head<4>(); (void)y; );
+  VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = v4; (void)y; );
+  VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = vx.head<4>(); (void)y; );
+  VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = 2*v4; (void)y; );
+}
+
+EIGEN_DECLARE_TEST(ref)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( ref_vector(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( check_const_correctness(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( ref_vector(Vector4d()) );
+    CALL_SUBTEST_2( check_const_correctness(Matrix4d()) );
+    CALL_SUBTEST_3( ref_vector(Vector4cf()) );
+    CALL_SUBTEST_4( ref_vector(VectorXcf(8)) );
+    CALL_SUBTEST_5( ref_vector(VectorXi(12)) );
+    CALL_SUBTEST_5( check_const_correctness(VectorXi(12)) );
+
+    CALL_SUBTEST_1( ref_matrix(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( ref_matrix(Matrix4d()) );
+    CALL_SUBTEST_1( ref_matrix(Matrix<float,3,5>()) );
+    CALL_SUBTEST_4( ref_matrix(MatrixXcf(internal::random<int>(1,10),internal::random<int>(1,10))) );
+    CALL_SUBTEST_4( ref_matrix(Matrix<std::complex<double>,10,15>()) );
+    CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random<int>(1,10),internal::random<int>(1,10))) );
+    CALL_SUBTEST_6( call_ref() );
+
+    CALL_SUBTEST_8( (ref_vector_fixed_sizes<float,3,5>()) );
+    CALL_SUBTEST_8( (ref_vector_fixed_sizes<float,15,10>()) );
+  }
+  
+  CALL_SUBTEST_7( test_ref_overloads() );
+  CALL_SUBTEST_7( test_ref_fixed_size_assert() );
+}

diff --git a/test/reshape.cpp b/test/reshape.cpp
new file mode 100644
index 0000000..7b16742
--- /dev/null
+++ b/test/reshape.cpp

@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_eq(const T1& a, const T2& b)
+{
+  return (a.array() == b.array()).all();
+}
+
+template <int Order,typename MatType>
+void check_auto_reshape4x4(MatType m)
+{
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 1>  v1( 1);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 2>  v2( 2);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 4>  v4( 4);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 8>  v8( 8);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1:16> v16(16);
+
+  VERIFY(is_same_eq(m.template reshaped<Order>( 1,       AutoSize), m.template reshaped<Order>( 1, 16)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 16      ), m.template reshaped<Order>( 1, 16)));
+  VERIFY(is_same_eq(m.template reshaped<Order>( 2,       AutoSize), m.template reshaped<Order>( 2,  8)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 8       ), m.template reshaped<Order>( 2,  8)));
+  VERIFY(is_same_eq(m.template reshaped<Order>( 4,       AutoSize), m.template reshaped<Order>( 4,  4)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 4       ), m.template reshaped<Order>( 4,  4)));
+  VERIFY(is_same_eq(m.template reshaped<Order>( 8,       AutoSize), m.template reshaped<Order>( 8,  2)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 2       ), m.template reshaped<Order>( 8,  2)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(16,       AutoSize), m.template reshaped<Order>(16,  1)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 1       ), m.template reshaped<Order>(16,  1)));
+
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 1>,   AutoSize),  m.template reshaped<Order>(fix< 1>, v16    )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix<16> ),  m.template reshaped<Order>( v1,     fix<16>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 2>,   AutoSize),  m.template reshaped<Order>(fix< 2>, v8     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 8> ),  m.template reshaped<Order>( v2,     fix< 8>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 4>,   AutoSize),  m.template reshaped<Order>(fix< 4>, v4     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 4> ),  m.template reshaped<Order>( v4,     fix< 4>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 8>,   AutoSize),  m.template reshaped<Order>(fix< 8>, v2     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 2> ),  m.template reshaped<Order>( v8,     fix< 2>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix<16>,   AutoSize),  m.template reshaped<Order>(fix<16>, v1     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 1> ),  m.template reshaped<Order>(v16,     fix< 1>)));
+}
+
+template <typename MatType>
+void check_direct_access_reshape4x4(MatType , internal::FixedInt<RowMajorBit>) {}
+
+template <typename MatType>
+void check_direct_access_reshape4x4(MatType m, internal::FixedInt<0>) {
+  VERIFY_IS_EQUAL(m.reshaped( 1, 16).data(), m.data());
+  VERIFY_IS_EQUAL(m.reshaped( 1, 16).innerStride(), 1);
+
+  VERIFY_IS_EQUAL(m.reshaped( 2, 8).data(), m.data());
+  VERIFY_IS_EQUAL(m.reshaped( 2, 8).innerStride(), 1);
+  VERIFY_IS_EQUAL(m.reshaped( 2, 8).outerStride(), 2);
+}
+
+// just test a 4x4 matrix, enumerate all combination manually
+template <typename MatType>
+void reshape4x4(MatType m)
+{
+  typedef typename MatType::Scalar Scalar;
+
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 1>  v1( 1);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 2>  v2( 2);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 4>  v4( 4);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 8>  v8( 8);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1:16> v16(16);
+
+  if((MatType::Flags&RowMajorBit)==0)
+  {
+    typedef Map<MatrixXi> MapMat;
+    // dynamic
+    VERIFY_IS_EQUAL((m.reshaped( 1, 16)), MapMat(m.data(),  1, 16));
+    VERIFY_IS_EQUAL((m.reshaped( 2,  8)), MapMat(m.data(),  2,  8));
+    VERIFY_IS_EQUAL((m.reshaped( 4,  4)), MapMat(m.data(),  4,  4));
+    VERIFY_IS_EQUAL((m.reshaped( 8,  2)), MapMat(m.data(),  8,  2));
+    VERIFY_IS_EQUAL((m.reshaped(16,  1)), MapMat(m.data(), 16,  1));
+
+    // static
+    VERIFY_IS_EQUAL(m.reshaped(fix< 1>, fix<16>), MapMat(m.data(),  1, 16));
+    VERIFY_IS_EQUAL(m.reshaped(fix< 2>, fix< 8>), MapMat(m.data(),  2,  8));
+    VERIFY_IS_EQUAL(m.reshaped(fix< 4>, fix< 4>), MapMat(m.data(),  4,  4));
+    VERIFY_IS_EQUAL(m.reshaped(fix< 8>, fix< 2>), MapMat(m.data(),  8,  2));
+    VERIFY_IS_EQUAL(m.reshaped(fix<16>, fix< 1>), MapMat(m.data(), 16,  1));
+
+
+    // reshape chain
+    VERIFY_IS_EQUAL(
+      (m
+      .reshaped( 1, 16)
+      .reshaped(fix< 2>,fix< 8>)
+      .reshaped(16,  1)
+      .reshaped(fix< 8>,fix< 2>)
+      .reshaped( 2,  8)
+      .reshaped(fix< 1>,fix<16>)
+      .reshaped( 4,  4)
+      .reshaped(fix<16>,fix< 1>)
+      .reshaped( 8,  2)
+      .reshaped(fix< 4>,fix< 4>)
+      ),
+      MapMat(m.data(), 4,  4)
+    );
+  }
+
+  VERIFY(is_same_eq(m.reshaped( 1,       AutoSize), m.reshaped( 1, 16)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 16),       m.reshaped( 1, 16)));
+  VERIFY(is_same_eq(m.reshaped( 2,       AutoSize), m.reshaped( 2,  8)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 8),        m.reshaped( 2,  8)));
+  VERIFY(is_same_eq(m.reshaped( 4,       AutoSize), m.reshaped( 4,  4)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 4),        m.reshaped( 4,  4)));
+  VERIFY(is_same_eq(m.reshaped( 8,       AutoSize), m.reshaped( 8,  2)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 2),        m.reshaped( 8,  2)));
+  VERIFY(is_same_eq(m.reshaped(16,       AutoSize), m.reshaped(16,  1)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  1),       m.reshaped(16,  1)));
+
+  VERIFY(is_same_eq(m.reshaped(fix< 1>,   AutoSize),  m.reshaped(fix< 1>, v16)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix<16>),   m.reshaped( v1,     fix<16>)));
+  VERIFY(is_same_eq(m.reshaped(fix< 2>,   AutoSize),  m.reshaped(fix< 2>, v8)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 8>),   m.reshaped( v2,     fix< 8>)));
+  VERIFY(is_same_eq(m.reshaped(fix< 4>,   AutoSize),  m.reshaped(fix< 4>, v4)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 4>),   m.reshaped( v4,     fix< 4>)));
+  VERIFY(is_same_eq(m.reshaped(fix< 8>,   AutoSize),  m.reshaped(fix< 8>, v2)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 2>),   m.reshaped( v8,     fix< 2>)));
+  VERIFY(is_same_eq(m.reshaped(fix<16>,   AutoSize),  m.reshaped(fix<16>, v1)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 1>),   m.reshaped(v16,     fix< 1>)));
+
+  check_auto_reshape4x4<ColMajor> (m);
+  check_auto_reshape4x4<RowMajor> (m);
+  check_auto_reshape4x4<AutoOrder>(m);
+  check_auto_reshape4x4<ColMajor> (m.transpose());
+  check_auto_reshape4x4<ColMajor> (m.transpose());
+  check_auto_reshape4x4<AutoOrder>(m.transpose());
+
+  check_direct_access_reshape4x4(m,fix<MatType::Flags&RowMajorBit>);
+
+  if((MatType::Flags&RowMajorBit)==0)
+  {
+    VERIFY_IS_EQUAL(m.template reshaped<ColMajor>(2,8),m.reshaped(2,8));
+    VERIFY_IS_EQUAL(m.template reshaped<ColMajor>(2,8),m.template reshaped<AutoOrder>(2,8));
+    VERIFY_IS_EQUAL(m.transpose().template reshaped<RowMajor>(2,8),m.transpose().template reshaped<AutoOrder>(2,8));
+  }
+  else
+  {
+    VERIFY_IS_EQUAL(m.template reshaped<ColMajor>(2,8),m.reshaped(2,8));
+    VERIFY_IS_EQUAL(m.template reshaped<RowMajor>(2,8),m.template reshaped<AutoOrder>(2,8));
+    VERIFY_IS_EQUAL(m.transpose().template reshaped<ColMajor>(2,8),m.transpose().template reshaped<AutoOrder>(2,8));
+    VERIFY_IS_EQUAL(m.transpose().reshaped(2,8),m.transpose().template reshaped<AutoOrder>(2,8));
+  }
+
+  MatrixXi m28r1 = m.template reshaped<RowMajor>(2,8);
+  MatrixXi m28r2 = m.transpose().template reshaped<ColMajor>(8,2).transpose();
+  VERIFY_IS_EQUAL( m28r1, m28r2);
+
+  VERIFY(is_same_eq(m.reshaped(v16,fix<1>), m.reshaped()));
+  VERIFY_IS_EQUAL(m.reshaped(16,1).eval(), m.reshaped().eval());
+  VERIFY_IS_EQUAL(m.reshaped(1,16).eval(), m.reshaped().transpose().eval());
+  VERIFY_IS_EQUAL(m.reshaped().reshaped(2,8), m.reshaped(2,8));
+  VERIFY_IS_EQUAL(m.reshaped().reshaped(4,4), m.reshaped(4,4));
+  VERIFY_IS_EQUAL(m.reshaped().reshaped(8,2), m.reshaped(8,2));
+
+  VERIFY_IS_EQUAL(m.reshaped(), m.template reshaped<ColMajor>());
+  VERIFY_IS_EQUAL(m.transpose().reshaped(), m.template reshaped<RowMajor>());
+  VERIFY_IS_EQUAL(m.template reshaped<RowMajor>(AutoSize,fix<1>), m.template reshaped<RowMajor>());
+  VERIFY_IS_EQUAL(m.template reshaped<AutoOrder>(AutoSize,fix<1>), m.template reshaped<AutoOrder>());
+
+  VERIFY(is_same_eq(m.reshaped(AutoSize,fix<1>), m.reshaped()));
+  VERIFY_IS_EQUAL(m.template reshaped<RowMajor>(fix<1>,AutoSize), m.transpose().reshaped().transpose());
+
+  // check assignment
+  {
+    Matrix<Scalar,Dynamic,1> m1x(m.size()); m1x.setRandom();
+    VERIFY_IS_APPROX(m.reshaped() = m1x, m1x);
+    VERIFY_IS_APPROX(m, m1x.reshaped(4,4));
+    
+    Matrix<Scalar,Dynamic,Dynamic> m28(2,8); m28.setRandom();
+    VERIFY_IS_APPROX(m.reshaped(2,8) = m28, m28);
+    VERIFY_IS_APPROX(m, m28.reshaped(4,4));
+    VERIFY_IS_APPROX(m.template reshaped<RowMajor>(2,8) = m28, m28);
+
+    Matrix<Scalar,Dynamic,Dynamic> m24(2,4); m24.setRandom();
+    VERIFY_IS_APPROX(m(seq(0,last,2),all).reshaped(2,4) = m24, m24);
+
+    // check constness:
+    m.reshaped(2,8).nestedExpression() = m;
+  }
+}
+
+EIGEN_DECLARE_TEST(reshape)
+{
+  typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
+  typedef Matrix<int,4,4,RowMajor> RowMatrix4i;
+  MatrixXi mx = MatrixXi::Random(4, 4);
+  Matrix4i m4 = Matrix4i::Random(4, 4);
+  RowMatrixXi rmx = RowMatrixXi::Random(4, 4);
+  RowMatrix4i rm4 = RowMatrix4i::Random(4, 4);
+
+  // test dynamic-size matrix
+  CALL_SUBTEST(reshape4x4(mx));
+  // test static-size matrix
+  CALL_SUBTEST(reshape4x4(m4));
+  // test dynamic-size const matrix
+  CALL_SUBTEST(reshape4x4(static_cast<const MatrixXi>(mx)));
+  // test static-size const matrix
+  CALL_SUBTEST(reshape4x4(static_cast<const Matrix4i>(m4)));
+
+  CALL_SUBTEST(reshape4x4(rmx));
+  CALL_SUBTEST(reshape4x4(rm4));
+}

diff --git a/test/resize.cpp b/test/resize.cpp
new file mode 100644
index 0000000..646a75b
--- /dev/null
+++ b/test/resize.cpp

@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Keir Mierle <mierle@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<DenseIndex rows, DenseIndex cols>
+void resizeLikeTest()
+{
+  MatrixXf A(rows, cols);
+  MatrixXf B;
+  Matrix<double, rows, cols> C;
+  B.resizeLike(A);
+  C.resizeLike(B);  // Shouldn't crash.
+  VERIFY(B.rows() == rows && B.cols() == cols);
+
+  VectorXf x(rows);
+  RowVectorXf y;
+  y.resizeLike(x);
+  VERIFY(y.rows() == 1 && y.cols() == rows);
+
+  y.resize(cols);
+  x.resizeLike(y);
+  VERIFY(x.rows() == cols && x.cols() == 1);
+}
+
+void resizeLikeTest12() { resizeLikeTest<1,2>(); }
+void resizeLikeTest1020() { resizeLikeTest<10,20>(); }
+void resizeLikeTest31() { resizeLikeTest<3,1>(); }
+
+EIGEN_DECLARE_TEST(resize)
+{
+  CALL_SUBTEST(resizeLikeTest12() );
+  CALL_SUBTEST(resizeLikeTest1020() );
+  CALL_SUBTEST(resizeLikeTest31() );
+}

diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp
new file mode 100644
index 0000000..2c9999c
--- /dev/null
+++ b/test/rvalue_types.cpp

@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_RUNTIME_NO_MALLOC
+
+#include "main.h"
+#if EIGEN_HAS_CXX11
+#include "MovableScalar.h"
+#endif
+#include "SafeScalar.h"
+
+#include <Eigen/Core>
+
+using internal::UIntPtr;
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+template <typename MatrixType>
+void rvalue_copyassign(const MatrixType& m)
+{
+
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  
+  // create a temporary which we are about to destroy by moving
+  MatrixType tmp = m;
+  UIntPtr src_address = reinterpret_cast<UIntPtr>(tmp.data());
+  
+  Eigen::internal::set_is_malloc_allowed(false); // moving from an rvalue reference shall never allocate
+  // move the temporary to n
+  MatrixType n = std::move(tmp);
+  UIntPtr dst_address = reinterpret_cast<UIntPtr>(n.data());
+  if (MatrixType::RowsAtCompileTime==Dynamic|| MatrixType::ColsAtCompileTime==Dynamic)
+  {
+    // verify that we actually moved the guts
+    VERIFY_IS_EQUAL(src_address, dst_address);
+    VERIFY_IS_EQUAL(tmp.size(), 0);
+    VERIFY_IS_EQUAL(reinterpret_cast<UIntPtr>(tmp.data()), UIntPtr(0));
+  }
+
+  // verify that the content did not change
+  Scalar abs_diff = (m-n).array().abs().sum();
+  VERIFY_IS_EQUAL(abs_diff, Scalar(0));
+  Eigen::internal::set_is_malloc_allowed(true);
+}
+template<typename TranspositionsType>
+void rvalue_transpositions(Index rows)
+{
+  typedef typename TranspositionsType::IndicesType PermutationVectorType;
+
+  PermutationVectorType vec;
+  randomPermutationVector(vec, rows);
+  TranspositionsType t0(vec);
+
+  Eigen::internal::set_is_malloc_allowed(false); // moving from an rvalue reference shall never allocate
+
+  UIntPtr t0_address = reinterpret_cast<UIntPtr>(t0.indices().data());
+
+  // Move constructors:
+  TranspositionsType t1 = std::move(t0);
+  UIntPtr t1_address = reinterpret_cast<UIntPtr>(t1.indices().data());
+  VERIFY_IS_EQUAL(t0_address, t1_address);
+  // t0 must be de-allocated:
+  VERIFY_IS_EQUAL(t0.size(), 0);
+  VERIFY_IS_EQUAL(reinterpret_cast<UIntPtr>(t0.indices().data()), UIntPtr(0));
+
+
+  // Move assignment:
+  t0 = std::move(t1);
+  t0_address = reinterpret_cast<UIntPtr>(t0.indices().data());
+  VERIFY_IS_EQUAL(t0_address, t1_address);
+  // t1 must be de-allocated:
+  VERIFY_IS_EQUAL(t1.size(), 0);
+  VERIFY_IS_EQUAL(reinterpret_cast<UIntPtr>(t1.indices().data()), UIntPtr(0));
+
+  Eigen::internal::set_is_malloc_allowed(true);
+}
+
+template <typename MatrixType>
+void rvalue_move(const MatrixType& m)
+{
+    // lvalue reference is copied
+    MatrixType b(m);
+    VERIFY_IS_EQUAL(b, m);
+
+    // lvalue reference is copied
+    MatrixType c{m};
+    VERIFY_IS_EQUAL(c, m);
+
+    // lvalue reference is copied
+    MatrixType d = m;
+    VERIFY_IS_EQUAL(d, m);
+
+    // rvalue reference is moved - copy constructor.
+    MatrixType e_src(m);
+    VERIFY_IS_EQUAL(e_src, m);
+    MatrixType e_dst(std::move(e_src));
+    VERIFY_IS_EQUAL(e_dst, m);
+
+    // rvalue reference is moved - copy constructor.
+    MatrixType f_src(m);
+    VERIFY_IS_EQUAL(f_src, m);
+    MatrixType f_dst = std::move(f_src);
+    VERIFY_IS_EQUAL(f_dst, m);
+    
+    // rvalue reference is moved - copy assignment.
+    MatrixType g_src(m);
+    VERIFY_IS_EQUAL(g_src, m);
+    MatrixType g_dst;
+    g_dst = std::move(g_src);
+    VERIFY_IS_EQUAL(g_dst, m);
+}
+#else
+template <typename MatrixType>
+void rvalue_copyassign(const MatrixType&) {}
+template<typename TranspositionsType>
+void rvalue_transpositions(Index) {}
+template <typename MatrixType>
+void rvalue_move(const MatrixType&) {}
+#endif
+
+EIGEN_DECLARE_TEST(rvalue_types)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(rvalue_copyassign( MatrixXf::Random(50,50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( ArrayXXf::Random(50,50).eval() ));
+
+    CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,1,Dynamic>::Random(50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( Array<float,1,Dynamic>::Random(50).eval() ));
+
+    CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,Dynamic,1>::Random(50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( Array<float,Dynamic,1>::Random(50).eval() ));
+
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,1>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,1>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,1>::Random().eval() ));
+
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,2>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,3>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,4>::Random().eval() ));
+  
+    CALL_SUBTEST_3((rvalue_transpositions<PermutationMatrix<Dynamic, Dynamic, int> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_3((rvalue_transpositions<PermutationMatrix<Dynamic, Dynamic, Index> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_4((rvalue_transpositions<Transpositions<Dynamic, Dynamic, int> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_4((rvalue_transpositions<Transpositions<Dynamic, Dynamic, Index> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+
+#if EIGEN_HAS_CXX11
+    CALL_SUBTEST_5(rvalue_move(Eigen::Matrix<MovableScalar<float>,1,3>::Random().eval()));
+    CALL_SUBTEST_5(rvalue_move(Eigen::Matrix<SafeScalar<float>,1,3>::Random().eval()));
+    CALL_SUBTEST_5(rvalue_move(Eigen::Matrix<SafeScalar<float>,Eigen::Dynamic,Eigen::Dynamic>::Random(1,3).eval()));
+#endif
+  }
+}

diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp
new file mode 100644
index 0000000..03e17e8
--- /dev/null
+++ b/test/schur_complex.cpp

@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+
+template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTime)
+{
+  typedef typename ComplexSchur<MatrixType>::ComplexScalar ComplexScalar;
+  typedef typename ComplexSchur<MatrixType>::ComplexMatrixType ComplexMatrixType;
+
+  // Test basic functionality: T is triangular and A = U T U*
+  for(int counter = 0; counter < g_repeat; ++counter) {
+    MatrixType A = MatrixType::Random(size, size);
+    ComplexSchur<MatrixType> schurOfA(A);
+    VERIFY_IS_EQUAL(schurOfA.info(), Success);
+    ComplexMatrixType U = schurOfA.matrixU();
+    ComplexMatrixType T = schurOfA.matrixT();
+    for(int row = 1; row < size; ++row) {
+      for(int col = 0; col < row; ++col) {
+        VERIFY(T(row,col) == (typename MatrixType::Scalar)0);
+      }
+    }
+    VERIFY_IS_APPROX(A.template cast<ComplexScalar>(), U * T * U.adjoint());
+  }
+
+  // Test asserts when not initialized
+  ComplexSchur<MatrixType> csUninitialized;
+  VERIFY_RAISES_ASSERT(csUninitialized.matrixT());
+  VERIFY_RAISES_ASSERT(csUninitialized.matrixU());
+  VERIFY_RAISES_ASSERT(csUninitialized.info());
+  
+  // Test whether compute() and constructor returns same result
+  MatrixType A = MatrixType::Random(size, size);
+  ComplexSchur<MatrixType> cs1;
+  cs1.compute(A);
+  ComplexSchur<MatrixType> cs2(A);
+  VERIFY_IS_EQUAL(cs1.info(), Success);
+  VERIFY_IS_EQUAL(cs2.info(), Success);
+  VERIFY_IS_EQUAL(cs1.matrixT(), cs2.matrixT());
+  VERIFY_IS_EQUAL(cs1.matrixU(), cs2.matrixU());
+
+  // Test maximum number of iterations
+  ComplexSchur<MatrixType> cs3;
+  cs3.setMaxIterations(ComplexSchur<MatrixType>::m_maxIterationsPerRow * size).compute(A);
+  VERIFY_IS_EQUAL(cs3.info(), Success);
+  VERIFY_IS_EQUAL(cs3.matrixT(), cs1.matrixT());
+  VERIFY_IS_EQUAL(cs3.matrixU(), cs1.matrixU());
+  cs3.setMaxIterations(1).compute(A);
+  VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success);
+  VERIFY_IS_EQUAL(cs3.getMaxIterations(), 1);
+
+  MatrixType Atriangular = A;
+  Atriangular.template triangularView<StrictlyLower>().setZero(); 
+  cs3.setMaxIterations(1).compute(Atriangular); // triangular matrices do not need any iterations
+  VERIFY_IS_EQUAL(cs3.info(), Success);
+  VERIFY_IS_EQUAL(cs3.matrixT(), Atriangular.template cast<ComplexScalar>());
+  VERIFY_IS_EQUAL(cs3.matrixU(), ComplexMatrixType::Identity(size, size));
+
+  // Test computation of only T, not U
+  ComplexSchur<MatrixType> csOnlyT(A, false);
+  VERIFY_IS_EQUAL(csOnlyT.info(), Success);
+  VERIFY_IS_EQUAL(cs1.matrixT(), csOnlyT.matrixT());
+  VERIFY_RAISES_ASSERT(csOnlyT.matrixU());
+
+  if (size > 1 && size < 20)
+  {
+    // Test matrix with NaN
+    A(0,0) = std::numeric_limits<typename MatrixType::RealScalar>::quiet_NaN();
+    ComplexSchur<MatrixType> csNaN(A);
+    VERIFY_IS_EQUAL(csNaN.info(), NoConvergence);
+  }
+}
+
+EIGEN_DECLARE_TEST(schur_complex)
+{
+  CALL_SUBTEST_1(( schur<Matrix4cd>() ));
+  CALL_SUBTEST_2(( schur<MatrixXcf>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4)) ));
+  CALL_SUBTEST_3(( schur<Matrix<std::complex<float>, 1, 1> >() ));
+  CALL_SUBTEST_4(( schur<Matrix<float, 3, 3, Eigen::RowMajor> >() ));
+
+  // Test problem size constructors
+  CALL_SUBTEST_5(ComplexSchur<MatrixXf>(10));
+}

diff --git a/test/schur_real.cpp b/test/schur_real.cpp
new file mode 100644
index 0000000..9454610
--- /dev/null
+++ b/test/schur_real.cpp

@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <Eigen/Eigenvalues>
+
+template<typename MatrixType> void verifyIsQuasiTriangular(const MatrixType& T)
+{
+  const Index size = T.cols();
+  typedef typename MatrixType::Scalar Scalar;
+
+  // Check T is lower Hessenberg
+  for(int row = 2; row < size; ++row) {
+    for(int col = 0; col < row - 1; ++col) {
+      VERIFY(T(row,col) == Scalar(0));
+    }
+  }
+
+  // Check that any non-zero on the subdiagonal is followed by a zero and is
+  // part of a 2x2 diagonal block with imaginary eigenvalues.
+  for(int row = 1; row < size; ++row) {
+    if (T(row,row-1) != Scalar(0)) {
+      VERIFY(row == size-1 || T(row+1,row) == 0);
+      Scalar tr = T(row-1,row-1) + T(row,row);
+      Scalar det = T(row-1,row-1) * T(row,row) - T(row-1,row) * T(row,row-1);
+      VERIFY(4 * det > tr * tr);
+    }
+  }
+}
+
+template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTime)
+{
+  // Test basic functionality: T is quasi-triangular and A = U T U*
+  for(int counter = 0; counter < g_repeat; ++counter) {
+    MatrixType A = MatrixType::Random(size, size);
+    RealSchur<MatrixType> schurOfA(A);
+    VERIFY_IS_EQUAL(schurOfA.info(), Success);
+    MatrixType U = schurOfA.matrixU();
+    MatrixType T = schurOfA.matrixT();
+    verifyIsQuasiTriangular(T);
+    VERIFY_IS_APPROX(A, U * T * U.transpose());
+  }
+
+  // Test asserts when not initialized
+  RealSchur<MatrixType> rsUninitialized;
+  VERIFY_RAISES_ASSERT(rsUninitialized.matrixT());
+  VERIFY_RAISES_ASSERT(rsUninitialized.matrixU());
+  VERIFY_RAISES_ASSERT(rsUninitialized.info());
+  
+  // Test whether compute() and constructor returns same result
+  MatrixType A = MatrixType::Random(size, size);
+  RealSchur<MatrixType> rs1;
+  rs1.compute(A);
+  RealSchur<MatrixType> rs2(A);
+  VERIFY_IS_EQUAL(rs1.info(), Success);
+  VERIFY_IS_EQUAL(rs2.info(), Success);
+  VERIFY_IS_EQUAL(rs1.matrixT(), rs2.matrixT());
+  VERIFY_IS_EQUAL(rs1.matrixU(), rs2.matrixU());
+
+  // Test maximum number of iterations
+  RealSchur<MatrixType> rs3;
+  rs3.setMaxIterations(RealSchur<MatrixType>::m_maxIterationsPerRow * size).compute(A);
+  VERIFY_IS_EQUAL(rs3.info(), Success);
+  VERIFY_IS_EQUAL(rs3.matrixT(), rs1.matrixT());
+  VERIFY_IS_EQUAL(rs3.matrixU(), rs1.matrixU());
+  if (size > 2) {
+    rs3.setMaxIterations(1).compute(A);
+    VERIFY_IS_EQUAL(rs3.info(), NoConvergence);
+    VERIFY_IS_EQUAL(rs3.getMaxIterations(), 1);
+  }
+
+  MatrixType Atriangular = A;
+  Atriangular.template triangularView<StrictlyLower>().setZero(); 
+  rs3.setMaxIterations(1).compute(Atriangular); // triangular matrices do not need any iterations
+  VERIFY_IS_EQUAL(rs3.info(), Success);
+  VERIFY_IS_APPROX(rs3.matrixT(), Atriangular); // approx because of scaling...
+  VERIFY_IS_EQUAL(rs3.matrixU(), MatrixType::Identity(size, size));
+
+  // Test computation of only T, not U
+  RealSchur<MatrixType> rsOnlyT(A, false);
+  VERIFY_IS_EQUAL(rsOnlyT.info(), Success);
+  VERIFY_IS_EQUAL(rs1.matrixT(), rsOnlyT.matrixT());
+  VERIFY_RAISES_ASSERT(rsOnlyT.matrixU());
+
+  if (size > 2 && size < 20)
+  {
+    // Test matrix with NaN
+    A(0,0) = std::numeric_limits<typename MatrixType::Scalar>::quiet_NaN();
+    RealSchur<MatrixType> rsNaN(A);
+    VERIFY_IS_EQUAL(rsNaN.info(), NoConvergence);
+  }
+}
+
+EIGEN_DECLARE_TEST(schur_real)
+{
+  CALL_SUBTEST_1(( schur<Matrix4f>() ));
+  CALL_SUBTEST_2(( schur<MatrixXd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4)) ));
+  CALL_SUBTEST_3(( schur<Matrix<float, 1, 1> >() ));
+  CALL_SUBTEST_4(( schur<Matrix<double, 3, 3, Eigen::RowMajor> >() ));
+
+  // Test problem size constructors
+  CALL_SUBTEST_5(RealSchur<MatrixXf>(10));
+}

diff --git a/test/selfadjoint.cpp b/test/selfadjoint.cpp
new file mode 100644
index 0000000..9ca9cef
--- /dev/null
+++ b/test/selfadjoint.cpp

@@ -0,0 +1,75 @@
+// This file is triangularView of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_CHECK_STATIC_ASSERTIONS
+#include "main.h"
+
+// This file tests the basic selfadjointView API,
+// the related products and decompositions are tested in specific files.
+
+template<typename MatrixType> void selfadjoint(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             m4(rows, cols);
+
+  m1.diagonal() = m1.diagonal().real().template cast<Scalar>();
+
+  // check selfadjoint to dense
+  m3 = m1.template selfadjointView<Upper>();
+  VERIFY_IS_APPROX(MatrixType(m3.template triangularView<Upper>()), MatrixType(m1.template triangularView<Upper>()));
+  VERIFY_IS_APPROX(m3, m3.adjoint());
+
+  m3 = m1.template selfadjointView<Lower>();
+  VERIFY_IS_APPROX(MatrixType(m3.template triangularView<Lower>()), MatrixType(m1.template triangularView<Lower>()));
+  VERIFY_IS_APPROX(m3, m3.adjoint());
+
+  m3 = m1.template selfadjointView<Upper>();
+  m4 = m2;
+  m4 += m1.template selfadjointView<Upper>();
+  VERIFY_IS_APPROX(m4, m2+m3);
+
+  m3 = m1.template selfadjointView<Lower>();
+  m4 = m2;
+  m4 -= m1.template selfadjointView<Lower>();
+  VERIFY_IS_APPROX(m4, m2-m3);
+
+  VERIFY_RAISES_STATIC_ASSERT(m2.template selfadjointView<StrictlyUpper>());
+  VERIFY_RAISES_STATIC_ASSERT(m2.template selfadjointView<UnitLower>());
+}
+
+void bug_159()
+{
+  Matrix3d m = Matrix3d::Random().selfadjointView<Lower>();
+  EIGEN_UNUSED_VARIABLE(m)
+}
+
+EIGEN_DECLARE_TEST(selfadjoint)
+{
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+
+    CALL_SUBTEST_1( selfadjoint(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( selfadjoint(Matrix<float, 2, 2>()) );
+    CALL_SUBTEST_3( selfadjoint(Matrix3cf()) );
+    CALL_SUBTEST_4( selfadjoint(MatrixXcd(s,s)) );
+    CALL_SUBTEST_5( selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(s, s)) );
+    
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+  
+  CALL_SUBTEST_1( bug_159() );
+}

diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp
new file mode 100644
index 0000000..538d01a
--- /dev/null
+++ b/test/simplicial_cholesky.cpp

@@ -0,0 +1,50 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+
+template<typename T, typename I_, int flag> void test_simplicial_cholesky_T()
+{
+  typedef SparseMatrix<T,flag,I_> SparseMatrixType;
+  SimplicialCholesky<SparseMatrixType, Lower> chol_colmajor_lower_amd;
+  SimplicialCholesky<SparseMatrixType, Upper> chol_colmajor_upper_amd;
+  SimplicialLLT<     SparseMatrixType, Lower> llt_colmajor_lower_amd;
+  SimplicialLLT<     SparseMatrixType, Upper> llt_colmajor_upper_amd;
+  SimplicialLDLT<    SparseMatrixType, Lower> ldlt_colmajor_lower_amd;
+  SimplicialLDLT<    SparseMatrixType, Upper> ldlt_colmajor_upper_amd;
+  SimplicialLDLT<    SparseMatrixType, Lower, NaturalOrdering<I_> > ldlt_colmajor_lower_nat;
+  SimplicialLDLT<    SparseMatrixType, Upper, NaturalOrdering<I_> > ldlt_colmajor_upper_nat;
+
+  check_sparse_spd_solving(chol_colmajor_lower_amd);
+  check_sparse_spd_solving(chol_colmajor_upper_amd);
+  check_sparse_spd_solving(llt_colmajor_lower_amd);
+  check_sparse_spd_solving(llt_colmajor_upper_amd);
+  check_sparse_spd_solving(ldlt_colmajor_lower_amd);
+  check_sparse_spd_solving(ldlt_colmajor_upper_amd);
+  
+  check_sparse_spd_determinant(chol_colmajor_lower_amd);
+  check_sparse_spd_determinant(chol_colmajor_upper_amd);
+  check_sparse_spd_determinant(llt_colmajor_lower_amd);
+  check_sparse_spd_determinant(llt_colmajor_upper_amd);
+  check_sparse_spd_determinant(ldlt_colmajor_lower_amd);
+  check_sparse_spd_determinant(ldlt_colmajor_upper_amd);
+  
+  check_sparse_spd_solving(ldlt_colmajor_lower_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
+  check_sparse_spd_solving(ldlt_colmajor_upper_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
+}
+
+EIGEN_DECLARE_TEST(simplicial_cholesky)
+{
+  CALL_SUBTEST_11(( test_simplicial_cholesky_T<double,               int, ColMajor>() ));
+  CALL_SUBTEST_12(( test_simplicial_cholesky_T<std::complex<double>, int, ColMajor>() ));
+  CALL_SUBTEST_13(( test_simplicial_cholesky_T<double,          long int, ColMajor>() ));
+  CALL_SUBTEST_21(( test_simplicial_cholesky_T<double,               int, RowMajor>() ));
+  CALL_SUBTEST_22(( test_simplicial_cholesky_T<std::complex<double>, int, RowMajor>() ));
+  CALL_SUBTEST_23(( test_simplicial_cholesky_T<double,          long int, RowMajor>() ));
+}

diff --git a/test/sizeof.cpp b/test/sizeof.cpp
new file mode 100644
index 0000000..af34e97
--- /dev/null
+++ b/test/sizeof.cpp

@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void verifySizeOf(const MatrixType&)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic)
+    VERIFY_IS_EQUAL(std::ptrdiff_t(sizeof(MatrixType)),std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
+  else
+    VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(Index));
+}
+
+EIGEN_DECLARE_TEST(sizeof)
+{
+  CALL_SUBTEST(verifySizeOf(Matrix<float, 1, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 2, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 3, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 4, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 5, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 6, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 7, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 8, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 9, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 10, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 11, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Array<float, 12, 1>()) );
+  CALL_SUBTEST(verifySizeOf(Vector2d()) );
+  CALL_SUBTEST(verifySizeOf(Vector4f()) );
+  CALL_SUBTEST(verifySizeOf(Matrix4d()) );
+  CALL_SUBTEST(verifySizeOf(Matrix<double, 4, 2>()) );
+  CALL_SUBTEST(verifySizeOf(Matrix<bool, 7, 5>()) );
+  CALL_SUBTEST(verifySizeOf(MatrixXcf(3, 3)) );
+  CALL_SUBTEST(verifySizeOf(MatrixXi(8, 12)) );
+  CALL_SUBTEST(verifySizeOf(MatrixXcd(20, 20)) );
+  CALL_SUBTEST(verifySizeOf(Matrix<float, 100, 100>()) );
+  
+  VERIFY(sizeof(std::complex<float>) == 2*sizeof(float));
+  VERIFY(sizeof(std::complex<double>) == 2*sizeof(double));
+}

diff --git a/test/sizeoverflow.cpp b/test/sizeoverflow.cpp
new file mode 100644
index 0000000..4213512
--- /dev/null
+++ b/test/sizeoverflow.cpp

@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#define VERIFY_THROWS_BADALLOC(a) {                           \
+    bool threw = false;                                       \
+    try {                                                     \
+      a;                                                      \
+    }                                                         \
+    catch (std::bad_alloc&) { threw = true; }                 \
+    VERIFY(threw && "should have thrown bad_alloc: " #a);     \
+  }
+
+template<typename MatrixType>
+void triggerMatrixBadAlloc(Index rows, Index cols)
+{
+  VERIFY_THROWS_BADALLOC( MatrixType m(rows, cols) );
+  VERIFY_THROWS_BADALLOC( MatrixType m; m.resize(rows, cols) );
+  VERIFY_THROWS_BADALLOC( MatrixType m; m.conservativeResize(rows, cols) );
+}
+
+template<typename VectorType>
+void triggerVectorBadAlloc(Index size)
+{
+  VERIFY_THROWS_BADALLOC( VectorType v(size) );
+  VERIFY_THROWS_BADALLOC( VectorType v; v.resize(size) );
+  VERIFY_THROWS_BADALLOC( VectorType v; v.conservativeResize(size) );
+}
+
+EIGEN_DECLARE_TEST(sizeoverflow)
+{
+  // there are 2 levels of overflow checking. first in PlainObjectBase.h we check for overflow in rows*cols computations.
+  // this is tested in tests of the form times_itself_gives_0 * times_itself_gives_0
+  // Then in Memory.h we check for overflow in size * sizeof(T) computations.
+  // this is tested in tests of the form times_4_gives_0 * sizeof(float)
+  
+  size_t times_itself_gives_0 = size_t(1) << (8 * sizeof(Index) / 2);
+  VERIFY(times_itself_gives_0 * times_itself_gives_0 == 0);
+
+  size_t times_4_gives_0 = size_t(1) << (8 * sizeof(Index) - 2);
+  VERIFY(times_4_gives_0 * 4 == 0);
+
+  size_t times_8_gives_0 = size_t(1) << (8 * sizeof(Index) - 3);
+  VERIFY(times_8_gives_0 * 8 == 0);
+
+  triggerMatrixBadAlloc<MatrixXf>(times_itself_gives_0, times_itself_gives_0);
+  triggerMatrixBadAlloc<MatrixXf>(times_itself_gives_0 / 4, times_itself_gives_0);
+  triggerMatrixBadAlloc<MatrixXf>(times_4_gives_0, 1);
+
+  triggerMatrixBadAlloc<MatrixXd>(times_itself_gives_0, times_itself_gives_0);
+  triggerMatrixBadAlloc<MatrixXd>(times_itself_gives_0 / 8, times_itself_gives_0);
+  triggerMatrixBadAlloc<MatrixXd>(times_8_gives_0, 1);
+  
+  triggerVectorBadAlloc<VectorXf>(times_4_gives_0);
+  
+  triggerVectorBadAlloc<VectorXd>(times_8_gives_0);
+}

diff --git a/test/smallvectors.cpp b/test/smallvectors.cpp
new file mode 100644
index 0000000..f9803ac
--- /dev/null
+++ b/test/smallvectors.cpp

@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+#include "main.h"
+
+template<typename Scalar> void smallVectors()
+{
+  typedef Matrix<Scalar, 1, 2> V2;
+  typedef Matrix<Scalar, 3, 1> V3;
+  typedef Matrix<Scalar, 1, 4> V4;
+  typedef Matrix<Scalar, Dynamic, 1> VX;
+  Scalar x1 = internal::random<Scalar>(),
+         x2 = internal::random<Scalar>(),
+         x3 = internal::random<Scalar>(),
+         x4 = internal::random<Scalar>();
+  V2 v2(x1, x2);
+  V3 v3(x1, x2, x3);
+  V4 v4(x1, x2, x3, x4);
+  VERIFY_IS_APPROX(x1, v2.x());
+  VERIFY_IS_APPROX(x1, v3.x());
+  VERIFY_IS_APPROX(x1, v4.x());
+  VERIFY_IS_APPROX(x2, v2.y());
+  VERIFY_IS_APPROX(x2, v3.y());
+  VERIFY_IS_APPROX(x2, v4.y());
+  VERIFY_IS_APPROX(x3, v3.z());
+  VERIFY_IS_APPROX(x3, v4.z());
+  VERIFY_IS_APPROX(x4, v4.w());
+
+  if (!NumTraits<Scalar>::IsInteger)
+  {
+    VERIFY_RAISES_ASSERT(V3(2, 1))
+    VERIFY_RAISES_ASSERT(V3(3, 2))
+    VERIFY_RAISES_ASSERT(V3(Scalar(3), 1))
+    VERIFY_RAISES_ASSERT(V3(3, Scalar(1)))
+    VERIFY_RAISES_ASSERT(V3(Scalar(3), Scalar(1)))
+    VERIFY_RAISES_ASSERT(V3(Scalar(123), Scalar(123)))
+
+    VERIFY_RAISES_ASSERT(V4(1, 3))
+    VERIFY_RAISES_ASSERT(V4(2, 4))
+    VERIFY_RAISES_ASSERT(V4(1, Scalar(4)))
+    VERIFY_RAISES_ASSERT(V4(Scalar(1), 4))
+    VERIFY_RAISES_ASSERT(V4(Scalar(1), Scalar(4)))
+    VERIFY_RAISES_ASSERT(V4(Scalar(123), Scalar(123)))
+
+    VERIFY_RAISES_ASSERT(VX(3, 2))
+    VERIFY_RAISES_ASSERT(VX(Scalar(3), 1))
+    VERIFY_RAISES_ASSERT(VX(3, Scalar(1)))
+    VERIFY_RAISES_ASSERT(VX(Scalar(3), Scalar(1)))
+    VERIFY_RAISES_ASSERT(VX(Scalar(123), Scalar(123)))
+  }
+}
+
+EIGEN_DECLARE_TEST(smallvectors)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(smallVectors<int>() );
+    CALL_SUBTEST(smallVectors<float>() );
+    CALL_SUBTEST(smallVectors<double>() );
+  }
+}

diff --git a/test/solverbase.h b/test/solverbase.h
new file mode 100644
index 0000000..13c0959
--- /dev/null
+++ b/test/solverbase.h

@@ -0,0 +1,36 @@
+#ifndef TEST_SOLVERBASE_H
+#define TEST_SOLVERBASE_H
+
+template<typename DstType, typename RhsType, typename MatrixType, typename SolverType>
+void check_solverbase(const MatrixType& matrix, const SolverType& solver, Index rows, Index cols, Index cols2)
+{
+  // solve
+  DstType m2               = DstType::Random(cols,cols2);
+  RhsType m3               = matrix*m2;
+  DstType solver_solution  = DstType::Random(cols,cols2);
+  solver._solve_impl(m3, solver_solution);
+  VERIFY_IS_APPROX(m3, matrix*solver_solution);
+  solver_solution          = DstType::Random(cols,cols2);
+  solver_solution          = solver.solve(m3);
+  VERIFY_IS_APPROX(m3, matrix*solver_solution);
+  // test solve with transposed
+  m3                       = RhsType::Random(rows,cols2);
+  m2                       = matrix.transpose()*m3;
+  RhsType solver_solution2 = RhsType::Random(rows,cols2);
+  solver.template _solve_impl_transposed<false>(m2, solver_solution2);
+  VERIFY_IS_APPROX(m2, matrix.transpose()*solver_solution2);
+  solver_solution2         = RhsType::Random(rows,cols2);
+  solver_solution2         = solver.transpose().solve(m2);
+  VERIFY_IS_APPROX(m2, matrix.transpose()*solver_solution2);
+  // test solve with conjugate transposed
+  m3                       = RhsType::Random(rows,cols2);
+  m2                       = matrix.adjoint()*m3;
+  solver_solution2         = RhsType::Random(rows,cols2);
+  solver.template _solve_impl_transposed<true>(m2, solver_solution2);
+  VERIFY_IS_APPROX(m2, matrix.adjoint()*solver_solution2);
+  solver_solution2         = RhsType::Random(rows,cols2);
+  solver_solution2         = solver.adjoint().solve(m2);
+  VERIFY_IS_APPROX(m2, matrix.adjoint()*solver_solution2);
+}
+
+#endif // TEST_SOLVERBASE_H

diff --git a/test/sparse.h b/test/sparse.h
new file mode 100644
index 0000000..6cd07fc
--- /dev/null
+++ b/test/sparse.h

@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TESTSPARSE_H
+#define EIGEN_TESTSPARSE_H
+
+#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
+
+#include "main.h"
+
+#if EIGEN_HAS_CXX11
+
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#include <unordered_map>
+#define EIGEN_UNORDERED_MAP_SUPPORT
+
+#endif
+
+#include <Eigen/Cholesky>
+#include <Eigen/LU>
+#include <Eigen/Sparse>
+
+enum {
+  ForceNonZeroDiag = 1,
+  MakeLowerTriangular = 2,
+  MakeUpperTriangular = 4,
+  ForceRealDiag = 8
+};
+
+/* Initializes both a sparse and dense matrix with same random values,
+ * and a ratio of \a density non zero entries.
+ * \param flags is a union of ForceNonZeroDiag, MakeLowerTriangular and MakeUpperTriangular
+ *        allowing to control the shape of the matrix.
+ * \param zeroCoords and nonzeroCoords allows to get the coordinate lists of the non zero,
+ *        and zero coefficients respectively.
+ */
+template<typename Scalar,int Opt1,int Opt2,typename StorageIndex> void
+initSparse(double density,
+           Matrix<Scalar,Dynamic,Dynamic,Opt1>& refMat,
+           SparseMatrix<Scalar,Opt2,StorageIndex>& sparseMat,
+           int flags = 0,
+           std::vector<Matrix<StorageIndex,2,1> >* zeroCoords = 0,
+           std::vector<Matrix<StorageIndex,2,1> >* nonzeroCoords = 0)
+{
+  enum { IsRowMajor = SparseMatrix<Scalar,Opt2,StorageIndex>::IsRowMajor };
+  sparseMat.setZero();
+  //sparseMat.reserve(int(refMat.rows()*refMat.cols()*density));
+  sparseMat.reserve(VectorXi::Constant(IsRowMajor ? refMat.rows() : refMat.cols(), int((1.5*density)*(IsRowMajor?refMat.cols():refMat.rows()))));
+  
+  for(Index j=0; j<sparseMat.outerSize(); j++)
+  {
+    //sparseMat.startVec(j);
+    for(Index i=0; i<sparseMat.innerSize(); i++)
+    {
+      Index ai(i), aj(j);
+      if(IsRowMajor)
+        std::swap(ai,aj);
+      Scalar v = (internal::random<double>(0,1) < density) ? internal::random<Scalar>() : Scalar(0);
+      if ((flags&ForceNonZeroDiag) && (i==j))
+      {
+        // FIXME: the following is too conservative
+        v = internal::random<Scalar>()*Scalar(3.);
+        v = v*v;
+        if(numext::real(v)>0) v += Scalar(5);
+        else                  v -= Scalar(5);
+      }
+      if ((flags & MakeLowerTriangular) && aj>ai)
+        v = Scalar(0);
+      else if ((flags & MakeUpperTriangular) && aj<ai)
+        v = Scalar(0);
+
+      if ((flags&ForceRealDiag) && (i==j))
+        v = numext::real(v);
+
+      if (v!=Scalar(0))
+      {
+        //sparseMat.insertBackByOuterInner(j,i) = v;
+        sparseMat.insertByOuterInner(j,i) = v;
+        if (nonzeroCoords)
+          nonzeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
+      }
+      else if (zeroCoords)
+      {
+        zeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
+      }
+      refMat(ai,aj) = v;
+    }
+  }
+  //sparseMat.finalize();
+}
+
+template<typename Scalar,int Opt1,int Opt2,typename Index> void
+initSparse(double density,
+           Matrix<Scalar,Dynamic,Dynamic, Opt1>& refMat,
+           DynamicSparseMatrix<Scalar, Opt2, Index>& sparseMat,
+           int flags = 0,
+           std::vector<Matrix<Index,2,1> >* zeroCoords = 0,
+           std::vector<Matrix<Index,2,1> >* nonzeroCoords = 0)
+{
+  enum { IsRowMajor = DynamicSparseMatrix<Scalar,Opt2,Index>::IsRowMajor };
+  sparseMat.setZero();
+  sparseMat.reserve(int(refMat.rows()*refMat.cols()*density));
+  for(int j=0; j<sparseMat.outerSize(); j++)
+  {
+    sparseMat.startVec(j); // not needed for DynamicSparseMatrix
+    for(int i=0; i<sparseMat.innerSize(); i++)
+    {
+      int ai(i), aj(j);
+      if(IsRowMajor)
+        std::swap(ai,aj);
+      Scalar v = (internal::random<double>(0,1) < density) ? internal::random<Scalar>() : Scalar(0);
+      if ((flags&ForceNonZeroDiag) && (i==j))
+      {
+        v = internal::random<Scalar>()*Scalar(3.);
+        v = v*v + Scalar(5.);
+      }
+      if ((flags & MakeLowerTriangular) && aj>ai)
+        v = Scalar(0);
+      else if ((flags & MakeUpperTriangular) && aj<ai)
+        v = Scalar(0);
+
+      if ((flags&ForceRealDiag) && (i==j))
+        v = numext::real(v);
+
+      if (v!=Scalar(0))
+      {
+        sparseMat.insertBackByOuterInner(j,i) = v;
+        if (nonzeroCoords)
+          nonzeroCoords->push_back(Matrix<Index,2,1> (ai,aj));
+      }
+      else if (zeroCoords)
+      {
+        zeroCoords->push_back(Matrix<Index,2,1> (ai,aj));
+      }
+      refMat(ai,aj) = v;
+    }
+  }
+  sparseMat.finalize();
+}
+
+template<typename Scalar,int Options,typename Index> void
+initSparse(double density,
+           Matrix<Scalar,Dynamic,1>& refVec,
+           SparseVector<Scalar,Options,Index>& sparseVec,
+           std::vector<int>* zeroCoords = 0,
+           std::vector<int>* nonzeroCoords = 0)
+{
+  sparseVec.reserve(int(refVec.size()*density));
+  sparseVec.setZero();
+  for(int i=0; i<refVec.size(); i++)
+  {
+    Scalar v = (internal::random<double>(0,1) < density) ? internal::random<Scalar>() : Scalar(0);
+    if (v!=Scalar(0))
+    {
+      sparseVec.insertBack(i) = v;
+      if (nonzeroCoords)
+        nonzeroCoords->push_back(i);
+    }
+    else if (zeroCoords)
+        zeroCoords->push_back(i);
+    refVec[i] = v;
+  }
+}
+
+template<typename Scalar,int Options,typename Index> void
+initSparse(double density,
+           Matrix<Scalar,1,Dynamic>& refVec,
+           SparseVector<Scalar,Options,Index>& sparseVec,
+           std::vector<int>* zeroCoords = 0,
+           std::vector<int>* nonzeroCoords = 0)
+{
+  sparseVec.reserve(int(refVec.size()*density));
+  sparseVec.setZero();
+  for(int i=0; i<refVec.size(); i++)
+  {
+    Scalar v = (internal::random<double>(0,1) < density) ? internal::random<Scalar>() : Scalar(0);
+    if (v!=Scalar(0))
+    {
+      sparseVec.insertBack(i) = v;
+      if (nonzeroCoords)
+        nonzeroCoords->push_back(i);
+    }
+    else if (zeroCoords)
+        zeroCoords->push_back(i);
+    refVec[i] = v;
+  }
+}
+
+
+#include <unsupported/Eigen/SparseExtra>
+#endif // EIGEN_TESTSPARSE_H

diff --git a/test/sparseLM.cpp b/test/sparseLM.cpp
new file mode 100644
index 0000000..a48fcb6
--- /dev/null
+++ b/test/sparseLM.cpp

@@ -0,0 +1,176 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+#include "main.h"
+#include <Eigen/LevenbergMarquardt>
+
+using namespace std;
+using namespace Eigen;
+
+template <typename Scalar>
+struct sparseGaussianTest : SparseFunctor<Scalar, int>
+{
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  typedef SparseFunctor<Scalar,int> Base;
+  typedef typename Base::JacobianType JacobianType;
+  sparseGaussianTest(int inputs, int values) : SparseFunctor<Scalar,int>(inputs,values)
+  { }
+  
+  VectorType model(const VectorType& uv, VectorType& x)
+  {
+    VectorType y; //Change this to use expression template
+    int m = Base::values(); 
+    int n = Base::inputs();
+    eigen_assert(uv.size()%2 == 0);
+    eigen_assert(uv.size() == n);
+    eigen_assert(x.size() == m);
+    y.setZero(m);
+    int half = n/2;
+    VectorBlock<const VectorType> u(uv, 0, half);
+    VectorBlock<const VectorType> v(uv, half, half);
+    Scalar coeff;
+    for (int j = 0; j < m; j++)
+    {
+      for (int i = 0; i < half; i++) 
+      {
+        coeff = (x(j)-i)/v(i);
+        coeff *= coeff;
+        if (coeff < 1. && coeff > 0.)
+          y(j) += u(i)*std::pow((1-coeff), 2);
+      }
+    }
+    return y;
+  }
+  void initPoints(VectorType& uv_ref, VectorType& x)
+  {
+    m_x = x;
+    m_y = this->model(uv_ref,x);
+  }
+  int operator()(const VectorType& uv, VectorType& fvec)
+  {
+    int m = Base::values(); 
+    int n = Base::inputs();
+    eigen_assert(uv.size()%2 == 0);
+    eigen_assert(uv.size() == n);
+    int half = n/2;
+    VectorBlock<const VectorType> u(uv, 0, half);
+    VectorBlock<const VectorType> v(uv, half, half);
+    fvec = m_y;
+    Scalar coeff;
+    for (int j = 0; j < m; j++)
+    {
+      for (int i = 0; i < half; i++)
+      {
+        coeff = (m_x(j)-i)/v(i);
+        coeff *= coeff;
+        if (coeff < 1. && coeff > 0.)
+          fvec(j) -= u(i)*std::pow((1-coeff), 2);
+      }
+    }
+    return 0;
+  }
+  
+  int df(const VectorType& uv, JacobianType& fjac)
+  {
+    int m = Base::values(); 
+    int n = Base::inputs();
+    eigen_assert(n == uv.size());
+    eigen_assert(fjac.rows() == m);
+    eigen_assert(fjac.cols() == n);
+    int half = n/2;
+    VectorBlock<const VectorType> u(uv, 0, half);
+    VectorBlock<const VectorType> v(uv, half, half);
+    Scalar coeff;
+    
+    //Derivatives with respect to u
+    for (int col = 0; col < half; col++)
+    {
+      for (int row = 0; row < m; row++)
+      {
+        coeff = (m_x(row)-col)/v(col);
+          coeff = coeff*coeff;
+        if(coeff < 1. && coeff > 0.)
+        {
+          fjac.coeffRef(row,col) = -(1-coeff)*(1-coeff);
+        }
+      }
+    }
+    //Derivatives with respect to v
+    for (int col = 0; col < half; col++)
+    {
+      for (int row = 0; row < m; row++)
+      {
+        coeff = (m_x(row)-col)/v(col);
+        coeff = coeff*coeff;
+        if(coeff < 1. && coeff > 0.)
+        {
+          fjac.coeffRef(row,col+half) = -4 * (u(col)/v(col))*coeff*(1-coeff);
+        }
+      }
+    }
+    return 0;
+  }
+  
+  VectorType m_x, m_y; //Data points
+};
+
+
+template<typename T>
+void test_sparseLM_T()
+{
+  typedef Matrix<T,Dynamic,1> VectorType;
+  
+  int inputs = 10;
+  int values = 2000;
+  sparseGaussianTest<T> sparse_gaussian(inputs, values);
+  VectorType uv(inputs),uv_ref(inputs);
+  VectorType x(values);
+  // Generate the reference solution 
+  uv_ref << -2, 1, 4 ,8, 6, 1.8, 1.2, 1.1, 1.9 , 3;
+  //Generate the reference data points
+  x.setRandom();
+  x = 10*x;
+  x.array() += 10;
+  sparse_gaussian.initPoints(uv_ref, x);
+  
+  
+  // Generate the initial parameters 
+  VectorBlock<VectorType> u(uv, 0, inputs/2); 
+  VectorBlock<VectorType> v(uv, inputs/2, inputs/2);
+  v.setOnes();
+  //Generate u or Solve for u from v
+  u.setOnes();
+  
+  // Solve the optimization problem
+  LevenbergMarquardt<sparseGaussianTest<T> > lm(sparse_gaussian);
+  int info;
+//   info = lm.minimize(uv);
+  
+  VERIFY_IS_EQUAL(info,1);
+    // Do a step by step solution and save the residual 
+  int maxiter = 200;
+  int iter = 0;
+  MatrixXd Err(values, maxiter);
+  MatrixXd Mod(values, maxiter);
+  LevenbergMarquardtSpace::Status status; 
+  status = lm.minimizeInit(uv);
+  if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+      return ;
+
+}
+EIGEN_DECLARE_TEST(sparseLM)
+{
+  CALL_SUBTEST_1(test_sparseLM_T<double>());
+  
+  // CALL_SUBTEST_2(test_sparseLM_T<std::complex<double>());
+}

diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
new file mode 100644
index 0000000..9453111
--- /dev/null
+++ b/test/sparse_basic.cpp

@@ -0,0 +1,760 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Daniel Gomez Ferro <dgomezferro@gmail.com>
+// Copyright (C) 2013 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
+static long g_dense_op_sparse_count = 0;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20;
+#endif
+
+#include "sparse.h"
+
+template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
+{
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef Matrix<StorageIndex,2,1> Vector2;
+  
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  //const Index inner = ref.innerSize();
+  //const Index outer = ref.outerSize();
+
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
+  enum { Flags = SparseMatrixType::Flags };
+
+  double density = (std::max)(8./(rows*cols), 0.01);
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  Scalar eps = 1e-6;
+
+  Scalar s1 = internal::random<Scalar>();
+  {
+    SparseMatrixType m(rows, cols);
+    DenseMatrix refMat = DenseMatrix::Zero(rows, cols);
+    DenseVector vec1 = DenseVector::Random(rows);
+
+    std::vector<Vector2> zeroCoords;
+    std::vector<Vector2> nonzeroCoords;
+    initSparse<Scalar>(density, refMat, m, 0, &zeroCoords, &nonzeroCoords);
+
+    // test coeff and coeffRef
+    for (std::size_t i=0; i<zeroCoords.size(); ++i)
+    {
+      VERIFY_IS_MUCH_SMALLER_THAN( m.coeff(zeroCoords[i].x(),zeroCoords[i].y()), eps );
+      if(internal::is_same<SparseMatrixType,SparseMatrix<Scalar,Flags> >::value)
+        VERIFY_RAISES_ASSERT( m.coeffRef(zeroCoords[i].x(),zeroCoords[i].y()) = 5 );
+    }
+    VERIFY_IS_APPROX(m, refMat);
+
+    if(!nonzeroCoords.empty()) {
+      m.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+      refMat.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+    }
+
+    VERIFY_IS_APPROX(m, refMat);
+
+      // test assertion
+      VERIFY_RAISES_ASSERT( m.coeffRef(-1,1) = 0 );
+      VERIFY_RAISES_ASSERT( m.coeffRef(0,m.cols()) = 0 );
+    }
+
+    // test insert (inner random)
+    {
+      DenseMatrix m1(rows,cols);
+      m1.setZero();
+      SparseMatrixType m2(rows,cols);
+      bool call_reserve = internal::random<int>()%2;
+      Index nnz = internal::random<int>(1,int(rows)/2);
+      if(call_reserve)
+      {
+        if(internal::random<int>()%2)
+          m2.reserve(VectorXi::Constant(m2.outerSize(), int(nnz)));
+        else
+          m2.reserve(m2.outerSize() * nnz);
+      }
+      g_realloc_count = 0;
+      for (Index j=0; j<cols; ++j)
+      {
+        for (Index k=0; k<nnz; ++k)
+        {
+          Index i = internal::random<Index>(0,rows-1);
+          if (m1.coeff(i,j)==Scalar(0))
+            m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
+        }
+      }
+      
+      if(call_reserve && !SparseMatrixType::IsRowMajor)
+      {
+        VERIFY(g_realloc_count==0);
+      }
+      
+      m2.finalize();
+      VERIFY_IS_APPROX(m2,m1);
+    }
+
+    // test insert (fully random)
+    {
+      DenseMatrix m1(rows,cols);
+      m1.setZero();
+      SparseMatrixType m2(rows,cols);
+      if(internal::random<int>()%2)
+        m2.reserve(VectorXi::Constant(m2.outerSize(), 2));
+      for (int k=0; k<rows*cols; ++k)
+      {
+        Index i = internal::random<Index>(0,rows-1);
+        Index j = internal::random<Index>(0,cols-1);
+        if ((m1.coeff(i,j)==Scalar(0)) && (internal::random<int>()%2))
+          m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
+        else
+        {
+          Scalar v = internal::random<Scalar>();
+          m2.coeffRef(i,j) += v;
+          m1(i,j) += v;
+        }
+      }
+      VERIFY_IS_APPROX(m2,m1);
+    }
+    
+    // test insert (un-compressed)
+    for(int mode=0;mode<4;++mode)
+    {
+      DenseMatrix m1(rows,cols);
+      m1.setZero();
+      SparseMatrixType m2(rows,cols);
+      VectorXi r(VectorXi::Constant(m2.outerSize(), ((mode%2)==0) ? int(m2.innerSize()) : std::max<int>(1,int(m2.innerSize())/8)));
+      m2.reserve(r);
+      for (Index k=0; k<rows*cols; ++k)
+      {
+        Index i = internal::random<Index>(0,rows-1);
+        Index j = internal::random<Index>(0,cols-1);
+        if (m1.coeff(i,j)==Scalar(0))
+          m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
+        if(mode==3)
+          m2.reserve(r);
+      }
+      if(internal::random<int>()%2)
+        m2.makeCompressed();
+      VERIFY_IS_APPROX(m2,m1);
+    }
+
+  // test basic computations
+  {
+    DenseMatrix refM1 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM2 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM3 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM4 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m1(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    SparseMatrixType m3(rows, cols);
+    SparseMatrixType m4(rows, cols);
+    initSparse<Scalar>(density, refM1, m1);
+    initSparse<Scalar>(density, refM2, m2);
+    initSparse<Scalar>(density, refM3, m3);
+    initSparse<Scalar>(density, refM4, m4);
+
+    if(internal::random<bool>())
+      m1.makeCompressed();
+
+    Index m1_nnz = m1.nonZeros();
+
+    VERIFY_IS_APPROX(m1*s1, refM1*s1);
+    VERIFY_IS_APPROX(m1+m2, refM1+refM2);
+    VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3);
+    VERIFY_IS_APPROX(m3.cwiseProduct(m1+m2), refM3.cwiseProduct(refM1+refM2));
+    VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2);
+    VERIFY_IS_APPROX(m4=m1/s1, refM1/s1);
+    VERIFY_IS_EQUAL(m4.nonZeros(), m1_nnz);
+
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.row(0).dot(refM2.row(0)));
+    else
+      VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.col(0)), refM1.col(0).dot(refM2.col(0)));
+
+    DenseVector rv = DenseVector::Random(m1.cols());
+    DenseVector cv = DenseVector::Random(m1.rows());
+    Index r = internal::random<Index>(0,m1.rows()-2);
+    Index c = internal::random<Index>(0,m1.cols()-1);
+    VERIFY_IS_APPROX(( m1.template block<1,Dynamic>(r,0,1,m1.cols()).dot(rv)) , refM1.row(r).dot(rv));
+    VERIFY_IS_APPROX(m1.row(r).dot(rv), refM1.row(r).dot(rv));
+    VERIFY_IS_APPROX(m1.col(c).dot(cv), refM1.col(c).dot(cv));
+
+    VERIFY_IS_APPROX(m1.conjugate(), refM1.conjugate());
+    VERIFY_IS_APPROX(m1.real(), refM1.real());
+
+    refM4.setRandom();
+    // sparse cwise* dense
+    VERIFY_IS_APPROX(m3.cwiseProduct(refM4), refM3.cwiseProduct(refM4));
+    // dense cwise* sparse
+    VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3));
+//     VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4);
+
+    // mixed sparse-dense
+    VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3);
+    VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4);
+    VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
+    VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3*RealScalar(0.5)).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3.cwiseProduct(m3)).eval(), RealScalar(0.5)*refM4 + refM3.cwiseProduct(refM3));
+
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3*RealScalar(0.5)).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (m3+m3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+    VERIFY_IS_APPROX(((refM3+m3)+RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM3 + (refM3+refM3));
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (refM3+m3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (m3+refM3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+
+
+    VERIFY_IS_APPROX(m1.sum(), refM1.sum());
+
+    m4 = m1; refM4 = m4;
+
+    VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+
+    VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
+    VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
+
+    refM3 = refM1;
+    
+    VERIFY_IS_APPROX(refM1+=m2, refM3+=refM2);
+    VERIFY_IS_APPROX(refM1-=m2, refM3-=refM2);
+
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =m2+refM4, refM3 =refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,10);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=m2+refM4, refM3+=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=m2+refM4, refM3-=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =refM4+m2, refM3 =refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=refM4+m2, refM3+=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=refM4+m2, refM3-=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =m2-refM4, refM3 =refM2-refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,20);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=m2-refM4, refM3+=refM2-refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=m2-refM4, refM3-=refM2-refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =refM4-m2, refM3 =refM4-refM2);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=refM4-m2, refM3+=refM4-refM2);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=refM4-m2, refM3-=refM4-refM2);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    refM3 = m3;
+
+    if (rows>=2 && cols>=2)
+    {
+      VERIFY_RAISES_ASSERT( m1 += m1.innerVector(0) );
+      VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) );
+      VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) );
+      VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) );
+    }
+    m1 = m4; refM1 = refM4;
+
+    // test aliasing
+    VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
+    VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval()));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
+    VERIFY_IS_APPROX((m1 = -m1.transpose()), (refM1 = -refM1.transpose().eval()));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
+    VERIFY_IS_APPROX((m1 += -m1), (refM1 += -refM1));
+    VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
+    m1 = m4; refM1 = refM4;
+
+    if(m1.isCompressed())
+    {
+      VERIFY_IS_APPROX(m1.coeffs().sum(), m1.sum());
+      m1.coeffs() += s1;
+      for(Index j = 0; j<m1.outerSize(); ++j)
+        for(typename SparseMatrixType::InnerIterator it(m1,j); it; ++it)
+          refM1(it.row(), it.col()) += s1;
+      VERIFY_IS_APPROX(m1, refM1);
+    }
+
+    // and/or
+    {
+      typedef SparseMatrix<bool, SparseMatrixType::Options, typename SparseMatrixType::StorageIndex> SpBool;
+      SpBool mb1 = m1.real().template cast<bool>();
+      SpBool mb2 = m2.real().template cast<bool>();
+      VERIFY_IS_EQUAL(mb1.template cast<int>().sum(), refM1.real().template cast<bool>().count());
+      VERIFY_IS_EQUAL((mb1 && mb2).template cast<int>().sum(), (refM1.real().template cast<bool>() && refM2.real().template cast<bool>()).count());
+      VERIFY_IS_EQUAL((mb1 || mb2).template cast<int>().sum(), (refM1.real().template cast<bool>() || refM2.real().template cast<bool>()).count());
+      SpBool mb3 = mb1 && mb2;
+      if(mb1.coeffs().all() && mb2.coeffs().all())
+      {
+        VERIFY_IS_EQUAL(mb3.nonZeros(), (refM1.real().template cast<bool>() && refM2.real().template cast<bool>()).count());
+      }
+    }
+  }
+
+  // test reverse iterators
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    std::vector<Scalar> ref_value(m2.innerSize());
+    std::vector<Index> ref_index(m2.innerSize());
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    for(Index j = 0; j<m2.outerSize(); ++j)
+    {
+      Index count_forward = 0;
+
+      for(typename SparseMatrixType::InnerIterator it(m2,j); it; ++it)
+      {
+        ref_value[ref_value.size()-1-count_forward] = it.value();
+        ref_index[ref_index.size()-1-count_forward] = it.index();
+        count_forward++;
+      }
+      Index count_reverse = 0;
+      for(typename SparseMatrixType::ReverseInnerIterator it(m2,j); it; --it)
+      {
+        VERIFY_IS_APPROX( std::abs(ref_value[ref_value.size()-count_forward+count_reverse])+1, std::abs(it.value())+1);
+        VERIFY_IS_EQUAL( ref_index[ref_index.size()-count_forward+count_reverse] , it.index());
+        count_reverse++;
+      }
+      VERIFY_IS_EQUAL(count_forward, count_reverse);
+    }
+  }
+
+  // test transpose
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    VERIFY_IS_APPROX(m2.transpose().eval(), refMat2.transpose().eval());
+    VERIFY_IS_APPROX(m2.transpose(), refMat2.transpose());
+
+    VERIFY_IS_APPROX(SparseMatrixType(m2.adjoint()), refMat2.adjoint());
+    
+    // check isApprox handles opposite storage order
+    typename Transpose<SparseMatrixType>::PlainObject m3(m2);
+    VERIFY(m2.isApprox(m3));
+  }
+
+  // test prune
+  {
+    SparseMatrixType m2(rows, cols);
+    DenseMatrix refM2(rows, cols);
+    refM2.setZero();
+    int countFalseNonZero = 0;
+    int countTrueNonZero = 0;
+    m2.reserve(VectorXi::Constant(m2.outerSize(), int(m2.innerSize())));
+    for (Index j=0; j<m2.cols(); ++j)
+    {
+      for (Index i=0; i<m2.rows(); ++i)
+      {
+        float x = internal::random<float>(0,1);
+        if (x<0.1f)
+        {
+          // do nothing
+        }
+        else if (x<0.5f)
+        {
+          countFalseNonZero++;
+          m2.insert(i,j) = Scalar(0);
+        }
+        else
+        {
+          countTrueNonZero++;
+          m2.insert(i,j) = Scalar(1);
+          refM2(i,j) = Scalar(1);
+        }
+      }
+    }
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    VERIFY(countFalseNonZero+countTrueNonZero == m2.nonZeros());
+    if(countTrueNonZero>0)
+      VERIFY_IS_APPROX(m2, refM2);
+    m2.prune(Scalar(1));
+    VERIFY(countTrueNonZero==m2.nonZeros());
+    VERIFY_IS_APPROX(m2, refM2);
+  }
+
+  // test setFromTriplets
+  {
+    typedef Triplet<Scalar,StorageIndex> TripletType;
+    std::vector<TripletType> triplets;
+    Index ntriplets = rows*cols;
+    triplets.reserve(ntriplets);
+    DenseMatrix refMat_sum  = DenseMatrix::Zero(rows,cols);
+    DenseMatrix refMat_prod = DenseMatrix::Zero(rows,cols);
+    DenseMatrix refMat_last = DenseMatrix::Zero(rows,cols);
+
+    for(Index i=0;i<ntriplets;++i)
+    {
+      StorageIndex r = internal::random<StorageIndex>(0,StorageIndex(rows-1));
+      StorageIndex c = internal::random<StorageIndex>(0,StorageIndex(cols-1));
+      Scalar v = internal::random<Scalar>();
+      triplets.push_back(TripletType(r,c,v));
+      refMat_sum(r,c) += v;
+      if(std::abs(refMat_prod(r,c))==0)
+        refMat_prod(r,c) = v;
+      else
+        refMat_prod(r,c) *= v;
+      refMat_last(r,c) = v;
+    }
+    SparseMatrixType m(rows,cols);
+    m.setFromTriplets(triplets.begin(), triplets.end());
+    VERIFY_IS_APPROX(m, refMat_sum);
+
+    m.setFromTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
+    VERIFY_IS_APPROX(m, refMat_prod);
+#if (EIGEN_COMP_CXXVER >= 11)
+    m.setFromTriplets(triplets.begin(), triplets.end(), [] (Scalar,Scalar b) { return b; });
+    VERIFY_IS_APPROX(m, refMat_last);
+#endif
+  }
+  
+  // test Map
+  {
+    DenseMatrix refMat2(rows, cols), refMat3(rows, cols);
+    SparseMatrixType m2(rows, cols), m3(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    initSparse<Scalar>(density, refMat3, m3);
+    {
+      Map<SparseMatrixType> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      Map<SparseMatrixType> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+    {
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+
+    Index i = internal::random<Index>(0,rows-1);
+    Index j = internal::random<Index>(0,cols-1);
+    m2.coeffRef(i,j) = 123;
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    Map<SparseMatrixType> mapMat2(rows, cols, m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(),  m2.innerNonZeroPtr());
+    VERIFY_IS_EQUAL(m2.coeff(i,j),Scalar(123));
+    VERIFY_IS_EQUAL(mapMat2.coeff(i,j),Scalar(123));
+    mapMat2.coeffRef(i,j) = -123;
+    VERIFY_IS_EQUAL(m2.coeff(i,j),Scalar(-123));
+  }
+
+  // test triangularView
+  {
+    DenseMatrix refMat2(rows, cols), refMat3(rows, cols);
+    SparseMatrixType m2(rows, cols), m3(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    refMat3 = refMat2.template triangularView<Lower>();
+    m3 = m2.template triangularView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 = refMat2.template triangularView<Upper>();
+    m3 = m2.template triangularView<Upper>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    {
+      refMat3 = refMat2.template triangularView<UnitUpper>();
+      m3 = m2.template triangularView<UnitUpper>();
+      VERIFY_IS_APPROX(m3, refMat3);
+
+      refMat3 = refMat2.template triangularView<UnitLower>();
+      m3 = m2.template triangularView<UnitLower>();
+      VERIFY_IS_APPROX(m3, refMat3);
+    }
+
+    refMat3 = refMat2.template triangularView<StrictlyUpper>();
+    m3 = m2.template triangularView<StrictlyUpper>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 = refMat2.template triangularView<StrictlyLower>();
+    m3 = m2.template triangularView<StrictlyLower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    // check sparse-triangular to dense
+    refMat3 = m2.template triangularView<StrictlyUpper>();
+    VERIFY_IS_APPROX(refMat3, DenseMatrix(refMat2.template triangularView<StrictlyUpper>()));
+  }
+  
+  // test selfadjointView
+  if(!SparseMatrixType::IsRowMajor)
+  {
+    DenseMatrix refMat2(rows, rows), refMat3(rows, rows);
+    SparseMatrixType m2(rows, rows), m3(rows, rows);
+    initSparse<Scalar>(density, refMat2, m2);
+    refMat3 = refMat2.template selfadjointView<Lower>();
+    m3 = m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 += refMat2.template selfadjointView<Lower>();
+    m3 += m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 -= refMat2.template selfadjointView<Lower>();
+    m3 -= m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    // selfadjointView only works for square matrices:
+    SparseMatrixType m4(rows, rows+1);
+    VERIFY_RAISES_ASSERT(m4.template selfadjointView<Lower>());
+    VERIFY_RAISES_ASSERT(m4.template selfadjointView<Upper>());
+  }
+  
+  // test sparseView
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, rows);
+    SparseMatrixType m2(rows, rows);
+    initSparse<Scalar>(density, refMat2, m2);
+    VERIFY_IS_APPROX(m2.eval(), refMat2.sparseView().eval());
+
+    // sparse view on expressions:
+    VERIFY_IS_APPROX((s1*m2).eval(), (s1*refMat2).sparseView().eval());
+    VERIFY_IS_APPROX((m2+m2).eval(), (refMat2+refMat2).sparseView().eval());
+    VERIFY_IS_APPROX((m2*m2).eval(), (refMat2.lazyProduct(refMat2)).sparseView().eval());
+    VERIFY_IS_APPROX((m2*m2).eval(), (refMat2*refMat2).sparseView().eval());
+  }
+
+  // test diagonal
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    VERIFY_IS_APPROX(m2.diagonal(), refMat2.diagonal().eval());
+    DenseVector d = m2.diagonal();
+    VERIFY_IS_APPROX(d, refMat2.diagonal().eval());
+    d = m2.diagonal().array();
+    VERIFY_IS_APPROX(d, refMat2.diagonal().eval());
+    VERIFY_IS_APPROX(const_cast<const SparseMatrixType&>(m2).diagonal(), refMat2.diagonal().eval());
+    
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag);
+    m2.diagonal()      += refMat2.diagonal();
+    refMat2.diagonal() += refMat2.diagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+  
+  // test diagonal to sparse
+  {
+    DenseVector d = DenseVector::Random(rows);
+    DenseMatrix refMat2 = d.asDiagonal();
+    SparseMatrixType m2;
+    m2 = d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+    SparseMatrixType m3(d.asDiagonal());
+    VERIFY_IS_APPROX(m3, refMat2);
+    refMat2 += d.asDiagonal();
+    m2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+    m2.setZero();       m2 += d.asDiagonal();
+    refMat2.setZero();  refMat2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+    m2.setZero();       m2 -= d.asDiagonal();
+    refMat2.setZero();  refMat2 -= d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    initSparse<Scalar>(density, refMat2, m2);
+    m2.makeCompressed();
+    m2 += d.asDiagonal();
+    refMat2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    initSparse<Scalar>(density, refMat2, m2);
+    m2.makeCompressed();
+    VectorXi res(rows);
+    for(Index i=0; i<rows; ++i)
+      res(i) = internal::random<int>(0,3);
+    m2.reserve(res);
+    m2 -= d.asDiagonal();
+    refMat2 -= d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+  
+  // test conservative resize
+  {
+      std::vector< std::pair<StorageIndex,StorageIndex> > inc;
+      if(rows > 3 && cols > 2)
+        inc.push_back(std::pair<StorageIndex,StorageIndex>(-3,-2));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(0,0));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(3,2));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(3,0));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(0,3));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(0,-1));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(-1,0));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(-1,-1));
+
+      for(size_t i = 0; i< inc.size(); i++) {
+        StorageIndex incRows = inc[i].first;
+        StorageIndex incCols = inc[i].second;
+        SparseMatrixType m1(rows, cols);
+        DenseMatrix refMat1 = DenseMatrix::Zero(rows, cols);
+        initSparse<Scalar>(density, refMat1, m1);
+
+        SparseMatrixType m2 = m1;
+        m2.makeCompressed();
+
+        m1.conservativeResize(rows+incRows, cols+incCols);
+        m2.conservativeResize(rows+incRows, cols+incCols);
+        refMat1.conservativeResize(rows+incRows, cols+incCols);
+        if (incRows > 0) refMat1.bottomRows(incRows).setZero();
+        if (incCols > 0) refMat1.rightCols(incCols).setZero();
+
+        VERIFY_IS_APPROX(m1, refMat1);
+        VERIFY_IS_APPROX(m2, refMat1);
+
+        // Insert new values
+        if (incRows > 0) 
+          m1.insert(m1.rows()-1, 0) = refMat1(refMat1.rows()-1, 0) = 1;
+        if (incCols > 0) 
+          m1.insert(0, m1.cols()-1) = refMat1(0, refMat1.cols()-1) = 1;
+
+        VERIFY_IS_APPROX(m1, refMat1);
+
+
+      }
+  }
+
+  // test Identity matrix
+  {
+    DenseMatrix refMat1 = DenseMatrix::Identity(rows, rows);
+    SparseMatrixType m1(rows, rows);
+    m1.setIdentity();
+    VERIFY_IS_APPROX(m1, refMat1);
+    for(int k=0; k<rows*rows/4; ++k)
+    {
+      Index i = internal::random<Index>(0,rows-1);
+      Index j = internal::random<Index>(0,rows-1);
+      Scalar v = internal::random<Scalar>();
+      m1.coeffRef(i,j) = v;
+      refMat1.coeffRef(i,j) = v;
+      VERIFY_IS_APPROX(m1, refMat1);
+      if(internal::random<Index>(0,10)<2)
+        m1.makeCompressed();
+    }
+    m1.setIdentity();
+    refMat1.setIdentity();
+    VERIFY_IS_APPROX(m1, refMat1);
+  }
+
+  // test array/vector of InnerIterator
+  {
+    typedef typename SparseMatrixType::InnerIterator IteratorType;
+
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    IteratorType static_array[2];
+    static_array[0] = IteratorType(m2,0);
+    static_array[1] = IteratorType(m2,m2.outerSize()-1);
+    VERIFY( static_array[0] || m2.innerVector(static_array[0].outer()).nonZeros() == 0 );
+    VERIFY( static_array[1] || m2.innerVector(static_array[1].outer()).nonZeros() == 0 );
+    if(static_array[0] && static_array[1])
+    {
+      ++(static_array[1]);
+      static_array[1] = IteratorType(m2,0);
+      VERIFY( static_array[1] );
+      VERIFY( static_array[1].index() == static_array[0].index() );
+      VERIFY( static_array[1].outer() == static_array[0].outer() );
+      VERIFY( static_array[1].value() == static_array[0].value() );
+    }
+
+    std::vector<IteratorType> iters(2);
+    iters[0] = IteratorType(m2,0);
+    iters[1] = IteratorType(m2,m2.outerSize()-1);
+  }
+
+  // test reserve with empty rows/columns
+  {
+    SparseMatrixType m1(0,cols);
+    m1.reserve(ArrayXi::Constant(m1.outerSize(),1));
+    SparseMatrixType m2(rows,0);
+    m2.reserve(ArrayXi::Constant(m2.outerSize(),1));
+  }
+}
+
+
+template<typename SparseMatrixType>
+void big_sparse_triplet(Index rows, Index cols, double density) {
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef Triplet<Scalar,Index> TripletType;
+  std::vector<TripletType> triplets;
+  double nelements = density * rows*cols;
+  VERIFY(nelements>=0 && nelements < static_cast<double>(NumTraits<StorageIndex>::highest()));
+  Index ntriplets = Index(nelements);
+  triplets.reserve(ntriplets);
+  Scalar sum = Scalar(0);
+  for(Index i=0;i<ntriplets;++i)
+  {
+    Index r = internal::random<Index>(0,rows-1);
+    Index c = internal::random<Index>(0,cols-1);
+    // use positive values to prevent numerical cancellation errors in sum
+    Scalar v = numext::abs(internal::random<Scalar>());
+    triplets.push_back(TripletType(r,c,v));
+    sum += v;
+  }
+  SparseMatrixType m(rows,cols);
+  m.setFromTriplets(triplets.begin(), triplets.end());
+  VERIFY(m.nonZeros() <= ntriplets);
+  VERIFY_IS_APPROX(sum, m.sum());
+}
+
+template<int>
+void bug1105()
+{
+  // Regression test for bug 1105
+  int n = Eigen::internal::random<int>(200,600);
+  SparseMatrix<std::complex<double>,0, long> mat(n, n);
+  std::complex<double> val;
+
+  for(int i=0; i<n; ++i)
+  {
+    mat.coeffRef(i, i%(n/10)) = val;
+    VERIFY(mat.data().allocatedSize()<20*n);
+  }
+}
+
+#ifndef EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA
+
+EIGEN_DECLARE_TEST(sparse_basic)
+{
+  g_dense_op_sparse_count = 0;  // Suppresses compiler warning.
+  for(int i = 0; i < g_repeat; i++) {
+    int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(1, 1)) ));
+    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(8, 8)) ));
+    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
+    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(r, c)) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,RowMajor,long int>(r, c)) ));
+    
+    r = Eigen::internal::random<int>(1,100);
+    c = Eigen::internal::random<int>(1,100);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    
+    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+  }
+
+  // Regression test for bug 900: (manually insert higher values here, if you have enough RAM):
+  CALL_SUBTEST_3((big_sparse_triplet<SparseMatrix<float, RowMajor, int> >(10000, 10000, 0.125)));
+  CALL_SUBTEST_4((big_sparse_triplet<SparseMatrix<double, ColMajor, long int> >(10000, 10000, 0.125)));
+
+  CALL_SUBTEST_7( bug1105<0>() );
+}
+#endif

diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
new file mode 100644
index 0000000..b4905b0
--- /dev/null
+++ b/test/sparse_block.cpp

@@ -0,0 +1,323 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse.h"
+#include "AnnoyingScalar.h"
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
+innervec(T& A, Index i)
+{
+  return A.row(i);
+}
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==0, typename T::ColXpr>::type
+innervec(T& A, Index i)
+{
+  return A.col(i);
+}
+
+template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
+{
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  const Index inner = ref.innerSize();
+  const Index outer = ref.outerSize();
+
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+
+  double density = (std::max)(8./(rows*cols), 0.01);
+  typedef Matrix<Scalar,Dynamic,Dynamic,SparseMatrixType::IsRowMajor?RowMajor:ColMajor> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+  typedef SparseVector<Scalar> SparseVectorType;
+
+  Scalar s1 = internal::random<Scalar>();
+  {
+    SparseMatrixType m(rows, cols);
+    DenseMatrix refMat = DenseMatrix::Zero(rows, cols);
+    initSparse<Scalar>(density, refMat, m);
+
+    VERIFY_IS_APPROX(m, refMat);
+
+    // test InnerIterators and Block expressions
+    for (int t=0; t<10; ++t)
+    {
+      Index j = internal::random<Index>(0,cols-2);
+      Index i = internal::random<Index>(0,rows-2);
+      Index w = internal::random<Index>(1,cols-j);
+      Index h = internal::random<Index>(1,rows-i);
+
+      VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w));
+      for(Index c=0; c<w; c++)
+      {
+        VERIFY_IS_APPROX(m.block(i,j,h,w).col(c), refMat.block(i,j,h,w).col(c));
+        for(Index r=0; r<h; r++)
+        {
+          VERIFY_IS_APPROX(m.block(i,j,h,w).col(c).coeff(r), refMat.block(i,j,h,w).col(c).coeff(r));
+          VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
+        }
+      }
+      for(Index r=0; r<h; r++)
+      {
+        VERIFY_IS_APPROX(m.block(i,j,h,w).row(r), refMat.block(i,j,h,w).row(r));
+        for(Index c=0; c<w; c++)
+        {
+          VERIFY_IS_APPROX(m.block(i,j,h,w).row(r).coeff(c), refMat.block(i,j,h,w).row(r).coeff(c));
+          VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
+        }
+      }
+      
+      VERIFY_IS_APPROX(m.middleCols(j,w), refMat.middleCols(j,w));
+      VERIFY_IS_APPROX(m.middleRows(i,h), refMat.middleRows(i,h));
+      for(Index r=0; r<h; r++)
+      {
+        VERIFY_IS_APPROX(m.middleCols(j,w).row(r), refMat.middleCols(j,w).row(r));
+        VERIFY_IS_APPROX(m.middleRows(i,h).row(r), refMat.middleRows(i,h).row(r));
+        for(Index c=0; c<w; c++)
+        {
+          VERIFY_IS_APPROX(m.col(c).coeff(r), refMat.col(c).coeff(r));
+          VERIFY_IS_APPROX(m.row(r).coeff(c), refMat.row(r).coeff(c));
+          
+          VERIFY_IS_APPROX(m.middleCols(j,w).coeff(r,c), refMat.middleCols(j,w).coeff(r,c));
+          VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
+          if(m.middleCols(j,w).coeff(r,c) != Scalar(0))
+          {
+            VERIFY_IS_APPROX(m.middleCols(j,w).coeffRef(r,c), refMat.middleCols(j,w).coeff(r,c));
+          }
+          if(m.middleRows(i,h).coeff(r,c) != Scalar(0))
+          {
+            VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
+          }
+        }
+      }
+      for(Index c=0; c<w; c++)
+      {
+        VERIFY_IS_APPROX(m.middleCols(j,w).col(c), refMat.middleCols(j,w).col(c));
+        VERIFY_IS_APPROX(m.middleRows(i,h).col(c), refMat.middleRows(i,h).col(c));
+      }
+    }
+
+    for(Index c=0; c<cols; c++)
+    {
+      VERIFY_IS_APPROX(m.col(c) + m.col(c), (m + m).col(c));
+      VERIFY_IS_APPROX(m.col(c) + m.col(c), refMat.col(c) + refMat.col(c));
+    }
+
+    for(Index r=0; r<rows; r++)
+    {
+      VERIFY_IS_APPROX(m.row(r) + m.row(r), (m + m).row(r));
+      VERIFY_IS_APPROX(m.row(r) + m.row(r), refMat.row(r) + refMat.row(r));
+    }
+  }
+
+  // test innerVector()
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    Index j0 = internal::random<Index>(0,outer-1);
+    Index j1 = internal::random<Index>(0,outer-1);
+    Index r0 = internal::random<Index>(0,rows-1);
+    Index c0 = internal::random<Index>(0,cols-1);
+
+    VERIFY_IS_APPROX(m2.innerVector(j0), innervec(refMat2,j0));
+    VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), innervec(refMat2,j0)+innervec(refMat2,j1));
+
+    m2.innerVector(j0) *= Scalar(2);
+    innervec(refMat2,j0) *= Scalar(2);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) *= Scalar(3);
+    refMat2.row(r0) *= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) *= Scalar(4);
+    refMat2.col(c0) *= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) /= Scalar(3);
+    refMat2.row(r0) /= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) /= Scalar(4);
+    refMat2.col(c0) /= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    SparseVectorType v1;
+    VERIFY_IS_APPROX(v1 = m2.col(c0) * 4, refMat2.col(c0)*4);
+    VERIFY_IS_APPROX(v1 = m2.row(r0) * 4, refMat2.row(r0).transpose()*4);
+
+    SparseMatrixType m3(rows,cols);
+    m3.reserve(VectorXi::Constant(outer,int(inner/2)));
+    for(Index j=0; j<outer; ++j)
+      for(Index k=0; k<(std::min)(j,inner); ++k)
+        m3.insertByOuterInner(j,k) = internal::convert_index<StorageIndex>(k+1);
+    for(Index j=0; j<(std::min)(outer, inner); ++j)
+    {
+      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
+      if(j>0)
+        VERIFY(RealScalar(j)==numext::real(m3.innerVector(j).lastCoeff()));
+    }
+    m3.makeCompressed();
+    for(Index j=0; j<(std::min)(outer, inner); ++j)
+    {
+      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
+      if(j>0)
+        VERIFY(RealScalar(j)==numext::real(m3.innerVector(j).lastCoeff()));
+    }
+
+    VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros());
+
+//     m2.innerVector(j0) = 2*m2.innerVector(j1);
+//     refMat2.col(j0) = 2*refMat2.col(j1);
+//     VERIFY_IS_APPROX(m2, refMat2);
+  }
+
+  // test innerVectors()
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    if(internal::random<float>(0,1)>0.5f) m2.makeCompressed();
+    Index j0 = internal::random<Index>(0,outer-2);
+    Index j1 = internal::random<Index>(0,outer-2);
+    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
+                       refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0));
+    else
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
+                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
+    
+    VERIFY_IS_APPROX(m2, refMat2);
+    
+    VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros());
+    
+    m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0);
+    if(SparseMatrixType::IsRowMajor)
+      refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval();
+    else
+      refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval();
+    
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+
+  // test generic blocks
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    Index j0 = internal::random<Index>(0,outer-2);
+    Index j1 = internal::random<Index>(0,outer-2);
+    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0));
+    
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols),
+                      refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0),
+                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
+      
+    Index i = internal::random<Index>(0,m2.outerSize()-1);
+    if(SparseMatrixType::IsRowMajor) {
+      m2.innerVector(i) = m2.innerVector(i) * s1;
+      refMat2.row(i) = refMat2.row(i) * s1;
+      VERIFY_IS_APPROX(m2,refMat2);
+    } else {
+      m2.innerVector(i) = m2.innerVector(i) * s1;
+      refMat2.col(i) = refMat2.col(i) * s1;
+      VERIFY_IS_APPROX(m2,refMat2);
+    }
+    
+    Index r0 = internal::random<Index>(0,rows-2);
+    Index c0 = internal::random<Index>(0,cols-2);
+    Index r1 = internal::random<Index>(1,rows-r0);
+    Index c1 = internal::random<Index>(1,cols-c0);
+    
+    VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0));
+    VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0));
+    
+    VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0));
+    VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0));
+
+    VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1));
+    VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1));
+
+    if(m2.nonZeros()>0)
+    {
+      VERIFY_IS_APPROX(m2, refMat2);
+      SparseMatrixType m3(rows, cols);
+      DenseMatrix refMat3(rows, cols); refMat3.setZero();
+      Index n = internal::random<Index>(1,10);
+      for(Index k=0; k<n; ++k)
+      {
+        Index o1 = internal::random<Index>(0,outer-1);
+        Index o2 = internal::random<Index>(0,outer-1);
+        if(SparseMatrixType::IsRowMajor)
+        {
+          m3.innerVector(o1) = m2.row(o2);
+          refMat3.row(o1) = refMat2.row(o2);
+        }
+        else
+        {
+          m3.innerVector(o1) = m2.col(o2);
+          refMat3.col(o1) = refMat2.col(o2);
+        }
+        if(internal::random<bool>())
+          m3.makeCompressed();
+      }
+      if(m3.nonZeros()>0)
+      VERIFY_IS_APPROX(m3, refMat3);
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(sparse_block)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(1, 1)) ));
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(8, 8)) ));
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_block(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_block(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
+    
+    CALL_SUBTEST_3(( sparse_block(SparseMatrix<double,ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_3(( sparse_block(SparseMatrix<double,RowMajor,long int>(r, c)) ));
+    
+    r = Eigen::internal::random<int>(1,100);
+    c = Eigen::internal::random<int>(1,100);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    
+    CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    CALL_SUBTEST_5((  sparse_block(SparseMatrix<AnnoyingScalar>(r,c)) ));
+  }
+}

diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
new file mode 100644
index 0000000..e93493c
--- /dev/null
+++ b/test/sparse_permutations.cpp

@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+static long int nb_transposed_copies;
+#define EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN {nb_transposed_copies++;}
+#define VERIFY_TRANSPOSITION_COUNT(XPR,N) {\
+    nb_transposed_copies = 0; \
+    XPR; \
+    if(nb_transposed_copies!=N) std::cerr << "nb_transposed_copies == " << nb_transposed_copies << "\n"; \
+    VERIFY( (#XPR) && nb_transposed_copies==N ); \
+  }
+
+#include "sparse.h"
+
+template<typename T>
+bool is_sorted(const T& mat) {
+  for(Index k = 0; k<mat.outerSize(); ++k)
+  {
+    Index prev = -1;
+    for(typename T::InnerIterator it(mat,k); it; ++it)
+    {
+      if(prev>=it.index())
+        return false;
+      prev = it.index();
+    }
+  }
+  return true;
+}
+
+template<typename T>
+typename internal::nested_eval<T,1>::type eval(const T &xpr)
+{
+  VERIFY( int(internal::nested_eval<T,1>::type::Flags&RowMajorBit) == int(internal::evaluator<T>::Flags&RowMajorBit) );
+  return xpr;
+}
+
+template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(const SparseMatrixType& ref)
+{
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, OtherStorage, StorageIndex> OtherSparseMatrixType;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+//   bool IsRowMajor1 = SparseMatrixType::IsRowMajor;
+//   bool IsRowMajor2 = OtherSparseMatrixType::IsRowMajor;
+  
+  double density = (std::max)(8./(rows*cols), 0.01);
+  
+  SparseMatrixType mat(rows, cols), up(rows,cols), lo(rows,cols);
+  OtherSparseMatrixType res;
+  DenseMatrix mat_d = DenseMatrix::Zero(rows, cols), up_sym_d, lo_sym_d, res_d;
+  
+  initSparse<Scalar>(density, mat_d, mat, 0);
+
+  up = mat.template triangularView<Upper>();
+  lo = mat.template triangularView<Lower>();
+  
+  up_sym_d = mat_d.template selfadjointView<Upper>();
+  lo_sym_d = mat_d.template selfadjointView<Lower>();
+  
+  VERIFY_IS_APPROX(mat, mat_d);
+  VERIFY_IS_APPROX(up, DenseMatrix(mat_d.template triangularView<Upper>()));
+  VERIFY_IS_APPROX(lo, DenseMatrix(mat_d.template triangularView<Lower>()));
+  
+  PermutationMatrix<Dynamic> p, p_null;
+  VectorI pi;
+  randomPermutationVector(pi, cols);
+  p.indices() = pi;
+
+  VERIFY( is_sorted( ::eval(mat*p) ));
+  VERIFY( is_sorted( res = mat*p ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p), 0);
+  //VERIFY_TRANSPOSITION_COUNT( res = mat*p, IsRowMajor ? 1 : 0 );
+  res_d = mat_d*p;
+  VERIFY(res.isApprox(res_d) && "mat*p");
+
+  VERIFY( is_sorted( ::eval(p*mat) ));
+  VERIFY( is_sorted( res = p*mat ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(p*mat), 0);
+  res_d = p*mat_d;
+  VERIFY(res.isApprox(res_d) && "p*mat");
+
+  VERIFY( is_sorted( (mat*p).eval() ));
+  VERIFY( is_sorted( res = mat*p.inverse() ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p.inverse()), 0);
+  res_d = mat*p.inverse();
+  VERIFY(res.isApprox(res_d) && "mat*inv(p)");
+
+  VERIFY( is_sorted( (p*mat+p*mat).eval() ));
+  VERIFY( is_sorted( res = p.inverse()*mat ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(p.inverse()*mat), 0);
+  res_d = p.inverse()*mat_d;
+  VERIFY(res.isApprox(res_d) && "inv(p)*mat");
+
+  VERIFY( is_sorted( (p * mat * p.inverse()).eval() ));
+  VERIFY( is_sorted( res = mat.twistedBy(p) ));
+  VERIFY_TRANSPOSITION_COUNT( ::eval(p * mat * p.inverse()), 0);
+  res_d = (p * mat_d) * p.inverse();
+  VERIFY(res.isApprox(res_d) && "p*mat*inv(p)");
+
+  
+  VERIFY( is_sorted( res = mat.template selfadjointView<Upper>().twistedBy(p_null) ));
+  res_d = up_sym_d;
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper to full");
+  
+  VERIFY( is_sorted( res = mat.template selfadjointView<Lower>().twistedBy(p_null) ));
+  res_d = lo_sym_d;
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower to full");
+  
+  
+  VERIFY( is_sorted( res = up.template selfadjointView<Upper>().twistedBy(p_null) ));
+  res_d = up_sym_d;
+  VERIFY(res.isApprox(res_d) && "upper selfadjoint to full");
+  
+  VERIFY( is_sorted( res = lo.template selfadjointView<Lower>().twistedBy(p_null) ));
+  res_d = lo_sym_d;
+  VERIFY(res.isApprox(res_d) && "lower selfadjoint full");
+
+
+  VERIFY( is_sorted( res = mat.template selfadjointView<Upper>() ));
+  res_d = up_sym_d;
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper to full");
+
+  VERIFY( is_sorted( res = mat.template selfadjointView<Lower>() ));
+  res_d = lo_sym_d;
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower to full");
+
+  VERIFY( is_sorted( res = up.template selfadjointView<Upper>() ));
+  res_d = up_sym_d;
+  VERIFY(res.isApprox(res_d) && "upper selfadjoint to full");
+
+  VERIFY( is_sorted( res = lo.template selfadjointView<Lower>() ));
+  res_d = lo_sym_d;
+  VERIFY(res.isApprox(res_d) && "lower selfadjoint full");
+
+
+  res.template selfadjointView<Upper>() = mat.template selfadjointView<Upper>();
+  res_d = up_sym_d.template triangularView<Upper>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper to upper");
+
+  res.template selfadjointView<Lower>() = mat.template selfadjointView<Upper>();
+  res_d = up_sym_d.template triangularView<Lower>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper to lower");
+
+  res.template selfadjointView<Upper>() = mat.template selfadjointView<Lower>();
+  res_d = lo_sym_d.template triangularView<Upper>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower to upper");
+
+  res.template selfadjointView<Lower>() = mat.template selfadjointView<Lower>();
+  res_d = lo_sym_d.template triangularView<Lower>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower to lower");
+
+  
+  
+  res.template selfadjointView<Upper>() = mat.template selfadjointView<Upper>().twistedBy(p);
+  res_d = ((p * up_sym_d) * p.inverse()).eval().template triangularView<Upper>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper twisted to upper");
+  
+  res.template selfadjointView<Upper>() = mat.template selfadjointView<Lower>().twistedBy(p);
+  res_d = ((p * lo_sym_d) * p.inverse()).eval().template triangularView<Upper>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower twisted to upper");
+  
+  res.template selfadjointView<Lower>() = mat.template selfadjointView<Lower>().twistedBy(p);
+  res_d = ((p * lo_sym_d) * p.inverse()).eval().template triangularView<Lower>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower twisted to lower");
+  
+  res.template selfadjointView<Lower>() = mat.template selfadjointView<Upper>().twistedBy(p);
+  res_d = ((p * up_sym_d) * p.inverse()).eval().template triangularView<Lower>();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper twisted to lower");
+  
+  
+  res.template selfadjointView<Upper>() = up.template selfadjointView<Upper>().twistedBy(p);
+  res_d = ((p * up_sym_d) * p.inverse()).eval().template triangularView<Upper>();
+  VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to upper");
+  
+  res.template selfadjointView<Upper>() = lo.template selfadjointView<Lower>().twistedBy(p);
+  res_d = ((p * lo_sym_d) * p.inverse()).eval().template triangularView<Upper>();
+  VERIFY(res.isApprox(res_d) && "lower selfadjoint twisted to upper");
+  
+  res.template selfadjointView<Lower>() = lo.template selfadjointView<Lower>().twistedBy(p);
+  res_d = ((p * lo_sym_d) * p.inverse()).eval().template triangularView<Lower>();
+  VERIFY(res.isApprox(res_d) && "lower selfadjoint twisted to lower");
+  
+  res.template selfadjointView<Lower>() = up.template selfadjointView<Upper>().twistedBy(p);
+  res_d = ((p * up_sym_d) * p.inverse()).eval().template triangularView<Lower>();
+  VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to lower");
+
+  
+  VERIFY( is_sorted( res = mat.template selfadjointView<Upper>().twistedBy(p) ));
+  res_d = (p * up_sym_d) * p.inverse();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint upper twisted to full");
+  
+  VERIFY( is_sorted( res = mat.template selfadjointView<Lower>().twistedBy(p) ));
+  res_d = (p * lo_sym_d) * p.inverse();
+  VERIFY(res.isApprox(res_d) && "full selfadjoint lower twisted to full");
+  
+  VERIFY( is_sorted( res = up.template selfadjointView<Upper>().twistedBy(p) ));
+  res_d = (p * up_sym_d) * p.inverse();
+  VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to full");
+  
+  VERIFY( is_sorted( res = lo.template selfadjointView<Lower>().twistedBy(p) ));
+  res_d = (p * lo_sym_d) * p.inverse();
+  VERIFY(res.isApprox(res_d) && "lower selfadjoint twisted to full");
+}
+
+template<typename Scalar> void sparse_permutations_all(int size)
+{
+  CALL_SUBTEST(( sparse_permutations<ColMajor>(SparseMatrix<Scalar, ColMajor>(size,size)) ));
+  CALL_SUBTEST(( sparse_permutations<ColMajor>(SparseMatrix<Scalar, RowMajor>(size,size)) ));
+  CALL_SUBTEST(( sparse_permutations<RowMajor>(SparseMatrix<Scalar, ColMajor>(size,size)) ));
+  CALL_SUBTEST(( sparse_permutations<RowMajor>(SparseMatrix<Scalar, RowMajor>(size,size)) ));
+}
+
+EIGEN_DECLARE_TEST(sparse_permutations)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int s = Eigen::internal::random<int>(1,50);
+    CALL_SUBTEST_1((  sparse_permutations_all<double>(s) ));
+    CALL_SUBTEST_2((  sparse_permutations_all<std::complex<double> >(s) ));
+  }
+
+  VERIFY((internal::is_same<internal::permutation_matrix_product<SparseMatrix<double>,OnTheRight,false,SparseShape>::ReturnType,
+                            internal::nested_eval<Product<SparseMatrix<double>,PermutationMatrix<Dynamic,Dynamic>,AliasFreeProduct>,1>::type>::value));
+
+  VERIFY((internal::is_same<internal::permutation_matrix_product<SparseMatrix<double>,OnTheLeft,false,SparseShape>::ReturnType,
+                            internal::nested_eval<Product<PermutationMatrix<Dynamic,Dynamic>,SparseMatrix<double>,AliasFreeProduct>,1>::type>::value));
+}

diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
new file mode 100644
index 0000000..6e85f69
--- /dev/null
+++ b/test/sparse_product.cpp

@@ -0,0 +1,477 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(_MSC_VER) && (_MSC_VER==1800)
+// This unit test takes forever to compile in Release mode with MSVC 2013,
+// multiple hours. So let's switch off optimization for this one.
+#pragma optimize("",off)
+#endif
+
+static long int nb_temporaries;
+
+inline void on_temporary_creation() {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  nb_temporaries++;
+}
+
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); }
+
+#include "sparse.h"
+
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    CALL_SUBTEST( XPR ); \
+    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    VERIFY( (#XPR) && nb_temporaries==N ); \
+  }
+
+
+
+template<typename SparseMatrixType> void sparse_product()
+{
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  Index n = 100;
+  const Index rows  = internal::random<Index>(1,n);
+  const Index cols  = internal::random<Index>(1,n);
+  const Index depth = internal::random<Index>(1,n);
+  typedef typename SparseMatrixType::Scalar Scalar;
+  enum { Flags = SparseMatrixType::Flags };
+
+  double density = (std::max)(8./(rows*cols), 0.2);
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+  typedef SparseVector<Scalar,0,StorageIndex> ColSpVector;
+  typedef SparseVector<Scalar,RowMajor,StorageIndex> RowSpVector;
+
+  Scalar s1 = internal::random<Scalar>();
+  Scalar s2 = internal::random<Scalar>();
+
+  // test matrix-matrix product
+  {
+    DenseMatrix refMat2  = DenseMatrix::Zero(rows, depth);
+    DenseMatrix refMat2t = DenseMatrix::Zero(depth, rows);
+    DenseMatrix refMat3  = DenseMatrix::Zero(depth, cols);
+    DenseMatrix refMat3t = DenseMatrix::Zero(cols, depth);
+    DenseMatrix refMat4  = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refMat4t = DenseMatrix::Zero(cols, rows);
+    DenseMatrix refMat5  = DenseMatrix::Random(depth, cols);
+    DenseMatrix refMat6  = DenseMatrix::Random(rows, rows);
+    DenseMatrix dm4 = DenseMatrix::Zero(rows, rows);
+//     DenseVector dv1 = DenseVector::Random(rows);
+    SparseMatrixType m2 (rows, depth);
+    SparseMatrixType m2t(depth, rows);
+    SparseMatrixType m3 (depth, cols);
+    SparseMatrixType m3t(cols, depth);
+    SparseMatrixType m4 (rows, cols);
+    SparseMatrixType m4t(cols, rows);
+    SparseMatrixType m6(rows, rows);
+    initSparse(density, refMat2,  m2);
+    initSparse(density, refMat2t, m2t);
+    initSparse(density, refMat3,  m3);
+    initSparse(density, refMat3t, m3t);
+    initSparse(density, refMat4,  m4);
+    initSparse(density, refMat4t, m4t);
+    initSparse(density, refMat6, m6);
+
+//     int c = internal::random<int>(0,depth-1);
+
+    // sparse * sparse
+    VERIFY_IS_APPROX(m4=m2*m3, refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(m4=m2t.transpose()*m3, refMat4=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(m4=m2t.transpose()*m3t.transpose(), refMat4=refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(m4=m2*m3t.transpose(), refMat4=refMat2*refMat3t.transpose());
+
+    VERIFY_IS_APPROX(m4 = m2*m3/s1, refMat4 = refMat2*refMat3/s1);
+    VERIFY_IS_APPROX(m4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1);
+    VERIFY_IS_APPROX(m4 = s2*m2*m3*s1, refMat4 = s2*refMat2*refMat3*s1);
+    VERIFY_IS_APPROX(m4 = (m2+m2)*m3, refMat4 = (refMat2+refMat2)*refMat3);
+    VERIFY_IS_APPROX(m4 = m2*m3.leftCols(cols/2), refMat4 = refMat2*refMat3.leftCols(cols/2));
+    VERIFY_IS_APPROX(m4 = m2*(m3+m3).leftCols(cols/2), refMat4 = refMat2*(refMat3+refMat3).leftCols(cols/2));
+
+    VERIFY_IS_APPROX(m4=(m2*m3).pruned(0), refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(m4=(m2t.transpose()*m3).pruned(0), refMat4=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(m4=(m2t.transpose()*m3t.transpose()).pruned(0), refMat4=refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(m4=(m2*m3t.transpose()).pruned(0), refMat4=refMat2*refMat3t.transpose());
+
+#ifndef EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
+    // make sure the right product implementation is called:
+    if((!SparseMatrixType::IsRowMajor) && m2.rows()<=m3.cols())
+    {
+      VERIFY_EVALUATION_COUNT(m4 = m2*m3, 2); // 2 for transposing and get a sorted result.
+      VERIFY_EVALUATION_COUNT(m4 = (m2*m3).pruned(0), 1);
+      VERIFY_EVALUATION_COUNT(m4 = (m2*m3).eval().pruned(0), 4);
+    }
+#endif
+
+    // and that pruning is effective:
+    {
+      DenseMatrix Ad(2,2);
+      Ad << -1, 1, 1, 1;
+      SparseMatrixType As(Ad.sparseView()), B(2,2);
+      VERIFY_IS_EQUAL( (As*As.transpose()).eval().nonZeros(), 4);
+      VERIFY_IS_EQUAL( (Ad*Ad.transpose()).eval().sparseView().eval().nonZeros(), 2);
+      VERIFY_IS_EQUAL( (As*As.transpose()).pruned(1e-6).eval().nonZeros(), 2);
+    }
+
+    // dense ?= sparse * sparse
+    VERIFY_IS_APPROX(dm4 =m2*m3, refMat4 =refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4+=m2*m3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4-=m2*m3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4 =m2t.transpose()*m3, refMat4 =refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4+=m2t.transpose()*m3, refMat4+=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4-=m2t.transpose()*m3, refMat4-=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4 =m2t.transpose()*m3t.transpose(), refMat4 =refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4+=m2t.transpose()*m3t.transpose(), refMat4+=refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4-=m2t.transpose()*m3t.transpose(), refMat4-=refMat2t.transpose()*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4 =m2*m3t.transpose(), refMat4 =refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4+=m2*m3t.transpose(), refMat4+=refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4-=m2*m3t.transpose(), refMat4-=refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1);
+
+    // test aliasing
+    m4 = m2; refMat4 = refMat2;
+    VERIFY_IS_APPROX(m4=m4*m3, refMat4=refMat4*refMat3);
+
+    // sparse * dense matrix
+    VERIFY_IS_APPROX(dm4=m2*refMat3, refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=m2*refMat3t.transpose(), refMat4=refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4=m2t.transpose()*refMat3, refMat4=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4=m2t.transpose()*refMat3t.transpose(), refMat4=refMat2t.transpose()*refMat3t.transpose());
+
+    VERIFY_IS_APPROX(dm4=m2*refMat3, refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=dm4+m2*refMat3, refMat4=refMat4+refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4+=m2*refMat3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4-=m2*refMat3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()+=m2*refMat3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()-=m2*refMat3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=m2*(refMat3+refMat3), refMat4=refMat2*(refMat3+refMat3));
+    VERIFY_IS_APPROX(dm4=m2t.transpose()*(refMat3+refMat5)*0.5, refMat4=refMat2t.transpose()*(refMat3+refMat5)*0.5);
+
+    // sparse * dense vector
+    VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3.col(0), refMat4.col(0)=refMat2*refMat3.col(0));
+    VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3t.transpose().col(0), refMat4.col(0)=refMat2*refMat3t.transpose().col(0));
+    VERIFY_IS_APPROX(dm4.col(0)=m2t.transpose()*refMat3.col(0), refMat4.col(0)=refMat2t.transpose()*refMat3.col(0));
+    VERIFY_IS_APPROX(dm4.col(0)=m2t.transpose()*refMat3t.transpose().col(0), refMat4.col(0)=refMat2t.transpose()*refMat3t.transpose().col(0));
+
+    // dense * sparse
+    VERIFY_IS_APPROX(dm4=refMat2*m3, refMat4=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=dm4+refMat2*m3, refMat4=refMat4+refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4+=refMat2*m3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4-=refMat2*m3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()+=refMat2*m3, refMat4+=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4.noalias()-=refMat2*m3, refMat4-=refMat2*refMat3);
+    VERIFY_IS_APPROX(dm4=refMat2*m3t.transpose(), refMat4=refMat2*refMat3t.transpose());
+    VERIFY_IS_APPROX(dm4=refMat2t.transpose()*m3, refMat4=refMat2t.transpose()*refMat3);
+    VERIFY_IS_APPROX(dm4=refMat2t.transpose()*m3t.transpose(), refMat4=refMat2t.transpose()*refMat3t.transpose());
+
+    // sparse * dense and dense * sparse outer product
+    {
+      Index c  = internal::random<Index>(0,depth-1);
+      Index r  = internal::random<Index>(0,rows-1);
+      Index c1 = internal::random<Index>(0,cols-1);
+      Index r1 = internal::random<Index>(0,depth-1);
+      DenseMatrix dm5  = DenseMatrix::Random(depth, cols);
+
+      VERIFY_IS_APPROX( m4=m2.col(c)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX( m4=m2.middleCols(c,1)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=m2.col(c)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
+
+      VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.col(c).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.middleCols(c,1).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.col(c1)*m2.col(c).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
+
+      VERIFY_IS_APPROX( m4=dm5.row(r1).transpose()*m2.col(c).transpose(), refMat4=dm5.row(r1).transpose()*refMat2.col(c).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.row(r1).transpose()*m2.col(c).transpose(), refMat4=dm5.row(r1).transpose()*refMat2.col(c).transpose());
+
+      VERIFY_IS_APPROX( m4=m2.row(r).transpose()*dm5.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX( m4=m2.middleRows(r,1).transpose()*dm5.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*dm5.col(c1).transpose());
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=m2.row(r).transpose()*dm5.col(c1).transpose(), refMat4=refMat2.row(r).transpose()*dm5.col(c1).transpose());
+
+      VERIFY_IS_APPROX( m4=dm5.col(c1)*m2.row(r), refMat4=dm5.col(c1)*refMat2.row(r));
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX( m4=dm5.col(c1)*m2.middleRows(r,1), refMat4=dm5.col(c1)*refMat2.row(r));
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.col(c1)*m2.row(r), refMat4=dm5.col(c1)*refMat2.row(r));
+
+      VERIFY_IS_APPROX( m4=dm5.row(r1).transpose()*m2.row(r), refMat4=dm5.row(r1).transpose()*refMat2.row(r));
+      VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
+      VERIFY_IS_APPROX(dm4=dm5.row(r1).transpose()*m2.row(r), refMat4=dm5.row(r1).transpose()*refMat2.row(r));
+    }
+
+    VERIFY_IS_APPROX(m6=m6*m6, refMat6=refMat6*refMat6);
+
+    // sparse matrix * sparse vector
+    ColSpVector cv0(cols), cv1;
+    DenseVector dcv0(cols), dcv1;
+    initSparse(2*density,dcv0, cv0);
+
+    RowSpVector rv0(depth), rv1;
+    RowDenseVector drv0(depth), drv1(rv1);
+    initSparse(2*density,drv0, rv0);
+
+    VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);
+    VERIFY_IS_APPROX(rv1=rv0*m3, drv1=drv0*refMat3);
+    VERIFY_IS_APPROX(cv1=m3t.adjoint()*cv0, dcv1=refMat3t.adjoint()*dcv0);
+    VERIFY_IS_APPROX(cv1=rv0*m3, dcv1=drv0*refMat3);
+    VERIFY_IS_APPROX(rv1=m3*cv0, drv1=refMat3*dcv0);
+  }
+
+  // test matrix - diagonal product
+  {
+    DenseMatrix refM2 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix refM3 = DenseMatrix::Zero(rows, cols);
+    DenseMatrix d3 = DenseMatrix::Zero(rows, cols);
+    DiagonalMatrix<Scalar,Dynamic> d1(DenseVector::Random(cols));
+    DiagonalMatrix<Scalar,Dynamic> d2(DenseVector::Random(rows));
+    SparseMatrixType m2(rows, cols);
+    SparseMatrixType m3(rows, cols);
+    initSparse<Scalar>(density, refM2, m2);
+    initSparse<Scalar>(density, refM3, m3);
+    VERIFY_IS_APPROX(m3=m2*d1, refM3=refM2*d1);
+    VERIFY_IS_APPROX(m3=m2.transpose()*d2, refM3=refM2.transpose()*d2);
+    VERIFY_IS_APPROX(m3=d2*m2, refM3=d2*refM2);
+    VERIFY_IS_APPROX(m3=d1*m2.transpose(), refM3=d1*refM2.transpose());
+
+    // also check with a SparseWrapper:
+    DenseVector v1 = DenseVector::Random(cols);
+    DenseVector v2 = DenseVector::Random(rows);
+    DenseVector v3 = DenseVector::Random(rows);
+    VERIFY_IS_APPROX(m3=m2*v1.asDiagonal(), refM3=refM2*v1.asDiagonal());
+    VERIFY_IS_APPROX(m3=m2.transpose()*v2.asDiagonal(), refM3=refM2.transpose()*v2.asDiagonal());
+    VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2, refM3=v2.asDiagonal()*refM2);
+    VERIFY_IS_APPROX(m3=v1.asDiagonal()*m2.transpose(), refM3=v1.asDiagonal()*refM2.transpose());
+
+    VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2*v1.asDiagonal(), refM3=v2.asDiagonal()*refM2*v1.asDiagonal());
+
+    VERIFY_IS_APPROX(v2=m2*v1.asDiagonal()*v1, refM2*v1.asDiagonal()*v1);
+    VERIFY_IS_APPROX(v3=v2.asDiagonal()*m2*v1, v2.asDiagonal()*refM2*v1);
+
+    // evaluate to a dense matrix to check the .row() and .col() iterator functions
+    VERIFY_IS_APPROX(d3=m2*d1, refM3=refM2*d1);
+    VERIFY_IS_APPROX(d3=m2.transpose()*d2, refM3=refM2.transpose()*d2);
+    VERIFY_IS_APPROX(d3=d2*m2, refM3=d2*refM2);
+    VERIFY_IS_APPROX(d3=d1*m2.transpose(), refM3=d1*refM2.transpose());
+  }
+
+  // test self-adjoint and triangular-view products
+  {
+    DenseMatrix b = DenseMatrix::Random(rows, rows);
+    DenseMatrix x = DenseMatrix::Random(rows, rows);
+    DenseMatrix refX = DenseMatrix::Random(rows, rows);
+    DenseMatrix refUp = DenseMatrix::Zero(rows, rows);
+    DenseMatrix refLo = DenseMatrix::Zero(rows, rows);
+    DenseMatrix refS = DenseMatrix::Zero(rows, rows);
+    DenseMatrix refA = DenseMatrix::Zero(rows, rows);
+    SparseMatrixType mUp(rows, rows);
+    SparseMatrixType mLo(rows, rows);
+    SparseMatrixType mS(rows, rows);
+    SparseMatrixType mA(rows, rows);
+    initSparse<Scalar>(density, refA, mA);
+    do {
+      initSparse<Scalar>(density, refUp, mUp, ForceRealDiag|/*ForceNonZeroDiag|*/MakeUpperTriangular);
+    } while (refUp.isZero());
+    refLo = refUp.adjoint();
+    mLo = mUp.adjoint();
+    refS = refUp + refLo;
+    refS.diagonal() *= 0.5;
+    mS = mUp + mLo;
+    // TODO be able to address the diagonal....
+    for (int k=0; k<mS.outerSize(); ++k)
+      for (typename SparseMatrixType::InnerIterator it(mS,k); it; ++it)
+        if (it.index() == k)
+          it.valueRef() *= Scalar(0.5);
+
+    VERIFY_IS_APPROX(refS.adjoint(), refS);
+    VERIFY_IS_APPROX(mS.adjoint(), mS);
+    VERIFY_IS_APPROX(mS, refS);
+    VERIFY_IS_APPROX(x=mS*b, refX=refS*b);
+
+    // sparse selfadjointView with dense matrices
+    VERIFY_IS_APPROX(x=mUp.template selfadjointView<Upper>()*b, refX=refS*b);
+    VERIFY_IS_APPROX(x=mLo.template selfadjointView<Lower>()*b, refX=refS*b);
+    VERIFY_IS_APPROX(x=mS.template selfadjointView<Upper|Lower>()*b, refX=refS*b);
+
+    VERIFY_IS_APPROX(x=b * mUp.template selfadjointView<Upper>(),       refX=b*refS);
+    VERIFY_IS_APPROX(x=b * mLo.template selfadjointView<Lower>(),       refX=b*refS);
+    VERIFY_IS_APPROX(x=b * mS.template selfadjointView<Upper|Lower>(),  refX=b*refS);
+
+    VERIFY_IS_APPROX(x.noalias()+=mUp.template selfadjointView<Upper>()*b, refX+=refS*b);
+    VERIFY_IS_APPROX(x.noalias()-=mLo.template selfadjointView<Lower>()*b, refX-=refS*b);
+    VERIFY_IS_APPROX(x.noalias()+=mS.template selfadjointView<Upper|Lower>()*b, refX+=refS*b);
+
+    // sparse selfadjointView with sparse matrices
+    SparseMatrixType mSres(rows,rows);
+    VERIFY_IS_APPROX(mSres = mLo.template selfadjointView<Lower>()*mS,
+                     refX = refLo.template selfadjointView<Lower>()*refS);
+    VERIFY_IS_APPROX(mSres = mS * mLo.template selfadjointView<Lower>(),
+                     refX = refS * refLo.template selfadjointView<Lower>());
+
+    // sparse triangularView with dense matrices
+    VERIFY_IS_APPROX(x=mA.template triangularView<Upper>()*b, refX=refA.template triangularView<Upper>()*b);
+    VERIFY_IS_APPROX(x=mA.template triangularView<Lower>()*b, refX=refA.template triangularView<Lower>()*b);
+    VERIFY_IS_APPROX(x=b*mA.template triangularView<Upper>(), refX=b*refA.template triangularView<Upper>());
+    VERIFY_IS_APPROX(x=b*mA.template triangularView<Lower>(), refX=b*refA.template triangularView<Lower>());
+
+    // sparse triangularView with sparse matrices
+    VERIFY_IS_APPROX(mSres = mA.template triangularView<Lower>()*mS,   refX = refA.template triangularView<Lower>()*refS);
+    VERIFY_IS_APPROX(mSres = mS * mA.template triangularView<Lower>(), refX = refS * refA.template triangularView<Lower>());
+    VERIFY_IS_APPROX(mSres = mA.template triangularView<Upper>()*mS,   refX = refA.template triangularView<Upper>()*refS);
+    VERIFY_IS_APPROX(mSres = mS * mA.template triangularView<Upper>(), refX = refS * refA.template triangularView<Upper>());
+  }
+}
+
+// New test for Bug in SparseTimeDenseProduct
+template<typename SparseMatrixType, typename DenseMatrixType> void sparse_product_regression_test()
+{
+  // This code does not compile with afflicted versions of the bug
+  SparseMatrixType sm1(3,2);
+  DenseMatrixType m2(2,2);
+  sm1.setZero();
+  m2.setZero();
+
+  DenseMatrixType m3 = sm1*m2;
+
+
+  // This code produces a segfault with afflicted versions of another SparseTimeDenseProduct
+  // bug
+
+  SparseMatrixType sm2(20000,2);
+  sm2.setZero();
+  DenseMatrixType m4(sm2*m2);
+
+  VERIFY_IS_APPROX( m4(0,0), 0.0 );
+}
+
+template<typename Scalar>
+void bug_942()
+{
+  typedef Matrix<Scalar, Dynamic, 1>     Vector;
+  typedef SparseMatrix<Scalar, ColMajor> ColSpMat;
+  typedef SparseMatrix<Scalar, RowMajor> RowSpMat;
+  ColSpMat cmA(1,1);
+  cmA.insert(0,0) = 1;
+
+  RowSpMat rmA(1,1);
+  rmA.insert(0,0) = 1;
+
+  Vector d(1);
+  d[0] = 2;
+
+  double res = 2;
+
+  VERIFY_IS_APPROX( ( cmA*d.asDiagonal() ).eval().coeff(0,0), res );
+  VERIFY_IS_APPROX( ( d.asDiagonal()*rmA ).eval().coeff(0,0), res );
+  VERIFY_IS_APPROX( ( rmA*d.asDiagonal() ).eval().coeff(0,0), res );
+  VERIFY_IS_APPROX( ( d.asDiagonal()*cmA ).eval().coeff(0,0), res );
+}
+
+template<typename Real>
+void test_mixing_types()
+{
+  typedef std::complex<Real> Cplx;
+  typedef SparseMatrix<Real> SpMatReal;
+  typedef SparseMatrix<Cplx> SpMatCplx;
+  typedef SparseMatrix<Cplx,RowMajor> SpRowMatCplx;
+  typedef Matrix<Real,Dynamic,Dynamic> DenseMatReal;
+  typedef Matrix<Cplx,Dynamic,Dynamic> DenseMatCplx;
+
+  Index n = internal::random<Index>(1,100);
+  double density = (std::max)(8./(n*n), 0.2);
+
+  SpMatReal sR1(n,n);
+  SpMatCplx sC1(n,n), sC2(n,n), sC3(n,n);
+  SpRowMatCplx sCR(n,n);
+  DenseMatReal dR1(n,n);
+  DenseMatCplx dC1(n,n), dC2(n,n), dC3(n,n);
+
+  initSparse<Real>(density, dR1, sR1);
+  initSparse<Cplx>(density, dC1, sC1);
+  initSparse<Cplx>(density, dC2, sC2);
+
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1),                         dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1),                         dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1),             dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1),             dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()),             dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()),             dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1),                         dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1),                         dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1),             dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1),             dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()),             dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()),             dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1).pruned(),                         dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1).pruned(),                         dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1).pruned(),             dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1).pruned(),             dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()).pruned(),             dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()).pruned(),             dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1).pruned(),                         dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1).pruned(),                         dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1).pruned(),             dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1).pruned(),             dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()).pruned(),             dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()).pruned(),             dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+
+  VERIFY_IS_APPROX( dC2 = (sR1 * sC1),                         dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( dC2 = (sC1 * sR1),                         dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1),             dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1),             dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = (sR1 * sC1.transpose()),             dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( dC2 = (sC1 * sR1.transpose()),             dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+
+  VERIFY_IS_APPROX( dC2 = dR1 * sC1, dC3 = dR1.template cast<Cplx>() * sC1 );
+  VERIFY_IS_APPROX( dC2 = sR1 * dC1, dC3 = sR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( dC2 = dC1 * sR1, dC3 = dC1 * sR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = sC1 * dR1, dC3 = sC1 * dR1.template cast<Cplx>() );
+
+  VERIFY_IS_APPROX( dC2 = dR1.row(0) * sC1, dC3 = dR1.template cast<Cplx>().row(0) * sC1 );
+  VERIFY_IS_APPROX( dC2 = sR1 * dC1.col(0), dC3 = sR1.template cast<Cplx>() * dC1.col(0) );
+  VERIFY_IS_APPROX( dC2 = dC1.row(0) * sR1, dC3 = dC1.row(0) * sR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast<Cplx>().col(0) );
+}
+
+EIGEN_DECLARE_TEST(sparse_product)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,ColMajor> >()) );
+    CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,RowMajor> >()) );
+    CALL_SUBTEST_1( (bug_942<double>()) );
+    CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, ColMajor > >()) );
+    CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, RowMajor > >()) );
+    CALL_SUBTEST_3( (sparse_product<SparseMatrix<float,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (sparse_product_regression_test<SparseMatrix<double,RowMajor>, Matrix<double, Dynamic, Dynamic, RowMajor> >()) );
+
+    CALL_SUBTEST_5( (test_mixing_types<float>()) );
+  }
+}

diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
new file mode 100644
index 0000000..8f33af8
--- /dev/null
+++ b/test/sparse_ref.cpp

@@ -0,0 +1,139 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
+
+static long int nb_temporaries;
+
+inline void on_temporary_creation() {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  nb_temporaries++;
+}
+
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); }
+
+#include "main.h"
+#include <Eigen/SparseCore>
+
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    CALL_SUBTEST( XPR ); \
+    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    VERIFY( (#XPR) && nb_temporaries==N ); \
+  }
+
+template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
+{
+  // verify that ref-to-const don't have LvalueBit
+  typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
+  VERIFY( !(internal::traits<Ref<ConstPlainObjectType> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Ref<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) );
+  VERIFY( !(Ref<ConstPlainObjectType>::Flags & LvalueBit) );
+  VERIFY( !(Ref<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
+}
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_1(Ref<SparseMatrix<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_2(const Ref<const SparseMatrix<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_3(const Ref<const SparseMatrix<float>, StandardCompressedFormat>& a, const B &b) {
+  VERIFY(a.isCompressed());
+  VERIFY_IS_EQUAL(a.toDense(),b.toDense());
+}
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_4(Ref<SparseVector<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_5(const Ref<const SparseVector<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+void call_ref()
+{
+  SparseMatrix<float>               A = MatrixXf::Random(10,10).sparseView(0.5,1);
+  SparseMatrix<float,RowMajor>      B = MatrixXf::Random(10,10).sparseView(0.5,1);
+  SparseMatrix<float>               C = MatrixXf::Random(10,10).sparseView(0.5,1);
+  C.reserve(VectorXi::Constant(C.outerSize(), 2));
+  const SparseMatrix<float>&        Ac(A);
+  Block<SparseMatrix<float> >       Ab(A,0,1, 3,3);
+  const Block<SparseMatrix<float> > Abc(A,0,1,3,3);
+  SparseVector<float>               vc =  VectorXf::Random(10).sparseView(0.5,1);
+  SparseVector<float,RowMajor>      vr =  VectorXf::Random(10).sparseView(0.5,1);
+  SparseMatrix<float> AA = A*A;
+  
+
+  VERIFY_EVALUATION_COUNT( call_ref_1(A, A),  0);
+//   VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac),  0); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_2(A, A),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A, A),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A.transpose(), A.transpose()),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_3(Ac,Ac), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A+A,2*Ac), 1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(B, B),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(B, B),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_3(B.transpose(), B.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A*A, AA),  3);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A*A, AA),  3);
+  
+  VERIFY(!C.isCompressed());
+  VERIFY_EVALUATION_COUNT( call_ref_3(C, C),  1);
+  
+  Ref<SparseMatrix<float> > Ar(A);
+  VERIFY_IS_APPROX(Ar+Ar, A+A);
+  VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A),  0);
+  
+  Ref<SparseMatrix<float,RowMajor> > Br(B);
+  VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()),  0);
+  
+  Ref<const SparseMatrix<float> > Arc(A);
+//   VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc),  0); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc),  0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)),  0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vc, vc),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vr.transpose(), vr.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vr, vr.transpose()),  0);
+  
+  VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)),  1); // should be 0 (allocate starts/nnz only)
+
+  VERIFY_EVALUATION_COUNT( call_ref_4(vc, vc),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(vr, vr.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(vc, vc),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(vr, vr.transpose()),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(A.col(2), A.col(2)),  0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.col(2), A.col(2)),  0);
+  // VERIFY_EVALUATION_COUNT( call_ref_4(A.row(2), A.row(2).transpose()),  1); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.row(2), A.row(2).transpose()),  1);
+}
+
+EIGEN_DECLARE_TEST(sparse_ref)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) );
+    CALL_SUBTEST_1( check_const_correctness(SparseMatrix<double,RowMajor>()) );
+    CALL_SUBTEST_2( call_ref() );
+
+    CALL_SUBTEST_3( check_const_correctness(SparseVector<float>()) );
+    CALL_SUBTEST_3( check_const_correctness(SparseVector<double,RowMajor>()) );
+  }
+}

diff --git a/test/sparse_solver.h b/test/sparse_solver.h
new file mode 100644
index 0000000..5892794
--- /dev/null
+++ b/test/sparse_solver.h

@@ -0,0 +1,699 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse.h"
+#include <Eigen/SparseCore>
+#include <Eigen/SparseLU>
+#include <sstream>
+
+template<typename Solver, typename Rhs, typename Guess,typename Result>
+void solve_with_guess(IterativeSolverBase<Solver>& solver, const MatrixBase<Rhs>& b, const Guess& g, Result &x) {
+  if(internal::random<bool>())
+  {
+    // With a temporary through evaluator<SolveWithGuess>
+    x = solver.derived().solveWithGuess(b,g) + Result::Zero(x.rows(), x.cols());
+  }
+  else
+  {
+    // direct evaluation within x through Assignment<Result,SolveWithGuess>
+    x = solver.derived().solveWithGuess(b.derived(),g);
+  }
+}
+
+template<typename Solver, typename Rhs, typename Guess,typename Result>
+void solve_with_guess(SparseSolverBase<Solver>& solver, const MatrixBase<Rhs>& b, const Guess& , Result& x) {
+  if(internal::random<bool>())
+    x = solver.derived().solve(b) + Result::Zero(x.rows(), x.cols());
+  else
+    x = solver.derived().solve(b);
+}
+
+template<typename Solver, typename Rhs, typename Guess,typename Result>
+void solve_with_guess(SparseSolverBase<Solver>& solver, const SparseMatrixBase<Rhs>& b, const Guess& , Result& x) {
+  x = solver.derived().solve(b);
+}
+
+template<typename Solver, typename Rhs, typename DenseMat, typename DenseRhs>
+void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef typename Mat::StorageIndex StorageIndex;
+
+  DenseRhs refX = dA.householderQr().solve(db);
+  {
+    Rhs x(A.cols(), b.cols());
+    Rhs oldb = b;
+
+    solver.compute(A);
+    if (solver.info() != Success)
+    {
+      std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+      VERIFY(solver.info() == Success);
+    }
+    x = solver.solve(b);
+    if (solver.info() != Success)
+    {
+      std::cerr << "WARNING: sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+      // dump call stack:
+      g_test_level++; 
+      VERIFY(solver.info() == Success);
+      g_test_level--;
+      return;
+    }
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+
+    x.setZero();
+    solve_with_guess(solver, b, x, x);
+    VERIFY(solver.info() == Success && "solving failed when using solve_with_guess API");
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    
+    x.setZero();
+    // test the analyze/factorize API
+    solver.analyzePattern(A);
+    solver.factorize(A);
+    VERIFY(solver.info() == Success && "factorization failed when using analyzePattern/factorize API");
+    x = solver.solve(b);
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    
+    x.setZero();
+    // test with Map
+    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    solver.compute(Am);
+    VERIFY(solver.info() == Success && "factorization failed when using Map");
+    DenseRhs dx(refX);
+    dx.setZero();
+    Map<DenseRhs> xm(dx.data(), dx.rows(), dx.cols());
+    Map<const DenseRhs> bm(db.data(), db.rows(), db.cols());
+    xm = solver.solve(bm);
+    VERIFY(solver.info() == Success && "solving failed when using Map");
+    VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
+  }
+  
+  // if not too large, do some extra check:
+  if(A.rows()<2000)
+  {
+    // test initialization ctor
+    {
+      Rhs x(b.rows(), b.cols());
+      Solver solver2(A);
+      VERIFY(solver2.info() == Success);
+      x = solver2.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test dense Block as the result and rhs:
+    {
+      DenseRhs x(refX.rows(), refX.cols());
+      DenseRhs oldb(db);
+      x.setZero();
+      x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
+      VERIFY(oldb.isApprox(db) && "sparse solver testing: the rhs should not be modified!");
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test uncompressed inputs
+    {
+      Mat A2 = A;
+      A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<typename Mat::StorageIndex>().eval());
+      solver.compute(A2);
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test expression as input
+    {
+      solver.compute(0.5*(A+A));
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+
+      Solver solver2(0.5*(A+A));
+      Rhs x2 = solver2.solve(b);
+      VERIFY(x2.isApprox(refX,test_precision<Scalar>()));
+    }
+  }
+}
+
+// specialization of generic check_sparse_solving for SuperLU in order to also test adjoint and transpose solves
+template<typename Scalar, typename Rhs, typename DenseMat, typename DenseRhs>
+void check_sparse_solving(Eigen::SparseLU<Eigen::SparseMatrix<Scalar> >& solver, const typename Eigen::SparseMatrix<Scalar>& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
+{
+  typedef typename Eigen::SparseMatrix<Scalar> Mat;
+  typedef typename Mat::StorageIndex StorageIndex;
+  typedef typename Eigen::SparseLU<Eigen::SparseMatrix<Scalar> > Solver;
+
+  // reference solutions computed by dense QR solver
+  DenseRhs refX1 = dA.householderQr().solve(db); // solution of A x = db
+  DenseRhs refX2 = dA.transpose().householderQr().solve(db); // solution of A^T * x = db (use transposed matrix A^T)
+  DenseRhs refX3 = dA.adjoint().householderQr().solve(db);  // solution of A^* * x = db (use adjoint matrix A^*)
+
+
+  {
+    Rhs x1(A.cols(), b.cols());
+    Rhs x2(A.cols(), b.cols());
+    Rhs x3(A.cols(), b.cols());
+    Rhs oldb = b;
+
+    solver.compute(A);
+    if (solver.info() != Success)
+    {
+      std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+      VERIFY(solver.info() == Success);
+    }
+    x1 = solver.solve(b);
+    if (solver.info() != Success)
+    {
+      std::cerr << "WARNING | sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+      return;
+    }
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+
+    // test solve with transposed
+    x2 = solver.transpose().solve(b);
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x2.isApprox(refX2,test_precision<Scalar>()));
+
+
+    // test solve with adjoint
+    //solver.template _solve_impl_transposed<true>(b, x3);
+    x3 = solver.adjoint().solve(b);
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x3.isApprox(refX3,test_precision<Scalar>()));
+
+    x1.setZero();
+    solve_with_guess(solver, b, x1, x1);
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+
+    x1.setZero();
+    x2.setZero();
+    x3.setZero();
+    // test the analyze/factorize API
+    solver.analyzePattern(A);
+    solver.factorize(A);
+    VERIFY(solver.info() == Success && "factorization failed when using analyzePattern/factorize API");
+    x1 = solver.solve(b);
+    x2 = solver.transpose().solve(b);
+    x3 = solver.adjoint().solve(b);
+
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+    VERIFY(x2.isApprox(refX2,test_precision<Scalar>()));
+    VERIFY(x3.isApprox(refX3,test_precision<Scalar>()));
+
+    x1.setZero();
+    // test with Map
+    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    solver.compute(Am);
+    VERIFY(solver.info() == Success && "factorization failed when using Map");
+    DenseRhs dx(refX1);
+    dx.setZero();
+    Map<DenseRhs> xm(dx.data(), dx.rows(), dx.cols());
+    Map<const DenseRhs> bm(db.data(), db.rows(), db.cols());
+    xm = solver.solve(bm);
+    VERIFY(solver.info() == Success && "solving failed when using Map");
+    VERIFY(oldb.isApprox(bm,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(xm.isApprox(refX1,test_precision<Scalar>()));
+  }
+
+  // if not too large, do some extra check:
+  if(A.rows()<2000)
+  {
+    // test initialization ctor
+    {
+      Rhs x(b.rows(), b.cols());
+      Solver solver2(A);
+      VERIFY(solver2.info() == Success);
+      x = solver2.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test dense Block as the result and rhs:
+    {
+      DenseRhs x(refX1.rows(), refX1.cols());
+      DenseRhs oldb(db);
+      x.setZero();
+      x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
+      VERIFY(oldb.isApprox(db,0.0) && "sparse solver testing: the rhs should not be modified!");
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test uncompressed inputs
+    {
+      Mat A2 = A;
+      A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<typename Mat::StorageIndex>().eval());
+      solver.compute(A2);
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test expression as input
+    {
+      solver.compute(0.5*(A+A));
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+
+      Solver solver2(0.5*(A+A));
+      Rhs x2 = solver2.solve(b);
+      VERIFY(x2.isApprox(refX1,test_precision<Scalar>()));
+    }
+  }
+}
+
+
+template<typename Solver, typename Rhs>
+void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const typename Solver::MatrixType& fullA, const Rhs& refX)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef typename Mat::RealScalar RealScalar;
+  
+  Rhs x(A.cols(), b.cols());
+
+  solver.compute(A);
+  if (solver.info() != Success)
+  {
+    std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+    VERIFY(solver.info() == Success);
+  }
+  x = solver.solve(b);
+  
+  if (solver.info() != Success)
+  {
+    std::cerr << "WARNING | sparse solver testing, solving failed (" << typeid(Solver).name() << ")\n";
+    return;
+  }
+  
+  RealScalar res_error = (fullA*x-b).norm()/b.norm();  
+  VERIFY( (res_error <= test_precision<Scalar>() ) && "sparse solver failed without noticing it"); 
+
+  
+  if(refX.size() != 0 && (refX - x).norm()/refX.norm() > test_precision<Scalar>())
+  {
+    std::cerr << "WARNING | found solution is different from the provided reference one\n";
+  }
+  
+}
+template<typename Solver, typename DenseMat>
+void check_sparse_determinant(Solver& solver, const typename Solver::MatrixType& A, const DenseMat& dA)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  
+  solver.compute(A);
+  if (solver.info() != Success)
+  {
+    std::cerr << "WARNING | sparse solver testing: factorization failed (check_sparse_determinant)\n";
+    return;
+  }
+
+  Scalar refDet = dA.determinant();
+  VERIFY_IS_APPROX(refDet,solver.determinant());
+}
+template<typename Solver, typename DenseMat>
+void check_sparse_abs_determinant(Solver& solver, const typename Solver::MatrixType& A, const DenseMat& dA)
+{
+  using std::abs;
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  
+  solver.compute(A);
+  if (solver.info() != Success)
+  {
+    std::cerr << "WARNING | sparse solver testing: factorization failed (check_sparse_abs_determinant)\n";
+    return;
+  }
+
+  Scalar refDet = abs(dA.determinant());
+  VERIFY_IS_APPROX(refDet,solver.absDeterminant());
+}
+
+template<typename Solver, typename DenseMat>
+int generate_sparse_spd_problem(Solver& , typename Solver::MatrixType& A, typename Solver::MatrixType& halfA, DenseMat& dA, int maxSize = 300)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+
+  int size = internal::random<int>(1,maxSize);
+  double density = (std::max)(8./(size*size), 0.01);
+
+  Mat M(size, size);
+  DenseMatrix dM(size, size);
+
+  initSparse<Scalar>(density, dM, M, ForceNonZeroDiag);
+
+  A = M * M.adjoint();
+  dA = dM * dM.adjoint();
+  
+  halfA.resize(size,size);
+  if(Solver::UpLo==(Lower|Upper))
+    halfA = A;
+  else
+    halfA.template selfadjointView<Solver::UpLo>().rankUpdate(M);
+  
+  return size;
+}
+
+
+#ifdef TEST_REAL_CASES
+template<typename Scalar>
+inline std::string get_matrixfolder()
+{
+  std::string mat_folder = TEST_REAL_CASES; 
+  if( internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value )
+    mat_folder  = mat_folder + static_cast<std::string>("/complex/");
+  else
+    mat_folder = mat_folder + static_cast<std::string>("/real/");
+  return mat_folder;
+}
+std::string sym_to_string(int sym)
+{
+  if(sym==Symmetric) return "Symmetric ";
+  if(sym==SPD)       return "SPD ";
+  return "";
+}
+template<typename Derived>
+std::string solver_stats(const IterativeSolverBase<Derived> &solver)
+{
+  std::stringstream ss;
+  ss << solver.iterations() << " iters, error: " << solver.error();
+  return ss.str();
+}
+template<typename Derived>
+std::string solver_stats(const SparseSolverBase<Derived> &/*solver*/)
+{
+  return "";
+}
+#endif
+
+template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = (std::min)(300,EIGEN_TEST_MAX_SIZE), int maxRealWorldSize = 100000)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef typename Mat::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar,ColMajor, StorageIndex> SpMat;
+  typedef SparseVector<Scalar, 0, StorageIndex> SpVec;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+
+  // generate the problem
+  Mat A, halfA;
+  DenseMatrix dA;
+  for (int i = 0; i < g_repeat; i++) {
+    int size = generate_sparse_spd_problem(solver, A, halfA, dA, maxSize);
+
+    // generate the right hand sides
+    int rhsCols = internal::random<int>(1,16);
+    double density = (std::max)(8./(size*rhsCols), 0.1);
+    SpMat B(size,rhsCols);
+    DenseVector b = DenseVector::Random(size);
+    DenseMatrix dB(size,rhsCols);
+    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    SpVec c = B.col(0);
+    DenseVector dc = dB.col(0);
+  
+    CALL_SUBTEST( check_sparse_solving(solver, A,     b,  dA, b)  );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, b,  dA, b)  );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     dB, dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, dB, dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     B,  dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, B,  dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     c,  dA, dc) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, c,  dA, dc) );
+    
+    // check only once
+    if(i==0)
+    {
+      b = DenseVector::Zero(size);
+      check_sparse_solving(solver, A, b, dA, b);
+    }
+  }
+  
+  // First, get the folder 
+#ifdef TEST_REAL_CASES
+  // Test real problems with double precision only
+  if (internal::is_same<typename NumTraits<Scalar>::Real, double>::value)
+  {
+    std::string mat_folder = get_matrixfolder<Scalar>();
+    MatrixMarketIterator<Scalar> it(mat_folder);
+    for (; it; ++it)
+    {
+      if (it.sym() == SPD){
+        A = it.matrix();
+        if(A.diagonal().size() <= maxRealWorldSize)
+        {
+          DenseVector b = it.rhs();
+          DenseVector refX = it.refX();
+          PermutationMatrix<Dynamic, Dynamic, StorageIndex> pnull;
+          halfA.resize(A.rows(), A.cols());
+          if(Solver::UpLo == (Lower|Upper))
+            halfA = A;
+          else
+            halfA.template selfadjointView<Solver::UpLo>() = A.template triangularView<Eigen::Lower>().twistedBy(pnull);
+          
+          std::cout << "INFO | Testing " << sym_to_string(it.sym()) << "sparse problem " << it.matname()
+                  << " (" << A.rows() << "x" << A.cols() << ") using " << typeid(Solver).name() << "..." << std::endl;
+          CALL_SUBTEST( check_sparse_solving_real_cases(solver, A,     b, A, refX) );
+          std::string stats = solver_stats(solver);
+          if(stats.size()>0)
+            std::cout << "INFO |  " << stats << std::endl;
+          CALL_SUBTEST( check_sparse_solving_real_cases(solver, halfA, b, A, refX) );
+        }
+        else
+        {
+          std::cout << "INFO | Skip sparse problem \"" << it.matname() << "\" (too large)" << std::endl;
+        }
+      }
+    }
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(maxRealWorldSize);
+#endif
+}
+
+template<typename Solver> void check_sparse_spd_determinant(Solver& solver)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+
+  // generate the problem
+  Mat A, halfA;
+  DenseMatrix dA;
+  generate_sparse_spd_problem(solver, A, halfA, dA, 30);
+  
+  for (int i = 0; i < g_repeat; i++) {
+    check_sparse_determinant(solver, A,     dA);
+    check_sparse_determinant(solver, halfA, dA );
+  }
+}
+
+template<typename Solver, typename DenseMat>
+Index generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+
+  Index size = internal::random<int>(1,maxSize);
+  double density = (std::max)(8./(size*size), 0.01);
+  
+  A.resize(size,size);
+  dA.resize(size,size);
+
+  initSparse<Scalar>(density, dA, A, options);
+  
+  return size;
+}
+
+
+struct prune_column {
+  Index m_col;
+  prune_column(Index col) : m_col(col) {}
+  template<class Scalar>
+  bool operator()(Index, Index col, const Scalar&) const {
+    return col != m_col;
+  }
+};
+
+
+template<typename Solver> void check_sparse_square_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000, bool checkDeficient = false)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+  typedef SparseVector<Scalar, 0, typename Mat::StorageIndex> SpVec;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+
+  int rhsCols = internal::random<int>(1,16);
+
+  Mat A;
+  DenseMatrix dA;
+  for (int i = 0; i < g_repeat; i++) {
+    Index size = generate_sparse_square_problem(solver, A, dA, maxSize);
+
+    A.makeCompressed();
+    DenseVector b = DenseVector::Random(size);
+    DenseMatrix dB(size,rhsCols);
+    SpMat B(size,rhsCols);
+    double density = (std::max)(8./(size*rhsCols), 0.1);
+    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    B.makeCompressed();
+    SpVec c = B.col(0);
+    DenseVector dc = dB.col(0);
+    CALL_SUBTEST(check_sparse_solving(solver, A, b,  dA, b));
+    CALL_SUBTEST(check_sparse_solving(solver, A, dB, dA, dB));
+    CALL_SUBTEST(check_sparse_solving(solver, A, B,  dA, dB));
+    CALL_SUBTEST(check_sparse_solving(solver, A, c,  dA, dc));
+    
+    // check only once
+    if(i==0)
+    {
+      CALL_SUBTEST(b = DenseVector::Zero(size); check_sparse_solving(solver, A, b, dA, b));
+    }
+    // regression test for Bug 792 (structurally rank deficient matrices):
+    if(checkDeficient && size>1) {
+      Index col = internal::random<int>(0,int(size-1));
+      A.prune(prune_column(col));
+      solver.compute(A);
+      VERIFY_IS_EQUAL(solver.info(), NumericalIssue);
+    }
+  }
+  
+  // First, get the folder 
+#ifdef TEST_REAL_CASES
+  // Test real problems with double precision only
+  if (internal::is_same<typename NumTraits<Scalar>::Real, double>::value)
+  {
+    std::string mat_folder = get_matrixfolder<Scalar>();
+    MatrixMarketIterator<Scalar> it(mat_folder);
+    for (; it; ++it)
+    {
+      A = it.matrix();
+      if(A.diagonal().size() <= maxRealWorldSize)
+      {
+        DenseVector b = it.rhs();
+        DenseVector refX = it.refX();
+        std::cout << "INFO | Testing " << sym_to_string(it.sym()) << "sparse problem " << it.matname()
+                  << " (" << A.rows() << "x" << A.cols() << ") using " << typeid(Solver).name() << "..." << std::endl;
+        CALL_SUBTEST(check_sparse_solving_real_cases(solver, A, b, A, refX));
+        std::string stats = solver_stats(solver);
+        if(stats.size()>0)
+          std::cout << "INFO |  " << stats << std::endl;
+      }
+      else
+      {
+        std::cout << "INFO | SKIP sparse problem \"" << it.matname() << "\" (too large)" << std::endl;
+      }
+    }
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(maxRealWorldSize);
+#endif
+
+}
+
+template<typename Solver> void check_sparse_square_determinant(Solver& solver)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  
+  for (int i = 0; i < g_repeat; i++) {
+    // generate the problem
+    Mat A;
+    DenseMatrix dA;
+    
+    int size = internal::random<int>(1,30);
+    dA.setRandom(size,size);
+    
+    dA = (dA.array().abs()<0.3).select(0,dA);
+    dA.diagonal() = (dA.diagonal().array()==0).select(1,dA.diagonal());
+    A = dA.sparseView();
+    A.makeCompressed();
+  
+    check_sparse_determinant(solver, A, dA);
+  }
+}
+
+template<typename Solver> void check_sparse_square_abs_determinant(Solver& solver)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+
+  for (int i = 0; i < g_repeat; i++) {
+    // generate the problem
+    Mat A;
+    DenseMatrix dA;
+    generate_sparse_square_problem(solver, A, dA, 30);
+    A.makeCompressed();
+    check_sparse_abs_determinant(solver, A, dA);
+  }
+}
+
+template<typename Solver, typename DenseMat>
+void generate_sparse_leastsquare_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+
+  int rows = internal::random<int>(1,maxSize);
+  int cols = internal::random<int>(1,rows);
+  double density = (std::max)(8./(rows*cols), 0.01);
+  
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+
+  initSparse<Scalar>(density, dA, A, options);
+}
+
+template<typename Solver> void check_sparse_leastsquare_solving(Solver& solver)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+
+  int rhsCols = internal::random<int>(1,16);
+
+  Mat A;
+  DenseMatrix dA;
+  for (int i = 0; i < g_repeat; i++) {
+    generate_sparse_leastsquare_problem(solver, A, dA);
+
+    A.makeCompressed();
+    DenseVector b = DenseVector::Random(A.rows());
+    DenseMatrix dB(A.rows(),rhsCols);
+    SpMat B(A.rows(),rhsCols);
+    double density = (std::max)(8./(A.rows()*rhsCols), 0.1);
+    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    B.makeCompressed();
+    check_sparse_solving(solver, A, b,  dA, b);
+    check_sparse_solving(solver, A, dB, dA, dB);
+    check_sparse_solving(solver, A, B,  dA, dB);
+    
+    // check only once
+    if(i==0)
+    {
+      b = DenseVector::Zero(A.rows());
+      check_sparse_solving(solver, A, b, dA, b);
+    }
+  }
+}

diff --git a/test/sparse_solvers.cpp b/test/sparse_solvers.cpp
new file mode 100644
index 0000000..3b7cd77
--- /dev/null
+++ b/test/sparse_solvers.cpp

@@ -0,0 +1,125 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse.h"
+
+template<typename Scalar> void
+initSPD(double density,
+        Matrix<Scalar,Dynamic,Dynamic>& refMat,
+        SparseMatrix<Scalar>& sparseMat)
+{
+  Matrix<Scalar,Dynamic,Dynamic> aux(refMat.rows(),refMat.cols());
+  initSparse(density,refMat,sparseMat);
+  refMat = refMat * refMat.adjoint();
+  for (int k=0; k<2; ++k)
+  {
+    initSparse(density,aux,sparseMat,ForceNonZeroDiag);
+    refMat += aux * aux.adjoint();
+  }
+  sparseMat.setZero();
+  for (int j=0 ; j<sparseMat.cols(); ++j)
+    for (int i=j ; i<sparseMat.rows(); ++i)
+      if (refMat(i,j)!=Scalar(0))
+        sparseMat.insert(i,j) = refMat(i,j);
+  sparseMat.finalize();
+}
+
+template<typename Scalar> void sparse_solvers(int rows, int cols)
+{
+  double density = (std::max)(8./(rows*cols), 0.01);
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  // Scalar eps = 1e-6;
+
+  DenseVector vec1 = DenseVector::Random(rows);
+
+  std::vector<Vector2i> zeroCoords;
+  std::vector<Vector2i> nonzeroCoords;
+
+  // test triangular solver
+  {
+    DenseVector vec2 = vec1, vec3 = vec1;
+    SparseMatrix<Scalar> m2(rows, cols);
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+
+    // lower - dense
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.template triangularView<Lower>().solve(vec2),
+                     m2.template triangularView<Lower>().solve(vec3));
+
+    // upper - dense
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.template triangularView<Upper>().solve(vec2),
+                     m2.template triangularView<Upper>().solve(vec3));
+    VERIFY_IS_APPROX(refMat2.conjugate().template triangularView<Upper>().solve(vec2),
+                     m2.conjugate().template triangularView<Upper>().solve(vec3));
+    {
+      SparseMatrix<Scalar> cm2(m2);
+      //Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr
+      MappedSparseMatrix<Scalar> mm2(rows, cols, cm2.nonZeros(), cm2.outerIndexPtr(), cm2.innerIndexPtr(), cm2.valuePtr());
+      VERIFY_IS_APPROX(refMat2.conjugate().template triangularView<Upper>().solve(vec2),
+                       mm2.conjugate().template triangularView<Upper>().solve(vec3));
+    }
+
+    // lower - transpose
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.transpose().template triangularView<Upper>().solve(vec2),
+                     m2.transpose().template triangularView<Upper>().solve(vec3));
+
+    // upper - transpose
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.transpose().template triangularView<Lower>().solve(vec2),
+                     m2.transpose().template triangularView<Lower>().solve(vec3));
+
+    SparseMatrix<Scalar> matB(rows, rows);
+    DenseMatrix refMatB = DenseMatrix::Zero(rows, rows);
+
+    // lower - sparse
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular);
+    initSparse<Scalar>(density, refMatB, matB);
+    refMat2.template triangularView<Lower>().solveInPlace(refMatB);
+    m2.template triangularView<Lower>().solveInPlace(matB);
+    VERIFY_IS_APPROX(matB.toDense(), refMatB);
+
+    // upper - sparse
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular);
+    initSparse<Scalar>(density, refMatB, matB);
+    refMat2.template triangularView<Upper>().solveInPlace(refMatB);
+    m2.template triangularView<Upper>().solveInPlace(matB);
+    VERIFY_IS_APPROX(matB, refMatB);
+
+    // test deprecated API
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.template triangularView<Lower>().solve(vec2),
+                     m2.template triangularView<Lower>().solve(vec3));
+
+    // test empty triangular matrix
+    {
+      m2.resize(0,0);
+      refMatB.resize(0,refMatB.cols());
+      DenseMatrix res = m2.template triangularView<Lower>().solve(refMatB);
+      VERIFY_IS_EQUAL(res.rows(),0);
+      VERIFY_IS_EQUAL(res.cols(),refMatB.cols());
+      res = refMatB;
+      m2.template triangularView<Lower>().solveInPlace(res);
+      VERIFY_IS_EQUAL(res.rows(),0);
+      VERIFY_IS_EQUAL(res.cols(),refMatB.cols());
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(sparse_solvers)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(sparse_solvers<double>(8, 8) );
+    int s = internal::random<int>(1,300);
+    CALL_SUBTEST_2(sparse_solvers<std::complex<double> >(s,s) );
+    CALL_SUBTEST_1(sparse_solvers<double>(s,s) );
+  }
+}

diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
new file mode 100644
index 0000000..3512927
--- /dev/null
+++ b/test/sparse_vector.cpp

@@ -0,0 +1,163 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse.h"
+
+template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int cols)
+{
+  double densityMat = (std::max)(8./(rows*cols), 0.01);
+  double densityVec = (std::max)(8./(rows), 0.1);
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
+  typedef SparseMatrix<Scalar,0,StorageIndex> SparseMatrixType;
+  Scalar eps = 1e-6;
+
+  SparseMatrixType m1(rows,rows);
+  SparseVectorType v1(rows), v2(rows), v3(rows);
+  DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
+  DenseVector refV1 = DenseVector::Random(rows),
+              refV2 = DenseVector::Random(rows),
+              refV3 = DenseVector::Random(rows);
+
+  std::vector<int> zerocoords, nonzerocoords;
+  initSparse<Scalar>(densityVec, refV1, v1, &zerocoords, &nonzerocoords);
+  initSparse<Scalar>(densityMat, refM1, m1);
+
+  initSparse<Scalar>(densityVec, refV2, v2);
+  initSparse<Scalar>(densityVec, refV3, v3);
+
+  Scalar s1 = internal::random<Scalar>();
+
+  // test coeff and coeffRef
+  for (unsigned int i=0; i<zerocoords.size(); ++i)
+  {
+    VERIFY_IS_MUCH_SMALLER_THAN( v1.coeff(zerocoords[i]), eps );
+    //VERIFY_RAISES_ASSERT( v1.coeffRef(zerocoords[i]) = 5 );
+  }
+  {
+    VERIFY(int(nonzerocoords.size()) == v1.nonZeros());
+    int j=0;
+    for (typename SparseVectorType::InnerIterator it(v1); it; ++it,++j)
+    {
+      VERIFY(nonzerocoords[j]==it.index());
+      VERIFY(it.value()==v1.coeff(it.index()));
+      VERIFY(it.value()==refV1.coeff(it.index()));
+    }
+  }
+  VERIFY_IS_APPROX(v1, refV1);
+  
+  // test coeffRef with reallocation
+  {
+    SparseVectorType v4(rows);
+    DenseVector v5 = DenseVector::Zero(rows);
+    for(int k=0; k<rows; ++k)
+    {
+      int i = internal::random<int>(0,rows-1);
+      Scalar v = internal::random<Scalar>();
+      v4.coeffRef(i) += v;
+      v5.coeffRef(i) += v;
+    }
+    VERIFY_IS_APPROX(v4,v5);
+  }
+
+  v1.coeffRef(nonzerocoords[0]) = Scalar(5);
+  refV1.coeffRef(nonzerocoords[0]) = Scalar(5);
+  VERIFY_IS_APPROX(v1, refV1);
+
+  VERIFY_IS_APPROX(v1+v2, refV1+refV2);
+  VERIFY_IS_APPROX(v1+v2+v3, refV1+refV2+refV3);
+
+  VERIFY_IS_APPROX(v1*s1-v2, refV1*s1-refV2);
+
+  VERIFY_IS_APPROX(v1*=s1, refV1*=s1);
+  VERIFY_IS_APPROX(v1/=s1, refV1/=s1);
+
+  VERIFY_IS_APPROX(v1+=v2, refV1+=refV2);
+  VERIFY_IS_APPROX(v1-=v2, refV1-=refV2);
+
+  VERIFY_IS_APPROX(v1.dot(v2), refV1.dot(refV2));
+  VERIFY_IS_APPROX(v1.dot(refV2), refV1.dot(refV2));
+
+  VERIFY_IS_APPROX(m1*v2, refM1*refV2);
+  VERIFY_IS_APPROX(v1.dot(m1*v2), refV1.dot(refM1*refV2));
+  {
+    int i = internal::random<int>(0,rows-1);
+    VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
+  }
+
+
+  VERIFY_IS_APPROX(v1.squaredNorm(), refV1.squaredNorm());
+  
+  VERIFY_IS_APPROX(v1.blueNorm(), refV1.blueNorm());
+
+  // test aliasing
+  VERIFY_IS_APPROX((v1 = -v1), (refV1 = -refV1));
+  VERIFY_IS_APPROX((v1 = v1.transpose()), (refV1 = refV1.transpose().eval()));
+  VERIFY_IS_APPROX((v1 += -v1), (refV1 += -refV1));
+  
+  // sparse matrix to sparse vector
+  SparseMatrixType mv1;
+  VERIFY_IS_APPROX((mv1=v1),v1);
+  VERIFY_IS_APPROX(mv1,(v1=mv1));
+  VERIFY_IS_APPROX(mv1,(v1=mv1.transpose()));
+  
+  // check copy to dense vector with transpose
+  refV3.resize(0);
+  VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense()); 
+  VERIFY_IS_APPROX(DenseVector(v1),v1.toDense()); 
+
+  // test conservative resize
+  {
+    std::vector<StorageIndex> inc;
+    if(rows > 3)
+      inc.push_back(-3);
+    inc.push_back(0);
+    inc.push_back(3);
+    inc.push_back(1);
+    inc.push_back(10);
+
+    for(std::size_t i = 0; i< inc.size(); i++) {
+      StorageIndex incRows = inc[i];
+      SparseVectorType vec1(rows);
+      DenseVector refVec1 = DenseVector::Zero(rows);
+      initSparse<Scalar>(densityVec, refVec1, vec1);
+
+      vec1.conservativeResize(rows+incRows);
+      refVec1.conservativeResize(rows+incRows);
+      if (incRows > 0) refVec1.tail(incRows).setZero();
+
+      VERIFY_IS_APPROX(vec1, refVec1);
+
+      // Insert new values
+      if (incRows > 0)
+        vec1.insert(vec1.rows()-1) = refVec1(refVec1.rows()-1) = 1;
+
+      VERIFY_IS_APPROX(vec1, refVec1);
+    }
+  }
+
+}
+
+EIGEN_DECLARE_TEST(sparse_vector)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int r = Eigen::internal::random<int>(1,500), c = Eigen::internal::random<int>(1,500);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+
+    CALL_SUBTEST_1(( sparse_vector<double,int>(8, 8) ));
+    CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(r, c) ));
+    CALL_SUBTEST_1(( sparse_vector<double,long int>(r, c) ));
+    CALL_SUBTEST_1(( sparse_vector<double,short>(r, c) ));
+  }
+}
+

diff --git a/test/sparselu.cpp b/test/sparselu.cpp
new file mode 100644
index 0000000..84cc6eb
--- /dev/null
+++ b/test/sparselu.cpp

@@ -0,0 +1,45 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// SparseLU solve does not accept column major matrices for the destination.
+// However, as expected, the generic check_sparse_square_solving routines produces row-major
+// rhs and destination matrices when compiled with EIGEN_DEFAULT_TO_ROW_MAJOR
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
+
+#include "sparse_solver.h"
+#include <Eigen/SparseLU>
+#include <unsupported/Eigen/SparseExtra>
+
+template<typename T> void test_sparselu_T()
+{
+  SparseLU<SparseMatrix<T, ColMajor> /*, COLAMDOrdering<int>*/ > sparselu_colamd; // COLAMDOrdering is the default
+  SparseLU<SparseMatrix<T, ColMajor>, AMDOrdering<int> > sparselu_amd; 
+  SparseLU<SparseMatrix<T, ColMajor, long int>, NaturalOrdering<long int> > sparselu_natural;
+  
+  check_sparse_square_solving(sparselu_colamd,  300, 100000, true); 
+  check_sparse_square_solving(sparselu_amd,     300,  10000, true);
+  check_sparse_square_solving(sparselu_natural, 300,   2000, true);
+  
+  check_sparse_square_abs_determinant(sparselu_colamd);
+  check_sparse_square_abs_determinant(sparselu_amd);
+  
+  check_sparse_square_determinant(sparselu_colamd);
+  check_sparse_square_determinant(sparselu_amd);
+}
+
+EIGEN_DECLARE_TEST(sparselu)
+{
+  CALL_SUBTEST_1(test_sparselu_T<float>()); 
+  CALL_SUBTEST_2(test_sparselu_T<double>());
+  CALL_SUBTEST_3(test_sparselu_T<std::complex<float> >()); 
+  CALL_SUBTEST_4(test_sparselu_T<std::complex<double> >());
+}

diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp
new file mode 100644
index 0000000..3576cc6
--- /dev/null
+++ b/test/sparseqr.cpp

@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire Nuentsa Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+#include "sparse.h"
+#include <Eigen/SparseQR>
+
+template<typename MatrixType,typename DenseMat>
+int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows = 300, int maxCols = 150)
+{
+  eigen_assert(maxRows >= maxCols);
+  typedef typename MatrixType::Scalar Scalar;
+  int rows = internal::random<int>(1,maxRows);
+  int cols = internal::random<int>(1,maxCols);
+  double density = (std::max)(8./(rows*cols), 0.01);
+  
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+  initSparse<Scalar>(density, dA, A,ForceNonZeroDiag);
+  A.makeCompressed();
+  int nop = internal::random<int>(0, internal::random<double>(0,1) > 0.5 ? cols/2 : 0);
+  for(int k=0; k<nop; ++k)
+  {
+    int j0 = internal::random<int>(0,cols-1);
+    int j1 = internal::random<int>(0,cols-1);
+    Scalar s = internal::random<Scalar>();
+    A.col(j0)  = s * A.col(j1);
+    dA.col(j0) = s * dA.col(j1);
+  }
+  
+//   if(rows<cols) {
+//     A.conservativeResize(cols,cols);
+//     dA.conservativeResize(cols,cols);
+//     dA.bottomRows(cols-rows).setZero();
+//   }
+  
+  return rows;
+}
+
+template<typename Scalar> void test_sparseqr_scalar()
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMat;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  MatrixType A;
+  DenseMat dA;
+  DenseVector refX,x,b; 
+  SparseQR<MatrixType, COLAMDOrdering<int> > solver; 
+  generate_sparse_rectangular_problem(A,dA);
+  
+  b = dA * DenseVector::Random(A.cols());
+  solver.compute(A);
+
+  // Q should be MxM
+  VERIFY_IS_EQUAL(solver.matrixQ().rows(), A.rows());
+  VERIFY_IS_EQUAL(solver.matrixQ().cols(), A.rows());
+
+  // R should be MxN
+  VERIFY_IS_EQUAL(solver.matrixR().rows(), A.rows());
+  VERIFY_IS_EQUAL(solver.matrixR().cols(), A.cols());
+
+  // Q and R can be multiplied
+  DenseMat recoveredA = solver.matrixQ()
+                      * DenseMat(solver.matrixR().template triangularView<Upper>())
+                      * solver.colsPermutation().transpose();
+  VERIFY_IS_EQUAL(recoveredA.rows(), A.rows());
+  VERIFY_IS_EQUAL(recoveredA.cols(), A.cols());
+
+  // and in the full rank case the original matrix is recovered
+  if (solver.rank() == A.cols())
+  {
+      VERIFY_IS_APPROX(A, recoveredA);
+  }
+
+  if(internal::random<float>(0,1)>0.5f)
+    solver.factorize(A);  // this checks that calling analyzePattern is not needed if the pattern do not change.
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse QR factorization failed\n";
+    exit(0);
+    return;
+  }
+  x = solver.solve(b);
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse QR factorization failed\n";
+    exit(0);
+    return;
+  }
+
+  // Compare with a dense QR solver
+  ColPivHouseholderQR<DenseMat> dqr(dA);
+  refX = dqr.solve(b);
+  
+  bool rank_deficient = A.cols()>A.rows() || dqr.rank()<A.cols();
+  if(rank_deficient)
+  {
+    // rank deficient problem -> we might have to increase the threshold
+    // to get a correct solution.
+    RealScalar th = RealScalar(20)*dA.colwise().norm().maxCoeff()*(A.rows()+A.cols()) * NumTraits<RealScalar>::epsilon();
+    for(Index k=0; (k<16) && !test_isApprox(A*x,b); ++k)
+    {
+      th *= RealScalar(10);
+      solver.setPivotThreshold(th);
+      solver.compute(A);
+      x = solver.solve(b);
+    }
+  }
+
+  VERIFY_IS_APPROX(A * x, b);
+  
+  // For rank deficient problem, the estimated rank might
+  // be slightly off, so let's only raise a warning in such cases.
+  if(rank_deficient) ++g_test_level;
+  VERIFY_IS_EQUAL(solver.rank(), dqr.rank());
+  if(rank_deficient) --g_test_level;
+
+  if(solver.rank()==A.cols()) // full rank
+    VERIFY_IS_APPROX(x, refX);
+//   else
+//     VERIFY((dA * refX - b).norm() * 2 > (A * x - b).norm() );
+
+  // Compute explicitly the matrix Q
+  MatrixType Q, QtQ, idM;
+  Q = solver.matrixQ();
+  //Check  ||Q' * Q - I ||
+  QtQ = Q * Q.adjoint();
+  idM.resize(Q.rows(), Q.rows()); idM.setIdentity();
+  VERIFY(idM.isApprox(QtQ));
+  
+  // Q to dense
+  DenseMat dQ;
+  dQ = solver.matrixQ();
+  VERIFY_IS_APPROX(Q, dQ);
+}
+EIGEN_DECLARE_TEST(sparseqr)
+{
+  for(int i=0; i<g_repeat; ++i)
+  {
+    CALL_SUBTEST_1(test_sparseqr_scalar<double>());
+    CALL_SUBTEST_2(test_sparseqr_scalar<std::complex<double> >());
+  }
+}
+

diff --git a/test/special_numbers.cpp b/test/special_numbers.cpp
new file mode 100644
index 0000000..1e1a636
--- /dev/null
+++ b/test/special_numbers.cpp

@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename Scalar> void special_numbers()
+{
+  typedef Matrix<Scalar, Dynamic,Dynamic> MatType;
+  int rows = internal::random<int>(1,300);
+  int cols = internal::random<int>(1,300);
+  
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar inf = std::numeric_limits<Scalar>::infinity();
+  Scalar s1 = internal::random<Scalar>();
+  
+  MatType m1    = MatType::Random(rows,cols),
+          mnan  = MatType::Random(rows,cols),
+          minf  = MatType::Random(rows,cols),
+          mboth = MatType::Random(rows,cols);
+          
+  int n = internal::random<int>(1,10);
+  for(int k=0; k<n; ++k)
+  {
+    mnan(internal::random<int>(0,rows-1), internal::random<int>(0,cols-1)) = nan;
+    minf(internal::random<int>(0,rows-1), internal::random<int>(0,cols-1)) = inf;
+  }
+  mboth = mnan + minf;
+  
+  VERIFY(!m1.hasNaN());
+  VERIFY(m1.allFinite());
+  
+  VERIFY(mnan.hasNaN());
+  VERIFY((s1*mnan).hasNaN());
+  VERIFY(!minf.hasNaN());
+  VERIFY(!(2*minf).hasNaN());
+  VERIFY(mboth.hasNaN());
+  VERIFY(mboth.array().hasNaN());
+  
+  VERIFY(!mnan.allFinite());
+  VERIFY(!minf.allFinite());
+  VERIFY(!(minf-mboth).allFinite());
+  VERIFY(!mboth.allFinite());
+  VERIFY(!mboth.array().allFinite());
+}
+
+EIGEN_DECLARE_TEST(special_numbers)
+{
+  for(int i = 0; i < 10*g_repeat; i++) {
+    CALL_SUBTEST_1( special_numbers<float>() );
+    CALL_SUBTEST_1( special_numbers<double>() );
+  }
+}

diff --git a/test/split_test_helper.h b/test/split_test_helper.h
new file mode 100644
index 0000000..82e82aa
--- /dev/null
+++ b/test/split_test_helper.h

@@ -0,0 +1,5994 @@
+#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_1(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_1(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_2(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_2(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_3) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_3(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_3(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_4) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_4(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_4(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_5) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_5(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_5(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_6) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_6(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_6(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_7) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_7(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_7(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_8) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_8(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_8(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_9) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_9(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_9(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_10) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_10(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_10(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_11) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_11(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_11(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_12) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_12(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_12(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_13) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_13(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_13(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_14) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_14(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_14(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_15) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_15(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_15(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_16) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_16(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_16(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_17) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_17(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_17(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_18) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_18(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_18(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_19) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_19(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_19(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_20) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_20(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_20(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_21) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_21(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_21(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_22) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_22(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_22(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_23) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_23(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_23(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_24) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_24(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_24(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_25) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_25(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_25(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_26) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_26(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_26(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_27) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_27(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_27(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_28) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_28(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_28(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_29) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_29(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_29(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_30) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_30(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_30(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_31) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_31(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_31(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_32) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_32(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_32(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_33) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_33(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_33(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_34) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_34(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_34(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_35) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_35(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_35(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_36) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_36(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_36(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_37) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_37(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_37(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_38) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_38(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_38(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_39) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_39(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_39(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_40) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_40(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_40(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_41) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_41(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_41(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_42) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_42(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_42(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_43) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_43(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_43(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_44) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_44(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_44(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_45) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_45(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_45(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_46) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_46(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_46(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_47) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_47(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_47(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_48) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_48(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_48(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_49) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_49(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_49(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_50) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_50(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_50(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_51) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_51(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_51(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_52) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_52(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_52(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_53) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_53(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_53(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_54) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_54(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_54(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_55) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_55(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_55(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_56) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_56(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_56(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_57) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_57(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_57(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_58) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_58(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_58(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_59) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_59(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_59(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_60) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_60(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_60(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_61) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_61(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_61(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_62) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_62(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_62(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_63) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_63(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_63(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_64) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_64(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_64(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_65) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_65(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_65(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_66) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_66(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_66(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_67) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_67(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_67(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_68) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_68(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_68(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_69) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_69(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_69(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_70) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_70(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_70(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_71) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_71(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_71(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_72) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_72(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_72(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_73) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_73(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_73(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_74) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_74(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_74(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_75) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_75(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_75(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_76) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_76(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_76(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_77) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_77(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_77(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_78) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_78(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_78(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_79) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_79(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_79(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_80) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_80(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_80(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_81) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_81(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_81(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_82) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_82(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_82(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_83) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_83(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_83(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_84) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_84(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_84(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_85) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_85(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_85(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_86) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_86(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_86(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_87) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_87(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_87(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_88) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_88(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_88(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_89) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_89(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_89(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_90) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_90(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_90(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_91) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_91(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_91(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_92) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_92(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_92(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_93) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_93(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_93(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_94) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_94(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_94(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_95) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_95(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_95(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_96) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_96(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_96(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_97) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_97(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_97(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_98) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_98(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_98(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_99) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_99(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_99(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_100) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_100(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_100(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_101) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_101(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_101(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_102) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_102(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_102(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_103) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_103(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_103(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_104) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_104(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_104(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_105) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_105(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_105(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_106) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_106(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_106(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_107) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_107(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_107(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_108) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_108(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_108(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_109) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_109(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_109(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_110) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_110(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_110(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_111) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_111(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_111(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_112) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_112(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_112(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_113) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_113(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_113(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_114) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_114(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_114(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_115) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_115(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_115(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_116) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_116(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_116(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_117) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_117(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_117(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_118) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_118(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_118(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_119) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_119(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_119(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_120) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_120(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_120(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_121) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_121(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_121(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_122) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_122(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_122(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_123) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_123(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_123(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_124) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_124(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_124(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_125) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_125(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_125(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_126) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_126(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_126(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_127) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_127(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_127(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_128) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_128(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_128(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_129) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_129(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_129(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_130) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_130(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_130(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_131) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_131(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_131(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_132) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_132(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_132(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_133) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_133(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_133(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_134) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_134(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_134(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_135) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_135(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_135(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_136) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_136(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_136(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_137) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_137(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_137(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_138) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_138(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_138(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_139) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_139(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_139(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_140) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_140(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_140(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_141) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_141(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_141(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_142) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_142(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_142(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_143) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_143(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_143(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_144) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_144(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_144(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_145) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_145(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_145(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_146) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_146(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_146(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_147) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_147(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_147(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_148) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_148(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_148(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_149) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_149(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_149(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_150) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_150(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_150(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_151) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_151(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_151(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_152) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_152(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_152(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_153) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_153(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_153(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_154) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_154(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_154(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_155) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_155(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_155(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_156) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_156(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_156(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_157) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_157(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_157(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_158) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_158(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_158(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_159) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_159(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_159(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_160) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_160(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_160(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_161) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_161(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_161(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_162) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_162(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_162(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_163) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_163(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_163(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_164) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_164(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_164(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_165) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_165(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_165(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_166) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_166(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_166(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_167) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_167(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_167(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_168) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_168(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_168(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_169) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_169(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_169(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_170) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_170(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_170(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_171) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_171(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_171(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_172) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_172(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_172(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_173) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_173(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_173(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_174) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_174(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_174(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_175) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_175(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_175(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_176) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_176(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_176(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_177) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_177(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_177(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_178) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_178(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_178(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_179) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_179(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_179(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_180) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_180(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_180(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_181) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_181(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_181(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_182) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_182(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_182(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_183) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_183(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_183(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_184) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_184(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_184(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_185) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_185(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_185(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_186) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_186(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_186(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_187) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_187(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_187(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_188) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_188(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_188(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_189) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_189(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_189(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_190) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_190(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_190(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_191) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_191(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_191(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_192) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_192(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_192(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_193) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_193(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_193(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_194) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_194(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_194(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_195) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_195(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_195(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_196) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_196(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_196(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_197) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_197(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_197(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_198) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_198(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_198(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_199) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_199(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_199(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_200) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_200(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_200(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_201) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_201(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_201(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_202) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_202(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_202(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_203) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_203(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_203(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_204) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_204(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_204(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_205) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_205(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_205(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_206) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_206(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_206(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_207) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_207(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_207(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_208) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_208(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_208(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_209) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_209(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_209(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_210) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_210(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_210(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_211) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_211(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_211(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_212) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_212(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_212(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_213) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_213(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_213(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_214) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_214(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_214(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_215) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_215(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_215(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_216) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_216(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_216(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_217) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_217(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_217(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_218) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_218(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_218(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_219) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_219(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_219(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_220) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_220(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_220(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_221) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_221(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_221(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_222) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_222(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_222(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_223) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_223(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_223(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_224) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_224(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_224(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_225) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_225(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_225(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_226) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_226(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_226(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_227) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_227(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_227(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_228) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_228(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_228(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_229) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_229(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_229(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_230) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_230(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_230(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_231) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_231(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_231(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_232) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_232(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_232(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_233) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_233(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_233(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_234) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_234(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_234(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_235) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_235(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_235(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_236) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_236(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_236(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_237) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_237(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_237(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_238) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_238(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_238(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_239) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_239(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_239(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_240) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_240(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_240(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_241) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_241(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_241(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_242) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_242(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_242(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_243) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_243(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_243(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_244) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_244(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_244(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_245) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_245(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_245(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_246) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_246(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_246(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_247) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_247(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_247(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_248) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_248(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_248(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_249) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_249(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_249(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_250) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_250(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_250(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_251) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_251(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_251(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_252) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_252(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_252(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_253) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_253(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_253(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_254) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_254(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_254(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_255) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_255(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_255(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_256) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_256(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_256(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_257) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_257(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_257(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_258) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_258(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_258(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_259) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_259(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_259(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_260) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_260(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_260(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_261) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_261(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_261(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_262) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_262(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_262(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_263) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_263(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_263(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_264) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_264(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_264(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_265) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_265(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_265(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_266) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_266(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_266(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_267) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_267(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_267(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_268) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_268(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_268(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_269) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_269(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_269(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_270) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_270(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_270(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_271) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_271(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_271(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_272) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_272(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_272(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_273) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_273(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_273(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_274) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_274(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_274(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_275) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_275(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_275(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_276) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_276(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_276(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_277) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_277(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_277(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_278) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_278(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_278(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_279) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_279(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_279(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_280) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_280(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_280(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_281) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_281(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_281(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_282) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_282(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_282(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_283) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_283(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_283(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_284) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_284(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_284(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_285) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_285(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_285(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_286) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_286(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_286(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_287) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_287(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_287(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_288) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_288(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_288(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_289) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_289(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_289(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_290) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_290(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_290(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_291) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_291(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_291(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_292) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_292(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_292(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_293) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_293(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_293(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_294) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_294(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_294(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_295) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_295(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_295(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_296) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_296(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_296(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_297) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_297(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_297(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_298) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_298(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_298(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_299) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_299(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_299(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_300) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_300(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_300(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_301) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_301(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_301(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_302) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_302(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_302(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_303) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_303(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_303(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_304) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_304(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_304(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_305) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_305(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_305(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_306) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_306(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_306(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_307) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_307(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_307(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_308) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_308(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_308(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_309) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_309(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_309(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_310) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_310(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_310(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_311) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_311(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_311(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_312) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_312(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_312(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_313) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_313(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_313(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_314) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_314(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_314(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_315) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_315(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_315(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_316) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_316(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_316(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_317) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_317(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_317(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_318) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_318(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_318(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_319) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_319(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_319(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_320) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_320(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_320(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_321) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_321(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_321(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_322) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_322(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_322(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_323) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_323(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_323(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_324) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_324(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_324(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_325) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_325(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_325(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_326) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_326(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_326(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_327) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_327(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_327(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_328) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_328(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_328(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_329) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_329(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_329(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_330) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_330(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_330(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_331) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_331(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_331(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_332) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_332(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_332(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_333) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_333(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_333(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_334) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_334(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_334(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_335) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_335(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_335(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_336) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_336(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_336(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_337) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_337(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_337(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_338) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_338(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_338(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_339) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_339(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_339(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_340) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_340(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_340(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_341) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_341(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_341(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_342) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_342(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_342(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_343) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_343(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_343(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_344) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_344(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_344(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_345) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_345(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_345(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_346) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_346(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_346(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_347) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_347(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_347(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_348) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_348(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_348(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_349) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_349(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_349(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_350) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_350(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_350(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_351) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_351(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_351(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_352) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_352(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_352(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_353) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_353(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_353(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_354) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_354(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_354(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_355) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_355(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_355(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_356) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_356(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_356(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_357) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_357(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_357(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_358) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_358(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_358(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_359) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_359(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_359(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_360) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_360(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_360(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_361) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_361(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_361(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_362) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_362(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_362(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_363) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_363(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_363(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_364) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_364(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_364(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_365) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_365(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_365(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_366) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_366(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_366(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_367) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_367(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_367(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_368) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_368(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_368(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_369) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_369(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_369(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_370) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_370(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_370(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_371) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_371(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_371(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_372) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_372(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_372(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_373) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_373(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_373(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_374) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_374(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_374(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_375) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_375(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_375(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_376) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_376(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_376(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_377) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_377(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_377(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_378) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_378(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_378(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_379) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_379(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_379(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_380) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_380(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_380(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_381) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_381(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_381(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_382) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_382(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_382(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_383) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_383(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_383(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_384) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_384(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_384(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_385) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_385(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_385(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_386) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_386(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_386(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_387) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_387(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_387(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_388) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_388(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_388(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_389) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_389(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_389(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_390) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_390(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_390(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_391) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_391(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_391(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_392) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_392(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_392(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_393) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_393(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_393(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_394) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_394(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_394(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_395) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_395(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_395(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_396) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_396(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_396(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_397) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_397(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_397(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_398) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_398(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_398(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_399) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_399(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_399(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_400) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_400(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_400(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_401) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_401(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_401(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_402) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_402(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_402(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_403) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_403(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_403(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_404) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_404(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_404(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_405) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_405(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_405(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_406) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_406(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_406(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_407) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_407(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_407(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_408) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_408(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_408(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_409) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_409(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_409(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_410) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_410(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_410(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_411) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_411(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_411(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_412) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_412(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_412(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_413) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_413(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_413(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_414) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_414(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_414(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_415) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_415(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_415(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_416) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_416(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_416(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_417) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_417(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_417(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_418) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_418(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_418(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_419) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_419(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_419(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_420) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_420(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_420(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_421) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_421(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_421(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_422) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_422(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_422(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_423) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_423(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_423(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_424) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_424(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_424(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_425) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_425(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_425(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_426) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_426(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_426(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_427) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_427(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_427(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_428) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_428(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_428(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_429) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_429(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_429(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_430) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_430(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_430(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_431) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_431(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_431(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_432) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_432(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_432(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_433) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_433(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_433(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_434) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_434(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_434(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_435) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_435(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_435(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_436) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_436(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_436(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_437) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_437(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_437(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_438) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_438(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_438(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_439) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_439(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_439(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_440) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_440(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_440(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_441) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_441(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_441(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_442) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_442(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_442(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_443) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_443(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_443(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_444) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_444(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_444(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_445) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_445(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_445(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_446) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_446(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_446(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_447) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_447(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_447(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_448) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_448(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_448(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_449) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_449(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_449(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_450) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_450(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_450(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_451) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_451(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_451(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_452) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_452(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_452(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_453) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_453(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_453(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_454) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_454(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_454(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_455) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_455(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_455(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_456) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_456(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_456(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_457) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_457(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_457(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_458) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_458(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_458(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_459) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_459(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_459(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_460) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_460(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_460(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_461) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_461(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_461(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_462) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_462(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_462(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_463) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_463(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_463(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_464) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_464(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_464(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_465) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_465(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_465(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_466) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_466(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_466(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_467) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_467(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_467(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_468) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_468(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_468(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_469) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_469(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_469(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_470) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_470(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_470(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_471) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_471(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_471(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_472) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_472(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_472(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_473) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_473(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_473(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_474) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_474(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_474(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_475) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_475(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_475(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_476) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_476(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_476(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_477) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_477(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_477(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_478) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_478(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_478(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_479) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_479(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_479(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_480) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_480(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_480(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_481) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_481(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_481(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_482) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_482(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_482(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_483) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_483(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_483(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_484) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_484(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_484(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_485) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_485(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_485(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_486) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_486(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_486(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_487) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_487(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_487(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_488) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_488(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_488(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_489) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_489(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_489(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_490) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_490(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_490(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_491) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_491(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_491(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_492) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_492(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_492(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_493) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_493(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_493(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_494) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_494(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_494(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_495) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_495(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_495(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_496) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_496(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_496(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_497) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_497(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_497(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_498) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_498(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_498(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_499) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_499(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_499(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_500) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_500(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_500(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_501) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_501(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_501(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_502) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_502(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_502(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_503) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_503(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_503(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_504) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_504(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_504(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_505) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_505(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_505(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_506) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_506(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_506(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_507) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_507(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_507(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_508) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_508(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_508(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_509) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_509(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_509(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_510) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_510(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_510(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_511) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_511(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_511(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_512) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_512(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_512(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_513) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_513(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_513(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_514) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_514(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_514(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_515) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_515(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_515(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_516) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_516(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_516(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_517) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_517(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_517(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_518) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_518(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_518(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_519) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_519(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_519(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_520) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_520(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_520(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_521) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_521(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_521(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_522) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_522(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_522(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_523) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_523(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_523(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_524) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_524(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_524(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_525) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_525(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_525(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_526) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_526(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_526(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_527) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_527(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_527(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_528) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_528(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_528(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_529) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_529(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_529(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_530) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_530(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_530(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_531) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_531(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_531(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_532) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_532(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_532(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_533) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_533(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_533(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_534) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_534(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_534(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_535) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_535(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_535(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_536) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_536(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_536(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_537) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_537(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_537(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_538) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_538(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_538(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_539) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_539(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_539(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_540) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_540(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_540(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_541) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_541(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_541(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_542) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_542(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_542(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_543) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_543(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_543(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_544) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_544(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_544(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_545) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_545(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_545(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_546) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_546(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_546(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_547) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_547(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_547(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_548) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_548(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_548(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_549) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_549(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_549(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_550) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_550(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_550(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_551) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_551(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_551(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_552) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_552(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_552(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_553) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_553(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_553(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_554) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_554(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_554(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_555) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_555(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_555(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_556) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_556(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_556(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_557) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_557(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_557(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_558) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_558(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_558(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_559) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_559(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_559(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_560) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_560(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_560(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_561) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_561(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_561(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_562) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_562(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_562(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_563) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_563(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_563(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_564) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_564(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_564(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_565) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_565(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_565(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_566) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_566(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_566(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_567) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_567(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_567(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_568) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_568(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_568(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_569) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_569(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_569(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_570) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_570(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_570(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_571) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_571(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_571(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_572) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_572(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_572(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_573) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_573(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_573(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_574) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_574(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_574(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_575) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_575(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_575(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_576) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_576(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_576(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_577) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_577(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_577(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_578) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_578(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_578(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_579) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_579(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_579(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_580) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_580(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_580(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_581) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_581(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_581(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_582) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_582(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_582(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_583) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_583(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_583(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_584) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_584(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_584(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_585) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_585(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_585(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_586) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_586(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_586(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_587) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_587(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_587(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_588) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_588(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_588(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_589) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_589(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_589(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_590) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_590(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_590(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_591) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_591(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_591(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_592) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_592(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_592(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_593) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_593(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_593(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_594) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_594(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_594(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_595) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_595(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_595(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_596) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_596(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_596(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_597) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_597(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_597(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_598) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_598(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_598(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_599) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_599(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_599(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_600) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_600(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_600(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_601) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_601(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_601(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_602) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_602(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_602(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_603) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_603(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_603(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_604) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_604(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_604(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_605) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_605(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_605(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_606) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_606(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_606(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_607) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_607(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_607(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_608) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_608(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_608(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_609) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_609(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_609(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_610) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_610(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_610(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_611) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_611(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_611(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_612) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_612(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_612(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_613) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_613(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_613(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_614) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_614(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_614(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_615) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_615(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_615(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_616) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_616(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_616(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_617) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_617(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_617(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_618) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_618(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_618(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_619) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_619(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_619(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_620) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_620(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_620(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_621) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_621(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_621(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_622) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_622(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_622(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_623) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_623(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_623(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_624) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_624(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_624(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_625) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_625(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_625(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_626) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_626(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_626(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_627) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_627(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_627(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_628) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_628(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_628(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_629) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_629(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_629(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_630) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_630(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_630(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_631) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_631(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_631(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_632) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_632(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_632(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_633) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_633(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_633(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_634) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_634(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_634(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_635) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_635(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_635(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_636) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_636(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_636(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_637) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_637(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_637(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_638) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_638(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_638(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_639) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_639(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_639(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_640) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_640(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_640(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_641) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_641(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_641(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_642) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_642(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_642(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_643) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_643(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_643(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_644) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_644(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_644(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_645) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_645(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_645(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_646) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_646(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_646(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_647) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_647(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_647(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_648) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_648(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_648(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_649) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_649(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_649(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_650) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_650(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_650(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_651) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_651(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_651(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_652) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_652(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_652(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_653) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_653(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_653(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_654) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_654(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_654(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_655) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_655(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_655(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_656) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_656(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_656(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_657) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_657(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_657(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_658) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_658(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_658(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_659) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_659(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_659(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_660) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_660(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_660(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_661) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_661(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_661(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_662) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_662(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_662(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_663) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_663(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_663(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_664) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_664(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_664(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_665) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_665(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_665(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_666) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_666(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_666(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_667) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_667(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_667(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_668) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_668(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_668(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_669) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_669(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_669(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_670) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_670(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_670(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_671) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_671(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_671(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_672) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_672(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_672(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_673) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_673(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_673(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_674) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_674(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_674(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_675) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_675(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_675(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_676) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_676(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_676(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_677) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_677(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_677(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_678) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_678(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_678(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_679) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_679(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_679(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_680) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_680(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_680(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_681) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_681(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_681(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_682) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_682(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_682(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_683) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_683(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_683(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_684) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_684(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_684(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_685) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_685(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_685(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_686) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_686(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_686(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_687) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_687(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_687(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_688) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_688(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_688(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_689) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_689(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_689(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_690) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_690(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_690(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_691) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_691(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_691(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_692) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_692(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_692(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_693) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_693(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_693(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_694) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_694(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_694(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_695) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_695(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_695(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_696) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_696(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_696(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_697) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_697(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_697(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_698) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_698(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_698(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_699) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_699(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_699(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_700) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_700(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_700(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_701) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_701(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_701(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_702) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_702(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_702(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_703) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_703(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_703(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_704) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_704(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_704(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_705) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_705(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_705(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_706) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_706(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_706(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_707) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_707(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_707(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_708) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_708(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_708(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_709) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_709(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_709(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_710) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_710(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_710(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_711) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_711(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_711(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_712) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_712(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_712(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_713) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_713(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_713(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_714) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_714(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_714(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_715) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_715(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_715(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_716) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_716(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_716(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_717) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_717(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_717(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_718) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_718(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_718(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_719) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_719(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_719(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_720) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_720(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_720(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_721) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_721(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_721(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_722) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_722(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_722(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_723) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_723(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_723(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_724) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_724(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_724(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_725) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_725(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_725(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_726) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_726(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_726(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_727) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_727(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_727(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_728) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_728(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_728(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_729) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_729(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_729(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_730) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_730(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_730(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_731) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_731(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_731(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_732) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_732(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_732(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_733) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_733(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_733(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_734) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_734(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_734(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_735) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_735(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_735(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_736) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_736(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_736(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_737) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_737(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_737(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_738) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_738(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_738(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_739) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_739(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_739(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_740) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_740(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_740(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_741) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_741(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_741(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_742) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_742(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_742(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_743) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_743(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_743(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_744) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_744(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_744(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_745) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_745(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_745(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_746) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_746(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_746(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_747) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_747(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_747(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_748) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_748(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_748(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_749) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_749(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_749(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_750) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_750(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_750(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_751) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_751(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_751(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_752) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_752(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_752(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_753) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_753(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_753(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_754) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_754(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_754(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_755) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_755(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_755(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_756) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_756(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_756(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_757) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_757(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_757(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_758) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_758(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_758(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_759) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_759(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_759(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_760) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_760(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_760(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_761) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_761(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_761(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_762) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_762(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_762(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_763) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_763(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_763(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_764) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_764(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_764(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_765) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_765(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_765(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_766) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_766(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_766(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_767) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_767(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_767(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_768) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_768(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_768(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_769) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_769(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_769(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_770) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_770(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_770(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_771) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_771(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_771(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_772) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_772(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_772(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_773) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_773(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_773(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_774) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_774(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_774(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_775) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_775(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_775(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_776) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_776(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_776(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_777) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_777(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_777(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_778) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_778(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_778(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_779) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_779(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_779(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_780) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_780(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_780(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_781) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_781(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_781(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_782) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_782(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_782(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_783) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_783(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_783(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_784) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_784(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_784(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_785) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_785(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_785(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_786) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_786(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_786(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_787) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_787(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_787(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_788) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_788(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_788(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_789) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_789(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_789(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_790) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_790(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_790(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_791) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_791(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_791(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_792) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_792(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_792(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_793) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_793(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_793(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_794) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_794(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_794(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_795) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_795(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_795(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_796) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_796(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_796(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_797) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_797(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_797(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_798) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_798(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_798(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_799) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_799(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_799(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_800) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_800(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_800(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_801) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_801(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_801(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_802) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_802(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_802(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_803) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_803(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_803(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_804) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_804(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_804(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_805) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_805(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_805(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_806) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_806(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_806(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_807) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_807(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_807(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_808) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_808(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_808(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_809) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_809(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_809(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_810) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_810(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_810(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_811) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_811(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_811(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_812) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_812(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_812(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_813) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_813(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_813(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_814) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_814(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_814(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_815) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_815(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_815(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_816) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_816(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_816(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_817) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_817(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_817(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_818) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_818(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_818(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_819) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_819(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_819(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_820) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_820(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_820(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_821) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_821(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_821(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_822) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_822(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_822(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_823) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_823(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_823(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_824) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_824(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_824(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_825) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_825(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_825(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_826) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_826(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_826(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_827) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_827(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_827(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_828) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_828(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_828(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_829) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_829(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_829(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_830) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_830(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_830(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_831) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_831(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_831(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_832) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_832(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_832(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_833) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_833(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_833(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_834) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_834(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_834(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_835) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_835(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_835(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_836) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_836(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_836(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_837) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_837(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_837(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_838) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_838(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_838(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_839) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_839(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_839(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_840) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_840(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_840(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_841) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_841(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_841(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_842) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_842(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_842(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_843) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_843(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_843(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_844) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_844(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_844(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_845) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_845(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_845(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_846) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_846(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_846(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_847) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_847(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_847(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_848) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_848(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_848(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_849) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_849(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_849(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_850) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_850(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_850(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_851) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_851(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_851(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_852) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_852(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_852(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_853) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_853(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_853(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_854) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_854(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_854(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_855) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_855(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_855(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_856) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_856(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_856(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_857) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_857(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_857(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_858) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_858(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_858(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_859) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_859(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_859(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_860) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_860(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_860(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_861) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_861(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_861(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_862) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_862(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_862(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_863) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_863(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_863(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_864) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_864(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_864(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_865) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_865(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_865(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_866) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_866(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_866(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_867) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_867(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_867(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_868) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_868(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_868(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_869) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_869(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_869(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_870) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_870(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_870(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_871) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_871(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_871(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_872) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_872(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_872(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_873) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_873(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_873(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_874) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_874(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_874(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_875) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_875(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_875(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_876) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_876(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_876(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_877) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_877(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_877(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_878) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_878(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_878(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_879) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_879(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_879(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_880) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_880(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_880(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_881) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_881(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_881(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_882) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_882(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_882(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_883) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_883(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_883(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_884) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_884(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_884(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_885) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_885(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_885(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_886) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_886(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_886(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_887) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_887(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_887(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_888) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_888(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_888(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_889) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_889(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_889(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_890) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_890(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_890(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_891) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_891(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_891(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_892) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_892(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_892(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_893) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_893(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_893(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_894) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_894(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_894(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_895) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_895(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_895(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_896) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_896(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_896(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_897) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_897(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_897(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_898) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_898(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_898(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_899) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_899(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_899(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_900) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_900(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_900(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_901) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_901(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_901(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_902) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_902(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_902(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_903) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_903(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_903(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_904) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_904(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_904(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_905) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_905(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_905(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_906) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_906(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_906(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_907) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_907(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_907(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_908) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_908(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_908(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_909) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_909(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_909(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_910) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_910(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_910(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_911) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_911(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_911(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_912) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_912(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_912(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_913) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_913(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_913(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_914) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_914(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_914(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_915) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_915(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_915(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_916) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_916(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_916(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_917) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_917(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_917(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_918) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_918(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_918(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_919) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_919(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_919(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_920) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_920(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_920(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_921) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_921(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_921(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_922) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_922(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_922(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_923) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_923(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_923(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_924) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_924(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_924(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_925) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_925(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_925(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_926) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_926(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_926(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_927) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_927(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_927(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_928) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_928(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_928(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_929) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_929(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_929(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_930) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_930(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_930(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_931) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_931(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_931(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_932) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_932(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_932(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_933) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_933(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_933(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_934) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_934(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_934(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_935) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_935(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_935(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_936) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_936(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_936(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_937) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_937(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_937(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_938) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_938(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_938(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_939) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_939(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_939(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_940) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_940(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_940(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_941) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_941(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_941(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_942) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_942(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_942(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_943) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_943(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_943(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_944) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_944(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_944(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_945) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_945(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_945(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_946) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_946(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_946(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_947) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_947(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_947(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_948) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_948(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_948(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_949) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_949(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_949(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_950) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_950(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_950(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_951) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_951(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_951(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_952) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_952(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_952(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_953) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_953(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_953(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_954) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_954(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_954(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_955) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_955(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_955(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_956) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_956(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_956(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_957) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_957(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_957(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_958) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_958(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_958(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_959) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_959(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_959(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_960) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_960(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_960(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_961) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_961(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_961(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_962) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_962(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_962(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_963) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_963(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_963(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_964) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_964(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_964(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_965) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_965(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_965(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_966) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_966(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_966(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_967) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_967(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_967(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_968) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_968(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_968(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_969) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_969(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_969(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_970) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_970(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_970(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_971) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_971(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_971(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_972) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_972(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_972(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_973) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_973(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_973(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_974) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_974(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_974(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_975) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_975(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_975(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_976) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_976(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_976(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_977) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_977(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_977(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_978) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_978(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_978(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_979) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_979(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_979(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_980) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_980(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_980(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_981) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_981(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_981(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_982) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_982(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_982(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_983) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_983(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_983(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_984) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_984(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_984(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_985) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_985(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_985(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_986) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_986(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_986(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_987) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_987(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_987(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_988) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_988(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_988(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_989) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_989(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_989(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_990) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_990(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_990(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_991) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_991(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_991(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_992) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_992(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_992(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_993) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_993(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_993(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_994) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_994(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_994(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_995) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_995(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_995(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_996) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_996(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_996(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_997) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_997(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_997(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_998) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_998(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_998(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_999) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_999(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_999(FUNC)
+#endif
+

diff --git a/test/spqr_support.cpp b/test/spqr_support.cpp
new file mode 100644
index 0000000..79c2c12
--- /dev/null
+++ b/test/spqr_support.cpp

@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire Nuentsa Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse.h"
+#include <Eigen/SPQRSupport>
+
+
+template<typename MatrixType,typename DenseMat>
+int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows = 300, int maxCols = 300)
+{
+  eigen_assert(maxRows >= maxCols);
+  typedef typename MatrixType::Scalar Scalar;
+  int rows = internal::random<int>(1,maxRows);
+  int cols = internal::random<int>(1,rows);
+  double density = (std::max)(8./(rows*cols), 0.01);
+  
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+  initSparse<Scalar>(density, dA, A,ForceNonZeroDiag);
+  A.makeCompressed();
+  return rows;
+}
+
+template<typename Scalar> void test_spqr_scalar()
+{
+  typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
+  MatrixType A;
+  Matrix<Scalar,Dynamic,Dynamic> dA;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  DenseVector refX,x,b; 
+  SPQR<MatrixType> solver; 
+  generate_sparse_rectangular_problem(A,dA);
+  
+  Index m = A.rows();
+  b = DenseVector::Random(m);
+  solver.compute(A);
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse QR factorization failed\n";
+    exit(0);
+    return;
+  }
+  x = solver.solve(b);
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse QR factorization failed\n";
+    exit(0);
+    return;
+  }  
+  //Compare with a dense solver
+  refX = dA.colPivHouseholderQr().solve(b);
+  VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+}
+EIGEN_DECLARE_TEST(spqr_support)
+{
+  CALL_SUBTEST_1(test_spqr_scalar<double>());
+  CALL_SUBTEST_2(test_spqr_scalar<std::complex<double> >());
+}

diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
new file mode 100644
index 0000000..cb8a80c
--- /dev/null
+++ b/test/stable_norm.cpp

@@ -0,0 +1,245 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename T> EIGEN_DONT_INLINE T copy(const T& x)
+{
+  return x;
+}
+
+template<typename MatrixType> void stable_norm(const MatrixType& m)
+{
+  /* this test covers the following files:
+     StableNorm.h
+  */
+  using std::sqrt;
+  using std::abs;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  
+  bool complex_real_product_ok = true;
+
+  // Check the basic machine-dependent constants.
+  {
+    int ibeta, it, iemin, iemax;
+
+    ibeta = std::numeric_limits<RealScalar>::radix;         // base for floating-point numbers
+    it    = std::numeric_limits<RealScalar>::digits;        // number of base-beta digits in mantissa
+    iemin = std::numeric_limits<RealScalar>::min_exponent;  // minimum exponent
+    iemax = std::numeric_limits<RealScalar>::max_exponent;  // maximum exponent
+
+    VERIFY( (!(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5) || (it<=4 && ibeta <= 3 ) || it<2))
+           && "the stable norm algorithm cannot be guaranteed on this computer");
+    
+    Scalar inf = std::numeric_limits<RealScalar>::infinity();
+    if(NumTraits<Scalar>::IsComplex && (numext::isnan)(inf*RealScalar(1)) )
+    {
+      complex_real_product_ok = false;
+      static bool first = true;
+      if(first)
+        std::cerr << "WARNING: compiler mess up complex*real product, " << inf << " * " << 1.0 << " = " << inf*RealScalar(1) << std::endl;
+      first = false;
+    }
+  }
+
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  // get a non-zero random factor
+  Scalar factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar big = factor * ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+  
+  factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar small = factor * ((std::numeric_limits<RealScalar>::min)() * RealScalar(1e4));
+
+  Scalar one(1);
+
+  MatrixType  vzero = MatrixType::Zero(rows, cols),
+              vrand = MatrixType::Random(rows, cols),
+              vbig(rows, cols),
+              vsmall(rows,cols);
+
+  vbig.fill(big);
+  vsmall.fill(small);
+
+  VERIFY_IS_MUCH_SMALLER_THAN(vzero.norm(), static_cast<RealScalar>(1));
+  VERIFY_IS_APPROX(vrand.stableNorm(),      vrand.norm());
+  VERIFY_IS_APPROX(vrand.blueNorm(),        vrand.norm());
+  VERIFY_IS_APPROX(vrand.hypotNorm(),       vrand.norm());
+
+  // test with expressions as input
+  VERIFY_IS_APPROX((one*vrand).stableNorm(),      vrand.norm());
+  VERIFY_IS_APPROX((one*vrand).blueNorm(),        vrand.norm());
+  VERIFY_IS_APPROX((one*vrand).hypotNorm(),       vrand.norm());
+  VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).stableNorm(),      vrand.norm());
+  VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).blueNorm(),        vrand.norm());
+  VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).hypotNorm(),       vrand.norm());
+
+  RealScalar size = static_cast<RealScalar>(m.size());
+
+  // test numext::isfinite
+  VERIFY(!(numext::isfinite)( std::numeric_limits<RealScalar>::infinity()));
+  VERIFY(!(numext::isfinite)(sqrt(-abs(big))));
+
+  // test overflow
+  VERIFY((numext::isfinite)(sqrt(size)*abs(big)));
+  VERIFY_IS_NOT_APPROX(sqrt(copy(vbig.squaredNorm())), abs(sqrt(size)*big)); // here the default norm must fail
+  VERIFY_IS_APPROX(vbig.stableNorm(), sqrt(size)*abs(big));
+  VERIFY_IS_APPROX(vbig.blueNorm(),   sqrt(size)*abs(big));
+  VERIFY_IS_APPROX(vbig.hypotNorm(),  sqrt(size)*abs(big));
+
+  // test underflow
+  VERIFY((numext::isfinite)(sqrt(size)*abs(small)));
+  VERIFY_IS_NOT_APPROX(sqrt(copy(vsmall.squaredNorm())),   abs(sqrt(size)*small)); // here the default norm must fail
+  VERIFY_IS_APPROX(vsmall.stableNorm(), sqrt(size)*abs(small));
+  VERIFY_IS_APPROX(vsmall.blueNorm(),   sqrt(size)*abs(small));
+  VERIFY_IS_APPROX(vsmall.hypotNorm(),  sqrt(size)*abs(small));
+
+  // Test compilation of cwise() version
+  VERIFY_IS_APPROX(vrand.colwise().stableNorm(),      vrand.colwise().norm());
+  VERIFY_IS_APPROX(vrand.colwise().blueNorm(),        vrand.colwise().norm());
+  VERIFY_IS_APPROX(vrand.colwise().hypotNorm(),       vrand.colwise().norm());
+  VERIFY_IS_APPROX(vrand.rowwise().stableNorm(),      vrand.rowwise().norm());
+  VERIFY_IS_APPROX(vrand.rowwise().blueNorm(),        vrand.rowwise().norm());
+  VERIFY_IS_APPROX(vrand.rowwise().hypotNorm(),       vrand.rowwise().norm());
+  
+  // test NaN, +inf, -inf 
+  MatrixType v;
+  Index i = internal::random<Index>(0,rows-1);
+  Index j = internal::random<Index>(0,cols-1);
+
+  // NaN
+  {
+    v = vrand;
+    v(i,j) = std::numeric_limits<RealScalar>::quiet_NaN();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY((numext::isnan)(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY((numext::isnan)(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));    VERIFY((numext::isnan)(v.stableNorm()));
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY((numext::isnan)(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+  }
+  
+  // +inf
+  {
+    v = vrand;
+    v(i,j) = std::numeric_limits<RealScalar>::infinity();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY(isPlusInf(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY(isPlusInf(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));
+    if(complex_real_product_ok){
+      VERIFY(isPlusInf(v.stableNorm()));
+    }
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY(isPlusInf(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY(isPlusInf(v.hypotNorm()));
+  }
+  
+  // -inf
+  {
+    v = vrand;
+    v(i,j) = -std::numeric_limits<RealScalar>::infinity();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY(isPlusInf(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY(isPlusInf(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));
+    if(complex_real_product_ok) {
+      VERIFY(isPlusInf(v.stableNorm()));
+    }
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY(isPlusInf(v.blueNorm()));
+    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY(isPlusInf(v.hypotNorm()));
+  }
+  
+  // mix
+  {
+    Index i2 = internal::random<Index>(0,rows-1);
+    Index j2 = internal::random<Index>(0,cols-1);
+    v = vrand;
+    v(i,j) = -std::numeric_limits<RealScalar>::infinity();
+    v(i2,j2) = std::numeric_limits<RealScalar>::quiet_NaN();
+    VERIFY(!(numext::isfinite)(v.squaredNorm()));   VERIFY((numext::isnan)(v.squaredNorm()));
+    VERIFY(!(numext::isfinite)(v.norm()));          VERIFY((numext::isnan)(v.norm()));
+    VERIFY(!(numext::isfinite)(v.stableNorm()));    VERIFY((numext::isnan)(v.stableNorm()));
+    VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY((numext::isnan)(v.blueNorm()));
+    if (i2 != i || j2 != j) {
+      // hypot propagates inf over NaN.
+      VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isinf)(v.hypotNorm()));
+    } else {
+      // inf is overwritten by NaN, expect norm to be NaN.
+      VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+    }
+  }
+
+  // stableNormalize[d]
+  {
+    VERIFY_IS_APPROX(vrand.stableNormalized(), vrand.normalized());
+    MatrixType vcopy(vrand);
+    vcopy.stableNormalize();
+    VERIFY_IS_APPROX(vcopy, vrand.normalized());
+    VERIFY_IS_APPROX((vrand.stableNormalized()).norm(), RealScalar(1));
+    VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1));
+    VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1));
+    VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1));
+    RealScalar big_scaling = ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+    VERIFY_IS_APPROX(vbig/big_scaling, (vbig.stableNorm() * vbig.stableNormalized()).eval()/big_scaling);
+    VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized());
+  }
+}
+
+template<typename Scalar>
+void test_hypot()
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Scalar factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar big = factor * ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+  
+  factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar small = factor * ((std::numeric_limits<RealScalar>::min)() * RealScalar(1e4));
+
+  Scalar  one   (1),
+          zero  (0),
+          sqrt2 (std::sqrt(2)),
+          nan   (std::numeric_limits<RealScalar>::quiet_NaN());
+
+  Scalar a = internal::random<Scalar>(-1,1);
+  Scalar b = internal::random<Scalar>(-1,1);
+  VERIFY_IS_APPROX(numext::hypot(a,b),std::sqrt(numext::abs2(a)+numext::abs2(b)));
+  VERIFY_IS_EQUAL(numext::hypot(zero,zero), zero);
+  VERIFY_IS_APPROX(numext::hypot(one, one), sqrt2);
+  VERIFY_IS_APPROX(numext::hypot(big,big), sqrt2*numext::abs(big));
+  VERIFY_IS_APPROX(numext::hypot(small,small), sqrt2*numext::abs(small));
+  VERIFY_IS_APPROX(numext::hypot(small,big), numext::abs(big));
+  VERIFY((numext::isnan)(numext::hypot(nan,a)));
+  VERIFY((numext::isnan)(numext::hypot(a,nan)));
+}
+
+EIGEN_DECLARE_TEST(stable_norm)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_3( test_hypot<double>() );
+    CALL_SUBTEST_4( test_hypot<float>() );
+    CALL_SUBTEST_5( test_hypot<std::complex<double> >() );
+    CALL_SUBTEST_6( test_hypot<std::complex<float> >() );
+
+    CALL_SUBTEST_1( stable_norm(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( stable_norm(Vector4d()) );
+    CALL_SUBTEST_3( stable_norm(VectorXd(internal::random<int>(10,2000))) );
+    CALL_SUBTEST_3( stable_norm(MatrixXd(internal::random<int>(10,200), internal::random<int>(10,200))) );
+    CALL_SUBTEST_4( stable_norm(VectorXf(internal::random<int>(10,2000))) );
+    CALL_SUBTEST_5( stable_norm(VectorXcd(internal::random<int>(10,2000))) );
+    CALL_SUBTEST_6( stable_norm(VectorXcf(internal::random<int>(10,2000))) );
+  }
+}

diff --git a/test/stddeque.cpp b/test/stddeque.cpp
new file mode 100644
index 0000000..ea85ea9
--- /dev/null
+++ b/test/stddeque.cpp

@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/StdDeque>
+#include <Eigen/Geometry>
+
+template<typename MatrixType>
+void check_stddeque_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::deque<MatrixType,Eigen::aligned_allocator<MatrixType> > v(10, MatrixType::Zero(rows,cols)), w(20, y);
+  v.front() = x;
+  w.front() = w.back();
+  VERIFY_IS_APPROX(w.front(), w.back());
+  v = w;
+
+  typename std::deque<MatrixType,Eigen::aligned_allocator<MatrixType> >::iterator vi = v.begin();
+  typename std::deque<MatrixType,Eigen::aligned_allocator<MatrixType> >::iterator wi = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*vi, *wi);
+    ++vi;
+    ++wi;
+  }
+
+  v.resize(21,MatrixType::Zero(rows,cols));  
+  v.back() = x;
+  VERIFY_IS_APPROX(v.back(), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v.back(), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v.back(), x);
+}
+
+template<typename TransformType>
+void check_stddeque_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random()), ti=TransformType::Identity();
+  std::deque<TransformType,Eigen::aligned_allocator<TransformType> > v(10,ti), w(20, y);
+  v.front() = x;
+  w.front() = w.back();
+  VERIFY_IS_APPROX(w.front(), w.back());
+  v = w;
+
+  typename std::deque<TransformType,Eigen::aligned_allocator<TransformType> >::iterator vi = v.begin();
+  typename std::deque<TransformType,Eigen::aligned_allocator<TransformType> >::iterator wi = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*vi, *wi);
+    ++vi;
+    ++wi;
+  }
+
+  v.resize(21,ti);
+  v.back() = x;
+  VERIFY_IS_APPROX(v.back(), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v.back(), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v.back(), x);
+}
+
+template<typename QuaternionType>
+void check_stddeque_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random()), qi=QuaternionType::Identity();
+  std::deque<QuaternionType,Eigen::aligned_allocator<QuaternionType> > v(10,qi), w(20, y);
+  v.front() = x;
+  w.front() = w.back();
+  VERIFY_IS_APPROX(w.front(), w.back());
+  v = w;
+
+  typename std::deque<QuaternionType,Eigen::aligned_allocator<QuaternionType> >::iterator vi = v.begin();
+  typename std::deque<QuaternionType,Eigen::aligned_allocator<QuaternionType> >::iterator wi = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*vi, *wi);
+    ++vi;
+    ++wi;
+  }
+
+  v.resize(21,qi);
+  v.back() = x;
+  VERIFY_IS_APPROX(v.back(), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v.back(), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v.back(), x);
+}
+
+EIGEN_DECLARE_TEST(stddeque)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stddeque_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stddeque_transform(Affine2f()));
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaterniond()));
+}

diff --git a/test/stddeque_overload.cpp b/test/stddeque_overload.cpp
new file mode 100644
index 0000000..0f59f06
--- /dev/null
+++ b/test/stddeque_overload.cpp

@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/StdDeque>
+#include <Eigen/Geometry>
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond)
+
+template<typename MatrixType>
+void check_stddeque_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::deque<MatrixType> v(10, MatrixType::Zero(rows,cols)), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+
+  // do a lot of push_back such that the deque gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i]==w[(i-23)%w.size()]);
+  }
+}
+
+template<typename TransformType>
+void check_stddeque_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random()), ti=TransformType::Identity();
+  std::deque<TransformType> v(10,ti), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21,ti);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+
+  // do a lot of push_back such that the deque gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stddeque_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random()), qi=QuaternionType::Identity();
+  std::deque<QuaternionType> v(10,qi), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21,qi);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+
+  // do a lot of push_back such that the deque gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
+  }
+}
+
+EIGEN_DECLARE_TEST(stddeque_overload)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stddeque_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stddeque_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stddeque_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stddeque_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stddeque_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stddeque_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stddeque_quaternion(Quaterniond()));
+}

diff --git a/test/stdlist.cpp b/test/stdlist.cpp
new file mode 100644
index 0000000..1af9e6e
--- /dev/null
+++ b/test/stdlist.cpp

@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/StdList>
+#include <Eigen/Geometry>
+
+template<typename MatrixType>
+void check_stdlist_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::list<MatrixType,Eigen::aligned_allocator<MatrixType> > v(10, MatrixType::Zero(rows,cols)), w(20, y);
+  v.front() = x;
+  w.front() = w.back();
+  VERIFY_IS_APPROX(w.front(), w.back());
+  v = w;
+
+  typename std::list<MatrixType,Eigen::aligned_allocator<MatrixType> >::iterator vi = v.begin();
+  typename std::list<MatrixType,Eigen::aligned_allocator<MatrixType> >::iterator wi = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*vi, *wi);
+    ++vi;
+    ++wi;
+  }
+
+  v.resize(21, MatrixType::Zero(rows,cols));  
+  v.back() = x;
+  VERIFY_IS_APPROX(v.back(), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v.back(), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v.back(), x);
+}
+
+template<typename TransformType>
+void check_stdlist_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random()), ti=TransformType::Identity();
+  std::list<TransformType,Eigen::aligned_allocator<TransformType> > v(10,ti), w(20, y);
+  v.front() = x;
+  w.front() = w.back();
+  VERIFY_IS_APPROX(w.front(), w.back());
+  v = w;
+
+  typename std::list<TransformType,Eigen::aligned_allocator<TransformType> >::iterator vi = v.begin();
+  typename std::list<TransformType,Eigen::aligned_allocator<TransformType> >::iterator wi = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*vi, *wi);
+    ++vi;
+    ++wi;
+  }
+
+  v.resize(21, ti);
+  v.back() = x;
+  VERIFY_IS_APPROX(v.back(), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v.back(), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v.back(), x);
+}
+
+template<typename QuaternionType>
+void check_stdlist_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random()), qi=QuaternionType::Identity();
+  std::list<QuaternionType,Eigen::aligned_allocator<QuaternionType> > v(10,qi), w(20, y);
+  v.front() = x;
+  w.front() = w.back();
+  VERIFY_IS_APPROX(w.front(), w.back());
+  v = w;
+
+  typename std::list<QuaternionType,Eigen::aligned_allocator<QuaternionType> >::iterator vi = v.begin();
+  typename std::list<QuaternionType,Eigen::aligned_allocator<QuaternionType> >::iterator wi = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*vi, *wi);
+    ++vi;
+    ++wi;
+  }
+
+  v.resize(21,qi);
+  v.back() = x;
+  VERIFY_IS_APPROX(v.back(), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v.back(), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v.back(), x);
+}
+
+EIGEN_DECLARE_TEST(stdlist)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stdlist_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stdlist_transform(Affine2f()));
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaterniond()));
+}

diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
new file mode 100644
index 0000000..a78516e
--- /dev/null
+++ b/test/stdlist_overload.cpp

@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/StdList>
+#include <Eigen/Geometry>
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaterniond)
+
+template <class Container, class Position>
+typename Container::iterator get(Container & c, Position position)
+{
+  typename Container::iterator it = c.begin();
+  std::advance(it, position);
+  return it;
+}
+
+template <class Container, class Position, class Value>
+void set(Container & c, Position position, const Value & value)
+{
+  typename Container::iterator it = c.begin();
+  std::advance(it, position);
+  *it = value;
+}
+
+template<typename MatrixType>
+void check_stdlist_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::list<MatrixType> v(10, MatrixType::Zero(rows,cols)), w(20, y);
+  typename std::list<MatrixType>::iterator itv = get(v, 5);
+  typename std::list<MatrixType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY((*get(v, i))==(*get(w, (i-23)%w.size())));
+  }
+}
+
+template<typename TransformType>
+void check_stdlist_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random()), ti=TransformType::Identity();
+  std::list<TransformType> v(10,ti), w(20, y);
+  typename std::list<TransformType>::iterator itv = get(v, 5);
+  typename std::list<TransformType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21, ti);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(get(v, i)->matrix()==get(w, (i-23)%w.size())->matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stdlist_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random()), qi=QuaternionType::Identity();
+  std::list<QuaternionType> v(10,qi), w(20, y);
+  typename std::list<QuaternionType>::iterator itv = get(v, 5);
+  typename std::list<QuaternionType>::iterator itw = get(w, 6);
+  *itv = x;
+  *itw = *itv;
+  VERIFY_IS_APPROX(*itw, *itv);
+  v = w;
+  itv = v.begin();
+  itw = w.begin();
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(*itw, *itv);
+    ++itv;
+    ++itw;
+  }
+
+  v.resize(21,qi);
+  set(v, 20, x);
+  VERIFY_IS_APPROX(*get(v, 20), x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(*get(v, 21), y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(*get(v, 22), x);
+
+  // do a lot of push_back such that the list gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &(*get(w, 0));
+  for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
+    v.push_back(*get(w, i%w.size()));
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(get(v, i)->coeffs()==get(w, (i-23)%w.size())->coeffs());
+  }
+}
+
+EIGEN_DECLARE_TEST(stdlist_overload)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stdlist_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stdlist_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stdlist_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stdlist_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stdlist_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stdlist_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stdlist_quaternion(Quaterniond()));
+}

diff --git a/test/stdvector.cpp b/test/stdvector.cpp
new file mode 100644
index 0000000..18de240
--- /dev/null
+++ b/test/stdvector.cpp

@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/StdVector>
+#include <Eigen/Geometry>
+
+template<typename MatrixType>
+void check_stdvector_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::vector<MatrixType,Eigen::aligned_allocator<MatrixType> > v(10, MatrixType::Zero(rows,cols)), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(MatrixType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i]==w[(i-23)%w.size()]);
+  }
+}
+
+template<typename TransformType>
+void check_stdvector_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random());
+  std::vector<TransformType,Eigen::aligned_allocator<TransformType> > v(10), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(TransformType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stdvector_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random()), qi=QuaternionType::Identity();
+  std::vector<QuaternionType,Eigen::aligned_allocator<QuaternionType> > v(10,qi), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(QuaternionType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
+  }
+}
+
+// the code below triggered an invalid warning with gcc >= 7
+// eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
+// This has been reported to gcc there: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
+void std_vector_gcc_warning()
+{
+  typedef Eigen::Vector3f T;
+  std::vector<T, Eigen::aligned_allocator<T> > v;
+  v.push_back(T());
+}
+
+EIGEN_DECLARE_TEST(stdvector)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stdvector_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stdvector_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdvector_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stdvector_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stdvector_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stdvector_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stdvector_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stdvector_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stdvector_transform(Projective2f()));
+  CALL_SUBTEST_4(check_stdvector_transform(Projective3f()));
+  CALL_SUBTEST_4(check_stdvector_transform(Projective3d()));
+  //CALL_SUBTEST(heck_stdvector_transform(Projective4d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stdvector_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stdvector_quaternion(Quaterniond()));
+}

diff --git a/test/stdvector_overload.cpp b/test/stdvector_overload.cpp
new file mode 100644
index 0000000..da04f8a
--- /dev/null
+++ b/test/stdvector_overload.cpp

@@ -0,0 +1,161 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/StdVector>
+#include <Eigen/Geometry>
+
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Vector4f)
+
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix2f)
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix4f)
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix4d)
+
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Affine3f)
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Affine3d)
+
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Quaternionf)
+EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Quaterniond)
+
+template<typename MatrixType>
+void check_stdvector_matrix(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
+  std::vector<MatrixType> v(10, MatrixType::Zero(rows,cols)), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(MatrixType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  MatrixType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i]==w[(i-23)%w.size()]);
+  }
+}
+
+template<typename TransformType>
+void check_stdvector_transform(const TransformType&)
+{
+  typedef typename TransformType::MatrixType MatrixType;
+  TransformType x(MatrixType::Random()), y(MatrixType::Random());
+  std::vector<TransformType> v(10), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(TransformType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  TransformType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
+  }
+}
+
+template<typename QuaternionType>
+void check_stdvector_quaternion(const QuaternionType&)
+{
+  typedef typename QuaternionType::Coefficients Coefficients;
+  QuaternionType x(Coefficients::Random()), y(Coefficients::Random()), qi=QuaternionType::Identity();
+  std::vector<QuaternionType> v(10,qi), w(20, y);
+  v[5] = x;
+  w[6] = v[5];
+  VERIFY_IS_APPROX(w[6], v[5]);
+  v = w;
+  for(int i = 0; i < 20; i++)
+  {
+    VERIFY_IS_APPROX(w[i], v[i]);
+  }
+
+  v.resize(21);
+  v[20] = x;
+  VERIFY_IS_APPROX(v[20], x);
+  v.resize(22,y);
+  VERIFY_IS_APPROX(v[21], y);
+  v.push_back(x);
+  VERIFY_IS_APPROX(v[22], x);
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(QuaternionType));
+
+  // do a lot of push_back such that the vector gets internally resized
+  // (with memory reallocation)
+  QuaternionType* ref = &w[0];
+  for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
+    v.push_back(w[i%w.size()]);
+  for(unsigned int i=23; i<v.size(); ++i)
+  {
+    VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
+  }
+}
+
+EIGEN_DECLARE_TEST(stdvector_overload)
+{
+  // some non vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
+  CALL_SUBTEST_1(check_stdvector_matrix(Matrix3f()));
+  CALL_SUBTEST_2(check_stdvector_matrix(Matrix3d()));
+
+  // some vectorizable fixed sizes
+  CALL_SUBTEST_1(check_stdvector_matrix(Matrix2f()));
+  CALL_SUBTEST_1(check_stdvector_matrix(Vector4f()));
+  CALL_SUBTEST_1(check_stdvector_matrix(Matrix4f()));
+  CALL_SUBTEST_2(check_stdvector_matrix(Matrix4d()));
+
+  // some dynamic sizes
+  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXd(1,1)));
+  CALL_SUBTEST_3(check_stdvector_matrix(VectorXd(20)));
+  CALL_SUBTEST_3(check_stdvector_matrix(RowVectorXf(20)));
+  CALL_SUBTEST_3(check_stdvector_matrix(MatrixXcf(10,10)));
+
+  // some Transform
+  CALL_SUBTEST_4(check_stdvector_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
+  CALL_SUBTEST_4(check_stdvector_transform(Affine3f()));
+  CALL_SUBTEST_4(check_stdvector_transform(Affine3d()));
+
+  // some Quaternion
+  CALL_SUBTEST_5(check_stdvector_quaternion(Quaternionf()));
+  CALL_SUBTEST_5(check_stdvector_quaternion(Quaterniond()));
+}

diff --git a/test/stl_iterators.cpp b/test/stl_iterators.cpp
new file mode 100644
index 0000000..72bbf82
--- /dev/null
+++ b/test/stl_iterators.cpp

@@ -0,0 +1,562 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <iterator>
+#include <numeric>
+
+template< class Iterator >
+std::reverse_iterator<Iterator>
+make_reverse_iterator( Iterator i )
+{
+  return std::reverse_iterator<Iterator>(i);
+}
+
+#if !EIGEN_HAS_CXX11
+template<class ForwardIt>
+ForwardIt is_sorted_until(ForwardIt firstIt, ForwardIt lastIt)
+{
+    if (firstIt != lastIt) {
+        ForwardIt next = firstIt;
+        while (++next != lastIt) {
+            if (*next < *firstIt)
+                return next;
+            firstIt = next;
+        }
+    }
+    return lastIt;
+}
+template<class ForwardIt>
+bool is_sorted(ForwardIt firstIt, ForwardIt lastIt)
+{
+    return ::is_sorted_until(firstIt, lastIt) == lastIt;
+}
+#else
+using std::is_sorted;
+#endif
+
+template<typename XprType>
+bool is_pointer_based_stl_iterator(const internal::pointer_based_stl_iterator<XprType> &) { return true; }
+
+template<typename XprType>
+bool is_generic_randaccess_stl_iterator(const internal::generic_randaccess_stl_iterator<XprType> &) { return true; }
+
+template<typename Iter>
+bool is_default_constructible_and_assignable(const Iter& it)
+{
+#if EIGEN_HAS_CXX11
+  VERIFY(std::is_default_constructible<Iter>::value);
+  VERIFY(std::is_nothrow_default_constructible<Iter>::value);
+#endif
+  Iter it2;
+  it2 = it;
+  return (it==it2);
+}
+
+template<typename Xpr>
+void check_begin_end_for_loop(Xpr xpr)
+{
+  const Xpr& cxpr(xpr);
+  Index i = 0;
+
+  i = 0;
+  for(typename Xpr::iterator it = xpr.begin(); it!=xpr.end(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  i = 0;
+  for(typename Xpr::const_iterator it = xpr.cbegin(); it!=xpr.cend(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  i = 0;
+  for(typename Xpr::const_iterator it = cxpr.begin(); it!=cxpr.end(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  i = 0;
+  for(typename Xpr::const_iterator it = xpr.begin(); it!=xpr.end(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  {
+    // simple API check
+    typename Xpr::const_iterator cit = xpr.begin();
+    cit = xpr.cbegin();
+
+    #if EIGEN_HAS_CXX11
+    auto tmp1 = xpr.begin();
+    VERIFY(tmp1==xpr.begin());
+    auto tmp2 = xpr.cbegin();
+    VERIFY(tmp2==xpr.cbegin());
+    #endif
+  }
+
+  VERIFY( xpr.end() -xpr.begin()  == xpr.size() );
+  VERIFY( xpr.cend()-xpr.begin()  == xpr.size() );
+  VERIFY( xpr.end() -xpr.cbegin() == xpr.size() );
+  VERIFY( xpr.cend()-xpr.cbegin() == xpr.size() );
+
+  if(xpr.size()>0) {
+    VERIFY(xpr.begin() != xpr.end());
+    VERIFY(xpr.begin() < xpr.end());
+    VERIFY(xpr.begin() <= xpr.end());
+    VERIFY(!(xpr.begin() == xpr.end()));
+    VERIFY(!(xpr.begin() > xpr.end()));
+    VERIFY(!(xpr.begin() >= xpr.end()));
+    
+    VERIFY(xpr.cbegin() != xpr.end());
+    VERIFY(xpr.cbegin() < xpr.end());
+    VERIFY(xpr.cbegin() <= xpr.end());
+    VERIFY(!(xpr.cbegin() == xpr.end()));
+    VERIFY(!(xpr.cbegin() > xpr.end()));
+    VERIFY(!(xpr.cbegin() >= xpr.end()));
+
+    VERIFY(xpr.begin() != xpr.cend());
+    VERIFY(xpr.begin() < xpr.cend());
+    VERIFY(xpr.begin() <= xpr.cend());
+    VERIFY(!(xpr.begin() == xpr.cend()));
+    VERIFY(!(xpr.begin() > xpr.cend()));
+    VERIFY(!(xpr.begin() >= xpr.cend()));
+  }
+}
+
+template<typename Scalar, int Rows, int Cols>
+void test_stl_iterators(int rows=Rows, int cols=Cols)
+{
+  typedef Matrix<Scalar,Rows,1> VectorType;
+  #if EIGEN_HAS_CXX11
+  typedef Matrix<Scalar,1,Cols> RowVectorType;
+  #endif
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMatrixType;
+  VectorType v = VectorType::Random(rows);
+  const VectorType& cv(v);
+  ColMatrixType A = ColMatrixType::Random(rows,cols);
+  const ColMatrixType& cA(A);
+  RowMatrixType B = RowMatrixType::Random(rows,cols);
+  
+  Index i, j;
+
+  // Verify that iterators are default constructible (See bug #1900)
+  {
+    VERIFY( is_default_constructible_and_assignable(v.begin()));
+    VERIFY( is_default_constructible_and_assignable(v.end()));
+    VERIFY( is_default_constructible_and_assignable(cv.begin()));
+    VERIFY( is_default_constructible_and_assignable(cv.end()));
+
+    VERIFY( is_default_constructible_and_assignable(A.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(A.row(0).end()));
+    VERIFY( is_default_constructible_and_assignable(cA.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(cA.row(0).end()));
+
+    VERIFY( is_default_constructible_and_assignable(B.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(B.row(0).end()));
+  }
+
+  // Check we got a fast pointer-based iterator when expected
+  {
+    VERIFY( is_pointer_based_stl_iterator(v.begin()) );
+    VERIFY( is_pointer_based_stl_iterator(v.end()) );
+    VERIFY( is_pointer_based_stl_iterator(cv.begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cv.end()) );
+
+    j = internal::random<Index>(0,A.cols()-1);
+    VERIFY( is_pointer_based_stl_iterator(A.col(j).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(A.col(j).end()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.col(j).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.col(j).end()) );
+
+    i = internal::random<Index>(0,A.rows()-1);
+    VERIFY( is_pointer_based_stl_iterator(A.row(i).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(A.row(i).end()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.row(i).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.row(i).end()) );
+
+    VERIFY( is_pointer_based_stl_iterator(A.reshaped().begin()) );
+    VERIFY( is_pointer_based_stl_iterator(A.reshaped().end()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.reshaped().begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.reshaped().end()) );
+
+    VERIFY( is_pointer_based_stl_iterator(B.template reshaped<AutoOrder>().begin()) );
+    VERIFY( is_pointer_based_stl_iterator(B.template reshaped<AutoOrder>().end()) );
+
+    VERIFY( is_generic_randaccess_stl_iterator(A.template reshaped<RowMajor>().begin()) );
+    VERIFY( is_generic_randaccess_stl_iterator(A.template reshaped<RowMajor>().end()) );
+  }
+
+  {
+    check_begin_end_for_loop(v);
+    check_begin_end_for_loop(A.col(internal::random<Index>(0,A.cols()-1)));
+    check_begin_end_for_loop(A.row(internal::random<Index>(0,A.rows()-1)));
+    check_begin_end_for_loop(v+v);
+  }
+
+#if EIGEN_HAS_CXX11
+  // check swappable
+  {
+    using std::swap;
+    // pointer-based
+    {
+      VectorType v_copy = v;
+      auto a = v.begin();
+      auto b = v.end()-1;
+      swap(a,b);
+      VERIFY_IS_EQUAL(v,v_copy);
+      VERIFY_IS_EQUAL(*b,*v.begin());
+      VERIFY_IS_EQUAL(*b,v(0));
+      VERIFY_IS_EQUAL(*a,v.end()[-1]);
+      VERIFY_IS_EQUAL(*a,v(last));
+    }
+
+    // generic
+    {
+      RowMatrixType B_copy = B;
+      auto Br = B.reshaped();
+      auto a = Br.begin();
+      auto b = Br.end()-1;
+      swap(a,b);
+      VERIFY_IS_EQUAL(B,B_copy);
+      VERIFY_IS_EQUAL(*b,*Br.begin());
+      VERIFY_IS_EQUAL(*b,Br(0));
+      VERIFY_IS_EQUAL(*a,Br.end()[-1]);
+      VERIFY_IS_EQUAL(*a,Br(last));
+    }
+  }
+
+  // check non-const iterator with for-range loops
+  {
+    i = 0;
+    for(auto x : v) { VERIFY_IS_EQUAL(x,v[i++]); }
+
+    j = internal::random<Index>(0,A.cols()-1);
+    i = 0;
+    for(auto x : A.col(j)) { VERIFY_IS_EQUAL(x,A(i++,j)); }
+
+    i = 0;
+    for(auto x : (v+A.col(j))) { VERIFY_IS_APPROX(x,v(i)+A(i,j)); ++i; }
+
+    j = 0;
+    i = internal::random<Index>(0,A.rows()-1);
+    for(auto x : A.row(i)) { VERIFY_IS_EQUAL(x,A(i,j++)); }
+
+    i = 0;
+    for(auto x : A.reshaped()) { VERIFY_IS_EQUAL(x,A(i++)); }
+  }
+
+  // same for const_iterator
+  {
+    i = 0;
+    for(auto x : cv) { VERIFY_IS_EQUAL(x,v[i++]); }
+
+    i = 0;
+    for(auto x : cA.reshaped()) { VERIFY_IS_EQUAL(x,A(i++)); }
+
+    j = 0;
+    i = internal::random<Index>(0,A.rows()-1);
+    for(auto x : cA.row(i)) { VERIFY_IS_EQUAL(x,A(i,j++)); }
+  }
+
+  // check reshaped() on row-major
+  {
+    i = 0;
+    Matrix<Scalar,Dynamic,Dynamic,ColMajor> Bc = B;
+    for(auto x : B.reshaped()) { VERIFY_IS_EQUAL(x,Bc(i++)); }
+  }
+
+  // check write access
+  {
+    VectorType w(v.size());
+    i = 0;
+    for(auto& x : w) { x = v(i++); }
+    VERIFY_IS_EQUAL(v,w);
+  }
+
+  // check for dangling pointers
+  {
+    // no dangling because pointer-based
+    {
+      j = internal::random<Index>(0,A.cols()-1);
+      auto it = A.col(j).begin();
+      for(i=0;i<rows;++i) {
+        VERIFY_IS_EQUAL(it[i],A(i,j));
+      }
+    }
+
+    // no dangling because pointer-based
+    {
+      i = internal::random<Index>(0,A.rows()-1);
+      auto it = A.row(i).begin();
+      for(j=0;j<cols;++j) { VERIFY_IS_EQUAL(it[j],A(i,j)); }
+    }
+
+    {
+      j = internal::random<Index>(0,A.cols()-1);
+      // this would produce a dangling pointer:
+      // auto it = (A+2*A).col(j).begin(); 
+      // we need to name the temporary expression:
+      auto tmp = (A+2*A).col(j);
+      auto it = tmp.begin();
+      for(i=0;i<rows;++i) {
+        VERIFY_IS_APPROX(it[i],3*A(i,j));
+      }
+    }
+  }
+
+  {
+    // check basic for loop on vector-wise iterators
+    j=0;
+    for (auto it = A.colwise().cbegin(); it != A.colwise().cend(); ++it, ++j) {
+      VERIFY_IS_APPROX( it->coeff(0), A(0,j) );
+      VERIFY_IS_APPROX( (*it).coeff(0), A(0,j) );
+    }
+    j=0;
+    for (auto it = A.colwise().begin(); it != A.colwise().end(); ++it, ++j) {
+      (*it).coeffRef(0) = (*it).coeff(0); // compilation check
+      it->coeffRef(0) = it->coeff(0);     // compilation check
+      VERIFY_IS_APPROX( it->coeff(0), A(0,j) );
+      VERIFY_IS_APPROX( (*it).coeff(0), A(0,j) );
+    }
+
+    // check valuetype gives us a copy
+    j=0;
+    for (auto it = A.colwise().cbegin(); it != A.colwise().cend(); ++it, ++j) {
+      typename decltype(it)::value_type tmp = *it;
+      VERIFY_IS_NOT_EQUAL( tmp.data() , it->data() );
+      VERIFY_IS_APPROX( tmp, A.col(j) );
+    }
+  }
+
+#endif
+
+  if(rows>=3) {
+    VERIFY_IS_EQUAL((v.begin()+rows/2)[1], v(rows/2+1));
+
+    VERIFY_IS_EQUAL((A.rowwise().begin()+rows/2)[1], A.row(rows/2+1));
+  }
+
+  if(cols>=3) {
+    VERIFY_IS_EQUAL((A.colwise().begin()+cols/2)[1], A.col(cols/2+1));
+  }
+
+  // check std::sort
+  {
+    // first check that is_sorted returns false when required
+    if(rows>=2)
+    {
+      v(1) = v(0)-Scalar(1);
+      #if EIGEN_HAS_CXX11
+      VERIFY(!is_sorted(std::begin(v),std::end(v)));
+      #else
+      VERIFY(!is_sorted(v.cbegin(),v.cend()));
+      #endif
+    }
+
+    // on a vector
+    {
+      std::sort(v.begin(),v.end());
+      VERIFY(is_sorted(v.begin(),v.end()));
+      VERIFY(!::is_sorted(make_reverse_iterator(v.end()),make_reverse_iterator(v.begin())));
+    }
+
+    // on a column of a column-major matrix -> pointer-based iterator and default increment
+    {
+      j = internal::random<Index>(0,A.cols()-1);
+      // std::sort(begin(A.col(j)),end(A.col(j))); // does not compile because this returns const iterators
+      typename ColMatrixType::ColXpr Acol = A.col(j);
+      std::sort(Acol.begin(),Acol.end());
+      VERIFY(is_sorted(Acol.cbegin(),Acol.cend()));
+      A.setRandom();
+
+      std::sort(A.col(j).begin(),A.col(j).end());
+      VERIFY(is_sorted(A.col(j).cbegin(),A.col(j).cend()));
+      A.setRandom();
+    }
+
+    // on a row of a rowmajor matrix -> pointer-based iterator and runtime increment
+    {
+      i = internal::random<Index>(0,A.rows()-1);
+      typename ColMatrixType::RowXpr Arow = A.row(i);
+      VERIFY_IS_EQUAL( std::distance(Arow.begin(),Arow.end()), cols);
+      std::sort(Arow.begin(),Arow.end());
+      VERIFY(is_sorted(Arow.cbegin(),Arow.cend()));
+      A.setRandom();
+
+      std::sort(A.row(i).begin(),A.row(i).end());
+      VERIFY(is_sorted(A.row(i).cbegin(),A.row(i).cend()));
+      A.setRandom();
+    }
+
+    // with a generic iterator
+    {
+      Reshaped<RowMatrixType,RowMatrixType::SizeAtCompileTime,1> B1 = B.reshaped();
+      std::sort(B1.begin(),B1.end());
+      VERIFY(is_sorted(B1.cbegin(),B1.cend()));
+      B.setRandom();
+
+      // assertion because nested expressions are different
+      // std::sort(B.reshaped().begin(),B.reshaped().end());
+      // VERIFY(is_sorted(B.reshaped().cbegin(),B.reshaped().cend()));
+      // B.setRandom();
+    }
+  }
+
+  // check with partial_sum
+  {
+    j = internal::random<Index>(0,A.cols()-1);
+    typename ColMatrixType::ColXpr Acol = A.col(j);
+    std::partial_sum(Acol.begin(), Acol.end(), v.begin());
+    VERIFY_IS_APPROX(v(seq(1,last)), v(seq(0,last-1))+Acol(seq(1,last)));
+
+    // inplace
+    std::partial_sum(Acol.begin(), Acol.end(), Acol.begin());
+    VERIFY_IS_APPROX(v, Acol);
+  }
+
+  // stress random access as required by std::nth_element
+  if(rows>=3)
+  {
+    v.setRandom();
+    VectorType v1 = v;
+    std::sort(v1.begin(),v1.end());
+    std::nth_element(v.begin(), v.begin()+rows/2, v.end());
+    VERIFY_IS_APPROX(v1(rows/2), v(rows/2));
+
+    v.setRandom();
+    v1 = v;
+    std::sort(v1.begin()+rows/2,v1.end());
+    std::nth_element(v.begin()+rows/2, v.begin()+rows/4, v.end());
+    VERIFY_IS_APPROX(v1(rows/4), v(rows/4));
+  }
+
+#if EIGEN_HAS_CXX11
+  // check rows/cols iterators with range-for loops
+  {
+    j = 0;
+    for(auto c : A.colwise()) { VERIFY_IS_APPROX(c.sum(), A.col(j).sum()); ++j; }
+    j = 0;
+    for(auto c : B.colwise()) { VERIFY_IS_APPROX(c.sum(), B.col(j).sum()); ++j; }
+
+    j = 0;
+    for(auto c : B.colwise()) {
+      i = 0;
+      for(auto& x : c) {
+        VERIFY_IS_EQUAL(x, B(i,j));
+        x = A(i,j);
+        ++i;
+      }
+      ++j;
+    }
+    VERIFY_IS_APPROX(A,B);
+    B.setRandom();
+    
+    i = 0;
+    for(auto r : A.rowwise()) { VERIFY_IS_APPROX(r.sum(), A.row(i).sum()); ++i; }
+    i = 0;
+    for(auto r : B.rowwise()) { VERIFY_IS_APPROX(r.sum(), B.row(i).sum()); ++i; }
+  }
+
+
+  // check rows/cols iterators with STL algorithms
+  {
+    RowVectorType row = RowVectorType::Random(cols);
+    A.rowwise() = row;
+    VERIFY( std::all_of(A.rowwise().begin(),  A.rowwise().end(),  [&row](typename ColMatrixType::RowXpr x) { return internal::isApprox(x.squaredNorm(),row.squaredNorm()); }) );
+    VERIFY( std::all_of(A.rowwise().rbegin(), A.rowwise().rend(), [&row](typename ColMatrixType::RowXpr x) { return internal::isApprox(x.squaredNorm(),row.squaredNorm()); }) );
+
+    VectorType col = VectorType::Random(rows);
+    A.colwise() = col;
+    VERIFY( std::all_of(A.colwise().begin(),   A.colwise().end(),   [&col](typename ColMatrixType::ColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+    VERIFY( std::all_of(A.colwise().rbegin(),  A.colwise().rend(),  [&col](typename ColMatrixType::ColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+    VERIFY( std::all_of(A.colwise().cbegin(),  A.colwise().cend(),  [&col](typename ColMatrixType::ConstColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+    VERIFY( std::all_of(A.colwise().crbegin(), A.colwise().crend(), [&col](typename ColMatrixType::ConstColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+
+    i = internal::random<Index>(0,A.rows()-1);
+    A.setRandom();
+    A.row(i).setZero();
+    VERIFY_IS_EQUAL( std::find_if(A.rowwise().begin(),  A.rowwise().end(),  [](typename ColMatrixType::RowXpr x) { return x.squaredNorm() == Scalar(0); })-A.rowwise().begin(),  i );
+    VERIFY_IS_EQUAL( std::find_if(A.rowwise().rbegin(), A.rowwise().rend(), [](typename ColMatrixType::RowXpr x) { return x.squaredNorm() == Scalar(0); })-A.rowwise().rbegin(), (A.rows()-1) - i );
+
+    j = internal::random<Index>(0,A.cols()-1);
+    A.setRandom();
+    A.col(j).setZero();
+    VERIFY_IS_EQUAL( std::find_if(A.colwise().begin(),  A.colwise().end(),  [](typename ColMatrixType::ColXpr x) { return x.squaredNorm() == Scalar(0); })-A.colwise().begin(),  j );
+    VERIFY_IS_EQUAL( std::find_if(A.colwise().rbegin(), A.colwise().rend(), [](typename ColMatrixType::ColXpr x) { return x.squaredNorm() == Scalar(0); })-A.colwise().rbegin(), (A.cols()-1) - j );
+  }
+
+  {
+    using VecOp = VectorwiseOp<ArrayXXi, 0>;
+    STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::declval<const VecOp&>().cbegin())>::value ));
+    STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::declval<const VecOp&>().cend  ())>::value ));
+    #if EIGEN_COMP_CXXVER>=14
+      STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::cbegin(std::declval<const VecOp&>()))>::value ));
+      STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::cend  (std::declval<const VecOp&>()))>::value ));
+    #endif
+  }
+
+#endif
+}
+
+
+#if EIGEN_HAS_CXX11
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+bool IsContainerType(int /* dummy */) { return true; }
+
+template <class C>
+bool IsContainerType(long /* dummy */) { return false; }
+
+template <typename Scalar, int Rows, int Cols>
+void test_stl_container_detection(int rows=Rows, int cols=Cols)
+{
+  typedef Matrix<Scalar,Rows,1> VectorType;
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMatrixType;
+
+  ColMatrixType A = ColMatrixType::Random(rows, cols);
+  RowMatrixType B = RowMatrixType::Random(rows, cols);
+
+  Index i = 1;
+
+  using ColMatrixColType = decltype(A.col(i));
+  using ColMatrixRowType = decltype(A.row(i));
+  using RowMatrixColType = decltype(B.col(i));
+  using RowMatrixRowType = decltype(B.row(i));
+
+  // Vector and matrix col/row are valid Stl-style container.
+  VERIFY_IS_EQUAL(IsContainerType<VectorType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<ColMatrixColType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<ColMatrixRowType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<RowMatrixColType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<RowMatrixRowType>(0), true);
+
+  // But the matrix itself is not a valid Stl-style container.
+  VERIFY_IS_EQUAL(IsContainerType<ColMatrixType>(0), rows == 1 || cols == 1);
+  VERIFY_IS_EQUAL(IsContainerType<RowMatrixType>(0), rows == 1 || cols == 1);
+}
+#endif
+
+EIGEN_DECLARE_TEST(stl_iterators)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( test_stl_iterators<double,2,3>() ));
+    CALL_SUBTEST_1(( test_stl_iterators<float,7,5>() ));
+    CALL_SUBTEST_1(( test_stl_iterators<int,Dynamic,Dynamic>(internal::random<int>(5,10), internal::random<int>(5,10)) ));
+    CALL_SUBTEST_1(( test_stl_iterators<int,Dynamic,Dynamic>(internal::random<int>(10,200), internal::random<int>(10,200)) ));
+  }
+  
+#if EIGEN_HAS_CXX11
+  CALL_SUBTEST_1(( test_stl_container_detection<float,1,1>() ));
+  CALL_SUBTEST_1(( test_stl_container_detection<float,5,5>() ));
+#endif  
+}

diff --git a/test/superlu_support.cpp b/test/superlu_support.cpp
new file mode 100644
index 0000000..55450c8
--- /dev/null
+++ b/test/superlu_support.cpp

@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#include <Eigen/SuperLUSupport>
+
+EIGEN_DECLARE_TEST(superlu_support)
+{
+  SuperLU<SparseMatrix<double> > superlu_double_colmajor;
+  SuperLU<SparseMatrix<std::complex<double> > > superlu_cplxdouble_colmajor;
+  CALL_SUBTEST_1( check_sparse_square_solving(superlu_double_colmajor)      );
+  CALL_SUBTEST_2( check_sparse_square_solving(superlu_cplxdouble_colmajor)  );
+  CALL_SUBTEST_1( check_sparse_square_determinant(superlu_double_colmajor)      );
+  CALL_SUBTEST_2( check_sparse_square_determinant(superlu_cplxdouble_colmajor)  );
+}

diff --git a/test/svd_common.h b/test/svd_common.h
new file mode 100644
index 0000000..eae4c0b
--- /dev/null
+++ b/test/svd_common.h

@@ -0,0 +1,521 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef SVD_DEFAULT
+#error a macro SVD_DEFAULT(MatrixType) must be defined prior to including svd_common.h
+#endif
+
+#ifndef SVD_FOR_MIN_NORM
+#error a macro SVD_FOR_MIN_NORM(MatrixType) must be defined prior to including svd_common.h
+#endif
+
+#include "svd_fill.h"
+#include "solverbase.h"
+
+// Check that the matrix m is properly reconstructed and that the U and V factors are unitary
+// The SVD must have already been computed.
+template<typename SvdType, typename MatrixType>
+void svd_check_full(const MatrixType& m, const SvdType& svd)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime> MatrixUType;
+  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime> MatrixVType;
+
+  MatrixType sigma = MatrixType::Zero(rows,cols);
+  sigma.diagonal() = svd.singularValues().template cast<Scalar>();
+  MatrixUType u = svd.matrixU();
+  MatrixVType v = svd.matrixV();
+  RealScalar scaling = m.cwiseAbs().maxCoeff();
+  if(scaling<(std::numeric_limits<RealScalar>::min)())
+  {
+    VERIFY(sigma.cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+  }
+  else
+  {
+    VERIFY_IS_APPROX(m/scaling, u * (sigma/scaling) * v.adjoint());
+  }
+  VERIFY_IS_UNITARY(u);
+  VERIFY_IS_UNITARY(v);
+}
+
+// Compare partial SVD defined by computationOptions to a full SVD referenceSvd
+template<typename SvdType, typename MatrixType>
+void svd_compare_to_full(const MatrixType& m,
+                         unsigned int computationOptions,
+                         const SvdType& referenceSvd)
+{
+  typedef typename MatrixType::RealScalar RealScalar;
+  Index rows = m.rows();
+  Index cols = m.cols();
+  Index diagSize = (std::min)(rows, cols);
+  RealScalar prec = test_precision<RealScalar>();
+
+  SvdType svd(m, computationOptions);
+
+  VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues());
+  
+  if(computationOptions & (ComputeFullV|ComputeThinV))
+  {
+    VERIFY( (svd.matrixV().adjoint()*svd.matrixV()).isIdentity(prec) );
+    VERIFY_IS_APPROX( svd.matrixV().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint(),
+                      referenceSvd.matrixV().leftCols(diagSize) * referenceSvd.singularValues().asDiagonal() * referenceSvd.matrixV().leftCols(diagSize).adjoint());
+  }
+  
+  if(computationOptions & (ComputeFullU|ComputeThinU))
+  {
+    VERIFY( (svd.matrixU().adjoint()*svd.matrixU()).isIdentity(prec) );
+    VERIFY_IS_APPROX( svd.matrixU().leftCols(diagSize) * svd.singularValues().cwiseAbs2().asDiagonal() * svd.matrixU().leftCols(diagSize).adjoint(),
+                      referenceSvd.matrixU().leftCols(diagSize) * referenceSvd.singularValues().cwiseAbs2().asDiagonal() * referenceSvd.matrixU().leftCols(diagSize).adjoint());
+  }
+  
+  // The following checks are not critical.
+  // For instance, with Dived&Conquer SVD, if only the factor 'V' is computedt then different matrix-matrix product implementation will be used
+  // and the resulting 'V' factor might be significantly different when the SVD decomposition is not unique, especially with single precision float.
+  ++g_test_level;
+  if(computationOptions & ComputeFullU)  VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU());
+  if(computationOptions & ComputeThinU)  VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize));
+  if(computationOptions & ComputeFullV)  VERIFY_IS_APPROX(svd.matrixV().cwiseAbs(), referenceSvd.matrixV().cwiseAbs());
+  if(computationOptions & ComputeThinV)  VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize));
+  --g_test_level;
+}
+
+//
+template<typename SvdType, typename MatrixType>
+void svd_least_square(const MatrixType& m, unsigned int computationOptions)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, RowsAtCompileTime, Dynamic> RhsType;
+  typedef Matrix<Scalar, ColsAtCompileTime, Dynamic> SolutionType;
+
+  RhsType rhs = RhsType::Random(rows, internal::random<Index>(1, cols));
+  SvdType svd(m, computationOptions);
+
+       if(internal::is_same<RealScalar,double>::value) svd.setThreshold(1e-8);
+  else if(internal::is_same<RealScalar,float>::value)  svd.setThreshold(2e-4);
+
+  SolutionType x = svd.solve(rhs);
+   
+  RealScalar residual = (m*x-rhs).norm();
+  RealScalar rhs_norm = rhs.norm();
+  if(!test_isMuchSmallerThan(residual,rhs.norm()))
+  {
+    // ^^^ If the residual is very small, then we have an exact solution, so we are already good.
+    
+    // evaluate normal equation which works also for least-squares solutions
+    if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size())
+    {
+      using std::sqrt;
+      // This test is not stable with single precision.
+      // This is probably because squaring m signicantly affects the precision.      
+      if(internal::is_same<RealScalar,float>::value) ++g_test_level;
+      
+      VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs);
+      
+      if(internal::is_same<RealScalar,float>::value) --g_test_level;
+    }
+    
+    // Check that there is no significantly better solution in the neighborhood of x
+    for(Index k=0;k<x.rows();++k)
+    {
+      using std::abs;
+      
+      SolutionType y(x);
+      y.row(k) = (RealScalar(1)+2*NumTraits<RealScalar>::epsilon())*x.row(k);
+      RealScalar residual_y = (m*y-rhs).norm();
+      VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) ++g_test_level;
+      VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) --g_test_level;
+      
+      y.row(k) = (RealScalar(1)-2*NumTraits<RealScalar>::epsilon())*x.row(k);
+      residual_y = (m*y-rhs).norm();
+      VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) ++g_test_level;
+      VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
+      if(internal::is_same<RealScalar,float>::value) --g_test_level;
+    }
+  }
+}
+
+// check minimal norm solutions, the inoput matrix m is only used to recover problem size
+template<typename MatrixType>
+void svd_min_norm(const MatrixType& m, unsigned int computationOptions)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  Index cols = m.cols();
+
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, ColsAtCompileTime, Dynamic> SolutionType;
+
+  // generate a full-rank m x n problem with m<n
+  enum {
+    RankAtCompileTime2 = ColsAtCompileTime==Dynamic ? Dynamic : (ColsAtCompileTime)/2+1,
+    RowsAtCompileTime3 = ColsAtCompileTime==Dynamic ? Dynamic : ColsAtCompileTime+1
+  };
+  typedef Matrix<Scalar, RankAtCompileTime2, ColsAtCompileTime> MatrixType2;
+  typedef Matrix<Scalar, RankAtCompileTime2, 1> RhsType2;
+  typedef Matrix<Scalar, ColsAtCompileTime, RankAtCompileTime2> MatrixType2T;
+  Index rank = RankAtCompileTime2==Dynamic ? internal::random<Index>(1,cols) : Index(RankAtCompileTime2);
+  MatrixType2 m2(rank,cols);
+  int guard = 0;
+  do {
+    m2.setRandom();
+  } while(SVD_FOR_MIN_NORM(MatrixType2)(m2).setThreshold(test_precision<Scalar>()).rank()!=rank && (++guard)<10);
+  VERIFY(guard<10);
+
+  RhsType2 rhs2 = RhsType2::Random(rank);
+  // use QR to find a reference minimal norm solution
+  HouseholderQR<MatrixType2T> qr(m2.adjoint());
+  Matrix<Scalar,Dynamic,1> tmp = qr.matrixQR().topLeftCorner(rank,rank).template triangularView<Upper>().adjoint().solve(rhs2);
+  tmp.conservativeResize(cols);
+  tmp.tail(cols-rank).setZero();
+  SolutionType x21 = qr.householderQ() * tmp;
+  // now check with SVD
+  SVD_FOR_MIN_NORM(MatrixType2) svd2(m2, computationOptions);
+  SolutionType x22 = svd2.solve(rhs2);
+  VERIFY_IS_APPROX(m2*x21, rhs2);
+  VERIFY_IS_APPROX(m2*x22, rhs2);
+  VERIFY_IS_APPROX(x21, x22);
+
+  // Now check with a rank deficient matrix
+  typedef Matrix<Scalar, RowsAtCompileTime3, ColsAtCompileTime> MatrixType3;
+  typedef Matrix<Scalar, RowsAtCompileTime3, 1> RhsType3;
+  Index rows3 = RowsAtCompileTime3==Dynamic ? internal::random<Index>(rank+1,2*cols) : Index(RowsAtCompileTime3);
+  Matrix<Scalar,RowsAtCompileTime3,Dynamic> C = Matrix<Scalar,RowsAtCompileTime3,Dynamic>::Random(rows3,rank);
+  MatrixType3 m3 = C * m2;
+  RhsType3 rhs3 = C * rhs2;
+  SVD_FOR_MIN_NORM(MatrixType3) svd3(m3, computationOptions);
+  SolutionType x3 = svd3.solve(rhs3);
+  VERIFY_IS_APPROX(m3*x3, rhs3);
+  VERIFY_IS_APPROX(m3*x21, rhs3);
+  VERIFY_IS_APPROX(m2*x3, rhs2);
+  VERIFY_IS_APPROX(x21, x3);
+}
+
+template<typename MatrixType, typename SolverType>
+void svd_test_solvers(const MatrixType& m, const SolverType& solver) {
+    Index rows, cols, cols2;
+
+    rows = m.rows();
+    cols = m.cols();
+
+    if(MatrixType::ColsAtCompileTime==Dynamic)
+    {
+      cols2 = internal::random<int>(2,EIGEN_TEST_MAX_SIZE);
+    }
+    else
+    {
+      cols2 = cols;
+    }
+    typedef Matrix<typename MatrixType::Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> CMatrixType;
+    check_solverbase<CMatrixType, MatrixType>(m, solver, rows, cols, cols2);
+}
+
+// Check full, compare_to_full, least_square, and min_norm for all possible compute-options
+template<typename SvdType, typename MatrixType>
+void svd_test_all_computation_options(const MatrixType& m, bool full_only)
+{
+//   if (QRPreconditioner == NoQRPreconditioner && m.rows() != m.cols())
+//     return;
+  STATIC_CHECK(( internal::is_same<typename SvdType::StorageIndex,int>::value ));
+
+  SvdType fullSvd(m, ComputeFullU|ComputeFullV);
+  CALL_SUBTEST(( svd_check_full(m, fullSvd) ));
+  CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeFullU | ComputeFullV) ));
+  CALL_SUBTEST(( svd_min_norm(m, ComputeFullU | ComputeFullV) ));
+  
+  #if defined __INTEL_COMPILER
+  // remark #111: statement is unreachable
+  #pragma warning disable 111
+  #endif
+
+  svd_test_solvers(m, fullSvd);
+
+  if(full_only)
+    return;
+
+  CALL_SUBTEST(( svd_compare_to_full(m, ComputeFullU, fullSvd) ));
+  CALL_SUBTEST(( svd_compare_to_full(m, ComputeFullV, fullSvd) ));
+  CALL_SUBTEST(( svd_compare_to_full(m, 0, fullSvd) ));
+
+  if (MatrixType::ColsAtCompileTime == Dynamic) {
+    // thin U/V are only available with dynamic number of columns
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeFullU|ComputeThinV, fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m,              ComputeThinV, fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeThinU|ComputeFullV, fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeThinU             , fullSvd) ));
+    CALL_SUBTEST(( svd_compare_to_full(m, ComputeThinU|ComputeThinV, fullSvd) ));
+    
+    CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeFullU | ComputeThinV) ));
+    CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeThinU | ComputeFullV) ));
+    CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeThinU | ComputeThinV) ));
+
+    CALL_SUBTEST(( svd_min_norm(m, ComputeFullU | ComputeThinV) ));
+    CALL_SUBTEST(( svd_min_norm(m, ComputeThinU | ComputeFullV) ));
+    CALL_SUBTEST(( svd_min_norm(m, ComputeThinU | ComputeThinV) ));
+
+    // test reconstruction
+    Index diagSize = (std::min)(m.rows(), m.cols());
+    SvdType svd(m, ComputeThinU | ComputeThinV);
+    VERIFY_IS_APPROX(m, svd.matrixU().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint());
+  }
+}
+
+
+// work around stupid msvc error when constructing at compile time an expression that involves
+// a division by zero, even if the numeric type has floating point
+template<typename Scalar>
+EIGEN_DONT_INLINE Scalar zero() { return Scalar(0); }
+
+// workaround aggressive optimization in ICC
+template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
+
+// This function verifies we don't iterate infinitely on nan/inf values,
+// and that info() returns InvalidInput.
+template<typename SvdType, typename MatrixType>
+void svd_inf_nan()
+{
+  SvdType svd;
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar some_inf = Scalar(1) / zero<Scalar>();
+  VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
+  svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  VERIFY(nan != nan);
+  svd.compute(MatrixType::Constant(10,10,nan), ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);  
+
+  MatrixType m = MatrixType::Zero(10,10);
+  m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
+
+  m = MatrixType::Zero(10,10);
+  m(internal::random<int>(0,9), internal::random<int>(0,9)) = nan;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
+  
+  // regression test for bug 791
+  m.resize(3,3);
+  m << 0,    2*NumTraits<Scalar>::epsilon(),  0.5,
+       0,   -0.5,                             0,
+       nan,  0,                               0;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
+  
+  m.resize(4,4);
+  m <<  1, 0, 0, 0,
+        0, 3, 1, 2e-308,
+        1, 0, 1, nan,
+        0, nan, nan, 0;
+  svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
+}
+
+// Regression test for bug 286: JacobiSVD loops indefinitely with some
+// matrices containing denormal numbers.
+template<typename>
+void svd_underoverflow()
+{
+#if defined __INTEL_COMPILER
+// shut up warning #239: floating point underflow
+#pragma warning push
+#pragma warning disable 239
+#endif
+  Matrix2d M;
+  M << -7.90884e-313, -4.94e-324,
+                 0, 5.60844e-313;
+  SVD_DEFAULT(Matrix2d) svd;
+  svd.compute(M,ComputeFullU|ComputeFullV);
+  CALL_SUBTEST( svd_check_full(M,svd) );
+  
+  // Check all 2x2 matrices made with the following coefficients:
+  VectorXd value_set(9);
+  value_set << 0, 1, -1, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -4.94e-223, 4.94e-223;
+  Array4i id(0,0,0,0);
+  int k = 0;
+  do
+  {
+    M << value_set(id(0)), value_set(id(1)), value_set(id(2)), value_set(id(3));
+    svd.compute(M,ComputeFullU|ComputeFullV);
+    CALL_SUBTEST( svd_check_full(M,svd) );
+
+    id(k)++;
+    if(id(k)>=value_set.size())
+    {
+      while(k<3 && id(k)>=value_set.size()) id(++k)++;
+      id.head(k).setZero();
+      k=0;
+    }
+
+  } while((id<int(value_set.size())).all());
+  
+#if defined __INTEL_COMPILER
+#pragma warning pop
+#endif
+  
+  // Check for overflow:
+  Matrix3d M3;
+  M3 << 4.4331978442502944e+307, -5.8585363752028680e+307,  6.4527017443412964e+307,
+        3.7841695601406358e+307,  2.4331702789740617e+306, -3.5235707140272905e+307,
+       -8.7190887618028355e+307, -7.3453213709232193e+307, -2.4367363684472105e+307;
+
+  SVD_DEFAULT(Matrix3d) svd3;
+  svd3.compute(M3,ComputeFullU|ComputeFullV); // just check we don't loop indefinitely
+  CALL_SUBTEST( svd_check_full(M3,svd3) );
+}
+
+// void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
+
+template<typename MatrixType>
+void svd_all_trivial_2x2( void (*cb)(const MatrixType&,bool) )
+{
+  MatrixType M;
+  VectorXd value_set(3);
+  value_set << 0, 1, -1;
+  Array4i id(0,0,0,0);
+  int k = 0;
+  do
+  {
+    M << value_set(id(0)), value_set(id(1)), value_set(id(2)), value_set(id(3));
+    
+    cb(M,false);
+    
+    id(k)++;
+    if(id(k)>=value_set.size())
+    {
+      while(k<3 && id(k)>=value_set.size()) id(++k)++;
+      id.head(k).setZero();
+      k=0;
+    }
+    
+  } while((id<int(value_set.size())).all());
+}
+
+template<typename>
+void svd_preallocate()
+{
+  Vector3f v(3.f, 2.f, 1.f);
+  MatrixXf m = v.asDiagonal();
+
+  internal::set_is_malloc_allowed(false);
+  VERIFY_RAISES_ASSERT(VectorXf tmp(10);)
+  SVD_DEFAULT(MatrixXf) svd;
+  internal::set_is_malloc_allowed(true);
+  svd.compute(m);
+  VERIFY_IS_APPROX(svd.singularValues(), v);
+
+  SVD_DEFAULT(MatrixXf) svd2(3,3);
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m);
+  internal::set_is_malloc_allowed(true);
+  VERIFY_IS_APPROX(svd2.singularValues(), v);
+  VERIFY_RAISES_ASSERT(svd2.matrixU());
+  VERIFY_RAISES_ASSERT(svd2.matrixV());
+  svd2.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
+  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m);
+  internal::set_is_malloc_allowed(true);
+
+  SVD_DEFAULT(MatrixXf) svd3(3,3,ComputeFullU|ComputeFullV);
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m);
+  internal::set_is_malloc_allowed(true);
+  VERIFY_IS_APPROX(svd2.singularValues(), v);
+  VERIFY_IS_APPROX(svd2.matrixU(), Matrix3f::Identity());
+  VERIFY_IS_APPROX(svd2.matrixV(), Matrix3f::Identity());
+  internal::set_is_malloc_allowed(false);
+  svd2.compute(m, ComputeFullU|ComputeFullV);
+  internal::set_is_malloc_allowed(true);
+}
+
+template<typename SvdType,typename MatrixType> 
+void svd_verify_assert(const MatrixType& m, bool fullOnly = false)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, RowsAtCompileTime, 1> RhsType;
+  RhsType rhs(rows);
+  SvdType svd;
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.singularValues())
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  VERIFY_RAISES_ASSERT(svd.transpose().solve(rhs))
+  VERIFY_RAISES_ASSERT(svd.adjoint().solve(rhs))
+  MatrixType a = MatrixType::Zero(rows, cols);
+  a.setZero();
+  svd.compute(a, 0);
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  svd.singularValues();
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+
+  svd.compute(a, ComputeFullU);
+  svd.matrixU();
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  svd.compute(a, ComputeFullV);
+  svd.matrixV();
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+
+  if (!fullOnly && ColsAtCompileTime == Dynamic)
+  {
+    svd.compute(a, ComputeThinU);
+    svd.matrixU();
+    VERIFY_RAISES_ASSERT(svd.matrixV())
+    VERIFY_RAISES_ASSERT(svd.solve(rhs))
+    svd.compute(a, ComputeThinV);
+    svd.matrixV();
+    VERIFY_RAISES_ASSERT(svd.matrixU())
+    VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  }
+  else
+  {
+    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinU))
+    VERIFY_RAISES_ASSERT(svd.compute(a, ComputeThinV))
+  }
+}
+
+#undef SVD_DEFAULT
+#undef SVD_FOR_MIN_NORM

diff --git a/test/svd_fill.h b/test/svd_fill.h
new file mode 100644
index 0000000..d68647e
--- /dev/null
+++ b/test/svd_fill.h

@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+template<typename T>
+Array<T,4,1> four_denorms();
+
+template<>
+Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
+template<>
+Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
+
+template<typename MatrixType>
+void svd_fill_random(MatrixType &m, int Option = 0)
+{
+  using std::pow;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  Index diagSize = (std::min)(m.rows(), m.cols());
+  RealScalar s = std::numeric_limits<RealScalar>::max_exponent10/4;
+  s = internal::random<RealScalar>(1,s);
+  Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(diagSize);
+  for(Index k=0; k<diagSize; ++k)
+    d(k) = d(k)*pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+
+  bool dup     = internal::random<int>(0,10) < 3;
+  bool unit_uv = internal::random<int>(0,10) < (dup?7:3); // if we duplicate some diagonal entries, then increase the chance to preserve them using unitary U and V factors
+  
+  // duplicate some singular values
+  if(dup)
+  {
+    Index n = internal::random<Index>(0,d.size()-1);
+    for(Index i=0; i<n; ++i)
+      d(internal::random<Index>(0,d.size()-1)) = d(internal::random<Index>(0,d.size()-1));
+  }
+  
+  Matrix<Scalar,Dynamic,Dynamic> U(m.rows(),diagSize);
+  Matrix<Scalar,Dynamic,Dynamic> VT(diagSize,m.cols());
+  if(unit_uv)
+  {
+    // in very rare cases let's try with a pure diagonal matrix
+    if(internal::random<int>(0,10) < 1)
+    {
+      U.setIdentity();
+      VT.setIdentity();
+    }
+    else
+    {
+      createRandomPIMatrixOfRank(diagSize,U.rows(), U.cols(), U);
+      createRandomPIMatrixOfRank(diagSize,VT.rows(), VT.cols(), VT);
+    }
+  }
+  else
+  {
+    U.setRandom();
+    VT.setRandom();
+  }
+  
+  Matrix<Scalar,Dynamic,1> samples(9);
+  samples << 0, four_denorms<RealScalar>(),
+            -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
+  
+  if(Option==Symmetric)
+  {
+    m = U * d.asDiagonal() * U.transpose();
+    
+    // randomly nullify some rows/columns
+    {
+      Index count = internal::random<Index>(-diagSize,diagSize);
+      for(Index k=0; k<count; ++k)
+      {
+        Index i = internal::random<Index>(0,diagSize-1);
+        m.row(i).setZero();
+        m.col(i).setZero();
+      }
+      if(count<0)
+      // (partly) cancel some coeffs
+      if(!(dup && unit_uv))
+      {
+        
+        Index n = internal::random<Index>(0,m.size()-1);
+        for(Index k=0; k<n; ++k)
+        {
+          Index i = internal::random<Index>(0,m.rows()-1);
+          Index j = internal::random<Index>(0,m.cols()-1);
+          m(j,i) = m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+          if(NumTraits<Scalar>::IsComplex)
+            *(&numext::real_ref(m(j,i))+1) = *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
+        }
+      }
+    }
+  }
+  else
+  {
+    m = U * d.asDiagonal() * VT;
+    // (partly) cancel some coeffs
+    if(!(dup && unit_uv))
+    {
+      Index n = internal::random<Index>(0,m.size()-1);
+      for(Index k=0; k<n; ++k)
+      {
+        Index i = internal::random<Index>(0,m.rows()-1);
+        Index j = internal::random<Index>(0,m.cols()-1);
+        m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+        if(NumTraits<Scalar>::IsComplex)
+          *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
+      }
+    }
+  }
+}
+

diff --git a/test/swap.cpp b/test/swap.cpp
new file mode 100644
index 0000000..5b259d3
--- /dev/null
+++ b/test/swap.cpp

@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+#include "main.h"
+
+template<typename T>
+struct other_matrix_type
+{
+  typedef int type;
+};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct other_matrix_type<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef Matrix<_Scalar, _Rows, _Cols, _Options^RowMajor, _MaxRows, _MaxCols> type;
+};
+
+template<typename MatrixType> void swap(const MatrixType& m)
+{
+  typedef typename other_matrix_type<MatrixType>::type OtherMatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+
+  eigen_assert((!internal::is_same<MatrixType,OtherMatrixType>::value));
+  Index rows = m.rows();
+  Index cols = m.cols();
+  
+  // construct 3 matrix guaranteed to be distinct
+  MatrixType m1 = MatrixType::Random(rows,cols);
+  MatrixType m2 = MatrixType::Random(rows,cols) + Scalar(100) * MatrixType::Identity(rows,cols);
+  OtherMatrixType m3 = OtherMatrixType::Random(rows,cols) + Scalar(200) * OtherMatrixType::Identity(rows,cols);
+  
+  MatrixType m1_copy = m1;
+  MatrixType m2_copy = m2;
+  OtherMatrixType m3_copy = m3;
+  
+  // test swapping 2 matrices of same type
+  Scalar *d1=m1.data(), *d2=m2.data();
+  m1.swap(m2);
+  VERIFY_IS_APPROX(m1,m2_copy);
+  VERIFY_IS_APPROX(m2,m1_copy);
+  if(MatrixType::SizeAtCompileTime==Dynamic)
+  {
+    VERIFY(m1.data()==d2);
+    VERIFY(m2.data()==d1);
+  }
+  m1 = m1_copy;
+  m2 = m2_copy;
+  
+  // test swapping 2 matrices of different types
+  m1.swap(m3);
+  VERIFY_IS_APPROX(m1,m3_copy);
+  VERIFY_IS_APPROX(m3,m1_copy);
+  m1 = m1_copy;
+  m3 = m3_copy;
+  
+  // test swapping matrix with expression
+  m1.swap(m2.block(0,0,rows,cols));
+  VERIFY_IS_APPROX(m1,m2_copy);
+  VERIFY_IS_APPROX(m2,m1_copy);
+  m1 = m1_copy;
+  m2 = m2_copy;
+
+  // test swapping two expressions of different types
+  m1.transpose().swap(m3.transpose());
+  VERIFY_IS_APPROX(m1,m3_copy);
+  VERIFY_IS_APPROX(m3,m1_copy);
+  m1 = m1_copy;
+  m3 = m3_copy;
+  
+  if(m1.rows()>1)
+  {
+    // test assertion on mismatching size -- matrix case
+    VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
+    // test assertion on mismatching size -- xpr case
+    VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
+  }
+}
+
+EIGEN_DECLARE_TEST(swap)
+{
+  int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+  CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization 
+  CALL_SUBTEST_2( swap(Matrix4d()) ); // fixed size, possible vectorization 
+  CALL_SUBTEST_3( swap(MatrixXd(s,s)) ); // dyn size, no vectorization 
+  CALL_SUBTEST_4( swap(MatrixXf(s,s)) ); // dyn size, possible vectorization 
+  TEST_SET_BUT_UNUSED_VARIABLE(s)
+}

diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp
new file mode 100644
index 0000000..a75ca11
--- /dev/null
+++ b/test/symbolic_index.cpp

@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_2
+#define EIGEN_MAX_CPP_VER 03
+
+// see indexed_view.cpp
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+  #pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+
+#endif
+
+#include "main.h"
+
+template<typename T1,typename T2>
+bool is_same_symb(const T1& a, const T2& b, Index size)
+{
+  return a.eval(last=size-1) == b.eval(last=size-1);
+}
+
+template<typename T>
+void check_is_symbolic(const T&) {
+  STATIC_CHECK(( symbolic::is_symbolic<T>::value ))
+}
+
+template<typename T>
+void check_isnot_symbolic(const T&) {
+  STATIC_CHECK(( !symbolic::is_symbolic<T>::value ))
+}
+
+#define VERIFY_EQ_INT(A,B) VERIFY_IS_APPROX(int(A),int(B))
+
+void check_symbolic_index()
+{
+  check_is_symbolic(last);
+  check_is_symbolic(lastp1);
+  check_is_symbolic(last+1);
+  check_is_symbolic(last-lastp1);
+  check_is_symbolic(2*last-lastp1/2);
+  check_isnot_symbolic(fix<3>());
+
+  Index size=100;
+
+  // First, let's check FixedInt arithmetic:
+  VERIFY( is_same_type( (fix<5>()-fix<3>())*fix<9>()/(-fix<3>()), fix<-(5-3)*9/3>() ) );
+  VERIFY( is_same_type( (fix<5>()-fix<3>())*fix<9>()/fix<2>(), fix<(5-3)*9/2>() ) );
+  VERIFY( is_same_type( fix<9>()/fix<2>(), fix<9/2>() ) );
+  VERIFY( is_same_type( fix<9>()%fix<2>(), fix<9%2>() ) );
+  VERIFY( is_same_type( fix<9>()&fix<2>(), fix<9&2>() ) );
+  VERIFY( is_same_type( fix<9>()|fix<2>(), fix<9|2>() ) );
+  VERIFY( is_same_type( fix<9>()/2, int(9/2) ) );
+
+  VERIFY( is_same_symb( lastp1-1, last, size) );
+  VERIFY( is_same_symb( lastp1-fix<1>(), last, size) );
+
+  VERIFY_IS_EQUAL( ( (last*5-2)/3 ).eval(last=size-1), ((size-1)*5-2)/3 );
+  VERIFY_IS_EQUAL( ( (last*fix<5>()-fix<2>())/fix<3>() ).eval(last=size-1), ((size-1)*5-2)/3 );
+  VERIFY_IS_EQUAL( ( -last*lastp1  ).eval(last=size-1), -(size-1)*size );
+  VERIFY_IS_EQUAL( ( lastp1-3*last  ).eval(last=size-1), size- 3*(size-1) );
+  VERIFY_IS_EQUAL( ( (lastp1-3*last)/lastp1  ).eval(last=size-1), (size- 3*(size-1))/size );
+
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+  {
+    struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
+    struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
+    struct z_tag {};  static const symbolic::SymbolExpr<z_tag> z;
+
+    VERIFY_IS_APPROX( int(((x+3)/y+z).eval(x=6,y=3,z=-13)), (6+3)/3+(-13) );
+  }
+#endif
+}
+
+EIGEN_DECLARE_TEST(symbolic_index)
+{
+  CALL_SUBTEST_1( check_symbolic_index() );
+  CALL_SUBTEST_2( check_symbolic_index() );
+}

diff --git a/test/triangular.cpp b/test/triangular.cpp
new file mode 100644
index 0000000..981a0d0
--- /dev/null
+++ b/test/triangular.cpp

@@ -0,0 +1,292 @@
+// This file is triangularView of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_100
+#  define EIGEN_NO_DEPRECATED_WARNING
+#endif
+
+#include "main.h"
+
+
+template<typename MatrixType> void triangular_deprecated(const MatrixType &m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType m1, m2, m3, m4;
+  m1.setRandom(rows,cols);
+  m2.setRandom(rows,cols);
+  m3 = m1; m4 = m2;
+  // deprecated method:
+  m1.template triangularView<Eigen::Upper>().swap(m2);
+  // use this method instead:
+  m3.template triangularView<Eigen::Upper>().swap(m4.template triangularView<Eigen::Upper>());
+  VERIFY_IS_APPROX(m1,m3);
+  VERIFY_IS_APPROX(m2,m4);
+  // deprecated method:
+  m1.template triangularView<Eigen::Lower>().swap(m4);
+  // use this method instead:
+  m3.template triangularView<Eigen::Lower>().swap(m2.template triangularView<Eigen::Lower>());
+  VERIFY_IS_APPROX(m1,m3);
+  VERIFY_IS_APPROX(m2,m4);
+}
+
+
+template<typename MatrixType> void triangular_square(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  RealScalar largerEps = 10*test_precision<RealScalar>();
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             m4(rows, cols),
+             r1(rows, cols),
+             r2(rows, cols);
+  VectorType v2 = VectorType::Random(rows);
+
+  MatrixType m1up = m1.template triangularView<Upper>();
+  MatrixType m2up = m2.template triangularView<Upper>();
+
+  if (rows*cols>1)
+  {
+    VERIFY(m1up.isUpperTriangular());
+    VERIFY(m2up.transpose().isLowerTriangular());
+    VERIFY(!m2.isLowerTriangular());
+  }
+
+//   VERIFY_IS_APPROX(m1up.transpose() * m2, m1.upper().transpose().lower() * m2);
+
+  // test overloaded operator+=
+  r1.setZero();
+  r2.setZero();
+  r1.template triangularView<Upper>() +=  m1;
+  r2 += m1up;
+  VERIFY_IS_APPROX(r1,r2);
+
+  // test overloaded operator=
+  m1.setZero();
+  m1.template triangularView<Upper>() = m2.transpose() + m2;
+  m3 = m2.transpose() + m2;
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().transpose().toDenseMatrix(), m1);
+
+  // test overloaded operator=
+  m1.setZero();
+  m1.template triangularView<Lower>() = m2.transpose() + m2;
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().toDenseMatrix(), m1);
+
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().conjugate().toDenseMatrix(),
+                   m3.conjugate().template triangularView<Lower>().toDenseMatrix());
+
+  m1 = MatrixType::Random(rows, cols);
+  for (int i=0; i<rows; ++i)
+    while (numext::abs2(m1(i,i))<RealScalar(1e-1)) m1(i,i) = internal::random<Scalar>();
+
+  Transpose<MatrixType> trm4(m4);
+  // test back and forward substitution with a vector as the rhs
+  m3 = m1.template triangularView<Upper>();
+  VERIFY(v2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView<Lower>().solve(v2)), largerEps));
+  m3 = m1.template triangularView<Lower>();
+  VERIFY(v2.isApprox(m3.transpose() * (m1.transpose().template triangularView<Upper>().solve(v2)), largerEps));
+  m3 = m1.template triangularView<Upper>();
+  VERIFY(v2.isApprox(m3 * (m1.template triangularView<Upper>().solve(v2)), largerEps));
+  m3 = m1.template triangularView<Lower>();
+  VERIFY(v2.isApprox(m3.conjugate() * (m1.conjugate().template triangularView<Lower>().solve(v2)), largerEps));
+
+  // test back and forward substitution with a matrix as the rhs
+  m3 = m1.template triangularView<Upper>();
+  VERIFY(m2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView<Lower>().solve(m2)), largerEps));
+  m3 = m1.template triangularView<Lower>();
+  VERIFY(m2.isApprox(m3.transpose() * (m1.transpose().template triangularView<Upper>().solve(m2)), largerEps));
+  m3 = m1.template triangularView<Upper>();
+  VERIFY(m2.isApprox(m3 * (m1.template triangularView<Upper>().solve(m2)), largerEps));
+  m3 = m1.template triangularView<Lower>();
+  VERIFY(m2.isApprox(m3.conjugate() * (m1.conjugate().template triangularView<Lower>().solve(m2)), largerEps));
+
+  // check M * inv(L) using in place API
+  m4 = m3;
+  m1.transpose().template triangularView<Eigen::Upper>().solveInPlace(trm4);
+  VERIFY_IS_APPROX(m4 * m1.template triangularView<Eigen::Lower>(), m3);
+
+  // check M * inv(U) using in place API
+  m3 = m1.template triangularView<Upper>();
+  m4 = m3;
+  m3.transpose().template triangularView<Eigen::Lower>().solveInPlace(trm4);
+  VERIFY_IS_APPROX(m4 * m1.template triangularView<Eigen::Upper>(), m3);
+
+  // check solve with unit diagonal
+  m3 = m1.template triangularView<UnitUpper>();
+  VERIFY(m2.isApprox(m3 * (m1.template triangularView<UnitUpper>().solve(m2)), largerEps));
+
+//   VERIFY((  m1.template triangularView<Upper>()
+//           * m2.template triangularView<Upper>()).isUpperTriangular());
+
+  // test swap
+  m1.setOnes();
+  m2.setZero();
+  m2.template triangularView<Upper>().swap(m1.template triangularView<Eigen::Upper>());
+  m3.setZero();
+  m3.template triangularView<Upper>().setOnes();
+  VERIFY_IS_APPROX(m2,m3);
+  VERIFY_RAISES_STATIC_ASSERT(m1.template triangularView<Eigen::Lower>().swap(m2.template triangularView<Eigen::Upper>()));
+
+  m1.setRandom();
+  m3 = m1.template triangularView<Upper>();
+  Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic> m5(cols, internal::random<int>(1,20));  m5.setRandom();
+  Matrix<Scalar, Dynamic, MatrixType::RowsAtCompileTime> m6(internal::random<int>(1,20), rows);  m6.setRandom();
+  VERIFY_IS_APPROX(m1.template triangularView<Upper>() * m5, m3*m5);
+  VERIFY_IS_APPROX(m6*m1.template triangularView<Upper>(), m6*m3);
+
+  m1up = m1.template triangularView<Upper>();
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().template triangularView<Upper>().toDenseMatrix(), m1up);
+  VERIFY_IS_APPROX(m1up.template selfadjointView<Upper>().template triangularView<Upper>().toDenseMatrix(), m1up);
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().template triangularView<Lower>().toDenseMatrix(), m1up.adjoint());
+  VERIFY_IS_APPROX(m1up.template selfadjointView<Upper>().template triangularView<Lower>().toDenseMatrix(), m1up.adjoint());
+
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().diagonal(), m1.diagonal());
+
+  m3.setRandom();
+  const MatrixType& m3c(m3);
+  VERIFY( is_same_type(m3c.template triangularView<Lower>(),m3.template triangularView<Lower>().template conjugateIf<false>()) );
+  VERIFY( is_same_type(m3c.template triangularView<Lower>().conjugate(),m3.template triangularView<Lower>().template conjugateIf<true>()) );
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().template conjugateIf<true>().toDenseMatrix(),
+                   m3.conjugate().template triangularView<Lower>().toDenseMatrix());
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().template conjugateIf<false>().toDenseMatrix(),
+                   m3.template triangularView<Lower>().toDenseMatrix());
+
+  VERIFY( is_same_type(m3c.template selfadjointView<Lower>(),m3.template selfadjointView<Lower>().template conjugateIf<false>()) );
+  VERIFY( is_same_type(m3c.template selfadjointView<Lower>().conjugate(),m3.template selfadjointView<Lower>().template conjugateIf<true>()) );
+  VERIFY_IS_APPROX(m3.template selfadjointView<Lower>().template conjugateIf<true>().toDenseMatrix(),
+                   m3.conjugate().template selfadjointView<Lower>().toDenseMatrix());
+  VERIFY_IS_APPROX(m3.template selfadjointView<Lower>().template conjugateIf<false>().toDenseMatrix(),
+                   m3.template selfadjointView<Lower>().toDenseMatrix());
+
+}
+
+
+template<typename MatrixType> void triangular_rect(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  enum { Rows =  MatrixType::RowsAtCompileTime, Cols =  MatrixType::ColsAtCompileTime };
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             m4(rows, cols),
+             r1(rows, cols),
+             r2(rows, cols);
+
+  MatrixType m1up = m1.template triangularView<Upper>();
+  MatrixType m2up = m2.template triangularView<Upper>();
+
+  if (rows>1 && cols>1)
+  {
+    VERIFY(m1up.isUpperTriangular());
+    VERIFY(m2up.transpose().isLowerTriangular());
+    VERIFY(!m2.isLowerTriangular());
+  }
+
+  // test overloaded operator+=
+  r1.setZero();
+  r2.setZero();
+  r1.template triangularView<Upper>() +=  m1;
+  r2 += m1up;
+  VERIFY_IS_APPROX(r1,r2);
+
+  // test overloaded operator=
+  m1.setZero();
+  m1.template triangularView<Upper>() = 3 * m2;
+  m3 = 3 * m2;
+  VERIFY_IS_APPROX(m3.template triangularView<Upper>().toDenseMatrix(), m1);
+
+
+  m1.setZero();
+  m1.template triangularView<Lower>() = 3 * m2;
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().toDenseMatrix(), m1);
+
+  m1.setZero();
+  m1.template triangularView<StrictlyUpper>() = 3 * m2;
+  VERIFY_IS_APPROX(m3.template triangularView<StrictlyUpper>().toDenseMatrix(), m1);
+
+
+  m1.setZero();
+  m1.template triangularView<StrictlyLower>() = 3 * m2;
+  VERIFY_IS_APPROX(m3.template triangularView<StrictlyLower>().toDenseMatrix(), m1);
+  m1.setRandom();
+  m2 = m1.template triangularView<Upper>();
+  VERIFY(m2.isUpperTriangular());
+  VERIFY(!m2.isLowerTriangular());
+  m2 = m1.template triangularView<StrictlyUpper>();
+  VERIFY(m2.isUpperTriangular());
+  VERIFY(m2.diagonal().isMuchSmallerThan(RealScalar(1)));
+  m2 = m1.template triangularView<UnitUpper>();
+  VERIFY(m2.isUpperTriangular());
+  m2.diagonal().array() -= Scalar(1);
+  VERIFY(m2.diagonal().isMuchSmallerThan(RealScalar(1)));
+  m2 = m1.template triangularView<Lower>();
+  VERIFY(m2.isLowerTriangular());
+  VERIFY(!m2.isUpperTriangular());
+  m2 = m1.template triangularView<StrictlyLower>();
+  VERIFY(m2.isLowerTriangular());
+  VERIFY(m2.diagonal().isMuchSmallerThan(RealScalar(1)));
+  m2 = m1.template triangularView<UnitLower>();
+  VERIFY(m2.isLowerTriangular());
+  m2.diagonal().array() -= Scalar(1);
+  VERIFY(m2.diagonal().isMuchSmallerThan(RealScalar(1)));
+  // test swap
+  m1.setOnes();
+  m2.setZero();
+  m2.template triangularView<Upper>().swap(m1.template triangularView<Eigen::Upper>());
+  m3.setZero();
+  m3.template triangularView<Upper>().setOnes();
+  VERIFY_IS_APPROX(m2,m3);
+}
+
+void bug_159()
+{
+  Matrix3d m = Matrix3d::Random().triangularView<Lower>();
+  EIGEN_UNUSED_VARIABLE(m)
+}
+
+EIGEN_DECLARE_TEST(triangular)
+{
+  int maxsize = (std::min)(EIGEN_TEST_MAX_SIZE,20);
+  for(int i = 0; i < g_repeat ; i++)
+  {
+    int r = internal::random<int>(2,maxsize); TEST_SET_BUT_UNUSED_VARIABLE(r)
+    int c = internal::random<int>(2,maxsize); TEST_SET_BUT_UNUSED_VARIABLE(c)
+
+    CALL_SUBTEST_1( triangular_square(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( triangular_square(Matrix<float, 2, 2>()) );
+    CALL_SUBTEST_3( triangular_square(Matrix3d()) );
+    CALL_SUBTEST_4( triangular_square(Matrix<std::complex<float>,8, 8>()) );
+    CALL_SUBTEST_5( triangular_square(MatrixXcd(r,r)) );
+    CALL_SUBTEST_6( triangular_square(Matrix<float,Dynamic,Dynamic,RowMajor>(r, r)) );
+
+    CALL_SUBTEST_7( triangular_rect(Matrix<float, 4, 5>()) );
+    CALL_SUBTEST_8( triangular_rect(Matrix<double, 6, 2>()) );
+    CALL_SUBTEST_9( triangular_rect(MatrixXcf(r, c)) );
+    CALL_SUBTEST_5( triangular_rect(MatrixXcd(r, c)) );
+    CALL_SUBTEST_6( triangular_rect(Matrix<float,Dynamic,Dynamic,RowMajor>(r, c)) );
+
+    CALL_SUBTEST_100( triangular_deprecated(Matrix<float, 5, 7>()) );
+    CALL_SUBTEST_100( triangular_deprecated(MatrixXd(r,c)) );
+  }
+  
+  CALL_SUBTEST_1( bug_159() );
+}

diff --git a/test/type_alias.cpp b/test/type_alias.cpp
new file mode 100644
index 0000000..9a6616c
--- /dev/null
+++ b/test/type_alias.cpp

@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+EIGEN_DECLARE_TEST(type_alias)
+{
+  using namespace internal;
+
+  // To warm up, some basic checks:
+  STATIC_CHECK((is_same<MatrixXd,Matrix<double,Dynamic,Dynamic> >::value));
+  STATIC_CHECK((is_same<Matrix2f,Matrix<float,2,2> >::value));
+  STATIC_CHECK((is_same<Array33i,Array<int,3,3> >::value));
+
+#if EIGEN_HAS_CXX11
+  
+  STATIC_CHECK((is_same<MatrixX<double>,    MatrixXd>::value));
+  STATIC_CHECK((is_same<MatrixX<int>,       MatrixXi>::value));
+  STATIC_CHECK((is_same<Matrix2<int>,       Matrix2i>::value));
+  STATIC_CHECK((is_same<Matrix2X<float>,    Matrix2Xf>::value));
+  STATIC_CHECK((is_same<MatrixX4<double>,   MatrixX4d>::value));
+  STATIC_CHECK((is_same<VectorX<int>,       VectorXi>::value));
+  STATIC_CHECK((is_same<Vector2<float>,     Vector2f>::value));
+  STATIC_CHECK((is_same<RowVectorX<int>,    RowVectorXi>::value));
+  STATIC_CHECK((is_same<RowVector2<float>,  RowVector2f>::value));
+
+  STATIC_CHECK((is_same<ArrayXX<float>,     ArrayXXf>::value));
+  STATIC_CHECK((is_same<Array33<int>,       Array33i>::value));
+  STATIC_CHECK((is_same<Array2X<float>,     Array2Xf>::value));
+  STATIC_CHECK((is_same<ArrayX4<double>,    ArrayX4d>::value));
+  STATIC_CHECK((is_same<ArrayX<double>,     ArrayXd>::value));
+  STATIC_CHECK((is_same<Array4<double>,     Array4d>::value));
+
+  STATIC_CHECK((is_same<Vector<float,3>,        Vector3f>::value));
+  STATIC_CHECK((is_same<Vector<int,Dynamic>,    VectorXi>::value));
+  STATIC_CHECK((is_same<RowVector<float,3>,     RowVector3f>::value));
+  STATIC_CHECK((is_same<RowVector<int,Dynamic>, RowVectorXi>::value));
+
+#else
+  std::cerr << "WARNING: c++11 type aliases not tested.\n";
+#endif
+}

diff --git a/test/umeyama.cpp b/test/umeyama.cpp
new file mode 100644
index 0000000..170c28a
--- /dev/null
+++ b/test/umeyama.cpp

@@ -0,0 +1,183 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+
+#include <Eigen/LU> // required for MatrixBase::determinant
+#include <Eigen/SVD> // required for SVD
+
+using namespace Eigen;
+
+//  Constructs a random matrix from the unitary group U(size).
+template <typename T>
+Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> randMatrixUnitary(int size)
+{
+  typedef T Scalar;
+  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> MatrixType;
+
+  MatrixType Q;
+
+  int max_tries = 40;
+  bool is_unitary = false;
+
+  while (!is_unitary && max_tries > 0)
+  {
+    // initialize random matrix
+    Q = MatrixType::Random(size, size);
+
+    // orthogonalize columns using the Gram-Schmidt algorithm
+    for (int col = 0; col < size; ++col)
+    {
+      typename MatrixType::ColXpr colVec = Q.col(col);
+      for (int prevCol = 0; prevCol < col; ++prevCol)
+      {
+        typename MatrixType::ColXpr prevColVec = Q.col(prevCol);
+        colVec -= colVec.dot(prevColVec)*prevColVec;
+      }
+      Q.col(col) = colVec.normalized();
+    }
+
+    // this additional orthogonalization is not necessary in theory but should enhance
+    // the numerical orthogonality of the matrix
+    for (int row = 0; row < size; ++row)
+    {
+      typename MatrixType::RowXpr rowVec = Q.row(row);
+      for (int prevRow = 0; prevRow < row; ++prevRow)
+      {
+        typename MatrixType::RowXpr prevRowVec = Q.row(prevRow);
+        rowVec -= rowVec.dot(prevRowVec)*prevRowVec;
+      }
+      Q.row(row) = rowVec.normalized();
+    }
+
+    // final check
+    is_unitary = Q.isUnitary();
+    --max_tries;
+  }
+
+  if (max_tries == 0)
+    eigen_assert(false && "randMatrixUnitary: Could not construct unitary matrix!");
+
+  return Q;
+}
+
+//  Constructs a random matrix from the special unitary group SU(size).
+template <typename T>
+Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> randMatrixSpecialUnitary(int size)
+{
+  typedef T Scalar;
+
+  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> MatrixType;
+
+  // initialize unitary matrix
+  MatrixType Q = randMatrixUnitary<Scalar>(size);
+
+  // tweak the first column to make the determinant be 1
+  Q.col(0) *= numext::conj(Q.determinant());
+
+  return Q;
+}
+
+template <typename MatrixType>
+void run_test(int dim, int num_elements)
+{
+  using std::abs;
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  typedef Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> MatrixX;
+  typedef Matrix<Scalar, Eigen::Dynamic, 1> VectorX;
+
+  // MUST be positive because in any other case det(cR_t) may become negative for
+  // odd dimensions!
+  const Scalar c = abs(internal::random<Scalar>());
+
+  MatrixX R = randMatrixSpecialUnitary<Scalar>(dim);
+  VectorX t = Scalar(50)*VectorX::Random(dim,1);
+
+  MatrixX cR_t = MatrixX::Identity(dim+1,dim+1);
+  cR_t.block(0,0,dim,dim) = c*R;
+  cR_t.block(0,dim,dim,1) = t;
+
+  MatrixX src = MatrixX::Random(dim+1, num_elements);
+  src.row(dim) = Matrix<Scalar, 1, Dynamic>::Constant(num_elements, Scalar(1));
+
+  MatrixX dst = cR_t*src;
+
+  MatrixX cR_t_umeyama = umeyama(src.block(0,0,dim,num_elements), dst.block(0,0,dim,num_elements));
+
+  const Scalar error = ( cR_t_umeyama*src - dst ).norm() / dst.norm();
+  VERIFY(error < Scalar(40)*std::numeric_limits<Scalar>::epsilon());
+}
+
+template<typename Scalar, int Dimension>
+void run_fixed_size_test(int num_elements)
+{
+  using std::abs;
+  typedef Matrix<Scalar, Dimension+1, Dynamic> MatrixX;
+  typedef Matrix<Scalar, Dimension+1, Dimension+1> HomMatrix;
+  typedef Matrix<Scalar, Dimension, Dimension> FixedMatrix;
+  typedef Matrix<Scalar, Dimension, 1> FixedVector;
+
+  const int dim = Dimension;
+
+  // MUST be positive because in any other case det(cR_t) may become negative for
+  // odd dimensions!
+  // Also if c is to small compared to t.norm(), problem is ill-posed (cf. Bug 744)
+  const Scalar c = internal::random<Scalar>(0.5, 2.0);
+
+  FixedMatrix R = randMatrixSpecialUnitary<Scalar>(dim);
+  FixedVector t = Scalar(32)*FixedVector::Random(dim,1);
+
+  HomMatrix cR_t = HomMatrix::Identity(dim+1,dim+1);
+  cR_t.block(0,0,dim,dim) = c*R;
+  cR_t.block(0,dim,dim,1) = t;
+
+  MatrixX src = MatrixX::Random(dim+1, num_elements);
+  src.row(dim) = Matrix<Scalar, 1, Dynamic>::Constant(num_elements, Scalar(1));
+
+  MatrixX dst = cR_t*src;
+
+  Block<MatrixX, Dimension, Dynamic> src_block(src,0,0,dim,num_elements);
+  Block<MatrixX, Dimension, Dynamic> dst_block(dst,0,0,dim,num_elements);
+
+  HomMatrix cR_t_umeyama = umeyama(src_block, dst_block);
+
+  const Scalar error = ( cR_t_umeyama*src - dst ).squaredNorm();
+
+  VERIFY(error < Scalar(16)*std::numeric_limits<Scalar>::epsilon());
+}
+
+EIGEN_DECLARE_TEST(umeyama)
+{
+  for (int i=0; i<g_repeat; ++i)
+  {
+    const int num_elements = internal::random<int>(40,500);
+
+    // works also for dimensions bigger than 3...
+    for (int dim=2; dim<8; ++dim)
+    {
+      CALL_SUBTEST_1(run_test<MatrixXd>(dim, num_elements));
+      CALL_SUBTEST_2(run_test<MatrixXf>(dim, num_elements));
+    }
+
+    CALL_SUBTEST_3((run_fixed_size_test<float, 2>(num_elements)));
+    CALL_SUBTEST_4((run_fixed_size_test<float, 3>(num_elements)));
+    CALL_SUBTEST_5((run_fixed_size_test<float, 4>(num_elements)));
+
+    CALL_SUBTEST_6((run_fixed_size_test<double, 2>(num_elements)));
+    CALL_SUBTEST_7((run_fixed_size_test<double, 3>(num_elements)));
+    CALL_SUBTEST_8((run_fixed_size_test<double, 4>(num_elements)));
+  }
+
+  // Those two calls don't compile and result in meaningful error messages!
+  // umeyama(MatrixXcf(),MatrixXcf());
+  // umeyama(MatrixXcd(),MatrixXcd());
+}

diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp
new file mode 100644
index 0000000..d8f2a6f
--- /dev/null
+++ b/test/umfpack_support.cpp

@@ -0,0 +1,34 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#include <Eigen/UmfPackSupport>
+
+template<typename T1, typename T2> void test_umfpack_support_T()
+{
+  UmfPackLU<SparseMatrix<T1, ColMajor, T2> > umfpack_colmajor;
+  UmfPackLU<SparseMatrix<T1, RowMajor, T2> > umfpack_rowmajor;
+  
+  check_sparse_square_solving(umfpack_colmajor);
+  check_sparse_square_solving(umfpack_rowmajor);
+  
+  check_sparse_square_determinant(umfpack_colmajor);
+  check_sparse_square_determinant(umfpack_rowmajor);
+}
+
+EIGEN_DECLARE_TEST(umfpack_support)
+{
+  CALL_SUBTEST_1((test_umfpack_support_T<double, int>()));
+  CALL_SUBTEST_2((test_umfpack_support_T<std::complex<double>, int>()));
+  CALL_SUBTEST_3((test_umfpack_support_T<double, long >()));
+  CALL_SUBTEST_4((test_umfpack_support_T<std::complex<double>, long>()));
+}
+

diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp
new file mode 100644
index 0000000..52cdd9e
--- /dev/null
+++ b/test/unalignedcount.cpp

@@ -0,0 +1,60 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+static int nb_load;
+static int nb_loadu;
+static int nb_store;
+static int nb_storeu;
+
+#define EIGEN_DEBUG_ALIGNED_LOAD    { nb_load++;    }
+#define EIGEN_DEBUG_UNALIGNED_LOAD  { nb_loadu++;   }
+#define EIGEN_DEBUG_ALIGNED_STORE   { nb_store++;   }
+#define EIGEN_DEBUG_UNALIGNED_STORE { nb_storeu++;  }
+
+#define VERIFY_ALIGNED_UNALIGNED_COUNT(XPR,AL,UL,AS,US) {\
+    nb_load = nb_loadu = nb_store = nb_storeu = 0; \
+    XPR; \
+    if(!(nb_load==AL && nb_loadu==UL && nb_store==AS && nb_storeu==US)) \
+      std::cerr << " >> " << nb_load << ", " << nb_loadu << ", " << nb_store << ", " << nb_storeu << "\n"; \
+    VERIFY( (#XPR) && nb_load==AL && nb_loadu==UL && nb_store==AS && nb_storeu==US ); \
+  }
+
+
+#include "main.h"
+
+EIGEN_DECLARE_TEST(unalignedcount)
+{
+  #if defined(EIGEN_VECTORIZE_AVX512)
+  VectorXf a(48), b(48);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 6, 0, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) += b.segment(0,48), 3, 3, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) -= b.segment(0,48), 3, 3, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) *= 3.5, 3, 0, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) /= 3.5, 3, 0, 3, 0);
+  #elif defined(EIGEN_VECTORIZE_AVX)
+  VectorXf a(40), b(40);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 5, 5, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 5, 0, 5, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 5, 0, 5, 0);
+  #elif defined(EIGEN_VECTORIZE_SSE)
+  VectorXf a(40), b(40);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 10, 10, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 10, 0, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 10, 0, 10, 0);
+  #else
+  // The following line is to eliminate "variable not used" warnings
+  nb_load = nb_loadu = nb_store = nb_storeu = 0;
+  int a(0), b(0);
+  VERIFY(a==b);
+  #endif
+}

diff --git a/test/upperbidiagonalization.cpp b/test/upperbidiagonalization.cpp
new file mode 100644
index 0000000..945c999
--- /dev/null
+++ b/test/upperbidiagonalization.cpp

@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/SVD>
+
+template<typename MatrixType> void upperbidiag(const MatrixType& m)
+{
+  const Index rows = m.rows();
+  const Index cols = m.cols();
+
+  typedef Matrix<typename MatrixType::RealScalar, MatrixType::RowsAtCompileTime,  MatrixType::ColsAtCompileTime> RealMatrixType;
+  typedef Matrix<typename MatrixType::Scalar, MatrixType::ColsAtCompileTime,  MatrixType::RowsAtCompileTime> TransposeMatrixType;
+
+  MatrixType a = MatrixType::Random(rows,cols);
+  internal::UpperBidiagonalization<MatrixType> ubd(a);
+  RealMatrixType b(rows, cols);
+  b.setZero();
+  b.block(0,0,cols,cols) = ubd.bidiagonal();
+  MatrixType c = ubd.householderU() * b * ubd.householderV().adjoint();
+  VERIFY_IS_APPROX(a,c);
+  TransposeMatrixType d = ubd.householderV() * b.adjoint() * ubd.householderU().adjoint();
+  VERIFY_IS_APPROX(a.adjoint(),d);
+}
+
+EIGEN_DECLARE_TEST(upperbidiagonalization)
+{
+  for(int i = 0; i < g_repeat; i++) {
+   CALL_SUBTEST_1( upperbidiag(MatrixXf(3,3)) );
+   CALL_SUBTEST_2( upperbidiag(MatrixXd(17,12)) );
+   CALL_SUBTEST_3( upperbidiag(MatrixXcf(20,20)) );
+   CALL_SUBTEST_4( upperbidiag(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(16,15)) );
+   CALL_SUBTEST_5( upperbidiag(Matrix<float,6,4>()) );
+   CALL_SUBTEST_6( upperbidiag(Matrix<float,5,5>()) );
+   CALL_SUBTEST_7( upperbidiag(Matrix<double,4,3>()) );
+  }
+}

diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
new file mode 100644
index 0000000..97c0bda
--- /dev/null
+++ b/test/vectorization_logic.cpp

@@ -0,0 +1,429 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_1
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+#define EIGEN_UNALIGNED_VECTORIZE 0
+#endif
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
+#define EIGEN_DEBUG_ASSIGN
+#include "main.h"
+#include <typeinfo>
+
+// Disable "ignoring attributes on template argument"
+// for packet_traits<Packet*>
+// => The only workaround would be to wrap _m128 and the likes
+//    within wrappers.
+#if EIGEN_GNUC_AT_LEAST(6,0)
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
+using internal::demangle_flags;
+using internal::demangle_traversal;
+using internal::demangle_unrolling;
+
+template<typename Dst, typename Src>
+bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
+{
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src);
+  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
+  bool res = traits::Traversal==traversal;
+  if(unrolling==InnerUnrolling+CompleteUnrolling)
+    res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling);
+  else
+    res = res && int(traits::Unrolling)==unrolling;
+  if(!res)
+  {
+    std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Src>::Flags) << std::endl;
+    std::cerr << "Dst: " << demangle_flags(Dst::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Dst>::Flags) << std::endl;
+    traits::debug();
+    std::cerr << " Expected Traversal == " << demangle_traversal(traversal)
+              << " got " << demangle_traversal(traits::Traversal) << "\n";
+    std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling)
+              << " got " << demangle_unrolling(traits::Unrolling) << "\n";
+  }
+  return res;
+}
+
+template<typename Dst, typename Src>
+bool test_assign(int traversal, int unrolling)
+{
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src);
+  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
+  bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
+  if(!res)
+  {
+    std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Src>::Flags) << std::endl;
+    std::cerr << "Dst: " << demangle_flags(Dst::Flags) << std::endl;
+    std::cerr << "     " << demangle_flags(internal::evaluator<Dst>::Flags) << std::endl;
+    traits::debug();
+    std::cerr << " Expected Traversal == " << demangle_traversal(traversal)
+              << " got " << demangle_traversal(traits::Traversal) << "\n";
+    std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling)
+              << " got " << demangle_unrolling(traits::Unrolling) << "\n";
+  }
+  return res;
+}
+
+template<typename Xpr>
+bool test_redux(const Xpr&, int traversal, int unrolling)
+{
+  typedef typename Xpr::Scalar Scalar;
+  typedef internal::redux_traits<internal::scalar_sum_op<Scalar,Scalar>,internal::redux_evaluator<Xpr> > traits;
+  
+  bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
+  if(!res)
+  {
+    std::cerr << demangle_flags(Xpr::Flags) << std::endl;
+    std::cerr << demangle_flags(internal::evaluator<Xpr>::Flags) << std::endl;
+    traits::debug();
+    
+    std::cerr << " Expected Traversal == " << demangle_traversal(traversal)
+              << " got " << demangle_traversal(traits::Traversal) << "\n";
+    std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling)
+              << " got " << demangle_unrolling(traits::Unrolling) << "\n";
+  }
+  return res;
+}
+
+template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectorizable>
+struct vectorization_logic
+{
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  
+  typedef typename internal::packet_traits<Scalar>::type PacketType;
+  typedef typename internal::unpacket_traits<PacketType>::half HalfPacketType;
+  enum {
+    PacketSize = internal::unpacket_traits<PacketType>::size,
+    HalfPacketSize = internal::unpacket_traits<HalfPacketType>::size
+  };
+  static void run()
+  {
+    
+    typedef Matrix<Scalar,PacketSize,1> Vector1;
+    typedef Matrix<Scalar,Dynamic,1> VectorX;
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixXX;
+    typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
+    typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?8:2*PacketSize,(Matrix11::Flags&RowMajorBit)?2*PacketSize:8>   Matrix22;
+    typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
+    typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
+    typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c;
+    typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
+
+    typedef Matrix<Scalar,
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+      > Matrix1;
+
+    typedef Matrix<Scalar,
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+      DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
+
+    // this type is made such that it can only be vectorized when viewed as a linear 1D vector
+    typedef Matrix<Scalar,
+        (PacketSize==16 ?  4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+      > Matrix3;
+    
+    #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+    VERIFY(test_assign(Vector1(),Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().template cast<Scalar>(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_assign(Matrix44(),Matrix44()+Matrix44(),
+      InnerVectorizedTraversal,InnerUnrolling));
+
+    VERIFY(test_assign(Matrix44u(),Matrix44()+Matrix44(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal,
+      EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
+
+    VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(),
+      (int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal,
+      CompleteUnrolling));
+
+    VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
+      EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                : LinearTraversal, CompleteUnrolling));
+
+    VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3),
+      InnerVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1),
+      InnerVectorizedTraversal,CompleteUnrolling));
+
+    if(PacketSize>1)
+    {
+      typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
+      typedef Matrix<Scalar,3,1,ColMajor> Vector3;
+      VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
+        LinearTraversal,CompleteUnrolling));
+      VERIFY(test_assign(Vector3(),Vector3()+Vector3(),
+        sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), CompleteUnrolling));
+      VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
+        EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                  : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal),
+        ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()),
+        LinearVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
+        sizeof(Scalar)==16        ? InnerVectorizedTraversal  :
+        EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal :
+                                    LinearTraversal,
+        NoUnrolling));
+
+      VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling));
+
+
+      VERIFY(test_assign(Matrix11(),Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(3,2),
+        (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, CompleteUnrolling|InnerUnrolling));
+
+      VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
+                         InnerVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
+                         InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
+    }
+
+    VERIFY(test_redux(Vector1(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Vector1().array()*Vector1().array(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux((Vector1().array()*Vector1().array()).col(0),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix3(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix44(),
+      LinearVectorizedTraversal,NoUnrolling));
+
+    if(PacketSize>1) {
+      VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2),
+        SliceVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?2:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:2>(1,2),
+        DefaultTraversal,CompleteUnrolling));
+    }
+
+    VERIFY(test_redux(Matrix44c().template block<2*PacketSize,1>(1,2),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix44r().template block<1,2*PacketSize>(2,1),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY((test_assign<
+            Map<Matrix22, AlignedMax, OuterStride<3*PacketSize> >,
+            Matrix22
+            >(InnerVectorizedTraversal,CompleteUnrolling)));
+
+    VERIFY((test_assign<
+            Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
+            Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
+            >(DefaultTraversal,PacketSize>=8?InnerUnrolling:CompleteUnrolling)));
+
+    VERIFY((test_assign(Matrix11(), Matrix<Scalar,PacketSize,EIGEN_PLAIN_ENUM_MIN(2,PacketSize)>()*Matrix<Scalar,EIGEN_PLAIN_ENUM_MIN(2,PacketSize),PacketSize>(),
+                        InnerVectorizedTraversal, CompleteUnrolling)));
+    #endif
+
+    VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3),
+      SliceVectorizedTraversal,NoUnrolling));
+
+    VERIFY(test_redux(VectorX(10),
+      LinearVectorizedTraversal,NoUnrolling));
+  }
+};
+
+template<typename Scalar> struct vectorization_logic<Scalar,false>
+{
+  static void run() {}
+};
+
+template<typename Scalar, bool Enable = !internal::is_same<typename internal::unpacket_traits<typename internal::packet_traits<Scalar>::type>::half,
+                                                           typename internal::packet_traits<Scalar>::type>::value >
+struct vectorization_logic_half
+{
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  typedef typename internal::unpacket_traits<typename internal::packet_traits<Scalar>::type>::half PacketType;
+  enum {
+    PacketSize = internal::unpacket_traits<PacketType>::size
+  };
+  static void run()
+  {
+    
+    typedef Matrix<Scalar,PacketSize,1> Vector1;
+    typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
+    typedef Matrix<Scalar,5*PacketSize,7,ColMajor> Matrix57;
+    typedef Matrix<Scalar,3*PacketSize,5,ColMajor> Matrix35;
+    typedef Matrix<Scalar,5*PacketSize,7,DontAlign|ColMajor> Matrix57u;
+
+    typedef Matrix<Scalar,
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+      > Matrix1;
+
+    typedef Matrix<Scalar,
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+      DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
+
+    // this type is made such that it can only be vectorized when viewed as a linear 1D vector
+    typedef Matrix<Scalar,
+        (PacketSize==16 ?  4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+      > Matrix3;
+    
+    #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+    VERIFY(test_assign(Vector1(),Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().template segment<PacketSize>(0).derived(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Scalar(2.1)*Vector1()-Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment<PacketSize>(0)-Vector1().template segment<PacketSize>(0)).derived(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().template cast<Scalar>(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_assign(Matrix57(),Matrix57()+Matrix57(),
+      InnerVectorizedTraversal,InnerUnrolling));
+
+    VERIFY(test_assign(Matrix57u(),Matrix57()+Matrix57(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal,
+      EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
+
+    VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
+      EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
+        
+    if(PacketSize>1)
+    {
+      typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
+      VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
+        LinearTraversal,CompleteUnrolling));
+      VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
+        EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                  : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal),
+        ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling));
+              
+      VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
+        PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
+        
+      VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
+        sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal),
+        NoUnrolling));
+        
+      VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
+        EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling));
+  
+
+      VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
+                         InnerVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
+                         InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
+    }
+    
+    VERIFY(test_redux(Vector1(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix3(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix35(),
+      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux(Matrix57().template block<PacketSize==1?2:PacketSize,3>(1,0),
+      SliceVectorizedTraversal,CompleteUnrolling));
+
+    if(PacketSize>1) {
+      VERIFY(test_redux(Matrix57().template block<PacketSize,2>(1,0),
+        DefaultTraversal,CompleteUnrolling));
+    }
+
+    VERIFY((test_assign<
+            Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
+            Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
+            >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)));
+
+    VERIFY((test_assign(Matrix57(), Matrix<Scalar,5*PacketSize,3>()*Matrix<Scalar,3,7>(),
+                        InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling)));
+    #endif
+  }
+};
+
+template<typename Scalar> struct vectorization_logic_half<Scalar,false>
+{
+  static void run() {}
+};
+
+EIGEN_DECLARE_TEST(vectorization_logic)
+{
+
+#ifdef EIGEN_VECTORIZE
+
+  CALL_SUBTEST( vectorization_logic<int>::run() );
+  CALL_SUBTEST( vectorization_logic<float>::run() );
+  CALL_SUBTEST( vectorization_logic<double>::run() );
+  CALL_SUBTEST( vectorization_logic<std::complex<float> >::run() );
+  CALL_SUBTEST( vectorization_logic<std::complex<double> >::run() );
+  
+  CALL_SUBTEST( vectorization_logic_half<int>::run() );
+  CALL_SUBTEST( vectorization_logic_half<float>::run() );
+  CALL_SUBTEST( vectorization_logic_half<double>::run() );
+  CALL_SUBTEST( vectorization_logic_half<std::complex<float> >::run() );
+  CALL_SUBTEST( vectorization_logic_half<std::complex<double> >::run() );
+  
+  if(internal::packet_traits<float>::Vectorizable)
+  {
+    VERIFY(test_assign(Matrix<float,3,3>(),Matrix<float,3,3>()+Matrix<float,3,3>(),
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
+      
+    VERIFY(test_redux(Matrix<float,5,2>(),
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling));
+  }
+  
+  if(internal::packet_traits<double>::Vectorizable)
+  {
+    VERIFY(test_assign(Matrix<double,3,3>(),Matrix<double,3,3>()+Matrix<double,3,3>(),
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
+    
+    VERIFY(test_redux(Matrix<double,7,3>(),
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling));
+  }
+#endif // EIGEN_VECTORIZE
+
+}

diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp
new file mode 100644
index 0000000..8ee5884
--- /dev/null
+++ b/test/vectorwiseop.cpp

@@ -0,0 +1,298 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+template<typename ArrayType> void vectorwiseop_array(const ArrayType& m)
+{
+  typedef typename ArrayType::Scalar Scalar;
+  typedef Array<Scalar, ArrayType::RowsAtCompileTime, 1> ColVectorType;
+  typedef Array<Scalar, 1, ArrayType::ColsAtCompileTime> RowVectorType;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+            m2(rows, cols),
+            m3(rows, cols);
+
+  ColVectorType colvec = ColVectorType::Random(rows);
+  RowVectorType rowvec = RowVectorType::Random(cols);
+
+  // test addition
+
+  m2 = m1;
+  m2.colwise() += colvec;
+  VERIFY_IS_APPROX(m2, m1.colwise() + colvec);
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c) + colvec);
+
+  VERIFY_RAISES_ASSERT(m2.colwise() += colvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.colwise() + colvec.transpose());
+
+  m2 = m1;
+  m2.rowwise() += rowvec;
+  VERIFY_IS_APPROX(m2, m1.rowwise() + rowvec);
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r) + rowvec);
+
+  VERIFY_RAISES_ASSERT(m2.rowwise() += rowvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.rowwise() + rowvec.transpose());
+
+  // test substraction
+
+  m2 = m1;
+  m2.colwise() -= colvec;
+  VERIFY_IS_APPROX(m2, m1.colwise() - colvec);
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c) - colvec);
+
+  VERIFY_RAISES_ASSERT(m2.colwise() -= colvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.colwise() - colvec.transpose());
+
+  m2 = m1;
+  m2.rowwise() -= rowvec;
+  VERIFY_IS_APPROX(m2, m1.rowwise() - rowvec);
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r) - rowvec);
+
+  VERIFY_RAISES_ASSERT(m2.rowwise() -= rowvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose());
+
+  // test multiplication
+
+  m2 = m1;
+  m2.colwise() *= colvec;
+  VERIFY_IS_APPROX(m2, m1.colwise() * colvec);
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c) * colvec);
+
+  VERIFY_RAISES_ASSERT(m2.colwise() *= colvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.colwise() * colvec.transpose());
+
+  m2 = m1;
+  m2.rowwise() *= rowvec;
+  VERIFY_IS_APPROX(m2, m1.rowwise() * rowvec);
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r) * rowvec);
+
+  VERIFY_RAISES_ASSERT(m2.rowwise() *= rowvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.rowwise() * rowvec.transpose());
+
+  // test quotient
+
+  m2 = m1;
+  m2.colwise() /= colvec;
+  VERIFY_IS_APPROX(m2, m1.colwise() / colvec);
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c) / colvec);
+
+  VERIFY_RAISES_ASSERT(m2.colwise() /= colvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.colwise() / colvec.transpose());
+
+  m2 = m1;
+  m2.rowwise() /= rowvec;
+  VERIFY_IS_APPROX(m2, m1.rowwise() / rowvec);
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r) / rowvec);
+
+  VERIFY_RAISES_ASSERT(m2.rowwise() /= rowvec.transpose());
+  VERIFY_RAISES_ASSERT(m1.rowwise() / rowvec.transpose());
+
+  m2 = m1;
+  // yes, there might be an aliasing issue there but ".rowwise() /="
+  // is supposed to evaluate " m2.colwise().sum()" into a temporary to avoid
+  // evaluating the reduction multiple times
+  if(ArrayType::RowsAtCompileTime>2 || ArrayType::RowsAtCompileTime==Dynamic)
+  {
+    m2.rowwise() /= m2.colwise().sum();
+    VERIFY_IS_APPROX(m2, m1.rowwise() / m1.colwise().sum());
+  }
+
+  // all/any
+  Array<bool,Dynamic,Dynamic> mb(rows,cols);
+  mb = (m1.real()<=0.7).colwise().all();
+  VERIFY( (mb.col(c) == (m1.real().col(c)<=0.7).all()).all() );
+  mb = (m1.real()<=0.7).rowwise().all();
+  VERIFY( (mb.row(r) == (m1.real().row(r)<=0.7).all()).all() );
+
+  mb = (m1.real()>=0.7).colwise().any();
+  VERIFY( (mb.col(c) == (m1.real().col(c)>=0.7).any()).all() );
+  mb = (m1.real()>=0.7).rowwise().any();
+  VERIFY( (mb.row(r) == (m1.real().row(r)>=0.7).any()).all() );
+}
+
+template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVectorType;
+  typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
+  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealColVectorType;
+  typedef Matrix<RealScalar, 1, MatrixType::ColsAtCompileTime> RealRowVectorType;
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
+
+  Index rows = m.rows();
+  Index cols = m.cols();
+  Index r = internal::random<Index>(0, rows-1),
+        c = internal::random<Index>(0, cols-1);
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+            m2(rows, cols),
+            m3(rows, cols);
+
+  ColVectorType colvec = ColVectorType::Random(rows);
+  RowVectorType rowvec = RowVectorType::Random(cols);
+  RealColVectorType rcres;
+  RealRowVectorType rrres;
+
+  // test broadcast assignment
+  m2 = m1;
+  m2.colwise() = colvec;
+  for(Index j=0; j<cols; ++j)
+    VERIFY_IS_APPROX(m2.col(j), colvec);
+  m2.rowwise() = rowvec;
+  for(Index i=0; i<rows; ++i)
+    VERIFY_IS_APPROX(m2.row(i), rowvec);
+  if(rows>1)
+    VERIFY_RAISES_ASSERT(m2.colwise() = colvec.transpose());
+  if(cols>1)
+    VERIFY_RAISES_ASSERT(m2.rowwise() = rowvec.transpose());
+
+  // test addition
+
+  m2 = m1;
+  m2.colwise() += colvec;
+  VERIFY_IS_APPROX(m2, m1.colwise() + colvec);
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c) + colvec);
+
+  if(rows>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.colwise() += colvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.colwise() + colvec.transpose());
+  }
+
+  m2 = m1;
+  m2.rowwise() += rowvec;
+  VERIFY_IS_APPROX(m2, m1.rowwise() + rowvec);
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r) + rowvec);
+
+  if(cols>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.rowwise() += rowvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.rowwise() + rowvec.transpose());
+  }
+
+  // test substraction
+
+  m2 = m1;
+  m2.colwise() -= colvec;
+  VERIFY_IS_APPROX(m2, m1.colwise() - colvec);
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c) - colvec);
+
+  if(rows>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.colwise() -= colvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.colwise() - colvec.transpose());
+  }
+
+  m2 = m1;
+  m2.rowwise() -= rowvec;
+  VERIFY_IS_APPROX(m2, m1.rowwise() - rowvec);
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r) - rowvec);
+
+  if(cols>1)
+  {
+    VERIFY_RAISES_ASSERT(m2.rowwise() -= rowvec.transpose());
+    VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose());
+  }
+
+  // ------ partial reductions ------
+
+  #define TEST_PARTIAL_REDUX_BASIC(FUNC,ROW,COL,PREPROCESS) {                          \
+    ROW = m1 PREPROCESS .colwise().FUNC ;                                              \
+    for(Index k=0; k<cols; ++k) VERIFY_IS_APPROX(ROW(k), m1.col(k) PREPROCESS .FUNC ); \
+    COL = m1 PREPROCESS .rowwise().FUNC ;                                              \
+    for(Index k=0; k<rows; ++k) VERIFY_IS_APPROX(COL(k), m1.row(k) PREPROCESS .FUNC ); \
+  }
+
+  TEST_PARTIAL_REDUX_BASIC(sum(),        rowvec,colvec,EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(prod(),       rowvec,colvec,EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(mean(),       rowvec,colvec,EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(minCoeff(),   rrres, rcres, .real());
+  TEST_PARTIAL_REDUX_BASIC(maxCoeff(),   rrres, rcres, .real());
+  TEST_PARTIAL_REDUX_BASIC(norm(),       rrres, rcres, EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(squaredNorm(),rrres, rcres, EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(redux(internal::scalar_sum_op<Scalar,Scalar>()),rowvec,colvec,EIGEN_EMPTY);
+
+  VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum(), m1.colwise().template lpNorm<1>());
+  VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().sum(), m1.rowwise().template lpNorm<1>());
+  VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm<Infinity>());
+  VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm<Infinity>());
+
+  // regression for bug 1158
+  VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum());
+
+  // test normalized
+  m2 = m1.colwise().normalized();
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized());
+  m2 = m1.rowwise().normalized();
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized());
+
+  // test normalize
+  m2 = m1;
+  m2.colwise().normalize();
+  VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized());
+  m2 = m1;
+  m2.rowwise().normalize();
+  VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized());
+
+  // test with partial reduction of products
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,MatrixType::RowsAtCompileTime> m1m1 = m1 * m1.transpose();
+  VERIFY_IS_APPROX( (m1 * m1.transpose()).colwise().sum(), m1m1.colwise().sum());
+  Matrix<Scalar,1,MatrixType::RowsAtCompileTime> tmp(rows);
+  VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), 1);
+
+  m2 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())).eval();
+  m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows()));
+  VERIFY_IS_APPROX( m1, m2 );
+  VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) );
+
+  // test empty expressions
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().sum().eval(), MatrixX::Zero(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,0).colwise().sum().eval(), MatrixX::Zero(1,cols));
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,fix<0>).rowwise().sum().eval(), MatrixX::Zero(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,fix<0>).colwise().sum().eval(), MatrixX::Zero(1,cols));
+
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().prod().eval(), MatrixX::Ones(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,0).colwise().prod().eval(), MatrixX::Ones(1,cols));
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,fix<0>).rowwise().prod().eval(), MatrixX::Ones(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,fix<0>).colwise().prod().eval(), MatrixX::Ones(1,cols));
+  
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().squaredNorm().eval(), MatrixX::Zero(rows,1));
+
+  VERIFY_RAISES_ASSERT(m1.real().middleCols(0,0).rowwise().minCoeff().eval());
+  VERIFY_RAISES_ASSERT(m1.real().middleRows(0,0).colwise().maxCoeff().eval());
+  VERIFY_IS_EQUAL(m1.real().middleRows(0,0).rowwise().maxCoeff().eval().rows(),0);
+  VERIFY_IS_EQUAL(m1.real().middleCols(0,0).colwise().maxCoeff().eval().cols(),0);
+  VERIFY_IS_EQUAL(m1.real().middleRows(0,fix<0>).rowwise().maxCoeff().eval().rows(),0);
+  VERIFY_IS_EQUAL(m1.real().middleCols(0,fix<0>).colwise().maxCoeff().eval().cols(),0);
+}
+
+EIGEN_DECLARE_TEST(vectorwiseop)
+{
+  CALL_SUBTEST_1( vectorwiseop_array(Array22cd()) );
+  CALL_SUBTEST_2( vectorwiseop_array(Array<double, 3, 2>()) );
+  CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) );
+  CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) );
+  CALL_SUBTEST_5( vectorwiseop_matrix(Matrix4f()) );
+  CALL_SUBTEST_5( vectorwiseop_matrix(Vector4f()) );
+  CALL_SUBTEST_5( vectorwiseop_matrix(Matrix<float,4,5>()) );
+  CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  CALL_SUBTEST_7( vectorwiseop_matrix(RowVectorXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+}

diff --git a/test/visitor.cpp b/test/visitor.cpp
new file mode 100644
index 0000000..20fb2c3
--- /dev/null
+++ b/test/visitor.cpp

@@ -0,0 +1,193 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename MatrixType> void matrixVisitor(const MatrixType& p)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = p.rows();
+  Index cols = p.cols();
+
+  // construct a random matrix where all coefficients are different
+  MatrixType m;
+  m = MatrixType::Random(rows, cols);
+  for(Index i = 0; i < m.size(); i++)
+    for(Index i2 = 0; i2 < i; i2++)
+      while(m(i) == m(i2)) // yes, ==
+        m(i) = internal::random<Scalar>();
+  
+  Scalar minc = Scalar(1000), maxc = Scalar(-1000);
+  Index minrow=0,mincol=0,maxrow=0,maxcol=0;
+  for(Index j = 0; j < cols; j++)
+  for(Index i = 0; i < rows; i++)
+  {
+    if(m(i,j) < minc)
+    {
+      minc = m(i,j);
+      minrow = i;
+      mincol = j;
+    }
+    if(m(i,j) > maxc)
+    {
+      maxc = m(i,j);
+      maxrow = i;
+      maxcol = j;
+    }
+  }
+  Index eigen_minrow, eigen_mincol, eigen_maxrow, eigen_maxcol;
+  Scalar eigen_minc, eigen_maxc;
+  eigen_minc = m.minCoeff(&eigen_minrow,&eigen_mincol);
+  eigen_maxc = m.maxCoeff(&eigen_maxrow,&eigen_maxcol);
+  VERIFY(minrow == eigen_minrow);
+  VERIFY(maxrow == eigen_maxrow);
+  VERIFY(mincol == eigen_mincol);
+  VERIFY(maxcol == eigen_maxcol);
+  VERIFY_IS_APPROX(minc, eigen_minc);
+  VERIFY_IS_APPROX(maxc, eigen_maxc);
+  VERIFY_IS_APPROX(minc, m.minCoeff());
+  VERIFY_IS_APPROX(maxc, m.maxCoeff());
+
+  eigen_maxc = (m.adjoint()*m).maxCoeff(&eigen_maxrow,&eigen_maxcol);
+  Index maxrow2=0,maxcol2=0;
+  eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow2,&maxcol2);
+  VERIFY(maxrow2 == eigen_maxrow);
+  VERIFY(maxcol2 == eigen_maxcol);
+
+  if (!NumTraits<Scalar>::IsInteger && m.size() > 2) {
+    // Test NaN propagation by replacing an element with NaN.
+    bool stop = false;
+    for (Index j = 0; j < cols && !stop; ++j) {
+      for (Index i = 0; i < rows && !stop; ++i) {
+        if (!(j == mincol && i == minrow) &&
+            !(j == maxcol && i == maxrow)) {
+          m(i,j) = NumTraits<Scalar>::quiet_NaN();
+          stop = true;
+          break;
+        }
+      }
+    }
+
+    eigen_minc = m.template minCoeff<PropagateNumbers>(&eigen_minrow, &eigen_mincol);
+    eigen_maxc = m.template maxCoeff<PropagateNumbers>(&eigen_maxrow, &eigen_maxcol);
+    VERIFY(minrow == eigen_minrow);
+    VERIFY(maxrow == eigen_maxrow);
+    VERIFY(mincol == eigen_mincol);
+    VERIFY(maxcol == eigen_maxcol);
+    VERIFY_IS_APPROX(minc, eigen_minc);
+    VERIFY_IS_APPROX(maxc, eigen_maxc);
+    VERIFY_IS_APPROX(minc, m.template minCoeff<PropagateNumbers>());
+    VERIFY_IS_APPROX(maxc, m.template maxCoeff<PropagateNumbers>());
+
+    eigen_minc = m.template minCoeff<PropagateNaN>(&eigen_minrow, &eigen_mincol);
+    eigen_maxc = m.template maxCoeff<PropagateNaN>(&eigen_maxrow, &eigen_maxcol);
+    VERIFY(minrow != eigen_minrow || mincol != eigen_mincol);
+    VERIFY(maxrow != eigen_maxrow || maxcol != eigen_maxcol);
+    VERIFY((numext::isnan)(eigen_minc));
+    VERIFY((numext::isnan)(eigen_maxc));
+  }
+
+}
+
+template<typename VectorType> void vectorVisitor(const VectorType& w)
+{
+  typedef typename VectorType::Scalar Scalar;
+
+  Index size = w.size();
+
+  // construct a random vector where all coefficients are different
+  VectorType v;
+  v = VectorType::Random(size);
+  for(Index i = 0; i < size; i++)
+    for(Index i2 = 0; i2 < i; i2++)
+      while(v(i) == v(i2)) // yes, ==
+        v(i) = internal::random<Scalar>();
+  
+  Scalar minc = v(0), maxc = v(0);
+  Index minidx=0, maxidx=0;
+  for(Index i = 0; i < size; i++)
+  {
+    if(v(i) < minc)
+    {
+      minc = v(i);
+      minidx = i;
+    }
+    if(v(i) > maxc)
+    {
+      maxc = v(i);
+      maxidx = i;
+    }
+  }
+  Index eigen_minidx, eigen_maxidx;
+  Scalar eigen_minc, eigen_maxc;
+  eigen_minc = v.minCoeff(&eigen_minidx);
+  eigen_maxc = v.maxCoeff(&eigen_maxidx);
+  VERIFY(minidx == eigen_minidx);
+  VERIFY(maxidx == eigen_maxidx);
+  VERIFY_IS_APPROX(minc, eigen_minc);
+  VERIFY_IS_APPROX(maxc, eigen_maxc);
+  VERIFY_IS_APPROX(minc, v.minCoeff());
+  VERIFY_IS_APPROX(maxc, v.maxCoeff());
+  
+  Index idx0 = internal::random<Index>(0,size-1);
+  Index idx1 = eigen_minidx;
+  Index idx2 = eigen_maxidx;
+  VectorType v1(v), v2(v);
+  v1(idx0) = v1(idx1);
+  v2(idx0) = v2(idx2);
+  v1.minCoeff(&eigen_minidx);
+  v2.maxCoeff(&eigen_maxidx);
+  VERIFY(eigen_minidx == (std::min)(idx0,idx1));
+  VERIFY(eigen_maxidx == (std::min)(idx0,idx2));
+
+  if (!NumTraits<Scalar>::IsInteger && size > 2) {
+    // Test NaN propagation by replacing an element with NaN.
+    for (Index i = 0; i < size; ++i) {
+      if (i != minidx && i != maxidx) {
+        v(i) = NumTraits<Scalar>::quiet_NaN();
+        break;
+      }
+    }
+    eigen_minc = v.template minCoeff<PropagateNumbers>(&eigen_minidx);
+    eigen_maxc = v.template maxCoeff<PropagateNumbers>(&eigen_maxidx);
+    VERIFY(minidx == eigen_minidx);
+    VERIFY(maxidx == eigen_maxidx);
+    VERIFY_IS_APPROX(minc, eigen_minc);
+    VERIFY_IS_APPROX(maxc, eigen_maxc);
+    VERIFY_IS_APPROX(minc, v.template minCoeff<PropagateNumbers>());
+    VERIFY_IS_APPROX(maxc, v.template maxCoeff<PropagateNumbers>());
+
+    eigen_minc = v.template minCoeff<PropagateNaN>(&eigen_minidx);
+    eigen_maxc = v.template maxCoeff<PropagateNaN>(&eigen_maxidx);
+    VERIFY(minidx != eigen_minidx);
+    VERIFY(maxidx != eigen_maxidx);
+    VERIFY((numext::isnan)(eigen_minc));
+    VERIFY((numext::isnan)(eigen_maxc));
+  }
+}
+
+EIGEN_DECLARE_TEST(visitor)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( matrixVisitor(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( matrixVisitor(Matrix2f()) );
+    CALL_SUBTEST_3( matrixVisitor(Matrix4d()) );
+    CALL_SUBTEST_4( matrixVisitor(MatrixXd(8, 12)) );
+    CALL_SUBTEST_5( matrixVisitor(Matrix<double,Dynamic,Dynamic,RowMajor>(20, 20)) );
+    CALL_SUBTEST_6( matrixVisitor(MatrixXi(8, 12)) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_7( vectorVisitor(Vector4f()) );
+    CALL_SUBTEST_7( vectorVisitor(Matrix<int,12,1>()) );
+    CALL_SUBTEST_8( vectorVisitor(VectorXd(10)) );
+    CALL_SUBTEST_9( vectorVisitor(RowVectorXd(10)) );
+    CALL_SUBTEST_10( vectorVisitor(VectorXf(33)) );
+  }
+}

diff --git a/test/zerosized.cpp b/test/zerosized.cpp
new file mode 100644
index 0000000..07afd0f
--- /dev/null
+++ b/test/zerosized.cpp

@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+
+template<typename MatrixType> void zeroReduction(const MatrixType& m) {
+  // Reductions that must hold for zero sized objects
+  VERIFY(m.all());
+  VERIFY(!m.any());
+  VERIFY(m.prod()==1);
+  VERIFY(m.sum()==0);
+  VERIFY(m.norm()==0);
+  VERIFY(m.squaredNorm()==0);
+  VERIFY(m.count()==0);
+  VERIFY(m.allFinite());
+  VERIFY(!m.hasNaN());
+  VERIFY_RAISES_ASSERT( m.minCoeff() );
+  VERIFY_RAISES_ASSERT( m.maxCoeff() );
+  Index i,j;
+  VERIFY_RAISES_ASSERT( m.minCoeff(&i,&j) );
+  VERIFY_RAISES_ASSERT( m.maxCoeff(&i,&j) );
+  VERIFY_RAISES_ASSERT( m.reshaped().minCoeff(&i) );
+  VERIFY_RAISES_ASSERT( m.reshaped().maxCoeff(&i) );
+}
+
+
+template<typename MatrixType> void zeroSizedMatrix()
+{
+  MatrixType t1;
+  typedef typename MatrixType::Scalar Scalar;
+
+  if (MatrixType::SizeAtCompileTime == Dynamic || MatrixType::SizeAtCompileTime == 0)
+  {
+    zeroReduction(t1);
+    if (MatrixType::RowsAtCompileTime == Dynamic)
+      VERIFY(t1.rows() == 0);
+    if (MatrixType::ColsAtCompileTime == Dynamic)
+      VERIFY(t1.cols() == 0);
+
+    if (MatrixType::RowsAtCompileTime == Dynamic && MatrixType::ColsAtCompileTime == Dynamic)
+    {
+
+      MatrixType t2(0, 0), t3(t1);
+      VERIFY(t2.rows() == 0);
+      VERIFY(t2.cols() == 0);
+
+      zeroReduction(t2);
+      VERIFY(t1==t2);
+    }
+  }
+
+  if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0)
+  {
+    Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : Index(MatrixType::RowsAtCompileTime);
+    Index cols = MatrixType::ColsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : Index(MatrixType::ColsAtCompileTime);
+    MatrixType m(rows,cols);
+    zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols));
+    zeroReduction(m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0));
+    zeroReduction(m.template block<0,1>(0,0));
+    zeroReduction(m.template block<1,0>(0,0));
+    Matrix<Scalar,Dynamic,Dynamic> prod = m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0) * m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols);
+    VERIFY(prod.rows()==rows && prod.cols()==cols);
+    VERIFY(prod.isZero());
+    prod = m.template block<1,0>(0,0) * m.template block<0,1>(0,0);
+    VERIFY(prod.size()==1);
+    VERIFY(prod.isZero());
+  }
+}
+
+template<typename VectorType> void zeroSizedVector()
+{
+  VectorType t1;
+
+  if (VectorType::SizeAtCompileTime == Dynamic || VectorType::SizeAtCompileTime==0)
+  {
+    zeroReduction(t1);
+    VERIFY(t1.size() == 0);
+    VectorType t2(DenseIndex(0)); // DenseIndex disambiguates with 0-the-null-pointer (error with gcc 4.4 and MSVC8)
+    VERIFY(t2.size() == 0);
+    zeroReduction(t2);
+
+    VERIFY(t1==t2);
+  }
+}
+
+EIGEN_DECLARE_TEST(zerosized)
+{
+  zeroSizedMatrix<Matrix2d>();
+  zeroSizedMatrix<Matrix3i>();
+  zeroSizedMatrix<Matrix<float, 2, Dynamic> >();
+  zeroSizedMatrix<MatrixXf>();
+  zeroSizedMatrix<Matrix<float, 0, 0> >();
+  zeroSizedMatrix<Matrix<float, Dynamic, 0, 0, 0, 0> >();
+  zeroSizedMatrix<Matrix<float, 0, Dynamic, 0, 0, 0> >();
+  zeroSizedMatrix<Matrix<float, Dynamic, Dynamic, 0, 0, 0> >();
+  zeroSizedMatrix<Matrix<float, 0, 4> >();
+  zeroSizedMatrix<Matrix<float, 4, 0> >();
+
+  zeroSizedVector<Vector2d>();
+  zeroSizedVector<Vector3i>();
+  zeroSizedVector<VectorXf>();
+  zeroSizedVector<Matrix<float, 0, 1> >();
+  zeroSizedVector<Matrix<float, 1, 0> >();
+}

diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt
new file mode 100644
index 0000000..34408c0
--- /dev/null
+++ b/unsupported/CMakeLists.txt

@@ -0,0 +1,11 @@
+add_subdirectory(Eigen)
+if(EIGEN_BUILD_DOC)
+  add_subdirectory(doc EXCLUDE_FROM_ALL)
+endif()
+if(BUILD_TESTING)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(test EXCLUDE_FROM_ALL)
+  endif()
+endif()

diff --git a/unsupported/Eigen/AdolcForward b/unsupported/Eigen/AdolcForward
new file mode 100644
index 0000000..56caeae
--- /dev/null
+++ b/unsupported/Eigen/AdolcForward

@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ADLOC_FORWARD
+#define EIGEN_ADLOC_FORWARD
+
+//--------------------------------------------------------------------------------
+//
+// This file provides support for adolc's adouble type in forward mode.
+// ADOL-C is a C++ automatic differentiation library,
+// see https://projects.coin-or.org/ADOL-C for more information.
+//
+// Note that the maximal number of directions is controlled by
+// the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+//
+//--------------------------------------------------------------------------------
+
+#define ADOLC_TAPELESS
+#ifndef NUMBER_DIRECTIONS
+# define NUMBER_DIRECTIONS 2
+#endif
+#include <adolc/adtl.h>
+
+// adolc defines some very stupid macros:
+#if defined(malloc)
+# undef malloc
+#endif
+
+#if defined(calloc)
+# undef calloc
+#endif
+
+#if defined(realloc)
+# undef realloc
+#endif
+
+#include "../../Eigen/Core"
+
+namespace Eigen {
+
+/**
+  * \defgroup AdolcForward_Module Adolc forward module
+  * This module provides support for adolc's adouble type in forward mode.
+  * ADOL-C is a C++ automatic differentiation library,
+  * see https://projects.coin-or.org/ADOL-C for more information.
+  * It mainly consists in:
+  *  - a struct Eigen::NumTraits<adtl::adouble> specialization
+  *  - overloads of internal::* math function for adtl::adouble type.
+  *
+  * Note that the maximal number of directions is controlled by
+  * the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+  *
+  * \code
+  * #include <unsupported/Eigen/AdolcSupport>
+  * \endcode
+  */
+  //@{
+
+} // namespace Eigen
+
+// Eigen's require a few additional functions which must be defined in the same namespace
+// than the custom scalar type own namespace
+namespace adtl {
+
+inline const adouble& conj(const adouble& x)  { return x; }
+inline const adouble& real(const adouble& x)  { return x; }
+inline adouble imag(const adouble&)    { return 0.; }
+inline adouble abs(const adouble&  x)  { return fabs(x); }
+inline adouble abs2(const adouble& x)  { return x*x; }
+
+inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); }
+inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); }
+
+}
+
+namespace Eigen {
+
+template<> struct NumTraits<adtl::adouble>
+    : NumTraits<double>
+{
+  typedef adtl::adouble Real;
+  typedef adtl::adouble NonInteger;
+  typedef adtl::adouble Nested;
+  enum {
+    IsComplex = 0,
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 1,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+};
+
+template<typename Functor> class AdolcForwardJacobian : public Functor
+{
+  typedef adtl::adouble ActiveScalar;
+public:
+
+  AdolcForwardJacobian() : Functor() {}
+  AdolcForwardJacobian(const Functor& f) : Functor(f) {}
+
+  // forward constructors
+  template<typename T0>
+  AdolcForwardJacobian(const T0& a0) : Functor(a0) {}
+  template<typename T0, typename T1>
+  AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
+  template<typename T0, typename T1, typename T2>
+  AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {}
+
+  typedef typename Functor::InputType InputType;
+  typedef typename Functor::ValueType ValueType;
+  typedef typename Functor::JacobianType JacobianType;
+
+  typedef Matrix<ActiveScalar, InputType::SizeAtCompileTime, 1> ActiveInput;
+  typedef Matrix<ActiveScalar, ValueType::SizeAtCompileTime, 1> ActiveValue;
+
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const
+  {
+    eigen_assert(v!=0);
+    if (!_jac)
+    {
+      Functor::operator()(x, v);
+      return;
+    }
+
+    JacobianType& jac = *_jac;
+
+    ActiveInput ax = x.template cast<ActiveScalar>();
+    ActiveValue av(jac.rows());
+
+    for (int j=0; j<jac.cols(); j++)
+      for (int i=0; i<jac.cols(); i++)
+        ax[i].setADValue(j, i==j ? 1 : 0);
+
+    Functor::operator()(ax, &av);
+
+    for (int i=0; i<jac.rows(); i++)
+    {
+      (*v)[i] = av[i].getValue();
+      for (int j=0; j<jac.cols(); j++)
+        jac.coeffRef(i,j) = av[i].getADValue(j);
+    }
+  }
+protected:
+
+};
+
+//@}
+
+}
+
+#endif // EIGEN_ADLOC_FORWARD

diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3
new file mode 100644
index 0000000..4fa1842
--- /dev/null
+++ b/unsupported/Eigen/AlignedVector3

@@ -0,0 +1,234 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ALIGNED_VECTOR3
+#define EIGEN_ALIGNED_VECTOR3
+
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup AlignedVector3_Module Aligned vector3 module
+  *
+  * \code
+  * #include <unsupported/Eigen/AlignedVector3>
+  * \endcode
+  */
+  //@{
+
+
+/** \class AlignedVector3
+  *
+  * \brief A vectorization friendly 3D vector
+  *
+  * This class represents a 3D vector internally using a 4D vector
+  * such that vectorization can be seamlessly enabled. Of course,
+  * the same result can be achieved by directly using a 4D vector.
+  * This class makes this process simpler.
+  *
+  */
+// TODO specialize Cwise
+template<typename _Scalar> class AlignedVector3;
+
+namespace internal {
+template<typename _Scalar> struct traits<AlignedVector3<_Scalar> >
+  : traits<Matrix<_Scalar,3,1,0,4,1> >
+{
+};
+}
+
+template<typename _Scalar> class AlignedVector3
+  : public MatrixBase<AlignedVector3<_Scalar> >
+{
+    typedef Matrix<_Scalar,4,1> CoeffType;
+    CoeffType m_coeffs;
+  public:
+
+    typedef MatrixBase<AlignedVector3<_Scalar> > Base;	
+    EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
+    using Base::operator*;
+
+    inline Index rows() const { return 3; }
+    inline Index cols() const { return 1; }
+    
+    Scalar* data() { return m_coeffs.data(); }
+    const Scalar* data() const { return m_coeffs.data(); }
+    Index innerStride() const { return 1; }
+    Index outerStride() const { return 3; }
+
+    inline const Scalar& coeff(Index row, Index col) const
+    { return m_coeffs.coeff(row, col); }
+
+    inline Scalar& coeffRef(Index row, Index col)
+    { return m_coeffs.coeffRef(row, col); }
+
+    inline const Scalar& coeff(Index index) const
+    { return m_coeffs.coeff(index); }
+
+    inline Scalar& coeffRef(Index index)
+    { return m_coeffs.coeffRef(index);}
+
+
+    inline AlignedVector3()
+    {}
+
+    inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
+      : m_coeffs(x, y, z, Scalar(0))
+    {}
+
+    inline AlignedVector3(const AlignedVector3& other)
+      : Base(), m_coeffs(other.m_coeffs)
+    {}
+
+    template<typename XprType, int Size=XprType::SizeAtCompileTime>
+    struct generic_assign_selector {};
+
+    template<typename XprType> struct generic_assign_selector<XprType,4>
+    {
+      inline static void run(AlignedVector3& dest, const XprType& src)
+      {
+        dest.m_coeffs = src;
+      }
+    };
+
+    template<typename XprType> struct generic_assign_selector<XprType,3>
+    {
+      inline static void run(AlignedVector3& dest, const XprType& src)
+      {
+        dest.m_coeffs.template head<3>() = src;
+        dest.m_coeffs.w() = Scalar(0);
+      }
+    };
+
+    template<typename Derived>
+    inline AlignedVector3(const MatrixBase<Derived>& other)
+    {
+      generic_assign_selector<Derived>::run(*this,other.derived());
+    }
+
+    inline AlignedVector3& operator=(const AlignedVector3& other)
+    { m_coeffs = other.m_coeffs; return *this; }
+
+    template <typename Derived>
+    inline AlignedVector3& operator=(const MatrixBase<Derived>& other)
+    {
+      generic_assign_selector<Derived>::run(*this,other.derived());
+      return *this;
+    }
+
+    inline AlignedVector3 operator+(const AlignedVector3& other) const
+    { return AlignedVector3(m_coeffs + other.m_coeffs); }
+
+    inline AlignedVector3& operator+=(const AlignedVector3& other)
+    { m_coeffs += other.m_coeffs; return *this; }
+
+    inline AlignedVector3 operator-(const AlignedVector3& other) const
+    { return AlignedVector3(m_coeffs - other.m_coeffs); }
+
+    inline AlignedVector3 operator-() const
+    { return AlignedVector3(-m_coeffs); }
+
+    inline AlignedVector3 operator-=(const AlignedVector3& other)
+    { m_coeffs -= other.m_coeffs; return *this; }
+
+    inline AlignedVector3 operator*(const Scalar& s) const
+    { return AlignedVector3(m_coeffs * s); }
+
+    inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec)
+    { return AlignedVector3(s * vec.m_coeffs); }
+
+    inline AlignedVector3& operator*=(const Scalar& s)
+    { m_coeffs *= s; return *this; }
+
+    inline AlignedVector3 operator/(const Scalar& s) const
+    { return AlignedVector3(m_coeffs / s); }
+
+    inline AlignedVector3& operator/=(const Scalar& s)
+    { m_coeffs /= s; return *this; }
+
+    inline Scalar dot(const AlignedVector3& other) const
+    {
+      eigen_assert(m_coeffs.w()==Scalar(0));
+      eigen_assert(other.m_coeffs.w()==Scalar(0));
+      return m_coeffs.dot(other.m_coeffs);
+    }
+
+    inline void normalize()
+    {
+      m_coeffs /= norm();
+    }
+
+    inline AlignedVector3 normalized() const
+    {
+      return AlignedVector3(m_coeffs / norm());
+    }
+
+    inline Scalar sum() const
+    {
+      eigen_assert(m_coeffs.w()==Scalar(0));
+      return m_coeffs.sum();
+    }
+
+    inline Scalar squaredNorm() const
+    {
+      eigen_assert(m_coeffs.w()==Scalar(0));
+      return m_coeffs.squaredNorm();
+    }
+
+    inline Scalar norm() const
+    {
+      using std::sqrt;
+      return sqrt(squaredNorm());
+    }
+
+    inline AlignedVector3 cross(const AlignedVector3& other) const
+    {
+      return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
+    }
+
+    template<typename Derived>
+    inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
+    {
+      return m_coeffs.template head<3>().isApprox(other,eps);
+    }
+    
+    CoeffType& coeffs() { return m_coeffs; }
+    const CoeffType& coeffs() const { return m_coeffs; }
+};
+
+namespace internal {
+
+template<typename _Scalar>
+struct eval<AlignedVector3<_Scalar>, Dense>
+{
+ typedef const AlignedVector3<_Scalar>& type;
+};
+
+template<typename Scalar>
+struct evaluator<AlignedVector3<Scalar> >
+  : evaluator<Matrix<Scalar,4,1> >
+{
+  typedef AlignedVector3<Scalar> XprType;
+  typedef evaluator<Matrix<Scalar,4,1> > Base;
+  
+  evaluator(const XprType &m) : Base(m.coeffs()) {}  
+};
+
+}
+
+//@}
+
+}
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ALIGNED_VECTOR3

diff --git a/unsupported/Eigen/ArpackSupport b/unsupported/Eigen/ArpackSupport
new file mode 100644
index 0000000..67c4ac8
--- /dev/null
+++ b/unsupported/Eigen/ArpackSupport

@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARPACKSUPPORT_MODULE_H
+#define EIGEN_ARPACKSUPPORT_MODULE_H
+
+#include "../../Eigen/Core"
+
+/** \defgroup ArpackSupport_Module Arpack support module
+  *
+  * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition.
+  *
+  * \code
+  * #include <Eigen/ArpackSupport>
+  * \endcode
+  */
+
+#include "../../Eigen/SparseCholesky"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ARPACKSUPPORT_MODULE_H

diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
new file mode 100644
index 0000000..7a4ff46
--- /dev/null
+++ b/unsupported/Eigen/AutoDiff

@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_MODULE
+#define EIGEN_AUTODIFF_MODULE
+
+namespace Eigen {
+
+/**
+  * \defgroup AutoDiff_Module Auto Diff module
+  *
+  * This module features forward automatic differentation via a simple
+  * templated scalar type wrapper AutoDiffScalar.
+  *
+  * Warning : this should NOT be confused with numerical differentiation, which
+  * is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
+  *
+  * \code
+  * #include <unsupported/Eigen/AutoDiff>
+  * \endcode
+  */
+//@{
+
+}
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+
+#include "src/AutoDiff/AutoDiffScalar.h"
+// #include "src/AutoDiff/AutoDiffVector.h"
+#include "src/AutoDiff/AutoDiffJacobian.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+
+namespace Eigen {
+//@}
+}
+
+#endif // EIGEN_AUTODIFF_MODULE

diff --git a/unsupported/Eigen/BVH b/unsupported/Eigen/BVH
new file mode 100644
index 0000000..666c983
--- /dev/null
+++ b/unsupported/Eigen/BVH

@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BVH_MODULE_H
+#define EIGEN_BVH_MODULE_H
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+#include "../../Eigen/StdVector"
+#include <algorithm>
+#include <queue>
+
+namespace Eigen {
+
+/**
+  * \defgroup BVH_Module BVH module
+  * \brief This module provides generic bounding volume hierarchy algorithms
+  * and reference tree implementations.
+  *
+  *
+  * \code
+  * #include <unsupported/Eigen/BVH>
+  * \endcode
+  *
+  * A bounding volume hierarchy (BVH) can accelerate many geometric queries.  This module provides a generic implementation
+  * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization
+  * of a function over the objects in the hierarchy.  It also provides intersection and minimization over a cartesian product of
+  * two BVH's.  A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot
+  * intersect any object contained in that volume.  Similarly, a BVH accelerates minimization because the minimum of a function
+  * over a volume is no greater than the minimum of a function over any object contained in it.
+  *
+  * Some sample queries that can be written in terms of intersection are:
+  *   - Determine all points where a ray intersects a triangle mesh
+  *   - Given a set of points, determine which are contained in a query sphere
+  *   - Given a set of spheres, determine which contain the query point
+  *   - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$
+  *     in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction)
+  *   - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set
+  *     of points with itself)
+  *
+  * Some sample queries that can be written in terms of function minimization over a set of objects are:
+  *   - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray)
+  *   - Given a polyline and a query point, determine the closest point on the polyline to the query
+  *   - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function)
+  *   - Determine how far two meshes are from colliding (this is also a cartesian product query)
+  *
+  * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and
+  * from the particulars of the query.  To enable abstraction from the BVH, the BVH is required to implement a generic mechanism
+  * for traversal.  To abstract from the query, the query is responsible for keeping track of results.
+  *
+  * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code
+      typedef Volume  //the type of bounding volume
+      typedef Object  //the type of object in the hierarchy
+      typedef Index   //a reference to a node in the hierarchy--typically an int or a pointer
+      typedef VolumeIterator //an iterator type over node children--returns Index
+      typedef ObjectIterator //an iterator over object (leaf) children--returns const Object &
+      Index getRootIndex() const //returns the index of the hierarchy root
+      const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index
+      void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+                      ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+      //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children
+      //and [outOBegin, outOEnd) range over its object children
+    \endcode
+  *
+  * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector.
+  * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions:
+  * \code
+      bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume
+      bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately
+    \endcode
+  * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume
+  * intersects the query (but possibly on other objects too) unless the search is terminated prematurely.  It is the
+  * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate.
+  * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation.
+  *
+  * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair:
+  * \include BVH_Example.cpp
+  * Output: \verbinclude BVH_Example.out
+  */
+}
+
+//@{
+
+#include "src/BVH/BVAlgorithms.h"
+#include "src/BVH/KdBVH.h"
+
+//@}
+
+#endif // EIGEN_BVH_MODULE_H

diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
new file mode 100644
index 0000000..631a060
--- /dev/null
+++ b/unsupported/Eigen/CMakeLists.txt

@@ -0,0 +1,32 @@
+set(Eigen_HEADERS 
+  AdolcForward
+  AlignedVector3
+  ArpackSupport
+  AutoDiff
+  BVH
+  EulerAngles
+  FFT
+  IterativeSolvers 
+  KroneckerProduct
+  LevenbergMarquardt
+  MatrixFunctions 
+  MoreVectorization
+  MPRealSupport
+  NonLinearOptimization
+  NumericalDiff
+  OpenGLSupport
+  Polynomials
+  Skyline 
+  SparseExtra
+  SpecialFunctions
+  Splines
+  )
+
+install(FILES
+  ${Eigen_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
+  )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)

diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
new file mode 100644
index 0000000..385ed24
--- /dev/null
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt

@@ -0,0 +1,8 @@
+set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
+
+install(FILES
+  ${Eigen_CXX11_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
+  )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 42cc240..0938bb5 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor

@@ -1,20 +1,25 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
 // Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_MODULE
-#define EIGEN_CXX11_TENSOR_MODULE
+//#ifndef EIGEN_CXX11_TENSOR_MODULE
+//#define EIGEN_CXX11_TENSOR_MODULE
 
-#include "Eigen/src/Core/util/StaticAssert.h"
-#include "unsupported/Eigen/CXX11/Core"
-#include "unsupported/Eigen/SpecialFunctions"
+#include "../../../Eigen/Core"
 
-#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+#if EIGEN_HAS_CXX11
+
+#include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
 
 /** \defgroup CXX11_Tensor_Module Tensor Module
   *
@@ -24,143 +29,109 @@
   * \code
   * #include <Eigen/CXX11/Tensor>
   * \endcode
+  *
+  * Much of the documentation can be found \ref eigen_tensors "here".
   */
 
+#include <atomic>
+#include <chrono>
+#include <cmath>
 #include <cstddef>
 #include <cstring>
-#include <cmath>
-#if !defined HEXAGON
-using ::std::isfinite;
-using ::std::fpclassify;
-#endif
-#if defined(ANDROID) || defined(HEXAGON)
-#include <math.h>
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#endif
-#include <stdint.h>
-
-#if __cplusplus > 199711
 #include <random>
-#endif
-
-#ifdef EIGEN_USE_THREADS
-#include <atomic>
-#if defined(EIGEN_USE_CUSTOM_THREAD_POOL)
-// Use standard C++ synchronization primitives.
-#include <condition_variable>
-#include <mutex>
 #include <thread>
-#else
-// Use tensorflow synchronization primitives.
-#include "third_party/tensorflow/core/platform/types.h"
-#include "third_party/tensorflow/core/platform/mutex.h"
-#endif  // EIGEN_USE_CUSTOM_THREAD_POOL
 
-#include <functional>
-using ::std::binary_function;
-using ::std::equal_to;
-using ::std::greater;
-
-#endif  // EIGEN_USE_THREADS
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
+#include "ThreadPool"
+#endif
 
 #ifdef EIGEN_USE_GPU
-#include "third_party/tensorflow/core/platform/types.h"
-#include "third_party/tensorflow/core/platform/mutex.h"
-#include <cuda.h>
-#include <cufft.h>
-#include <cuda_runtime.h>
-#endif  // EIGEN_USE_GPU
-
-#ifdef _WIN32
-#include <winbase.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
+  #include <iostream>
+  #if defined(EIGEN_USE_HIP)
+    #include <hip/hip_runtime.h>
+  #else
+    #include <cuda_runtime.h>
+  #endif
 #endif
 
-#if defined(EIGEN_USE_LIBXSMM)
-#include "third_party/libxsmm/include/libxsmm.h"
+#include "src/Tensor/TensorMacros.h"
+#include "src/Tensor/TensorForwardDeclarations.h"
+#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+#include "src/Tensor/TensorDeviceSycl.h"
+#include "src/Tensor/TensorIndexList.h"
+#include "src/Tensor/TensorDimensionList.h"
+#include "src/Tensor/TensorDimensions.h"
+#include "src/Tensor/TensorInitializer.h"
+#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
+#include "src/Tensor/TensorUInt128.h"
+#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
+
+#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
+
+#include "src/Tensor/TensorEvaluator.h"
+#include "src/Tensor/TensorExpr.h"
+#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionGpu.h"
+#include "src/Tensor/TensorArgMax.h"
+#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionBlocking.h"
+#include "src/Tensor/TensorContraction.h"
+#include "src/Tensor/TensorContractionThreadPool.h"
+#include "src/Tensor/TensorContractionGpu.h"
+#include "src/Tensor/TensorConversion.h"
+#include "src/Tensor/TensorConvolution.h"
+#include "src/Tensor/TensorFFT.h"
+#include "src/Tensor/TensorPatch.h"
+#include "src/Tensor/TensorImagePatch.h"
+#include "src/Tensor/TensorVolumePatch.h"
+#include "src/Tensor/TensorBroadcasting.h"
+#include "src/Tensor/TensorChipping.h"
+#include "src/Tensor/TensorInflation.h"
+#include "src/Tensor/TensorLayoutSwap.h"
+#include "src/Tensor/TensorMorphing.h"
+#include "src/Tensor/TensorPadding.h"
+#include "src/Tensor/TensorReverse.h"
+#include "src/Tensor/TensorShuffling.h"
+#include "src/Tensor/TensorStriding.h"
+#include "src/Tensor/TensorCustomOp.h"
+#include "src/Tensor/TensorEvalTo.h"
+#include "src/Tensor/TensorForcedEval.h"
+#include "src/Tensor/TensorGenerator.h"
+#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
 #endif
 
-#include "Eigen/Core"
+#include "src/Tensor/TensorExecutor.h"
+#include "src/Tensor/TensorDevice.h"
 
-// Beware: the order of the include matters to some compilers. For example
-// TensorIndexList.h should be included before TensorDimensions.h in order to
-// use index lists to encode tensor dimensions when compiling with llvm.
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
-#if defined(EIGEN_USE_THREADS)
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h"
-#endif
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
+#include "src/Tensor/TensorStorage.h"
+#include "src/Tensor/Tensor.h"
+#include "src/Tensor/TensorFixedSize.h"
+#include "src/Tensor/TensorMap.h"
+#include "src/Tensor/TensorRef.h"
 
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorStats.h"
+#include "src/Tensor/TensorIO.h"
 
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
-
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
-
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
-
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
-
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
-
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
-
-#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_CXX11_TENSOR_MODULE
+#endif  // EIGEN_HAS_CXX11
+//#endif // EIGEN_CXX11_TENSOR_MODULE

diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry
index 027c608..b09c5e4 100644
--- a/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/unsupported/Eigen/CXX11/TensorSymmetry

@@ -10,9 +10,11 @@
 #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
 #define EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
-#include <Eigen/CXX11/Tensor>
+#include "Tensor"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "src/util/CXX11Meta.h"
 
 /** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
   *
@@ -31,7 +33,7 @@
 #include "src/TensorSymmetry/StaticSymmetry.h"
 #include "src/TensorSymmetry/DynamicSymmetry.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
 

diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 73571dd..c5cafb2 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool

@@ -12,7 +12,7 @@
 
 #include "../../../Eigen/Core"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
   *
@@ -30,55 +30,45 @@
 
 // The code depends on CXX11, so only include the module if the
 // compiler supports it.
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#if (EIGEN_COMP_CXXVER >= 11)
 #include <cstddef>
 #include <cstring>
-#include <stdint.h>
 #include <time.h>
 
 #include <vector>
-using ::std::vector;
 #include <atomic>
-#include <deque>
-using ::std::deque;
-#include <functional>
-using ::std::binary_function;
-using ::std::equal_to;
-using ::std::greater;
-#include <memory>
-using ::std::allocator;
-
-#if defined(EIGEN_USE_CUSTOM_THREAD_POOL)
-// Use standard C++ synchronization primitives.
 #include <condition_variable>
+#include <deque>
 #include <mutex>
 #include <thread>
-#else
-// Use tensorflow synchronization primitives.
-#include "third_party/tensorflow/core/platform/types.h"
-#include "third_party/tensorflow/core/platform/mutex.h"
-#endif  // EIGEN_USE_CUSTOM_THREAD_POOL
+#include <functional>
+#include <memory>
+#include <utility>
 
-
-#ifdef EIGEN_USE_CUSTOM_THREAD_POOL
-typedef std::mutex mutex;
-typedef std::condition_variable condition_variable;
-typedef std::unique_lock<std::mutex> mutex_lock;
-#else
-typedef tensorflow::mutex mutex;
-typedef tensorflow::condition_variable condition_variable;
-typedef tensorflow::mutex_lock mutex_lock;
+// There are non-parenthesized calls to "max" in the  <unordered_map> header,
+// which trigger a check in test/main.h causing compilation to fail.
+// We work around the check here by removing the check for max in
+// the case where we have to emulate thread_local.
+#ifdef max
+#undef max
 #endif
+#include <unordered_map>
 
-#include "unsupported/Eigen/CXX11/Core"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/ThreadCancel.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/Barrier.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
 
 #endif
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CXX11_THREADPOOL_MODULE

diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
new file mode 100644
index 0000000..2f65b1b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md

@@ -0,0 +1,1815 @@
+# Eigen Tensors {#eigen_tensors}
+
+Tensors are multidimensional arrays of elements. Elements are typically scalars,
+but more complex types such as strings are also supported.
+
+## Tensor Classes
+
+You can manipulate a tensor with one of the following classes.  They all are in
+the namespace `::Eigen.`
+
+
+### Class Tensor<data_type, rank>
+
+This is the class to use to create a tensor and allocate memory for it.  The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank.  The rank is the number of dimensions, for example rank 2 is a
+matrix.
+
+Tensors of this class are resizable.  For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
+
+Constructor for a Tensor.  The constructor must be passed `rank` integers
+indicating the sizes of the instance along each of the the `rank`
+dimensions.
+
+    // Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
+    // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+    Tensor<float, 3> t_3d(2, 3, 4);
+
+    // Resize t_3d by assigning a tensor of different sizes, but same rank.
+    t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicitly list of parameters.  The array type to use is
+`Eigen::array<Eigen::Index>`.  The array can be constructed automatically
+from an initializer list.
+
+    // Create a tensor of strings of rank 2 with sizes 5, 7.
+    Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time.  Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler.  FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough the
+tensor data is held onto the stack and does not cause heap allocation and free.
+
+    // Create a 4 x 3 tensor of floats.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code.  It allows to view any piece of allocated
+memory as a Tensor.  Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a Tensor.  The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes.  The storage has to be
+large enough to hold all the data.
+
+    // Map a tensor of ints on top of stack-allocated storage.
+    int storage[128];  // 2 x 4 x 2 x 8 = 128
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+    // The same storage can be viewed as a different tensor.
+    // You can also pass the sizes as an array.
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+    // You can also map fixed-size tensors.  Here we get a 1d view of
+    // the 2d fixed-size tensor.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
+
+
+#### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position `(index0, index1...)` in tensor
+`tensor`.  You must pass as many parameters as the rank of `tensor`.
+The expression can be used as an l-value to set the value of the element at the
+specified position.  The value returned is of the datatype of the tensor.
+
+    // Set the value of the element at position (0, 1, 0);
+    Tensor<float, 3> t_3d(2, 3, 4);
+    t_3d(0, 1, 0) = 12.0f;
+
+    // Initialize all elements to random values.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          t_3d(i, j, k) = ...some random value...;
+        }
+      }
+    }
+
+    // Print elements of a tensor.
+    for (int i = 0; i < 2; ++i) {
+      LOG(INFO) << t_3d(i, 0, 0);
+    }
+
+
+## TensorLayout
+
+The tensor library supports 2 layouts: `ColMajor` (the default) and
+`RowMajor`.  Only the default column major layout is currently fully
+supported, and it is therefore not recommended to attempt to use the row major
+layout at the moment.
+
+The layout of a tensor is optionally specified as part of its type. If not
+specified explicitly column major is assumed.
+
+    Tensor<float, 3, ColMajor> col_major;  // equivalent to Tensor<float, 3>
+    TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...);
+
+All the arguments to an expression must use the same layout. Attempting to mix
+different layouts will result in a compilation error.
+
+It is possible to change the layout of a tensor or an expression using the
+`swap_layout()` method.  Note that this will also reverse the order of the
+dimensions.
+
+    Tensor<float, 2, ColMajor> col_major(2, 4);
+    Tensor<float, 2, RowMajor> row_major(2, 4);
+
+    Tensor<float, 2> col_major_result = col_major;  // ok, layouts match
+    Tensor<float, 2> col_major_result = row_major;  // will not compile
+
+    // Simple layout swap
+    col_major_result = row_major.swap_layout();
+    eigen_assert(col_major_result.dimension(0) == 4);
+    eigen_assert(col_major_result.dimension(1) == 2);
+
+    // Swap the layout and preserve the order of the dimensions
+    array<int, 2> shuffle(1, 0);
+    col_major_result = row_major.swap_layout().shuffle(shuffle);
+    eigen_assert(col_major_result.dimension(0) == 2);
+    eigen_assert(col_major_result.dimension(1) == 4);
+
+
+## Tensor Operations
+
+The Eigen Tensor library provides a vast library of operations on Tensors:
+numerical operations such as addition and multiplication, geometry operations
+such as slicing and shuffling, etc.  These operations are available as methods
+of the Tensor classes, and in some cases as operator overloads.  For example
+the following code computes the elementwise addition of two tensors:
+
+    Tensor<float, 3> t1(2, 3, 4);
+    ...set some values in t1...
+    Tensor<float, 3> t2(2, 3, 4);
+    ...set some values in t2...
+    // Set t3 to the element wise sum of t1 and t2
+    Tensor<float, 3> t3 = t1 + t2;
+
+While the code above looks easy enough, it is important to understand that the
+expression `t1 + t2` is not actually adding the values of the tensors.  The
+expression instead constructs a "tensor operator" object of the class
+TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors
+`t1` and `t2`.  This is a small C++ object that knows how to add
+`t1` and `t2`.  It is only when the value of the expression is assigned
+to the tensor `t3` that the addition is actually performed.  Technically,
+this happens through the overloading of `operator=()` in the Tensor class.
+
+This mechanism for computing tensor expressions allows for lazy evaluation and
+optimizations which are what make the tensor library very fast.
+
+Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f`
+is actually represented with the (approximate) tree of operators:
+
+    TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
+
+
+### Tensor Operations and C++ "auto"
+
+Because Tensor operations create tensor operators, the C++ `auto` keyword
+does not have its intuitive meaning.  Consider these 2 lines of code:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    auto t4 = t1 + t2;
+
+In the first line we allocate the tensor `t3` and it will contain the
+result of the addition of `t1` and `t2`.  In the second line, `t4`
+is actually the tree of tensor operators that will compute the addition of
+`t1` and `t2`.  In fact, `t4` is *not* a tensor and you cannot get
+the values of its elements:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    cout << t3(0, 0, 0);  // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+    auto t4 = t1 + t2;
+    cout << t4(0, 0, 0);  // Compilation error!
+
+When you use `auto` you do not get a Tensor as a result but instead a
+non-evaluated expression.  So only use `auto` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them.  This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory.  All the following will work:
+
+    auto t4 = t1 + t2;
+
+    Tensor<float, 3> result = t4;  // Could also be: result(t4);
+    cout << result(0, 0, 0);
+
+    TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4;
+    cout << result(0, 0, 0);
+
+    TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+    cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations.  As long as you keep the expression as an
+operation, no computation is performed.
+
+    // One way to compute exp((t1 + t2) * 0.2f);
+    auto t3 = t1 + t2;
+    auto t4 = t3 * 0.2f;
+    auto t5 = t4.exp();
+    Tensor<float, 3> result = t5;
+
+    // Another way, exactly as efficient as the previous one:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expression are Evaluated
+
+There are several ways to control when expressions are evaluated:
+
+*   Assignment to a Tensor, TensorFixedSize, or TensorMap.
+*   Use of the eval() method.
+*   Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor.  In
+the example below, the `auto` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor `result` causes the evaluation of all the
+operations.
+
+    auto t3 = t1 + t2;             // t3 is an Operation.
+    auto t4 = t3 * 0.2f;           // t4 is an Operation.
+    auto t5 = t4.exp();            // t5 is an Operation.
+    Tensor<float, 3> result = t5;  // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+    // We know that the result is a 4x4x2 tensor!
+    TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
+
+Simiarly, assigning an expression to a TensorMap causes its evaluation.  Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that are assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time.  This is done by inserting a call to the `eval()` method of the
+expression Operation.
+
+    // The previous example could have been written:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+    // If you want to compute (t1 + t2) once ahead of time you can write:
+    Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling `eval()` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size.  The code above in
+effect does:
+
+    // .eval() knows the size!
+    TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
+    Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of `eval()` is itself an Operation, so the
+following code does not do what you may think:
+
+    // Here t3 is an evaluation Operation.  t3 has not been evaluated yet.
+    auto t3 = (t1 + t2).eval();
+
+    // You can use t3 in another expression.  Still no evaluation.
+    auto t4 = (t3 * 0.2f).exp();
+
+    // The value is evaluated when you assign the Operation to a Tensor, using
+    // an intermediate tensor to represent t3.x
+    Tensor<float, 3> result = t4;
+
+While in the examples above calling `eval()` does not make a difference in
+performance, in other cases it can make a huge difference.  In the expression
+below the `broadcast()` expression causes the `X.maximum()` expression
+to be evaluated many times:
+
+    Tensor<...> X ...;
+    Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+                     * beta).exp();
+
+Inserting a call to `eval()` between the `maximum()` and
+`reshape()` calls guarantees that maximum() is only computed once and
+greatly speeds-up execution:
+
+    Tensor<...> Y =
+      ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+        * beta).exp();
+
+In the other example below, the tensor `Y` is both used in the expression
+and its assignment.  This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+     Tensor<...> Y ...;
+     Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to `eval()` between the `sum()` and `reshape()`
+expressions ensures that the sum is computed before any updates to `Y` are
+done.
+
+     Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of `Y`
+then you would need to force an eval for correctness by adding an `eval()`
+call for the right hand side:
+
+     Y.shuffle(...) =
+        (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation.  It provides
+overloads for the `()` operator that let you access individual values in
+the expression.  TensorRef is convenient, because the Operation themselves do
+not provide a way to access individual elements.
+
+    // Create a TensorRef for the expression.  The expression is not
+    // evaluated yet.
+    TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+    // Use "ref" to access individual elements.  The expression is evaluated
+    // on the fly.
+    float at_0 = ref(0, 0, 0);
+    cout << ref(0, 1, 0);
+
+Only use TensorRef when you need a subset of the values of the expression.
+TensorRef only computes the values you access.  However note that if you are
+going to access all the values it will be much faster to materialize the
+results in a Tensor first.
+
+In some cases, if the full Tensor result would be very large, you may save
+memory by accessing it as a TensorRef.  But not always.  So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions.  The implementations are optimized for
+different environments: single threaded on CPU, multi threaded on CPU, or on a
+GPU using cuda.  Additional implementations may be added later.
+
+You can choose which implementation to use with the `device()` call.  If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions.  Work is ongoing to tune the
+library on ARM CPUs.  Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+    Tensor<float, 2> a(30, 40);
+    Tensor<float, 2> b(30, 40);
+    Tensor<float, 2> c = a + b;
+
+To choose a different implementation you have to insert a `device()` call
+before the assignment of the result.  For technical C++ reasons this requires
+that the Tensor for the result be declared on its own.  This means that you
+have to know the size of the result.
+
+    Eigen::Tensor<float, 2> c(30, 40);
+    c.device(...) = a + b;
+
+The call to `device()` must be the last call on the left of the operator=.
+
+You must pass to the `device()` call an Eigen device object.  There are
+presently three devices you can use: DefaultDevice, ThreadPoolDevice and
+GpuDevice.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a `device()` call.
+
+    DefaultDevice my_device;
+    c.device(my_device) = a + b;
+
+#### Evaluating with a Thread Pool
+
+    // Create the Eigen ThreadPool
+    Eigen::ThreadPool pool(8 /* number of threads in pool */)
+
+    // Create the Eigen ThreadPoolDevice.
+    Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
+
+    // Now just use the device when evaluating expressions.
+    Eigen::Tensor<float, 2> c(30, 50);
+    c.device(my_device) = a.contract(b, dot_product_dims);
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with cuda.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operation we mention datatypes
+that are tensor-type specific:
+
+#### <Tensor-Type>::Dimensions
+
+Acts like an array of ints.  Has an `int size` attribute, and can be
+indexed like an array to access individual values.  Used to represent the
+dimensions of a tensor.  See `dimensions()`.
+
+#### <Tensor-Type>::Index
+
+Acts like an `int`.  Used for indexing tensors along their dimensions.  See
+`operator()`, `dimension()`, and `size()`.
+
+#### <Tensor-Type>::Scalar
+
+Represents the datatype of individual tensor elements.  For example, for a
+`Tensor<float>`, `Scalar` is the type `float`.  See
+`setConstant()`.
+
+#### <Operation>
+
+We use this pseudo type to indicate that a tensor Operation is returned by a
+method.  We indicate in the text the type and dimensions of the tensor that the
+Operation returns after evaluation.
+
+The Operation will have to be evaluated, for example by assigning it to a
+tensor, before you can access the values of the resulting tensor.  You can also
+access the values through a TensorRef.
+
+
+## Built-in Tensor Methods
+
+These are usual C++ methods that act on tensors immediately.  They are not
+Operations which provide delayed evaluation of their results.  Unless specified
+otherwise, all the methods listed below are available on all tensor classes:
+Tensor, TensorFixedSize, and TensorMap.
+
+## Metadata
+
+### int NumDimensions
+
+Constant value indicating the number of dimensions of a Tensor.  This is also
+known as the tensor "rank".
+
+      Eigen::Tensor<float, 2> a(3, 4);
+      cout << "Dims " << a.NumDimensions;
+      => Dims 2
+
+### Dimensions dimensions()
+
+Returns an array-like object representing the dimensions of the tensor.
+The actual type of the `dimensions()` result is `<Tensor-Type>::``Dimensions`.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
+    cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+         << ", dim 1: " << d[1];
+    => Dim size: 2, dim 0: 3, dim 1: 4
+
+If you use a C++11 compiler, you can use `auto` to simplify the code:
+
+    const auto& d = a.dimensions();
+    cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+         << ", dim 1: " << d[1];
+    => Dim size: 2, dim 0: 3, dim 1: 4
+
+### Index dimension(Index n)
+
+Returns the n-th dimension of the tensor.  The actual type of the
+`dimension()` result is `<Tensor-Type>::``Index`, but you can
+always use it like an int.
+
+      Eigen::Tensor<float, 2> a(3, 4);
+      int dim1 = a.dimension(1);
+      cout << "Dim 1: " << dim1;
+      => Dim 1: 4
+
+### Index size()
+
+Returns the total number of elements in the tensor.  This is the product of all
+the tensor dimensions.  The actual type of the `size()` result is
+`<Tensor-Type>::``Index`, but you can always use it like an int.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    cout << "Size: " << a.size();
+    => Size: 12
+
+
+### Getting Dimensions From An Operation
+
+A few operations provide `dimensions()` directly,
+e.g. `TensorReslicingOp`.  Most operations defer calculating dimensions
+until the operation is being evaluated.  If you need access to the dimensions
+of a deferred operation, you can wrap it in a TensorRef (see Assigning to a
+TensorRef above), which provides `dimensions()` and `dimension()` as
+above.
+
+TensorRef can also wrap the plain Tensor types, so this is a useful idiom in
+templated contexts where the underlying object could be either a raw Tensor
+or some deferred operation (e.g. a slice of a Tensor).  In this case, the
+template code can wrap the object in a TensorRef and reason about its
+dimensionality while remaining agnostic to the underlying type.
+
+
+## Constructors
+
+### Tensor
+
+Creates a tensor of the specified size. The number of arguments must be equal
+to the rank of the tensor. The content of the tensor is not initialized.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+    => NumRows: 3 NumCols: 4
+
+### TensorFixedSize
+
+Creates a tensor of the specified size. The number of arguments in the Sizes<>
+template parameter determines the rank of the tensor. The content of the tensor
+is not initialized.
+
+    Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
+    cout << "Rank: " << a.rank() << endl;
+    => Rank: 2
+    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+    => NumRows: 3 NumCols: 4
+
+### TensorMap
+
+Creates a tensor mapping an existing array of data. The data must not be freed
+until the TensorMap is discarded, and the size of the data must be large enough
+to accommodate the coefficients of the tensor.
+
+    float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
+    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+    => NumRows: 3 NumCols: 4
+    cout << "a(1, 2): " << a(1, 2) << endl;
+    => a(1, 2): 7
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize are created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized.  Similarly,
+when a new TensorMap is created on top of non-initialized memory the memory its
+contents are not initialized.
+
+You can use one of the methods below to initialize the tensor memory.  These
+have an immediate effect on the tensor and return the tensor itself as a
+result.  These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value `val`.  `Scalar`
+is the type of data stored in the tensor.  You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+    a.setConstant(12.3f);
+    cout << "Constant: " << endl << a << endl << endl;
+    =>
+    Constant:
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+
+Note that `setConstant()` can be used on any tensor where the element type
+has a copy constructor and an `operator=()`:
+
+    Eigen::Tensor<string, 2> a(2, 3);
+    a.setConstant("yolo");
+    cout << "String tensor: " << endl << a << endl << endl;
+    =>
+    String tensor:
+    yolo yolo yolo
+    yolo yolo yolo
+
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros.  Equivalent to `setConstant(Scalar(0))`.
+Returns the tensor itself in case you want to chain another call.
+
+    a.setZero();
+    cout << "Zeros: " << endl << a << endl << endl;
+    =>
+    Zeros:
+    0 0 0 0
+    0 0 0 0
+    0 0 0 0
+
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times.  The
+most deeply nested lists must contains P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a `TensorFixedSize<float, 2, 3>` the initializer list must
+contains 2 lists of 3 floats each.
+
+`setValues()` returns the tensor itself in case you want to chain another
+call.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed.  This is valid at each level of nesting.  For example the following
+code only sets the values of the first row of the tensor.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setConstant(1000);
+    a.setValues({{10, 20, 30}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    10   20   30
+    1000 1000 1000
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values.  Returns the tensor itself in case you
+want to chain another call.
+
+    a.setRandom();
+    cout << "Random: " << endl << a << endl << endl;
+    =>
+    Random:
+      0.680375    0.59688  -0.329554    0.10794
+     -0.211234   0.823295   0.536459 -0.0452059
+      0.566198  -0.604897  -0.444451   0.257742
+
+You can customize `setRandom()` by providing your own random number
+generator as a template argument:
+
+    a.setRandom<MyRandomGenerator>();
+
+Here, `MyRandomGenerator` must be a struct with the following member
+functions, where Scalar and Index are the same as `<Tensor-Type>::``Scalar`
+and `<Tensor-Type>::``Index`.
+
+See `struct UniformRandomGenerator` in TensorFunctors.h for an example.
+
+    // Custom number generator for use with setRandom().
+    struct MyRandomGenerator {
+      // Default and copy constructors. Both are needed
+      MyRandomGenerator() { }
+      MyRandomGenerator(const MyRandomGenerator& ) { }
+
+      // Return a random value to be used.  "element_location" is the
+      // location of the entry to set in the tensor, it can typically
+      // be ignored.
+      Scalar operator()(Eigen::DenseIndex element_location,
+                        Eigen::DenseIndex /*unused*/ = 0) const {
+        return <randomly generated value of type T>;
+      }
+
+      // Same as above but generates several numbers at a time.
+      typename internal::packet_traits<Scalar>::type packetOp(
+          Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+        return <a packet of randomly generated values>;
+      }
+    };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+*   UniformRandomGenerator
+*   NormalRandomGenerator
+
+
+## Data Access
+
+The Tensor, TensorFixedSize, and TensorRef classes provide the following
+accessors to access the tensor coefficients:
+
+    const Scalar& operator()(const array<Index, NumIndices>& indices)
+    const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator()(const array<Index, NumIndices>& indices)
+    Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+
+The number of indices must be equal to the rank of the tensor. Moreover, these
+accessors are not available on tensor expressions. In order to access the
+values of a tensor expression, the expression must either be evaluated or
+wrapped in a TensorRef.
+
+
+### Scalar* data() and const Scalar* data() const
+
+Returns a pointer to the storage for the tensor.  The pointer is const if the
+tensor was const.  This allows direct access to the data.  The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    float* a_data = a.data();
+    a_data[0] = 123.45f;
+    cout << "a(0, 0): " << a(0, 0);
+    => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non evaluated tensor `Operations`.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operation is evaluated lazily, typically when it is assigned to a
+tensor.  See "Controlling when Expression are Evaluated" for more details about
+their evaluation.
+
+### <Operation> constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value `val`.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+    Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    cout << "c" << endl << c << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    3 3 3
+    3 3 3
+
+    c
+    0.6 0.6 0.6
+    0.6 0.6 0.6
+
+### <Operation> random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is for example useful to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for `setRandom()`.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.random();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    1.68038   1.5662  1.82329
+    0.788766  1.59688 0.395103
+
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied.  The
+requested operations are applied to each element independently.
+
+### <Operation> operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = -a;
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    -1 -1 -1
+    -1 -1 -1
+
+### <Operation> sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### <Operation> rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### <Operation> square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### <Operation> inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### <Operation> exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### <Operation> log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### <Operation> abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### <Operation> pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients.  For example, only integer exponents can be used in
+conjuntion with tensors of integer values.
+
+You can use cast() to lift this restriction.  For example this computes
+cubic roots of an int Tensor:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 8}, {27, 64, 125}});
+    Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0   1   8
+    27  64 125
+
+    b
+    0 1 2
+    3 4 5
+
+### <Operation>  operator * (Scalar scale)
+
+Multiplies all the coefficients of the input tensor by the provided scale.
+
+### <Operation>  cwiseMax(Scalar threshold)
+TODO
+
+### <Operation>  cwiseMin(Scalar threshold)
+TODO
+
+### <Operation>  unaryExpr(const CustomUnaryOp& func)
+TODO
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### <Operation> operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### <Operation> operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### <Operation> operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### <Operation> operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### <Operation> cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### <Operation> cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise mimimums of the inputs.
+
+### <Operation> Logical operators
+
+The following logical operators are supported as well:
+
+*   operator&&(const OtherDerived& other)
+*   operator||(const OtherDerived& other)
+*   operator<(const OtherDerived& other)
+*   operator<=(const OtherDerived& other)
+*   operator>(const OtherDerived& other)
+*   operator>=(const OtherDerived& other)
+*   operator==(const OtherDerived& other)
+*   operator!=(const OtherDerived& other)
+
+They all return a tensor of boolean values.
+
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+    Tensor<bool, 3> if = ...;
+    Tensor<float, 3> then = ...;
+    Tensor<float, 3> else = ...;
+    Tensor<float, 3> result = if.select(then, else);
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result.  The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
+
+
+## Contraction
+
+Tensor *contractions* are a generalization of the matrix product to the
+multidimensional case.
+
+    // Create 2 matrices using tensors of rank 2
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    Eigen::Tensor<int, 2> b(3, 2);
+    b.setValues({{1, 2}, {4, 5}, {5, 6}});
+
+    // Compute the traditional matrix product
+    Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
+    Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
+
+    // Compute the product of the transpose of the matrices
+    Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) };
+    Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
+
+    // Contraction to scalar value using a double contraction.
+    // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements.
+    Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+    Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims);
+
+    // Extracting the scalar value of the tensor contraction for further usage
+    int value = AdoubleContractedA(0);
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor.  The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor.  You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as `maximum()` and `sum()` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+`<TensorType>::``Dimensions` which can always be specified as an array of
+ints.  These are called the "reduction dimensions."  The values are the indices
+of the dimensions of the input tensor over which the reduction is done.  The
+parameter can have at most as many element as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    // Reduce it along the second dimension (1)...
+    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+    // ...using the "maximum" operator.
+    // The result is a tensor with one dimension.  The size of
+    // that dimension is the same as the first (non-reduced) dimension of a.
+    Eigen::Tensor<int, 1> b = a.maximum(dims);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    3
+    6
+
+Example: Reduction along two dimensions.
+
+    Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // The tensor a has 3 dimensions.  We reduce along the
+    // first 2, resulting in a tensor with a single dimension
+    // of size 4 (the last dimension of a.)
+    // Note that we pass the array of reduction dimensions
+    // directly to the maximum() call.
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+        a.maximum(Eigen::array<int, 2>({0, 1}));
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    20
+    21
+    22
+    23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions.  The result is a
+scalar, represented as a zero-dimension tensor.
+
+    Eigen::Tensor<float, 3> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // Reduce along all dimensions using the sum() operator.
+    Eigen::Tensor<float, 0> b = a.sum();
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    276
+
+
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
+
+Reduce a tensor using the sum() operator.  The resulting values
+are the sum of the reduced values.
+
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
+
+Reduce a tensor using the mean() operator.  The resulting values
+are the mean of the reduced values.
+
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
+
+Reduce a tensor using the maximum() operator.  The resulting values are the
+largest of the reduced values.
+
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
+
+Reduce a tensor using the minimum() operator.  The resulting values
+are the smallest of the reduced values.
+
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
+
+Reduce a tensor using the prod() operator.  The resulting values
+are the product of the reduced values.
+
+### <Operation> all(const Dimensions& new_dims)
+### <Operation> all()
+Reduce a tensor using the all() operator.  Casts tensor to bool and then checks
+whether all elements are true.  Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+### <Operation> any(const Dimensions& new_dims)
+### <Operation> any()
+Reduce a tensor using the any() operator.  Casts tensor to bool and then checks
+whether any element is true.  Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
+are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
+, and have the same requirements when passed as an input parameter. In addition,
+the trace dimensions must have the same size.
+
+Example: Trace along 2 dimensions.
+
+    // Create a tensor of 3 dimensions
+    Eigen::Tensor<int, 3> a(2, 2, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+    // Specify the dimensions along which the trace will be computed.
+    // In this example, the trace can only be computed along the dimensions
+    // with indices 0 and 1
+    Eigen::array<int, 2> dims({0, 1});
+    // The output tensor contains all but the trace dimensions.
+    Tensor<int, 1> a_trace = a.trace(dims);
+    cout << "a_trace:" << endl;
+    cout << a_trace << endl;
+    =>
+    a_trace:
+    11
+    13
+    15
+
+
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+    // Create a tensor of 3 dimensions, with all dimensions having the same size.
+    Eigen::Tensor<int, 3> a(3, 3, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+                {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+                {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+    // Result is a zero dimension tensor
+    Tensor<int, 0> a_trace = a.trace();
+    cout<<"a_trace:"<<endl;
+    cout<<a_trace<<endl;
+    =>
+    a_trace:
+    42
+
+
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+dd a comment to this line
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {4, 5, 6}});
+    // Scan it along the second dimension (1) using summation
+    Eigen::Tensor<int, 2> b = a.cumsum(1);
+    // The result is a tensor with the same size as the input
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    4 5 6
+
+    b
+    1  3  6
+    4  9 15
+
+### <Operation> cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### <Operation> cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+
+## Convolutions
+
+### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
+
+Returns a tensor that is the output of the convolution of the input tensor with the kernel,
+along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
+which were part of the convolution will be reduced by the formula:
+output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size).
+The dimension sizes for dimensions that were not part of the convolution will remain the same.
+Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the
+convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is
+for the last dimension).
+
+    // Compute convolution along the second and third dimension.
+    Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
+    Tensor<float, 2, DataLayout> kernel(2, 2);
+    Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
+    input.setRandom();
+    kernel.setRandom();
+
+    Eigen::array<ptrdiff_t, 2> dims({1, 2});  // Specify second and third dimension for convolution.
+    output = input.convolve(kernel, dims);
+
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 6; ++k) {
+          for (int l = 0; l < 11; ++l) {
+            const float result = output(i,j,k,l);
+            const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+                                   input(i,j+1,k+0,l) * kernel(1,0) +
+                                   input(i,j+0,k+1,l) * kernel(0,1) +
+                                   input(i,j+1,k+1,l) * kernel(1,1);
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+
+
+## Geometrical Operations
+
+These operations return a Tensor with different dimensions than the original
+Tensor.  They can be used to access slices of tensors, see them with different
+dimensions, or pad tensors with additional data.
+
+### <Operation> reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions.  The argument new_dims is an array of Index values.  The
+rank of the resulting tensor is equal to the number of elements in new_dims.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+    // Increase the rank of the input tensor by introducing a new dimension
+    // of size 1.
+    Tensor<float, 2> input(7, 11);
+    array<int, 3> three_dims{{7, 11, 1}};
+    Tensor<float, 3> result = input.reshape(three_dims);
+
+    // Decrease the rank of the input tensor by merging 2 dimensions;
+    array<int, 1> one_dim{{7 * 11}};
+    Tensor<float, 1> result = input.reshape(one_dim);
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped Tensor depend on the data layout of the original Tensor.
+
+For example this is what happens when you `reshape()` a 2D ColMajor tensor
+to one dimension:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+      0
+    300
+    100
+    400
+    200
+    500
+
+This is what happens when the 2D Tensor is RowMajor:
+
+    Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+      0
+    100
+    200
+    300
+    400
+    500
+
+The reshape operation is a lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follow:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
+    b.reshape(two_dim) = a;
+    cout << "b" << endl << b << endl;
+    =>
+    b
+      0
+    300
+    100
+    400
+    200
+    500
+
+Note that "b" itself was not reshaped but that instead the assignment is done to
+the reshape view of b.
+
+
+### <Operation> shuffle(const Shuffle& shuffle)
+
+Returns a copy of the input tensor whose dimensions have been
+reordered according to the specified permutation. The argument shuffle
+is an array of Index values. Its size is the rank of the input
+tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
+dimension of the output tensor equals to the size of the shuffle[i]-th
+dimension of the input tensor. For example:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output = input.shuffle({1, 2, 0})
+
+    eigen_assert(output.dimension(0) == 30);
+    eigen_assert(output.dimension(1) == 50);
+    eigen_assert(output.dimension(2) == 20);
+
+Indices into the output tensor are shuffled accordingly to formulate
+indices into the input tensor. For example, one can assert in the above
+code snippet that:
+
+    eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+In general, one can assert that
+
+    eigen_assert(output(..., indices[shuffle[i]], ...) ==
+                 input(..., indices[i], ...))
+
+The shuffle operation results in a lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(30, 50, 20);
+    output.shuffle({2, 0, 1}) = input;
+
+
+### <Operation> stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions.  The argument strides is an
+array of Index values.  The dimensions of the resulting tensor are
+ceil(input_dimensions[i] / strides[i]).
+
+For example this is what happens when you `stride()` a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+    Eigen::Tensor<int, 2> b = a.stride(strides);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+       0   200
+     900  1100
+
+It is possible to assign a tensor to a stride:
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(40, 90, 200);
+    output.stride({2, 3, 4}) = input;
+
+
+### <Operation> slice(const StartIndices& offsets, const Sizes& extents)
+
+Returns a sub-tensor of the given tensor. For each dimension i, the slice is
+made of the coefficients stored between offset[i] and offset[i] + extents[i] in
+the input tensor.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<int, 2> offsets = {1, 0};
+    Eigen::array<int, 2> extents = {2, 2};
+    Eigen::Tensor<int, 1> slice = a.slice(offsets, extents);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+     600   700   800
+     900  1000  1100
+    cout << "slice" << endl << slice << endl;
+    =>
+    slice
+     300   400
+     600   700
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+     600   700   800
+     900  1000  1100
+    cout << "row_3" << endl << row_3 << endl;
+    =>
+    row_3
+       600   700   800
+    cout << "col_2" << endl << col_2 << endl;
+    =>
+    col_2
+       100   400   700    1000
+
+It is possible to assign values to a tensor chip since the chip operation is a
+lvalue. For example:
+
+    Eigen::Tensor<int, 1> a(3);
+    a.setValues({{100, 200, 300}});
+    Eigen::Tensor<int, 2> b(2, 3);
+    b.setZero();
+    b.chip(0, 0) = a;
+    cout << "a" << endl << a << endl;
+    =>
+    a
+     100
+     200
+     300
+    cout << "b" << endl << b << endl;
+    =>
+    b
+       100   200   300
+         0     0     0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions.  The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions.  This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you `reverse()` the first dimension
+of a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<bool, 2> reverse({true, false});
+    Eigen::Tensor<int, 2> b = a.reverse(reverse);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+     600   700   800
+     900  1000  1100
+    b
+     900  1000  1100
+     600   700   800
+     300   400   500
+       0   100   200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+Returns a view of the input tensor in which the input is replicated one to many
+times.
+The broadcast argument specifies how many copies of the input tensor need to be
+made in each of the dimensions.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}});
+    Eigen::array<int, 2> bcast({3, 2});
+    Eigen::Tensor<int, 2> b = a.broadcast(bcast);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+    b
+       0   100   200    0   100   200
+     300   400   500  300   400   500
+       0   100   200    0   100   200
+     300   400   500  300   400   500
+       0   100   200    0   100   200
+     300   400   500  300   400   500
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation>  pad(const PaddingDimensions& padding)
+
+Returns a view of the input tensor in which the input is padded with zeros.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}});
+    Eigen::array<pair<int, int>, 2> paddings;
+    paddings[0] = make_pair(0, 1);
+    paddings[1] = make_pair(2, 3);
+    Eigen::Tensor<int, 2> b = a.pad(paddings);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+       0   100   200
+     300   400   500
+    b
+       0     0     0    0
+       0     0     0    0
+       0   100   200    0
+     300   400   500    0
+       0     0     0    0
+       0     0     0    0
+       0     0     0    0
+
+
+### <Operation>  extract_patches(const PatchDims& patch_dims)
+
+Returns a tensor of coefficient patches extracted from the input tensor, where
+each patch is of dimension specified by 'patch_dims'. The returned tensor has
+one greater dimension than the input tensor, which is used to index each patch.
+The patch index in the output tensor depends on the data layout of the input
+tensor: the patch index is the last dimension ColMajor layout, and the first
+dimension in RowMajor layout.
+
+For example, given the following input tensor:
+
+    Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+    tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+                      {4.0f, 5.0f, 6.0f, 7.0f},
+                      {8.0f, 9.0f, 10.0f, 11.0f}});
+
+    cout << "tensor: " << endl << tensor << endl;
+    =>
+    tensor:
+     0   1   2   3
+     4   5   6   7
+     8   9  10  11
+
+Six 2x2 patches can be extracted and indexed using the following code:
+
+    Eigen::Tensor<float, 3, DataLayout> patch;
+    Eigen::array<ptrdiff_t, 2> patch_dims;
+    patch_dims[0] = 2;
+    patch_dims[1] = 2;
+    patch = tensor.extract_patches(patch_dims);
+    for (int k = 0; k < 6; ++k) {
+      cout << "patch index: " << k << endl;
+      for (int i = 0; i < 2; ++i) {
+    	for (int j = 0; j < 2; ++j) {
+    	  if (DataLayout == ColMajor) {
+    		cout << patch(i, j, k) << " ";
+    	  } else {
+    		cout << patch(k, i, j) << " ";
+    	  }
+    	}
+    	cout << endl;
+      }
+    }
+
+This code results in the following output when the data layout is ColMajor:
+
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    4 5
+    8 9
+    patch index: 2
+    1 2
+    5 6
+    patch index: 3
+    5 6
+    9 10
+    patch index: 4
+    2 3
+    6 7
+    patch index: 5
+    6 7
+    10 11
+
+This code results in the following output when the data layout is RowMajor:
+(NOTE: the set of patches is the same as in ColMajor, but are indexed differently).
+
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    1 2
+    5 6
+    patch index: 2
+    2 3
+    6 7
+    patch index: 3
+    4 5
+    8 9
+    patch index: 4
+    5 6
+    9 10
+    patch index: 5
+    6 7
+    10 11
+
+### <Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)
+
+Returns a tensor of coefficient image patches extracted from the input tensor,
+which is expected to have dimensions ordered as follows (depending on the data
+layout of the input tensor, and the number of additional dimensions 'N'):
+
+*) ColMajor
+1st dimension: channels (of size d)
+2nd dimension: rows (of size r)
+3rd dimension: columns (of size c)
+4th-Nth dimension: time (for video) or batch (for bulk processing).
+
+*) RowMajor (reverse order of ColMajor)
+1st-Nth dimension: time (for video) or batch (for bulk processing).
+N+1'th dimension: columns (of size c)
+N+2'th dimension: rows (of size r)
+N+3'th dimension: channels (of size d)
+
+The returned tensor has one greater dimension than the input tensor, which is
+used to index each patch. The patch index in the output tensor depends on the
+data layout of the input tensor: the patch index is the 4'th dimension in
+ColMajor layout, and the 4'th from the last dimension in RowMajor layout.
+
+For example, given the following input tensor with the following dimension
+sizes:
+ *) depth:   2
+ *) rows:    3
+ *) columns: 5
+ *) batch:   7
+
+    Tensor<float, 4> tensor(2,3,5,7);
+    Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+
+2x2 image patches can be extracted and indexed using the following code:
+
+*) 2D patch: ColMajor (patch indexed by second-to-last dimension)
+
+    Tensor<float, 5> twod_patch;
+    twod_patch = tensor.extract_image_patches<2, 2>();
+    // twod_patch.dimension(0) == 2
+    // twod_patch.dimension(1) == 2
+    // twod_patch.dimension(2) == 2
+    // twod_patch.dimension(3) == 3*5
+    // twod_patch.dimension(4) == 7
+
+*) 2D patch: RowMajor (patch indexed by the second dimension)
+
+    Tensor<float, 5, RowMajor> twod_patch_row_major;
+    twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+    // twod_patch_row_major.dimension(0) == 7
+    // twod_patch_row_major.dimension(1) == 3*5
+    // twod_patch_row_major.dimension(2) == 2
+    // twod_patch_row_major.dimension(3) == 2
+    // twod_patch_row_major.dimension(4) == 2
+
+## Special Operations
+
+### <Operation> cast<T>()
+
+Returns a tensor of type T with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type T.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    Eigen::Tensor<int, 2> b = a.cast<int>();
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers.  This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 2}, {3, 4, 5}});
+    Eigen::Tensor<int, 2> b =
+        (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+    b
+    0 0 1
+    1 2 2
+
+
+### <Operation>     eval()
+
+TODO
+
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 0.For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 0d tensor.
+
+## Limitations
+
+*   The number of tensor dimensions is currently limited to 250 when using a
+    compiler that supports cxx11. It is limited to only 5 for older compilers.
+*   The IndexList class requires a cxx11 compliant compiler. You can use an
+    array of indices instead if you don't have access to a modern compiler.
+*   On GPUs only floating point values are properly tested and optimized for.
+*   Complex and integer values are known to be broken on GPUs. If you try to use
+    them you'll most likely end up triggering a static assertion failure such as
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+

diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 09e5143..8cac2bb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h

@@ -1,6 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
 // Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -22,12 +23,12 @@
   * The %Tensor class encompasses only dynamic-size objects so far.
   *
   * The first two template parameters are required:
-  * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  * \tparam Scalar_  Numeric type, e.g. float, double, int or `std::complex<float>`.
   *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
   * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
   *
   * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  * \tparam Options_  A combination of either \b #RowMajor or \b #ColMajor, and of either
   *                 \b #AutoAlign or \b #DontAlign.
   *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
   *                 for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
@@ -41,13 +42,13 @@
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
   *
   * <i><b>Some notes:</b></i>
   *
   * <dl>
   * <dt><b>Relation to other parts of Eigen:</b></dt>
-  * <dd>The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+  * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
   * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
   * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
   * class does not provide any of these features and is only available as a stand-alone class that just allows for
@@ -68,15 +69,11 @@
     typedef typename internal::traits<Self>::StorageKind StorageKind;
     typedef typename internal::traits<Self>::Index Index;
     typedef Scalar_ Scalar;
-    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
-    typedef typename Base::PacketReturnType PacketReturnType;
 
     enum {
-      IsAligned = bool(EIGEN_ALIGN) & !(Options_ & DontAlign),
-      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-      BlockAccess = false,
+      IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
       Layout = Options_ & RowMajor ? RowMajor : ColMajor,
       CoordAccess = true,
       RawAccess = true
@@ -87,13 +84,22 @@
     typedef DSizes<Index, NumIndices_> Dimensions;
 
   protected:
-    TensorStorage<Scalar, Dimensions, Options_> m_storage;
+    TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices>
+    struct isOfNormalIndex{
+      static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
+      static const bool is_int = NumTraits<CustomIndices>::IsInteger;
+      static const bool value = is_array | is_int;
+    };
+#endif
 
   public:
     // Metadata
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                         rank()                   const { return NumIndices; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                         dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&             dimensions()    const { return m_storage.dimensions(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&             dimensions()             const { return m_storage.dimensions(); }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                         size()                   const { return m_storage.size(); }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar                        *data()                        { return m_storage.data(); }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar                  *data()                  const { return m_storage.data(); }
@@ -104,22 +110,34 @@
     inline Self& base()             { return *this; }
     inline const Self& base() const { return *this; }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return coeff(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
+      return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
     }
 #endif
 
+    // normal indices
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
     {
       eigen_internal_assert(checkIndexRange(indices));
       return m_storage.data()[linearizedIndex(indices)];
     }
 
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const
+    {
+        return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -132,25 +150,37 @@
       return m_storage.data()[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return coeffRef(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
+      return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
     }
 #endif
 
+    // normal indices
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
     {
       eigen_internal_assert(checkIndexRange(indices));
       return m_storage.data()[linearizedIndex(indices)];
     }
 
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+             >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices)
+    {
+        return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef()
     {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
       return m_storage.data()[0];
     }
 
@@ -160,13 +190,13 @@
       return m_storage.data()[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return this->operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
+      return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
     }
 #else
     EIGEN_DEVICE_FUNC
@@ -191,24 +221,35 @@
     }
 #endif
 
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const
+    {
+        return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
+    // normal indices
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
     {
-      eigen_assert(checkIndexRange(indices));
       return coeff(indices);
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
-    {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return coeff();
-    }
-
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
       return coeff(index);
     }
 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeff();
+    }
+
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
     {
       // The bracket operator is only for vectors, use the parenthesis operator instead.
@@ -216,13 +257,13 @@
       return coeff(index);
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
+      return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
     }
 #else
     EIGEN_DEVICE_FUNC
@@ -247,24 +288,35 @@
     }
 #endif
 
+    // normal indices
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
     {
-      eigen_assert(checkIndexRange(indices));
       return coeffRef(indices);
     }
 
+    // custom indices
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomIndices,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices)
+    {
+      return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+    {
+      eigen_assert(index >= 0 && index < size());
+      return coeffRef(index);
+    }
+
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()()
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
       return coeffRef();
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < size());
-      return coeffRef(index);
-    }
-
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
     {
       // The bracket operator is only for vectors, use the parenthesis operator instead
@@ -284,43 +336,44 @@
     {
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    inline Tensor(Index firstDimension, IndexTypes... otherDimensions)
-        : m_storage(internal::array_prod(array<Index, NumIndices>{firstDimension, otherDimensions...}), array<Index, NumIndices>{firstDimension, otherDimensions...})
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
+        : m_storage(firstDimension, otherDimensions...)
     {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
-    inline explicit Tensor(Index dim1)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1)
       : m_storage(dim1, array<Index, 1>(dim1))
     {
       EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    inline explicit Tensor(Index dim1, Index dim2)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
       : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
     {
       EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    inline explicit Tensor(Index dim1, Index dim2, Index dim3)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
       : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
     {
       EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
       : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
     {
       EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
-      : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 4>(dim1, dim2, dim3, dim4, dim5))
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+      : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
     {
       EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
-    inline explicit Tensor(const array<Index, NumIndices>& dimensions)
+    /** Normal Dimension */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
         : m_storage(internal::array_prod(dimensions), dimensions)
     {
       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -335,6 +388,7 @@
       resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
@@ -345,6 +399,20 @@
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
+    #if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(Self&& other)
+      : m_storage(std::move(other.m_storage))
+    {
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
+    {
+      m_storage = std::move(other.m_storage);
+      return *this;
+    }
+    #endif
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
     {
@@ -354,39 +422,33 @@
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }
-    template<typename Other>
+    template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Tensor& operator=(const Other& other)
+    EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
     {
-      typedef TensorAssignOp<Tensor, const Other> Assign;
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
       Assign assign(*this, other);
       resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     void resize(Index firstDimension, IndexTypes... otherDimensions)
     {
       // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      resize(array<Index, NumIndices>{firstDimension, otherDimensions...});
+      resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
     }
 #endif
 
-    EIGEN_DEVICE_FUNC
-    void resize()
+    /** Normal Dimension */
+    EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
     {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      // Nothing to do: rank 0 tensors have fixed size
-    }
-
-    EIGEN_DEVICE_FUNC
-    void resize(const array<Index, NumIndices>& dimensions)
-    {
+      int i;
       Index size = Index(1);
-      for (size_t i = 0; i < NumIndices; i++) {
+      for (i = 0; i < NumIndices; i++) {
         internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
         size *= dimensions[i];
       }
@@ -399,8 +461,8 @@
       #endif
     }
 
-    EIGEN_DEVICE_FUNC
-    void resize(const DSizes<Index, NumIndices>& dimensions) {
+    // Why this overload, DSizes is derived from array ??? //
+    EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
       array<Index, NumIndices> dims;
       for (int i = 0; i < NumIndices; ++i) {
         dims[i] = dimensions[i];
@@ -408,13 +470,43 @@
       resize(dims);
     }
 
+    EIGEN_DEVICE_FUNC
+    void resize()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      // Nothing to do: rank 0 tensors have fixed size
+    }
+
+#ifdef EIGEN_HAS_INDEX_LIST
+    template <typename FirstType, typename... OtherTypes>
+    EIGEN_DEVICE_FUNC
+    void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+      array<Index, NumIndices> dims;
+      for (int i = 0; i < NumIndices; ++i) {
+        dims[i] = static_cast<Index>(dimensions[i]);
+      }
+      resize(dims);
+    }
+#endif
+
+    /** Custom Dimension */
+#ifdef EIGEN_HAS_SFINAE
+    template<typename CustomDimension,
+             EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) )
+    >
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions)
+    {
+      resize(internal::customIndices2Array<Index,NumIndices>(dimensions));
+    }
+#endif
+
 #ifndef EIGEN_EMULATE_CXX11_META_H
-    template <typename std::size_t... Indices>
+    template <typename std::ptrdiff_t... Indices>
     EIGEN_DEVICE_FUNC
     void resize(const Sizes<Indices...>& dimensions) {
       array<Index, NumIndices> dims;
       for (int i = 0; i < NumIndices; ++i) {
-        dims[i] = dimensions[i];
+        dims[i] = static_cast<Index>(dimensions[i]);
       }
       resize(dims);
     }
@@ -424,7 +516,7 @@
     void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
       array<Index, NumIndices> dims;
       for (int i = 0; i < NumIndices; ++i) {
-        dims[i] = dimensions[i];
+        dims[i] = static_cast<Index>(dimensions[i]);
       }
       resize(dims);
     }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index 5b17887..8b8fb92 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h

@@ -37,7 +37,7 @@
 template<typename XprType>
 struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorIndexTupleOp<XprType>& type;
+  typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename XprType>
@@ -82,28 +82,35 @@
 
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -112,11 +119,18 @@
     return CoeffReturnType(index, m_impl.coeff(index));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
@@ -139,14 +153,14 @@
   typedef Index Scalar;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
-  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
   static const int Layout = XprTraits::Layout;
 };
 
 template<typename ReduceOp, typename Dims, typename XprType>
 struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
 {
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename ReduceOp, typename Dims, typename XprType>
@@ -171,7 +185,7 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
                                                           const ReduceOp& reduce_op,
-                                                          const int return_dim,
+                                                          const Index return_dim,
                                                           const Dims& reduce_dims)
       : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
 
@@ -186,12 +200,12 @@
   const Dims& reduce_dims() const { return m_reduce_dims; }
 
   EIGEN_DEVICE_FUNC
-  int return_dim() const { return m_return_dim; }
+  Index return_dim() const { return m_return_dim; }
 
   protected:
     typename XprType::Nested m_xpr;
     const ReduceOp m_reduce_op;
-    const int m_return_dim;
+    const Index m_return_dim;
     const Dims m_reduce_dims;
 };
 
@@ -208,33 +222,52 @@
   typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
   static const int NumDims = internal::array_size<InputDimensions>::value;
   typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<TupleType, Device> TupleStorageMem;
 
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
-    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
-    BlockAccess = false,
-    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess      = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim()),
-        m_strides(gen_strides(m_orig_impl.dimensions())),
-        m_stride_mod(gen_stride_mod(m_orig_impl.dimensions())),
-        m_stride_div(gen_stride_div()) { }
+        m_return_dim(op.return_dim())
+  {
+    gen_strides(m_orig_impl.dimensions(), m_strides);
+    if (Layout == static_cast<int>(ColMajor)) {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
+    } else {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
+    }    
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div = ((m_return_dim >= 0) &&
+                    (m_return_dim < static_cast<Index>(m_strides.size())))
+                   ? m_strides[m_return_dim] : 1;
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -243,19 +276,27 @@
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_orig_impl.bind(cgh);
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
     const double compute_cost = 1.0 +
-        (m_return_dim < 0 ? 0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+        (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
     return m_orig_impl.costPerCoeff(vectorized) +
            m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-
  private:
-  EIGEN_DEVICE_FUNC StrideDims gen_strides(const InputDimensions& dims) {
-    StrideDims strides;
-    if (m_return_dim < 0) return strides;  // Won't be using these.
+  EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
+    if (m_return_dim < 0) {
+      return;  // Won't be using the strides.
+    }
     eigen_assert(m_return_dim < NumDims &&
                  "Asking to convert index to a dimension outside of the rank");
 
@@ -272,28 +313,15 @@
         strides[i] = strides[i+1] * dims[i+1];
       }
     }
-    return strides;
-  }
-
-  EIGEN_DEVICE_FUNC Index gen_stride_mod(const InputDimensions& dims) {
-    if (Layout == static_cast<int>(ColMajor)) {
-      return (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : dims.TotalSize();
-    } else {
-      return (m_return_dim > 0) ? m_strides[m_return_dim - 1] : dims.TotalSize();
-    }
-  }
-
-  EIGEN_DEVICE_FUNC Index gen_stride_div() {
-    return m_strides[m_return_dim];
   }
 
  protected:
   TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
   TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
-  const int m_return_dim;
-  const StrideDims m_strides;
-  const Index m_stride_mod;
-  const Index m_stride_div;
+  const Index m_return_dim;
+  StrideDims m_strides;
+  Index m_stride_mod;
+  Index m_stride_div;
 };
 
 } // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 32f2f72..e5811d6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h

@@ -34,9 +34,10 @@
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
   static const int Layout = internal::traits<LhsXprType>::Layout;
+  typedef typename traits<LhsXprType>::PointerType PointerType;
 
   enum {
-    Flags = 0,
+    Flags = 0
   };
 };
 
@@ -63,9 +64,11 @@
   typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
-  static const std::size_t NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
+  static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
       : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
@@ -89,44 +92,57 @@
 struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
 {
   typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
-
-  enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
-                TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
-                   TensorEvaluator<RightArgType, Device>::PacketAccess,
-    BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
-                  TensorEvaluator<RightArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = TensorEvaluator<RightArgType, Device>::RawAccess
-  };
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      m_leftImpl(op.lhsExpression(), device),
-      m_rightImpl(op.rhsExpression(), device)
-  {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  }
-
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
-  static const std::size_t NumDims = XprType::NumDims;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
-  typedef typename internal::TensorBlock<
-    Index, typename internal::remove_const<Scalar>::type, NumDims, Layout>
-    TensorBlock;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int NumDims = XprType::NumDims;
+
+  enum {
+    IsAligned         = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+                        int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess      = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+    BlockAccess       = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = TensorEvaluator<LeftArgType, Device>::RawAccess
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+      RightTensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device) :
+      m_leftImpl(op.lhsExpression(), device),
+      m_rightImpl(op.rhsExpression(), device)
+  {
+    EIGEN_STATIC_ASSERT(
+        (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+        YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
+    // The dimensions of the lhs and the rhs tensors should be equal to prevent
+    // overflows and ensure the result is fully initialized.
     // TODO: use left impl instead if right impl dimensions are known at compile time.
     return m_rightImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -135,7 +151,19 @@
     // by the rhs to the lhs.
     return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(
+          m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
   }
@@ -144,34 +172,11 @@
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
   }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    m_leftImpl.getResourceRequirements(resources);
-    m_rightImpl.getResourceRequirements(resources);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
-  costPerCoeff(bool vectorized) const {
-    // We assume that evalPacket or evalScalar is called to perform the
-    // assignment and account for the cost of the write here, but reduce left
-    // cost by one load because we are using m_leftImpl.coeffRef.
-    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
-    return m_rightImpl.costPerCoeff(vectorized) +
-        TensorOpCost(numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
-                     left.bytes_stored(), left.compute_cycles()) +
-        TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
-    m_rightImpl.block(block);
-    m_leftImpl.writeBlock(*block);
-  }
-
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_leftImpl.coeff(index);
@@ -182,7 +187,54 @@
     return m_leftImpl.template packet<LoadMode>(index);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_rightImpl.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here, but reduce left
+    // cost by one load because we are using m_leftImpl.coeffRef.
+    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+    return m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(
+               numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
+               left.bytes_stored(), left.compute_cycles()) +
+           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::merge(
+        m_leftImpl.getResourceRequirements(),
+        m_rightImpl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+        m_leftImpl.data() != NULL) {
+      // If destination has raw data access, we pass it as a potential
+      // destination for a block descriptor evaluation.
+      desc.template AddDestinationBuffer<Layout>(
+          /*dst_base=*/m_leftImpl.data() + desc.offset(),
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+    }
+
+    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+    // If block was evaluated into a destination, there is no need to do assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      m_leftImpl.writeBlock(desc, block);
+    }
+    block.cleanup();
+  }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
 
  private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 993b054..35b6458 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h

@@ -20,9 +20,11 @@
   * \brief The tensor base class.
   *
   * This class is the common parent of the Tensor and TensorMap class, thus
-  * making it possible to use either class interchangably in expressions.
+  * making it possible to use either class interchangeably in expressions.
   */
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+// FIXME Doxygen does not like the inheritance with different template parameters
+// Since there is no doxygen documentation inside, we disable it for now
 template<typename Derived>
 class TensorBase<Derived, ReadOnlyAccessors>
 {
@@ -31,7 +33,6 @@
     typedef typename DerivedTraits::Scalar Scalar;
     typedef typename DerivedTraits::Index Index;
     typedef typename internal::remove_const<Scalar>::type CoeffReturnType;
-    typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
     static const int NumDimensions = DerivedTraits::NumDimensions;
 
     // Generic nullary operation support.
@@ -81,18 +82,18 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
-    sign() const {
-      return unaryExpr(internal::scalar_sign_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
     sqrt() const {
       return unaryExpr(internal::scalar_sqrt_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
+    sign() const {
+      return unaryExpr(internal::scalar_sign_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
     rsqrt() const {
       return unaryExpr(internal::scalar_rsqrt_op<Scalar>());
@@ -134,6 +135,106 @@
       return unaryExpr(internal::scalar_digamma_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
+    bessel_i0() const {
+      return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
+    bessel_i0e() const {
+      return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
+    bessel_i1() const {
+      return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
+    bessel_i1e() const {
+      return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
+    bessel_j0() const {
+      return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
+    bessel_y0() const {
+      return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
+    bessel_j1() const {
+      return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
+    bessel_y1() const {
+      return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
+    bessel_k0() const {
+      return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
+    bessel_k0e() const {
+      return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
+    bessel_k1() const {
+      return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
+    bessel_k1e() const {
+      return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
+    }
+
+    // igamma(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
+    igamma(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
+    }
+
+    // igamma_der_a(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
+    igamma_der_a(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
+    }
+
+    // gamma_sample_der_alpha(alpha = this, sample = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
+    gamma_sample_der_alpha(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
+    }
+
+    // igammac(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
+    igammac(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
+    }
+
     // zeta(x = this, q = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived>
@@ -161,9 +262,15 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
+    ndtri() const {
+      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
     sigmoid() const {
-      return unaryExpr(internal::scalar_sigmoid_op<Scalar>());
+      return unaryExpr(internal::scalar_logistic_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
@@ -191,116 +298,130 @@
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived>
+    log2() const {
+      return unaryExpr(internal::scalar_log2_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
+    clip(Scalar min, Scalar max) const {
+      return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
+                                                             TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+                                                             Derived>::type
     conjugate() const {
-      return unaryExpr(internal::scalar_conjugate_op<Scalar>());
+      return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
     pow(Scalar exponent) const {
-      return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+    real() const {
+      return unaryExpr(internal::scalar_real_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+    imag() const {
+      return unaryExpr(internal::scalar_imag_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
     operator+ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE friend
-    const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
     operator+ (Scalar lhs, const Derived& rhs) {
-      return rhs + lhs;
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
     operator- (Scalar rhs) const {
-      EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+      EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE friend
-    const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>,
-                             const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>>
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
     operator- (Scalar lhs, const Derived& rhs) {
-      return -rhs + lhs;
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
     operator* (Scalar rhs) const {
-      return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE friend
-    const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
     operator* (Scalar lhs, const Derived& rhs) {
-      return rhs * lhs;
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
     operator/ (Scalar rhs) const {
-      // EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE friend
-    const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-                             const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>>
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
     operator/ (Scalar lhs, const Derived& rhs) {
-      return rhs.inverse() * lhs;
-    }
-
-    template <typename Scale>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple2_op<Scalar, Scale>, const Derived>
-    scale (Scale rhs) const {
-      return unaryExpr(internal::scalar_multiple2_op<Scalar, Scale>(rhs));
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived>
     operator% (Scalar rhs) const {
-      EIGEN_STATIC_ASSERT(std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD);
+      EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD);
       return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
     }
 
+    template <int NanPropagation=PropagateFast>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_fmod_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
-    mod(Scalar rhs) const {
-      EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_FMOD_IS_NOT_FOR_INTEGERS);
-      return mod(constant(rhs));
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     cwiseMax(Scalar threshold) const {
-      return cwiseMax(constant(threshold));
+      return cwiseMax<NanPropagation>(constant(threshold));
     }
 
+    template <int NanPropagation=PropagateFast>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     cwiseMin(Scalar threshold) const {
-      return cwiseMin(constant(threshold));
+      return cwiseMin<NanPropagation>(constant(threshold));
     }
 
-    template <typename NewType> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
+    template<typename NewType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
+                                                             Derived,
+                                                             TensorConversionOp<NewType, const Derived> >::type
     cast() const {
-      return TensorConversionOp<NewType, const Derived>(derived());
+      return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
     }
 
     EIGEN_DEVICE_FUNC
@@ -310,6 +431,12 @@
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
+    rint() const {
+      return unaryExpr(internal::scalar_rint_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
     ceil() const {
       return unaryExpr(internal::scalar_ceil_op<Scalar>());
@@ -353,23 +480,16 @@
       return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
     }
 
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_fmod_op<Scalar>, const Derived, const OtherDerived>
-    mod(const OtherDerived& other) const {
-      EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE_FMOD_IS_NOT_FOR_INTEGERS);
-      return binaryExpr(other.derived(), internal::scalar_fmod_op<Scalar>());
-    }
-
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMax(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>());
+    return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
     }
 
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMin(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>());
+      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -392,79 +512,66 @@
 
     // Comparisons and tests.
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
     operator<(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
     operator<=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
     operator>(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
     operator>=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
     operator==(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
     }
+
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
     operator!=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
-    }
-
-    // igamma(a = this, x = other)
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
-    igamma(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
-    }
-
-    // igammac(a = this, x = other)
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
-    igammac(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
     }
 
     // comparisons and tests for Scalars
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<(Scalar threshold) const {
       return operator<(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<=(Scalar threshold) const {
       return operator<=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>(Scalar threshold) const {
       return operator>(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>=(Scalar threshold) const {
       return operator>=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator==(Scalar threshold) const {
       return operator==(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator!=(Scalar threshold) const {
       return operator!=(constant(threshold));
     }
@@ -472,17 +579,17 @@
     // Checks
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived>
-    isnan() const {
+    (isnan)() const {
       return unaryExpr(internal::scalar_isnan_op<Scalar>());
     }
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived>
-    isinf() const {
+    (isinf)() const {
       return unaryExpr(internal::scalar_isinf_op<Scalar>());
     }
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived>
-    isfinite() const {
+    (isfinite)() const {
       return unaryExpr(internal::scalar_isfinite_op<Scalar>());
     }
 
@@ -497,9 +604,15 @@
     typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
     contract(const OtherDerived& other, const Dimensions& dims) const {
-      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
+    }
+
+    template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
+    contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
     }
 
     // Convolutions.
@@ -509,6 +622,13 @@
       return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
     }
 
+    // Fourier transforms
+    template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
+    fft(const FFT& dims) const {
+      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
+    }
+
     // Scan.
     typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -568,25 +688,65 @@
       return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
     maximum(const Dims& dims) const {
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
     }
 
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
     maximum() const {
       DimensionList<Index, NumDimensions> in_dims;
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
+    minimum(const Dims& dims) const {
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
+    minimum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    all(const Dims& dims) const {
+      return cast<bool>().reduce(dims, internal::AndReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    all() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return cast<bool>().reduce(in_dims, internal::AndReducer());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    any(const Dims& dims) const {
+      return cast<bool>().reduce(dims, internal::OrReducer());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    any() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return cast<bool>().reduce(in_dims, internal::OrReducer());
+    }
+
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorTupleReducerOp<
       internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, NumDimensions>, const Derived>
     argmax() const {
       array<Index, NumDimensions> in_dims;
-      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -599,7 +759,7 @@
       const array<Index, NumDimensions>, const Derived>
     argmin() const {
       array<Index, NumDimensions> in_dims;
-      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -610,7 +770,7 @@
     const TensorTupleReducerOp<
       internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmax(const int return_dim) const {
+    argmax(const Index return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -623,7 +783,7 @@
     const TensorTupleReducerOp<
       internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmin(const int return_dim) const {
+    argmin(const Index return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -632,64 +792,28 @@
         const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>
-    minimum(const Dims& dims) const {
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
-    }
-
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
-    minimum() const {
-      DimensionList<Index, NumDimensions> in_dims;
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
-    }
-
-    // This does not short-circuit, so is potentially very inefficient.
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> >
-    all(const Dims& dims) const {
-      return cast<bool>().reduce(dims, internal::AndReducer());
-    }
-
-    // This does not short-circuit, so is potentially very inefficient.
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
-    all() const {
-      DimensionList<Index, NumDimensions> in_dims;
-      return cast<bool>().reduce(in_dims, internal::AndReducer());
-    }
-
-    // This does not short-circuit, so is potentially very inefficient.
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> >
-    any(const Dims& dims) const {
-      return cast<bool>().reduce(dims, internal::OrReducer());
-    }
-
-    // This does not short-circuit, so is potentially very inefficient.
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
-    any() const {
-      DimensionList<Index, NumDimensions> in_dims;
-      return cast<bool>().reduce(in_dims, internal::OrReducer());
-    }
-
     template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorReductionOp<Reducer, const Dims, const Derived>
     reduce(const Dims& dims, const Reducer& reducer) const {
       return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
     }
 
-    template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorBroadcastingOp<const Broadcast, const Derived>
-    broadcast(const Broadcast& broadcast) const {
-      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTraceOp<const Dims, const Derived>
+    trace(const Dims& dims) const {
+      return TensorTraceOp<const Dims, const Derived>(derived(), dims);
     }
 
-    template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
-    fft(const FFT& fft) const {
-      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
+    const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
+    trace() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
+    }
+
+    template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorBroadcastingOp<const Broadcast, const Derived>
+    broadcast(const Broadcast& bcast) const {
+      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
     }
 
     template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -705,6 +829,30 @@
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+    extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1,
+                          const Index row_stride = 1, const Index col_stride = 1,
+                          const Index in_row_stride = 1, const Index in_col_stride = 1,
+                          const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+                                                                 in_row_stride, in_col_stride, 1, 1, padding_type, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+    extract_image_patches(const Index patch_rows, const Index patch_cols,
+                          const Index row_stride, const Index col_stride,
+                          const Index in_row_stride, const Index in_col_stride,
+                          const Index row_inflate_stride, const Index col_inflate_stride,
+                          const Index padding_top, const Index padding_bottom,
+                          const Index padding_left,const Index padding_right,
+                          const Scalar padding_value) const {
+      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+                                                                 in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
+                                                                 padding_top, padding_bottom, padding_left, padding_right, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
     extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
                            const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
@@ -724,105 +872,6 @@
       return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
     }
 
-    template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Rows, Cols, const Derived>
-    extract_image_patches() const {
-      return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, 1, 1, 1, 1, PADDING_SAME, Scalar(0));
-    }
-
-    template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Rows, Cols, const Derived>
-    extract_image_patches(const PaddingType padding_type) const {
-      return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, 1, 1, 1, 1, padding_type, Scalar(0));
-    }
-
-    template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Rows, Cols, const Derived>
-    extract_image_patches(const Index stride, const PaddingType padding_type) const {
-      return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, 1, 1, 1, 1, padding_type, Scalar(0));
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride = 1, const Index col_stride = 1) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 1, 1, 1, 1, PADDING_SAME, Scalar(0));
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const PaddingType padding_type) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 1, 1, 1, 1, padding_type, Scalar(0));
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const PaddingType padding_type, const Scalar padding_value) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 1, 1, 1, 1, padding_type, padding_value);
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const Index in_row_stride, const Index in_col_stride) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 in_row_stride, in_col_stride, 1, 1, PADDING_SAME, Scalar(0));
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const Index in_row_stride, const Index in_col_stride,
-                          const PaddingType padding_type) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 in_row_stride, in_col_stride, 1, 1, padding_type, Scalar(0));
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const Index in_row_stride, const Index in_col_stride,
-                          const PaddingType padding_type, const Scalar padding_value) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 in_row_stride, in_col_stride, 1, 1, padding_type, padding_value);
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const Index in_row_stride, const Index in_col_stride,
-                          const Index row_inflate_stride, const Index col_inflate_stride,
-                          const PaddingType padding_type, const Scalar padding_value) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
-                                                                 padding_type, padding_value);
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
-    extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const Index in_row_stride, const Index in_col_stride,
-                          const Index row_inflate_stride, const Index col_inflate_stride,
-                          const Index padding_top, const Index padding_bottom,
-                          const Index padding_left,const Index padding_right,
-                          const Scalar padding_value) const {
-      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
-                                                                 in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
-                                                                 padding_top, padding_bottom, padding_left, padding_right, padding_value);
-    }
-
     // Morphing operators.
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorLayoutSwapOp<const Derived>
@@ -863,17 +912,17 @@
     template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorPaddingOp<const PaddingDimensions, const Derived>
     pad(const PaddingDimensions& padding) const {
-      return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, Scalar(0));
+      return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, internal::scalar_cast_op<int, Scalar>()(0));
     }
     template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorPaddingOp<const PaddingDimensions, const Derived>
-    pad (const PaddingDimensions& padding, const Scalar padding_value) const {
+    pad(const PaddingDimensions& padding, const Scalar padding_value) const {
       return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value);
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shuffle) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
     }
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorStridingOp<const Strides, const Derived>
@@ -885,11 +934,15 @@
     inflate(const Strides& strides) const {
       return TensorInflationOp<const Strides, const Derived>(derived(), strides);
     }
+
+    // Returns a tensor containing index/value tuples
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorIndexTupleOp<const Derived>
     index_tuples() const {
       return TensorIndexTupleOp<const Derived>(derived());
     }
+
+    // Support for custom unary and binary operations
     template <typename CustomUnaryFunc>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const {
@@ -910,23 +963,26 @@
   protected:
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
 
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
+    typedef TensorBase<Derived, ReadOnlyAccessors> Base;
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
     typedef typename DerivedTraits::Index Index;
     typedef Scalar CoeffReturnType;
-    typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
     static const int NumDimensions = DerivedTraits::NumDimensions;
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
@@ -945,7 +1001,7 @@
       return derived() = this->template random<RandomGenerator>();
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setValues(
         const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
@@ -1064,13 +1120,13 @@
 
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shuffle) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
-    shuffle(const Shuffle& shuffle) {
-      return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) {
+      return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
     }
 
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1086,17 +1142,35 @@
 
     // Select the device on which to evaluate the expression.
     template <typename DeviceType>
-    TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
-      return TensorDevice<Derived, DeviceType>(device, derived());
+    TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
+      return TensorDevice<Derived, DeviceType>(dev, derived());
+    }
+
+    // Select the async device on which to evaluate the expression.
+    template <typename DeviceType, typename DoneCallback>
+    TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
+      return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
     }
 
  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+      Assign assign(derived(), other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return derived();
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
-
+#endif // EIGEN_PARSED_BY_DOXYGEN
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 9ca96d4..1e55d12 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h

@@ -1,643 +1,1559 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
 
 namespace Eigen {
-
-/** \class TensorBlock
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor block class.
-  *
-  * This class represents a tensor block specified by the index of the
-  * first block coefficient, and the size of the block in each dimension.
-  *
-  */
-
 namespace internal {
 
-template <typename Index, typename Scalar, std::size_t NumDims, int Layout>
-class TensorBlock {
- public:
-  typedef DSizes<Index, NumDims> Dimensions;
+// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO;
 
-  TensorBlock(const Index first_coeff_index,
-              const Dimensions& block_sizes,
-              const Dimensions& block_strides,
-              const Dimensions& tensor_strides,
-              Scalar* data)
-      : m_first_coeff_index(first_coeff_index),
-        m_block_sizes(block_sizes),
-        m_block_strides(block_strides),
-        m_tensor_strides(tensor_strides),
-        m_data(data) {}
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for densely stored buffer of given
+// dimensions.
 
-  Index first_coeff_index() const { return m_first_coeff_index; }
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const DSizes<IndexType, NumDims>& dimensions) {
+  DSizes<IndexType, NumDims> strides;
+  if (NumDims == 0) return strides;
 
-  const Dimensions& block_sizes() const { return m_block_sizes; }
+  // TODO(ezhulenev): Use templates to unroll this loop (similar to
+  // h_array_reduce in CXX11meta.h)? Benchmark it.
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * dimensions[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dimensions[i + 1];
+    }
+  }
 
-  const Dimensions& block_strides() const { return m_block_strides; }
+  return strides;
+}
 
-  const Dimensions& tensor_strides() const { return m_tensor_strides; }
+template <int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const Eigen::array<IndexType, NumDims>& dimensions) {
+  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
 
-  Scalar* data() { return m_data; }
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+    const Sizes<Indices...>& sizes) {
+  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
 
-  const Scalar* data() const { return m_data; }
+// -------------------------------------------------------------------------- //
+
+// Tensor block shape type defines what are the shape preference for the blocks
+// extracted from the larger tensor.
+//
+// Example: blocks of 100 elements from the large 100x100 tensor:
+// - tensor: 100x100
+// - target_block_size: 100
+//
+// TensorBlockShapeType:
+//  - kUniformAllDims: 100 blocks of size 10x10
+//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
+//                      or row major layout)
+enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
+
+struct TensorBlockResourceRequirements {
+  TensorBlockShapeType shape_type;  // target block shape
+  size_t size;                      // target block size
+  TensorOpCost cost_per_coeff;      // cost of computing a single block element
+
+#ifdef EIGEN_HIPCC
+  // For HIPCC, we need to explicitly declare as a "device fun", the constructor
+  // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
+  // errors out complaining about the lack of a matching constructor
+  EIGEN_DEVICE_FUNC
+  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
+				  TensorOpCost cost_)
+    : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
+  {}
+#endif
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes,
+      TensorOpCost cost) {
+    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+    return {shape_type, size, cost};
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes) {
+    // This default cost per coefficient is valid for most materialized tensor
+    // block evaluation implementations, because they typically just read
+    // coefficients from the underlying tensor storage, and write to the tensor
+    // block buffer (scratch or destination memory, reads and writes have linear
+    // access pattern). We ignore the fixed cost of block evaluation, because in
+    // practice it should negligible.
+    //
+    // Lazy block evaluation adds the cost of calling a functor for each
+    // coefficient.
+    //
+    // All non-trivial block evaluation implementations must provide their own
+    // cost approximation (e.g. shuffling inner dimension has a much higher cost
+    // because it reads memory randomly, although the total number of moved
+    // bytes is the same).
+    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+                                    {/*bytes_loaded=*/sizeof(Scalar),
+                                     /*bytes_stored=*/sizeof(Scalar),
+                                     /*compute_cycles=*/0});
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
+                                    size_in_bytes);
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
+                                    size_in_bytes);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
+  merge(const TensorBlockResourceRequirements& lhs,
+        const TensorBlockResourceRequirements& rhs) {
+    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
+            merge(lhs.size, rhs.size),                       // size
+            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
+  }
+
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
+      TensorOpCost cost) {
+    cost_per_coeff += cost;
+    return *this;
+  }
+
+  // This is a resource requirement that should be returned from expressions
+  // that do not have any block evaluation preference (e.g. default tensor
+  // expression with raw buffer access).
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
+  }
 
  private:
-  Index m_first_coeff_index;
-  Dimensions m_block_sizes;
-  Dimensions m_block_strides;
-  Dimensions m_tensor_strides;
-  Scalar* m_data;  // Not owned.
-};
+  using Requirements = TensorBlockResourceRequirements;
 
-template <typename Index, typename Scalar, bool Vectorizable>
-struct TensorBlockCopyOp {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_copy, const Index dst_index,
-      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, const Index src_index,
-      const Index src_stride, const Scalar* EIGEN_RESTRICT src_data) {
-    for (Index i = 0; i < num_coeff_to_copy; ++i) {
-      dst_data[dst_index + i * dst_stride] =
-          src_data[src_index + i * src_stride];
-    }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+    return numext::maxi(lhs_size, rhs_size);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockShapeType
+  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
+    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+            rhs == TensorBlockShapeType::kSkewedInnerDims)
+               ? TensorBlockShapeType::kSkewedInnerDims
+               : TensorBlockShapeType::kUniformAllDims;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
+                                                TensorOpCost rhs_cost) {
+    return lhs_cost + rhs_cost;
   }
 };
 
-// NOTE: Benchmarks run on an implementation of this that broke each of the
-// loops in these conditionals into it's own template specialization (to
-// avoid conditionals in the caller's loop) did not show an improvement.
-template <typename Index, typename Scalar>
-struct TensorBlockCopyOp<Index, Scalar, true> {
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_copy, const Index dst_index,
-      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const Index src_index, const Index src_stride,
-      const Scalar* EIGEN_RESTRICT src_data) {
-    if (src_stride == 1) {
-      const Index packet_size = internal::unpacket_traits<Packet>::size;
-      const Index vectorized_size =
-          (num_coeff_to_copy / packet_size) * packet_size;
-      if (dst_stride == 1) {
-        // LINEAR
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::ploadt<Packet, Unaligned>(
-              src_data + src_index + i);
-          internal::pstoret<Scalar, Packet, Unaligned>(
-              dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index + i];
-        }
-      } else {
-        // SCATTER
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::ploadt<Packet, Unaligned>(
-              src_data + src_index + i);
-          internal::pscatter<Scalar, Packet>(
-              dst_data + dst_index + i * dst_stride, p, dst_stride);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] = src_data[src_index + i];
-        }
-      }
-    } else {
-      if (dst_stride == 1) {
-        // GATHER
-        const Index packet_size = internal::unpacket_traits<Packet>::size;
-        const Index vectorized_size =
-            (num_coeff_to_copy / packet_size) * packet_size;
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pgather<Scalar, Packet>(
-              src_data + src_index + i * src_stride, src_stride);
-          internal::pstoret<Scalar, Packet, Unaligned>(
-              dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index + i * src_stride];
-        }
-      } else {
-        // RANDOM
-        for (Index i = 0; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] =
-              src_data[src_index + i * src_stride];
-        }
-      }
-    }
-  }
-};
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
 
-/** \class TensorBlockIO
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor block IO class.
-  *
-  * This class is responsible for copying data between a tensor and a tensor
-  * block.
-  *
-  */
-template <typename Index, typename Scalar, std::size_t NumDims, int Layout,
-          bool Vectorizable, bool BlockRead>
-class TensorBlockIO {
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
  public:
-  typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout>
-    TensorBlock;
-  typedef typename internal::TensorBlockCopyOp<Index, Scalar, Vectorizable>
-    TensorBlockCopyOp;
+  typedef DSizes<IndexType, NumDims> Dimensions;
 
- protected:
-  struct BlockIteratorState {
-    Index input_stride;
-    Index output_stride;
-    Index input_span;
-    Index output_span;
-    Index size;
-    Index count;
+  // If we evaluate a Tensor assignment, and expression on the left, already has
+  // a memory buffer, then we might do performance optimization, and evaluate
+  // the root expression directly into the final output memory. Some time it's
+  // possible to reuse it for materializing subexpressions inside an expression
+  // tree, to to avoid dynamic memory allocation.
+  //
+  // The pointer type of the underlying storage is erased, because passing
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice destination buffer type should always match the
+  // evaluated expression scalar type.
+  class DestinationBuffer {
+   public:
+    enum DestinationBufferKind : int {
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible")
+      // which is issued for class members with the enum type.
+      // TODO(rocm):
+      // remove the "int" basetype once HIPCC has been fixed to not error out
+      // in the above scenario.
+
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // Tensor block defined by an owning tensor block descriptor can fit
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use to build Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match strides of the contiguously
+      // stored block, and it's impossible to define a TensorMap over this
+      // buffer. However if we are evaluating a root of an expression tree, we
+      // still can materialize an output into this destination, because we can
+      // guarantee that no one will ever access it through block API.
+      //
+      // In theory it is possible to build valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of fast block
+      // evaluation API.
+      kStrided
+    };
+
+    template <typename Scalar>
+    Scalar* data() const {
+      eigen_assert(m_data_type_size == sizeof(Scalar));
+      return static_cast<Scalar*>(m_data);
+    }
+
+    const Dimensions& strides() const { return m_strides; }
+    const DestinationBufferKind& kind() const { return m_kind; }
+
+   private:
+    friend class TensorBlockDescriptor;
+
+    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
+
+    template <typename Scalar>
+    DestinationBuffer(Scalar* data, const Dimensions& strides,
+                      DestinationBufferKind kind)
+        : m_data(static_cast<void*>(data)),
+          m_data_type_size(sizeof(Scalar)),
+          m_strides(strides),
+          m_kind(kind) {}
+
+    template <int Layout, typename Scalar>
+    static DestinationBuffer make(const TensorBlockDescriptor& desc,
+                                  Scalar* data, const Dimensions& strides) {
+      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
+    }
+
+    template <int Layout>
+    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+                                      const Dimensions& strides) {
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+      for (int i = 0; i < NumDims; ++i) {
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != strides[i]) return kStrided;
+      }
+      return kContiguous;
+    }
+
+    // Storage pointer is type erased, to reduce template bloat, but we still
+    // keep the size of the underlying element type for error checking.
+    void* m_data;
+    size_t m_data_type_size;
+
+    // Destination buffer dimensions always match the dimensions of a tensor
+    // block descriptor it belongs to, however strides might be different.
+    Dimensions m_strides;
+
+    DestinationBufferKind m_kind;
   };
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
-      const TensorBlock& block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, const Scalar* src_data,
-      Scalar* dst_data) {
-    // Calculate strides and dimensions.
-    const Index block_dim_for_tensor_stride1_dim =
-        NumDims == 0 ? 1 :
-        tensor_to_block_dim_map[static_cast<int>(Layout) ==
-                                        static_cast<int>(ColMajor)
-                                    ? 0
-                                    : NumDims - 1];
-    const size_t block_inner_dim_size =
-        NumDims == 0 ? 1 :
-        block.block_sizes()[block_dim_for_tensor_stride1_dim];
-    const size_t block_outer_dim_size =
-        NumDims == 0 ? 1 :
-        block.block_sizes().TotalSize() / block_inner_dim_size;
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
+                        const DestinationBuffer& destination)
+      : m_offset(offset),
+        m_dimensions(dimensions),
+        m_destination(destination) {}
 
-    Index inputIndex;
-    Index outputIndex;
-    Index input_stride;
-    Index output_stride;
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
+      : m_offset(offset),
+        m_dimensions(dimensions),
+        m_destination(DestinationBuffer()) {}
 
-    // Setup strides to read/write along the tensor's stride1 dimension.
-    if (BlockRead) {
-      inputIndex = first_coeff_index;
-      outputIndex = 0;
-      input_stride = 1;
-      output_stride = NumDims == 0 ? 1
-          : block.block_strides()[block_dim_for_tensor_stride1_dim];
-    } else {
-      inputIndex = 0;
-      outputIndex = first_coeff_index;
-      input_stride = NumDims == 0 ? 1
-          : block.block_strides()[block_dim_for_tensor_stride1_dim];
-      output_stride = 1;
+  IndexType offset() const { return m_offset; }
+  const Dimensions& dimensions() const { return m_dimensions; }
+  IndexType dimension(int index) const { return m_dimensions[index]; }
+  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
+
+  const DestinationBuffer& destination() const { return m_destination; }
+
+  template <int Layout, typename Scalar>
+  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+    eigen_assert(dst_base != NULL);
+    m_destination =
+        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
+  }
+
+  template <int Layout, typename Scalar, typename DstStridesIndexType>
+  void AddDestinationBuffer(
+      Scalar* dst_base,
+      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
+    // DSizes constructor will do index type promotion if it's safe.
+    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
+  }
+
+  TensorBlockDescriptor& DropDestinationBuffer() {
+    m_destination.m_data = NULL;
+    m_destination.m_kind = DestinationBuffer::kEmpty;
+    return *this;
+  }
+
+  bool HasDestinationBuffer() const {
+    return m_destination.kind() != DestinationBuffer::kEmpty;
+  }
+
+  // Returns a copy of `*this` with updated offset.
+  TensorBlockDescriptor WithOffset(IndexType offset) const {
+    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+  }
+
+ private:
+  // Offset and dimensions are immutable after construction. Block descriptor
+  // can only be mutated by adding or dropping destination.
+  const IndexType m_offset;
+  const Dimensions m_dimensions;
+  DestinationBuffer m_destination;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
+
+template <int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorBlockMapper {
+  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  TensorBlockMapper() = default;
+  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
+                    const TensorBlockResourceRequirements& requirements)
+      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
+    // Compute block dimensions and the total number of blocks.
+    InitializeBlockDimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
+    return m_total_block_count;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
+    return m_block_dimensions.TotalSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
+  blockDimensions() const {
+    return m_block_dimensions;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
+  blockDescriptor(IndexType block_index) const {
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    IndexType offset = 0;
+    DSizes<IndexType, NumDims> dimensions;
+
+    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
+
+    // Iterate outer -> inner dimensions.
+    for (int i = NumDims - 1; i >= 0; --i) {
+      const int dim = isColMajor ? i : NumDims - i - 1;
+
+      const IndexType idx = block_index / m_block_strides[dim];
+      block_index -= idx * m_block_strides[dim];
+
+      const IndexType coord = idx * m_block_dimensions[dim];
+      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
+                                     m_block_dimensions[dim]);
+      offset += coord * m_tensor_strides[dim];
     }
 
-    const std::size_t at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
-    array<BlockIteratorState, at_least_1_dim> block_iter_state;
+    return {offset, dimensions};
+  }
 
-    // Initialize block iterator state.
-    for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
-      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-                          ? i + 1
-                          : NumDims - i - 2;
-      block_iter_state[i].size =
-          block.block_sizes()[tensor_to_block_dim_map[dim]];
-      if (BlockRead) {
-        block_iter_state[i].input_stride = tensor_strides[dim];
-        block_iter_state[i].output_stride =
-            block.block_strides()[tensor_to_block_dim_map[dim]];
-      } else {
-        block_iter_state[i].input_stride =
-            block.block_strides()[tensor_to_block_dim_map[dim]];
-        block_iter_state[i].output_stride = tensor_strides[dim];
+ private:
+  void InitializeBlockDimensions() {
+    // Requested block shape and size.
+    const TensorBlockShapeType shape_type = m_requirements.shape_type;
+    IndexType target_block_size =
+        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
+
+    IndexType tensor_size = m_tensor_dimensions.TotalSize();
+
+    // Corner case: one of the dimensions is zero. Logic below is too complex
+    // to handle this case on a general basis, just use unit block size.
+    // Note: we must not yield blocks with zero dimensions (recipe for
+    // overflows/underflows, divisions by zero and NaNs later).
+    if (tensor_size == 0) {
+      for (int i = 0; i < NumDims; ++i) {
+        m_block_dimensions[i] = 1;
       }
-      block_iter_state[i].input_span =
-          block_iter_state[i].input_stride * (block_iter_state[i].size - 1);
-      block_iter_state[i].output_span =
-          block_iter_state[i].output_stride * (block_iter_state[i].size - 1);
-      block_iter_state[i].count = 0;
+      m_total_block_count = 0;
+      return;
+    }
+
+    // If tensor fits into a target block size, evaluate it as a single block.
+    if (tensor_size <= target_block_size) {
+      m_block_dimensions = m_tensor_dimensions;
+      m_total_block_count = 1;
+      // The only valid block index is `0`, and in this case we do not need
+      // to compute real strides for tensor or blocks (see blockDescriptor).
+      for (int i = 0; i < NumDims; ++i) {
+        m_tensor_strides[i] = 0;
+        m_block_strides[i] = 1;
+      }
+      return;
+    }
+
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    // Block shape skewed towards inner dimension.
+    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
+      IndexType coeff_to_allocate = target_block_size;
+
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+        m_block_dimensions[dim] =
+            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+        coeff_to_allocate = divup(
+            coeff_to_allocate,
+            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+      }
+      eigen_assert(coeff_to_allocate == 1);
+
+    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+      // Tensor will not fit within 'target_block_size' budget: calculate tensor
+      // block dimension sizes based on "square" dimension size target.
+      const IndexType dim_size_target = convert_index<IndexType>(
+          std::pow(static_cast<float>(target_block_size),
+                   1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+      for (int i = 0; i < NumDims; ++i) {
+        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+        // a multiple of the packet size. Note that reducing
+        // 'block_dim_size' in this manner can increase the number of
+        // blocks, and so will amplify any per-block overhead.
+        m_block_dimensions[i] =
+            numext::mini(dim_size_target, m_tensor_dimensions[i]);
+      }
+
+      // Add any un-allocated coefficients to inner dimension(s).
+      IndexType total_size = m_block_dimensions.TotalSize();
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+
+        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+          const IndexType total_size_other_dims =
+              total_size / m_block_dimensions[dim];
+          const IndexType alloc_avail =
+              divup<IndexType>(target_block_size, total_size_other_dims);
+          if (alloc_avail == m_block_dimensions[dim]) {
+            // Insufficient excess coefficients to allocate.
+            break;
+          }
+          m_block_dimensions[dim] =
+              numext::mini(m_tensor_dimensions[dim], alloc_avail);
+          total_size = total_size_other_dims * m_block_dimensions[dim];
+        }
+      }
+
+    } else {
+      eigen_assert(false);  // unknown block shape
+    }
+
+    eigen_assert(m_block_dimensions.TotalSize() >=
+                 numext::mini<IndexType>(target_block_size,
+                                         m_tensor_dimensions.TotalSize()));
+
+    // Calculate block counts by dimension and total block count.
+    DSizes<IndexType, NumDims> block_count;
+    for (int i = 0; i < NumDims; ++i) {
+      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+    }
+    m_total_block_count = array_prod(block_count);
+
+    // Calculate block strides (used for enumerating blocks).
+    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
+    m_block_strides = strides<Layout>(block_count);
+  }
+
+  DSizes<IndexType, NumDims> m_tensor_dimensions;
+  TensorBlockResourceRequirements m_requirements;
+
+  DSizes<IndexType, NumDims> m_block_dimensions;
+  IndexType m_total_block_count;
+
+  DSizes<IndexType, NumDims> m_tensor_strides;
+  DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// are happening in the same order, and usually have exactly the same size.
+// Scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device)
+      : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation att current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have existing allocation, and its size is larger or equal to
+    // requested size, we do nothing.
+
+    // If current allocation can't fit requested size, we deallocate it, and
+    // replace with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have and existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds, that can be produced by
+// TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to defined TensorBlock typedef in
+// TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+  typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts Scalar type from the Eigen expressions (if expression type
+// is not void). It's required to be able to define lazy block expression for
+// argument types, that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+  typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+  typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions, that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g. we materialize all broadcasts into evaluated blocks.
+//
+// TensorMaterializedBlock does not own its memory buffer, it's either a memory
+// buffer that backs the original expression (e.g. block is just a view into a
+// Tensor), or a memory buffer allocated with scratch allocator, and in this
+// case the scratch allocator will deallocate it at the end of block based
+// expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
+
+template <typename Scalar, int NumDims, int Layout,
+          typename IndexType = Eigen::Index>
+class TensorMaterializedBlock {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
+
+  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
+                          const Dimensions& dimensions, bool valid_expr = true)
+      : m_kind(kind),
+        m_data(data),
+        m_dimensions(dimensions),
+        m_expr(m_data, m_dimensions),
+        m_valid_expr(valid_expr) {
+    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
+  }
+
+  TensorBlockKind kind() const { return m_kind; }
+  // NOTE(ezhulenev): Returning XprType by value like in other block types
+  // causes asan failures. The theory is that XprType::Nested doesn't work
+  // properly for TensorMap.
+  const XprType& expr() const {
+    eigen_assert(m_valid_expr);
+    return m_expr;
+  }
+  const Scalar* data() const { return m_data; }
+  void cleanup() {}
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+  // TensorMaterializedBlock can be backed by different types of storage:
+  //
+  //   (1) Contiguous block of memory allocated with scratch allocator.
+  //   (2) Contiguous block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //   (3) Strided block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //
+  class Storage {
+   public:
+    Scalar* data() const { return m_data; }
+    const Dimensions& dimensions() const { return m_dimensions; }
+    const Dimensions& strides() const { return m_strides; }
+
+    TensorMaterializedBlock AsTensorMaterializedBlock() const {
+      return TensorMaterializedBlock(
+          m_materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          m_data, m_dimensions, !m_strided_storage);
+    }
+
+   private:
+    friend class TensorMaterializedBlock;
+
+    Storage(Scalar* data, const Dimensions& dimensions,
+            const Dimensions& strides, bool materialized_in_output,
+            bool strided_storage)
+        : m_data(data),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_materialized_in_output(materialized_in_output),
+          m_strided_storage(strided_storage) {}
+
+    Scalar* m_data;
+    Dimensions m_dimensions;
+    Dimensions m_strides;
+    bool m_materialized_in_output;
+    bool m_strided_storage;
+  };
+
+  // Creates a storage for materialized block either from the block descriptor
+  // destination buffer, or allocates a new buffer with scratch allocator.
+  template <typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static Storage prepareStorage(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch,
+      bool allow_strided_storage = false) {
+    // Try to reuse destination as an output block buffer.
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
+  // Creates a materialized block for the given descriptor from a memory buffer.
+  template <typename DataDimensions, typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+      const Scalar* data, const DataDimensions& data_dims,
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If a tensor block dimensions covers a contiguous block of the underlying
+    // memory, we can skip block buffer memory allocation, and construct a block
+    // from existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:          [11, 12, 13, 14]
+    //   desc.dimensions():  [1,   1,  3, 14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All the outer dimensions must be of size `1`, except a single dimension
+    // before the matching inner dimension (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+                                     block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+          TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+                           data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+      type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename LhsTensorBlock::XprType>::value ||
+      internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+                          const typename RhsTensorBlock::XprType> >::type
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+                         const RhsTensorBlock& right_block,
+                         const BinaryOp& functor)
+      : m_left_block(left_block),
+        m_right_block(right_block),
+        m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const {
+    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+  }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+  typedef typename ArgTensorBlock::XprType ArgXprType;
+  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+                       const BlockFactory& factory)
+      : m_arg_block(arg_block), m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+          typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+  typedef typename Arg1TensorBlock::XprType Arg1XprType;
+  typedef typename Arg2TensorBlock::XprType Arg2XprType;
+  typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+                                       internal::is_void<Arg2XprType>::value ||
+                                       internal::is_void<Arg3XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+                                              Arg3XprType>::type>::type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+                         const Arg2TensorBlock& arg2_block,
+                         const Arg3TensorBlock& arg3_block,
+                         const BlockFactory& factory)
+      : m_arg1_block(arg1_block),
+        m_arg2_block(arg2_block),
+        m_arg3_block(arg3_block),
+        m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const {
+    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
+                          m_arg3_block.expr());
+  }
+  const Scalar* data() const { return NULL; }
+  void cleanup() {
+    m_arg1_block.cleanup();
+    m_arg2_block.cleanup();
+    m_arg3_block.cleanup();
+  }
+
+ private:
+  Arg1TensorBlock m_arg1_block;
+  Arg2TensorBlock m_arg2_block;
+  Arg3TensorBlock m_arg3_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// StridedLinearBufferCopy provides a method to copy data between two linear
+// buffers with different strides, with optimized paths for scatter/gather.
+
+template <typename Scalar, typename IndexType>
+class StridedLinearBufferCopy {
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+ public:
+  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
+  enum class Kind {
+    Linear = 0,       // src_stride == 1 && dst_stride == 1
+    Scatter = 1,      // src_stride == 1 && dst_stride != 1
+    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
+    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
+    Gather = 4,       // dst_stride == 1
+    Random = 5        // everything else
+  };
+
+  struct Dst {
+    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    Scalar* data;
+  };
+
+  struct Src {
+    Src(IndexType o, IndexType s, const Scalar* d)
+        : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    const Scalar* data;
+  };
+
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
+                                                        const Src& src,
+                                                        const size_t count) {
+    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
+              src.data);
+  }
+
+ private:
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const IndexType count, const IndexType dst_offset,
+      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const IndexType src_offset, const IndexType src_stride,
+      const Scalar* EIGEN_RESTRICT src_data) {
+    const Scalar* src = &src_data[src_offset];
+    Scalar* dst = &dst_data[dst_offset];
+
+    if (!Vectorizable) {
+      for (Index i = 0; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+      return;
+    }
+
+    const IndexType vectorized_size = count - PacketSize;
+    IndexType i = 0;
+
+    if (kind == StridedLinearBufferCopy::Kind::Linear) {
+      // ******************************************************************** //
+      // Linear copy from `src` to `dst`.
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      eigen_assert(src_stride == 1 && dst_stride == 1);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          Packet p = ploadu<Packet>(src + i + j * PacketSize);
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
+      // Scatter from `src` to `dst`.
+      eigen_assert(src_stride == 1 && dst_stride != 1);
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
+      // Fill `dst` with value at `*src`.
+      eigen_assert(src_stride == 0 && dst_stride == 1);
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      Packet p = pload1<Packet>(src);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i <= vectorized_size; i += PacketSize) {
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = *src;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
+      // Scatter `*src` into `dst`.
+      eigen_assert(src_stride == 0 && dst_stride != 1);
+      Packet p = pload1<Packet>(src);
+      for (; i <= vectorized_size; i += PacketSize) {
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = *src;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
+      // Gather from `src` into `dst`.
+      eigen_assert(dst_stride == 1);
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i * src_stride];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
+      // Random.
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+    } else {
+      eigen_assert(false);
+    }
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
+// It's possible to specify src->dst dimension mapping for the copy operation.
+// Dimensions of `dst` specify how many elements have to be copied, for the
+// `src` we need to know only stride to navigate through source memory buffer.
+
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO {
+  static const bool IsColMajor = (Layout == ColMajor);
+
+  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef DSizes<int, NumDims> DimensionsMap;
+
+  struct Dst {
+    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
+        IndexType dst_offset = 0)
+        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  struct Src {
+    Src(const Dimensions& src_strides, const Scalar* src,
+        IndexType src_offset = 0)
+        : strides(src_strides), data(src), offset(src_offset) {}
+
+    Dimensions strides;
+    const Scalar* data;
+    IndexType offset;
+  };
+
+  // Copies data to `dst` from `src`, using provided dimensions mapping:
+  //
+  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
+  //
+  // Returns the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
+      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
+    // Copy single scalar value from `src` to `dst`.
+    if (NumDims == 0) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Both `dst` and `src` must have contiguous innermost dimension. We also
+    // accept the special case with stride '0', because it's used as a trick to
+    // implement broadcasting.
+    {
+      int inner_dim = IsColMajor ? 0 : NumDims - 1;
+      EIGEN_UNUSED_VARIABLE(inner_dim);
+      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
+      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
+    }
+
+    // Give a shorter name to `dst_to_src_dim_map`.
+    const DimensionsMap& dim_map = dst_to_src_dim_map;
+
+    // Do not squeeze reordered inner dimensions.
+    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
+
+    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
+    // block, and we write data linearly into that dimension, reading it from
+    // the src. If dimensions are reordered, we might end up reading data from
+    // the src with `stride != 1`.
+    //
+    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
+    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
+
+    // Find the innermost dimension in the dst whose size is not 1. This is the
+    // effective inner dim.
+    int num_size_one_inner_dims = 0;
+    for (int i = 0; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      if (dst.dims[dst_dim] != 1) break;
+      num_size_one_inner_dims++;
+    }
+
+    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
+    if (num_size_one_inner_dims == NumDims) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
+    const int dst_stride1_dim = IsColMajor
+                                    ? num_size_one_inner_dims
+                                    : NumDims - num_size_one_inner_dims - 1;
+
+    // Dimension in the src that corresponds to the dst innermost dimension.
+    const int src_dim_for_dst_stride1_dim =
+        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
+
+    // Size of the innermost dimension (length of contiguous blocks of memory).
+    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
+
+    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
+    // `src` memory, so we can do less linear copy calls.
+    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      const IndexType dst_stride = dst.strides[dst_dim];
+      const IndexType src_stride = src.strides[dim_map[dst_dim]];
+      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
+        dst_inner_dim_size *= dst.dims[dst_dim];
+        ++num_size_one_inner_dims;
+      } else {
+        break;
+      }
+    }
+
+    // Setup strides to read data from `src` and write to `dst`.
+    IndexType input_offset = src.offset;
+    IndexType output_offset = dst.offset;
+    IndexType input_stride =
+        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
+    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
+
+    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+    array<BlockIteratorState, at_least_1_dim> it;
+
+    // Initialize block iterator state. Squeeze away any dimension of size 1.
+    int idx = 0;  // currently initialized iterator state index
+    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      if (dst.dims[dst_dim] == 1) continue;
+
+      it[idx].size = dst.dims[dst_dim];
+      it[idx].input_stride = src.strides[dim_map[dst_dim]];
+      it[idx].output_stride = dst.strides[dst_dim];
+
+      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+
+      idx++;
     }
 
     // Iterate copying data from src to dst.
-    for (Index i = 0; i < block_outer_dim_size; ++i) {
-      TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
-                             dst_data, inputIndex, input_stride, src_data);
+    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+
+#define COPY_INNER_DIM(KIND)                                           \
+  IndexType num_copied = 0;                                            \
+  for (num_copied = 0; num_copied < block_total_size;                  \
+       num_copied += dst_inner_dim_size) {                             \
+    LinCopy::template Run<KIND>(                                       \
+        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
+        typename LinCopy::Src(input_offset, input_stride, src.data),   \
+        dst_inner_dim_size);                                           \
+                                                                       \
+    for (int j = 0; j < idx; ++j) {                                    \
+      if (++it[j].count < it[j].size) {                                \
+        input_offset += it[j].input_stride;                            \
+        output_offset += it[j].output_stride;                          \
+        break;                                                         \
+      }                                                                \
+      it[j].count = 0;                                                 \
+      input_offset -= it[j].input_span;                                \
+      output_offset -= it[j].output_span;                              \
+    }                                                                  \
+  }                                                                    \
+  return num_copied;
+
+    if (input_stride == 1 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Linear);
+    } else if (input_stride == 1 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Scatter);
+    } else if (input_stride == 0 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
+    } else if (input_stride == 0 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
+    } else if (output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Gather);
+    } else {
+      COPY_INNER_DIM(LinCopy::Kind::Random);
+    }
+
+#undef COPY_INNER_DIM
+  }
+
+  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
+  // the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
+                                                              const Src& src) {
+    DimensionsMap dst_to_src_map;
+    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
+    return Copy(dst, src, dst_to_src_map);
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          input_stride(0),
+          output_stride(0),
+          input_span(0),
+          output_span(0) {}
+
+    IndexType size;
+    IndexType count;
+    IndexType input_stride;
+    IndexType output_stride;
+    IndexType input_span;
+    IndexType output_span;
+  };
+
+  // Compute how many inner dimensions it's allowed to squeeze when doing IO
+  // between two tensor blocks. It's safe to squeeze inner dimensions, only
+  // if they are not reordered.
+  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
+    int num_squeezable_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      if (dim_map[dim] != dim) break;
+      num_squeezable_dims++;
+    }
+    return num_squeezable_dims;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
+//
+// Currently there is no way to write from a Tensor expression to a block of
+// memory, if dimensions are reordered. If you need to do that, you should
+// materialize a Tensor block expression into a memory buffer, and then use
+// TensorBlockIO to copy data between two memory buffers with a custom
+// `target->src` dimension map (see definition above).
+//
+// Also currently the innermost dimension of `target` must have a stride '1'
+// (contiguous in memory). This restriction could be lifted with a `pscatter`,
+// but in practice it's never needed, and there is a similar TensorBlockIO
+// workaround for that.
+//
+// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
+// where `src` is a tensor expression. Explore if it is possible to rewrite IO
+// to use expressions instead of pointers, and after that TensorBlockAssignment
+// will become an alias to IO.
+template <typename Scalar, int NumDims, typename TensorBlockExpr,
+          typename IndexType = Eigen::Index>
+class TensorBlockAssignment {
+  // We will use coeff/packet path to evaluate block expressions.
+  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+      TensorBlockEvaluator;
+
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+  template <bool Vectorizable, typename Evaluator>
+  struct InnerDimAssign {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      for (IndexType i = 0; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+  template <typename Evaluator>
+  struct InnerDimAssign<true, Evaluator> {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      typedef typename packet_traits<Scalar>::type Packet;
+
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      const IndexType vectorized_size = count - PacketSize;
+      IndexType i = 0;
+
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          const IndexType idx = eval_offset + i + j * PacketSize;
+          Packet p = eval.template packet<Unaligned>(idx);
+          pstoreu<Scalar>(target + i + j * PacketSize, p);
+        }
+      }
+
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = eval.template packet<Unaligned>(eval_offset + i);
+        pstoreu<Scalar>(target + i, p);
+      }
+
+      for (; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+ public:
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides,
+           Scalar* target_data, IndexType target_offset = 0)
+        : dims(target_dims),
+          strides(target_strides),
+          data(target_data),
+          offset(target_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  static Target target(const Dimensions& target_dims,
+                       const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+  static Target target(
+      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+      Scalar* target_data, IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides),
+                  target_data, target_offset);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Target& target, const TensorBlockExpr& expr) {
+    // Prepare evaluator for block expression.
+    DefaultDevice default_device;
+    TensorBlockEvaluator eval(expr, default_device);
+
+    // Tensor block expression dimension should match destination dimensions.
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+    static const int Layout = TensorBlockEvaluator::Layout;
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Initialize output inner dimension size based on a layout.
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
+
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
+    IndexType num_squeezed_dims = 0;
+    for (Index i = 1; i < NumDims; ++i) {
+      const Index dim = is_col_major ? i : NumDims - i - 1;
+      const IndexType target_stride = target.strides[dim];
+
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
+        num_squeezed_dims++;
+      } else {
+        break;
+      }
+    }
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+
+    int idx = 0;  // currently initialized iterator state index
+    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
+      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
+
+      it[idx].count = 0;
+      it[idx].size = target.dims[dim];
+      it[idx].output_stride = target.strides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // We read block expression from the beginning, and start writing data to
+    // `target` at given offset.
+    IndexType input_offset = 0;
+    IndexType output_offset = target.offset;
+
+    // Iterate copying data from `eval` to `target`.
+    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
+      // Assign to `target` at current offset.
+      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
+                     TensorBlockEvaluator>::Run(target.data + output_offset,
+                                                output_inner_dim_size, eval,
+                                                input_offset);
+
+      // Move input offset forward by the number of assigned coefficients.
+      input_offset += output_inner_dim_size;
+
       // Update index.
-      for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
-        if (++block_iter_state[i].count < block_iter_state[i].size) {
-          inputIndex += block_iter_state[i].input_stride;
-          outputIndex += block_iter_state[i].output_stride;
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
           break;
         }
-        block_iter_state[i].count = 0;
-        inputIndex -= block_iter_state[i].input_span;
-        outputIndex -= block_iter_state[i].output_span;
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
       }
     }
   }
-};
-
-/** \class TensorBlockReader
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor block reader class.
-  *
-  * This class is responsible for reading a tensor block.
-  *
-  */
-
-template <typename Index, typename Scalar, std::size_t NumDims, int Layout,
-          bool Vectorizable>
-class TensorBlockReader : public TensorBlockIO<Index, Scalar, NumDims,
-                                               Layout, Vectorizable, true> {
- public:
-  typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout>
-      TensorBlock;
-  typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, true>
-      Base;
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      TensorBlock* block, const Scalar* src_data) {
-    array<Index, NumDims> tensor_to_block_dim_map;
-    for (int i = 0; i < NumDims; ++i) {
-      tensor_to_block_dim_map[i] = i;
-    }
-    Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
-               block->tensor_strides(), src_data, block->data());
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      TensorBlock* block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, const Scalar* src_data) {
-    Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
-               tensor_strides, src_data, block->data());
-  }
-};
-
-/** \class TensorBlockWriter
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor block writer class.
-  *
-  * This class is responsible for writing a tensor block.
-  *
-  */
-
-template <typename Index, typename Scalar, std::size_t NumDims, int Layout,
-          bool Vectorizable>
-class TensorBlockWriter : public TensorBlockIO<Index, Scalar, NumDims,
-                                               Layout, Vectorizable, false> {
- public:
-  typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout>
-      TensorBlock;
-  typedef TensorBlockIO<Index, Scalar, NumDims, Layout, Vectorizable, false>
-      Base;
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const TensorBlock& block, Scalar* dst_data) {
-    array<Index, NumDims> tensor_to_block_dim_map;
-    for (int i = 0; i < NumDims; ++i) {
-      tensor_to_block_dim_map[i] = i;
-    }
-    Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
-               block.tensor_strides(), block.data(), dst_data);
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const TensorBlock& block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, Scalar* dst_data) {
-    Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
-               tensor_strides, block.data(), dst_data);
-  }
-};
-
-enum TensorBlockShapeType {
-  kUniformAllDims,
-  kSkewedInnerDims,
-};
-
-struct TensorOpResourceRequirements {
-  TensorBlockShapeType block_shape;
-  std::size_t block_total_size;
-  // TODO(andydavis) Add 'target_num_threads' to support communication of
-  // thread-resource requirements. This will allow ops deep in the
-  // expression tree (like reductions) to communicate resources
-  // requirements based on local state (like the total number of reductions
-  // to be computed).
-  TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
-                               const std::size_t size)
-      : block_shape(shape), block_total_size(size) {}
-};
-
-/** \class TensorBlockMapper
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor block mapper class.
-  *
-  * This class is responsible for iterating over the blocks of a tensor.
-  *
-  */
-
-template <typename Index, typename Scalar, std::size_t NumDims, int Layout>
-class TensorBlockMapper {
- public:
-  typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout>
-      TensorBlock;
-
-  TensorBlockMapper(const Eigen::DSizes<Index, NumDims>& dims,
-                    const TensorBlockShapeType block_shape,
-                    size_t min_target_size)
-      : m_dimensions(dims), m_block_dim_sizes(dims), m_total_block_count(1) {
-    min_target_size = numext::maxi<size_t>(1, min_target_size);
-    if (m_dimensions.TotalSize() == 0) {
-      // Corner case: one of the dimensions is zero. Logic below is too complex
-      // to handle this case on a general basis, just use unit block size.
-      // Note: we must not yield blocks with zero dimensions (recipe for
-      // overflows/underflows, divisions by zero and NaNs later).
-      for (int i = 0; i < NumDims; ++i) {
-        m_block_dim_sizes[i] = 1;
-      }
-    } else if (m_block_dim_sizes.TotalSize() > min_target_size) {
-      if (block_shape == kUniformAllDims) {
-        // Tensor will not fit within 'min_target_size' budget: calculate tensor
-        // block dimension sizes based on "square" dimension size target.
-        const size_t dim_size_target =
-            std::pow(static_cast<float>(min_target_size),
-                     1.0 / static_cast<float>(m_block_dim_sizes.rank()));
-        for (size_t i = 0; i < m_block_dim_sizes.rank(); ++i) {
-          // TODO(andydavis) Adjust the inner most 'm_block_dim_size' to make it
-          // a multiple of the packet size. Note that reducing 'm_block_dim_size'
-          // in this manner can increase the number of blocks, and so will
-          // amplify any per-block overhead.
-          m_block_dim_sizes[i] =
-              numext::mini(dim_size_target, static_cast<size_t>(m_dimensions[i]));
-        }
-        // Add any un-allocated coefficients to inner dimension(s).
-        Index total_size = m_block_dim_sizes.TotalSize();
-        for (int i = 0; i < NumDims; ++i) {
-          const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-              ? i : NumDims - i - 1;
-          if (m_block_dim_sizes[dim] < m_dimensions[dim]) {
-            const Index total_size_other_dims = total_size /
-                m_block_dim_sizes[dim];
-            const Index alloc_avail = divup<Index>(min_target_size, total_size_other_dims);
-            if (alloc_avail == m_block_dim_sizes[dim]) {
-              // Insufficient excess coefficients to allocate.
-              break;
-            }
-            m_block_dim_sizes[dim] = numext::mini(m_dimensions[dim], alloc_avail);
-            total_size = total_size_other_dims * m_block_dim_sizes[dim];
-          }
-        }
-      } else {
-        eigen_assert(block_shape == kSkewedInnerDims);
-        Index coeff_to_allocate = min_target_size;
-        for (int i = 0; i < NumDims; ++i) {
-          const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-              ? i : NumDims - i - 1;
-          m_block_dim_sizes[dim] = numext::mini(coeff_to_allocate,
-                                                m_dimensions[dim]);
-          coeff_to_allocate = divup(coeff_to_allocate,
-              numext::maxi(static_cast<Index>(1), m_block_dim_sizes[dim]));
-        }
-        eigen_assert(coeff_to_allocate == 1);
-      }
-    }
-    eigen_assert(m_block_dim_sizes.TotalSize() >=
-        numext::mini(min_target_size, m_dimensions.TotalSize()));
-
-    // Calculate block counts by dimension and total block count.
-    DSizes<Index, NumDims> block_count;
-    for (size_t i = 0; i < block_count.rank(); ++i) {
-      block_count[i] =
-          (m_dimensions[i] + m_block_dim_sizes[i] - 1) / m_block_dim_sizes[i];
-    }
-    m_total_block_count = array_prod(block_count);
-
-    // Calculate block strides (used for enumerating blocks).
-    if (NumDims > 0) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_block_strides[0] = 1;
-        m_tensor_strides[0] = 1;
-        for (int i = 1; i < NumDims; ++i) {
-          m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
-          m_tensor_strides[i] = m_tensor_strides[i - 1] * m_dimensions[i - 1];
-        }
-      } else {
-        m_block_strides[NumDims - 1] = 1;
-        m_tensor_strides[NumDims - 1] = 1;
-        for (int i = NumDims - 2; i >= 0; --i) {
-          m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
-          m_tensor_strides[i] = m_tensor_strides[i + 1] * m_dimensions[i + 1];
-        }
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  GetBlockForIndex(Index block_index, Scalar* data) const {
-    Index first_coeff_index = 0;
-    DSizes<Index, NumDims> coords;
-    DSizes<Index, NumDims> sizes;
-    DSizes<Index, NumDims> strides;
-    if (NumDims > 0) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        for (int i = NumDims - 1; i > 0; --i) {
-          const Index idx = block_index / m_block_strides[i];
-          coords[i] = idx * m_block_dim_sizes[i];
-          sizes[i] =
-              numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
-          block_index -= idx * m_block_strides[i];
-          first_coeff_index += coords[i] * m_tensor_strides[i];
-        }
-        coords[0] = block_index * m_block_dim_sizes[0];
-        sizes[0] =
-            numext::mini((m_dimensions[0] - coords[0]), m_block_dim_sizes[0]);
-        first_coeff_index += coords[0] * m_tensor_strides[0];
-
-        strides[0] = 1;
-        for (int i = 1; i < NumDims; ++i) {
-          strides[i] = strides[i - 1] * sizes[i - 1];
-        }
-      } else {
-        for (int i = 0; i < NumDims - 1; ++i) {
-          const Index idx = block_index / m_block_strides[i];
-          coords[i] = idx * m_block_dim_sizes[i];
-          sizes[i] =
-              numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
-          block_index -= idx * m_block_strides[i];
-          first_coeff_index += coords[i] * m_tensor_strides[i];
-        }
-        coords[NumDims - 1] = block_index * m_block_dim_sizes[NumDims - 1];
-        sizes[NumDims - 1] =
-            numext::mini((m_dimensions[NumDims - 1] - coords[NumDims - 1]),
-                       m_block_dim_sizes[NumDims - 1]);
-        first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
-
-        strides[NumDims - 1] = 1;
-        for (int i = NumDims - 2; i >= 0; --i) {
-          strides[i] = strides[i + 1] * sizes[i + 1];
-        }
-      }
-    }
-
-    return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides,
-                       data);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
-    return m_total_block_count;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const {
-    return m_block_dim_sizes.TotalSize();
-  }
 
  private:
-  DSizes<Index, NumDims> m_dimensions;
-  DSizes<Index, NumDims> m_block_dim_sizes;
-  DSizes<Index, NumDims> m_block_strides;
-  DSizes<Index, NumDims> m_tensor_strides;
-  Index m_total_block_count;
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0), size(0), output_stride(0), output_span(0) {}
+
+    IndexType count;
+    IndexType size;
+    IndexType output_stride;
+    IndexType output_span;
+  };
 };
 
-/** \class TensorSliceBlockMapper
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor slice block mapper class.
-  *
-  * This class is responsible for iterating over the blocks of
-  * a slice of a tensor. Supports shuffling of the block strides
-  * for callers that want to reduce strides for dimensions to be
-  * processed together.
-  *
-  */
+// -------------------------------------------------------------------------- //
 
-template <typename Index, typename Scalar, std::size_t NumDims, int Layout>
-class TensorSliceBlockMapper {
- public:
-  typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout>
-      TensorBlock;
-  typedef DSizes<Index, NumDims> Dimensions;
-
-  TensorSliceBlockMapper(const Dimensions& tensor_dims,
-                         const Dimensions& tensor_slice_offsets,
-                         const Dimensions& tensor_slice_extents,
-                         const Dimensions& block_dim_sizes,
-                         const Dimensions& block_stride_order)
-      : m_tensor_dimensions(tensor_dims),
-        m_tensor_slice_offsets(tensor_slice_offsets),
-        m_tensor_slice_extents(tensor_slice_extents),
-        m_block_dim_sizes(block_dim_sizes),
-        m_block_stride_order(block_stride_order),
-        m_total_block_count(1) {
-    // Calculate block counts by dimension and total block count.
-    DSizes<Index, NumDims> block_count;
-    for (size_t i = 0; i < block_count.rank(); ++i) {
-      block_count[i] = (m_tensor_slice_extents[i] + m_block_dim_sizes[i] - 1) /
-          m_block_dim_sizes[i];
-    }
-    m_total_block_count = array_prod(block_count);
-
-    // Calculate block strides (used for enumerating blocks).
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      m_block_strides[0] = 1;
-      m_tensor_strides[0] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
-        m_tensor_strides[i] = m_tensor_strides[i - 1] *
-            m_tensor_dimensions[i - 1];
-      }
-    } else {
-      m_block_strides[NumDims - 1] = 1;
-      m_tensor_strides[NumDims - 1] = 1;
-      for (int i = NumDims - 2; i >= 0; --i) {
-        m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
-        m_tensor_strides[i] = m_tensor_strides[i + 1] *
-            m_tensor_dimensions[i + 1];
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  GetBlockForIndex(Index block_index, Scalar* data) const {
-    Index first_coeff_index = 0;
-    DSizes<Index, NumDims> coords;
-    DSizes<Index, NumDims> sizes;
-    DSizes<Index, NumDims> strides;
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = block_index / m_block_strides[i];
-        coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
-        sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
-                                m_block_dim_sizes[i]);
-        block_index -= idx * m_block_strides[i];
-        first_coeff_index += coords[i] * m_tensor_strides[i];
-      }
-      coords[0] = m_tensor_slice_offsets[0] +
-          block_index * m_block_dim_sizes[0];
-      sizes[0] = numext::mini(m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0],
-                                m_block_dim_sizes[0]);
-      first_coeff_index += coords[0] * m_tensor_strides[0];
-
-      Index prev_dim = m_block_stride_order[0];
-      strides[prev_dim] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        const Index curr_dim = m_block_stride_order[i];
-        strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
-        prev_dim = curr_dim;
-      }
-    } else {
-      for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
-        const Index idx = block_index / m_block_strides[i];
-        coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
-        sizes[i] = numext::mini(m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
-                                m_block_dim_sizes[i]);
-        block_index -= idx * m_block_strides[i];
-        first_coeff_index += coords[i] * m_tensor_strides[i];
-      }
-      coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] +
-          block_index * m_block_dim_sizes[NumDims - 1];
-      sizes[NumDims - 1] = numext::mini(
-          m_tensor_slice_offsets[NumDims - 1] + m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1],
-          m_block_dim_sizes[NumDims - 1]);
-      first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
-
-      Index prev_dim = m_block_stride_order[NumDims - 1];
-      strides[prev_dim] = 1;
-      for (int i = NumDims - 2; i >= 0; --i) {
-        const Index curr_dim = m_block_stride_order[i];
-        strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
-        prev_dim = curr_dim;
-      }
-    }
-
-    return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides,
-                       data);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
-    return m_total_block_count;
-  }
-
- private:
-  Dimensions m_tensor_dimensions;
-  Dimensions m_tensor_slice_offsets;
-  Dimensions m_tensor_slice_extents;
-  Dimensions m_tensor_strides;
-  Dimensions m_block_dim_sizes;
-  Dimensions m_block_stride_order;
-  Dimensions m_block_strides;
-  Index m_total_block_count;
-};
-
-}  // end namespace internal
-
-}  // end namespace Eigen
+}  // namespace internal
+}  // namespace Eigen
 
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index d6d611b..7449b04 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h

@@ -25,19 +25,19 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Broadcast, typename XprType>
 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
 {
-  typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<typename Broadcast, typename XprType>
@@ -55,11 +55,12 @@
   static const bool value = true;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices>
+template <typename std::ptrdiff_t... Indices>
 struct is_input_scalar<Sizes<Indices...> > {
   static const bool value = (Sizes<Indices...>::total_size == 1);
 };
 #endif
+
 }  // end namespace internal
 
 
@@ -69,10 +70,8 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
@@ -103,58 +102,128 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
-  EIGEN_STATIC_ASSERT(NumDims == internal::array_size<Broadcast>::value, "Broadcast cannot change rank")
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  protected: //  all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
+  bool isCopy, nByOne, oneByN;
+  public:
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : m_impl(op.expression(), device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  // We do block based broadcasting using a trick with 2x tensor rank and 0
+  // strides. See block method implementation for details.
+  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : isCopy(false), nByOne(false), oneByN(false),
+        m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
   {
+
+    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
+    // and store the result in a scalar. Instead one should reshape the scalar into a N-D
+    // tensor with N >= 1 of 1 element first and then broadcast.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const InputDimensions& input_dims = m_impl.dimensions();
-    const Broadcast& broadcast = op.broadcast();
+    isCopy = true;
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
-      m_dimensions[i] = input_dims[i] * broadcast[i];
+      m_dimensions[i] = input_dims[i] * m_broadcast[i];
+      if (m_broadcast[i] != 1) {
+        isCopy = false;
+      }
     }
 
-    if (NumDims > 0) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_inputStrides[0] = 1;
-        m_outputStrides[0] = 1;
-        for (int i = 1; i < NumDims; ++i) {
-          m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
-          m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      }
+    } else {
+      m_inputStrides[NumDims-1] = 1;
+      m_outputStrides[NumDims-1] = 1;
+      for (int i = NumDims-2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+      }
+    }
+
+    if (input_dims[0] == 1) {
+      oneByN = true;
+      for (int i = 1; i < NumDims; ++i) {
+        if (m_broadcast[i] != 1) {
+          oneByN = false;
+          break;
         }
-      } else {
-        // NumDims is always > 0 here, but use max to avoid compiler warning
-        m_inputStrides[numext::maxi(0, NumDims-1)] = 1;
-        m_outputStrides[numext::maxi(0, NumDims-1)] = 1;
-        for (int i = NumDims-2; i >= 0; --i) {
-          m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
-          m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+      }
+    } else if (input_dims[NumDims-1] == 1) {
+      nByOne = true;
+      for (int i = 0; i < NumDims-1; ++i) {
+        if (m_broadcast[i] != 1) {
+          nByOne = false;
+          break;
+        }
+      }
+    }
+
+    // Handle special format like NCHW, its input shape is '[1, N..., 1]' and
+    // broadcast shape is '[N, 1..., N]'
+    if (!oneByN && !nByOne) {
+      if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
+        nByOne = true;
+        oneByN = true;
+        for (int i = 1; i < NumDims-1; ++i) {
+          if (m_broadcast[i] != 1) {
+            nByOne = false;
+            oneByN = false;
+            break;
+          }
         }
       }
     }
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -163,76 +232,91 @@
     if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
       return m_impl.coeff(0);
     }
+
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return coeffColMajor(index);
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffColMajor(index);
+      }
     } else {
-      return coeffRowMajor(index);
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffRowMajor(index);
+      }
     }
   }
 
   // TODO: attempt to speed this up. The integer divisions and modulo are slow
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
-    if (NumDims > 0) {
-      for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
-        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
-          eigen_assert(idx < m_impl.dimensions()[i]);
-          inputIndex += idx * m_inputStrides[i];
-        } else {
-          if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
-            eigen_assert(idx % m_impl.dimensions()[i] == 0);
-          } else {
-            inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
-          }
-        }
-        index -= idx * m_outputStrides[i];
-      }
-      if (internal::index_statically_eq<Broadcast>()(0, 1)) {
-        eigen_assert(index < m_impl.dimensions()[0]);
-        inputIndex += index;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
       } else {
-        if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
-          eigen_assert(index % m_impl.dimensions()[0] == 0);
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
         } else {
-          inputIndex += (index % m_impl.dimensions()[0]);
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
         }
       }
+      index -= idx * m_outputStrides[i];
+    }
+    if (internal::index_statically_eq<Broadcast>(0, 1)) {
+      eigen_assert(index < m_impl.dimensions()[0]);
+      inputIndex += index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
+      } else {
+        inputIndex += (index % m_impl.dimensions()[0]);
+      }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
+  {
+    return m_impl.coeff(indexColMajor(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
+      inputIndex += index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
+      } else {
+        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
+      }
+    }
+    return inputIndex;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
   {
-    Index inputIndex = 0;
-    if (NumDims > 0) {
-      for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
-        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
-          eigen_assert(idx < m_impl.dimensions()[i]);
-          inputIndex += idx * m_inputStrides[i];
-        } else {
-          if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
-            eigen_assert(idx % m_impl.dimensions()[i] == 0);
-          } else {
-            inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
-          }
-        }
-        index -= idx * m_outputStrides[i];
-      }
-      if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
-        eigen_assert(index < m_impl.dimensions()[NumDims-1]);
-        inputIndex += index;
-      } else {
-        if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
-          eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
-        } else {
-          inputIndex += (index % m_impl.dimensions()[NumDims-1]);
-        }
-      }
-    }
-    return m_impl.coeff(inputIndex);
+    return m_impl.coeff(indexRowMajor(index));
   }
 
   template<int LoadMode>
@@ -241,10 +325,147 @@
     if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
       return internal::pset1<PacketReturnType>(m_impl.coeff(0));
     }
+
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return packetColMajor<LoadMode>(index);
+      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+        // unaligned loads here. The reason is unclear though.
+        return m_impl.template packet<Unaligned>(index);
+        #else
+        return m_impl.template packet<LoadMode>(index);
+        #endif
+      } else if (oneByN && !nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetColMajor<LoadMode>(index);
+      }
     } else {
-      return packetRowMajor<LoadMode>(index);
+      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+        #else
+        return m_impl.template packet<LoadMode>(index);
+        #endif
+      } else if (oneByN && !nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetRowMajor<LoadMode>(index);
+      }
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
+  (Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    Index startDim, endDim;
+    Index inputIndex, outputOffset, batchedIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      startDim = NumDims - 1;
+      endDim = 1;
+    } else {
+      startDim = 0;
+      endDim = NumDims - 2;
+    }
+
+    batchedIndex = index % m_outputStrides[startDim];
+    inputIndex   = batchedIndex / m_outputStrides[endDim];
+    outputOffset = batchedIndex % m_outputStrides[endDim];
+
+    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[endDim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          ++inputIndex;
+          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
+          values[i] = m_impl.coeff(inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+  {
+    // Consider the flattened tensor [v0, ..., vN],
+    // Concatenates m_broadcast[dim] copies,
+    //    [v0, ..., vN, v0, ..., vN, ... ]
+    // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    // Size of flattened tensor.
+    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
+                      m_inputStrides[NumDims - 1] : m_inputStrides[0];
+    Index inputIndex = index % M;
+    if (inputIndex + PacketSize <= M) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        if (inputIndex > M - 1) {
+          inputIndex = 0;
+        }
+        values[i] = m_impl.coeff(inputIndex++);
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+  {
+    // Consider the flattened tensor [v0, ..., vN],
+    // Interleaves m_broadcast[dim] copies,
+    //    [v0, v0, ..., v1, v1, ..., vN, vN, ... ]
+    // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize-1 < dimensions().TotalSize());
+
+    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
+                      m_broadcast[0] : m_broadcast[NumDims - 1];
+
+    Index inputIndex   = index / M;
+    Index outputOffset = index % M;
+    if (outputOffset + PacketSize <= M) {
+      return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        if (outputOffset < M) {
+          values[i] = m_impl.coeff(inputIndex);
+          ++outputOffset;
+        } else {
+          values[i] = m_impl.coeff(++inputIndex);
+          outputOffset = 1;  // Next offset.
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
     }
   }
 
@@ -253,51 +474,55 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index originalIndex = index;
 
     Index inputIndex = 0;
-    Index innermostLoc = 0;
-    if (NumDims > 0) {
-      for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
-        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
-          eigen_assert(idx < m_impl.dimensions()[i]);
-          inputIndex += idx * m_inputStrides[i];
-        } else {
-          if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
-            eigen_assert(idx % m_impl.dimensions()[i] == 0);
-          } else {
-            inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
-          }
-        }
-        index -= idx * m_outputStrides[i];
-      }
-      if (internal::index_statically_eq<Broadcast>()(0, 1)) {
-        eigen_assert(index < m_impl.dimensions()[0]);
-        innermostLoc = index;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
       } else {
-        if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
-          eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0);
-          innermostLoc = 0;
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
         } else {
-          innermostLoc = index % m_impl.dimensions()[0];
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
         }
       }
-      inputIndex += innermostLoc;
+      index -= idx * m_outputStrides[i];
     }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>(0, 1)) {
+      eigen_assert(index < m_impl.dimensions()[0]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[0];
+      }
+    }
+    inputIndex += innermostLoc;
 
     // Todo: this could be extended to the second dimension if we're not
     // broadcasting alongside the first dimension, and so on.
     if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
-      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffColMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[0]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffColMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -307,19 +532,20 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
-      if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
         eigen_assert(idx < m_impl.dimensions()[i]);
         inputIndex += idx * m_inputStrides[i];
       } else {
-        if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
           eigen_assert(idx % m_impl.dimensions()[i] == 0);
         } else {
           inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
@@ -328,12 +554,12 @@
       index -= idx * m_outputStrides[i];
     }
     Index innermostLoc;
-    if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
+    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
       eigen_assert(index < m_impl.dimensions()[NumDims-1]);
       innermostLoc = index;
     } else {
-      if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
-        eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0);
+      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
         innermostLoc = 0;
       } else {
         innermostLoc = index % m_impl.dimensions()[NumDims-1];
@@ -346,10 +572,15 @@
     if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
-      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffRowMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffRowMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -359,14 +590,15 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     double compute_cost = TensorOpCost::AddCost<Index>();
-    if (NumDims > 0) {
+    if (!isCopy && NumDims > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         compute_cost += TensorOpCost::DivCost<Index>();
-        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+        if (internal::index_statically_eq<Broadcast>(i, 1)) {
           compute_cost +=
               TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
         } else {
-          if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
+          if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
             compute_cost += TensorOpCost::MulCost<Index>() +
                             TensorOpCost::ModCost<Index>() +
                             TensorOpCost::AddCost<Index>();
@@ -380,9 +612,472 @@
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+    // tensors. But this might need further tuning.
+    const size_t target_size = m_device.firstLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        m_impl.getResourceRequirements(),
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
+  }
 
- protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    BlockBroadcastingParams params = blockBroadcastingParams(desc);
+
+    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
+      return emptyBlock();
+    }
+
+    // Prepare storage for the materialized broadcasting result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+    ScalarNoConst* materialized_output = block_storage.data();
+
+    // We potentially will need to materialize input blocks.
+    size_t materialized_input_size = 0;
+    ScalarNoConst* materialized_input = NULL;
+
+    // Initialize block broadcating iterator state for outer dimensions (outer
+    // with regard to bcast dimension). Dimension in this array are always in
+    // inner_most -> outer_most order (col major layout).
+    array<BlockBroadcastingIteratorState, NumDims> it;
+    int idx = 0;
+
+    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
+      const Index dim = IsColMajor ? i : NumDims - 1 - i;
+      it[idx].size = params.output_dims[dim];
+      it[idx].count = 0;
+      it[idx].output_stride = m_outputStrides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // Write output into the beginning of `materialized_output`.
+    Index output_offset = 0;
+
+    // We will fill output block by broadcasting along the bcast dim, and
+    // iterating over outer dimension.
+    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
+
+    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
+      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
+      Index bcast_offset = desc.offset() + output_offset;
+
+      // Broadcast along the bcast dimension.
+      num_output_coeffs += BroadcastBlockAlongBcastDim(
+          params, bcast_offset, scratch, bcast_output, &materialized_input,
+          &materialized_input_size);
+
+      // Switch to the next outer dimension.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Broadcast functor() const { return m_broadcast; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
+      cl::sycl::handler& cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+ private:
+  static const bool IsColMajor =
+      static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+  // We will build a general case block broadcasting on top of broadcasting
+  // primitive that will do broadcasting only for the inner dimension(s) along
+  // the first dimension smaller than the input size (it's called `bcast_dim`).
+  //
+  // Example:
+  //           dim:  0  1  2   (ColMajor)
+  //    input size: [9, 3, 6]
+  //    block size: [9, 2, 6]
+  //
+  // We will compute broadcasted block by iterating over the outer dimensions
+  // before `bcast_dim` (only dimension `2` in this example) and computing
+  // broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+  // single block along the broadcasting dimension. Sizes and strides along the
+  // `bcast_dim` might be invalid, they will be adjusted later in
+  // `BroadcastBlockAlongBcastDim`.
+  struct BlockBroadcastingParams {
+    Dimensions input_dims;      // input expression dimensions
+    Dimensions output_dims;     // output block sizes
+    Dimensions output_strides;  // output block strides
+
+    int inner_dim_count;   // count inner dimensions matching in size
+    int bcast_dim;         // broadcasting dimension index
+    Index bcast_dim_size;  // broadcasting dimension size
+    Index inner_dim_size;  // inner dimensions size
+
+    // Block sizes and strides for the input block where all dimensions before
+    // `bcast_dim` are equal to `1`.
+    Dimensions input_block_sizes;
+    Dimensions input_block_strides;
+
+    // Block sizes and strides for blocks with extra dimensions and strides `0`.
+    BroadcastDimensions bcast_block_sizes;
+    BroadcastDimensions bcast_block_strides;
+    BroadcastDimensions bcast_input_strides;
+  };
+
+  struct BlockBroadcastingIteratorState {
+    Index size;
+    Index count;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
+  blockBroadcastingParams(TensorBlockDesc& desc) const {
+    BlockBroadcastingParams params;
+
+    params.input_dims = Dimensions(m_impl.dimensions());
+
+    // Output block sizes and strides.
+    params.output_dims = desc.dimensions();
+    params.output_strides = internal::strides<Layout>(params.output_dims);
+
+    // Find the broadcasting dimension (first dimension with output size smaller
+    // that the input size).
+    params.bcast_dim = 0;
+    params.bcast_dim_size = 1;
+    params.inner_dim_size = 1;
+
+    // Count the number of inner dimensions that have the same size in the block
+    // and in the broadcast expression.
+    params.inner_dim_count = 0;
+
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      if (params.output_dims[dim] == m_dimensions[dim]) {
+        params.inner_dim_size *= params.output_dims[dim];
+        ++params.inner_dim_count;
+        continue;
+      }
+
+      // First non-matching dimension is the broadcasting dimension.
+      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
+      params.bcast_dim = dim;
+      params.bcast_dim_size = params.output_dims[dim];
+      break;
+    }
+
+    // Calculate the input block size for looking into the input.
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = params.input_dims[dim];
+    }
+    for (int i = params.inner_dim_count; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = 1;
+    }
+    params.input_block_strides =
+        internal::strides<Layout>(params.input_block_sizes);
+
+    // Broadcast with the 0-stride trick: Create 1 extra dim for each
+    // broadcast, set the input stride to 0.
+    //
+    // When ColMajor:
+    //
+    // - bcast_block_sizes:
+    //   [d_0, b_0, d_1, b_1, ...]
+    //
+    // - bcast_block_strides:
+    //   [output_block_strides[0], output_block_strides[0] * d_0,
+    //    output_block_strides[1], output_block_strides[1] * d_1,
+    //   ...]
+    //
+    // - bcast_input_strides:
+    //   [input_block_strides[0], 0,
+    //    input_block_strides[1], 0,
+    //   ...].
+    //
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
+      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
+
+      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
+      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
+      params.bcast_block_strides[broadcast_dim] =
+          params.output_strides[dim] * params.input_dims[dim];
+      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
+      params.bcast_input_strides[broadcast_dim] = 0;
+    }
+
+    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
+      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
+      params.bcast_block_sizes[dim] = 1;
+      params.bcast_block_strides[dim] = 0;
+      params.bcast_input_strides[dim] = 0;
+    }
+
+    return params;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
+    DSizes<Index, NumDims> dimensions;
+    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
+      BlockBroadcastingParams params, Index bcast_offset,
+      TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
+      ScalarNoConst** materialized_input,
+      size_t* materialized_input_size) const {
+    if (params.bcast_dim_size == 1) {
+      // We just need one block read using the ready-set values above.
+      return BroadcastBlock(
+          params.input_block_sizes, params.input_block_strides,
+          params.bcast_block_sizes, params.bcast_block_strides,
+          params.bcast_input_strides, bcast_offset, 0, scratch,
+          materialized_output, materialized_input, materialized_input_size);
+
+    } else if (params.input_dims[params.bcast_dim] == 1) {
+      // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
+      const int broadcast_bcast_dim =
+          IsColMajor ? 2 * params.inner_dim_count + 1
+                     : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
+      params.bcast_input_strides[broadcast_bcast_dim] = 0;
+      params.bcast_block_strides[broadcast_bcast_dim] =
+          params.output_strides[params.bcast_dim];
+
+      return BroadcastBlock(
+          params.input_block_sizes, params.input_block_strides,
+          params.bcast_block_sizes, params.bcast_block_strides,
+          params.bcast_input_strides, bcast_offset, 0, scratch,
+          materialized_output, materialized_input, materialized_input_size);
+
+    } else {
+      // Keep track of the total number of the coefficients written to the
+      // output block.
+      Index num_output_coeffs = 0;
+
+      // The general case. Let's denote the output block as
+      //
+      //   x[..., a:a+bcast_dim_size, :, ..., :]
+      //
+      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
+      // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
+      // sub-blocks:
+      //
+      // (1) a:b, where b is the smallest multiple of
+      //     input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
+      //
+      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
+      //     in [a, a+bcast_dim_size].
+      //
+      // (3) c:a+bcast_dim_size .
+      //
+      // Or, when b and c do not exist, we just need to process the whole block
+      // together.
+
+      // Find a.
+      const Index bcast_dim_left_index =
+          bcast_offset / m_outputStrides[params.bcast_dim];
+
+      // Find b and c.
+      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
+
+      // First multiple after a. This is b when <= bcast_dim_left_index +
+      // bcast_dim_size.
+      const Index first_multiple =
+          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+          input_bcast_dim_size;
+
+      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
+        // b exists, so does c. Find it.
+        const Index last_multiple =
+            (bcast_dim_left_index + params.bcast_dim_size) /
+            input_bcast_dim_size * input_bcast_dim_size;
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count
+                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        const int broadcast_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count + 1
+                       : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+        if (first_multiple > bcast_dim_left_index) {
+          const Index head_size = first_multiple - bcast_dim_left_index;
+          params.input_block_sizes[params.bcast_dim] = head_size;
+          params.bcast_block_sizes[copy_bcast_dim] = head_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, 0, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+        if (first_multiple < last_multiple) {
+          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
+          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] =
+              (last_multiple - first_multiple) / input_bcast_dim_size;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+          const Index offset = (first_multiple - bcast_dim_left_index) *
+                               m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, offset, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
+          const Index tail_size =
+              bcast_dim_left_index + params.bcast_dim_size - last_multiple;
+          params.input_block_sizes[params.bcast_dim] = tail_size;
+          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+          const Index offset = (last_multiple - bcast_dim_left_index) *
+                               m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, offset, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+      } else {
+        // b and c do not exist.
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count
+                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
+        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
+        params.bcast_input_strides[copy_bcast_dim] =
+            params.input_block_strides[params.bcast_dim];
+        params.bcast_block_strides[copy_bcast_dim] =
+            params.output_strides[params.bcast_dim];
+
+        num_output_coeffs += BroadcastBlock(
+            params.input_block_sizes, params.input_block_strides,
+            params.bcast_block_sizes, params.bcast_block_strides,
+            params.bcast_input_strides, bcast_offset, 0, scratch,
+            materialized_output, materialized_input, materialized_input_size);
+      }
+
+      return num_output_coeffs;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
+      const Dimensions& input_block_sizes,
+      const Dimensions& input_block_strides,
+      const BroadcastDimensions& bcast_block_sizes,
+      const BroadcastDimensions& bcast_block_strides,
+      const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
+      Index offset, TensorBlockScratch& scratch,
+      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
+      size_t* materialized_input_size) const {
+    // ---------------------------------------------------------------------- //
+    // Tensor block descriptor for reading block from the input.
+    const Index input_offset = bcast_offset + offset;
+    TensorBlockDesc input_desc(
+        IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
+        input_block_sizes);
+
+    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
+
+    // ---------------------------------------------------------------------- //
+    // Materialize input block into a temporary memory buffer only if it's not
+    // already available in the arg block.
+    const ScalarNoConst* input_buffer = NULL;
+
+    if (input_block.data() != NULL) {
+      // Input block already has raw data, there is no need to materialize it.
+      input_buffer = input_block.data();
+
+    } else {
+      // Otherwise we have to do block assignment into a temporary buffer.
+
+      // Maybe reuse previously allocated buffer, or allocate a new one with a
+      // scratch allocator.
+      const size_t input_total_size = input_block_sizes.TotalSize();
+      if (*materialized_input == NULL ||
+          *materialized_input_size < input_total_size) {
+        *materialized_input_size = input_total_size;
+        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
+        *materialized_input = static_cast<ScalarNoConst*>(mem);
+      }
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(input_block_sizes, input_block_strides,
+                                        *materialized_input),
+          input_block.expr());
+
+      input_buffer = *materialized_input;
+    }
+
+    // ---------------------------------------------------------------------- //
+    // Copy data from materialized input block to the materialized output, using
+    // given broadcast strides (strides with zeroes).
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
+        TensorBlockIO;
+
+    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
+                                      materialized_output + offset);
+
+    return TensorBlockIO::Copy(dst, src);
+  }
+
+protected:
+  const Device EIGEN_DEVICE_REF m_device;
+  const typename internal::remove_reference<Broadcast>::type m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index aa08f6e..3764573 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h

@@ -32,12 +32,13 @@
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions - 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex DimId, typename XprType>
 struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
 {
-  typedef const TensorChippingOp<DimId, XprType>& type;
+  typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<DenseIndex DimId, typename XprType>
@@ -49,7 +50,8 @@
 template <DenseIndex DimId>
 struct DimensionId
 {
-  DimensionId(DenseIndex dim) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_UNUSED_VARIABLE(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -59,7 +61,7 @@
 template <>
 struct DimensionId<Dynamic>
 {
-  DimensionId(DenseIndex dim) : actual_dim(dim) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) {
     eigen_assert(dim >= 0);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -78,44 +80,28 @@
 class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+    typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
-      : m_xpr(expr), m_offset(offset), m_dim(dim) {
-  }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+        : m_xpr(expr), m_offset(offset), m_dim(dim) {
+    }
 
-  EIGEN_DEVICE_FUNC
-  const Index offset() const { return m_offset; }
-  EIGEN_DEVICE_FUNC
-  const Index dim() const { return m_dim.actualDim(); }
+    EIGEN_DEVICE_FUNC
+    const Index offset() const { return m_offset; }
+    EIGEN_DEVICE_FUNC
+    const Index dim() const { return m_dim.actualDim(); }
 
-  EIGEN_DEVICE_FUNC
-  const typename internal::remove_all<typename XprType::Nested>::type&
-  expression() const { return m_xpr; }
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
-
-  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -134,32 +120,56 @@
   typedef typename XprType::Index Index;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets.
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    // Chipping of outer-most dimension is a trivial operation, because we can
+    // read and write directly from the underlying tensor using single offset.
+    IsOuterChipping   = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
+                        (static_cast<int>(Layout) == RowMajor && DimId == 0),
+    // Chipping inner-most dimension.
+    IsInnerChipping   = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
+                        (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
+    // Prefer block access if the underlying expression prefers it, otherwise
+    // only if chipping is not trivial.
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
+                        !IsOuterChipping,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumInputDims, Layout>
-    InputTensorBlock;
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout>
-    OutputTensorBlock;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef internal::TensorBlockDescriptor<NumInputDims, Index>
+      ArgTensorBlockDesc;
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
   {
-    EIGEN_STATIC_ASSERT(NumInputDims >= 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
     eigen_assert(NumInputDims > m_dim.actualDim());
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
 
@@ -186,34 +196,16 @@
     }
     m_inputStride *= input_dims[m_dim.actualDim()];
     m_inputOffset = m_stride * op.offset();
-
-    if (BlockAccess) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_inputStrides[0] = 1;
-        for (int i = 1; i < NumInputDims; ++i) {
-          m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
-        }
-      } else {
-        m_inputStrides[NumInputDims - 1] = 1;
-        for (int i = NumInputDims - 2; i >= 0; --i) {
-          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
-        }
-      }
-
-      m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                            device.lastLevelCacheSize() /
-                                            sizeof(Scalar));
-    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -225,28 +217,23 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
-         m_dim.actualDim() == 0) ||
-        (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
-         m_dim.actualDim() == NumInputDims - 1)) {
+    if (isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       Index inputIndex = index * m_inputStride + m_inputOffset;
-      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         values[i] = m_impl.coeff(inputIndex);
         inputIndex += m_inputStride;
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
-                m_dim.actualDim() == NumInputDims - 1) ||
-               (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
-                m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(m_stride > index);
       return m_impl.template packet<LoadMode>(index + m_inputOffset);
     } else {
@@ -257,7 +244,8 @@
         return m_impl.template packet<LoadMode>(inputIndex);
       } else {
         // Cross the stride boundary. Fallback to slow path.
-        EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+       EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           values[i] = coeff(index);
           ++index;
@@ -268,13 +256,6 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    resources->push_back(internal::TensorOpResourceRequirements(
-        internal::kSkewedInnerDims, m_block_total_size_max));
-    m_impl.getResourceRequirements(resources);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     double cost = 0;
@@ -297,83 +278,100 @@
            TensorOpCost(0, 0, cost, vectorized, PacketSize);
   }
 
-  // TODO(andydavis) Reduce the overhead of this function (experiment with
-  // using a fixed block size).
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
-      OutputTensorBlock* output_block) const {
-    // Calculate input block sizes.
-    const DSizes<Index, NumDims>& output_block_sizes =
-        output_block->block_sizes();
-    const DSizes<Index, NumDims>& output_block_strides =
-        output_block->block_strides();
-    const Index chip_dim = m_dim.actualDim();
-    DSizes<Index, NumInputDims> input_block_sizes;
-    DSizes<Index, NumInputDims> input_block_strides;
-    for (Index i = 0; i < NumInputDims; ++i) {
-      if (i < chip_dim) {
-        input_block_sizes[i] = output_block_sizes[i];
-        input_block_strides[i] = output_block_strides[i];
-      } else if (i > chip_dim) {
-        input_block_sizes[i] = output_block_sizes[i - 1];
-        input_block_strides[i] = output_block_strides[i - 1];
-      } else {
-        input_block_sizes[i] = 1;
-      }
-    }
-    // Fix up input_block_stride for chip dimension.
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      if (chip_dim == 0) {
-        input_block_strides[chip_dim] = 1;
-      } else {
-        input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] *
-            input_block_sizes[chip_dim - 1];
-      }
-    } else {
-      if (chip_dim == NumInputDims - 1) {
-        input_block_strides[chip_dim] = 1;
-      } else {
-        input_block_strides[chip_dim] = input_block_strides[chip_dim + 1] *
-            input_block_sizes[chip_dim + 1];
-      }
-    }
-    // Instantiate and read input block from input tensor.
-    InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
-                                 input_block_sizes,
-                                 input_block_strides,
-                                 m_inputStrides,
-                                 output_block->data());
-    m_impl.block(&input_block);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
-    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
-    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
-          m_dim.actualDim() == NumDims) ||
-         (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
-          m_dim.actualDim() == 0)) &&
-        result) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    const Index chip_dim = m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i]
+            = i < chip_dim ? desc.dimension(i)
+            : i > chip_dim ? desc.dimension(i - 1)
+            : 1;
+    }
+
+    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+    // Try to reuse destination buffer for materializing argument block.
+    if (desc.HasDestinationBuffer()) {
+      DSizes<Index, NumInputDims> arg_destination_strides;
+      for (int i = 0; i < NumInputDims; ++i) {
+      arg_destination_strides[i]
+            = i < chip_dim ? desc.destination().strides()[i]
+            : i > chip_dim ? desc.destination().strides()[i - 1]
+            : 0; // for dimensions of size `1` stride should never be used.
+      }
+
+      arg_desc.template AddDestinationBuffer<Layout>(
+          desc.destination().template data<ScalarNoConst>(),
+          arg_destination_strides);
+    }
+
+    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+
+    if (arg_block.data() != NULL) {
+      // Forward argument block buffer if possible.
+      return TensorBlock(arg_block.kind(), arg_block.data(),
+                           desc.dimensions());
+
+    } else {
+      // Assign argument block expression to a buffer.
+
+      // Prepare storage for the materialized chipping result.
+      const typename TensorBlock::Storage block_storage =
+          TensorBlock::prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              arg_desc.dimensions(),
+              internal::strides<Layout>(arg_desc.dimensions()),
+              block_storage.data()),
+          arg_block.expr());
+
+      return block_storage.AsTensorMaterializedBlock();
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
+    if (isOuterChipping() && result) {
       return result + m_inputOffset;
     } else {
       return NULL;
     }
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex;
-    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
-         m_dim.actualDim() == 0) ||
-        (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
-         m_dim.actualDim() == NumInputDims - 1)) {
+    if (isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       inputIndex = index * m_inputStride + m_inputOffset;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
-                m_dim.actualDim() == NumInputDims - 1) ||
-               (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
-                m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer
+      // division.
       eigen_assert(m_stride > index);
       inputIndex = index + m_inputOffset;
     } else {
@@ -385,15 +383,25 @@
     return inputIndex;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
+    return IsInnerChipping ||
+           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
+           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
+    return IsOuterChipping ||
+           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
+           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
+  }
+
   Dimensions m_dimensions;
   Index m_stride;
   Index m_inputOffset;
   Index m_inputStride;
-  DSizes<Index, NumInputDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;
-  const Device& m_device;
-  std::size_t m_block_total_size_max;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
@@ -409,25 +417,23 @@
   typedef typename XprType::Index Index;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned     = false,
+    PacketAccess  = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess   = TensorEvaluator<ArgType, Device>::RawAccess,
+    Layout        = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess     = false
   };
 
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumInputDims, Layout>
-    InputTensorBlock;
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout>
-    OutputTensorBlock;
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
@@ -439,26 +445,21 @@
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
-    if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) &&
-         this->m_dim.actualDim() == 0) ||
-        (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) &&
-         this->m_dim.actualDim() == NumInputDims - 1)) {
+    if (this->isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(this->m_stride == 1);
-      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         this->m_impl.coeffRef(inputIndex) = values[i];
         inputIndex += this->m_inputStride;
       }
-    } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) &&
-                this->m_dim.actualDim() == NumInputDims - 1) ||
-               (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) &&
-                this->m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (this->isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(this->m_stride > index);
       this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
     } else {
@@ -469,8 +470,9 @@
         this->m_impl.template writePacket<StoreMode>(inputIndex, x);
       } else {
         // Cross stride boundary. Fallback to slow path.
-        EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
         internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+        EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           this->coeffRef(index) = values[i];
           ++index;
@@ -479,52 +481,35 @@
     }
   }
 
+  template <typename TensorBlock>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const OutputTensorBlock& output_block) {
-    // Calculate input block sizes.
-    const DSizes<Index, NumDims>& output_block_sizes =
-        output_block.block_sizes();
-    const DSizes<Index, NumDims>& output_block_strides =
-        output_block.block_strides();
-    const Index chip_dim = this->m_dim.actualDim();
-    DSizes<Index, NumInputDims> input_block_sizes;
-    DSizes<Index, NumInputDims> input_block_strides;
-    for (Index i = 0; i < NumInputDims; ++i) {
-      if (i < chip_dim) {
-        input_block_sizes[i] = output_block_sizes[i];
-        input_block_strides[i] = output_block_strides[i];
-      } else if (i > chip_dim) {
-        input_block_sizes[i] = output_block_sizes[i - 1];
-        input_block_strides[i] = output_block_strides[i - 1];
-      } else {
-        input_block_sizes[i] = 1;
-      }
-    }
-    // Fix up input_block_stride for chip dimension.
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      if (chip_dim == 0) {
-        input_block_strides[chip_dim] = 1;
-      } else {
-        input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] *
-            input_block_sizes[chip_dim - 1];
-      }
-    } else {
-      if (chip_dim == NumInputDims - 1) {
-        input_block_strides[chip_dim] = 1;
-      } else {
-        input_block_strides[chip_dim] = input_block_strides[chip_dim - 1] *
-            input_block_sizes[chip_dim - 1];
-      }
-    }
-    // Write input block.
-    this->m_impl.writeBlock(
-        InputTensorBlock(this->srcCoeff(output_block.first_coeff_index()),
-                         input_block_sizes,
-                         input_block_strides,
-                         this->m_inputStrides,
-                         const_cast<ScalarNonConst*>(output_block.data())));
-  }
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(this->m_impl.data() != NULL);
 
+    const Index chip_dim = this->m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+                          : i > chip_dim ? desc.dimension(i - 1)
+                          : 1;
+    }
+
+    typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
+                              const typename TensorBlock::XprType>
+        TensorBlockExpr;
+
+    typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
+                                            TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(
+            input_block_dims,
+            internal::strides<Layout>(this->m_impl.dimensions()),
+            this->m_impl.data(), this->srcCoeff(desc.offset())),
+        block.expr().reshape(input_block_dims));
+  }
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index f6990b0..5235a8e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h

@@ -26,7 +26,6 @@
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename promote_storage_type<typename LhsXprType::Scalar,
                                         typename RhsXprType::Scalar>::ret Scalar;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -38,6 +37,8 @@
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
   enum { Flags = 0 };
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                               typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename Axis, typename LhsXprType, typename RhsXprType>
@@ -59,15 +60,13 @@
 class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
 {
   public:
+    typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
     typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
-    typedef typename internal::traits<TensorConcatenationOp>::Packet Packet;
     typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
     typedef typename internal::traits<TensorConcatenationOp>::Index Index;
     typedef typename internal::nested<TensorConcatenationOp>::type Nested;
     typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
                                                     typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
-    typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
-                                                    typename RhsXprType::PacketReturnType>::ret PacketReturnType;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
@@ -83,27 +82,7 @@
 
     EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
@@ -122,37 +101,49 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
-                   TensorEvaluator<RightArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+                        TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
     eigen_assert(0 <= m_axis && m_axis < NumDims);
     const Dimensions& lhs_dims = m_leftImpl.dimensions();
     const Dimensions& rhs_dims = m_rightImpl.dimensions();
-    int i = 0;
-    for (; i < m_axis; ++i) {
-      eigen_assert(lhs_dims[i] > 0);
-      eigen_assert(lhs_dims[i] == rhs_dims[i]);
-      m_dimensions[i] = lhs_dims[i];
-    }
-    eigen_assert(lhs_dims[i] > 0);  // Now i == m_axis.
-    eigen_assert(rhs_dims[i] > 0);
-    m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
-    for (++i; i < NumDims; ++i) {
-      eigen_assert(lhs_dims[i] > 0);
-      eigen_assert(lhs_dims[i] == rhs_dims[i]);
-      m_dimensions[i] = lhs_dims[i];
+    {
+      int i = 0;
+      for (; i < m_axis; ++i) {
+        eigen_assert(lhs_dims[i] > 0);
+        eigen_assert(lhs_dims[i] == rhs_dims[i]);
+        m_dimensions[i] = lhs_dims[i];
+      }
+      eigen_assert(lhs_dims[i] > 0);  // Now i == m_axis.
+      eigen_assert(rhs_dims[i] > 0);
+      m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
+      for (++i; i < NumDims; ++i) {
+        eigen_assert(lhs_dims[i] > 0);
+        eigen_assert(lhs_dims[i] == rhs_dims[i]);
+        m_dimensions[i] = lhs_dims[i];
+      }
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -160,20 +151,20 @@
       m_rightStrides[0] = 1;
       m_outputStrides[0] = 1;
 
-      for (int i = 1; i < NumDims; ++i) {
-        m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1];
-        m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1];
-        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      for (int j = 1; j < NumDims; ++j) {
+        m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1];
+        m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1];
+        m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1];
       }
     } else {
       m_leftStrides[NumDims - 1] = 1;
       m_rightStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims - 1] = 1;
 
-      for (int i = NumDims - 2; i >= 0; --i) {
-        m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1];
-        m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1];
-        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+      for (int j = NumDims - 2; j >= 0; --j) {
+        m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1];
+        m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1];
+        m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1];
       }
     }
   }
@@ -181,14 +172,14 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
   {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  EIGEN_STRONG_INLINE void cleanup()
   {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
@@ -219,11 +210,13 @@
       Index left_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         left_index = subs[0];
+        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
       } else {
         left_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
@@ -235,11 +228,13 @@
       Index right_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         right_index = subs[0];
+        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
       } else {
         right_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
@@ -252,11 +247,12 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
 
-    EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -279,7 +275,15 @@
            TensorOpCost(0, 0, compute_cost);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+  #endif
 
   protected:
     Dimensions m_dimensions;
@@ -300,14 +304,21 @@
   typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
   typedef typename Base::Dimensions Dimensions;
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
-                   TensorEvaluator<RightArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+                        TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
     : Base(op, device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -316,7 +327,7 @@
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
@@ -349,11 +360,11 @@
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
 
-    EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
     for (int i = 0; i < packetSize; ++i) {
       coeffRef(index+i) = values[i];

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 014c323..8b35f79 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,82 +21,15 @@
   */
 namespace internal {
 
-template<typename Scalar, typename Index>
-void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
-  size_t psize = packet_traits<Scalar>::size;           // Packet size
-  typedef typename packet_traits<Scalar>::type Packet;  // Packet type
-  size_t alignment = psize*sizeof(Scalar);              // Needed alignment
-  if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
-     (ldsrc*sizeof(Scalar)) % alignment == 0 &&
-     reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
-     reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
-    // Optimized version using packets
-    size_t num_packets = rows / psize;
-    for (Index col = 0; col < cols; ++col) {
-      EIGEN_ASM_COMMENT("begin pack_simple inner copy");
-      // Unrolled manually 4 times.
-      for (size_t i=0; i < num_packets/4; ++i) {
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-      }
-      for (size_t i=0; i < num_packets%4; ++i) {
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-      }
-      dst += lddst - num_packets*psize;
-      src += ldsrc - num_packets*psize;
-      EIGEN_ASM_COMMENT("end pack_simple inner copy");
-    }
-  } else {
-    // Naive memcpy calls
-    for (Index col = 0; col < cols; ++col) {
-      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
-    }
-  }
-}
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-// This is a hack needed to be able to compile contraction for Scalar
-// unsupported by libxsmm (such as int). While we do not use xsmm in that case,
-// the check is performed using if(std::is_same<...>), so compiler runs through
-// both branches and then optimizes out unused one, but still would complain.
-template<typename LhsScalar, typename RhsScalar, typename Scalar>
-struct libxsmm_wrapper {
-  libxsmm_wrapper() {}
-  libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {}
-  void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {}
-  void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {}
-};
-
-template<>
-struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
-  libxsmm_wrapper(): libxsmm_mmfunction() {}
-  libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
-    libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
-};
-
-template<>
-struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
-  libxsmm_wrapper(): libxsmm_mmfunction() {}
-  libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
-    libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
-};
-#endif
-
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename scalar_product_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ReturnType Scalar;
+  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
 
-  typedef typename scalar_product_traits<typename traits<LhsXprType>::StorageKind,
-                                         typename traits<RhsXprType>::StorageKind>::ReturnType StorageKind;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
                                       typename traits<RhsXprType>::Index>::type Index;
   typedef typename LhsXprType::Nested LhsNested;
@@ -105,55 +38,308 @@
   typedef typename remove_reference<RhsNested>::type _RhsNested;
 
   // From NumDims below.
-  static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
   static const int Layout = traits<LhsXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                               typename traits<LhsXprType>::PointerType,
+                               typename traits<RhsXprType>::PointerType>::type
+      PointerType;
 
   enum {
-    Flags = 0,
+    Flags = 0
   };
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
 {
-  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
+  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
 {
-  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
+  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
 };
 
-template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
-struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
   typedef Indices_ Indices;
   typedef LeftArgType_ LeftArgType;
   typedef RightArgType_ RightArgType;
+  typedef OutputKernelType_ OutputKernelType;
   typedef Device_ Device;
 
   // From NumDims below.
   static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
+// Helper class to allocate and deallocate temporary memory for packed buffers.
+template <typename LhsScalar, typename RhsScalar>
+struct TensorContractionBlockMemAllocator {
+  typedef void* BlockMemHandle;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm,
+                                                   const Index bk,
+                                                   const Index bn,
+                                                   LhsScalar** lhs_block,
+                                                   RhsScalar** rhs_block) {
+    eigen_assert(lhs_block);
+    eigen_assert(rhs_block);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
+    eigen_assert(block_mem);
+    *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
+    *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(
+      Device& d, const Index bm, const Index bk, const Index bn,
+      const Index num_lhs, const Index num_rhs, const Index num_slices,
+      std::vector<LhsScalar*>* lhs_blocks,
+      std::vector<RhsScalar*>* rhs_blocks) {
+    eigen_assert(num_slices > 0);
+    eigen_assert(num_lhs >= 0 && num_rhs >= 0);
+    eigen_assert(num_lhs == 0 || lhs_blocks);
+    eigen_assert(num_rhs == 0 || rhs_blocks);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    void* block_mem = d.allocate(
+        (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
+    eigen_assert(block_mem);
+    char* mem = static_cast<char*>(block_mem);
+
+    for (Index x = 0; x < num_slices; x++) {
+      if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
+      for (Index m = 0; m < num_lhs; m++) {
+        lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
+        mem += sz.lhs_size;
+      }
+      if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
+      for (Index n = 0; n < num_rhs; n++) {
+        rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
+        mem += sz.rhs_size;
+      }
+    }
+
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    d.deallocate(handle);
+  }
+
+ private:
+  struct BlockSizes {
+    Index lhs_size;
+    Index rhs_size;
+  };
+  EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm,
+                                                              const Index bk,
+                                                              const Index bn) {
+    Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+    BlockSizes sz;
+    sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+    return sz;
+  }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlocPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on Eigen contraction evaluator.
+// This also includes full support in TensorContractionThreadPool, assuming that
+// underlying gemm do not use it's own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of ResScalar
+//   type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
+//   view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+    typename StorageIndex, typename OutputMapper, typename LhsMapper,
+    typename RhsMapper>
+struct TensorContractionKernel {
+  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+  // (otherwise beta should be always equal to 1).
+  enum { HasBeta = false };
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_,
+                          StorageIndex bm_, StorageIndex bk_, StorageIndex bn_)
+      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+  typedef LhsScalar* LhsBlock;
+  typedef RhsScalar* RhsBlock;
+
+  // Packed Lhs/Rhs block memory allocator.
+  typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar>
+      BlockMemAllocator;
+  typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
+
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<
+      LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
+      Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+      /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block,
+                                            RhsBlock* rhs_block) {
+    return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(
+      Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
+      const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
+      std::vector<RhsBlock>* rhs_blocks) {
+    return BlockMemAllocator::allocateSlices(
+        d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    BlockMemAllocator::deallocate(d, handle);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(
+      LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+        /*offset*/ 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(
+      RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(*rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(
+      const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
+      const RhsBlock& rhsBlock, const StorageIndex rows,
+      const StorageIndex depth, const StorageIndex cols,
+      const ResScalar alpha, const ResScalar beta) {
+    // Default GEBP kernel does not support beta.
+    eigen_assert(beta == ResScalar(1));
+    static const int kComputeStrideFromBlockDimensions = -1;
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+        /*strideA*/ kComputeStrideFromBlockDimensions,
+        /*strideB*/ kComputeStrideFromBlockDimensions,
+        /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+
+ private:
+  // These are dimensions of the original Tensors, and selected block sizes. The
+  // actual block sizes passed to all function above might be smaller because of
+  // the partial blocks at the end.
+  const StorageIndex m;
+  const StorageIndex k;
+  const StorageIndex n;
+  const StorageIndex bm;
+  const StorageIndex bk;
+  const StorageIndex bn;
+};
+
 }  // end namespace internal
 
-template<typename Indices, typename LhsXprType, typename RhsXprType>
-class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType> >
+// Tensor contraction params that should enable to get from output matrix
+// 2-dimensional coordinates to the output tensor dimensions.
+struct TensorContractionParams {
+  // TensorContraction evaluator assumes that both tensors are in ColMajor
+  // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
+  bool swapped_arguments;
+};
+
+// Output kernel allows to fuse operations into the tensor contraction.
+//
+// Examples:
+//   1. Elementwise Relu transformation following Conv2D.
+//   2. AddBias to the Conv2D output channels dimension.
+//
+// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
+struct NoOpOutputKernel {
+  /**
+   * Tensor contraction evaluator calls this kernel after finishing each block
+   * of output matrix. Output blocks belong to the 2-dimensional output tensor.
+   *
+   * TensorContractionParams contains contraction dimensions information
+   * required to map output 2-d space into the expected output tensor space
+   * (potentially higher dimensional).
+   *
+   * \param[in] output_mapper Access to output tensor memory
+   * \param[in] params   Tensor contraction parameters
+   * \param[in] i        Index of a first row available through output_mapper
+   * \param[in] j        Index of a first column available through output_mapper
+   * \param[in] num_rows Number of available rows
+   * \param[in] num_cols Number of available columns
+   */
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams& params, Index i,
+      Index j, Index num_rows, Index num_cols) const {
+    EIGEN_UNUSED_VARIABLE(output_mapper);
+    EIGEN_UNUSED_VARIABLE(params);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(num_rows);
+    EIGEN_UNUSED_VARIABLE(num_cols);
+  }
+};
+
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::scalar_product_traits<typename LhsXprType::CoeffReturnType,
-                                                   typename RhsXprType::CoeffReturnType>::ReturnType CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
-      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
-      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
+      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
+      const OutputKernelType& output_kernel = OutputKernelType())
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
+        m_output_kernel(output_kernel) {}
 
-  EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; }
+  EIGEN_DEVICE_FUNC
+  const Indices& indices() const { return m_indices; }
 
   /** \returns the nested expressions */
   EIGEN_DEVICE_FUNC
@@ -164,36 +350,48 @@
   const typename internal::remove_all<typename RhsXprType::Nested>::type&
   rhsExpression() const { return m_rhs_xpr; }
 
+  EIGEN_DEVICE_FUNC
+  const OutputKernelType& outputKernel() const { return m_output_kernel; }
+
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
     const Indices m_indices;
+    const OutputKernelType m_output_kernel;
 };
 
 
 template<typename Derived>
-struct TensorContractionEvaluatorBase
+struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 {
   typedef typename internal::traits<Derived>::Indices Indices;
   typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
   typedef typename internal::traits<Derived>::RightArgType RightArgType;
+  typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
   typedef typename internal::traits<Derived>::Device Device;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = false,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   // Most of the code is assuming that both input tensors are ColMajor. If the
   // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
   // If we want to compute A * B = C, where A is LHS and B is RHS, the code
@@ -203,6 +401,9 @@
   typedef typename internal::conditional<
     static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
 
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
+
   static const int LDims =
       internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
   static const int RDims =
@@ -210,26 +411,26 @@
   static const int ContractDims = internal::array_size<Indices>::value;
   static const int NumDims = LDims + RDims - 2 * ContractDims;
 
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
   typedef array<Index, ContractDims> contract_t;
   typedef array<Index, LDims - ContractDims> left_nocontract_t;
   typedef array<Index, RDims - ContractDims> right_nocontract_t;
 
   typedef DSizes<Index, NumDims> Dimensions;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_STRONG_INLINE
   TensorContractionEvaluatorBase(const XprType& op, const Device& device)
       : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                           op.lhsExpression(), op.rhsExpression()), device),
         m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
-                          op.rhsExpression(), op.lhsExpression()), device),
+                           op.rhsExpression(), op.lhsExpression()), device),
         m_device(device),
+        m_output_kernel(op.outputKernel()),
         m_result(NULL) {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
-                         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
 
+
     DSizes<Index, LDims> eval_left_dims;
     DSizes<Index, RDims> eval_right_dims;
     array<IndexPair<Index>, ContractDims> eval_op_indices;
@@ -263,34 +464,28 @@
     }
 
     // Check for duplicate axes and make sure the first index in eval_op_indices
-    // is increasing. Using O(n^2) sorting is OK since ContractDims is at
-    // most 8.
+    // is increasing. Using O(n^2) sorting is OK since ContractDims is small
     for (int i = 0; i < ContractDims; i++) {
       for (int j = i + 1; j < ContractDims; j++) {
         eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
                      eval_op_indices[j].second != eval_op_indices[i].second &&
                      "contraction axes should be unique");
         if (eval_op_indices[j].first < eval_op_indices[i].first) {
-          using numext::swap;
-          swap(eval_op_indices[j], eval_op_indices[i]);
+          numext::swap(eval_op_indices[j], eval_op_indices[i]);
         }
       }
     }
 
     array<Index, LDims> lhs_strides;
-    if (LDims > 0) {
-      lhs_strides[0] = 1;
-      for (int i = 0; i < LDims-1; ++i) {
-        lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
-      }
+    lhs_strides[0] = 1;
+    for (int i = 0; i < LDims-1; ++i) {
+      lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
     }
 
     array<Index, RDims> rhs_strides;
-    if (RDims > 0) {
-      rhs_strides[0] = 1;
-      for (int i = 0; i < RDims-1; ++i) {
-        rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
-      }
+    rhs_strides[0] = 1;
+    for (int i = 0; i < RDims-1; ++i) {
+      rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
     }
 
     if (m_i_strides.size() > 0) m_i_strides[0] = 1;
@@ -302,14 +497,12 @@
     m_k_size = 1;
 
     // To compute the dimension, we simply concatenate the non-contracting
-    // dimensions of the left and then the right tensor. Additionally, I also
-    // want to compute the cumulative products of the left non-contracting
-    // dimensions, right non-contracting dimensions, and the contracting
-    // dimensions (in the order of the contraction) to aid in the later
-    // computation of tensor indices for matrix indices.
+    // dimensions of the left and then the right tensor. Additionally, we also
+    // compute the strides corresponding to the left non-contracting
+    // dimensions and right non-contracting dimensions.
     m_lhs_inner_dim_contiguous = true;
     int dim_idx = 0;
-    int nocontract_idx = 0;
+    Index nocontract_idx = 0;
 
     for (int i = 0; i < LDims; i++) {
       // find if we are contracting on index i of left tensor
@@ -362,10 +555,11 @@
       }
     }
 
-    // now build contraction cumprod. We assumed above that non-contracting axes
-    // are represented in the same order in the matrix as they are in the tensor.
-    // This is not the case for contracting axes. As the contracting axes must be
-    // of the same size in each tensor, I'll only look at the first tensor here.
+    // Now compute the strides corresponding to the contracting dimensions. We
+    // assumed above that non-contracting axes are represented in the same order
+    // in the matrix as they are in the tensor. This is not the case for
+    // contracting axes. As the contracting axes must be of the same size in
+    // each tensor, we'll only look at the first tensor here.
     m_rhs_inner_dim_contiguous = true;
     m_rhs_inner_dim_reordered = false;
     for (int i = 0; i < ContractDims; i++) {
@@ -376,7 +570,7 @@
       eigen_assert(size == eval_right_dims[right] &&
                    "Contraction axes must be same size");
 
-      if (i+1 < internal::array_size<contract_t>::value) {
+      if (i+1 < static_cast<int>(internal::array_size<contract_t>::value)) {
         m_k_strides[i+1] = m_k_strides[i] * size;
       } else {
         m_k_size = m_k_strides[i] * size;
@@ -392,77 +586,6 @@
       }
     }
 
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-    // Check if we can use faster matmul algorithms. For contraction to be
-    // equivalent to matmul, we need both lhs and rhs contracting dims sequences
-    // to be either a prefix or suffix of all dims. Also, the order of both
-    // must be the same, so we don't have to do reordering.
-    // For example:
-    // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
-    // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
-    // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
-    // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
-    // Depending if contraction dims are prefix or suffix of all dims we need to
-    // pre-transpose matrices in matmul algorithm:
-    // lhs: prefix -> transpose, suffix -> no transpose
-    // rhs: prefix -> no transpose, suffix -> transpose
-    // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
-    // non-transposed matmul.
-    bool equivalent_to_matmul = true;
-
-    if (ContractDims == 0) {
-      // This case is totally uninteresting, filter it out to avoid problems
-      // with iterations in further tests.
-      equivalent_to_matmul = false;
-    }
-
-    // Check if RHS dims list is increasing. LHS already is, so if not, the
-    // order is different and we cannot do matmul.
-    for (int i = 1; i < ContractDims; i++) {
-      if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
-        equivalent_to_matmul = false;
-      }
-    }
-
-    if (equivalent_to_matmul) {
-      // Check if no holes.
-      int diff;
-      for (int i = 1; i < ContractDims; i++) {
-        // LHS contract dims are sorted to form an increasing seq.
-        diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
-        if (diff != 1) {
-          equivalent_to_matmul = false;
-        }
-        // Now we may already assume RHS contract dims seq is increasing too.
-        diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
-        if (diff != 1) {
-          equivalent_to_matmul = false;
-        }
-      }
-
-      // Check if suffix or prefix.
-      if (eval_op_indices[0].first != 0 &&
-          eval_op_indices[ContractDims-1].first != LDims-1) {
-        equivalent_to_matmul = false;
-      }
-      if (eval_op_indices[0].second != 0 &&
-          eval_op_indices[ContractDims-1].second != RDims-1) {
-        equivalent_to_matmul = false;
-      }
-    }
-
-    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    m_can_use_xsmm = equivalent_to_matmul &&
-       std::is_same<Scalar, LhsScalar>::value &&
-       std::is_same<Scalar, RhsScalar>::value &&
-       (std::is_same<Scalar, float>::value ||
-        std::is_same<Scalar, double>::value) &&
-       m_leftImpl.data() != NULL && m_rightImpl.data() != NULL;
-#else
-    m_can_use_xsmm = false;
-#endif
-
     // If the layout is RowMajor, we need to reverse the m_dimensions
     if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
       for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -470,67 +593,125 @@
       }
     }
 
-    eigen_assert(ContractDims == 0 || !m_lhs_inner_dim_contiguous ||
-                 (m_left_contracting_strides[0] == m_i_size));
+    // A set of parameters that will allow output kernel to get from output
+    // tensor dimensions (i, j) into the original tensor dimensions.
+    // TODO(ezhulenev): Add parameters required to infer output tensor index for
+    // more complex contractions than 2x2 on internal dimension.
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
   }
 
-#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS)   \
-    if (this->m_lhs_inner_dim_contiguous) { \
-      if (this->m_rhs_inner_dim_contiguous) { \
-        if (this->m_rhs_inner_dim_reordered) { \
-          METHOD<true, true, true, ALIGNMENT>ARGS;    \
-        } \
-        else { \
-          METHOD<true, true, false, ALIGNMENT>ARGS; \
-        } \
-      } \
-      else { \
-       if (this->m_rhs_inner_dim_reordered) { \
-          METHOD<true, false, true, ALIGNMENT>ARGS; \
-        } \
-        else { \
-          METHOD<true, false, false, ALIGNMENT>ARGS; \
-        } \
-      } \
-    } \
-    else { \
-      if (this->m_rhs_inner_dim_contiguous) { \
-        if (this->m_rhs_inner_dim_reordered) { \
-          METHOD<false, true, true, ALIGNMENT>ARGS; \
-        } \
-        else { \
-          METHOD<false, true, false, ALIGNMENT>ARGS; \
-        } \
-      } \
-      else { \
-       if (this->m_rhs_inner_dim_reordered) { \
-          METHOD<false, false, true, ALIGNMENT>ARGS; \
-        } \
-        else { \
-          METHOD<false, false, false, ALIGNMENT>ARGS; \
-        } \
-      } \
-    }
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+        if (dest) {
+          evalToAsync(dest, [done]() { done(false); });
+        } else {
+          m_result = static_cast<EvaluatorPointerType>(
+              m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          evalToAsync(m_result, [done]() { done(true); });
+        }
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+  if (this->m_lhs_inner_dim_contiguous) {                    \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, true, true, ALIGNMENT> ARGS;            \
+      } else {                                               \
+        METHOD<true, true, false, ALIGNMENT> ARGS;           \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, false, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<true, false, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    }                                                        \
+  } else {                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, true, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<false, true, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, false, true, ALIGNMENT> ARGS;          \
+      } else {                                               \
+        METHOD<false, false, false, ALIGNMENT> ARGS;         \
+      }                                                      \
+    }                                                        \
+  }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+  if (this->m_lhs_inner_dim_contiguous) {                                    \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN;            \
+      } else {                                                               \
+        (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN;           \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    }                                                                        \
+  } else {                                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN;          \
+      } else {                                                               \
+        (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN;         \
+      }                                                                      \
+    }                                                                        \
+  }
+#endif
 
   EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
-    static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+   static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
   }
 
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalToCallback>
+  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+    static_cast<const Derived*>(this)
+        ->template evalProductAsync<EvalToCallback, Unaligned>(buffer,
+                                                               std::move(done));
+  }
+#endif  // EIGEN_USE_THREADS
+
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProductSequential(Scalar* buffer) const {
@@ -545,20 +726,19 @@
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
   void evalGemv(Scalar* buffer) const {
     const Index rows = m_i_size;
     const Index cols = m_k_size;
 
-    internal::EigenStatsWrapper::get()->add(internal::MatmulOp{
-      internal::MatmulOp::Algorithm::GEMV, static_cast<std::size_t>(rows), static_cast<std::size_t>(cols), 1,
-      !lhs_inner_dim_contiguous, !rhs_inner_dim_contiguous, 1});
-
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-    const int lhs_packet_size = PacketType<LhsScalar, Device>::size;
-    const int rhs_packet_size = PacketType<RhsScalar, Device>::size;
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
     const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
     const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
     typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
@@ -578,58 +758,49 @@
     RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
                   m_right_contracting_strides, m_k_strides);
 
-    const RhsScalar alpha(1);
+    const Scalar alpha(1);
     const Index resIncr(1);
 
     // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
     m_device.memset(buffer, 0, rows * sizeof(Scalar));
 
-    internal::general_matrix_vector_product
-          <Index, LhsScalar, LhsMapper, ColMajor, false,
-                  RhsScalar, RhsMapper, false>::run(
+    internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
         rows, cols, lhs, rhs,
         buffer, resIncr, alpha);
-  }
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
-    typedef
-        typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
-        LhsScalar;
-    typedef
-        typename internal::remove_const<typename EvalRightArgType::Scalar>::type
-        RhsScalar;
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-    if (m_can_use_xsmm) {
-       evalGemmPartialXSMM(buffer, 0, this->m_k_size, 1);
-       return;
-    }
-    #endif
-    // Use more generic Gebp
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    this->template evalGemmPartial<lhs_inner_dim_contiguous,
-                                   rhs_inner_dim_contiguous,
-                                   rhs_inner_dim_reordered, Alignment>(
-                                       buffer, 0, this->m_k_size, 1);
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
+                    static_cast<Index>(0), static_cast<Index>(0), rows,
+                    static_cast<Index>(1));
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  void evalGemm(Scalar* buffer) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
+    this->template evalGemmPartial<lhs_inner_dim_contiguous,
+                                   rhs_inner_dim_contiguous,
+                                   rhs_inner_dim_reordered,
+                                   Alignment, true>(buffer, 0, k, 1);
+  }
 
-    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= k);
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+      bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                    rhs_inner_dim_reordered, Alignment,
+        /*use_output_kernel*/ false>(buffer, k_start, k_end,
+                                     num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+    // columns in slice on left side, rows on right side
     const Index k_slice = k_end - k_start;
 
     // rows in left side
@@ -638,23 +809,15 @@
     // columns in right side
     const Index n = this->m_j_size;
 
-    internal::EigenStatsWrapper::get()->add(internal::MatmulOp{
-      internal::MatmulOp::Algorithm::GEBP, static_cast<std::size_t>(m), static_cast<std::size_t>(k), static_cast<std::size_t>(n),
-      !lhs_inner_dim_contiguous, !rhs_inner_dim_contiguous, static_cast<std::size_t>(num_threads)});
-
-    // define mr, nr, and all of my data mapper types
+    // define data mappers for Lhs and Rhs
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
 
-    const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
-    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
 
     typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
                                                    LeftEvaluator, left_nocontract_t,
@@ -670,13 +833,9 @@
 
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    // declare GEBP packing and kernel structs
-    // TODO: packing could be faster sometimes if we supported row major tensor mappers
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
-    // TODO: replace false, false with conjugate values?
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -687,198 +846,73 @@
 
     OutputMapper output(buffer, m);
 
-    // compute block sizes (which depend on number of threads)
-    internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
-                                        internal::ShardByCol>
+    // Sizes of the blocks to load in cache. See the Goto paper for details.
+    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
+                                        Index, internal::ShardByCol>
         blocking(k_slice, m, n, num_threads);
-
     const Index kc = blocking.kc();
-    const Index mc = (std::min<Index>)(m, blocking.mc());
-    const Index nc = (std::min<Index>)(n, blocking.nc());
+    const Index mc = numext::mini(m, blocking.mc());
+    const Index nc = numext::mini(n, blocking.nc());
 
-    // sizes of submatrices to live in cache. see Goto paper.
-    int sizeA = blocking.mc() * kc;
-    int sizeB = kc * blocking.nc();
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
 
-    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
-    //       aren't 16 byte aligned segfaults will happen due to SIMD instructions
-    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
-    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+    LhsBlock blockA;
+    RhsBlock blockB;
 
-    for (Index i2=0; i2<m; i2+=mc)
+    TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
+
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+    const BlockMemHandle packed_mem =
+        kernel.allocate(this->m_device, &blockA, &blockB);
+
+    // If a contraction kernel does not support beta, explicitly initialize
+    // output buffer with zeroes.
+    if (!TensorContractionKernel::HasBeta) {
+      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+    }
+
+    for(Index i2=0; i2<m; i2+=mc)
     {
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
       for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
         const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
-        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+        kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // If kernel supports beta, there is no need to initialize output
+        // buffer with zeroes.
+        const Scalar alpha = Scalar(1);
+        const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
+                                ? Scalar(0)
+                                : Scalar(1);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
           const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+          kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
+                         actual_nc);
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
-          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
-        }
-      }
-    }
+          const OutputMapper output_mapper = output.getSubMapper(i2, j2);
+          kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
+                        actual_nc, alpha, beta);
 
-    this->m_device.deallocate(blockA);
-    this->m_device.deallocate(blockB);
-  }
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-  EIGEN_DEVICE_FUNC void evalGemmPartialXSMM(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
-    const Index k = this->m_k_size;
-
-    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= k);
-    const Index k_slice = k_end - k_start;
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    const bool transposeA = !m_lhs_inner_dim_contiguous;
-    const bool transposeB = !m_rhs_inner_dim_contiguous;
-
-    internal::EigenStatsWrapper::get()->add(internal::MatmulOp{
-        internal::MatmulOp::Algorithm::XSMM, m, k, n, transposeA, transposeB,
-        num_threads});
-
-    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
-    internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
-        k_end - k_start, m, n, num_threads, transposeA, transposeB);
-
-    // Outer blocks sizes
-    const Index mc_outer = blocking.outer_m();
-    const Index nc_outer = blocking.outer_n();
-    const Index kc_outer = blocking.outer_k();
-    // Inner blocks sizes
-    const Index mc = blocking.mc();
-    const Index nc = blocking.nc();
-    const Index kc = blocking.kc();
-    // Decisions whether we should copy parts of matrices
-    const bool copyA = blocking.copyA();
-    const bool copyB = blocking.copyB();
-
-    const LhsScalar * leftData = m_leftImpl.data();
-    const RhsScalar * rightData = m_rightImpl.data();
-
-    libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
-    libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
-    libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
-
-    libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
-    // Use bigger stride to avoid hitting same cache line too often.
-    // This consistently gives +~0.5 Gflops.
-    libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
-        kc % 32 == 0 ? kc + 16 : kc
-    );
-
-    // Kernel for the general case (not edges)
-    internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
-
-    const LhsScalar *ap;
-    const RhsScalar *bp;
-    const Scalar *cp;
-
-    LhsScalar* blockA;
-    RhsScalar* panelB;
-
-    if (copyA) {
-      blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
-    }
-    if (copyB) {
-      panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
-    }
-
-    Index kernel_stride_A = copyA ? stride_blockA : stride_A;
-    Index kernel_stride_B = copyB ? stride_panelB : stride_B;
-    kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
-
-    // Outer blocking
-    for (Index ki_outer = k_start; ki_outer < k_end; ki_outer += kc_outer) {
-      for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
-        for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
-          using numext::mini;
-
-          Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
-
-          // Inner blocking
-          for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k_end); ki += kc) {
-            const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k_end)) - ki;
-
-            if (copyB) {
-              if (transposeB) {
-                libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
-              } else {
-                internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
-              }
-            }
-
-            for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
-              const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
-
-              const LhsScalar * a = transposeA ? leftData + mi*stride_A + ki :
-                                                 leftData + ki*stride_A + mi;
-
-              if (copyA) {
-                if (transposeA) {
-                  libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
-                } else {
-                  internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
-                }
-              }
-
-              for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
-                const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
-
-
-                const RhsScalar * b = rightData + ni*stride_B + ki;
-                Scalar * c = buffer + ni*stride_C + mi;
-                cp = c + nc*stride_C;
-
-                const LhsScalar * actual_a = copyA ? blockA : a;
-                const Index actual_lda = copyA ? stride_blockA : stride_A;
-                ap = copyA ? blockA : a;
-
-                const RhsScalar * actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
-                const Index actual_ldb = copyB ? stride_panelB : stride_B;
-                bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
-
-                float beta = ki == 0 ? 0 : 1;
-
-                if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
-                  // Most used, cached kernel.
-                  kernel(actual_a, actual_b, c, ap, bp, cp);
-                } else {
-                  // Edges - use libxsmm kernel cache.
-                  internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, actual_lda, actual_ldb, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, ap, bp, cp);
-                }
-              }
-            }
+          // We are done with this [i2, j2] output block.
+          if (use_output_kernel && k2 + kc >= k_end) {
+            m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
+                            actual_mc, actual_nc);
           }
         }
       }
     }
 
-    if (copyA) {
-      this->m_device.deallocate(blockA);
-    }
-    if (copyB) {
-      this->m_device.deallocate(panelB);
-    }
+    kernel.deallocate(this->m_device, packed_mem);
   }
-#endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
 
@@ -892,23 +926,18 @@
     return m_result[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
   }
 
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
 
-  protected:
-  // Note: nvcc doesn't like implicit copy constructor. If this is needed anywhere,
-  // then we'll have to write an explicit copy constructor...
-  //TensorContractionEvaluatorBase(const TensorContractionEvaluatorBase&);
-
-  TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
+protected:
   Dimensions m_dimensions;
 
   contract_t m_k_strides;
@@ -918,7 +947,6 @@
   bool m_lhs_inner_dim_contiguous;
   bool m_rhs_inner_dim_contiguous;
   bool m_rhs_inner_dim_reordered;
-  bool m_can_use_xsmm;
 
   left_nocontract_t m_i_strides;
   right_nocontract_t m_j_strides;
@@ -929,29 +957,32 @@
   Index m_j_size;
   Index m_k_size;
 
+  TensorContractionParams m_tensor_contraction_params;
+
   TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
   TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
-  const Device& m_device;
-  Scalar* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  OutputKernelType m_output_kernel;
+  EvaluatorPointerType m_result;
 };
 
 
 // evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
     public TensorContractionEvaluatorBase<
-      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
   enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout
   };
 
   // Most of the code is assuming that both input tensors are ColMajor. If the
@@ -969,9 +1000,6 @@
       internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
   static const int ContractDims = internal::array_size<Indices>::value;
 
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
-
   typedef array<Index, ContractDims> contract_t;
   typedef array<Index, LDims - ContractDims> left_nocontract_t;
   typedef array<Index, RDims - ContractDims> right_nocontract_t;
@@ -981,8 +1009,7 @@
   // Could we use NumDimensions here?
   typedef DSizes<Index, NumDims> Dimensions;
 
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+  TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) { }
 
   template <int Alignment>

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5bbf602..974feb0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h

@@ -10,9 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
 
-#if defined(EIGEN_USE_LIBXSMM)
-// #include "third_party/libxsmm/include/libxsmm_cpuid.h"
-#endif
 
 namespace Eigen {
 namespace internal {
@@ -24,11 +21,28 @@
 
 
 // Default Blocking Strategy
-template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
 class TensorContractionBlocking {
  public:
 
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ /*
+   adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
+     requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
+     which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
+     which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+     (else HIPCC will error out)
+
+   However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+   results in NVCC erroring out with the following error
+
+   ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
+      dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
+ */
+
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+ TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@@ -37,151 +51,22 @@
     else {
       computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
     }
+
+    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+      kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
   }
 
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
 
  private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
 };
 
-#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
-class TensorXsmmContractionBlocking {
- public:
-  TensorXsmmContractionBlocking(Index k, Index m, Index n,
-      size_t max_num_threads = 1, bool transposeA = false,
-      bool transposeB = false):
-    k_(k), m_(m), n_(n), num_threads_(max_num_threads), transposeA_(transposeA),
-    transposeB_(transposeB) {
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
-    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
-      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
-      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
-      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
-      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
-      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
-      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
-      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
-      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
-      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
-      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
-      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
-    }
-#else
-    // Defaults, possibly overriden per-platform.
-    copyA_ = true;
-    copyB_ = false;
-
-    // If the matrix is small enough, don't do blocking, just call single xsmm
-    // kernel.
-    if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
-      mc_ = m; kc_ = k; nc_ = n;
-      outer_m_ = m; outer_k_ = k; outer_n_ = n;
-      copyA_ = false; copyB_ = false;
-    } else {
-      int arch = libxsmm_cpuid_x86();
-
-      if (arch == LIBXSMM_X86_AVX512_CORE) {
-        // skylake
-        mc_ = 64; kc_ = 64; nc_ = 24;
-        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
-        // Hack to use this kernel architecture as the other one has performance
-        // issues (no hardware prefetching).
-        // TODO(nishantpatil): This should be removed if the issues are fixed,
-        // or this one becomes the default.
-        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
-      } else if (arch == LIBXSMM_X86_AVX2) {
-        // haswell
-        mc_ = 32; kc_ = 192; nc_ = 33;
-        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
-      } else if (arch == LIBXSMM_X86_AVX) {
-        // ivybridge
-        mc_ = 32; kc_ = 192; nc_ = 48;
-        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
-      } else {
-        // generic kernel size, usually performing well
-        mc_ = 32; kc_ = 128; nc_ = 32;
-        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
-      }
-
-      // Only copy if it makes the stride smaller.
-      copyA_ = copyA_ && (m > mc_);
-      copyB_ = copyB_ && (k > kc_);
-    }
-
-    // We need to copy anyway if transposing
-    copyA_ = copyA_ || transposeA;
-    copyB_ = copyB_ || transposeB;
-
-    // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
-    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
-
-#endif
-
-    mc_ = mc_ > m ? m : mc_;
-    nc_ = nc_ > n ? n : nc_;
-    kc_ = kc_ > k ? k : kc_;
-
-    size_t compute_parallelism = (m / mc_) * (n / nc_);
-    size_t pack_parallelism = 0;
-    if (copyA_) {
-      pack_parallelism += (m / mc_) * (k / kc_);
-    }
-    if (copyB_) {
-      pack_parallelism += (n / nc_) * (k / kc_);
-    }
-    size_t parallelism = std::max(compute_parallelism, pack_parallelism);
-
-    num_threads_ = std::min<size_t>(num_threads_,
-                                    parallelism / MIN_JOBS_PER_THREAD);
-    num_threads_ = std::max<size_t>(num_threads_, 1);
-
-    // For optimal performance outer block sizes should be multiplies of kernel
-    // sizes, or bigger than matrix size (=no outer blocking).
-    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
-    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
-    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
-  }
-
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
-  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
-  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
-  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
-  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
-  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
-  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
-  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
-  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
-  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
-    return prefetch_;
-  }
-
- private:
-  Index k_, m_, n_;
-  Index kc_, mc_, nc_;
-  Index outer_k_, outer_m_, outer_n_;
-  bool copyA_, copyB_, transposeA_, transposeB_;
-  size_t num_threads_;
-
-  // Threshold for m*k*n to skip blocking and just call libxsmm
-  const double LIBXSMM_THRESHOLD = 80*80*80;
-  // For computing optimal number of threads - so that each thread gets at least
-  // that many jobs.
-  const double MIN_JOBS_PER_THREAD = 3;
-  libxsmm_gemm_prefetch_type prefetch_;
-};
-#endif // EIGEN_USE_LIBXSMM
-
 } // end namespace internal
 } // end namespace Eigen
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 19925d9..3f315fe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h

@@ -1,1396 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
+#endif
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
-                       const Index m_size, const Index n_size, const Index k_size) {
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  // declare and initialize 64 registers for output 8x8 block
-
-  // prefetch registers
-  Scalar lhs_pf0;
-  Scalar lhs_pf1;
-  Scalar lhs_pf2;
-  Scalar lhs_pf3;
-  Scalar lhs_pf4;
-  Scalar lhs_pf5;
-  Scalar lhs_pf6;
-  Scalar lhs_pf7;
-
-  Scalar rhs_pf0;
-  Scalar rhs_pf1;
-  Scalar rhs_pf2;
-  Scalar rhs_pf3;
-  Scalar rhs_pf4;
-  Scalar rhs_pf5;
-  Scalar rhs_pf6;
-  Scalar rhs_pf7;
-
-  // shared memory is formatted
-  // (contract idx in block, nocontract idx in block, block idx)
-  // where block idx is column major. This transposition limits the number of
-  // bank conflicts when reading the LHS. The core idea is that since the contracting
-  // index is shared by both sides, then the contracting index should be in threadIdx.x.
-
-  // On the LHS, we pad each row inside of each block with an extra element. This makes
-  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
-  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
-  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
-  // conflicts on writes and also none on reads.
-
-  // storage indices
-  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
-  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
-
-  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
-  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
-  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
-  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
-  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
-  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
-  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
-  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
-  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
-  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
-  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
-  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
-  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
-  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
-  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
-  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
-  // in the loading code, the following variables are important:
-  // threadIdx.x: the vertical position in an 8x8 block
-  // threadIdx.y: the vertical index of the 8x8 block in the grid
-  // threadIdx.z: the horizontal position in an 8x8 block
-  // k: the horizontal index of the 8x8 block in the grid
-  //
-  // The k parameter is implicit (it was the loop counter for a loop that went
-  // from 0 to <8, but now that loop is unrolled in the below code.
-
-  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
-  const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k)                           \
-  {                                                             \
-    lhs_pf0 = conv(0);                                          \
-    lhs_pf1 = conv(0);                                          \
-    lhs_pf2 = conv(0);                                          \
-    lhs_pf3 = conv(0);                                          \
-    lhs_pf4 = conv(0);                                          \
-    lhs_pf5 = conv(0);                                          \
-    lhs_pf6 = conv(0);                                          \
-    lhs_pf7 = conv(0);                                          \
-                                                                \
-    rhs_pf0 = conv(0);                                          \
-    rhs_pf1 = conv(0);                                          \
-    rhs_pf2 = conv(0);                                          \
-    rhs_pf3 = conv(0);                                          \
-    rhs_pf4 = conv(0);                                          \
-    rhs_pf5 = conv(0);                                          \
-    rhs_pf6 = conv(0);                                          \
-    rhs_pf7 = conv(0);                                          \
-                                                                \
-    if (!needs_edge_check || lhs_vert < m_size) {               \
-      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
-      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
-      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
-      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
-      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
-      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
-      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
-      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
-      } else if (lhs_horiz_6 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-      } else if (lhs_horiz_5 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-      } else if (lhs_horiz_4 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-      } else if (lhs_horiz_3 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-      } else if (lhs_horiz_2 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-      } else if (lhs_horiz_1 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-      } else if (lhs_horiz_0 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-                                                                \
-    const Index rhs_vert = base_k + load_idx_vert;              \
-    if (!needs_edge_check || rhs_vert < k_size) {               \
-      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
-      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
-      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
-      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
-      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
-      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
-      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
-      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (rhs_horiz_7 < n_size) {                               \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
-      } else if (rhs_horiz_6 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-      } else if (rhs_horiz_5 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-      } else if (rhs_horiz_4 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-      } else if (rhs_horiz_3 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-      } else if (rhs_horiz_2 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-      } else if (rhs_horiz_1 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-      } else if (rhs_horiz_0 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-  }                                                             \
-
-#define writeRegToShmem(_)                      \
-  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
-  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
-                                                \
-  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
-  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
-                                                \
-  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
-  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
-                                                \
-  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
-  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
-                                                \
-  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
-  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
-                                                \
-  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
-  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
-                                                \
-  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
-  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
-                                                \
-  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
-  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
-
-  // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i)                        \
-  Scalar res(i, 0) = conv(0);                   \
-  Scalar res(i, 1) = conv(0);                   \
-  Scalar res(i, 2) = conv(0);                   \
-  Scalar res(i, 3) = conv(0);                   \
-  Scalar res(i, 4) = conv(0);                   \
-  Scalar res(i, 5) = conv(0);                   \
-  Scalar res(i, 6) = conv(0);                   \
-  Scalar res(i, 7) = conv(0);                   \
-
-  internal::scalar_cast_op<int, Scalar> conv;
-  initResultRow(0);
-  initResultRow(1);
-  initResultRow(2);
-  initResultRow(3);
-  initResultRow(4);
-  initResultRow(5);
-  initResultRow(6);
-  initResultRow(7);
-#undef initResultRow
-
-  for (Index base_k = 0; base_k < k_size; base_k += 64) {
-    // wait for previous iteration to finish with shmem. Despite common sense,
-    // the code is a bit faster with this here then at bottom of loop
-    __syncthreads();
-
-    prefetchIntoRegisters(base_k);
-    writeRegToShmem();
-
-    #undef prefetchIntoRegisters
-    #undef writeRegToShmem
-
-    // wait for shared mem packing to be done before starting computation
-    __syncthreads();
-
-    // compute 8x8 matrix product by outer product. This involves packing one column
-    // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
-    Scalar lcol(0);
-    Scalar lcol(1);
-    Scalar lcol(2);
-    Scalar lcol(3);
-    Scalar lcol(4);
-    Scalar lcol(5);
-    Scalar lcol(6);
-    Scalar lcol(7);
-
-#define rrow(j) _rrow##j
-    Scalar rrow(0);
-    Scalar rrow(1);
-    Scalar rrow(2);
-    Scalar rrow(3);
-    Scalar rrow(4);
-    Scalar rrow(5);
-    Scalar rrow(6);
-    Scalar rrow(7);
-
-    // Now x corresponds to k, y to m, and z to n
-    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j)                          \
-    lcol(0) = lhs_element(0, j);               \
-    rrow(0) = rhs_element(i, 0);               \
-    lcol(1) = lhs_element(1, j);               \
-    rrow(1) = rhs_element(i, 1);               \
-    lcol(2) = lhs_element(2, j);               \
-    rrow(2) = rhs_element(i, 2);               \
-    lcol(3) = lhs_element(3, j);               \
-    rrow(3) = rhs_element(i, 3);               \
-    lcol(4) = lhs_element(4, j);               \
-    rrow(4) = rhs_element(i, 4);               \
-    lcol(5) = lhs_element(5, j);               \
-    rrow(5) = rhs_element(i, 5);               \
-    lcol(6) = lhs_element(6, j);               \
-    rrow(6) = rhs_element(i, 6);               \
-    lcol(7) = lhs_element(7, j);               \
-    rrow(7) = rhs_element(i, 7);               \
-
-#define computeCol(j)                           \
-    res(0, j) += lcol(0) * rrow(j);             \
-    res(1, j) += lcol(1) * rrow(j);             \
-    res(2, j) += lcol(2) * rrow(j);             \
-    res(3, j) += lcol(3) * rrow(j);             \
-    res(4, j) += lcol(4) * rrow(j);             \
-    res(5, j) += lcol(5) * rrow(j);             \
-    res(6, j) += lcol(6) * rrow(j);             \
-    res(7, j) += lcol(7) * rrow(j);             \
-
-#define computePass(i)                          \
-    loadData(i, i);                             \
-                                                \
-    computeCol(0);                              \
-    computeCol(1);                              \
-    computeCol(2);                              \
-    computeCol(3);                              \
-    computeCol(4);                              \
-    computeCol(5);                              \
-    computeCol(6);                              \
-    computeCol(7);                              \
-
-    computePass(0);
-    computePass(1);
-    computePass(2);
-    computePass(3);
-    computePass(4);
-    computePass(5);
-    computePass(6);
-    computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
-  } // end loop over k
-
-  // we've now iterated over all of the large (ie width 64) k blocks and
-  // accumulated results in registers. At this point thread (x, y, z) contains
-  // the sum across all big k blocks of the product of little k block of index (x, y)
-  // with block of index (y, z). To compute the final output, we need to reduce
-  // the 8 threads over y by summation.
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-
-#define reduceRow(i, mask)                      \
-  shuffleInc(i, 0, mask);                       \
-  shuffleInc(i, 1, mask);                       \
-  shuffleInc(i, 2, mask);                       \
-  shuffleInc(i, 3, mask);                       \
-  shuffleInc(i, 4, mask);                       \
-  shuffleInc(i, 5, mask);                       \
-  shuffleInc(i, 6, mask);                       \
-  shuffleInc(i, 7, mask);                       \
-
-#define reduceMatrix(mask)                      \
-  reduceRow(0, mask);                           \
-  reduceRow(1, mask);                           \
-  reduceRow(2, mask);                           \
-  reduceRow(3, mask);                           \
-  reduceRow(4, mask);                           \
-  reduceRow(5, mask);                           \
-  reduceRow(6, mask);                           \
-  reduceRow(7, mask);                           \
-
-  // actually perform the reduction, now each thread of index (_, y, z)
-  // contains the correct values in its registers that belong in the output
-  // block
-  reduceMatrix(1);
-  reduceMatrix(2);
-  reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
-
-  // now we need to copy the 64 values into main memory. We can't split work
-  // among threads because all variables are in registers. There's 2 ways
-  // to do this:
-  // (1) have 1 thread do 64 writes from registers into global memory
-  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
-  //     each do 8 writes into global memory. We can just overwrite the shared
-  //     memory from the problem we just solved.
-  // (2) is slightly faster than (1) due to less branching and more ILP
-
-  // TODO: won't yield much gain, but could just use currently unused shared mem
-  //       and then we won't have to sync
-  // wait for shared mem to be out of use
-  __syncthreads();
-
-#define writeResultShmem(i, j)                                          \
-  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
-
-#define writeRow(i)                             \
-  writeResultShmem(i, 0);                       \
-  writeResultShmem(i, 1);                       \
-  writeResultShmem(i, 2);                       \
-  writeResultShmem(i, 3);                       \
-  writeResultShmem(i, 4);                       \
-  writeResultShmem(i, 5);                       \
-  writeResultShmem(i, 6);                       \
-  writeResultShmem(i, 7);                       \
-
-  if (threadIdx.x == 0) {
-    writeRow(0);
-    writeRow(1);
-    writeRow(2);
-    writeRow(3);
-    writeRow(4);
-    writeRow(5);
-    writeRow(6);
-    writeRow(7);
-  }
-#undef writeResultShmem
-#undef writeRow
-
-  const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
-
-  if (threadIdx.x < max_i_write) {
-    if (max_j_write == 8) {
-      // TODO: can i trade bank conflicts for coalesced writes?
-      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
-      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
-      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
-      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
-      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
-      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
-      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
-      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
-
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
-    } else {
-#pragma unroll 7
-      for (int j = 0; j < max_j_write; j++) {
-        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
-        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
-      }
-    }
-  }
-#undef res
-}
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(512)
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ Scalar lhs_shmem[72 * 64];
-  __shared__ Scalar rhs_shmem[72 * 64];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size && base_n + 63 < n_size) {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  } else {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_ALWAYS_INLINE void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][16],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
-  // prefetch registers
-  float4 lhs_pf0, rhs_pf0;
-
-  float4 results[4];
-  for (int i = 0; i < 4; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-#define prefetch_lhs(reg, row, col)                   \
-    if (!CHECK_LHS_BOUNDARY) {                        \
-      if (col < k_size) {                             \
-        reg =lhs.loadPacket(row, col);                \
-      }                                               \
-    } else {                                          \
-      if (col < k_size) {                             \
-        if (row + 3 < m_size) {                       \
-          reg =lhs.loadPacket(row, col);              \
-        } else if (row + 2 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-          reg.z =lhs(row + 2, col);                   \
-        } else if (row + 1 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-        } else if (row  < m_size) {                   \
-          reg.x =lhs(row + 0, col);                   \
-        }                                             \
-      }                                               \
-    }                                                 \
-
-
-  Index lhs_vert = base_m+threadIdx.x*4;
-
-  for (Index k = 0; k < k_size; k += 16) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf0 = internal::pset1<float4>(0);
-
-    Index lhs_horiz = threadIdx.y+k;
-    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
-    Index rhs_vert = k+(threadIdx.x%4)*4;
-    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
-
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-      }
-    } else {
-      if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    float x1, x2 ;
-    // the following can be a bitwise operation..... some day.
-    if((threadIdx.x%8) < 4) {
-      x1 = rhs_pf0.y;
-      x2 = rhs_pf0.w;
-    } else {
-      x1 = rhs_pf0.x;
-      x2 = rhs_pf0.z;
-    }
-    x1 = __shfl_xor(x1, 4);
-    x2 = __shfl_xor(x2, 4);
-    if((threadIdx.x%8) < 4) {
-      rhs_pf0.y = x1;
-      rhs_pf0.w = x2;
-    } else {
-      rhs_pf0.x = x1;
-      rhs_pf0.z = x2;
-    }
-
-    // We have 64 features.
-    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
-    // ...
-    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
-    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
-    // ...
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // ...
-    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
-    // ...
-
-    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
-    results[0].x += fl1.x * fr1.x;\
-    results[0].y += fl1.y * fr1.x;\
-    results[0].z += fl2.x * fr1.x;\
-    results[0].w += fl2.y * fr1.x;\
-\
-    results[1].x += fl1.x * fr1.y;\
-    results[1].y += fl1.y * fr1.y;\
-    results[1].z += fl2.x * fr1.y;\
-    results[1].w += fl2.y * fr1.y;\
-\
-    results[2].x += fl1.x * fr2.x;\
-    results[2].y += fl1.y * fr2.x;\
-    results[2].z += fl2.x * fr2.x;\
-    results[2].w += fl2.y * fr2.x;\
-\
-    results[3].x += fl1.x * fr2.y;\
-    results[3].y += fl1.y * fr2.y;\
-    results[3].z += fl2.x * fr2.y;\
-    results[3].w += fl2.y * fr2.y;\
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 16; koff ++) {
-      // 32 x threads.
-      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
-      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
-
-      int start_feature = threadIdx.y * 4;
-      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
-      add_vals(fl1, fl2, fr1, fr2)
-    }
-    __syncthreads();
-  }
-
-#undef prefetch_lhs
-#undef add_vals
-
-  Index horiz_base = threadIdx.y*4+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 4; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    // CHECK LHS
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK RHS
-    /*
-    int ncols_rem = fminf(n_size- horiz_base, 4);
-    for (int i = 0; i < ncols_rem; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }*/
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-       }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_ALWAYS_INLINE void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][32],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
-  // prefetch registers
-  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
-  float4 rhs_pf0, rhs_pf1;
-
-  float4 results[8];
-  for (int i=0; i < 8; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
-  for (Index k = 0; k < k_size; k += 32) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    lhs_pf1 = internal::pset1<float4>(0);
-    lhs_pf2 = internal::pset1<float4>(0);
-    lhs_pf3 = internal::pset1<float4>(0);
-
-    rhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf1 = internal::pset1<float4>(0);
-
-     if (!CHECK_LHS_BOUNDARY) {
-      if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
-      } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
-      } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
-      } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-      }
-    } else {
-      // just CHECK_LHS_BOUNDARY
-      if (lhs_vert + 3 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
-          lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 2 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 1 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-        }
-      }
-    }
-    __syncthreads();
-    Index rhs_vert = k+threadIdx.x*4;
-    Index rhs_horiz0 = threadIdx.y*2+base_n;
-    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-      }
-    } else {
-      if (rhs_horiz1 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
-          rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
-        } else if (rhs_vert + 2 < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-        } else if (k+threadIdx.x*4 + 1 < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        } else if (k+threadIdx.x*4  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        }
-      } else if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    __syncthreads();
-    // Loaded. Do computation
-    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
-    // ..
-    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
-    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
-    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
-    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
-    // ..
-    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
-    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
-    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
-    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
-    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
-    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
-    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
-    // LHS.
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // ...
-    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
-      results[0].x += a_feat1.x * f1.x;\
-      results[1].x += a_feat1.x * f1.y;\
-      results[2].x += a_feat1.x * f2.x;\
-      results[3].x += a_feat1.x * f2.y;\
-      results[4].x += a_feat1.x * f3.x;\
-      results[5].x += a_feat1.x * f3.y;\
-      results[6].x += a_feat1.x * f4.x;\
-      results[7].x += a_feat1.x * f4.y;\
-\
-      results[0].y += a_feat1.y * f1.x;\
-      results[1].y += a_feat1.y * f1.y;\
-      results[2].y += a_feat1.y * f2.x;\
-      results[3].y += a_feat1.y * f2.y;\
-      results[4].y += a_feat1.y * f3.x;\
-      results[5].y += a_feat1.y * f3.y;\
-      results[6].y += a_feat1.y * f4.x;\
-      results[7].y += a_feat1.y * f4.y;\
-\
-      results[0].z += a_feat2.x * f1.x;\
-      results[1].z += a_feat2.x * f1.y;\
-      results[2].z += a_feat2.x * f2.x;\
-      results[3].z += a_feat2.x * f2.y;\
-      results[4].z += a_feat2.x * f3.x;\
-      results[5].z += a_feat2.x * f3.y;\
-      results[6].z += a_feat2.x * f4.x;\
-      results[7].z += a_feat2.x * f4.y;\
-\
-      results[0].w += a_feat2.y * f1.x;\
-      results[1].w += a_feat2.y * f1.y;\
-      results[2].w += a_feat2.y * f2.x;\
-      results[3].w += a_feat2.y * f2.y;\
-      results[4].w += a_feat2.y * f3.x;\
-      results[5].w += a_feat2.y * f3.y;\
-      results[6].w += a_feat2.y * f4.x;\
-      results[7].w += a_feat2.y * f4.y;\
-
-    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
-    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
-    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
-    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
-    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
-    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
-    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 32; koff ++) {
-      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
-      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
-
-      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
-      int start_feature = (threadIdx.y / 4) * 8;
-
-      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
-      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
-      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
-      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
-      add_vals(a3, a4, br1, br2, br3, br4)
-    }
-    __syncthreads();
-  } // end loop over k
-
-
-  __syncthreads();
-  Index horiz_base = (threadIdx.y/4)*8+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    #pragma unroll
-    for (int i = 0; i < 8; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    if (lhs_vert + 3 < m_size) {
-      #pragma unroll
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      #pragma unroll
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      #pragma unroll
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      #pragma unroll
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK BOUNDARY_B
-    #pragma unroll
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  } else {
-    // CHECK both boundaries.
-    #pragma unroll
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256, 2)
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[64*32];
-  __shared__ float2 rhs_shmem[128*8];
-
-  typedef float2 LHS_MEM[64][32];
-  typedef float2 RHS_MEM[128][8];
-
-  typedef float2 LHS_MEM16x16[32][16];
-  typedef float2 RHS_MEM16x16[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 128 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  const bool check_rhs = (base_n + 63) >= n_size;
-  const bool check_lhs128 = (base_m + 127) >= m_size;
-
-  if (!check_rhs) {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[32][16];
-  __shared__ float2 rhs_shmem[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size) {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
-
-  typedef GpuDevice Device;
-
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
-  typedef TensorContractionEvaluatorBase<Self> Base;
-
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
-  enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-  };
-
-  // Most of the code is assuming that both input tensors are ColMajor. If the
-  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
-  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
-  // will pretend B is LHS and A is RHS.
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
-  static const int LDims =
-      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
-  static const int RDims =
-      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
-  static const int ContractDims = internal::array_size<Indices>::value;
-
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
-
-  typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, LDims - ContractDims> left_nocontract_t;
-  typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
-  static const int NumDims = LDims + RDims - 2 * ContractDims;
-
-  typedef DSizes<Index, NumDims> Dimensions;
-
-  // typedefs needed in evalTo
-  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
-  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
-  typedef typename LeftEvaluator::Dimensions LeftDimensions;
-  typedef typename RightEvaluator::Dimensions RightDimensions;
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device) {}
-
-  // We need to redefine this method to make nvcc happy
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
-    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
-    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-    if (data) {
-      evalTo(data);
-      return false;
-    } else {
-      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
-      evalTo(this->m_result);
-      return true;
-    }
-  }
-
-  void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-  }
-
-  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-      const Index m_blocks = (m + 63) / 64;
-      const Index n_blocks = (n + 63) / 64;
-      const dim3 num_blocks(m_blocks, n_blocks, 1);
-      const dim3 block_size(8, 8, 8);
-      LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-    }
-  };
-
-  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-      if (m < 768 || n < 768) {
-        const Index m_blocks = (m + 63) / 64;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(16, 16, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      } else {
-        const Index m_blocks = (m + 127) / 128;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(8, 32, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      }
-    }
-  };
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalTyped(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-    EIGEN_UNUSED_VARIABLE(k)
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, 4,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, 4,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-    setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
-    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
-  }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and __CUDACC__
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#include "TensorContractionGpu.h"

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
new file mode 100644
index 0000000..c818038
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h

@@ -0,0 +1,1413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+                       const Index m_size, const Index n_size, const Index k_size) {
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  // declare and initialize 64 registers for output 8x8 block
+
+  // prefetch registers
+  Scalar lhs_pf0;
+  Scalar lhs_pf1;
+  Scalar lhs_pf2;
+  Scalar lhs_pf3;
+  Scalar lhs_pf4;
+  Scalar lhs_pf5;
+  Scalar lhs_pf6;
+  Scalar lhs_pf7;
+
+  Scalar rhs_pf0;
+  Scalar rhs_pf1;
+  Scalar rhs_pf2;
+  Scalar rhs_pf3;
+  Scalar rhs_pf4;
+  Scalar rhs_pf5;
+  Scalar rhs_pf6;
+  Scalar rhs_pf7;
+
+  // shared memory is formatted
+  // (contract idx in block, nocontract idx in block, block idx)
+  // where block idx is column major. This transposition limits the number of
+  // bank conflicts when reading the LHS. The core idea is that since the contracting
+  // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+  // On the LHS, we pad each row inside of each block with an extra element. This makes
+  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the below code.
+
+  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+  const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k)                           \
+  {                                                             \
+    lhs_pf0 = conv(0);                                          \
+    lhs_pf1 = conv(0);                                          \
+    lhs_pf2 = conv(0);                                          \
+    lhs_pf3 = conv(0);                                          \
+    lhs_pf4 = conv(0);                                          \
+    lhs_pf5 = conv(0);                                          \
+    lhs_pf6 = conv(0);                                          \
+    lhs_pf7 = conv(0);                                          \
+                                                                \
+    rhs_pf0 = conv(0);                                          \
+    rhs_pf1 = conv(0);                                          \
+    rhs_pf2 = conv(0);                                          \
+    rhs_pf3 = conv(0);                                          \
+    rhs_pf4 = conv(0);                                          \
+    rhs_pf5 = conv(0);                                          \
+    rhs_pf6 = conv(0);                                          \
+    rhs_pf7 = conv(0);                                          \
+                                                                \
+    if (!needs_edge_check || lhs_vert < m_size) {               \
+      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
+      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
+      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
+      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
+      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
+      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
+      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
+      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
+      } else if (lhs_horiz_6 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+      } else if (lhs_horiz_5 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+      } else if (lhs_horiz_4 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+      } else if (lhs_horiz_3 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+      } else if (lhs_horiz_2 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+      } else if (lhs_horiz_1 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+      } else if (lhs_horiz_0 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+                                                                \
+    const Index rhs_vert = base_k + load_idx_vert;              \
+    if (!needs_edge_check || rhs_vert < k_size) {               \
+      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
+      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
+      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
+      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
+      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
+      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
+      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
+      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (rhs_horiz_7 < n_size) {                               \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
+      } else if (rhs_horiz_6 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+      } else if (rhs_horiz_5 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+      } else if (rhs_horiz_4 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+      } else if (rhs_horiz_3 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+      } else if (rhs_horiz_2 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+      } else if (rhs_horiz_1 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+      } else if (rhs_horiz_0 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+  }                                                             \
+
+#define writeRegToShmem(_)                      \
+  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
+  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
+                                                \
+  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
+  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
+                                                \
+  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
+  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
+                                                \
+  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
+  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
+                                                \
+  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
+  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
+                                                \
+  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
+  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
+                                                \
+  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
+  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
+                                                \
+  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
+  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
+
+  // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i)                        \
+  Scalar res(i, 0) = conv(0);                   \
+  Scalar res(i, 1) = conv(0);                   \
+  Scalar res(i, 2) = conv(0);                   \
+  Scalar res(i, 3) = conv(0);                   \
+  Scalar res(i, 4) = conv(0);                   \
+  Scalar res(i, 5) = conv(0);                   \
+  Scalar res(i, 6) = conv(0);                   \
+  Scalar res(i, 7) = conv(0);                   \
+
+  internal::scalar_cast_op<int, Scalar> conv;
+  initResultRow(0);
+  initResultRow(1);
+  initResultRow(2);
+  initResultRow(3);
+  initResultRow(4);
+  initResultRow(5);
+  initResultRow(6);
+  initResultRow(7);
+#undef initResultRow
+
+  for (Index base_k = 0; base_k < k_size; base_k += 64) {
+    // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here then at bottom of loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+    #undef prefetchIntoRegisters
+    #undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j)                          \
+    lcol(0) = lhs_element(0, j);               \
+    rrow(0) = rhs_element(i, 0);               \
+    lcol(1) = lhs_element(1, j);               \
+    rrow(1) = rhs_element(i, 1);               \
+    lcol(2) = lhs_element(2, j);               \
+    rrow(2) = rhs_element(i, 2);               \
+    lcol(3) = lhs_element(3, j);               \
+    rrow(3) = rhs_element(i, 3);               \
+    lcol(4) = lhs_element(4, j);               \
+    rrow(4) = rhs_element(i, 4);               \
+    lcol(5) = lhs_element(5, j);               \
+    rrow(5) = rhs_element(i, 5);               \
+    lcol(6) = lhs_element(6, j);               \
+    rrow(6) = rhs_element(i, 6);               \
+    lcol(7) = lhs_element(7, j);               \
+    rrow(7) = rhs_element(i, 7);               \
+
+#define computeCol(j)                           \
+    res(0, j) += lcol(0) * rrow(j);             \
+    res(1, j) += lcol(1) * rrow(j);             \
+    res(2, j) += lcol(2) * rrow(j);             \
+    res(3, j) += lcol(3) * rrow(j);             \
+    res(4, j) += lcol(4) * rrow(j);             \
+    res(5, j) += lcol(5) * rrow(j);             \
+    res(6, j) += lcol(6) * rrow(j);             \
+    res(7, j) += lcol(7) * rrow(j);             \
+
+#define computePass(i)                          \
+    loadData(i, i);                             \
+                                                \
+    computeCol(0);                              \
+    computeCol(1);                              \
+    computeCol(2);                              \
+    computeCol(3);                              \
+    computeCol(4);                              \
+    computeCol(5);                              \
+    computeCol(6);                              \
+    computeCol(7);                              \
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  } // end loop over k
+
+  // we've now iterated over all of the large (ie width 64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask)                      \
+  shuffleInc(i, 0, mask);                       \
+  shuffleInc(i, 1, mask);                       \
+  shuffleInc(i, 2, mask);                       \
+  shuffleInc(i, 3, mask);                       \
+  shuffleInc(i, 4, mask);                       \
+  shuffleInc(i, 5, mask);                       \
+  shuffleInc(i, 6, mask);                       \
+  shuffleInc(i, 7, mask);                       \
+
+#define reduceMatrix(mask)                      \
+  reduceRow(0, mask);                           \
+  reduceRow(1, mask);                           \
+  reduceRow(2, mask);                           \
+  reduceRow(3, mask);                           \
+  reduceRow(4, mask);                           \
+  reduceRow(5, mask);                           \
+  reduceRow(6, mask);                           \
+  reduceRow(7, mask);                           \
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There's 2 ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  //       and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j)                                          \
+  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i)                             \
+  writeResultShmem(i, 0);                       \
+  writeResultShmem(i, 1);                       \
+  writeResultShmem(i, 2);                       \
+  writeResultShmem(i, 3);                       \
+  writeResultShmem(i, 4);                       \
+  writeResultShmem(i, 5);                       \
+  writeResultShmem(i, 6);                       \
+  writeResultShmem(i, 7);                       \
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can i trade bank conflicts for coalesced writes?
+      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+    } else {
+#pragma unroll 7
+      for (int j = 0; j < max_j_write; j++) {
+        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+      }
+    }
+  }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(512)
+#endif
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size && base_n + 63 < n_size) {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  } else {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][16],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+
+  // prefetch registers
+  float4 lhs_pf0, rhs_pf0;
+
+  float4 results[4];
+  for (int i=0; i < 4; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+#define prefetch_lhs(reg, row, col)                            \
+    if (!CHECK_LHS_BOUNDARY) {                                 \
+      if (col < k_size) {                                      \
+        reg =lhs.template loadPacket<float4,Unaligned>(row, col);     \
+      }                                                        \
+    } else {                                                   \
+      if (col < k_size) {                                      \
+        if (row + 3 < m_size) {                                \
+          reg =lhs.template loadPacket<float4,Unaligned>(row, col);   \
+        } else if (row + 2 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+          reg.z =lhs(row + 2, col);                            \
+        } else if (row + 1 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+        } else if (row  < m_size) {                            \
+          reg.x =lhs(row + 0, col);                            \
+        }                                                      \
+      }                                                        \
+    }							       \
+
+  Index lhs_vert = base_m+threadIdx.x*4;
+
+  for (Index k = 0; k < k_size; k += 16) {
+
+    lhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf0 = internal::pset1<float4>(0);
+
+    Index lhs_horiz = threadIdx.y+k;
+    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+    Index rhs_vert = k+(threadIdx.x%4)*4;
+    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+      }
+    } else {
+      if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    float x1, x2 ;
+    // the following can be a bitwise operation..... some day.
+    if((threadIdx.x%8) < 4) {
+      x1 = rhs_pf0.y;
+      x2 = rhs_pf0.w;
+    } else {
+      x1 = rhs_pf0.x;
+      x2 = rhs_pf0.z;
+    }
+    #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+    x1 = __shfl_xor(x1, 4);
+    x2 = __shfl_xor(x2, 4);
+    #else
+    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+    #endif
+    if((threadIdx.x%8) < 4) {
+      rhs_pf0.y = x1;
+      rhs_pf0.w = x2;
+    } else {
+      rhs_pf0.x = x1;
+      rhs_pf0.z = x2;
+    }
+
+    // We have 64 features.
+    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+    // ...
+    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+    // ...
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // ...
+    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
+    // ...
+
+    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+    results[0].x += fl1.x * fr1.x;\
+    results[0].y += fl1.y * fr1.x;\
+    results[0].z += fl2.x * fr1.x;\
+    results[0].w += fl2.y * fr1.x;\
+\
+    results[1].x += fl1.x * fr1.y;\
+    results[1].y += fl1.y * fr1.y;\
+    results[1].z += fl2.x * fr1.y;\
+    results[1].w += fl2.y * fr1.y;\
+\
+    results[2].x += fl1.x * fr2.x;\
+    results[2].y += fl1.y * fr2.x;\
+    results[2].z += fl2.x * fr2.x;\
+    results[2].w += fl2.y * fr2.x;\
+\
+    results[3].x += fl1.x * fr2.y;\
+    results[3].y += fl1.y * fr2.y;\
+    results[3].z += fl2.x * fr2.y;\
+    results[3].w += fl2.y * fr2.y;\
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 16; koff ++) {
+      // 32 x threads.
+      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+      int start_feature = threadIdx.y * 4;
+      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+      add_vals(fl1, fl2, fr1, fr2)
+    }
+    __syncthreads();
+  }
+
+#undef prefetch_lhs
+#undef add_vals
+
+  Index horiz_base = threadIdx.y*4+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 4; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    // CHECK LHS
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS
+    /*
+    int ncols_rem = fminf(n_size- horiz_base, 4);
+    for (int i = 0; i < ncols_rem; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }*/
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+       }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][32],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+
+  // prefetch registers
+  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+  float4 rhs_pf0, rhs_pf1;
+
+  float4 results[8];
+  for (int i=0; i < 8; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+  for (Index k = 0; k < k_size; k += 32) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    lhs_pf1 = internal::pset1<float4>(0);
+    lhs_pf2 = internal::pset1<float4>(0);
+    lhs_pf3 = internal::pset1<float4>(0);
+
+    rhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf1 = internal::pset1<float4>(0);
+
+     if (!CHECK_LHS_BOUNDARY) {
+      if ((threadIdx.y/4+k+24) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      } else if ((threadIdx.y/4+k+16) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      } else if ((threadIdx.y/4+k+8) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      } else if ((threadIdx.y/4+k) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      }
+    } else {
+      // just CHECK_LHS_BOUNDARY
+      if (lhs_vert + 3 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 2 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 1 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+        }
+      }
+    }
+    __syncthreads();
+    Index rhs_vert = k+threadIdx.x*4;
+    Index rhs_horiz0 = threadIdx.y*2+base_n;
+    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+      }
+    } else {
+      if (rhs_horiz1 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+        } else if (rhs_vert + 2 < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+        } else if (k+threadIdx.x*4 + 1 < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        } else if (k+threadIdx.x*4  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        }
+      } else if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    __syncthreads();
+    // Loaded. Do computation
+    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+    // ..
+    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+    // ..
+    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+    // LHS.
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // ...
+    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+      results[0].x += a_feat1.x * f1.x;\
+      results[1].x += a_feat1.x * f1.y;\
+      results[2].x += a_feat1.x * f2.x;\
+      results[3].x += a_feat1.x * f2.y;\
+      results[4].x += a_feat1.x * f3.x;\
+      results[5].x += a_feat1.x * f3.y;\
+      results[6].x += a_feat1.x * f4.x;\
+      results[7].x += a_feat1.x * f4.y;\
+\
+      results[0].y += a_feat1.y * f1.x;\
+      results[1].y += a_feat1.y * f1.y;\
+      results[2].y += a_feat1.y * f2.x;\
+      results[3].y += a_feat1.y * f2.y;\
+      results[4].y += a_feat1.y * f3.x;\
+      results[5].y += a_feat1.y * f3.y;\
+      results[6].y += a_feat1.y * f4.x;\
+      results[7].y += a_feat1.y * f4.y;\
+\
+      results[0].z += a_feat2.x * f1.x;\
+      results[1].z += a_feat2.x * f1.y;\
+      results[2].z += a_feat2.x * f2.x;\
+      results[3].z += a_feat2.x * f2.y;\
+      results[4].z += a_feat2.x * f3.x;\
+      results[5].z += a_feat2.x * f3.y;\
+      results[6].z += a_feat2.x * f4.x;\
+      results[7].z += a_feat2.x * f4.y;\
+\
+      results[0].w += a_feat2.y * f1.x;\
+      results[1].w += a_feat2.y * f1.y;\
+      results[2].w += a_feat2.y * f2.x;\
+      results[3].w += a_feat2.y * f2.y;\
+      results[4].w += a_feat2.y * f3.x;\
+      results[5].w += a_feat2.y * f3.y;\
+      results[6].w += a_feat2.y * f4.x;\
+      results[7].w += a_feat2.y * f4.y;\
+
+    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 32; koff ++) {
+      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+      int start_feature = (threadIdx.y / 4) * 8;
+
+      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
+      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+      add_vals(a3, a4, br1, br2, br3, br4)
+    }
+    __syncthreads();
+  } // end loop over k
+
+  __syncthreads();
+  Index horiz_base = (threadIdx.y/4)*8+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 8; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK BOUNDARY_B
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[64*32];
+  __shared__ float2 rhs_shmem[128*8];
+
+  typedef float2 LHS_MEM[64][32];
+  typedef float2 RHS_MEM[128][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 128 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  bool check_rhs = (base_n + 63) >= n_size;
+  bool check_lhs128 = (base_m + 127) >= m_size;
+
+  if (!check_rhs) {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[32][16];
+  __shared__ float2 rhs_shmem[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size) {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
+
+  typedef GpuDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device)
+  {
+    EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+                          GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+  }
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(this->m_result);
+      return true;
+    }
+  }
+
+  void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+    const Index m_blocks = (m + 63) / 64;
+    const Index n_blocks = (n + 63) / 64;
+    const dim3 num_blocks(m_blocks, n_blocks, 1);
+    const dim3 block_size(8, 8, 8);
+    LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, 4,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, 4,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+#if defined(EIGEN_USE_HIP)
+    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
+#else
+    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
+#endif
+
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_GPUCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
new file mode 100644
index 0000000..9ab900b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h

@@ -0,0 +1,575 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum {
+  Rhs = 0,
+  Lhs = 1
+};
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
+/// is scalar * for CoeffLoader.
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
+struct CoeffLoader;
+
+template <typename Scalar, typename Index, int side, typename Tensor,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_ = MakePointer>
+class BaseTensorContractionMapper;
+
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
+struct CoeffLoader {
+  enum {
+    DirectOffsets = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
+    eigen_assert(false && "unsupported");
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+  data() const {
+    eigen_assert(false && "unsupported");
+    return NULL;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+  {
+    return m_tensor.template packet<LoadMode>(index);
+  }
+
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+  #endif
+
+ private:
+  const Tensor m_tensor;
+};
+
+template <typename Tensor, template <class> class MakePointer_>
+struct CoeffLoader<Tensor, true, MakePointer_> {
+  enum {
+    DirectOffsets = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_data += offset;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+  data() const {
+    return m_data;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+  {
+    return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+  #endif
+ private:
+  typedef typename Tensor::Scalar Scalar;
+
+  typename MakePointer_<const Scalar>::Type m_data;
+};
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+class SimpleTensorContractionMapper {
+  public:
+  EIGEN_DEVICE_FUNC
+  SimpleTensorContractionMapper(const Tensor& tensor,
+                                const nocontract_t& nocontract_strides,
+                                const nocontract_t& ij_strides,
+                                const contract_t& contract_strides,
+                                const contract_t& k_strides) :
+      m_tensor(tensor),
+      m_nocontract_strides(nocontract_strides),
+      m_ij_strides(ij_strides),
+      m_contract_strides(contract_strides),
+      m_k_strides(k_strides) { }
+
+  enum {
+    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_tensor.offsetBuffer(offset);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+    // column major assumption
+    return operator()(row, 0);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+    return m_tensor.coeff(computeIndex(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+    const bool left = (side == Lhs);
+    EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
+    Index nocontract_val = left ? row : col;
+    Index linidx = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+      const Index idx = nocontract_val / m_ij_strides[i];
+      linidx += idx * m_nocontract_strides[i];
+      nocontract_val -= idx * m_ij_strides[i];
+    }
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx += nocontract_val;
+      } else {
+        linidx += nocontract_val * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val = left ? col : row;
+    if(array_size<contract_t>::value > 0) {
+      EIGEN_UNROLL_LOOP
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx = contract_val / m_k_strides[i];
+        linidx += idx * m_contract_strides[i];
+        contract_val -= idx * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
+    }
+
+    return linidx;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
+    const bool left = (side == Lhs);
+    EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
+    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+    Index linidx[2] = {0, 0};
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      EIGEN_UNROLL_LOOP
+      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+        linidx[0] += idx0 * m_nocontract_strides[i];
+        linidx[1] += idx1 * m_nocontract_strides[i];
+        nocontract_val[0] -= idx0 * m_ij_strides[i];
+        nocontract_val[1] -= idx1 * m_ij_strides[i];
+      }
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx[0] += nocontract_val[0];
+        linidx[1] += nocontract_val[1];
+      } else {
+        linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+        linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+    if (array_size<contract_t>::value> 0) {
+      EIGEN_UNROLL_LOOP
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = contract_val[0] / m_k_strides[i];
+        const Index idx1 = contract_val[1] / m_k_strides[i];
+        linidx[0] += idx0 * m_contract_strides[i];
+        linidx[1] += idx1 * m_contract_strides[i];
+        contract_val[0] -= idx0 * m_k_strides[i];
+        contract_val[1] -= idx1 * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx[0] += contract_val[0];
+        linidx[1] += contract_val[1];
+      } else {
+        linidx[0] += contract_val[0] * m_contract_strides[0];
+        linidx[1] += contract_val[1] * m_contract_strides[0];
+      }
+    }
+    return IndexPair<Index>(linidx[0], linidx[1]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+    // Only claim alignment when we can compute the actual stride (ie when we're
+    // dealing with the lhs with inner_dim_contiguous. This is because the
+    // matrix-vector product relies on the stride when dealing with aligned inputs.
+    return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+    return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
+  }
+
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+  #endif
+
+  const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
+    return m_tensor;
+  }
+
+  const nocontract_t& nocontract_strides() const {
+    return m_nocontract_strides;
+  }
+  const nocontract_t& ij_strides() const { return m_ij_strides; }
+  const contract_t& contract_strides() const { return m_contract_strides; }
+  const contract_t& k_strides() const { return m_k_strides; }
+
+ protected:
+  CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+  const nocontract_t m_nocontract_strides;
+  const nocontract_t m_ij_strides;
+  const contract_t m_contract_strides;
+  const contract_t m_k_strides;
+};
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size, bool inner_dim_contiguous,
+         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
+{
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+
+  EIGEN_DEVICE_FUNC
+  BaseTensorContractionMapper(const Tensor& tensor,
+                              const nocontract_t& nocontract_strides,
+                              const nocontract_t& ij_strides,
+                              const contract_t& contract_strides,
+                              const contract_t& k_strides) :
+  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
+  load(Index i, Index j) const
+  {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    // current code assumes packet size must be a multiple of 2
+    EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+      const Index index = this->computeIndex(i, j);
+      eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
+      return this->m_tensor.template packet<AlignmentType>(index);
+    }
+
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    // We can always do optimized packet reads from left hand side right now, because
+    // the vertical matrix dimension on the left hand side is never contracting.
+    // On the right hand side we need to check if the contracting dimensions may have
+    // been shuffled first.
+    if (Tensor::PacketAccess &&
+        (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+        (lastIdx - first) == (packet_size - 1)) {
+
+      return this->m_tensor.template packet<AlignmentType>(first);
+    }
+
+    EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+    data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
+    for (Index k = 1; k < packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload<PacketT>(data);
+  }
+
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
+  load(Index i, Index j) const
+  {
+    const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
+    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
+
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload<PacketT>(data);
+  }
+
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load<PacketT,AlignmentType>(i,j);
+  }
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         bool inner_dim_contiguous,
+         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+  : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
+{
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+
+  EIGEN_DEVICE_FUNC
+  BaseTensorContractionMapper(const Tensor& tensor,
+                              const nocontract_t& nocontract_strides,
+                              const nocontract_t& ij_strides,
+                              const contract_t& contract_strides,
+                              const contract_t& k_strides) :
+  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  template <typename PacketT,int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<PacketT>(data);
+  }
+  template <typename PacketT,int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<PacketT>(data);
+  }
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size,
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+class TensorContractionSubMapper {
+ public:
+
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
+  typedef Self LinearMapper;
+
+  enum {
+    // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
+    // TODO: we should also enable direct offsets for the Rhs case.
+    UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
+  };
+
+  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+    // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+    // this offset every time we attempt to access a coefficient.
+    if (UseDirectOffsets) {
+      Index stride = m_base_mapper.stride();
+      m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, 0);
+    }
+    return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, j);
+    }
+    return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
+    }
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
+    }
+    return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
+    if (UseDirectOffsets) {
+      m_base_mapper.storePacket(i, 0, p);
+    }
+    m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return LinearMapper(m_base_mapper, i, j);
+    }
+    return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
+    if (UseDirectOffsets) {
+     return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
+    return false;
+  }
+
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_base_mapper.bind(cgh);
+  }
+  #endif
+
+  const ParentMapper& base_mapper() const { return m_base_mapper; }
+  Index vert_offset() const { return m_vert_offset; }
+  Index horiz_offset() const { return m_horiz_offset; }
+
+ private:
+  ParentMapper m_base_mapper;
+  const Index m_vert_offset;
+  const Index m_horiz_offset;
+};
+
+
+template<typename Scalar_, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size,
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,  template <class> class MakePointer_=MakePointer>
+class TensorContractionInputMapper
+  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+
+ public:
+  typedef Scalar_ Scalar;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
+  typedef SubMapper VectorMapper;
+
+  EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
+                               const nocontract_t& nocontract_strides,
+                               const nocontract_t& ij_strides,
+                               const contract_t& contract_strides,
+                               const contract_t& k_strides)
+      : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(*this, i, j);
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
+    return Base::m_tensor;
+  }
+};
+
+
+template <typename T> struct TensorContractionInputMapperTrait;
+
+template<typename Scalar_, typename Index_, int side_,
+         typename Tensor_,
+         typename nocontract_t_, typename contract_t_,
+         int packet_size_,
+         bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_,  template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, 
+                                                    nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_, 
+                                                    inner_dim_reordered_, Alignment_, MakePointer_> > {
+
+      typedef Tensor_ XprType;
+      static const bool  inner_dim_contiguous = inner_dim_contiguous_;
+      static const bool  inner_dim_reordered = inner_dim_reordered_;
+  };  
+
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
new file mode 100755
index 0000000..473c228
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h

@@ -0,0 +1,1650 @@
+// This file is part of Eigen, a lightweight C++ template library for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
+// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ *  TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex  determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting element to be process by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting element to be process by each thread
+ */
+template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
+struct TVPanelSize {
+  // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimNC: determines the tile size for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  // TileSizeDimC: determines the tile size for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
+struct TTPanelSize {
+  // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
+  // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
+#ifndef EIGEN_SYCL_REG_M
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+#else
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+#endif
+// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
+// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
+#ifndef EIGEN_SYCL_REG_N
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+#else
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+#endif
+  // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimM: determines the tile size for the m dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+  // TileSizeDimN: determines the tile size for the n dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+  // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize
+  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
+  // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize
+  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = true;
+  // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
+  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient  local memory)
+  static EIGEN_CONSTEXPR bool DoubleBuffer =
+#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
+      false;
+#else
+      true;
+#endif
+};
+
+/* !
+ * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
+ * specialize the contraction algorithm based on device support for dedicated local memory.
+ */
+enum class contraction_type { local, no_local };
+/* !
+ * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
+ */
+enum class data_source { global_mem, local_mem, private_mem };
+
+/*!
+ * \brief read, a template function used for loading the data from global
+ memory. This function is used to guarantee coalesced and vectorized load whenever possible
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+ vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex is the contracting dim index
+ *
+ * \param ld: is the leading dimension of the flattened tensor
+ */
+template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
+          typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
+  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
+  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
+  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
+}
+
+/*!
+ * \brief read, special overload of read function, when the read access is not vectorized
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+  vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+  contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+  when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex: is the contracting dim index
+ */
+template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
+  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
+  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
+  return tensorMapper(row, col);
+}
+
+/*!
+ * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
+ * coalesced and vectorized store whenever possible.
+ *
+ * \tparam StorageIndex: determines the Index type
+ *
+ * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ *
+ * \param CIndex is the contracting dim index
+ */
+
+template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
+    write(PacketType &packet_data, DataScalar ptr) {
+  EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < PacketSize; i++) {
+    *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
+    ptr += ld;
+  }
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function
+ * is used to guarantee coalesced and vectorized store whenever possible.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+    Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+    Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+  *ptr = packet_data;
+}
+
+/*!
+ * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ *
+ * \tparam is_internal: determines if the block is internal
+ */
+template <bool is_internal>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
+  return true;
+}
+
+/*!
+ * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
+ *
+ * \param cond: true when the data is in range. Otherwise false
+ */
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
+  return cond;
+}
+
+/*!
+ * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed
+ * by each workgroup.
+ *
+ * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
+ *
+ * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be
+ * packetType; Otherwise it will be scalar Type
+ *
+ * \param elements_per_access determines the size of each element based on OutType
+ *
+ * \param is_coalesced_layout  determines whether or not the Tensor data in a memory can be access coalesced and
+ * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ *
+ * \param c_stride  determines the stride of contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ */
+template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
+struct BlockProperties {
+  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+  typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
+  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+  typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
+  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup.  Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
+ * work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
+ * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the
+ * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used.
+ *
+ * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or
+ * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be
+ * resolve by compiler.
+ */
+template <typename StorageIndex>
+struct ThreadProperties {
+  const StorageIndex linearLocalThreadId;
+  const StorageIndex kGroupId;
+  const StorageIndex mGroupOffset;
+  const StorageIndex nGroupOffset;
+  const StorageIndex kGroupOffset;
+  const StorageIndex mLocalOffset;
+  const StorageIndex nLocalOffset;
+  const StorageIndex mGlobalOffset;
+  const StorageIndex nGlobalOffset;
+  StorageIndex kSize;
+  const bool is_internal;
+  // this is used to adjust the last block
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+      StorageIndex kSize_, const bool is_internal_)
+      : linearLocalThreadId(linearLocalThreadId_),
+        kGroupId(kGroupId_),
+        mGroupOffset(mGroupOffset_),
+        nGroupOffset(nGroupOffset_),
+        kGroupOffset(kGroupOffset_),
+        mLocalOffset(mLocalOffset_),
+        nLocalOffset(nLocalOffset_),
+        mGlobalOffset(mGlobalOffset_),
+        nGlobalOffset(nGlobalOffset_),
+        kSize(kSize_),
+        is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory
+ access is used to guarantee that always the memory access are coalesced.
+ *
+ * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute final output.
+
+ * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of
+ the algorithm to be used
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param groupSizeM: a logical number determining the number of work-group for m dimension
+ *
+ * \param groupSizeN: a logical number determining the number of work-group for n dimension
+ *
+ * \param numTiles: determines total number of tiles on the k dimension
+ *
+ * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
+          typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
+class TensorContractionKernel {
+ public:
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  static EIGEN_CONSTEXPR bool is_lhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
+  static EIGEN_CONSTEXPR bool is_rhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
+
+  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
+                          PacketReturnType>
+      LHSBlockProperties;
+
+  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
+                          PacketReturnType>
+      RHSBlockProperties;
+
+  static EIGEN_CONSTEXPR StorageIndex NStride =
+      contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
+
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
+  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
+  typedef
+      typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
+          tile_ptr;
+  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
+                                                 ? Properties::TileSizeDimM + Properties::BC
+                                                 : Properties::WorkLoadPerThreadM;
+  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
+                                                 ? Properties::TileSizeDimN + Properties::BC
+                                                 : Properties::WorkLoadPerThreadN;
+  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+
+  /**
+   * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
+   * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
+   * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting
+   * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
+   * different type of memory needed when local/no_local memory computation is called.
+   *
+   * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation
+   of the algorithm to be used
+   * \tparam the private memory size
+   * \param ptr the tile memory pointer type
+   */
+  template <contraction_type, StorageIndex>
+  struct MemHolder {
+    tile_ptr ptr;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
+  };
+  /**
+   * \brief specialization of memHolder class when no local memory kernel is used.
+   */
+  template <StorageIndex MemSize>
+  struct MemHolder<contraction_type::no_local, MemSize> {
+    OutScalar ptr[MemSize] = {OutScalar{0}};
+  };
+  /**
+   * \brief TiledMemory: contains required memory pointer for loading  each tile of the TensorContraction panel from
+   * global memory to local/private memory when local/no_local algorithm used.
+   *
+   * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param lhs_scratch_compute : determines the  location to load for computation for lhs_local memory. This is the
+   * same as lhs_scratch_extract for private memory.
+   *
+   * \param rhs_scratch_compute : determines the  location to load for computation for rhs_local memory. This is the
+   * same as rhs_scratch_extract for private memory.
+   */
+  struct TiledMemory {
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
+    tile_ptr lhs_scratch_ptr_compute;
+    tile_ptr rhs_scratch_ptr_compute;
+    const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
+    const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
+                typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
+        : lhs_scratch_extract{},
+          rhs_scratch_extract{},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
+          lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
+          rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
+
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
+                typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
+        : lhs_scratch_extract{block_start_ptr},
+          rhs_scratch_extract{lhs_scratch_extract.ptr +
+                              ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
+          lhs_extract_index(
+              local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
+          rhs_extract_index(
+              local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
+  };
+
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex groupSizeM;
+  const StorageIndex groupSizeN;
+  const StorageIndex numTiles;
+  const TripleDim triple_dim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex groupSizeN_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : scratch(scratch_),
+        lhs(lhs_),
+        rhs(rhs_),
+        out_res(out_res_),
+        groupSizeM(groupSizeM_),
+        groupSizeN(groupSizeN_),
+        numTiles(numTiles_),
+        triple_dim(triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+    const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+    const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+    const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+    const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+    const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+    const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+    const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+    const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+    const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+    const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+    const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+    const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+    StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+    const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+                             triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+                             triple_dim.K - kGroupOffset >= kSizePerWG;
+    // this is used to adjust the last block
+    StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+    // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to
+    // tile
+    kGroupOffset += kSize;
+
+    auto thread_properties =
+        ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+                                       mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+    (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
+                                    : compute_panel<false>(itemID, thread_properties, out_ptr);
+  }
+  // The compute block computes the contraction operation private block for each thread and store the resutl in the
+  // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
+  // it only compute the block on each thread's private memory space
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+                                                                    PacketReturnType *privateRes) {
+    StorageIndex idx = 0;
+    EIGEN_CONSTEXPR StorageIndex lhs_stride =
+        contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
+      auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
+      StorageIndex lhs_index = 0;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+        PacketReturnType lhsPack{};
+        Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
+                                                                                             lhs_block_ptr + lhs_index);
+        privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
+
+        lhs_index += lhs_stride;
+        idx++;
+      }
+    }
+  }
+  // The store function write the computed contraction operation in the private memory of each thread to the global
+  // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base
+  // class.
+  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
+                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+    auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
+      return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
+    };
+    // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
+    // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
+    // WorkLoadPerThreadN slice of N
+    EIGEN_CONSTEXPR StorageIndex GlobalNStride =
+        contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
+      // output leading dimension
+      StorageIndex outputLD = 0;
+      // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local
+      // memory and extracting from local to global is the same as no transposed version. However, when local memory is
+      // not used and RHS is transposed we packetize the load for RHS.
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
+        StorageIndex globalRow = mGlobalOffset;
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+          PacketReturnType privetOut = privateRes[wLPTM];
+          if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
+            // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second
+            // StorageIndex Therefore it is always coalesced layout
+            write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
+          } else {
+            EIGEN_UNROLL_LOOP
+            for (StorageIndex mId = 0; mId < PacketSize; mId++) {
+              StorageIndex mOffset = globalRow + mId;
+              if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
+                out_ptr[mOffset + outputLD] =
+                    Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
+              }
+            }
+          }
+          globalRow += (PacketSize * Properties::LocalThreadSizeM);
+        }
+        outputLD += triple_dim.M;
+        privateRes += Properties::WorkLoadPerThreadM / PacketSize;
+      }
+      out_ptr += (GlobalNStride * outputLD);
+
+      nGlobalOffset += (PrivateNStride * GlobalNStride);
+    }
+  }
+  // when no local memory is used the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
+      extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+    EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+        InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
+    EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+        InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+    StorageIndex cIndex = cOffset;
+
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
+      StorageIndex ncIndex = ncOffset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
+        if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
+          auto val =
+              read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                   InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                data_source::private_mem>(val, private_ptr);
+        } else {
+          EIGEN_UNROLL_LOOP
+          for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+            const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+            const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+            OutScalar val =
+                (ncInd < NC && cInd < triple_dim.K)
+                    ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                          inpt, ncInd, cInd, ld)
+                    : OutScalar(0);
+            write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                  data_source::private_mem>(
+                val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                         ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
+          }
+        }
+
+        // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
+        // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
+        ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
+                      ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+                      : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+        private_ptr += InputBlockProperties::nc_stride;
+      }
+      // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
+      private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+      cIndex += InputBlockProperties::c_stride;
+    }
+  }
+  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
+      const StorageIndex &linearLocalThreadId) {
+    const StorageIndex localThreadNC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    const StorageIndex localThreadC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
+      sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+    db_offset = !db_offset;
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
+      sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+
+  template <contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
+      sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
+    return;
+  }
+
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
+      sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+                      itemID
+#endif
+                  ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+    itemID.barrier(cl::sycl::access::fence_spacce::local_space);
+#else
+    return;
+#endif
+  }
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
+      sync_thread(const cl::sycl::nd_item<1> &itemID) {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+  template <bool need_sync>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
+      const cl::sycl::nd_item<1> &) {
+    return;
+  }
+
+  template <bool is_internal_block>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+                                                                    ThreadProperties<StorageIndex> &thread_properties,
+                                                                    TiledMemory &tiled_input_block,
+                                                                    PacketReturnType *privateRes, bool &db_offset) {
+    // Tiling the Rhs block from global to local memory
+    extract_block<RHSBlockProperties, is_internal_block>(
+        rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+        tiled_input_block.rhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    sync_thread<contraction_tp == contraction_type::no_local>(itemID);
+
+    // Tiling the Lhs block from global to local memory
+    extract_block<LHSBlockProperties, is_internal_block>(
+        lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+        tiled_input_block.lhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    // itemID.barrier(cl::sycl::access::fence_space::local_space);
+    sync_thread<contraction_tp == contraction_type::local>(itemID);
+    // switch to compute mede
+    StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
+    StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
+    // Loop over the values of a single tile
+    for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
+      compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
+                             tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
+      lhs_offset += LSDL;
+      rhs_offset += LSDR;
+    }
+    // computing the K index for the next tile
+    thread_properties.kSize -= Properties::TileSizeDimK;
+    sync_mem(itemID, db_offset);
+  }
+
+  // when local memory is available the following compute_panel will be enabled
+  template <bool is_internal_block, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
+                                                           ThreadProperties<StorageIndex> &thread_properties,
+                                                           OutPtr out_ptr) {
+    auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
+    // Allocate register space
+    PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
+        PacketReturnType{0}};
+    bool db_offset = 0;
+
+    while (thread_properties.kSize >= Properties::TileSizeDimK) {
+      compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+    if (thread_properties.kSize > 0) {
+      compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+
+    // Storing the final results in the output
+    store<is_internal_block,
+          contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
+        out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
+        thread_properties.nGlobalOffset);
+  }
+  // When local memory is available the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
+      extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+    EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+        InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
+    EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+        InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
+    EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+    static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
+                   (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
+                  " LocalOffset must be divisable by stride");
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+    StorageIndex localThreadNC = local_index.first;
+    StorageIndex localThreadC = local_index.second;
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
+      const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
+      const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
+      const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+      if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
+        auto val =
+            read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
+        write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+            val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                     (InputBlockProperties::c_stride * localThreadC * LSD));
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+          const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+          OutScalar val =
+              (nCInd < NC && cInd < triple_dim.K)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, nCInd, cInd, ld)
+                  : OutScalar(0);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+              val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                       (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                       ((InputBlockProperties::c_stride * localThreadC +
+                         (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
+                        LSD));
+        }
+      }
+      localThreadNC += (InputBlockProperties::is_coalesced_layout)
+                           ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
+                           : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+      localThreadC += (InputBlockProperties::is_coalesced_layout)
+                          ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
+                          : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    }
+  }
+};
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+
+/*!
+ * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special
+ * case of Tensor Tensor contraction.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in K dimension in a Tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
+ *
+ * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output.
+ * Otherwise, the result of contraction will be written iin a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+          typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+      KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
+  // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true.
+  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
+      VecBlockProperties;
+
+  Scratch scratch;
+  const VectorMapper vec;
+  const TensorMapper mat;
+  OutAccessor out_res;
+  const StorageIndex nonContractGroupSize;
+  const StorageIndex nonContractDim;
+  const StorageIndex contractDim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
+                                                            const TensorMapper mat_, OutAccessor out_res_,
+                                                            const StorageIndex nonContractGroupSize_,
+                                                            const StorageIndex nonContractDim_,
+                                                            const StorageIndex contractDim_)
+      : scratch(scratch_),
+        vec(vec_),
+        mat(mat_),
+        out_res(out_res_),
+        nonContractGroupSize(nonContractGroupSize_),
+        nonContractDim(nonContractDim_),
+        contractDim(contractDim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto scratch_ptr = scratch.get_pointer();
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
+                                            : linearLocalThreadId % Properties::LocalThreadSizeNC;
+    StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
+                                         : linearLocalThreadId / Properties::LocalThreadSizeNC;
+    const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
+    const StorageIndex nonContractGroupId =
+        is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
+    const StorageIndex contractGroupId =
+        is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
+    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
+
+    const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
+    const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
+    auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
+    const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
+    auto local_output = scratch_ptr + OutScratchOffset;
+    const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
+                             contractDim - contractGroupOffset >= Properties::TileSizeDimC;
+    is_internal
+        ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                              scratch_ptr, contractGroupOffset,
+#endif
+                              nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                              nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
+        : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                               scratch_ptr, contractGroupOffset,
+#endif
+                               nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                               nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
+  }
+  template <bool is_internal_block, typename OutPtr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
+      const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
+      OutPtr out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
+#endif
+      const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
+      StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
+      StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
+    OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
+    // Reading the vector
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
+    extract_block<VecBlockProperties, is_internal_block, KFactor,
+                  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
+                                                                                vectorOffset, contractDim);
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    auto in_scratch_ptr = scratch_ptr + contractId;
+#endif
+
+    StorageIndex privateOffsetC = 0;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
+      StorageIndex privateOffsetNC = 0;
+      bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      auto vecScalar = *in_scratch_ptr;
+#else
+      auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
+                           ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
+                                 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
+                           : OutScalar(0);
+#endif
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        auto matScalar = (check_boundary<is_internal_block>(
+                             contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
+                             ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
+                                              : globalNonContractDimOffset + privateOffsetNC,
+                                   is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
+                                              : globalContractDimOffset + privateOffsetC)
+                             : OutScalar(0);
+
+        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        privateOffsetNC += Properties::LocalThreadSizeNC;
+      }
+      privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+    }
+
+    auto out_scratch_ptr = local_output + outScratchIndex;
+    // Each block of 16*16 element in shared memory should reduce to 16*1
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      *out_scratch_ptr = outScalar[j];
+
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+    if (is_lhs_vec) {
+      nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+      contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+      outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    }
+
+    out_scratch_ptr = local_output + outScratchIndex;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (contractId < offset) {
+          StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
+          *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
+        }
+      }
+      // moving to next 16 by 16 block
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+
+    if (contractId == 0) {
+      out_scratch_ptr = local_output + nonContractId;
+      StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+      out_ptr += global_final_offset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+          auto res = *out_scratch_ptr;
+
+          *out_ptr = res;
+          out_ptr += Properties::LocalThreadSizeNC;
+        }
+        // moving to next 16 by 16 block to ge the next 16 reduced elements
+        out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+        if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+      }
+    }
+  }
+
+  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+            typename Local>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+                                                                  const StorageIndex &linearLocalThreadId,
+                                                                  const StorageIndex &cOffset, const StorageIndex &C) {
+    local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+    StorageIndex cIndex = cOffset;
+    for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+      if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+        auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                        InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+                                                                                              cIndex, StorageIndex(1));
+        write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          OutScalar val =
+              (cIndex + i < C)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+                  : OutScalar(0);
+          write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+        }
+      }
+      local_ptr += InputBlockProperties::c_stride * GroupSize;
+      cIndex += InputBlockProperties::c_stride * GroupSize;
+    }
+  }
+};
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determins the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex rng;
+
+  EIGEN_DEVICE_FUNC
+  GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
+                           const StorageIndex rng_)
+      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_res.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    StorageIndex globalid = itemID.get_global_id(0);
+    StorageIndex localid = itemID.get_local_id(0);
+    OutScalar accumulator = OutScalar(0);
+    for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+    }
+    auto out_scratch_ptr = scratch_ptr + localid;
+    *out_scratch_ptr = accumulator;
+    for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+      }
+    }
+    if (localid == 0) {
+      out_ptr[itemID.get_group(0)] = accumulator;
+    }
+  }
+};
+#endif
+
+}  // namespace internal
+}  // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+                       Eigen::SyclDevice>
+    : public TensorContractionEvaluatorBase<TensorEvaluator<
+          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
+  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+                "SYCL tensor contraction does not support output kernels.");
+
+  typedef Eigen::SyclDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index StorageIndex;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Base::Storage Storage;
+  typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
+  struct TripleDim {
+    const StorageIndex M;
+    const StorageIndex N;
+    const StorageIndex K;
+    TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
+  };
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+  };
+
+  static EIGEN_CONSTEXPR int LDims = Base::LDims;
+  static EIGEN_CONSTEXPR int RDims = Base::RDims;
+  static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
+
+  typedef array<StorageIndex, LDims> left_dim_mapper_t;
+  typedef array<StorageIndex, RDims> right_dim_mapper_t;
+
+  typedef array<StorageIndex, ContractDims> contract_t;
+  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
+  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+  typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
+  typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
+  typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
+  struct input_mapper_propertis {
+    static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+    static EIGEN_CONSTEXPR bool is_rhs_matrix =
+        (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
+  };
+
+  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (!data) {
+      this->m_result = this->m_device.get(
+          static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
+      data = this->m_result;
+    }
+    evalToSycl(data);
+    return (this->m_result != NULL);
+  }
+  const Eigen::SyclDevice &device() const { return this->m_device; }
+  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    } else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
+    const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
+        PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
+                                                   right_nocontract_t, contract_t,
+                                                   PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
+        RhsMapper;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+    if (triple_dim.M == 1 && triple_dim.N == 1) {
+      launchSC(buffer, lhs, rhs, triple_dim.K);
+    } else
+#endif
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+        if (triple_dim.M != 1 && triple_dim.N == 1) {
+      LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
+    } else if (triple_dim.M == 1 && triple_dim.N != 1) {
+      LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
+    } else  // This is equivalent of if (m!=1 && n!=1)
+#endif
+    {
+      typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
+          inpt_mapper_properties;
+#ifndef EIGEN_SYCL_DISABLE_SKINNY
+      bool skinny = false;
+      auto platform_name = this->device().getPlatformName();
+      // This is based on empirical calculation for AMD r9-nano and Fiji
+      if (platform_name.find("AMD") == 0) {
+        skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
+                 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
+                  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
+      } else {
+        skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
+                  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
+                  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
+      }
+      if (skinny)
+        adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+      else
+#endif  // EIGEN_SYCL_DISABLE_SKINNY
+        adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+    }
+  }
+
+  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
+  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    const TripleDim &triple_dim) const {
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    if (device().has_local_memory()) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
+    if (!(device().has_local_memory())) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+  }
+
+  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
+            typename Properties, typename LhsMapper, typename RhsMapper>
+  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                const TripleDim &triple_dim) const {
+    const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
+    const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
+    const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
+    const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
+
+    const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
+    StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
+    StorageIndex groupSizeK =
+        skinny
+            ? std::max(std::min(totalTilesK,
+                                (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
+                                    (groupSizeM * groupSizeN)),
+                       StorageIndex(1))
+            : StorageIndex(1);
+
+    const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
+
+    const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
+
+    const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+    const StorageIndex globalRange = totalGroupSize * localRange;
+
+    const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
+                                         ? ((Properties::DoubleBuffer + 1) *
+                                            (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
+                                               ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
+                                                (Properties::TileSizeDimN + Properties::BC))
+                                         : StorageIndex(1);
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (groupSizeK == 1) {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, true, ct>
+          ContractKernelName;
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
+    } else {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, false, ct>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
+          triple_dim);
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      auto op = Op();
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
+                                    Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
+                                cl::sycl::range<1>(localRange)),
+          StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
+
+      device().deallocate_temp(temp_pointer);
+    }
+  }
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
+  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
+                                    StorageIndex NC, StorageIndex C) const {
+    const StorageIndex nonContractDim = NC;
+    EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
+    EIGEN_CONSTEXPR StorageIndex CFactor = 1;
+    EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+    typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
+        Properties;
+    const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
+    const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
+    const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
+    const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
+    const StorageIndex globalRange =
+        (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
+    const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
+    const StorageIndex scratchSize =
+        (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (cNumGroups > 1) {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, false>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
+                                cl::sycl::range<1>(localRange)),
+          StorageIndex(1), Op(), nonContractDim, cNumGroups);
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, true>
+          ContractKernelName;
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+    }
+  }
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+  template <typename LhsMapper, typename RhsMapper>
+  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    StorageIndex K) const {
+    EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                          (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                        "The Local thread size must be a power of 2 for the reduction "
+                        "operation");
+    EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
+    // reduces at least 512 elementss individually, we get better performance.
+    const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+    const StorageIndex global_range = num_work_group * local_range;
+
+    typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+        CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+        ContractKernelName;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+                                                                                    thread_range, local_range, K);
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, StorageIndex, local_range>
+          GenericRKernel;
+      device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+                                                                                    local_range, K);
+    }
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    this->m_leftImpl.cleanup();
+    this->m_rightImpl.cleanup();
+
+    if (this->m_result) {
+      this->m_device.deallocate_temp(this->m_result);
+      this->m_result = NULL;
+    }
+  }
+  // The placeholder accessors must bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    this->m_leftImpl.bind(cgh);
+    this->m_rightImpl.bind(cgh);
+    this->m_result.bind(cgh);
+  }
+};
+}  // namespace Eigen
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index fb2f618..21be6ea 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h

@@ -13,37 +13,22 @@
 // evaluator for thread pool device
 #ifdef EIGEN_USE_THREADS
 
-// This contains two implementations of threaded contraction operations:
-// 1. The first (newer) is generally faster but it relies on the new
-// non-blocking thread pool for performance.
-// 2. The second (older) is left here until we wholesale switch to the
-// non-blocking thread pool.
-
 namespace Eigen {
 
-template <typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<
-    const TensorContractionOp<Indices, LeftArgType, RightArgType>,
-    ThreadPoolDevice>
-    : public TensorContractionEvaluatorBase<TensorEvaluator<
-          const TensorContractionOp<Indices, LeftArgType, RightArgType>,
-          ThreadPoolDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
+
   typedef ThreadPoolDevice Device;
 
-  typedef TensorEvaluator<
-      const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device>
-      Self;
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
-  typedef
-      typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, ThreadPoolDevice>::type
-      PacketReturnType;
-  static const int PacketSize =
-      internal::unpacket_traits<typename Self::PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
   enum {
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@@ -54,16 +39,14 @@
   // If we want to compute A * B = C, where A is LHS and B is RHS, the code
   // will pretend B is LHS and A is RHS.
   typedef typename internal::conditional<
-      static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType,
-      RightArgType>::type EvalLeftArgType;
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
   typedef typename internal::conditional<
-      static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType,
-      LeftArgType>::type EvalRightArgType;
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
 
-  static const int LDims = internal::array_size<
-      typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
-  static const int RDims = internal::array_size<
-      typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
   static const int ContractDims = internal::array_size<Indices>::value;
 
   typedef array<Index, LDims> left_dim_mapper_t;
@@ -78,375 +61,295 @@
   typedef DSizes<Index, NumDims> Dimensions;
 
   // typedefs needed in evalTo
-  typedef
-      typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
-          LhsScalar;
-  typedef
-      typename internal::remove_const<typename EvalRightArgType::Scalar>::type
-          RhsScalar;
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
   typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
 
   typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
   typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
 
-  TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+  TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device) {}
 
   template <int Alignment>
   void evalProduct(Scalar* buffer) const {
+    evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
+  }
+
+  template <typename EvalToCallback, int Alignment>
+  void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
+    evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
+  }
+
+  template <typename DoneCallback, int Alignment>
+  void evalProductImpl(Scalar* buffer, DoneCallback done) const {
+    // This function computes a lot of heuristics in multiple steps, and it
+    // also has multiple exit points. To keep it sane, readable and all in one
+    // place, sync/async execution decision is made at runtime at the very end.
+    //
+    // (1) In sync mode we allocate Context on the stack, submit computations
+    //     to the device thread pool, and block on a barrier until it is
+    //     completed.
+    //
+    // (2) In async mode we allocate Context on the heap, and after all tasks
+    //     are finished, we call provided the done callback, and delete a
+    //     context from the heap.
+    //
+    // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state
+    // and temporary buffers, requried for executing the tensor contraction.
+    // They are responsible for cleaning it up after contraction is done.
+    static const bool IsEvalInSyncMode =
+        std::is_same<DoneCallback, NoCallback>::value;
+
     const Index m = this->m_i_size;
     const Index n = this->m_j_size;
     const Index k = this->m_k_size;
     if (m == 0 || n == 0 || k == 0) return;
 
-    if (this->m_can_use_xsmm) {
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-      bool transposeA = !this->m_lhs_inner_dim_contiguous;
-      bool transposeB = !this->m_rhs_inner_dim_contiguous;
-      internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
-          blocking(k, m, n, this->m_device.numThreads(), transposeA,
-                   transposeB);
+    // Compute a set of algorithm parameters:
+    // - kernel block sizes (bm, bn, bk)
+    // - task grain sizes (number of kernels executed per task: gm, gn)
+    // - number of threads
+    // - sharding by row/column
+    // - parallel packing or first lhs then rhs
+    // and some derived parameters:
+    // - number of tasks (nm, nn, nk)
+    // - number of kernels (nm0, nn0)
+    // Unfortunately, all these parameters are tightly interdependent.
+    // So in some cases we first compute approximate values, then compute other
+    // values based on these approximations and then refine the approximations.
 
-      if (blocking.num_threads() == 1) {
-        TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
-                                    Unaligned, (buffer));
-        return;
-      }
+    // There are lots of heuristics here. There is some reasoning behind them,
+    // but ultimately they are just tuned on contraction benchmarks for
+    // different input configurations, thread counts and instruction sets.
+    // So feel free to question any of them.
 
-      ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
-      return;
-#endif
+    // Compute whether we want to shard by row or by column.
+    // This is a first approximation, it will be refined later. Since we don't
+    // know number of threads yet we use 2, because what's we are most
+    // interested in at this point is whether it makes sense to use
+    // parallelization at all or not.
+    bool shard_by_col = shardByCol(m, n, 2);
+
+    // First approximation of kernel blocking sizes.
+    // Again, we don't know number of threads yet, so we use 2.
+    Index bm, bn, bk;
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+                                          internal::ShardByCol>
+          blocking(k, m, n, 2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
     } else {
-      // Compute a set of algorithm parameters:
-      // - kernel block sizes (bm, bn, bk)
-      // - task grain sizes (number of kernels executed per task: gm, gn)
-      // - number of threads
-      // - sharding by row/column
-      // - parallel packing or first lhs then rhs
-      // and some derived parameters:
-      // - number of tasks (nm, nn, nk)
-      // - number of kernels (nm0, nn0)
-      // Unfortunately, all these parameters are tightly interdependent.
-      // So in some cases we first compute approximate values, then compute
-      // other values based on these approximations and then refine the
-      // approximations.
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+                                          internal::ShardByRow>
+          blocking(k, m, n, 2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
 
-      // There are lots of heuristics here. There is some reasoning behind them,
-      // but ultimately they are just tuned on contraction benchmarks for
-      // different input configurations, thread counts and instruction sets.
-      // So feel free to question any of them.
-
-      // Compute whether we want to shard by row or by column.
-      // This is a first approximation, it will be refined later. Since we don't
-      // know number of threads yet we use 2, because what's we are most
-      // interested in at this point is whether it makes sense to use
-      // parallelization at all or not.
-      bool shard_by_col = shardByCol(m, n, 2);
-
-      // First approximation of kernel blocking sizes.
-      // Again, we don't know number of threads yet, so we use 2.
-      Index bm, bn, bk;
-      if (shard_by_col) {
-        internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
-                                            internal::ShardByCol>
-            blocking(k, m, n, 2);
-        bm = blocking.mc();
-        bn = blocking.nc();
-        bk = blocking.kc();
+    // Compute optimal number of threads.
+    // Note: we use bk instead of k here because we are interested in amount of
+    // _parallelizable_ computations, and computations are not parallelizable
+    // across k dimension.
+    const TensorOpCost cost =
+        contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+        static_cast<double>(n) * m, cost, this->m_device.numThreads());
+    int num_threads_by_k = numThreadsInnerDim(m, n, k);
+    if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
+      // We are in the scenario where it is more effective to shard by the
+      // inner dimension.
+      if (IsEvalInSyncMode) {
+        EvalShardedByInnerDimContext<DoneCallback> ctx(
+            this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx.template run<Alignment>();
       } else {
-        internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
-                                            internal::ShardByRow>
-            blocking(k, m, n, 2);
-        bm = blocking.mc();
-        bn = blocking.nc();
-        bk = blocking.kc();
+        auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>(
+            this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx->template runAsync<Alignment>();
       }
 
-      // Compute optimal number of threads.
-      // Note: we use bk instead of k here because we are interested in amount
-      // of _parallelizable_ computations, and computations are not
-      // parallelizable across k dimension.
-      const TensorOpCost cost =
-          contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-      int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-          static_cast<double>(n) * m, cost, this->m_device.numThreads());
-        int num_threads_by_k = numThreadsInnerDim(m, n, k);
-        if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
-          // We are in the scenario where it is more effective to shard by the
-          // inner dimension.
-          this->template evalShardedByInnerDim<Alignment>(num_threads_by_k,
-                                                          buffer);
-          return;
-        }
+      return;
+    }
 
-      // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
-      // model is not tuned. Remove this when the cost model is tuned. Part of
-      // the reason for this is that Eigen has an optimized kernel for the n=1
-      // case, which is roughly 4x faster than the general kernel on the same
-      // matrix.
-      if (n == 1) num_threads = 1;
+    // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+    // model is not tuned. Remove this when the cost model is tuned.
+    if (n == 1) num_threads = 1;
 
-      if (num_threads == 1) {
-        TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
-                                    Unaligned, (buffer));
-        return;
-      }
+    if (num_threads == 1) {
+      TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
+                                  Unaligned, (buffer));
+      if (!IsEvalInSyncMode) done();
+      return;
+    }
 
-      // Now that we know number of threads, recalculate sharding and blocking.
-      shard_by_col = shardByCol(m, n, num_threads);
-      if (shard_by_col) {
-        internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
-                                            internal::ShardByCol>
-            blocking(k, m, n, num_threads);
-        bm = blocking.mc();
-        bn = blocking.nc();
-        bk = blocking.kc();
-      } else {
-        internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
-                                            internal::ShardByRow>
-            blocking(k, m, n, num_threads);
-        bm = blocking.mc();
-        bn = blocking.nc();
-        bk = blocking.kc();
-      }
+    // Now that we know number of threads, recalculate sharding and blocking.
+    shard_by_col = shardByCol(m, n, num_threads);
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+                                          internal::ShardByCol>
+          blocking(k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+                                          internal::ShardByRow>
+          blocking(k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
 
-      // Number of kernels for each dimension.
-      Index nm0 = divup(m, bm);
-      Index nn0 = divup(n, bn);
-      Index nk = divup(k, bk);
+    // Number of kernels for each dimension.
+    Index nm0 = divup(m, bm);
+    Index nn0 = divup(n, bn);
+    Index nk = divup(k, bk);
 
-      // Calculate task grain size (number of kernels executed per task).
-      // This task size coarsening serves two purposes:
-      // 1. It reduces per-task overheads including synchronization overheads.
-      // 2. It allows to use caches better (reuse the same packed rhs in several
-      // consecutive kernels).
-      Index gm = 1;
-      Index gn = 1;
-      // If we are sharding by column, then we prefer to reduce rows first.
-      if (shard_by_col) {
-        gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
-        gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
-      } else {
-        gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
-        gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
-      }
-      // Number of tasks in each dimension.
-      Index nm = divup(nm0, gm);
-      Index nn = divup(nn0, gn);
+    // Calculate task grain size (number of kernels executed per task).
+    // This task size coarsening serves two purposes:
+    // 1. It reduces per-task overheads including synchronization overheads.
+    // 2. It allows to use caches better (reuse the same packed rhs in several
+    // consecutive kernels).
+    Index gm = 1;
+    Index gn = 1;
+    // If we are sharding by column, then we prefer to reduce rows first.
+    if (shard_by_col) {
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+    } else {
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+    }
+    // Number of tasks in each dimension.
+    Index nm = divup(nm0, gm);
+    Index nn = divup(nn0, gn);
 
-      // Last by not least, decide whether we want to issue both lhs and rhs
-      // packing in parallel; or issue lhs packing first, and then issue rhs
-      // packing when lhs packing completes (for !shard_by_col lhs and rhs are
-      // swapped). Parallel packing allows more parallelism (for both packing
-      // and kernels), while sequential packing provides better locality (once
-      // a thread finishes rhs packing it proceed to kernels with that rhs).
-      // First, we are interested in parallel packing if there are few tasks.
-      bool parallel_pack = num_threads >= nm * nn;
-      // Also do parallel packing if all data fits into L2$.
-      if (m * bk * sizeof(LhsScalar) + n * bk * sizeof(RhsScalar) <=
-          l2CacheSize() * num_threads)
-        parallel_pack = true;
-      // But don't do it if we will use each rhs only once. Locality seems to be
-      // more important in this case.
-      if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+    // If there is enough concurrency in the sharding dimension, we choose not
+    // to paralellize by the other dimension, and execute all kernels in sync
+    // mode. This reduces parallelism from the nm x nn down to nn
+    // (shard_by_col==true) or nm (shard_by_col==false).
+    const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+    const int num_worker_threads = this->m_device.numThreadsInPool();
 
+    // With small number of threads we want to make sure that we do not reduce
+    // parallelism too much. With large number of threads we trade maximum
+    // parallelism for better memory locality.
+    const float oversharding_factor =
+        num_worker_threads <= 4  ? 8.0 :
+        num_worker_threads <= 8  ? 4.0 :
+        num_worker_threads <= 16 ? 2.0 :
+        num_worker_threads <= 32 ? 1.0 :
+        num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
+
+    const bool parallelize_by_sharding_dim_only =
+        sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
+    // Last by not least, decide whether we want to issue both lhs and rhs
+    // packing in parallel; or issue lhs packing first, and then issue rhs
+    // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+    // swapped). Parallel packing allows more parallelism (for both packing and
+    // kernels), while sequential packing provides better locality (once
+    // a thread finishes rhs packing it proceed to kernels with that rhs).
+    // First, we are interested in parallel packing if there are few tasks.
+    bool parallel_pack = num_threads >= nm * nn;
+    // Also do parallel packing if all data fits into L2$.
+    if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
+        l2CacheSize() * num_threads)
+      parallel_pack = true;
+    // But don't do it if we will use each rhs only once. Locality seems to be
+    // more important in this case.
+    if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+    // Also don't get in the way of parallelize_by_sharding_dim_only
+    // optimization.
+    if (parallelize_by_sharding_dim_only) parallel_pack = false;
+
+    // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext.
+    if (IsEvalInSyncMode) {
 #define CONTEXT_ARGS                                                        \
   (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
-   nn0, shard_by_col, parallel_pack)                                        \
+   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
+   NoCallback())                                                            \
       .run()
+      TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment,
+                                  CONTEXT_ARGS);
+#undef CONTEXT_ARGS
 
-      TENSOR_CONTRACTION_DISPATCH(Context, Alignment, CONTEXT_ARGS);
-
+    } else {
+#define CONTEXT_ARGS                                                        \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
+   std::move(done))
+      TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback,
+                                        Alignment, CONTEXT_ARGS, run());
 #undef CONTEXT_ARGS
     }
   }
 
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-  template<int Alignment>
-  class ContextXsmm {
+  // ------------------------------------------------------------------------ //
+
+  // Dummy struct to represent an empty DoneCallback.
+
+  struct NoCallback {
+    void operator()() {
+      eigen_assert(false && "NoCallback should never be called");
+    }
+  };
+
+  // ------------------------------------------------------------------------ //
+
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification;
+
+  // Synchronous evaluation notification that blocks caller thread in Wait().
+  template <typename Context>
+  class EvalParallelNotification<NoCallback, Context> {
    public:
-    ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
-                const internal::TensorXsmmContractionBlocking<LhsScalar,
-                    RhsScalar, Index>& blocking):
-        m(m), n(n), k(k),
-        bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
-        copyA(blocking.copyA()), copyB(blocking.copyB()),
-        transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
-        num_threads(blocking.num_threads()),
-        stride_a(blocking.transposeA() ? k : m),
-        stride_b(blocking.transposeB() ? n : k),
-        stride_c(m),
-        blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
-        blocks_n(blocking.blocks_n()),
-        buffer(buffer), device(self->m_device),
-        workers_done(blocking.num_threads()),
-        leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
-        packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()),
-        packingA_jobs(0), packingB_jobs(0), compute_jobs(0) {}
+    EvalParallelNotification(Context*, NoCallback) {}
+    void Notify() { done_.Notify(); }
+    void Wait() { done_.Wait(); }
+   private:
+    Eigen::Notification done_;
+  };
 
-    void worker() {
-      // Pack
+  // Asynchronous evaluation notification that does not block in Wait().
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification {
+   public:
+    EvalParallelNotification(Context* ctx, DoneCallback done)
+        : ctx_(ctx), done_(std::move(done)) {}
 
-      if (copyA) {
-        while (true) {
-          uint32_t mk = packingA_jobs++;
-          Index mi = mk / blocks_k;
-          Index ki = mk % blocks_k;
-          if (mi >= blocks_m) break;
+    void Notify() {
+      // Make a copy of done callback, because it will be destructed when we
+      // will delete context in the next line (EvalParallelNotification is a
+      // data member of EvalParallelContext class).
+      DoneCallback done_copy = std::move(done_);
 
-          LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
-          if (transposeA) {
-            const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
-            libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
-                           actual_bm(mi), stride_a, bm);
-          } else {
-            const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
-            internal::pack_simple<LhsScalar, Index>(blockA, current_a,
-                actual_bk(ki), actual_bm(mi), bm, stride_a);
-          }
-          packingA_done.at(mi)++;
-        }
-      }
+      // Delete parallel evaluation context.
+      delete ctx_;
 
-      if (copyB) {
-        while (true) {
-          uint32_t nk = packingB_jobs++;
-          Index ni = nk / blocks_k;
-          Index ki = nk % blocks_k;
-          if (ni >= blocks_n) break;
-
-          RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
-          if (transposeB) {
-            const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
-                                          (ni*bn);
-            libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
-                           actual_bk(ki), stride_b, bk);
-          } else {
-            const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
-                                          (ki*bk);
-            internal::pack_simple<RhsScalar, Index>(blockB, current_b,
-                actual_bn(ni), actual_bk(ki), bk, stride_b);
-          }
-          packingB_done.at(ni)++;
-        }
-      }
-
-      // Compute
-
-      while (true) {
-        uint32_t mn = compute_jobs++;
-        Index mi = mn / blocks_n;
-        Index ni = mn % blocks_n;
-        if (mi >= blocks_m) break;
-
-        // Wait for mi, ni packings to be done. This is more fine-grained than
-        // waiting for all workers to finish packing.
-        while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
-               (copyB && (packingB_done.at(ni) < blocks_k)))
-        {}
-
-        for (Index ki=0; ki < blocks_k; ++ki) {
-          const LhsScalar * current_a = copyA ?
-              blocksA + (bk*bm) * (mi*blocks_k+ki) :
-              leftData + (bk*ki)*stride_a + (bm*mi);
-          const RhsScalar * current_b = copyB ?
-              blocksB + (bk*bn) * (ni*blocks_k+ki) :
-              rightData + (ni*bn)*stride_b + (bk*ki);
-
-          Index current_stride_a = copyA ? bm : stride_a;
-          Index current_stride_b = copyB ? bk : stride_b;
-
-          // Memory may not be zeroed, overwrite instead of adding in first
-          // iteration.
-          float beta = ki == 0 ? 0 : 1;
-
-          Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
-          internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
-              0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
-              current_stride_a, current_stride_b, stride_c, 1, beta, 0)
-          (current_a, current_b, current_c);
-        }
-      }
-
-      workers_done.Notify();
+      // Now safely call the done callback.
+      done_copy();
     }
 
-    void run() {
-      // Parallelization strategy.
-      //
-      // First pack A into blocks (sharding by m, k) and B (sharding by n,k),
-      // then shard by m, n.
-      //
-      // Do not use advanced ThreadPool queuing, just run a single long-standing
-      // function in each thread.
-
-      internal::EigenStatsWrapper::get()->add(internal::MatmulOp{
-        internal::MatmulOp::Algorithm::XSMM, m, k, n, transposeA, transposeB,
-        num_threads});
-
-      if (copyA) {
-        blocksA = static_cast<LhsScalar*>(device.allocate(
-            (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
-      }
-      if (copyB) {
-        blocksB = static_cast<RhsScalar*>(device.allocate(
-            (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
-      }
-
-      for (Index i=0; i < num_threads; ++i) {
-          device.enqueue_function([=]() { worker(); });
-      }
-
-      workers_done.Wait();
-
-      if (copyA) {
-        device.deallocate(blocksA);
-      }
-      if (copyB) {
-        device.deallocate(blocksB);
-      }
-    }
+    void Wait() {}
 
    private:
-    // real block size for block index in [0, ..., blocks - 1].
-    Index actual_bm(Index mi) const {
-      return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
-    }
-    Index actual_bk(Index ki) const {
-      return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
-    }
-    Index actual_bn(Index ni) const {
-      return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
-    }
-
-    const Device& device;
-    Index m, k, n;
-    Index stride_a, stride_b, stride_c;
-    Index bm, bk, bn;  // Block sizes.
-    Index blocks_m, blocks_k, blocks_n;  // Number of blocks in each dimension.
-    bool copyA, copyB, transposeA, transposeB;
-    size_t num_threads;
-    Scalar *buffer;
-    const LhsScalar *leftData;
-    const RhsScalar *rightData;
-
-    LhsScalar *blocksA;
-    RhsScalar *blocksB;
-    // barrier for joining all threads after all done.
-    Barrier workers_done;
-    // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
-    std::atomic<uint32_t> packingA_jobs;
-    std::atomic<uint32_t> packingB_jobs;
-    std::atomic<uint32_t> compute_jobs;
-    // already packed blocks for each mi-panel in A and ni-panel in B.
-    std::vector<std::atomic<uint8_t>> packingA_done;
-    std::vector<std::atomic<uint8_t>> packingB_done;
+    Context* ctx_;
+    DoneCallback done_;
   };
-#endif
 
-  // Context coordinates a single parallel gemm operation.
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
-  class Context {
+  // Context orchestrates sync/async parallel contraction evaluation. When it is
+  // executed in asynchronous mode, it owns all the shared state that might be
+  // accessible by block packing and kernel tasks.
+
+  template <typename DoneCallback, bool lhs_inner_dim_contiguous,
+            bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
+            int Alignment>
+  class EvalParallelContext {
    public:
     typedef internal::TensorContractionInputMapper<
         LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
@@ -458,23 +361,27 @@
         contract_t, internal::packet_traits<RhsScalar>::size,
         rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
         RhsMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
 
-    Context(const Self* self, int num_threads, Scalar* buffer, Index m, Index n,
-            Index k, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
-            Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col,
-            bool parallel_pack)
-        : device_(self->m_device),
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
+
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+
+    EvalParallelContext(const Self* self, int num_threads, Scalar* buffer,
+                        Index tm, Index tn, Index tk, Index bm, Index bn,
+                        Index bk, Index nm, Index nn, Index nk, Index gm,
+                        Index gn, Index nm0, Index nn0, bool shard_by_col,
+                        bool parallel_pack,
+                        bool parallelize_by_sharding_dim_only,
+                        DoneCallback done)
+        : created_by_thread_id_(std::this_thread::get_id()),
+          done_(this, std::move(done)),
+          device_(self->m_device),
           lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
                self->m_i_strides, self->m_left_contracting_strides,
                self->m_k_strides),
@@ -482,11 +389,16 @@
                self->m_j_strides, self->m_right_contracting_strides,
                self->m_k_strides),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
+          output_kernel_(self->m_output_kernel),
+          tensor_contraction_params_(self->m_tensor_contraction_params),
           num_threads_(num_threads),
-          m_(m),
-          n_(n),
-          k_(k),
+          shard_by_col_(shard_by_col),
+          parallel_pack_(parallel_pack),
+          parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),
@@ -497,13 +409,28 @@
           gn_(gn),
           nm0_(nm0),
           nn0_(nn0),
-          shard_by_col_(shard_by_col),
-          parallel_pack_(parallel_pack) {
+          kernel_(m_, k_, n_, bm_, bk_, bn_),
+          num_thread_local_allocations_(0),
+          // We reserve 2X more capacity for a thread local values, than the
+          // number of threads in the pool to efficiently handle task stealing
+          // by threads that are not managed by the pool.
+          thread_local_capacity(2 * (parallelize_by_sharding_dim_only_
+                                         ? device_.numThreadsInPool()
+                                         : 0)),
+          // We will use only one of the Lhs/Rhs thread local storage depending
+          // on the shard_by_col value and we parallelize by sharding dim ONLY.
+          lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity,
+                                   {*this}, {*this}),
+          rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0,
+                                   {*this}, {*this}) {
+      // These two options are mutually exclusive.
+      eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
+
       for (Index x = 0; x < P; x++) {
         // Normal number of notifications for k slice switch is
         // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
         // nm_ + nn_ notifications, because they will not receive notifications
-        // from preceeding kernels.
+        // from preceding kernels.
         state_switch_[x] =
             x == 0
                 ? 1
@@ -513,7 +440,7 @@
             parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
         state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
         for (Index m = 0; m < nm_; m++) {
-          state_kernel_[x][m] = new std::atomic<uint8_t>[ nn_ ];
+          state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
           // Kernels generally receive 3 notifications (previous kernel + 2
           // packing), but the first slice won't get notifications from previous
           // kernels.
@@ -525,64 +452,97 @@
       }
 
       // Allocate memory for packed rhs/lhs matrices.
-      size_t align = numext::maxi(EIGEN_ALIGN_BYTES, 1);
-      size_t lhs_size =
-          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
-      size_t rhs_size =
-          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
-      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
-          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
-      char* mem = static_cast<char*>(packed_mem_);
-      for (Index x = 0; x < numext::mini<size_t>(nk_, P - 1); x++) {
-        packed_lhs_[x].resize(nm0_);
-        for (Index m = 0; m < nm0_; m++) {
-          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
-          mem += lhs_size;
-        }
-        packed_rhs_[x].resize(nn0_);
-        for (Index n = 0; n < nn0_; n++) {
-          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
-          mem += rhs_size;
+      packed_mem_ = kernel_.allocateSlices(            //
+          device_,                                     //
+          /*num_lhs=*/nm0_,                            //
+          /*num_rhs=*/nn0_,                            //
+          /*num_slices=*/std::min<Index>(nk_, P - 1),  //
+          packed_lhs_, packed_rhs_);
+
+      if (parallelize_by_sharding_dim_only_) {
+        const int num_worker_threads = device_.numThreadsInPool();
+
+        if (shard_by_col) {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
+          for (int i = 0; i < nn_; ++i)
+            can_use_thread_local_packed_[i].store(true,
+                                                  std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gn_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/0,                                        //
+              /*num_rhs=*/num_blocks,                               //
+              /*num_slices=*/1,                                     //
+              /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
+
+        } else {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
+          for (int i = 0; i < nm_; ++i)
+            can_use_thread_local_packed_[i].store(true,
+                                                  std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gm_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/num_blocks,                               //
+              /*num_rhs=*/0,                                        //
+              /*num_slices=*/1, &lhs_thread_local_pre_allocated_,   //
+              /*rhs_blocks=*/nullptr);
         }
       }
     }
 
-    ~Context() {
+    ~EvalParallelContext() {
       for (Index x = 0; x < P; x++) {
         for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
         delete[] state_kernel_[x];
       }
-      internal::aligned_free(packed_mem_);
+      kernel_.deallocate(device_, packed_mem_);
+      if (parallelize_by_sharding_dim_only_) {
+        kernel_.deallocate(device_, thread_local_pre_alocated_mem_);
+        delete[] can_use_thread_local_packed_;
+      }
     }
 
     void run() {
-      internal::EigenStatsWrapper::get()->add(internal::MatmulOp{
-        internal::MatmulOp::Algorithm::GEBP,
-        static_cast<std::size_t>(m_), static_cast<std::size_t>(k_),
-        static_cast<std::size_t>(n_),
-        !lhs_inner_dim_contiguous, !rhs_inner_dim_contiguous,
-        static_cast<std::size_t>(num_threads_)});
-
       // Kick off packing of the first slice.
       signal_switch(0, 1);
+
       // Wait for overall completion.
-      // TODO(dvyukov): this wait can lead to deadlock.
-      // If nthreads contractions are concurrently submitted from worker
-      // threads, this wait will block all worker threads and the system will
-      // deadlock.
+      //
+      // If parallel evaluation is executed in async mode, this is a no-op, and
+      // Wait() will return immediately. In synchronous mode it will block the
+      // caller thread until it will receive notification from last task.
+      //
+      // In async mode, last task when completed will call done callback from
+      // the same thread, and will delete this context.
+      //
+      // TODO(dvyukov): This wait can lead to deadlock if contraction is
+      // evaluated in synchronous mode. If nthreads contractions are
+      // concurrently submitted from worker threads, this wait will block all
+      // worker threads and the system will deadlock.
       done_.Wait();
     }
 
    private:
-    Notification done_;
+    std::thread::id created_by_thread_id_;
+
+    // This notification is specialized on the type of DoneCallback and can be
+    // blocking or non-blocking.
+    EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
+
     const Device& device_;
     LhsMapper lhs_;
     RhsMapper rhs_;
     Scalar* const buffer_;
     OutputMapper output_;
+    OutputKernelType output_kernel_;
+    TensorContractionParams tensor_contraction_params_;
     const int num_threads_;
     const bool shard_by_col_;
     const bool parallel_pack_;
+    const bool parallelize_by_sharding_dim_only_;
     // Matrix sizes.
     const Index m_;
     const Index n_;
@@ -602,6 +562,8 @@
     // coarsening).
     const Index nm0_;
     const Index nn0_;
+    // Tensor contraction kernel.
+    TensorContractionKernel kernel_;
 
     // Parallelization strategy.
     //
@@ -638,9 +600,215 @@
     // actively executing + one to track completion of kernels in the second
     // slice.
     static const Index P = 3;
-    void* packed_mem_;
-    std::vector<LhsScalar*> packed_lhs_[P - 1];
-    std::vector<RhsScalar*> packed_rhs_[P - 1];
+
+    // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+    BlockMemHandle packed_mem_;
+    std::vector<LhsBlock> packed_lhs_[P - 1];
+    std::vector<RhsBlock> packed_rhs_[P - 1];
+
+    // If we choose to parallelize only by the sharding dimension, each thread
+    // will have it's own "thead local" (not a c++ thread local storage) memory
+    // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
+    // can't be passed to a kernel that might execute on a different thread.
+    //
+    // In practice when we are ready to pack memory for the sharding dimension
+    // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice
+    // already computed (99% of the time), and we can pack data into the thread
+    // local storage, and guarantee that all the kernels will be executed
+    // immediately in the same thread. This significantly increases L1 cache hit
+    // ratio and reduces pressure on the memory bus.
+    //
+    // It's still possible that kernel for the K-th slice will be ready before
+    // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+    // and packed_rhs_ to allow kernels to be executed later on a thread
+    // different from the thread that was used for packing.
+
+    // Handle for pre-allocated thread local memory buffers.
+    BlockMemHandle thread_local_pre_alocated_mem_;
+
+    // Only one of these will be initialized depending on shard_by_col value
+    // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+    std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+    std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+    // How many thread local blocks were already allocated.
+    std::atomic<int> num_thread_local_allocations_;
+    const int thread_local_capacity;
+
+    // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of
+    // unique threads in a system is below or equal to the number of threads in
+    // a thread pool. We will fallback on dynamic memory allocation after that.
+
+    // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
+    // size is equal to the grain size in Lhs/Rhs sharding dimension.
+    template <typename BlockType>
+    class ThreadLocalBlocks {
+     public:
+      ThreadLocalBlocks() = default;
+
+      ThreadLocalBlocks(BlockType* base, size_t grain_size)
+          : is_pre_allocated_(true),
+            thread_local_pre_allocated_base_(base),
+            grain_size_(grain_size) {}
+
+      ThreadLocalBlocks(BlockMemHandle mem_handle,
+                        std::vector<BlockType> blocks)
+          : is_pre_allocated_(false),
+            mem_handle_(std::move(mem_handle)),
+            blocks_(std::move(blocks)) {}
+
+      BlockType& block(int grain_index) {
+        eigen_assert(grain_index >= 0);
+        eigen_assert(static_cast<size_t>(grain_index) < size());
+        return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index]
+                                 : blocks_[grain_index];
+      }
+
+      void Release(EvalParallelContext& ctx) const {
+        if (!is_pre_allocated_) {
+          ctx.kernel_.deallocate(ctx.device_, mem_handle_);
+        }
+      }
+
+      size_t size() const {
+        return is_pre_allocated_ ? grain_size_ : blocks_.size();
+      }
+
+     private:
+      bool is_pre_allocated_;
+
+      // Reuse pre-allocated thread local buffers.
+      BlockType* thread_local_pre_allocated_base_ = nullptr;
+      size_t grain_size_ = 0;
+
+      // These will be initialized only if `is_pre_allocated == false`.
+      BlockMemHandle mem_handle_{};
+      std::vector<BlockType> blocks_;
+    };
+
+    // ThreadLocalBlocksInitialize callable does custom thread local blocks
+    // initialization, and will reuse pre-allocated buffers if possible, or will
+    // dynamically allocate new memory.
+    //
+    // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
+    // for what side do we plan to do block allocation.
+    template <typename BlockType, bool is_rhs>
+    class ThreadLocalBlocksInitialize {
+      static constexpr bool kIsLhs =
+          !is_rhs && std::is_same<BlockType, LhsBlock>::value;
+      static const bool kIsRhs =
+          is_rhs && std::is_same<BlockType, RhsBlock>::value;
+      static_assert(kIsLhs || kIsRhs, "Unkown block type");
+
+      using Blocks = ThreadLocalBlocks<BlockType>;
+
+     public:
+      ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
+          : ctx_(ctx),
+            num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
+
+      void operator()(Blocks& blocks) {
+        const int n = ctx_.num_thread_local_allocations_.fetch_add(
+            1, std::memory_order_relaxed);
+
+        if (n >= num_worker_threads_) {
+          ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
+        } else {
+          ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
+        }
+      }
+
+     private:
+      // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
+      // TensorContractionKernel::allocateSlices into template specializations.
+      // Also explicit specializations are not allowed at class scope in C++03,
+      // EvalCtx type parameter is just a workaround for that limitation.
+      template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
+      struct ThreadLocalBlocksAllocator;
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<RhsBlock> rhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+              ctx.device_,
+              /*num_lhs=*/0,
+              /*num_rhs=*/ctx.gn_,
+              /*num_slices=*/1,
+              /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+
+          blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle),
+                                               std::move(rhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
+          blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
+        }
+      };
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<LhsBlock> lhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+              ctx.device_,
+              /*num_lhs=*/ctx.gm_,
+              /*num_rhs=*/0,
+              /*num_slices=*/1,
+              /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+
+          blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle),
+                                               std::move(lhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
+          blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
+        }
+      };
+
+      EvalParallelContext& ctx_;
+      const int num_worker_threads_;
+    };
+
+    template <typename BlockType>
+    class ThreadLocalBlocksRelease {
+     public:
+      using Blocks = ThreadLocalBlocks<BlockType>;
+      ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
+      void operator()(Blocks& blocks) { blocks.Release(ctx_); }
+
+     private:
+      EvalParallelContext& ctx_;
+    };
+
+    // ThreadLocalBlocks initialization callables.
+    using ThreadLocalLhsInit =
+        ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
+    using ThreadLocalRhsInit =
+        ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
+
+    // ThreadLocalBlocks release callables.
+    using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
+    using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
+
+    // Thread local containers for Lhs/Rhs block packs. In practice only one of
+    // them will be used, depending on the shard_by_col value.
+    Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit,
+                       ThreadLocalLhsRelease>
+        lhs_thread_local_blocks_;
+    Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit,
+                       ThreadLocalRhsRelease>
+        rhs_thread_local_blocks_;
+
+    // After a particular shard for Kth slice missed thread local execution
+    // opportunity (K-1 slice didn't complete kernels execution), we can no
+    // longer schedule K+1 and following slices in thread local mode, because
+    // there is no more guarantee that previous kernels were executed
+    // sequentially in the same thread (size is nn_ or nm_).
+    std::atomic<bool>* can_use_thread_local_packed_;
+
     std::atomic<uint8_t>** state_kernel_[P];
     // state_switch_ is frequently modified by worker threads, while other
     // fields are read-only after constructor. Let's move it to a separate cache
@@ -649,69 +817,168 @@
     std::atomic<Index> state_packing_ready_[P];
     std::atomic<Index> state_switch_[P];
 
+    LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(!shard_by_col_);
+        ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
+
+        Index grain_index = m1 - m * gm_;
+        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_lhs_[k % (P - 1)][m1];
+      }
+    }
+
+    RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(shard_by_col_);
+        ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
+
+        Index grain_index = n1 - n * gn_;
+        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_rhs_[k % (P - 1)][n1];
+      }
+    }
+
+    // In following two methods (pack_lhs and pack_rhs), if we know for sure
+    // that we'll be able to immediately call a kernel with packed data, and do
+    // not submit it to the thread pool, we can use thread local memory for
+    // packed data.
+    //
+    // We can only reliably check it if we are running all kernels in sync mode
+    // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
+    // run, it's guaranteed that all kernels with larger values of m (n) are
+    // also ready, because we execute them in the same order for all K slices.
+
     void pack_lhs(Index m, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
+          can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in following slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[m].store(false,
+                                                std::memory_order_relaxed);
+        }
+      }
+
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local),
+                        lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
+        assert(!use_thread_local);
         signal_packing(k);
       } else {
         signal_switch(k + 1);
-        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+        for (Index n = nn_ - 1; n >= 0; n--) {
+          bool sync = parallelize_by_sharding_dim_only_ || n == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
       }
     }
 
     void pack_rhs(Index n, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
+          can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in followig slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[n].store(false,
+                                                std::memory_order_relaxed);
+        }
+      }
+
       const Index nend = n * gn_ + gn(n);
       for (Index n1 = n * gn_; n1 < nend; n1++) {
-        if (k == 0) {
-          // Zero the output memory in parallel.
-          // On 10000x2x10000 mm zeroing can easily take half of time.
-          // Zero (bn x m) row. Safe to do here because all kernels that will
-          // write to this memory depend on completion of this task.
-          // Note: don't call device_.memset() here. device_.memset() blocks on
-          // thread pool worker thread, which can lead to underutilization and
-          // deadlocks.
+        if (!TensorContractionKernel::HasBeta && k == 0) {
+          // Zero the output memory in parallel, only if contraction kernel does
+          // not support `beta`. Otherwise we will pass beta 0.0 to the first
+          // call to the `TensorContractionKernel::invoke()`.
+          //
+          // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
+          // x m) row. Safe to do here because all kernels that will write to
+          // this memory depend on completion of this task. Note: don't call
+          // device_.memset() here. device_.memset() blocks on thread pool
+          // worker thread, which can lead to underutilization and deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
+                        rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
         signal_switch(k + 1);
-        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+        for (Index m = nm_ - 1; m >= 0; m--) {
+          bool sync = parallelize_by_sharding_dim_only_ || m == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
       } else {
+        assert(!use_thread_local);
         signal_packing(k);
       }
     }
 
-    void kernel(Index m, Index n, Index k) {
+    void kernel(Index m, Index n, Index k, bool use_thread_local) {
       // Note: order of iteration matters here. Iteration over m is innermost
-      // because we want to reuse the same packed rhs in consequetive tasks
+      // because we want to reuse the same packed rhs in consecutive tasks
       // (rhs fits into L2$ while lhs only into L3$).
       const Index nend = n * gn_ + gn(n);
       const Index mend = m * gm_ + gm(m);
+
+      // NOTE: output = alpha * LHS * RHS + beta * output.
+      const Scalar alpha = Scalar(1);
+      const Scalar beta =
+          (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
+
       if (shard_by_col_) {
         for (Index n1 = n * gn_; n1 < nend; n1++) {
-          for (Index m1 = m * gm_; m1 < mend; m1++)
-            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
-                         packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+          for (Index m1 = m * gm_; m1 < mend; m1++) {
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(
+                output_mapper,
+                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+                bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_,
+                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
+          }
         }
       } else {
         for (Index m1 = m * gm_; m1 < mend; m1++)
           for (Index n1 = n * gn_; n1 < nend; n1++) {
-            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
-                         packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(
+                output_mapper,
+                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+                bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_,
+                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
           }
       }
-      signal_kernel(m, n, k + 1, false);
+      signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
       signal_switch(k + 2);
     }
 
@@ -724,16 +991,23 @@
       enqueue_packing(k, shard_by_col_);
     }
 
-    void signal_kernel(Index m, Index n, Index k, bool sync) {
+    void signal_kernel(Index m, Index n, Index k, bool sync,
+                       bool use_thread_local) {
       std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
       Index s = state->load();
       eigen_assert(s > 0);
-      if (s != 1 && state->fetch_sub(1) != 1) return;
+      if (s != 1 && state->fetch_sub(1) != 1) {
+        eigen_assert(!use_thread_local);
+        return;
+      }
       state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
-      if (sync)
-        kernel(m, n, k);
-      else
-        device_.enqueue_function([=]() { kernel(m, n, k); });
+      if (sync) {
+        kernel(m, n, k, use_thread_local);
+      } else {
+        eigen_assert(!use_thread_local);
+        device_.enqueueNoNotification(
+            [=]() { kernel(m, n, k, use_thread_local); });
+      }
     }
 
     void signal_switch(Index k, Index v = 1) {
@@ -747,10 +1021,6 @@
           (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
           nm_ * nn_;
       if (k < nk_) {
-        // It is important to copy out nm_ and nn_, because once we kick off
-        // the last packing operation this and device_ can be destroyed.
-        Index nm = nm_;
-        Index nn = nn_;
         // Issue lhs/rhs packing. Their completion will in turn kick off
         // kernels.
         if (parallel_pack_) {
@@ -787,28 +1057,403 @@
         else
           pack_lhs(start, k);
       } else {
-        Index mid = (start + end) / 2;
-        device_.enqueue_function(
-            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
-        device_.enqueue_function(
-            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+        while (end - start > 1) {
+          Index mid = (start + end) / 2;
+          device_.enqueueNoNotification(
+              [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+          end = mid;
+        }
+
+        // Decide if we want to run first packing task (start == 0) in
+        // async mode if we parallelize only by sharding dim:
+        // (1) pack_lhs and pack_rhs call signal_switch before completing
+        //     all calls to signal_kernel, which in sync mode might lead
+        //     to the execution of the first kernel of the k+1 slice, before
+        //     completing a call to the last kernel of the k slice.
+        // (2) all pack tasks for sharded dim must be executed in a thread
+        //     pool to get pre-allocated thead local buffers.
+        bool pack_async =
+          (start == 0) &&
+          (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) &&
+          (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
+
+        if (pack_async) {
+          device_.enqueueNoNotification(
+              [=]() { enqueue_packing_helper(start, end, k, rhs); });
+        } else {
+          enqueue_packing_helper(start, end, k, rhs);
+        }
       }
     }
 
     // Block sizes with accounting for potentially incomplete last block.
-    Index bm(Index m) { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
-    Index bn(Index n) { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
-    Index bk(Index k) { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
     // Task grain sizes accounting for potentially incomplete last task.
-    Index gm(Index m) { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
-    Index gn(Index n) { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
+    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
 
-    Context(const Context&) = delete;
-    void operator=(const Context&) = delete;
+    EvalParallelContext(const EvalParallelContext&) = delete;
+    void operator=(const EvalParallelContext&) = delete;
   };
 
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  using SyncEvalParallelContext =
+      EvalParallelContext<NoCallback, lhs_inner_dim_contiguous,
+                          rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                          Alignment>;
+
+  // ------------------------------------------------------------------------ //
+
+  // EvalShardedByInnerDimContext orchestrates sync/async contraction
+  // evaluation, when we shard by inner dimension. When it is executed in
+  // asynchronous mode, it owns all the shared state that might be accessible by
+  // block processing tasks.
+
+  template <typename DoneCallback>
+  struct EvalShardedByInnerDimContext {
+    EvalShardedByInnerDimContext(const Self* self, int num_threads,
+                                 Scalar* result_buffer,
+                                 Index m_size, Index n_size, Index k_size,
+                                 DoneCallback done_callback)
+        : evaluator(self),
+          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+          result(result_buffer),
+          m(m_size),
+          n(n_size),
+          k(k_size),
+          done(std::move(done_callback)),
+          buffer_size_bytes(m * n * sizeof(Scalar)),
+          block_size(blockSize(k, num_threads)),
+          num_blocks(divup<Index>(k, block_size)),
+          num_pending_blocks(internal::convert_index<int>(num_blocks)),
+          l0_ranges(divup<Index>(num_blocks, l0_size)),
+          l0_state(l0_ranges),
+          block_buffers(num_blocks) {
+      // Keep count of pending gemm tasks for each l0 range.
+      for (int i = 0; i < l0_ranges; ++i) {
+        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+      }
+
+      // Allocate temporary buffers for each block.
+      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        Scalar* buf = block_idx == 0
+                          ? result
+                          : static_cast<Scalar*>(evaluator->m_device.allocate(
+                                buffer_size_bytes));
+        block_buffers.emplace_back(buf);
+      }
+    }
+
+    ~EvalShardedByInnerDimContext() {
+      for (Index i = 1; i < num_blocks; ++i) {
+        evaluator->m_device.deallocate(block_buffers[i]);
+      }
+    }
+
+    template <int Alignment>
+    void run() {
+      Barrier barrier(internal::convert_index<int>(num_blocks));
+      eval<Alignment>(barrier, 0, num_blocks);
+      barrier.Wait();
+
+      // Aggregate partial sums from l0 ranges.
+      aggregateL0Blocks<Alignment>();
+
+      // Apply output kernel.
+      applyOutputKernel();
+    }
+
+    template <int Alignment>
+    void runAsync() {
+      evalAsync<Alignment>(0, num_blocks);
+    }
+
+   private:
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size and subtle breakage occurs if this is violated.
+    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+    const Self* evaluator;  // TensorContraction evaluator
+
+    // These fields required fromTENSOR_CONTRACTION_DISPATCH macro.
+    bool m_lhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_reordered;
+
+    Scalar* result;
+
+    Index m;
+    Index n;
+    Index k;
+
+    DoneCallback done;
+
+    // ----------------------------------------------------------------------//
+    // Algorithm parameters.
+
+    // We will compute partial results into the buffers of this size.
+    Index buffer_size_bytes;
+
+    Index block_size;
+    Index num_blocks;
+
+    // Keep track of pending tasks when evaluate in async mode.
+    std::atomic<int> num_pending_blocks;
+
+    // We compute partial gemm results in parallel, and to get the final result
+    // we need to add them all together. For the large number of threads (>= 48)
+    // this adds a very expensive sequential step at the end.
+    //
+    // We split the [0, num_blocks) into small ranges, and when a task for the
+    // block finishes its partial gemm computation, it checks if it was the last
+    // gemm in the range, and if so, it will add all blocks of the range.
+    //
+    // After all tasks done, we need to add only these pre-aggregated blocks.
+
+    // For now we use just a single level of ranges to compute pre-aggregated
+    // partial sums, but in general we can use more layers to compute tree
+    // aggregation in parallel and reduce the size of the sequential step.
+    //
+    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+    // sense only if number of threads >= ~128?
+    static const Index l0_size = 4;
+    Index l0_ranges;
+
+    // Keep count of pending gemm tasks for each l0 range.
+    MaxSizeVector<std::atomic<int>> l0_state;  // [0, l0_ranges)
+
+    // Buffers allocated for each temporary block computation.
+    MaxSizeVector<Scalar*> block_buffers;  // [0, num_blocks)
+
+    template <int Alignment>
+    void processBlock(Index block_idx, Index begin, Index end) {
+      Scalar* buf = block_buffers[block_idx];
+
+      TENSOR_CONTRACTION_DISPATCH(
+          evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
+          (buf, begin, end,
+           /*num_threads=*/internal::convert_index<int>(num_blocks)));
+
+      // Check if it was the last task in l0 range.
+      const Index l0_index = block_idx / l0_size;
+      const int v = l0_state[l0_index].fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      // If we processed the last block of the range, we can aggregate all
+      // partial results into the first block of the range.
+      if (v == 1) {
+        const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
+        const Index dst_block_idx = l0_index * l0_size;
+
+        if (rng_size == l0_size) {
+          addAllToBuffer<Alignment>(
+              m * n,
+              /*src_buf0=*/block_buffers[dst_block_idx + 1],
+              /*src_buf1=*/block_buffers[dst_block_idx + 2],
+              /*src_buf2=*/block_buffers[dst_block_idx + 3],
+              /*dst_buf= */ block_buffers[dst_block_idx]);
+        } else {
+          // Aggregate blocks of potentially incomplete last range.
+          for (int i = 1; i < rng_size; ++i) {
+            addToBuffer<Alignment>(m * n,
+                                   /*src_buf=*/block_buffers[dst_block_idx + i],
+                                   /*dst_buf=*/block_buffers[dst_block_idx]);
+          }
+        }
+      }
+    }
+
+    // Aggregate partial sums from l0 ranges.
+    template <int Alignment>
+    void aggregateL0Blocks() const {
+      Index l0_index = 1;
+
+      for (; l0_index + 2 < l0_ranges; l0_index += 3) {
+        addAllToBuffer<Alignment>(
+            m * n,
+            /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
+            /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
+            /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
+            /*dst_buf= */ block_buffers[0]);
+      }
+
+      for (; l0_index < l0_ranges; ++l0_index) {
+        addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
+                               block_buffers[0]);
+      }
+    }
+
+    void applyOutputKernel() const {
+      typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+      evaluator->m_output_kernel(
+          OutputMapper(result, m), evaluator->m_tensor_contraction_params,
+          static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
+    }
+
+    // Compute block size with accounting for potentially incomplete last block.
+    Index actualBlockSize(Index block_idx) const {
+      return block_idx + 1 < num_blocks
+                 ? block_size
+                 : k + block_size - block_size * num_blocks;
+    };
+
+    // Compute range size with accounting for potentially incomplete last range.
+    Index actualRangeSize(Index num_ranges, Index range_size,
+                          Index range_idx) const {
+      eigen_assert(range_idx < num_ranges);
+      return range_idx + 1 < num_ranges
+                 ? range_size
+                 : num_blocks + range_size - range_size * num_ranges;
+    };
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf,
+                                                Scalar* tgt_buf) {
+      const int output_packet_size =
+          internal::unpacket_traits<PacketReturnType>::size;
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const PacketReturnType src_val =
+            internal::pload<PacketReturnType>(src_buf + i);
+        const PacketReturnType tgt_val =
+            internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
+        const PacketReturnType sum = internal::padd(src_val, tgt_val);
+        internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i,
+                                                               sum);
+      }
+      for (; i < n; ++i) {
+        tgt_buf[i] += src_buf[i];
+      }
+    }
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n,
+                                                   const Scalar* src_buf0,
+                                                   const Scalar* src_buf1,
+                                                   const Scalar* src_buf2,
+                                                   Scalar* dst_buf) {
+      using ::Eigen::internal::padd;
+      using ::Eigen::internal::pload;
+      using ::Eigen::internal::ploadt;
+      using ::Eigen::internal::pstoret;
+
+      const int output_packet_size =
+          internal::unpacket_traits<PacketReturnType>::size;
+
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+        const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+        const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+        const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+        const auto sum =
+            padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+        pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+      }
+      for (; i < n; ++i) {
+        dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+      }
+    }
+
+    template <int Alignment>
+    void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueueNoNotification(
+            [this, &barrier, mid_block_idx, end_block_idx]() {
+              eval<Alignment>(barrier, mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+      barrier.Notify();
+    }
+
+    template <int Alignment>
+    void evalAsync(Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueueNoNotification(
+            [this, mid_block_idx, end_block_idx]() {
+              evalAsync<Alignment>(mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+
+      int v = num_pending_blocks.fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      if (v == 1) {
+        // Aggregate partial sums from l0 ranges.
+        aggregateL0Blocks<Alignment>();
+
+        // Apply output kernel.
+        applyOutputKernel();
+
+        // NOTE: If we call `done` callback before deleting this (context),
+        // it might deallocate Self* pointer captured by context, and we'll
+        // fail in destructor trying to deallocate temporary buffers.
+
+        // Move done call back from context before it will be destructed.
+        DoneCallback done_copy = std::move(done);
+
+        // We are confident that we are the last one who touches context.
+        delete this;
+
+        // Now safely call the done callback.
+        done_copy();
+      }
+    }
+
+    // Cost model doesn't capture well the cost associated with constructing
+    // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+    // and gemm_pack_rhs, so we specify minimum desired block size.
+    static Index blockSize(Index k, int num_threads) {
+      const auto round_up = [=](Index index) -> Index {
+        const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+        return divup<Index>(index, kmultiple) * kmultiple;
+      };
+
+      const Index target_block_size = round_up(divup<Index>(k, num_threads));
+      const Index desired_min_block_size = 12 * packet_size;
+
+      return numext::mini<Index>(
+          k, numext::maxi<Index>(desired_min_block_size, target_block_size));
+    }
+
+    EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+    void operator=(const EvalShardedByInnerDimContext&) = delete;
+  };
+
+  // ------------------------------------------------------------------------ //
+
+  // Below are the function used by evalProductImpl heuristics, trying to select
+  // optimcal parameters for parallelization algorithm.
+
   // Decide whether we want to shard m x n contraction by columns or by rows.
-  static bool shardByCol(Index m, Index n, int num_threads) {
+  static bool shardByCol(Index m, Index n, Index num_threads) {
     // Note: we are comparing both n and m against Traits::nr, it is not
     // a mistake. We are trying to figure out how both n and m will fit into
     // the main sharding dimension.
@@ -910,29 +1555,40 @@
     return 0;
   }
 
-  template <int Alignment>
-  EIGEN_STRONG_INLINE void addToBuffer(size_t n, const Scalar* src_buf,
-                                       Scalar* tgt_buf) const {
-    size_t i = 0;
-    const size_t num_packets = n / PacketSize;
-    for (; i < PacketSize * num_packets; i += PacketSize) {
-      const PacketReturnType src_val =
-          internal::pload<PacketReturnType>(src_buf + i);
-      const PacketReturnType tgt_val =
-          internal::pload<PacketReturnType>(tgt_buf + i);
-      const PacketReturnType sum = internal::padd(src_val, tgt_val);
-      internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i, sum);
+  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
+                               bool shard_by_col, bool prepacked) const {
+    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
+                                          PacketType<RhsScalar, Device>::size);
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    const double kd = static_cast<double>(bk);
+    double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
+    // Computations.
+    TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    if (prepacked) {
+      // Packing and kernels are executed in different tasks. When we calculate
+      // task grain size we look only at kernel cost assuming that kernel
+      // is more expensive than packing.
+      return cost;
     }
-    for (; i < n; ++i) {
-      tgt_buf[i] += src_buf[i];
-    }
+    // Lhs/rhs loads + computations.
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+    // Lhs packing memory cost does not contribute considerably to overall
+    // execution time because lhs is prefetched early and accessed sequentially.
+    if (shard_by_col)
+      lhsCost.dropMemoryCost();
+    else
+      rhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
   }
 
   // Decide whether we want to shard m x k x n contraction over the inner
   // (contraction) dimension (k).
   static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
                               int num_threads_by_k) {
-    size_t bufsize = m * n * sizeof(Scalar);
+    std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
     bool shard_by_k = false;
     if (n == 1 ||                // If mat*vec or...
         num_threads_by_k < 2 ||  // running single threaded or...
@@ -955,73 +1611,12 @@
     return shard_by_k;
   }
 
-  template <int Alignment>
-  void evalShardedByInnerDim(int num_threads, Scalar* result) const {
-    const Index m = this->m_i_size;
-    const Index n = this->m_j_size;
-    const Index k = this->m_k_size;
-    ::memset(result, 0, m * n * sizeof(Scalar));
-    FixedSizeVector<void*> thread_buffers(this->m_device.numThreads(), nullptr);
-    mutex mu;
-    auto process_block = [=, &mu, &thread_buffers](Index first, Index last) {
-      int thread_id = this->m_device.currentThreadId();
-      eigen_assert(thread_id != -1);
-      if (!thread_buffers[thread_id]) {
-        thread_buffers[thread_id] =
-            internal::aligned_malloc(m * n * sizeof(Scalar));
-        ::memset(thread_buffers[thread_id], 0, m * n * sizeof(Scalar));
-      }
-      Scalar* buf = static_cast<Scalar*>(thread_buffers[thread_id]);
-      TENSOR_CONTRACTION_DISPATCH(
-          this->template evalGemmPartial, Alignment,
-          (buf, first, last, this->m_device.numThreads()));
-      if (mu.try_lock()) {
-        // Add partial result to the output and free the buffer.
-        addToBuffer<Alignment>(m * n, buf, result);
-        mu.unlock();
-        internal::aligned_free(thread_buffers[thread_id]);
-        thread_buffers[thread_id] = nullptr;
-      }
-    };
-    // The underlying GEMM kernel assumes that k is a multiple of 8 and
-    // subtle breakage occurs if this is violated.
-    Index block_size = 8 * divup<Index>(k, 8 * num_threads);
-    int num_blocks = divup<Index>(k, block_size);
-    Barrier barrier(num_blocks);
-    std::function<void(Index, Index)> handleRange;
-    handleRange = [=, &barrier, &handleRange, &process_block](Index first,
-                                                              Index last) {
-      if (last - first <= block_size) {
-        // Single block or less, execute directly.
-        process_block(first, last);
-        barrier.Notify();
-        return;
-      }
-      // Split into halves and submit to the pool.
-      Index mid = first + divup(last - first, 2 * block_size) * block_size;
-      this->m_device.enqueue_function(
-          [=, &handleRange]() { handleRange(mid, last); });
-      this->m_device.enqueue_function(
-          [=, &handleRange]() { handleRange(first, mid); });
-    };
-    handleRange(0, k);
-    barrier.Wait();
-
-    // Add any remaining partial results.
-    for (int i = 0; i < this->m_device.numThreads(); ++i) {
-      if (thread_buffers[i]) {
-        const Scalar* buf = static_cast<Scalar*>(thread_buffers[i]);
-        addToBuffer<Alignment>(m * n, buf, result);
-        internal::aligned_free(thread_buffers[i]);
-      }
-    }
-  }
-
   TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
     // Compute cost.
-    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n);
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
     // Output stores.
-    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, PacketSize);
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
     TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
     TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
     // Since the inner gemm kernel is always sharded by column, the lhs
@@ -1031,18 +1626,19 @@
   }
 
   int numThreadsInnerDim(Index m, Index n, Index k) const {
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
     TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
     double total_parallel_cost =
         TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
     // Cost of reduction step accumulating the m*n per-thread buffers into the
     // result.
     double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost(
-        m * n, TensorOpCost(2, 1, 1, true, PacketSize));
-    Index num_threads = 1;
+        m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
+    int num_threads = 1;
     double min_cost = total_parallel_cost;
-    double kPerThreadOverHead = 4000;
+    double kPerThreadOverHead = 3000;
     double kFixedOverHead = 100000;
-    for (int nt = 2; nt <= this->m_device.numThreads(); nt++) {
+    for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
       double sequential_cost =
           kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
       double parallel_cost = total_parallel_cost / nt + sequential_cost;
@@ -1075,37 +1671,9 @@
     return computeBandwidth;
   }
 
-  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
-                               bool shard_by_col, bool prepacked) const {
-    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
-                                          PacketType<RhsScalar, Device>::size);
-    const double kd = static_cast<double>(bk);
-    // Computations.
-    TensorOpCost cost =
-        TensorOpCost(0, 0, kd * computeBandwidth(shard_by_col, bm, bn, bk),
-                     true, packed_size);
-    // Output stores.
-    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, PacketSize);
-    if (prepacked) {
-      // Packing and kernels are executed in different tasks. When we calculate
-      // task grain size we look only at kernel cost assuming that kernel
-      // is more expensive than packing.
-      return cost;
-    }
-    // Lhs/rhs loads + computations.
-    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
-    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
-    // Lhs packing memory cost does not contribute considerably to overall
-    // execution time because lhs is prefetched early and accessed sequentially.
-    if (shard_by_col)
-      lhsCost.dropMemoryCost();
-    else
-      rhsCost.dropMemoryCost();
-    return cost + lhsCost + rhsCost;
-  }
 };
 
-}  // end namespace Eigen
+} // end namespace Eigen
 
 #endif  // EIGEN_USE_THREADS
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 3231605..09d2da9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h

@@ -32,6 +32,7 @@
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
   enum { Flags = 0 };
+  typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
 };
 
 template<typename TargetType, typename XprType>
@@ -50,8 +51,12 @@
 
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
-struct PacketConverter {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl)
+struct PacketConverter;
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl) {}
 
   template<int LoadMode, typename Index>
@@ -66,7 +71,8 @@
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
 struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl) {}
 
   template<int LoadMode, typename Index>
@@ -85,7 +91,8 @@
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
 struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl) {}
 
   template<int LoadMode, typename Index>
@@ -104,23 +111,54 @@
   const TensorEvaluator& m_impl;
 };
 
-
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl)
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
+    SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
+    SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
+    SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
 
   template<int LoadMode, typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
     const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
-    if (index + SrcPacketSize < m_maxIndex) {
-      return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
+    // Only call m_impl.packet() when we have direct access to the underlying data. This
+    // ensures that we don't compute the subexpression twice. We may however load some
+    // coefficients twice, but in practice this doesn't negatively impact performance.
+    if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
+      // Force unaligned memory loads since we can't ensure alignment anymore
+      return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
     } else {
       const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
       typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
       typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
       internal::scalar_cast_op<SrcType, TgtType> converter;
-      EIGEN_ALIGN_DEFAULT typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+      EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < TgtPacketSize; ++i) {
         values[i] = converter(m_impl.coeff(index+i));
       }
@@ -156,8 +194,114 @@
     typename XprType::Nested m_xpr;
 };
 
+template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
+    impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+};
 
+template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
+    return impl.evalSubExprsIfNeeded(data);
+  }
+};
 
+#ifdef EIGEN_USE_THREADS
+template <bool SameType, typename Eval, typename EvalPointerType,
+          typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
+  }
+};
+
+template <typename Eval, typename EvalPointerType,
+          typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
+                                  EvalSubExprsCallback> {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+};
+#endif
+
+namespace internal {
+
+template <typename SrcType, typename TargetType, bool IsSameT>
+struct CoeffConv {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    return converter(impl.coeff(index));
+  }
+};
+
+template <typename SrcType, typename TargetType>
+struct CoeffConv<SrcType, TargetType, true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    return impl.coeff(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
+struct PacketConv {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = converter(impl.coeff(index+i));
+    }
+    TargetPacket rslt = internal::pload<TargetPacket>(values);
+    return rslt;
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+    PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
+                    SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+    return converter.template packet<LoadMode>(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
+    return internal::pload<TargetPacket>(values);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    return impl.template packet<LoadMode>(index);
+  }
+};
+
+}  // namespace internal
 
 // Eval as rvalue
 template<typename TargetType, typename ArgType, typename Device>
@@ -171,50 +315,98 @@
   typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename PacketType<SrcType, Device>::type PacketSourceType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = true,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      =
+    #ifndef EIGEN_USE_SYCL
+                        true,
+    #else
+                        TensorEvaluator<ArgType, Device>::PacketAccess &
+                        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+    #endif
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  struct TensorConversionOpBlockFactory {
+    template <typename ArgXprType>
+    struct XprType {
+      typedef TensorConversionOp<TargetType, const ArgXprType> type;
+    };
+
+    template <typename ArgXprType>
+    typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+      return typename XprType<ArgXprType>::type(expr);
+    }
+  };
+
+  typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
+                                         ArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
   {
-    if (internal::is_same<TargetType, SrcType>::value) {
-      return m_impl.evalSubExprsIfNeeded((SrcType*)data);
-    }
-    m_impl.evalSubExprsIfNeeded(NULL);
-    return true;
+    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
+                               EvaluatorPointerType,
+        EvalSubExprsCallback>::run(m_impl, data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup()
   {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    internal::scalar_cast_op<SrcType, TargetType> converter;
-    return converter(m_impl.coeff(index));
+    return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
   }
 
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
+  packet(Index index) const {
+    // If we are not going to do the cast, we just need to check that base
+    // TensorEvaluator has packet access. Otherwise we also need to make sure,
+    // that we have an implementation of vectorized cast.
     const bool Vectorizable =
-        TensorEvaluator<ArgType, Device>::PacketAccess &
-        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
-    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
+        IsSameType
+        ? TensorEvaluator<ArgType, Device>::PacketAccess
+        : int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+          int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
+
+    return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
+                                Vectorizable, IsSameType>::run(m_impl, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -232,42 +424,31 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
 
-  protected:
-   template <int LoadMode, bool ActuallyVectorize>
-   struct PacketConv {
-     static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-     run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-       internal::scalar_cast_op<SrcType, TargetType> converter;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_impl.block(desc, scratch),
+                         TensorConversionOpBlockFactory());
+  }
 
-       EIGEN_ALIGN_DEFAULT
-           typename internal::remove_const<CoeffReturnType>::type
-               values[PacketSize];
-       for (int i = 0; i < PacketSize; ++i) {
-         values[i] = converter(impl.coeff(index + i));
-       }
-       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-       return rslt;
-     }
-   };
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-   template <int LoadMode>
-   struct PacketConv<LoadMode, true> {
-     static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-     run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-       const int SrcCoeffRatio =
-           internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
-       const int TgtCoeffRatio =
-           internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
-       PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType,
-                       PacketReturnType, SrcCoeffRatio, TgtCoeffRatio>
-           converter(impl);
-       return converter.template packet<LoadMode>(index);
-     }
-   };
+  /// required by sycl in order to extract the sycl accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
-   TensorEvaluator<ArgType, Device> m_impl;
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
 };
 
 } // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index da78695..b20f80b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h

@@ -21,7 +21,7 @@
   */
 namespace internal {
 
-template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
 class IndexMapper {
  public:
   IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
@@ -54,8 +54,8 @@
       }
     }
 
-    array<Index, NumDims> cudaInputDimensions;
-    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> gpuInputDimensions;
+    array<Index, NumDims> gpuOutputDimensions;
     array<Index, NumDims> tmp = dimensions;
     array<Index, NumDims> ordering;
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -65,8 +65,8 @@
       const Index index = i + offset;
       ordering[index] = indices[i];
       tmp[indices[i]] = -1;
-      cudaInputDimensions[index] = input_dims[indices[i]];
-      cudaOutputDimensions[index] = dimensions[indices[i]];
+      gpuInputDimensions[index] = input_dims[indices[i]];
+      gpuOutputDimensions[index] = dimensions[indices[i]];
     }
 
     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -75,8 +75,8 @@
     for (int i = 0; i < NumDims; ++i) {
       if (tmp[i] >= 0) {
         ordering[written] = i;
-        cudaInputDimensions[written] = input_dims[i];
-        cudaOutputDimensions[written] = dimensions[i];
+        gpuInputDimensions[written] = input_dims[i];
+        gpuOutputDimensions[written] = dimensions[i];
         ++written;
       }
     }
@@ -89,107 +89,107 @@
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = 0; i < NumDims; ++i) {
         if (i > NumKernelDims) {
-          m_cudaInputStrides[i] =
-              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
-          m_cudaOutputStrides[i] =
-              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
+          m_gpuInputStrides[i] =
+              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+          m_gpuOutputStrides[i] =
+              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
         } else {
-          m_cudaInputStrides[i] = 1;
-          m_cudaOutputStrides[i] = 1;
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
         }
       }
     } else {
       for (int i = NumDims - 1; i >= 0; --i) {
-        if (i + 1 < offset) {
-          m_cudaInputStrides[i] =
-              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
-          m_cudaOutputStrides[i] =
-              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
+        if (static_cast<size_t>(i + 1) < offset) {
+          m_gpuInputStrides[i] =
+              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+          m_gpuOutputStrides[i] =
+              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
         } else {
-          m_cudaInputStrides[i] = 1;
-          m_cudaOutputStrides[i] = 1;
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
         }
       }
     }
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_cudaInputStrides[d];
+        const Index idx = p / m_gpuInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_cudaInputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
       }
       inputIndex += p * m_inputStrides[NumKernelDims];
     } else {
-      int limit = 0;
+      std::ptrdiff_t limit = 0;
       if (NumKernelDims < NumDims) {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_cudaInputStrides[d];
+        const Index idx = p / m_gpuInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_cudaInputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
       }
       inputIndex += p * m_inputStrides[limit];
     }
     return inputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
     Index outputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_cudaOutputStrides[d];
+        const Index idx = p / m_gpuOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_cudaOutputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[NumKernelDims];
     } else {
-      int limit = 0;
+      std::ptrdiff_t limit = 0;
       if (NumKernelDims < NumDims) {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_cudaOutputStrides[d];
+        const Index idx = p / m_gpuOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_cudaOutputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[limit];
     }
     return outputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -197,7 +197,7 @@
            k * m_inputStrides[offset + 2];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -206,11 +206,11 @@
   }
 
  private:
-  static const size_t NumDims = internal::array_size<InputDims>::value;
+  static const int NumDims = internal::array_size<InputDims>::value;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_outputStrides;
-  array<Index, NumDims> m_cudaInputStrides;
-  array<Index, NumDims> m_cudaOutputStrides;
+  array<Index, NumDims> m_gpuInputStrides;
+  array<Index, NumDims> m_gpuOutputStrides;
 };
 
 
@@ -221,7 +221,6 @@
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename promote_storage_type<typename InputXprType::Scalar,
                                         typename KernelXprType::Scalar>::ret Scalar;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                         typename traits<KernelXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<InputXprType>::Index,
@@ -232,9 +231,11 @@
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<InputXprType>::NumDimensions;
   static const int Layout = traits<InputXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
 
   enum {
-    Flags = 0,
+    Flags = 0
   };
 };
 
@@ -255,16 +256,13 @@
 
 
 template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                   typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
-  typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
-                                                  typename KernelXprType::PacketReturnType>::ret PacketReturnType;
   typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
@@ -301,18 +299,28 @@
   typedef typename XprType::Index Index;
   typedef DSizes<Index, NumDims> Dimensions;
 
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
   enum {
-    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned &
-                TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess &
-                   TensorEvaluator<KernelArgType, Device>::PacketAccess,
+    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -374,19 +382,14 @@
     }
   }
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     preloadKernel();
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
     if (m_local_kernel) {
       m_device.deallocate((void*)m_kernel);
@@ -413,7 +416,6 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
   {
-    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     Index indices[2] = {index, index+PacketSize-1};
     Index startInputs[2] = {0, 0};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -443,7 +445,7 @@
       convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
       return result;
     } else {
-      EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
+      EIGEN_ALIGN_MAX Scalar data[PacketSize];
       data[0] = Scalar(0);
       convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
       for (int i = 1; i < PacketSize-1; ++i) {
@@ -473,7 +475,7 @@
                                        PacketSize));
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
  private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -529,12 +531,11 @@
       m_local_kernel = false;
     } else {
       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
-      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
       typedef TensorEvalToOp<const KernelArgType> EvalTo;
       EvalTo evalToTmp(local, m_kernelArg);
-      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
-      const bool BlockAccess = false;
-      internal::TensorExecutor<const EvalTo, Device, PacketAccess, BlockAccess>::run(evalToTmp, m_device);
+      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
 
       m_kernel = local;
       m_local_kernel = true;
@@ -553,14 +554,14 @@
   KernelArgType m_kernelArg;
   const Scalar* m_kernel;
   bool m_local_kernel;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
 
 
 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
 
 template <int StaticKernelSize>
 struct GetKernelSize {
@@ -577,13 +578,17 @@
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int kernelSize, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -595,18 +600,18 @@
 
   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
     // Load inputs to shared memory
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.y * num_x_input;
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
     }
 
     __syncthreads();
 
     // Compute the convolution
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@@ -616,7 +621,7 @@
       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
         result += s[k + kernel_offset] * kernel[k];
       }
-      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
       buffer[tensor_index] = result;
     }
     __syncthreads();
@@ -625,14 +630,18 @@
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ __launch_bounds__(1024, 1) void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int numY, const int maxY, const int kernelSizeX,
     const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -649,7 +658,7 @@
 
   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
 
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.z * num_y_input;
 
     // Load inputs to shared memory
@@ -658,7 +667,7 @@
       const int input_offset = num_x_input * (j + plane_kernel_offset);
       #pragma unroll
       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
         s[i + input_offset] = eval.coeff(tensor_index);
       }
     }
@@ -666,7 +675,7 @@
     __syncthreads();
 
     // Convolution
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -682,7 +691,7 @@
             result += s[k + input_offset] * kernel[k + kernel_offset];
           }
         }
-        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
         buffer[tensor_index] = result;
       }
     }
@@ -692,7 +701,7 @@
 };
 
 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,
@@ -700,7 +709,11 @@
     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
     const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   // Load inputs to shared memory
   const int first_x = blockIdx.x * maxX;
@@ -717,13 +730,13 @@
 
   for (int p = 0; p < numPlanes; ++p) {
 
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
       }
@@ -735,7 +748,7 @@
     const int num_z_output = last_z - first_z + 1;
     const int num_y_output = last_y - first_y + 1;
     const int num_x_output = last_x - first_x + 1;
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -748,7 +761,7 @@
               }
             }
           }
-          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
           buffer[tensor_index] = result;
         }
       }
@@ -771,17 +784,21 @@
   typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
 
   enum {
-    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned &
-                TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -799,7 +816,7 @@
   }
 
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
   typedef typename InputArgType::Scalar Scalar;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
@@ -844,8 +861,7 @@
       typedef TensorEvalToOp<const KernelArgType> EvalTo;
       EvalTo evalToTmp(local, m_kernelArg);
       const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
-      const bool BlockAccess = false;
-      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess, BlockAccess>::run(evalToTmp, m_device);
+      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
 
       m_kernel = local;
       m_local_kernel = true;
@@ -864,9 +880,9 @@
     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
 
     const int maxSharedMem = m_device.sharedMemPerBlock();
-    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
-    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
-    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
+    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
     const int warpSize = 32;
 
     switch (NumKernelDims) {
@@ -885,29 +901,29 @@
         if (m_indices[0] == single_stride_dim) {
           // Maximum the reuse
           const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
-          maxX = (std::min<int>)(inner_dim, numX);
-          const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+          maxX = numext::mini<int>(inner_dim, numX);
+          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
           block_size.x = numext::mini(maxThreadsPerBlock, maxX);
-          block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
         }
         else {
           // Read as much as possible alongside the inner most dimension, that is the plane
           const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
-          const int maxP = (std::min<int>)(inner_dim, numP);
-          maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+          const int maxP = numext::mini<int>(inner_dim, numP);
+          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
 
           block_size.x = numext::mini(warpSize, maxX);
-          block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
         }
 
         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
         const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
 
-        dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y)));
+        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
 
 
         //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -918,15 +934,15 @@
             m_inputImpl.dimensions(), kernel_dims, indices);
         switch(kernel_size) {
           case 4: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
             break;
           }
           case 7: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
             break;
           }
           default: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
           }
         }
         break;
@@ -948,24 +964,24 @@
 
         // Snap maxX to warp size
         int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
-        const int maxX = (std::min<int>)(inner_dim, numX);
-        const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
-        const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+        const int maxX = numext::mini<int>(inner_dim, numX);
+        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
 
         dim3 block_size;
         block_size.x = numext::mini(1024, maxX);
-        block_size.y = (std::min<int>)(1024/block_size.x, maxY);
-        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
+        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
 
         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int num_y_blocks = ceil(numY, maxY);
         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
         const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
 
-        dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z)));
+        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
 
 
         //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -979,11 +995,11 @@
           case 4: {
             switch (kernel_size_y) {
               case 7: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                 break;
               }
               default: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                 break;
               }
             }
@@ -992,18 +1008,18 @@
           case 7: {
             switch (kernel_size_y) {
               case 4: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                 break;
               }
               default: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                 break;
               }
             }
             break;
           }
           default: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
             break;
           }
         }
@@ -1027,18 +1043,18 @@
         const int numZ = dimensions()[m_indices[idxZ]];
         const int numP = dimensions().TotalSize() / (numX*numY*numZ);
 
-        const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
-        const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
-        const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
 
         dim3 block_size;
         block_size.x = numext::mini(32, maxX);
         block_size.y = numext::mini(32, maxY);
-        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
+        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
 
         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -1049,7 +1065,7 @@
         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
             m_inputImpl.dimensions(), kernel_dims, indices);
 
-        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
         break;
       }
 
@@ -1095,7 +1111,7 @@
 
  private:
   // No assignment (copies are needed by the kernels)
-  TensorEvaluator& operator=(const TensorEvaluator&);
+  TensorEvaluator& operator = (const TensorEvaluator&);
 
   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
new file mode 100644
index 0000000..033318f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h

@@ -0,0 +1,544 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV1D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template <typename BooleanDim2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+    return (boolean_check[0] && boolean_check[1]);
+  }
+  void operator()(cl::sycl::nd_item<2> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for the for each plane in shered memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+    /// fill the shared memory
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+      for (size_t k = 0; k < kernelSize; ++k) {
+        result += (local_acc[k + index] * kernel_ptr[k]);
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV2D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for the for each plane in shered memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+      
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); 
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+        const size_t local_index = i + local_input_offset;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
+                                  in_range_dim1 && in_range_dim2)
+                                     ? device_evaluator.coeff(tensor_index)
+                                     : CoeffReturnType(0);
+      }
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // output offset start for each thread
+    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+
+      for (size_t j = 0; j < kernel_size[1]; j++) {
+        size_t kernel_offset = kernel_size[0] * j;
+        const size_t index =
+            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+        for (size_t i = 0; i < kernel_size[0]; i++) {
+          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
+        }
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+                                                             itemID.get_local_id(1) + output_offset[1]);
+
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV3D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<3> kernel_size;
+  const cl::sycl::range<3> input_range;
+  const size_t numP;
+
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+                         const size_t numP_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_),
+        numP(numP_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+
+    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
+
+    const auto output_offset =
+          cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
+
+    for (size_t p = 0; p < numP; p++) {
+      /// fill the shared memory
+      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+          size_t local_index_dim1 = (num_input[0] * j)  + local_index_dim2;
+          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+            const size_t local_index = local_index_dim1 + i;
+            const size_t tensor_index =
+                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
+          }
+        }
+      }
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+      // calculate the convolution
+
+      if (boundary_check(itemID.get_global_id() < input_range)) {
+        CoeffReturnType result = static_cast<CoeffReturnType>(0);
+        for (size_t k = 0; k < kernel_size[2]; k++) {
+          for (size_t j = 0; j < kernel_size[1]; j++) {
+            for (size_t i = 0; i < kernel_size[0]; i++) {
+              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+              const size_t local_index =
+                  ((i + itemID.get_local_id(0)) +
+                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
+              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
+            }
+          }
+        }
+        const size_t tensor_index =
+            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+        buffer_ptr[tensor_index] = result;
+      }
+
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+  }
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
+  typedef const Eigen::SyclDevice Device;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_indices(op.indices()),
+        m_buf(NULL),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+        m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (EvaluatorPointerType)m_device.get(
+          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate_temp(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate_temp(m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    typename KernelStorage::Type in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
+    typedef typename InputEvaluator::Dimensions InputDims;
+    switch (NumKernelDims) {
+      case 1: {
+        const size_t numX = dimensions()[m_indices[0]];
+        const size_t numP = dimensions().TotalSize() / numX;
+        const auto input_dim = std::array<size_t, 2>{numX, numP};
+        auto global_range = cl::sycl::range<2>{};
+        auto local_range = cl::sycl::range<2>{};
+        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 1> indices{{m_indices[0]}};
+        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+            ConvKernel;
+
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
+        break;
+      }
+
+      case 2: {
+        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numP = dimensions().TotalSize() / (numX * numY);
+        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+        const size_t local_memory_size =
+            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+        const array<Index, 2> kernel_dims{
+            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
+        break;
+      }
+
+      case 3: {
+        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+        const array<Index, 3> indices{
+            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+                                           m_kernelImpl.dimensions()[kernel_index[1]],
+                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        auto local_memory_range = (local_range + kernel_size - 1);
+        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
+        break;
+      }
+
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_kernelImpl.bind(cgh);
+    m_inputImpl.bind(cgh);
+    m_buf.bind(cgh);
+    m_kernel.bind(cgh);
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator &operator=(const TensorEvaluator &);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+  KernelArgType m_kernelArg;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
+  bool m_local_kernel;
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};  // namespace Eigen
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 183b109..195267c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h

@@ -28,35 +28,38 @@
   // model based on minimal reciprocal throughput numbers from Intel or
   // Agner Fog's tables would be better than what is there now.
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
     return internal::functor_traits<
-        internal::scalar_product_op<ArgType, ArgType>>::Cost;
+        internal::scalar_product_op<ArgType, ArgType> >::Cost;
   }
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
-    return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
+    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
   }
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
     return internal::functor_traits<
-        internal::scalar_quotient_op<ArgType, ArgType>>::Cost;
+        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
   }
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
-    return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
+    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
   }
   template <typename SrcType, typename TargetType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
     return internal::functor_traits<
-        internal::scalar_cast_op<SrcType, TargetType>>::Cost;
+        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
   }
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
       : bytes_loaded_(bytes_loaded),
         bytes_stored_(bytes_stored),
         compute_cycles_(compute_cycles) {}
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
                bool vectorized, double packet_size)
       : bytes_loaded_(bytes_loaded),
@@ -91,21 +94,21 @@
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
@@ -159,7 +162,8 @@
  public:
   // Scaling from Eigen compute cost to device cycles.
   static const int kDeviceCyclesPerComputeCycle = 1;
-  // Costs in device cycles.
+
+ // Costs in device cycles.
   static const int kStartupCycles = 100000;
   static const int kPerThreadCycles = 100000;
   static const int kTaskSize = 40000;
@@ -170,8 +174,11 @@
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
     double cost = totalCost(output_size, cost_per_coeff);
-    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
-    return numext::mini(max_threads, numext::maxi(1, threads));
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads,
+                        numext::maxi<int>(1, static_cast<int>(threads)));
   }
 
   // taskSize assesses parallel task size.
@@ -188,7 +195,7 @@
     // 11 is L2 cache latency on Haswell.
     // We don't know whether data is in L1, L2 or L3. But we are most interested
     // in single-threaded computational time around 100us-10ms (smaller time
-    // is too small for parallelization, larger time is not intersting
+    // is too small for parallelization, larger time is not interesting
     // either because we are probably using all available threads already).
     // And for the target time range, L2 seems to be what matters. Data set
     // fitting into L1 is too small to take noticeable time. Data set fitting

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index dbb6890..95a8a84 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h

@@ -30,16 +30,17 @@
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
 struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
 {
-  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
+  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
-struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, 1, typename eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >::type>
+struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
 {
   typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
 };
@@ -86,18 +87,26 @@
   typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<XprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
     m_dimensions = op.func().dimensions(op.expression());
@@ -105,21 +114,21 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<CoeffReturnType*>(
-          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+          m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_result != NULL) {
-      m_device.deallocate(m_result);
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result) {
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -134,22 +143,29 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_result.bind(cgh);
+  }
+#endif
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
-    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
-        data, m_dimensions);
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
     m_op.func().eval(m_op.expression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const ArgType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
 };
 
 
@@ -179,6 +195,8 @@
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
@@ -188,7 +206,7 @@
 };
 
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
-struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, 1, typename eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >::type>
+struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
 {
   typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
 };
@@ -241,18 +259,27 @@
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<LhsXprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
     m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
@@ -260,20 +287,21 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+        m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     if (m_result != NULL) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -292,18 +320,25 @@
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_result.bind(cgh);
+  }
+#endif
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
-    TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
     m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const XprType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 3c33015..96fa46c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h

@@ -21,14 +21,15 @@
   * Example:
   *    C.device(EIGEN_GPU) = A + B;
   *
-  * Todo: thread pools.
-  * Todo: operator +=, -=, *= and so on.
+  * Todo: operator *= and /=.
   */
 
 template <typename ExpressionType, typename DeviceType> class TensorDevice {
   public:
     TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
 
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
+
     template<typename OtherDerived>
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
@@ -64,91 +65,73 @@
     ExpressionType& m_expression;
 };
 
+/** \class TensorAsyncDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its
+ * argument asynchronously on the specified device. Currently only
+ * ThreadPoolDevice implements proper asynchronous execution, while the default
+ * and GPU devices just run the expression synchronously and call m_done() on
+ * completion..
+ *
+ * Example:
+ *    auto done = []() { ... expression evaluation done ... };
+ *    C.device(thread_pool_device, std::move(done)) = A + B;
+ */
+
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
+class TensorAsyncDevice {
+ public:
+  TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
+
+    Assign assign(m_expression, other);
+    Executor::run(assign, m_device);
+    m_done();
+
+    return *this;
+  }
+
+ protected:
+  const DeviceType& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+
 
 #ifdef EIGEN_USE_THREADS
-template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> {
-  public:
-    TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+template <typename ExpressionType, typename DoneCallback>
+class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
+ public:
+  TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
-      typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
-      Assign assign(m_expression, other);
-      internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
-      return *this;
-    }
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
-      typedef typename OtherDerived::Scalar Scalar;
-      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
-      Sum sum(m_expression, other);
-      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
-      Assign assign(m_expression, sum);
-      internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
-      return *this;
-    }
+    // WARNING: After assignment 'm_done' callback will be in undefined state.
+    Assign assign(m_expression, other);
+    Executor::runAsync(assign, m_device, std::move(m_done));
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
-      typedef typename OtherDerived::Scalar Scalar;
-      typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
-      Difference difference(m_expression, other);
-      typedef TensorAssignOp<ExpressionType, const Difference> Assign;
-      Assign assign(m_expression, difference);
-      internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
-      return *this;
-    }
+    return *this;
+  }
 
-  protected:
-    const ThreadPoolDevice& m_device;
-    ExpressionType& m_expression;
+ protected:
+  const ThreadPoolDevice& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
 };
 #endif
 
-#if defined(EIGEN_USE_GPU)
-template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
-{
-  public:
-    TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
-      typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
-      Assign assign(m_expression, other);
-      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
-      typedef typename OtherDerived::Scalar Scalar;
-      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
-      Sum sum(m_expression, other);
-      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
-      Assign assign(m_expression, sum);
-      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
-      typedef typename OtherDerived::Scalar Scalar;
-      typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
-      Difference difference(m_expression, other);
-      typedef TensorAssignOp<ExpressionType, const Difference> Assign;
-      Assign assign(m_expression, difference);
-      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
-      return *this;
-    }
-
-  protected:
-    const GpuDevice& m_device;
-    ExpressionType& m_expression;
-};
-#endif
-
-
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
new file mode 100644
index 0000000..f779239
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h

@@ -0,0 +1,6 @@
+
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
+#endif
+
+#include "TensorDeviceGpu.h"

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
new file mode 100644
index 0000000..46b9d3a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h

@@ -0,0 +1,104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    internal::aligned_free(buffer);
+  }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
+    return data;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+    // Running on the host CPU
+    return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 64;
+#else
+    // Running on a CUDA device
+    return 32;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+    // Running on the host CPU
+    return l1CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 48*1024; // FIXME : update this number for HIP
+#else
+    // Running on a CUDA device, return the amount of shared memory available.
+    return 48*1024;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+    // Running single threaded on the host CPU
+    return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return firstLevelCacheSize(); // FIXME : update this number for HIP
+#else
+    // Running on a CUDA device
+    return firstLevelCacheSize();
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+    // Running single threaded on the host CPU
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    // return 1 as major for HIP
+    return 1;
+#else
+    // Running on a CUDA device
+    return EIGEN_CUDA_ARCH / 100;
+#endif
+  }
+};
+
+}  // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
new file mode 100644
index 0000000..ec2e3cb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h

@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
+
+// This header file container defines fo gpu* macros which will resolve to
+// their equivalent hip* or cuda* versions depending on the compiler in use
+// A separate header (included at the end of this file) will undefine all 
+#include "TensorGpuHipCudaDefines.h"
+
+namespace Eigen {
+
+static const int kGpuScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP / CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const gpuStream_t& stream() const = 0;
+  virtual const gpuDeviceProp_t& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+class GpuDeviceProperties {
+ public:
+  GpuDeviceProperties() : 
+      initialized_(false), first_(true), device_properties_(nullptr) {}
+ 
+  ~GpuDeviceProperties() {
+    if (device_properties_) {
+      delete[] device_properties_;
+    }
+  }
+  
+  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
+    return device_properties_[device];
+  }
+
+  EIGEN_STRONG_INLINE bool isInitialized() const {
+    return initialized_;
+  }
+
+  void initialize() {
+    if (!initialized_) {
+      // Attempts to ensure proper behavior in the case of multiple threads
+      // calling this function simultaneously. This would be trivial to
+      // implement if we could use std::mutex, but unfortunately mutex don't
+      // compile with nvcc, so we resort to atomics and thread fences instead.
+      // Note that if the caller uses a compiler that doesn't support c++11 we
+      // can't ensure that the initialization is thread safe.
+      if (first_.exchange(false)) {
+        // We're the first thread to reach this point.
+        int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
+        if (status != gpuSuccess) {
+          std::cerr << "Failed to get the number of GPU devices: "
+                    << gpuGetErrorString(status)
+                    << std::endl;
+          gpu_assert(status == gpuSuccess);
+        }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
+        for (int i = 0; i < num_devices; ++i) {
+          status = gpuGetDeviceProperties(&device_properties_[i], i);
+          if (status != gpuSuccess) {
+            std::cerr << "Failed to initialize GPU device #"
+                      << i
+                      << ": "
+                      << gpuGetErrorString(status)
+                      << std::endl;
+            gpu_assert(status == gpuSuccess);
+          }
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        initialized_ = true;
+      } else {
+        // Wait for the other thread to inititialize the properties.
+        while (!initialized_) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        }
+      }
+    }
+  }
+
+ private:
+  volatile bool initialized_;
+  std::atomic<bool> first_;
+  gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+  if (!deviceProperties->isInitialized()) {
+    deviceProperties->initialize();
+  }
+  return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+  return GetGpuDeviceProperties().get(device);
+}
+
+static const gpuStream_t default_stream = gpuStreamDefault;
+
+class GpuStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    gpuGetDevice(&device_);
+  }
+  // Use the default stream on the specified device
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      gpuGetDevice(&device_);
+    } else {
+      int num_devices;
+      gpuError_t err = gpuGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+      gpu_assert(device < num_devices);
+      device_ = device;
+    }
+  }
+
+  virtual ~GpuStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const gpuStream_t& stream() const { return *stream_; }
+  const gpuDeviceProp_t& deviceProperties() const {
+    return GetGpuDeviceProperties(device_);
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    void* result;
+    err = gpuMalloc(&result, num_bytes);
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(buffer != NULL);
+    err = gpuFree(buffer);
+    gpu_assert(err == gpuSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const gpuStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
+    return stream_->stream();
+  }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void* scratchpad() const {
+    return stream_->scratchpad();
+  }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+    return stream_->semaphore();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
+                                      stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    gpuError_t err =
+        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    gpuError_t err =
+        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
+    return 32;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on hip/cuda devices.
+    return firstLevelCacheSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuStreamSynchronize(stream_->stream());
+    if (err != gpuSuccess) {
+      std::cerr << "Error detected in GPU stream: "
+                << gpuGetErrorString(err)
+                << std::endl;
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    gpu_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+    return stream_->deviceProperties().minor;
+  }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
+  // This function checks if the GPU runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#ifdef EIGEN_GPUCC
+    gpuError_t error = gpuStreamQuery(stream_->stream());
+    return (error == gpuSuccess) || (error == gpuErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#if defined(EIGEN_HIPCC)
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
+  gpu_assert(hipGetLastError() == hipSuccess);
+
+#else
+ 
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
+  gpu_assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+ 
+// FIXME: Should be device and kernel specific.
+#ifdef EIGEN_GPUCC
+static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  gpu_assert(status == gpuSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+}  // end namespace Eigen
+
+// undefine all the gpu* macros we defined at the beginning of the file
+#include "TensorGpuHipCudaUndefines.h"
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
new file mode 100644
index 0000000..df591c2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h

@@ -0,0 +1,1048 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+#include <unordered_set>
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+/// Cache all the device information needed
+struct SyclDeviceInfo {
+  SyclDeviceInfo(cl::sycl::queue queue)
+      : local_mem_type(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::local_mem_type>()),
+        max_work_item_sizes(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_work_item_sizes>()),
+        max_mem_alloc_size(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_mem_alloc_size>()),
+        max_compute_units(queue.get_device()
+                              .template get_info<
+                                  cl::sycl::info::device::max_compute_units>()),
+        max_work_group_size(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_work_group_size>()),
+        local_mem_size(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::local_mem_size>()),
+        platform_name(queue.get_device()
+                          .get_platform()
+                          .template get_info<cl::sycl::info::platform::name>()),
+        device_name(queue.get_device()
+                        .template get_info<cl::sycl::info::device::name>()),
+        device_vendor(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::vendor>()) {}
+
+  cl::sycl::info::local_mem_type local_mem_type;
+  cl::sycl::id<3> max_work_item_sizes;
+  unsigned long max_mem_alloc_size;
+  unsigned long max_compute_units;
+  unsigned long max_work_group_size;
+  size_t local_mem_size;
+  std::string platform_name;
+  std::string device_name;
+  std::string device_vendor;
+};
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+
+typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
+    -> decltype(cl::sycl::device::get_devices()) {
+#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
+  return {cl::sycl::device(cl::sycl::default_selector())};
+#else
+  std::vector<cl::sycl::device> supported_devices;
+  auto platform_list = cl::sycl::platform::get_platforms();
+  for (const auto &platform : platform_list) {
+    auto device_list = platform.get_devices();
+    auto platform_name =
+        platform.template get_info<cl::sycl::info::platform::name>();
+    std::transform(platform_name.begin(), platform_name.end(),
+                   platform_name.begin(), ::tolower);
+    for (const auto &device : device_list) {
+      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+      bool unsupported_condition =
+          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
+           vendor.find("apu") == std::string::npos) ||
+          (platform_name.find("experimental") != std::string::npos) ||
+          device.is_host();
+      if (!unsupported_condition) {
+        supported_devices.push_back(device);
+      }
+    }
+  }
+  return supported_devices;
+#endif
+}
+
+class QueueInterface {
+ public:
+  /// Creating device by using cl::sycl::selector or cl::sycl::device.
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(
+      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
+      unsigned num_threads = std::thread::hardware_concurrency())
+      : m_queue(dev_or_sel, handler),
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
+#endif
+        m_thread_pool(num_threads),
+        m_device_info(m_queue) {
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+    m_prog.build_with_kernel_type<DeviceOrSelector>();
+    auto f = [&](cl::sycl::handler &cgh) {
+      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
+                                        [=]() {})
+    };
+    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
+#endif
+  }
+
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(
+      const DeviceOrSelector &dev_or_sel,
+      unsigned num_threads = std::thread::hardware_concurrency())
+      : QueueInterface(dev_or_sel,
+                       [this](cl::sycl::exception_list l) {
+                         this->exception_caught_ = this->sycl_async_handler(l);
+                       },
+                       num_threads) {}
+
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
+#endif
+
+  /// Attach an existing buffer to the pointer map, Eigen will not reuse it
+  EIGEN_STRONG_INLINE void *attach_buffer(
+      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return static_cast<void *>(pMapper.add_pointer(buf));
+  }
+
+  /// Detach previously attached buffer
+  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfree<false>(p, pMapper);
+  }
+
+  /// Allocating device pointer. This pointer is actually an 8 bytes host
+  /// pointer used as key to access the sycl device buffer. The reason is that
+  /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
+  /// expressions. So we create a key pointer to be used in Eigen expression
+  /// construction. When we convert the Eigen construction into the sycl
+  /// construction we use this pointer as a key in our buffer_map and we make
+  /// sure that we dedicate only one buffer only for this pointer. The device
+  /// pointer would be deleted by calling deallocate function.
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+    if (align > 0) {
+      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+    }
+#endif
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+    if (align > 0) {
+      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+    }
+#endif
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    if (scratch_buffers.empty()) {
+      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+      ;
+    } else {
+      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
+        auto buff = pMapper.get_buffer(*it);
+        if (buff.get_size() >= num_bytes) {
+          auto ptr = *it;
+          scratch_buffers.erase(it);
+          return ptr;
+        } else {
+          ++it;
+        }
+      }
+      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+    }
+#else
+    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+#endif
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+      cl::sycl::access::mode::read_write, data_t>
+  get(data_t *data) const {
+    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+                                        data_t>
+          data) const {
+    return static_cast<data_t *>(data.get_virtual_pointer());
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.insert(p);
+#else
+    TensorSycl::internal::SYCLfree(p, pMapper);
+#endif
+  }
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE void deallocate_temp(
+      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
+    deallocate_temp(p.get_virtual_pointer());
+  }
+
+  /// This is used to deallocate the device pointer. p is used as a key inside
+  /// the map to find the device buffer and delete it.
+  EIGEN_STRONG_INLINE void deallocate(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfree(p, pMapper);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_all() const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfreeAll(pMapper);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.clear();
+#endif
+  }
+
+  /// The memcpyHostToDevice is used to copy the data from host to device
+  /// The destination pointer could be deleted before the copy happend which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(
+      void *dst, const void *src, size_t n,
+      std::function<void()> callback) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
+        write_accessor;
+    if (n == 0) {
+      if (callback) callback();
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
+      auto non_deleter = [](buffer_scalar_t const *) {};
+      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
+      cgh.copy(s_ptr, dst_acc);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    synchronize_and_callback(e, callback);
+  }
+
+  /// The memcpyDeviceToHost is used to copy the data from device to host.
+  /// The source pointer could be deleted before the copy happend which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+      void *dst, const void *src, size_t n,
+      std::function<void()> callback) const {
+    static const auto read_mode = cl::sycl::access::mode::read;
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
+        read_accessor;
+    if (n == 0) {
+      if (callback) callback();
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
+      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
+      auto non_deleter = [](buffer_scalar_t *) {};
+      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
+      cgh.copy(src_acc, s_ptr);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    synchronize_and_callback(e, callback);
+  }
+
+  /// The memcpy function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    static const auto read_mode = cl::sycl::access::mode::read;
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (n == 0) {
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
+      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+      cgh.copy(src_acc, dst_acc);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
+  }
+
+  /// the memset function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (n == 0) {
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
+      // The cast to uint8_t is here to match the behaviour of the standard
+      // memset. The cast to buffer_scalar_t is needed to match the type of the
+      // accessor (in case buffer_scalar_t is not uint8_t)
+      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
+  }
+
+  /// Get a range accessor to the virtual pointer's device memory. This range
+  /// accessor will allow access to the memory from the pointer to the end of
+  /// the buffer.
+  ///
+  /// NOTE: Inside a kernel the range accessor will always be indexed from the
+  /// start of the buffer, so the offset in the accessor is only used by
+  /// methods like handler::copy and will not be available inside a kernel.
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+  get_range_accessor(const void *ptr) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
+    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
+
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+
+    auto original_buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    const ptrdiff_t typed_offset = offset / sizeof(T);
+    eigen_assert(typed_offset >= 0);
+    const auto typed_size = original_buffer.get_size() / sizeof(T);
+    auto buffer = original_buffer.template reinterpret<
+        typename Eigen::internal::remove_const<T>::type>(
+        cl::sycl::range<1>(typed_size));
+    const ptrdiff_t size = buffer.get_count() - typed_offset;
+    eigen_assert(size >= 0);
+    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
+                               1, AcMd, global_access, is_place_holder>
+        placeholder_accessor_t;
+    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
+    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
+                                           cl::sycl::id<1>(typed_offset)),
+                    static_cast<size_t>(typed_offset),
+                    reinterpret_cast<std::intptr_t>(start_ptr));
+  }
+
+  /// Get a range accessor to the virtual pointer's device memory with a
+  /// specified size.
+  template <cl::sycl::access::mode AcMd, typename Index>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                     const Index n_bytes) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    eigen_assert(n_bytes >= 0);
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    auto buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset >= 0);
+    eigen_assert(offset + n_bytes <= buffer.get_size());
+    return buffer.template get_access<AcMd, global_access>(
+        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
+  }
+
+  /// Creation of sycl accessor for a buffer. This function first tries to find
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if
+  /// not, the function then adds an entry by creating a sycl buffer for that
+  /// particular pointer.
+  template <cl::sycl::access::mode AcMd>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_buffer(ptr)
+        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
+            cgh);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+      const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_buffer(ptr);
+  }
+
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_offset(ptr);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename Lhs,
+            typename Rhs, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+                                                  const Rhs &rhs, OutPtr outptr,
+                                                  Range thread_range,
+                                                  Index scratchSize,
+                                                  T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      lhs.bind(cgh);
+      rhs.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename OutPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+                                                 OutPtr &outptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+    template <typename OutScalar, typename sycl_kernel, typename InPtr,
+           typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+    m_queue.wait_and_throw();
+#else
+    m_queue.wait();
+#endif
+  }
+
+
+  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
+    set_latest_event(e);
+#ifndef EIGEN_SYCL_ASYNC_EXECUTION
+    synchronize();
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+                                              Index &rng, Index &GRange) const {
+    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                        static_cast<Index>(tileSize));
+    rng = n;
+    if (rng == 0) rng = static_cast<Index>(1);
+    GRange = rng;
+    if (tileSize > GRange)
+      tileSize = GRange;
+    else if (GRange > tileSize) {
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
+    Index max_workgroup_Size =
+        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size =
+        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                 static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[1] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
+    Index max_workgroup_Size =
+        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size =
+        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                 static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[2] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0)
+        global_range[2] += static_cast<Index>(local_range[2] - xMode);
+    }
+    pow_of_2 = static_cast<Index>(
+        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size /
+                                        (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return true;
+#else
+    return m_device_info.local_mem_type ==
+           cl::sycl::info::local_mem_type::local;
+#endif
+  }
+
+  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
+    return m_device_info.max_mem_alloc_size;
+  }
+
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return m_device_info.max_compute_units;
+  }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return m_device_info.max_work_group_size;
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+    return m_device_info.max_work_item_sizes;
+  }
+
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL doesnot have such concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return m_device_info.local_mem_size;
+  }
+
+  // This function returns the nearest power of 2 Work-group size which is <=
+  // maximum device workgroup size.
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return getPowerOfTwo(m_device_info.max_work_group_size, false);
+  }
+
+  EIGEN_STRONG_INLINE std::string getPlatformName() const {
+    return m_device_info.platform_name;
+  }
+
+  EIGEN_STRONG_INLINE std::string getDeviceName() const {
+    return m_device_info.device_name;
+  }
+
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+    return m_device_info.device_vendor;
+  }
+
+  // This function returns the nearest power of 2
+  // if roundup is true returns result>=wgsize
+  // else it return result <= wgsize
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+    if (roundUp) --wGSize;
+    wGSize |= (wGSize >> 1);
+    wGSize |= (wGSize >> 2);
+    wGSize |= (wGSize >> 4);
+    wGSize |= (wGSize >> 8);
+    wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+    wGSize |= (wGSize >> 32);
+#endif
+    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const {
+    if (!exception_caught_) {
+      synchronize();
+    }
+    return !exception_caught_;
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+    std::lock_guard<std::mutex> lock(event_mutex_);
+    return latest_events_[std::this_thread::get_id()];
+#else
+    eigen_assert(false);
+    return cl::sycl::event();
+#endif
+  }
+
+  // destructor
+  ~QueueInterface() {
+    pMapper.clear();
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.clear();
+#endif
+  }
+
+ protected:
+  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+    std::lock_guard<std::mutex> lock(event_mutex_);
+    latest_events_[std::this_thread::get_id()] = e;
+#else
+    EIGEN_UNUSED_VARIABLE(e);
+#endif
+  }
+
+  void synchronize_and_callback(cl::sycl::event e,
+                                const std::function<void()> &callback) const {
+    set_latest_event(e);
+    if (callback) {
+      auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+        cl::sycl::event(e).wait_and_throw();
+#else
+        cl::sycl::event(e).wait();
+#endif
+        callback();
+      };
+      m_thread_pool.Schedule(std::move(callback_));
+    } else {
+#ifdef EIGEN_EXCEPTIONS
+      m_queue.wait_and_throw();
+#else
+      m_queue.wait();
+#endif
+    }
+  }
+
+  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+    bool exception_caught = false;
+    for (const auto &e : exceptions) {
+      if (e) {
+        exception_caught = true;
+        EIGEN_THROW_X(e);
+      }
+    }
+    return exception_caught;
+  }
+
+  /// class members:
+  bool exception_caught_ = false;
+
+  mutable std::mutex pmapper_mutex_;
+
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+  mutable std::mutex event_mutex_;
+  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
+#endif
+
+  /// std::map is the container used to make sure that we create only one buffer
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of
+  /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
+  /// host we should manually deallocate it.
+  mutable TensorSycl::internal::PointerMapper pMapper;
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+  mutable std::unordered_set<void *> scratch_buffers;
+#endif
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  mutable cl::sycl::program m_prog;
+#endif
+
+  /// The thread pool is used to wait on events and call callbacks
+  /// asynchronously
+  mutable Eigen::ThreadPool m_thread_pool;
+
+  const TensorSycl::internal::SyclDeviceInfo m_device_info;
+};
+
+struct SyclDeviceBase {
+  /// QueueInterface is not owned. it is the caller's responsibility to destroy
+  /// it
+  const QueueInterface *m_queue_stream;
+  explicit SyclDeviceBase(const QueueInterface *queue_stream)
+      : m_queue_stream(queue_stream) {}
+  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
+    return m_queue_stream;
+  }
+};
+
+// Here is a sycl device struct which accept the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+  explicit SyclDevice(const QueueInterface *queue_stream)
+      : SyclDeviceBase(queue_stream) {}
+
+  // this is the accessor used to construct the evaluator
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+  get_range_accessor(const void *ptr) const {
+    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
+  }
+
+  // get sycl accessor
+  template <cl::sycl::access::mode AcMd>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
+  }
+
+  /// Accessing the created sycl device buffer for the device pointer
+  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+      const void *ptr) const {
+    return queue_stream()->get_sycl_buffer(ptr);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+                                              Index &rng, Index &GRange) const {
+    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// allocate device memory
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+    return queue_stream()->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+    return queue_stream()->allocate_temp(num_bytes);
+  }
+
+  /// deallocate device memory
+  EIGEN_STRONG_INLINE void deallocate(void *p) const {
+    queue_stream()->deallocate(p);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
+    queue_stream()->deallocate_temp(buffer);
+  }
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE void deallocate_temp(
+      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
+    queue_stream()->deallocate_temp(buffer);
+  }
+  EIGEN_STRONG_INLINE void deallocate_all() const {
+    queue_stream()->deallocate_all();
+  }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+      cl::sycl::access::mode::read_write, data_t>
+  get(data_t *data) const {
+    return queue_stream()->get(data);
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+                                        data_t>
+          data) const {
+    return queue_stream()->get(data);
+  }
+
+  /// attach existing buffer
+  EIGEN_STRONG_INLINE void *attach_buffer(
+      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+    return queue_stream()->attach_buffer(buf);
+  }
+  /// detach buffer
+  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+    queue_stream()->detach_buffer(p);
+  }
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    return queue_stream()->get_offset(ptr);
+  }
+
+  // some runtime conditions that can be applied here
+  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+  /// memcpyHostToDevice
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(
+      Index *dst, const Index *src, size_t n,
+      std::function<void()> callback = {}) const {
+    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+  }
+  /// memcpyDeviceToHost
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+      void *dst, const Index *src, size_t n,
+      std::function<void()> callback = {}) const {
+    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+  }
+  /// the memcpy function
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+    queue_stream()->memcpy(dst, src, n);
+  }
+  /// the memset function
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    queue_stream()->memset(data, c, n);
+  }
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
+    return queue_stream()->sycl_queue();
+  }
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
+    return queue_stream()->program();
+  }
+#endif
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return queue_stream()->getNumSyclMultiProcessors();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return queue_stream()->maxSyclThreadsPerBlock();
+  }
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+    return queue_stream()->maxWorkItemSizes();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL doesnot have such concept
+    return queue_stream()->maxSyclThreadsPerMultiProcessor();
+  }
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return queue_stream()->sharedMemPerBlock();
+  }
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+    return queue_stream()->getPowerOfTwo(val, roundUp);
+  }
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return queue_stream()->majorDeviceVersion();
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+    queue_stream()->synchronize();
+  }
+  EIGEN_STRONG_INLINE void async_synchronize(
+      cl::sycl::event e = cl::sycl::event()) const {
+    queue_stream()->async_synchronize(e);
+  }
+  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+    return queue_stream()->get_latest_event();
+  }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+    return queue_stream()->has_local_memory();
+  }
+  EIGEN_STRONG_INLINE long max_buffer_size() const {
+    return queue_stream()->max_buffer_size();
+  }
+  EIGEN_STRONG_INLINE std::string getPlatformName() const {
+    return queue_stream()->getPlatformName();
+  }
+  EIGEN_STRONG_INLINE std::string getDeviceName() const {
+    return queue_stream()->getDeviceName();
+  }
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+    return queue_stream()->getDeviceVendor();
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+};
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
new file mode 100644
index 0000000..e524b53
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h

@@ -0,0 +1,409 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+namespace Eigen {
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args> struct FunctionWrapperWithNotification
+{
+  static void run(Notification* n, Function f, Args... args) {
+    f(args...);
+    if (n) {
+      n->Notify();
+    }
+  }
+};
+
+template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
+{
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
+  if (n) {
+    n->Wait();
+  }
+}
+
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
+
+// Build a thread pool device on top the an existing pool of threads.
+struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+      : pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return allocator_ ? allocator_->allocate(num_bytes)
+        : internal::aligned_malloc(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    if (allocator_) {
+      allocator_->deallocate(buffer);
+    } else {
+      internal::aligned_free(buffer);
+    }
+  }
+
+    EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
+  }
+
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
+    ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
+    const size_t kMinBlockSize = 32768;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads < 2) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(static_cast<int>(num_threads - 1));
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
+                   numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
+  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+
+  EIGEN_STRONG_INLINE int numThreads() const {
+    return num_threads_;
+  }
+
+  // Number of theads available in the underlying thread pool. This number can
+  // be different from the value returned by numThreads().
+  EIGEN_STRONG_INLINE int numThreadsInPool() const {
+    return pool_->NumThreads();
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    return l1CacheSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // The l3 cache size is shared between all the cores.
+    return l3CacheSize() / num_threads_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
+                                            Args&&... args) const {
+    Notification* n = new Notification();
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
+                  std::move(f), args...));
+    return n;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
+                                                Args&&... args) const {
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
+                  std::move(f), args...));
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
+                                                 Args&&... args) const {
+    if (sizeof...(args) > 0) {
+      pool_->Schedule(std::bind(std::move(f), args...));
+    } else {
+      pool_->Schedule(std::move(f));
+    }
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const {
+    return pool_->CurrentThreadId();
+  }
+
+  // WARNING: This function is synchronous and will block the calling thread.
+  //
+  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+  // waits for completion. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<Index(Index)> block_align,
+                   std::function<void(Index, Index)> f) const {
+    if (EIGEN_PREDICT_FALSE(n <= 0)){
+      return;
+    // Compute small problems directly in the caller thread.
+    } else if (n == 1 || numThreads() == 1 ||
+               CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block.count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
+                                                  Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+      // Single block or less, execute directly.
+      f(firstIdx, lastIdx);
+      barrier.Notify();
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      handleRange(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+    }
+
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+  // WARNING: This function is asynchronous and will not block the calling thread.
+  //
+  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+  // without waiting for completion. When the last block finished, it will call
+  // 'done' callback. F accepts a half-open interval [first, last). Block size
+  // is chosen based on the iteration cost and resulting parallel efficiency. If
+  // block_align is not nullptr, it is called to round up the block size.
+  void parallelForAsync(Index n, const TensorOpCost& cost,
+                        std::function<Index(Index)> block_align,
+                        std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    // Compute small problems directly in the caller thread.
+    if (n <= 1 || numThreads() == 1 ||
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      done();
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    ParallelForAsyncContext* const ctx =
+        new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule(
+            [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+
+      // Single block or less, execute directly.
+      ctx->f(firstIdx, lastIdx);
+
+      // Delete async context if it was the last block.
+      if (ctx->count.fetch_sub(1) == 1) delete ctx;
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      ctx->handle_range(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+    }
+  }
+
+  // Convenience wrapper for parallelForAsync that does not align blocks.
+  void parallelForAsync(Index n, const TensorOpCost& cost,
+                        std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+  }
+
+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
+  // Allocator accessor.
+  Allocator* allocator() const { return allocator_; }
+
+ private:
+  typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+  // For parallelForAsync we must keep passed in closures on the heap, and
+  // delete them only after `done` callback finished.
+  struct ParallelForAsyncContext {
+    ParallelForAsyncContext(Index block_count,
+                            std::function<void(Index, Index)> block_f,
+                            std::function<void()> done_callback)
+        : count(block_count),
+          f(std::move(block_f)),
+          done(std::move(done_callback)) {}
+    ~ParallelForAsyncContext() { done(); }
+
+    std::atomic<Index> count;
+    std::function<void(Index, Index)> f;
+    std::function<void()> done;
+
+    std::function<void(Index, Index)> handle_range;
+  };
+
+  struct ParallelForBlock {
+    Index size;   // block size
+    Index count;  // number of blocks
+  };
+
+  // Calculates block size based on (1) the iteration cost and (2) parallel
+  // efficiency. We want blocks to be not too small to mitigate parallelization
+  // overheads; not too large to mitigate tail effect and potential load
+  // imbalance and we also want number of blocks to be evenly dividable across
+  // threads.
+  ParallelForBlock CalculateParallelForBlock(
+      const Index n, const TensorOpCost& cost,
+      std::function<Index(Index)> block_align) const {
+    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(
+               divup<Index>(n, max_oversharding_factor * numThreads()),
+               block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
+
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+
+    Index block_count = divup(n, block_size);
+
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) /
+        (divup<int>(block_count, numThreads()) * numThreads());
+
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count;
+         max_efficiency < 1.0 && prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = divup(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
+      const Index coarser_block_count = divup(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency =
+          static_cast<double>(coarser_block_count) /
+          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
+      }
+    }
+
+    return {block_size, block_count};
+  }
+
+  ThreadPoolInterface* pool_;
+  int num_threads_;
+  Allocator* allocator_;
+};
+
+
+}  // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
index 19e922f..1a30e45 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h

@@ -23,6 +23,7 @@
   */
 
 template <typename Index, std::size_t Rank> struct DimensionList {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   const Index operator[] (const Index i) const { return i; }
 };
 
@@ -35,194 +36,194 @@
   static const size_t value = Rank;
 };
 
-template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>& a) {
+template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
   return n;
 }
-template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>& a) {
+template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
   return n;
 }
 
 
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
 template <typename Index, std::size_t Rank>
-struct index_known_statically<DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex) const {
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
     return true;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_known_statically<const DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex) const {
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
     return true;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct all_indices_known_statically<DimensionList<Index, Rank> > {
-  constexpr bool operator() () const {
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return true;
   }
 };
 template <typename Index, std::size_t Rank>
-struct all_indices_known_statically<const DimensionList<Index, Rank> > {
-  constexpr bool operator() () const {
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return true;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase<DimensionList<Index, Rank> > {
-  constexpr bool operator() () const {
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return true;
   }
 };
 template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > {
-  constexpr bool operator() () const {
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return true;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_eq<DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+  static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i == value;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_eq<const DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i == value;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_ne<DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i != value;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_ne<const DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+  static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i != value;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_gt<DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i > value;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_gt<const DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i > value;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_lt<DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i < value;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_lt<const DimensionList<Index, Rank> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return i < value;
   }
 };
 
 #else
 template <typename Index, std::size_t Rank>
-struct index_known_statically<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const {
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
     return true;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_known_statically<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const {
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
     return true;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct all_indices_known_statically<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() () const {
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
     return true;
   }
 };
 template <typename Index, std::size_t Rank>
-struct all_indices_known_statically<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() () const {
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
     return true;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() () const {
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
     return true;
   }
 };
 template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() () const {
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
     return true;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_eq<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_eq<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_ne<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){
     return false;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_ne<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_gt<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_gt<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };
 
 template <typename Index, std::size_t Rank>
-struct index_statically_lt<DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };
 template <typename Index, std::size_t Rank>
-struct index_statically_lt<const DimensionList<Index, Rank> > {
-  EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex i, const DenseIndex value) const {
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
     return false;
   }
 };

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 6a65cd9..f0f1e83 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h

@@ -32,16 +32,16 @@
 // Boilerplate code
 namespace internal {
 
-template<std::size_t n, typename Dimension> struct dget {
-  static const std::size_t value = get<n, typename Dimension::Base>::value;
+template<std::ptrdiff_t n, typename Dimension> struct dget {
+  static const std::ptrdiff_t value = get<n, Dimension>::value;
 };
 
 
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const& indices,
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
                           const Dimensions& dimensions)
   {
     return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
@@ -50,26 +50,25 @@
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const& indices,
-                          const Dimensions&)
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
   {
     return 0;
   }
 };
 
-template<typename Index, std::size_t n>
+template<typename Index, std::ptrdiff_t n>
 struct fixed_size_tensor_index_extraction_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index index,
+  static EIGEN_STRONG_INLINE Index run(const Index index,
                           const Dimensions& dimensions)
   {
-    const Index mult = (index == n) ? 1 : 0;
-    return array_get<n>(dimensions) * mult +
+    const Index mult = (index == n-1) ? 1 : 0;
+    return array_get<n-1>(dimensions) * mult +
         fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
   }
 };
@@ -78,40 +77,41 @@
 struct fixed_size_tensor_index_extraction_helper<Index, 0>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index index,
-                          const Dimensions& dimensions)
+  static EIGEN_STRONG_INLINE Index run(const Index,
+                          const Dimensions&)
   {
-    const Index mult = (index == 0) ? 1 : 0;
-    return array_get<0>(dimensions) * mult;
+    return 0;
   }
-};
+  };
 
 }  // end namespace internal
 
 
 // Fixed size
 #ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices>
-struct Sizes : internal::numeric_list<std::size_t, Indices...> {
-  typedef internal::numeric_list<std::size_t, Indices...> Base;
-  static const std::size_t total_size = internal::arg_prod(Indices...);
+template <typename std::ptrdiff_t... Indices>
+struct Sizes {
+  typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+  const Base t = Base();
+  static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
+  static const ptrdiff_t count = Base::count;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
     return Base::count;
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
     return internal::arg_prod(Indices...);
   }
 
-  Sizes() { }
+  EIGEN_DEVICE_FUNC Sizes() { }
   template <typename DenseIndex>
-  explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+  explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
     // todo: add assertion
   }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  template <typename... DenseIndex> Sizes(DenseIndex...) { }
-  explicit Sizes(std::initializer_list<std::size_t> /*l*/) {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
+  explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
     // todo: add assertion
   }
 #endif
@@ -121,80 +121,80 @@
     return *this;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const int index) const {
-    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count - 1>::run(index, *this);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
+  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
+  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
   }
 };
 
 namespace internal {
-template <typename std::size_t... Indices>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) {
+template <typename std::ptrdiff_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
   return Sizes<Indices...>::total_size;
 }
 }
 
 #else
 
-template <std::size_t n>
+template <std::ptrdiff_t n>
 struct non_zero_size {
-  typedef internal::type2val<std::size_t, n> type;
+  typedef internal::type2val<std::ptrdiff_t, n> type;
 };
 template <>
 struct non_zero_size<0> {
   typedef internal::null_type type;
 };
 
-template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
+template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
   typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
-  static const size_t count = Base::count;
-  static const std::size_t total_size = internal::arg_prod<Base>::value;
+  static const std::ptrdiff_t count = Base::count;
+  static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
     return count;
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
     return internal::arg_prod<Base>::value;
   }
 
   Sizes() { }
   template <typename DenseIndex>
-  explicit Sizes(const array<DenseIndex, Base::count>& indices) {
+  explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
     // todo: add assertion
   }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  template <typename... DenseIndex> Sizes(DenseIndex... indices) { }
-  explicit Sizes(std::initializer_list<std::size_t> l) {
-    // todo: add assertion
-  }
-#else
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
-  }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
-  }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
-  }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
-  }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
-  }
-#endif
-
-  template <typename T> Sizes& operator = (const T& other) {
-    // to do: check the size of other
+  template <typename T> Sizes& operator = (const T& /*other*/) {
+    // add assertion failure if the size of other is different
     return *this;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator[] (const int index) const {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
+  explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
+    // todo: add assertion
+  }
+#else
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+  }
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const {
     switch (index) {
       case 0:
         return internal::get<0, Base>::value;
@@ -208,23 +208,23 @@
         return internal::get<4, Base>::value;
       default:
         eigen_assert(false && "index overflow");
-        return static_cast<std::size_t>(-1);
+        return static_cast<Index>(-1);
     }
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *this);
+  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *this);
+  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
   }
 };
 
 namespace internal {
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
   return Sizes<V1, V2, V3, V4, V5>::total_size;
 }
 }
@@ -233,7 +233,7 @@
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct tensor_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -245,7 +245,7 @@
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -264,12 +264,12 @@
   typedef array<DenseIndex, NumDims> Base;
   static const int count = NumDims;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
     return NumDims;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
-    return internal::array_prod(*static_cast<const Base*>(this));
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
+    return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
@@ -277,7 +277,12 @@
       (*this)[i] = 0;
     }
   }
-  EIGEN_DEVICE_FUNC DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+  EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+
+  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
+    eigen_assert(NumDims == 1);
+    (*this)[0] = i0;
+  }
 
   EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
     for (int i = 0 ; i < NumDims; ++i) {
@@ -285,15 +290,44 @@
     }
   }
 
+  // Enable DSizes index type promotion only if we are promoting to the
+  // larger type, e.g. allow to promote dimensions of type int to long.
+  template<typename OtherIndex>
+  EIGEN_DEVICE_FUNC
+  explicit DSizes(const array<OtherIndex, NumDims>& other,
+                  // Default template parameters require c++11.
+                  typename internal::enable_if<
+                     internal::is_same<
+                         DenseIndex,
+                         typename internal::promote_index_type<
+                             DenseIndex,
+                             OtherIndex
+                         >::type
+                     >::value, void*>::type = 0) {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = static_cast<DenseIndex>(other[i]);
+    }
+  }
+
+#ifdef EIGEN_HAS_INDEX_LIST
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC
+  explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    for (int i = 0; i < dimensions.count; ++i) {
+      (*this)[i] = dimensions[i];
+    }
+  }
+#endif
+
 #ifndef EIGEN_EMULATE_CXX11_META_H
-  template <typename std::size_t... Indices>
+  template <typename std::ptrdiff_t... Indices>
   EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
     for (int i = 0 ; i < NumDims; ++i) {
       (*this)[i] = a[i];
     }
   }
 #else
-  template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+  template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
   EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
     for (int i = 0 ; i < NumDims; ++i) {
       (*this)[i] = a[i];
@@ -301,36 +335,31 @@
   }
 #endif
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) {
-    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    (*this) = array<DenseIndex, NumDims>{firstDimension, otherDimensions...};
+  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
   }
 #else
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
-    eigen_assert(NumDims == 1);
-    (*this)[0] = i0;
-  }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
     eigen_assert(NumDims == 2);
     (*this)[0] = i0;
     (*this)[1] = i1;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
     eigen_assert(NumDims == 3);
     (*this)[0] = i0;
     (*this)[1] = i1;
     (*this)[2] = i2;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
     eigen_assert(NumDims == 4);
     (*this)[0] = i0;
     (*this)[1] = i1;
     (*this)[2] = i2;
     (*this)[3] = i3;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
     eigen_assert(NumDims == 5);
     (*this)[0] = i0;
     (*this)[1] = i1;
@@ -346,20 +375,29 @@
   }
 
   // A constexpr would be so much better here
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
     return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
     return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
   }
 };
 
-
-
+template <typename IndexType, int NumDims>
+std::ostream& operator<<(std::ostream& os,
+                         const DSizes<IndexType, NumDims>& dims) {
+  os << "[";
+  for (int i = 0; i < NumDims; ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct tensor_vsize_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -371,7 +409,7 @@
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -383,164 +421,58 @@
 }  // end namespace internal
 
 
-template <typename DenseIndex>
-struct VSizes : std::vector<DenseIndex> {
-  typedef std::vector<DenseIndex> Base;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
-    return Base::size();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
-    return internal::array_prod(*static_cast<const Base*>(this));
-  }
-
-  EIGEN_DEVICE_FUNC VSizes() { }
-  EIGEN_DEVICE_FUNC explicit VSizes(const std::vector<DenseIndex>& a) : Base(a) { }
-
-  template <std::size_t NumDims>
-  EIGEN_DEVICE_FUNC explicit VSizes(const array<DenseIndex, NumDims>& a) {
-    this->resize(NumDims);
-    for (int i = 0; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-  }
-  template <std::size_t NumDims>
-  EIGEN_DEVICE_FUNC explicit VSizes(const DSizes<DenseIndex, NumDims>& a) {
-    this->resize(NumDims);
-    for (int i = 0; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-  }
-
-  EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) {
-    this->resize(1);
-    (*this)[0] = i0;
-  }
-  EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) {
-    this->resize(2);
-    (*this)[0] = i0;
-    (*this)[1] = i1;
-  }
-  EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
-    this->resize(3);
-    (*this)[0] = i0;
-    (*this)[1] = i1;
-    (*this)[2] = i2;
-  }
-  EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
-    this->resize(4);
-    (*this)[0] = i0;
-    (*this)[1] = i1;
-    (*this)[2] = i2;
-    (*this)[3] = i3;
-  }
-  EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
-    this->resize(5);
-    (*this)[0] = i0;
-    (*this)[1] = i1;
-    (*this)[2] = i2;
-    (*this)[3] = i3;
-    (*this)[4] = i4;
-  }
-
-  EIGEN_DEVICE_FUNC VSizes& operator = (const std::vector<DenseIndex>& other) {
-    *static_cast<Base*>(this) = other;
-    return *this;
-  }
-  template <std::size_t NumDims>
-  EIGEN_DEVICE_FUNC VSizes& operator = (const array<DenseIndex, NumDims>& a) {
-    this->resize(NumDims);
-    for (int i = 0; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-    return *this;
-  }
-  template <std::size_t NumDims>
-  EIGEN_DEVICE_FUNC VSizes& operator = (const DSizes<DenseIndex, NumDims>& a) {
-    this->resize(NumDims);
-    for (int i = 0; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-    return *this;
-  }
-
-  // A constexpr would be so much better here
-  template <std::size_t NumDims>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
-    return internal::tensor_vsize_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
-  }
-  template <std::size_t NumDims>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
-    return internal::tensor_vsize_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
-  }
-};
-
-
-// Boilerplate
-namespace internal {
-template <typename DenseIndex>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes<DenseIndex>& sizes) {
-  DenseIndex total_size = 1;
-  for (int i = 0; i < sizes.size(); ++i) {
-    total_size *= sizes[i];
-  }
-  return total_size;
-};
-}
-
 namespace internal {
 
 template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
-  static const size_t value = NumDims;
+  static const ptrdiff_t value = NumDims;
 };
 template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
-  static const size_t value = NumDims;
-};
-template <typename DenseIndex>
-struct array_size<VSizes<DenseIndex> > {
-  static const ptrdiff_t value = -1;
+  static const ptrdiff_t value = NumDims;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > {
-static const size_t value = Sizes<Indices...>::count;
+template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
 };
-template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > {
-static const size_t value = Sizes<Indices...>::count;
+template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
 };
-template <std::size_t n, typename std::size_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
-  return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
+template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
+  return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
+}
+template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
+  eigen_assert(false && "should never be called");
+  return -1;
 }
 #else
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
-  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
-  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) {
+template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
   return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
 }
 
 #endif
 
 
-template <typename Dims1, typename Dims2, size_t n, size_t m>
+template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
 struct sizes_match_below_dim {
-  static inline bool run(Dims1& dims1, Dims2& dims2) {
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return false;
   }
 };
-template <typename Dims1, typename Dims2, size_t n>
+template <typename Dims1, typename Dims2, ptrdiff_t n>
 struct sizes_match_below_dim<Dims1, Dims2, n, n> {
-  static inline bool run(Dims1& dims1, Dims2& dims2) {
-    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
+    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
         sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
   }
 };
 template <typename Dims1, typename Dims2>
 struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
-  static inline bool run(Dims1& dims1, Dims2& dims2) {
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return true;
   }
 };
@@ -549,41 +481,10 @@
 
 
 template <typename Dims1, typename Dims2>
-bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
   return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
 }
 
-template <typename IndexType, typename Dims2>
-bool dimensions_match(const VSizes<IndexType>& dims1, Dims2& dims2) {
-  if (dims1.size() != internal::array_size<Dims2>::value) {
-    return false;
-  }
-  for (int i = 0; i < internal::array_size<Dims2>::value; ++i) {
-    if (dims1[i] != dims2[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <typename Dims1, typename IndexType>
-bool dimensions_match(Dims1& dims1, const VSizes<IndexType>& dims2) {
-  if (internal::array_size<Dims1>::value != dims2.size()) {
-    return false;
-  }
-  for (int i = 0; i < internal::array_size<Dims1>::value; ++i) {
-    if (dims1[i] != dims2[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <typename IndexType>
-bool dimensions_match(const VSizes<IndexType>& dims1, const VSizes<IndexType>& dims2) {
-  return dims1 == dims2;
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 5babec6..a48d035 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h

@@ -20,8 +20,8 @@
   *
   */
 namespace internal {
-template<typename XprType>
-struct traits<TensorEvalToOp<XprType> >
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorEvalToOp<XprType, MakePointer_> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename XprType::Scalar Scalar;
@@ -32,22 +32,31 @@
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename MakePointer_<Scalar>::Type PointerType;
 
   enum {
-    Flags = 0,
+    Flags = 0
+  };
+  template <class T>
+  struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+
+
   };
 };
 
-template<typename XprType>
-struct eval<TensorEvalToOp<XprType>, Eigen::Dense>
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
 {
-  typedef const TensorEvalToOp<XprType>& type;
+  typedef const TensorEvalToOp<XprType, MakePointer_>& type;
 };
 
-template<typename XprType>
-struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type>
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
 {
-  typedef TensorEvalToOp<XprType> type;
+  typedef TensorEvalToOp<XprType, MakePointer_> type;
 };
 
 }  // end namespace internal
@@ -55,68 +64,99 @@
 
 
 
-template<typename XprType>
-class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
+template<typename XprType, template <class> class MakePointer_>
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
   typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr)
+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}
 
     EIGEN_DEVICE_FUNC
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; }
+    EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
 
   protected:
     typename XprType::Nested m_xpr;
-    CoeffReturnType* m_buffer;
+    PointerType m_buffer;
 };
 
 
 
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
 {
-  typedef TensorEvalToOp<ArgType> XprType;
+  typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
   typedef typename ArgType::Scalar Scalar;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
-
-  enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer())
-  { }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
-  }
-
   typedef typename XprType::Index Index;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  enum {
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
+  };
+
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef internal::TensorBlockAssignment<
+      CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
+      TensorBlockAssignment;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
+
+
+  EIGEN_STRONG_INLINE ~TensorEvaluator() {
+  }
+
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
-    assert(scalar == NULL);
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }
 
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType scalar, EvalSubExprsCallback done) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
+  }
+#endif
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
@@ -124,7 +164,34 @@
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    // Add `m_buffer` as destination buffer to the block descriptor.
+    desc.template AddDestinationBuffer<Layout>(
+        /*dst_base=*/m_buffer + desc.offset(),
+        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
+
+    ArgTensorBlock block =
+        m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
+
+    // If block was evaluated into a destination buffer, there is no need to do
+    // an assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
+              m_buffer, desc.offset()),
+          block.expr());
+    }
+    block.cleanup();
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -134,13 +201,11 @@
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     // We assume that evalPacket or evalScalar is called to perform the
     // assignment and account for the cost of the write here.
@@ -148,10 +213,21 @@
         TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
+  ArgType expression() const { return m_expression; }
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_buffer.bind(cgh);
+  }
+  #endif
+
+
  private:
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
-  CoeffReturnType* m_buffer;
+  EvaluatorPointerType m_buffer;
+  const ArgType m_expression;
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 6b93634..3aff7fa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h

@@ -24,90 +24,109 @@
   */
 
 // Generic evaluator
-template <typename Derived, typename Device>
-struct TensorEvaluator {
+template<typename Derived, typename Device>
+struct TensorEvaluator
+{
   typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef Derived XprType;
+  static const int PacketSize =  PacketType<CoeffReturnType, Device>::size;
+  typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   // NumDimensions is -1 for variable dim tensors
-  static const int NumCoords = internal::traits<Derived>::NumDimensions;
-  static const int SafeNumCoords = NumCoords >= 0 ? NumCoords : 0;
+  static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+                               internal::traits<Derived>::NumDimensions : 0;
 
   enum {
-    IsAligned = Derived::IsAligned,
-    PacketAccess = Derived::PacketAccess,
-    BlockAccess = internal::is_arithmetic<
-                      typename internal::remove_const<Scalar>::type>::value &&
-                  NumCoords >= 0,
-    Layout = Derived::Layout,
-    CoordAccess = NumCoords >= 0,
-    RawAccess = true
+    IsAligned          = Derived::IsAligned,
+    PacketAccess       = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess        = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+    PreferBlockAccess  = false,
+    Layout             = Derived::Layout,
+    CoordAccess        = NumCoords > 0,
+    RawAccess          = true
   };
 
-  typedef typename internal::TensorBlock<
-      Index, typename internal::remove_const<Scalar>::type, SafeNumCoords,
-      Layout>
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
       TensorBlock;
-  typedef typename internal::TensorBlockReader<
-      Index, typename internal::remove_const<Scalar>::type, SafeNumCoords,
-      Layout, PacketAccess>
-      TensorBlockReader;
-  typedef typename internal::TensorBlockWriter<
-      Index, typename internal::remove_const<Scalar>::type, SafeNumCoords,
-      Layout, PacketAccess>
-      TensorBlockWriter;
+  //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m,
-                                                        const Device& device)
-      : m_data(const_cast<Scalar*>(m.data())),
+  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
         m_dims(m.dimensions()),
-        m_device(device) {}
+        m_device(device)
+  { }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_dims;
-  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(
-      CoeffReturnType* dest) {
-    if (dest) {
-      m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
+      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {}
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
-  coeff(Index index) const {
-    eigen_assert(m_data);
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
     return m_data[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
-    eigen_assert(m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+    eigen_assert(m_data != NULL);
     return m_data[index];
   }
 
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-  packet(Index index) const {
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketReturnType packet(Index index) const
+  {
     return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
   }
 
-  template <int StoreMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(
-      Index index, const PacketReturnType& x) {
-    return internal::pstoret<Scalar, PacketReturnType, StoreMode>(
-        m_data + index, x);
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+  {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
-  coeff(const array<Index, SafeNumCoords>& coords) const {
-    eigen_assert(m_data);
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x)
+  {
+    return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -115,9 +134,9 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(
-      const array<Index, SafeNumCoords>& coords) {
-    eigen_assert(m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
+  coeffRef(const array<DenseIndex, NumCoords>& coords) {
+    eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -125,337 +144,507 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {}
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
     assert(m_data != NULL);
-    TensorBlockReader::Run(block, m_data);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
   }
 
+  template<typename TensorBlock>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlock& block) {
+      const TensorBlockDesc& desc, const TensorBlock& block) {
     assert(m_data != NULL);
-    TensorBlockWriter::Run(block, m_data);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
+                                            Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(m_dims), m_data,
+                                  desc.offset()),
+        block.expr());
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
  protected:
-  Scalar* m_data;
+  EvaluatorPointerType m_data;
   Dimensions m_dims;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 namespace {
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T loadConstant(const T* address) {
   return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float loadConstant(const float* address) {
   return __ldg(address);
 }
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(
-    const double* address) {
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double loadConstant(const double* address) {
   return __ldg(address);
 }
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+  return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
+}
+#endif
+#ifdef EIGEN_USE_SYCL
+// overload of load constant should be implemented here based on range access
+template <cl::sycl::access::mode AcMd, typename T>
+T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
+  return *address;
+}
 #endif
 }
 
+
 // Default evaluator for rvalues
-template <typename Derived, typename Device>
-struct TensorEvaluator<const Derived, Device> {
+template<typename Derived, typename Device>
+struct TensorEvaluator<const Derived, Device>
+{
   typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef const Derived XprType;
+  typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
+  typedef StorageMemory<const Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
   // NumDimensions is -1 for variable dim tensors
-  static const int NumCoords = internal::traits<Derived>::NumDimensions;
-  static const int SafeNumCoords = NumCoords >= 0 ? NumCoords : 0;
+  static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+                               internal::traits<Derived>::NumDimensions : 0;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = Derived::IsAligned,
-    PacketAccess = Derived::PacketAccess,
-    BlockAccess = internal::is_arithmetic<
-                      typename internal::remove_const<Scalar>::type>::value &&
-                  NumCoords >= 0,
-    Layout = Derived::Layout,
-    CoordAccess = NumCoords >= 0,
-    RawAccess = true
+    IsAligned         = Derived::IsAligned,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<ScalarNoConst>::value,
+    PreferBlockAccess = false,
+    Layout            = Derived::Layout,
+    CoordAccess       = NumCoords > 0,
+    RawAccess         = true
   };
 
-  // TODO(andydavis) Add block/writeBlock accessors to Tensor and TensorMap so
-  // we can default BlockAccess to true above.
-  typedef typename internal::TensorBlock<
-      Index, typename internal::remove_const<Scalar>::type, SafeNumCoords,
-      Layout>
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
       TensorBlock;
-  typedef typename internal::TensorBlockReader<
-      Index, typename internal::remove_const<Scalar>::type, SafeNumCoords,
-      Layout, PacketAccess>
-      TensorBlockReader;
+  //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m,
-                                                        const Device& device)
-      : m_data(m.data()), m_dims(m.dimensions()), m_device(device) {}
+  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
+  { }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_dims;
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(
-      CoeffReturnType* data) {
-    if (internal::is_arithmetic<
-            typename internal::remove_const<Scalar>::type>::value &&
-        data) {
-      m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
+      m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
-  coeff(Index index) const {
-    eigen_assert(m_data);
-    return loadConstant(m_data + index);
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return loadConstant(m_data+index);
   }
 
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-  packet(Index index) const {
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketReturnType packet(Index index) const
+  {
     return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
-  coeff(const array<Index, SafeNumCoords>& coords) const {
-    eigen_assert(m_data);
-    const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
-                            ? m_dims.IndexOfColMajor(coords)
-                            : m_dims.IndexOfRowMajor(coords);
-    return loadConstant(m_data + index);
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+  {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
+    const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
+                        : m_dims.IndexOfRowMajor(coords);
+    return loadConstant(m_data+index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+                        PacketType<CoeffReturnType, Device>::size);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
+ protected:
+  EvaluatorPointerType m_data;
+  Dimensions m_dims;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+
+
+
+// -------------------- CwiseNullaryOp --------------------
+
+template<typename NullaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
+{
+  typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
+    #ifdef EIGEN_USE_SYCL
+    &&  (PacketType<CoeffReturnType, Device>::size >1)
+    #endif
+    ,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    done(true);
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_wrapper(m_functor, index);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
-    assert(m_data != NULL);
-    TensorBlockReader::Run(block, m_data);
+  EIGEN_DEVICE_FUNC  EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_argImpl.bind(cgh);
   }
-
-  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
-
- protected:
-  const Scalar* m_data;
-  Dimensions m_dims;
-  const Device& m_device;
-};
-
-// -------------------- CwiseNullaryOp --------------------
-
-template <typename NullaryOp, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> {
-  typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
-
-  enum {
-    IsAligned = true,
-    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC
-  TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() {}
-
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
-  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
-  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
-    return m_argImpl.dimensions();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(
-      CoeffReturnType*) {
-    return true;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {}
-
-  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
-    return m_wrapper(m_functor, index);
-  }
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-  packet(Index index) const {
-    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
-  }
-
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double functor_cost = internal::functor_traits<NullaryOp>::Cost;
-    return m_argImpl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
-  }
+#endif
 
  private:
   const NullaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
-  const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 
+
+
 // -------------------- CwiseUnaryOp --------------------
 
-template <typename UnaryOp, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> {
+template<typename UnaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
+{
   typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
-                   internal::functor_traits<UnaryOp>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned          = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess       = int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+                         int(internal::functor_traits<UnaryOp>::PacketAccess),
+    BlockAccess        = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess  = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout             = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess        = false,  // to be implemented
+    RawAccess          = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) {}
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_device(device),
+      m_functor(op.functor()),
+      m_argImpl(op.nestedExpression(), device)
+  { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
-  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static const int NumDims = internal::array_size<Dimensions>::value;
 
-  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
-    return m_argImpl.dimensions();
-  }
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_argImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_argImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
     return m_functor(m_argImpl.coeff(index));
   }
 
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-  packet(Index index) const {
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
     return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
     return m_argImpl.costPerCoeff(vectorized) +
         TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.getResourceRequirements().addCostPerCoeff(
+        {0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
+    m_argImpl.bind(cgh);
+  }
+#endif
+
+
  private:
+  const Device EIGEN_DEVICE_REF m_device;
   const UnaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
 };
 
+
 // -------------------- CwiseBinaryOp --------------------
 
-template <typename BinaryOp, typename LeftArgType, typename RightArgType,
-          typename Device>
-struct TensorEvaluator<
-    const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> {
+template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
+{
   typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
 
   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
-                TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
-                   TensorEvaluator<RightArgType, Device>::PacketAccess &
-                   internal::functor_traits<BinaryOp>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+                        int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess      = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
+                        int(internal::functor_traits<BinaryOp>::PacketAccess),
+    BlockAccess       = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()),
-        m_leftImpl(op.lhsExpression(), device),
-        m_rightImpl(op.rhsExpression(), device) {
-    EIGEN_STATIC_ASSERT(
-        (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
-             static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) ||
-         internal::traits<XprType>::NumDimensions <= 1),
-        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    eigen_assert(
-        dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_device(device),
+      m_functor(op.functor()),
+      m_leftImpl(op.lhsExpression(), device),
+      m_rightImpl(op.rhsExpression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
   }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
-  typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
-  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
-    // TODO: use right impl instead if right impl dimensions are known at
-    // compile time.
+  static const int NumDims = internal::array_size<
+      typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
+      LeftTensorBlock;
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+      RightTensorBlock;
+
+  typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
+                                           RightTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use right impl instead if right impl dimensions are known at compile time.
     return m_leftImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(
-      CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): Evaluate two expression in parallel?
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
+                                            [done](bool) { done(true); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
     return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
   }
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-  packet(Index index) const {
-    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index),
-                              m_rightImpl.template packet<LoadMode>(index));
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
@@ -464,7 +653,34 @@
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+    return internal::TensorBlockResourceRequirements::merge(
+               m_leftImpl.getResourceRequirements(),
+               m_rightImpl.getResourceRequirements())
+        .addCostPerCoeff({0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    desc.DropDestinationBuffer();
+    return TensorBlock(m_leftImpl.block(desc, scratch),
+                         m_rightImpl.block(desc, scratch), m_functor);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+  #endif
  private:
+  const Device EIGEN_DEVICE_REF m_device;
   const BinaryOp m_functor;
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
   TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -479,15 +695,20 @@
 
   enum {
     IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
-                   internal::functor_traits<TernaryOp>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    PacketAccess      = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
+                        TensorEvaluator<Arg2Type, Device>::PacketAccess &&
+                        TensorEvaluator<Arg3Type, Device>::PacketAccess &&
+                        internal::functor_traits<TernaryOp>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+  TensorEvaluator(const XprType& op, const Device& device)
     : m_functor(op.functor()),
       m_arg1Impl(op.arg1Expression(), device),
       m_arg2Impl(op.arg2Expression(), device),
@@ -515,8 +736,14 @@
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -524,13 +751,13 @@
     return m_arg1Impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_arg1Impl.evalSubExprsIfNeeded(NULL);
     m_arg2Impl.evalSubExprsIfNeeded(NULL);
     m_arg3Impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_arg1Impl.cleanup();
     m_arg2Impl.cleanup();
     m_arg3Impl.cleanup();
@@ -557,94 +784,149 @@
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_arg1Impl.bind(cgh);
+    m_arg2Impl.bind(cgh);
+    m_arg3Impl.bind(cgh);
+  }
+#endif
 
  private:
   const TernaryOp m_functor;
   TensorEvaluator<Arg1Type, Device> m_arg1Impl;
-  TensorEvaluator<Arg1Type, Device> m_arg2Impl;
+  TensorEvaluator<Arg2Type, Device> m_arg2Impl;
   TensorEvaluator<Arg3Type, Device> m_arg3Impl;
 };
 
+
 // -------------------- SelectOp --------------------
 
-template <typename IfArgType, typename ThenArgType, typename ElseArgType,
-          typename Device>
-struct TensorEvaluator<
-    const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> {
+template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
+struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
+{
   typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned &
-                TensorEvaluator<ElseArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
-                   TensorEvaluator<ElseArgType, Device>::PacketAccess &
-                   internal::packet_traits<Scalar>::HasBlend,
-    BlockAccess = false,
-    Layout = TensorEvaluator<IfArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ThenArgType, Device>::IsAligned &
+                        TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ThenArgType, Device>::PacketAccess &
+                        TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                        PacketType<Scalar, Device>::HasBlend,
+    BlockAccess       = TensorEvaluator<IfArgType, Device>::BlockAccess &&
+                        TensorEvaluator<ThenArgType, Device>::BlockAccess &&
+                        TensorEvaluator<ElseArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<IfArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-      : m_condImpl(op.ifExpression(), device),
-        m_thenImpl(op.thenExpression(), device),
-        m_elseImpl(op.elseExpression(), device) {
-    EIGEN_STATIC_ASSERT(
-        (static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) ==
-         static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)),
-        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT(
-        (static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) ==
-         static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)),
-        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    eigen_assert(
-        dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
-    eigen_assert(
-        dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_condImpl(op.ifExpression(), device),
+      m_thenImpl(op.thenExpression(), device),
+      m_elseImpl(op.elseExpression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
+    eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
   }
 
   typedef typename XprType::Index Index;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
-  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
-    // TODO: use then or else impl instead if they happen to be known at compile
-    // time.
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+    typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
+      IfArgTensorBlock;
+  typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
+      ThenArgTensorBlock;
+  typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
+      ElseArgTensorBlock;
+
+  struct TensorSelectOpBlockFactory {
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    struct XprType {
+      typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
+    };
+
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
+        const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
+      return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
+    }
+  };
+
+  typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
+                                           IfArgTensorBlock, ThenArgTensorBlock,
+                                           ElseArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use then or else impl instead if they happen to be known at compile time.
     return m_condImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(
-      CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_condImpl.evalSubExprsIfNeeded(NULL);
     m_thenImpl.evalSubExprsIfNeeded(NULL);
     m_elseImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+      m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+        m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); });
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_condImpl.cleanup();
     m_thenImpl.cleanup();
     m_elseImpl.cleanup();
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
-    return m_condImpl.coeff(index) ? m_thenImpl.coeff(index)
-                                   : m_elseImpl.coeff(index);
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
   }
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
-    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-    internal::Selector<PacketSize> select;
-    for (Index i = 0; i < PacketSize; ++i) {
-      select.select[i] = m_condImpl.coeff(index + i);
-    }
-    return internal::pblend(select, m_thenImpl.template packet<LoadMode>(index),
-                            m_elseImpl.template packet<LoadMode>(index));
-  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+     internal::Selector<PacketSize> select;
+     EIGEN_UNROLL_LOOP
+     for (Index i = 0; i < PacketSize; ++i) {
+       select.select[i] = m_condImpl.coeff(index+i);
+     }
+     return internal::pblend(select,
+                             m_thenImpl.template packet<LoadMode>(index),
+                             m_elseImpl.template packet<LoadMode>(index));
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -653,12 +935,49 @@
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    auto then_req = m_thenImpl.getResourceRequirements();
+    auto else_req = m_elseImpl.getResourceRequirements();
+
+    auto merged_req =
+        internal::TensorBlockResourceRequirements::merge(then_req, else_req);
+    merged_req.cost_per_coeff =
+        then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
+
+    return internal::TensorBlockResourceRequirements::merge(
+        m_condImpl.getResourceRequirements(), merged_req);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // It's unsafe to pass destination buffer to underlying expressions, because
+    // output might be aliased with one of the inputs.
+    desc.DropDestinationBuffer();
+
+    return TensorBlock(
+        m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
+        m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_condImpl.bind(cgh);
+    m_thenImpl.bind(cgh);
+    m_elseImpl.bind(cgh);
+  }
+#endif
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
   TensorEvaluator<ThenArgType, Device> m_thenImpl;
   TensorEvaluator<ElseArgType, Device> m_elseImpl;
 };
 
-}  // end namespace Eigen
 
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 5f93c19..c52fb77 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h

@@ -12,30 +12,94 @@
 
 namespace Eigen {
 
-/** \class TensorExecutor
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief The tensor executor class.
-  *
-  * This class is responsible for launch the evaluation of the expression on
-  * the specified computing device.
-  */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tiling       can use block based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
 namespace internal {
 
-// Default strategy: the expression is evaluated with a single cpu thread.
-template <typename Expression, typename Device,
-          bool Vectorizable, bool Tileable>
+/**
+ * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
+ * expensive. If expression has at least one broadcast op in it, and it supports
+ * block based evaluation, we always prefer it, even for the small tensors. For
+ * all other tileable ops, block evaluation overhead for small tensors (fits
+ * into L1) is too large, and we fallback on vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template<typename Expression>
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorAssignOp<LhsXprType, RhsXprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
+};
+
+template<typename UnaryOp, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
+        ExpressionHasTensorBroadcastingOp<RhsXprType>::value
+  };
+};
+
+template<typename Broadcast, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorBroadcastingOp<Broadcast, XprType> > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization and block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling>
 class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
-  EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device())
-  {
+  typedef typename Expression::Index StorageIndex;
+
+  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+  // violation. If this template is instantiated with a non-default device, it
+  // means that this header file was included without defining
+  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+  static_assert(std::is_same<Device, DefaultDevice>::value,
+                "Default executor instantiated with non-default device. "
+                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+                "EIGEN_USE_SYCL before including Eigen headers.");
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -43,34 +107,48 @@
   }
 };
 
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable, TiledEvaluation Tiling>
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
 template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true, false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+                     /*Tiling=*/TiledEvaluation::Off> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
+
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static EIGEN_STRONG_INLINE void run(
+      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<
+          Expression, DefaultDevice>::PacketReturnType>::size;
 
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -78,116 +156,162 @@
   }
 };
 
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable, true> {
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
  public:
-  typedef typename Expression::Index Index;
-  EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr,
-                         const DefaultDevice& device = DefaultDevice()) {
-    typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
-    typedef typename traits<Expression>::Scalar Scalar;
-    typedef typename traits<Expression>::Index Index;
-    const std::size_t NumDims = traits<Expression>::NumDimensions;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
 
-    typedef TensorBlockMapper<Index,
-                              typename internal::remove_const<Scalar>::type,
-                              NumDims, Evaluator::Layout> TensorBlockMapper;
-    typedef TensorBlock<Index, typename internal::remove_const<Scalar>::type,
-                        NumDims, Evaluator::Layout> TensorBlock;
+  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+  typedef typename traits<Expression>::Index StorageIndex;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
+    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
+        TensorBlockMapper;
+
+    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
+        TensorBlockDesc;
+    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+        TensorBlockScratch;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-    if (total_size < cache_size) {
-      // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               false>::run(expr, device);
-      return;
-    }
 
+    // TODO(ezhulenev): Do not use tiling for small tensors?
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+
     if (needs_assign) {
-      // Size tensor blocks to fit in cache (or requested target block size).
-      size_t block_total_size = numext::mini(cache_size, total_size);
-      TensorBlockShapeType block_shape = kUniformAllDims;
       // Query expression tree for desired block size/shape.
-      std::vector<internal::TensorOpResourceRequirements> resources;
-      evaluator.getResourceRequirements(&resources);
-      if (!resources.empty()) {
-        // TODO(andydavis) Implement different policies (i.e. revert to a
-        // default policy if block shapes/sizes conflict).
-        block_shape = resources[0].block_shape;
-        block_total_size = resources[0].block_total_size;
+      const TensorBlockResourceRequirements requirements =
+          evaluator.getResourceRequirements();
+
+      const TensorBlockMapper block_mapper(
+          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
+          requirements);
+
+      // Share scratch memory allocator between all blocks.
+      TensorBlockScratch scratch(device);
+
+      const StorageIndex total_block_count = block_mapper.blockCount();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
+        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
+        evaluator.evalBlock(desc, scratch);
+        scratch.reset();
       }
-
-      TensorBlockMapper block_mapper(evaluator.dimensions(),
-                                     block_shape,
-                                     block_total_size);
-      block_total_size = block_mapper.block_dims_total_size();
-
-      Scalar* data = static_cast<Scalar*>(device.allocate(
-          block_total_size * sizeof(Scalar)));
-
-      const Index total_block_count = block_mapper.total_block_count();
-      for (Index i = 0; i < total_block_count; ++i) {
-        TensorBlock block = block_mapper.GetBlockForIndex(i, data);
-        evaluator.evalBlock(&block);
-      }
-      device.deallocate(data);
     }
     evaluator.cleanup();
   }
 };
 
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ *
+ * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
+ *     pool, and will block the caller thread until all tasks are finished.
+ *
+ * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
+ *     the ThreadPoolDevice managed thread pool, and will return immediately.
+ *     It will call 'done' callback after all tasks are finished.
+ */
 #ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+
+template <typename TensorBlockMapper>
+struct TensorExecutorTilingContext {
+  TensorExecutorTilingContext() = default;
+  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
+                              const TensorOpCost& b_cost, size_t b_aligned_size)
+      : block_mapper(b_mapper),
+        cost(b_cost),
+        aligned_blocksize(b_aligned_size) {}
+
+  TensorBlockMapper block_mapper;  // navigate through blocks
+  TensorOpCost cost;               // cost of computing a single block
+  size_t aligned_blocksize;        // block size after memory alignment
+};
+
+// Computes a block evaluation parameters, and allocates temporary memory buffer
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
+template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
+TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
+    const Evaluator& evaluator) {
+  // Query expression tree for desired block size/shape.
+  TensorBlockResourceRequirements requirements =
+      evaluator.getResourceRequirements();
+
+  // Update target block size based on cost model.
+  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+      1, requirements.cost_per_coeff);
+  requirements.size = static_cast<size_t>(1.0 / taskSize);
+
+  TensorBlockMapper block_mapper(
+      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
+      requirements);
+
+  size_t block_size = block_mapper.blockTotalSize();
+  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+  const size_t aligned_blocksize =
+      align *
+      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+
+  return {block_mapper, requirements.cost_per_coeff * block_size,
+          aligned_blocksize};
+}
+
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(void* evaluator_in, const Index first, const Index last) {
-    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
-    eigen_assert(last >= first);
-    for (Index i = first; i < last; ++i) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(lastIdx >= firstIdx);
+    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
-    return size;
-  }
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
 };
 
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
-  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static const int PacketSize =
+      unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(void* evaluator_in, const Index first, const Index last) {
-    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
-    eigen_assert(last >= first);
-
-    Index i = first;
-    if (last - first >= PacketSize) {
-      eigen_assert(first % PacketSize == 0);
-      Index last_chunk_offset = last - 4 * PacketSize;
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(lastIdx >= firstIdx);
+    StorageIndex i = firstIdx;
+    if (lastIdx - firstIdx >= PacketSize) {
+      eigen_assert(firstIdx % PacketSize == 0);
+      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      last_chunk_offset = last - PacketSize;
+      last_chunk_offset = lastIdx - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-    for (; i < last; ++i) {
+    for (; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
+  static StorageIndex alignBlockSize(StorageIndex size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -197,98 +321,374 @@
   }
 };
 
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
-  {
+  typedef typename Expression::Index StorageIndex;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
     Evaluator evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-      const Index size = array_prod(evaluator.dimensions());
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                          EvalRange::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange::run(&evaluator, first, last);
+                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
+                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                          });
     }
     evaluator.cleanup();
   }
 };
 
-
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, true> {
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr,
-                         const ThreadPoolDevice& device) {
-    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef typename internal::remove_const<
-        typename traits<Expression>::Scalar>::type Scalar;
-    typedef typename traits<Expression>::Index Index;
-    static const std::size_t NumDims = traits<Expression>::NumDimensions;
-    typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout>
-        TensorBlockMapper;
-    typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout>
-        TensorBlock;
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
 
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+      TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const ThreadPoolDevice& device) {
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-    if (total_size < cache_size) {
-      // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
-                               false>::run(expr, device);
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const TilingContext tiling =
+          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+                                                   Vectorizable>(evaluator);
+
+      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+                                                       IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
+          evaluator.evalBlock(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(device);
+        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
+        evaluator.evalBlock(desc, scratch);
+      } else {
+        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
+                           eval_block);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable,
+          TiledEvaluation Tiling>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
+      device.parallelForAsync(
+          size, ctx->evaluator.costPerCoeff(Vectorizable),
+          EvalRange::alignBlockSize,
+          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
+            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
+          },
+          [ctx]() { delete ctx; });
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : evaluator(expr, thread_pool), on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
       evaluator.cleanup();
-      return;
+      on_done();
     }
 
+    Evaluator evaluator;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      ctx->tiling = internal::GetTensorExecutorTilingContext<
+          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
+
+      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(ctx->device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          TensorBlockDesc desc =
+              ctx->tiling.block_mapper.blockDescriptor(block_idx);
+          ctx->evaluator.evalBlock(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (ctx->tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(ctx->device);
+        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
+        ctx->evaluator.evalBlock(desc, scratch);
+        delete ctx;
+      } else {
+        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
+                                     ctx->tiling.cost, eval_block,
+                                     [ctx]() { delete ctx; });
+      }
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : device(thread_pool),
+          evaluator(expr, thread_pool),
+          on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    const ThreadPoolDevice& device;
+    Evaluator evaluator;
+    TilingContext tiling;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+#endif  // EIGEN_USE_THREADS
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(EIGEN_GPUCC)
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+struct EigenMetaKernelEval {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
+
+    // Use the vector path
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
+         i += vectorized_step_size) {
+      eval.evalPacket(i);
+    }
+    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename StorageIndex>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
+
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
+
+  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
+    const Expression& expr, const GpuDevice& device) {
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+  if (needs_assign) {
+
+    const int block_size = device.maxGpuThreadsPerBlock();
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const StorageIndex size = array_prod(evaluator.dimensions());
+    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+    LAUNCH_GPU_KERNEL(
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
+        num_blocks, block_size, 0, device, evaluator, size);
+  }
+  evaluator.cleanup();
+}
+
+#endif  // EIGEN_GPUCC
+#endif  // EIGEN_USE_GPU
+
+// SYCL Executor policy
+#ifdef EIGEN_USE_SYCL
+
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
+  typedef typename Evaluator::Index Index;
+  Evaluator evaluator;
+  const Index range;
+  template <typename Scratch>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+      const Scratch, Evaluator evaluator_, const Index range_)
+      : evaluator(evaluator_), range(range_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+      cl::sycl::nd_item<1> itemID) {
+    compute(itemID);
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    Index total_threads = itemID.get_global_range(0);
+
+    for (Index i = gId; i < range; i += total_threads) {
+      evaluator.evalScalar(i);
+    }
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    const Index vectorizedRange =
+        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    const Index start = Evaluator::PacketSize * gId;
+    for (Index i = start; i < vectorizedRange; i += step) {
+      evaluator.evalPacket(i);
+    }
+    gId += vectorizedRange;
+    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+      evaluator.evalScalar(i);
+    }
+  }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index Index;
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const Eigen::SyclDevice& dev) {
+    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+    Evaluator evaluator(expr, dev);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
-      TensorBlockShapeType block_shape = kUniformAllDims;
-      size_t block_total_size = 0;
-      // Query expression tree for desired block size/shape.
-      std::vector<internal::TensorOpResourceRequirements> resources;
-      evaluator.getResourceRequirements(&resources);
-      if (!resources.empty()) {
-        // TODO(andydavis) Implement different shape/size policies.
-        block_shape = resources[0].block_shape;
-        block_total_size = resources[0].block_total_size;
-      }
-      int num_threads = device.numThreads();
+      Index range, GRange, tileSize;
+      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
+      total_size = (total_size == 0) ? 1 : total_size;
+      const int PacketSize =
+          Eigen::PacketType<typename Evaluator::CoeffReturnType,
+                            Eigen::SyclDevice>::size;
+      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
+      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
+      range = total_size;
 
-      // Estimate minimum block size based on cost.
-      TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
-      double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
-      size_t block_size = 1.0 / taskSize;
-      TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
-                                     block_size);
-      block_size = block_mapper.block_dims_total_size();
-      const size_t aligned_blocksize =
-          EIGEN_MAX_ALIGN_BYTES *
-          divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
-      void* buf = internal::aligned_malloc((num_threads+1) * aligned_blocksize);
-      device.parallelFor(
-          block_mapper.total_block_count(), cost * block_size,
-          [=, &device, &evaluator, &block_mapper](Index first, Index last) {
-            // currentThreadId() returns -1 if called from a thread not in the
-            // threadpool, such as the main thread dispatching Eigen expressions.
-            const int thread_idx = device.currentThreadId();
-            eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
-            Scalar* thread_buf = reinterpret_cast<Scalar*>(
-                static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
-            for (Index i = first; i < last; ++i) {
-              auto block = block_mapper.GetBlockForIndex(i, thread_buf);
-              evaluator.evalBlock(&block);
-            }
-          });
-      internal::aligned_free(buf);
+      dev.template nullary_kernel_launcher<
+          typename Evaluator::CoeffReturnType,
+          ExecExprFunctorKernel<Evaluator> >(
+          evaluator,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+                                cl::sycl::range<1>(tileSize)),
+          Index(1), range);
     }
     evaluator.cleanup();
   }
@@ -296,89 +696,6 @@
 
 #endif
 
-
-// GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU)
-
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
- public:
-  typedef typename Expression::Index Index;
-  static void run(const Expression& expr, const GpuDevice& device);
-};
-
-
-#if defined(__CUDACC__)
-template <typename Evaluator, typename Index, bool Vectorizable>
-struct EigenMetaKernelEval {
-  static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator eval, Index first, Index last, Index step_size) {
-    for (Index i = first; i < last; i += step_size) {
-      eval.evalScalar(i);
-    }
-  }
-};
-
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
-  static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator eval, Index first, Index last, Index step_size) {
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_size = (last / PacketSize) * PacketSize;
-    const Index vectorized_step_size = step_size * PacketSize;
-
-    // Use the vector path
-    for (Index i = first * PacketSize; i < vectorized_size;
-         i += vectorized_step_size) {
-      eval.evalPacket(i);
-    }
-    for (Index i = vectorized_size + first; i < last; i += step_size) {
-      eval.evalScalar(i);
-    }
-  }
-};
-
-template <typename Evaluator, typename Index>
-__global__ void
-__launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
-
-  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index step_size = blockDim.x * gridDim.x;
-
-  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
-  // complex types such as evaluators we should really conform to the C++
-  // standard and call a proper copy constructor.
-  Evaluator eval(memcopied_eval);
-
-  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
-}
-
-/*static*/
-template <typename Expression, bool Vectorizable, bool Tileable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
-    const Expression& expr, const GpuDevice& device) {
-  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-  if (needs_assign) {
-    const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
-    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
-
-    LAUNCH_CUDA_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
-        num_blocks, block_size, 0, device, evaluator, size);
-  }
-  evaluator.cleanup();
-}
-
-#endif  // __CUDACC__
-#endif  // EIGEN_USE_GPU
-
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 0e79d59..c9bccfc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h

@@ -38,9 +38,9 @@
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename XprTraits::PointerType PointerType;
   enum {
-    Flags = 0,
+    Flags = 0
   };
 };
 
@@ -89,6 +89,10 @@
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename TypeConversion<Scalar, 
+                                  typename XprTraits::PointerType
+                                  >::type 
+                                  PointerType;
 };
 
 template<typename UnaryOp, typename XprType>
@@ -161,9 +165,14 @@
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename TypeConversion<Scalar,
+                                  typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                                      typename traits<LhsXprType>::PointerType,
+                                                      typename traits<RhsXprType>::PointerType>::type
+                                  >::type 
+                                  PointerType;
   enum {
-    Flags = 0,
+    Flags = 0
   };
 };
 
@@ -217,6 +226,7 @@
     const BinaryOp m_functor;
 };
 
+
 namespace internal {
 template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
 struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
@@ -237,7 +247,12 @@
   typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename TypeConversion<Scalar,
+                                  typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
+                                                      typename traits<Arg2XprType>::PointerType,
+                                                      typename traits<Arg3XprType>::PointerType>::type
+                                  >::type 
+                                  PointerType;
   enum {
     Flags = 0
   };
@@ -282,7 +297,7 @@
     arg1Expression() const { return m_arg1_xpr; }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+    const typename internal::remove_all<typename Arg2XprType::Nested>::type&
     arg2Expression() const { return m_arg2_xpr; }
 
     EIGEN_DEVICE_FUNC
@@ -291,7 +306,7 @@
 
   protected:
     typename Arg1XprType::Nested m_arg1_xpr;
-    typename Arg1XprType::Nested m_arg2_xpr;
+    typename Arg2XprType::Nested m_arg2_xpr;
     typename Arg3XprType::Nested m_arg3_xpr;
     const TernaryOp m_functor;
 };
@@ -313,6 +328,9 @@
   typedef typename ElseXprType::Nested ElseNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
+                               typename traits<ThenXprType>::PointerType,
+                               typename traits<ElseXprType>::PointerType>::type PointerType;
 };
 
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
@@ -331,7 +349,7 @@
 
 
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
 {
   public:
     typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index e82ac5b..4a1a068 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h

@@ -9,6 +9,7 @@
 
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+
 namespace Eigen {
 
 /** \class TensorFFT
@@ -24,25 +25,19 @@
 
 template <bool NeedUprade> struct MakeComplex {
   template <typename T>
-  #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   EIGEN_DEVICE_FUNC
-  #endif
   T operator() (const T& val) const { return val; }
 };
 
 template <> struct MakeComplex<true> {
   template <typename T>
-  #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   EIGEN_DEVICE_FUNC
-  #endif
   std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); }
 };
 
 template <> struct MakeComplex<false> {
   template <typename T>
-  #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   EIGEN_DEVICE_FUNC
-  #endif
   std::complex<T> operator() (const std::complex<T>& val) const { return val; }
 };
 
@@ -72,6 +67,7 @@
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
@@ -131,17 +127,24 @@
   typedef OutputScalar CoeffReturnType;
   typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = true,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_data(NULL), m_impl(op.expression(), device), m_fft(op.fft()), m_device(device) {
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
@@ -166,20 +169,19 @@
     return m_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalToBuf(data);
       return false;
     } else {
-      m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
+      m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
       evalToBuf(m_data);
       return true;
     }
   }
 
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     if (m_data) {
       m_device.deallocate(m_data);
       m_data = NULL;
@@ -191,59 +193,43 @@
     return m_data[index];
   }
 
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
+  packet(Index index) const {
     return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
-    /* This cost estimate is not relevant since all the work is done in
-    evalSubExprsIfNeeded(), but it might come in handy later.
-    // We guesstimate the leading term constant as that of the butterfly.
-    const double butterfly_cost = 3 * TensorOpCost::AddCost<ComplexScalar> +
-                                  2 * TensorOpCost::MulCost<ComplexScalar>;
-
-    double loads = 0;
-    double compute_cost = 0;
-    for (int i = 0; i < m_fft.size(); ++i) {
-      int dim = m_fft[i];
-      Index line_len = m_dimensions[dim];
-      if (isPowerOfTwo(line_len)) {
-        const int log_n = getLog2(line_len);
-        loads += log_n;
-        compute_cost += log_n * butterfly_cost;
-      } else {
-        const int log_n = getLog2(findGoodComposite(line_len));
-        loads += log_n;
-        compute_cost += log_n * 3 * bufferfly_cost;
-      }
-      return TensorOpCost(loads, 0, compute_cost, vectorized,
-                          PacketSize);
-    */
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
+
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
     const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
     ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
 
-    for (int i = 0; i < m_size; ++i) {
+    for (Index i = 0; i < m_size; ++i) {
       buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
     }
 
-    for (int i = 0; i < m_fft.size(); ++i) {
-      int dim = m_fft[i];
+    for (size_t i = 0; i < m_fft.size(); ++i) {
+      Index dim = m_fft[i];
       eigen_assert(dim >= 0 && dim < NumDims);
       Index line_len = m_dimensions[dim];
       eigen_assert(line_len >= 1);
       ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
       const bool is_power_of_two = isPowerOfTwo(line_len);
-      const int good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
-      const int log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
+      const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
+      const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
 
       ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
       ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
@@ -253,19 +239,32 @@
         //   t_n = exp(sqrt(-1) * pi * n^2 / line_len)
         // for n = 0, 1,..., line_len-1.
         // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-        pos_j_base_powered[0] = ComplexScalar(1, 0);
-        if (line_len > 1) {
-          const ComplexScalar pos_j_base = ComplexScalar(
-              std::cos(M_PI / line_len), std::sin(M_PI / line_len));
-          pos_j_base_powered[1] = pos_j_base;
-          if (line_len > 2) {
-            const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
-            for (int i = 2; i < line_len + 1; ++i) {
-              pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
-                                      pos_j_base_powered[i - 1] /
-                                      pos_j_base_powered[i - 2] * pos_j_base_sq;
-            }
-          }
+
+        // The recurrence is correct in exact arithmetic, but causes
+        // numerical issues for large transforms, especially in
+        // single-precision floating point.
+        //
+        // pos_j_base_powered[0] = ComplexScalar(1, 0);
+        // if (line_len > 1) {
+        //   const ComplexScalar pos_j_base = ComplexScalar(
+        //       numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
+        //   pos_j_base_powered[1] = pos_j_base;
+        //   if (line_len > 2) {
+        //     const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+        //     for (int i = 2; i < line_len + 1; ++i) {
+        //       pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
+        //           pos_j_base_powered[i - 1] /
+        //           pos_j_base_powered[i - 2] *
+        //           pos_j_base_sq;
+        //     }
+        //   }
+        // }
+        // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
+        // and cosine functions here.
+        for (int j = 0; j < line_len + 1; ++j) {
+          double arg = ((EIGEN_PI * j) * j) / line_len;
+          std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
+          pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
         }
       }
 
@@ -275,7 +274,7 @@
         // get data into line_buf
         const Index stride = m_strides[dim];
         if (stride == 1) {
-          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+          m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -283,7 +282,7 @@
           }
         }
 
-        // processs the line
+        // process the line
         if (is_power_of_two) {
           processDataLineCooleyTukey(line_buf, line_len, log_len);
         }
@@ -293,7 +292,7 @@
 
         // write back
         if (FFTDir == FFT_FORWARD && stride == 1) {
-          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+          m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           const ComplexScalar div_factor =  ComplexScalar(1.0 / line_len, 0);
@@ -311,45 +310,45 @@
     }
 
     if(!write_to_out) {
-      for (int i = 0; i < m_size; ++i) {
+      for (Index i = 0; i < m_size; ++i) {
         data[i] = PartOf<FFTResultType>()(buf[i]);
       }
       m_device.deallocate(buf);
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(int x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
     eigen_assert(x > 0);
     return !(x & (x - 1));
   }
 
-  //the composite number for padding, used in Bluestein's FFT algorithm
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int findGoodComposite(int n) {
-    int i = 2;
+  // The composite number for padding, used in Bluestein's FFT algorithm
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
+    Index i = 2;
     while (i < 2 * n - 1) i *= 2;
     return i;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int getLog2(int m) {
-    int log2m = 0;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
+    Index log2m = 0;
     while (m >>= 1) log2m++;
     return log2m;
   }
 
   // Call Cooley Tukey algorithm directly, data length must be power of 2
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, int line_len, int log_len) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) {
     eigen_assert(isPowerOfTwo(line_len));
     scramble_FFT(line_buf, line_len);
     compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
   }
 
   // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, int line_len, int good_composite, int log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
-    int n = line_len;
-    int m = good_composite;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
+    Index n = line_len;
+    Index m = good_composite;
     ComplexScalar* data = line_buf;
 
-    for (int i = 0; i < n; ++i) {
+    for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
         a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
       }
@@ -357,11 +356,11 @@
         a[i] = data[i] * pos_j_base_powered[i];
       }
     }
-    for (int i = n; i < m; ++i) {
+    for (Index i = n; i < m; ++i) {
       a[i] = ComplexScalar(0, 0);
     }
 
-    for (int i = 0; i < n; ++i) {
+    for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
         b[i] = pos_j_base_powered[i];
       }
@@ -369,10 +368,10 @@
         b[i] = numext::conj(pos_j_base_powered[i]);
       }
     }
-    for (int i = n; i < m - n; ++i) {
+    for (Index i = n; i < m - n; ++i) {
       b[i] = ComplexScalar(0, 0);
     }
-    for (int i = m - n; i < m; ++i) {
+    for (Index i = m - n; i < m; ++i) {
       if(FFTDir == FFT_FORWARD) {
         b[i] = pos_j_base_powered[m-i];
       }
@@ -387,7 +386,7 @@
     scramble_FFT(b, m);
     compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
 
-    for (int i = 0; i < m; ++i) {
+    for (Index i = 0; i < m; ++i) {
       a[i] *= b[i];
     }
 
@@ -395,11 +394,11 @@
     compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
 
     //Do the scaling after ifft
-    for (int i = 0; i < m; ++i) {
+    for (Index i = 0; i < m; ++i) {
       a[i] /= m;
     }
 
-    for (int i = 0; i < n; ++i) {
+    for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
         data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
       }
@@ -409,14 +408,14 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, int n) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
     eigen_assert(isPowerOfTwo(n));
-    int j = 1;
-    for (int i = 1; i < n; ++i){
+    Index j = 1;
+    for (Index i = 1; i < n; ++i){
       if (j > i) {
         std::swap(data[j-1], data[i-1]);
       }
-      int m = n >> 1;
+      Index m = n >> 1;
       while (m >= 2 && j > m) {
         j -= m;
         m >>= 1;
@@ -498,7 +497,7 @@
 
   template <int Dir>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
-      ComplexScalar* data, int n, int n_power_of_2) {
+      ComplexScalar* data, Index n, Index n_power_of_2) {
     // Original code:
     // RealScalar wtemp = std::sin(M_PI/n);
     // RealScalar wpi =  -std::sin(2 * M_PI/n);
@@ -512,9 +511,9 @@
     const ComplexScalar wp_one_2 = wp_one * wp_one;
     const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
     const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
-    const int n2 = n / 2;
+    const Index n2 = n / 2;
     ComplexScalar w(1.0, 0.0);
-    for (int i = 0; i < n2; i += 4) {
+    for (Index i = 0; i < n2; i += 4) {
        ComplexScalar temp0(data[i + n2] * w);
        ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
        ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
@@ -537,7 +536,7 @@
 
  template <int Dir>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
-      ComplexScalar* data, int n, int n_power_of_2) {
+      ComplexScalar* data, Index n, Index n_power_of_2) {
     eigen_assert(isPowerOfTwo(n));
     if (n > 8) {
       compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
@@ -552,7 +551,7 @@
     }
   }
 
-   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
     Index result = 0;
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -565,7 +564,7 @@
       result += index;
     }
     else {
-      for (int i = 0; i < omitted_dim; ++i) {
+      for (Index i = 0; i < omitted_dim; ++i) {
         const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
         const Index idx = index / partial_m_stride;
         index -= idx * partial_m_stride;
@@ -583,344 +582,88 @@
   }
 
  protected:
-  int m_size;
-  const FFT& m_fft;
+  Index m_size;
+  const FFT EIGEN_DEVICE_REF m_fft;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
   TensorEvaluator<ArgType, Device> m_impl;
-  CoeffReturnType* m_data;
-  const Device& m_device;
+  EvaluatorPointerType m_data;
+  const Device EIGEN_DEVICE_REF m_device;
 
   // This will support a maximum FFT size of 2^32 for each dimension
   // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
-  RealScalar m_sin_PI_div_n_LUT[32] = {
-  0.0,
-  -2,
-  -0.999999999999999,
-  -0.292893218813453,
-  -0.0761204674887130,
-  -0.0192147195967696,
-  -0.00481527332780311,
-  -0.00120454379482761,
-  -3.01181303795779e-04,
-  -7.52981608554592e-05,
-  -1.88247173988574e-05,
-  -4.70619042382852e-06,
-  -1.17654829809007e-06,
-  -2.94137117780840e-07,
-  -7.35342821488550e-08,
-  -1.83835707061916e-08,
-  -4.59589268710903e-09,
-  -1.14897317243732e-09,
-  -2.87243293150586e-10,
-  -7.18108232902250e-11,
-  -1.79527058227174e-11,
-  -4.48817645568941e-12,
-  -1.12204411392298e-12,
-  -2.80511028480785e-13,
-  -7.01277571201985e-14,
-  -1.75319392800498e-14,
-  -4.38298482001247e-15,
-  -1.09574620500312e-15,
-  -2.73936551250781e-16,
-  -6.84841378126949e-17,
-  -1.71210344531737e-17,
-  -4.28025861329343e-18
+  const RealScalar m_sin_PI_div_n_LUT[32] = {
+    RealScalar(0.0),
+    RealScalar(-2),
+    RealScalar(-0.999999999999999),
+    RealScalar(-0.292893218813453),
+    RealScalar(-0.0761204674887130),
+    RealScalar(-0.0192147195967696),
+    RealScalar(-0.00481527332780311),
+    RealScalar(-0.00120454379482761),
+    RealScalar(-3.01181303795779e-04),
+    RealScalar(-7.52981608554592e-05),
+    RealScalar(-1.88247173988574e-05),
+    RealScalar(-4.70619042382852e-06),
+    RealScalar(-1.17654829809007e-06),
+    RealScalar(-2.94137117780840e-07),
+    RealScalar(-7.35342821488550e-08),
+    RealScalar(-1.83835707061916e-08),
+    RealScalar(-4.59589268710903e-09),
+    RealScalar(-1.14897317243732e-09),
+    RealScalar(-2.87243293150586e-10),
+    RealScalar( -7.18108232902250e-11),
+    RealScalar(-1.79527058227174e-11),
+    RealScalar(-4.48817645568941e-12),
+    RealScalar(-1.12204411392298e-12),
+    RealScalar(-2.80511028480785e-13),
+    RealScalar(-7.01277571201985e-14),
+    RealScalar(-1.75319392800498e-14),
+    RealScalar(-4.38298482001247e-15),
+    RealScalar(-1.09574620500312e-15),
+    RealScalar(-2.73936551250781e-16),
+    RealScalar(-6.84841378126949e-17),
+    RealScalar(-1.71210344531737e-17),
+    RealScalar(-4.28025861329343e-18)
   };
 
   // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
-  RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
-    0.0,
-    0.0,
-   -1.00000000000000e+00,
-   -7.07106781186547e-01,
-   -3.82683432365090e-01,
-   -1.95090322016128e-01,
-   -9.80171403295606e-02,
-   -4.90676743274180e-02,
-   -2.45412285229123e-02,
-   -1.22715382857199e-02,
-   -6.13588464915448e-03,
-   -3.06795676296598e-03,
-   -1.53398018628477e-03,
-   -7.66990318742704e-04,
-   -3.83495187571396e-04,
-   -1.91747597310703e-04,
-   -9.58737990959773e-05,
-   -4.79368996030669e-05,
-   -2.39684498084182e-05,
-   -1.19842249050697e-05,
-   -5.99211245264243e-06,
-   -2.99605622633466e-06,
-   -1.49802811316901e-06,
-   -7.49014056584716e-07,
-   -3.74507028292384e-07,
-   -1.87253514146195e-07,
-   -9.36267570730981e-08,
-   -4.68133785365491e-08,
-   -2.34066892682746e-08,
-   -1.17033446341373e-08,
-   -5.85167231706864e-09,
-   -2.92583615853432e-09
+  const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
+    RealScalar(0.0),
+    RealScalar(0.0),
+    RealScalar(-1.00000000000000e+00),
+    RealScalar(-7.07106781186547e-01),
+    RealScalar(-3.82683432365090e-01),
+    RealScalar(-1.95090322016128e-01),
+    RealScalar(-9.80171403295606e-02),
+    RealScalar(-4.90676743274180e-02),
+    RealScalar(-2.45412285229123e-02),
+    RealScalar(-1.22715382857199e-02),
+    RealScalar(-6.13588464915448e-03),
+    RealScalar(-3.06795676296598e-03),
+    RealScalar(-1.53398018628477e-03),
+    RealScalar(-7.66990318742704e-04),
+    RealScalar(-3.83495187571396e-04),
+    RealScalar(-1.91747597310703e-04),
+    RealScalar(-9.58737990959773e-05),
+    RealScalar(-4.79368996030669e-05),
+    RealScalar(-2.39684498084182e-05),
+    RealScalar(-1.19842249050697e-05),
+    RealScalar(-5.99211245264243e-06),
+    RealScalar(-2.99605622633466e-06),
+    RealScalar(-1.49802811316901e-06),
+    RealScalar(-7.49014056584716e-07),
+    RealScalar(-3.74507028292384e-07),
+    RealScalar(-1.87253514146195e-07),
+    RealScalar(-9.36267570730981e-08),
+    RealScalar(-4.68133785365491e-08),
+    RealScalar(-2.34066892682746e-08),
+    RealScalar(-1.17033446341373e-08),
+    RealScalar(-5.85167231706864e-09),
+    RealScalar(-2.92583615853432e-09)
   };
 };
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-template<typename OutputScalar, typename RealScalar, typename ComplexScalar, int ResultType>
-struct writeToDeviceData {
-  void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) {
-  }
-};
-
-template<typename OutputScalar, typename RealScalar, typename ComplexScalar>
-struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::BothParts> {
-  void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) {
-    cudaMemcpy(d_data, data_buf, size * sizeof(ComplexScalar), cudaMemcpyDeviceToDevice);
-  }
-};
-
-template<typename OutputScalar, typename RealScalar, typename ComplexScalar>
-struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::RealPart> {
-  void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) {
-    cudaMemcpy2D(d_data, sizeof(RealScalar), (RealScalar*) data_buf, 2 * sizeof(RealScalar), sizeof(RealScalar), size, cudaMemcpyDeviceToDevice);
-  }
-};
-
-template<typename OutputScalar, typename RealScalar, typename ComplexScalar>
-struct writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, Eigen::ImagPart> {
-  void operator()(OutputScalar* d_data, ComplexScalar* data_buf, size_t size) {
-    RealScalar* data_buf_offset = &(((RealScalar*) data_buf)[1]);
-    cudaMemcpy2D(d_data, sizeof(RealScalar), data_buf_offset,        2 * sizeof(RealScalar), sizeof(RealScalar), size, cudaMemcpyDeviceToDevice);
-  }
-};
-
-template <typename InputScalar, typename RealScalar, typename ComplexScalar, typename InputEvaluator>
-__global__ void copyValues(ComplexScalar* d_data, InputEvaluator eval, int total_size) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < total_size) {
-    d_data[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(eval.coeff(i));
-  }
-}
-
-template<typename Scalar, typename Index, int NumDims>
-__global__ void fillLineBuf(Scalar* line_buf, Scalar* data_buf, int line_len,
-                            array<Index, NumDims> coords, array<Index, NumDims> m_strides, int dim) {
-  int j = blockIdx.x * blockDim.x + threadIdx.x;
-  if(j < line_len) {
-    coords[dim] = j;
-    Index index = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      index += coords[i] * m_strides[i];
-    }
-    line_buf[j] = data_buf[index];
-  }
-}
-
-template<typename ComplexScalar, typename RealScalar, typename Index, int NumDims>
-__global__ void writebackLineBuf(ComplexScalar* line_buf, ComplexScalar* data_buf, int line_len,
-                                 array<Index, NumDims> coords, array<Index, NumDims> m_strides, int dim, RealScalar div_factor) {
-  int j = blockIdx.x * blockDim.x + threadIdx.x;
-  if(j < line_len) {
-    coords[dim] = j;
-    Index index = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      index += coords[i] * m_strides[i];
-    }
-
-    data_buf[index] = line_buf[j];
-    ((RealScalar*) data_buf)[2*index] /= div_factor;
-    ((RealScalar*) data_buf)[2*index + 1] /= div_factor;
-  }
-}
-
-template <typename FFT, typename ArgType, int FFTResultType, int FFTDir>
-struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, GpuDevice> {
-  typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
-  typedef typename XprType::Index Index;
-  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, GpuDevice>::Dimensions>::value;
-  typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::Scalar InputScalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename std::complex<RealScalar> ComplexScalar;
-  typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar;
-  typedef typename TensorEvaluator<ArgType, GpuDevice>::Dimensions InputDimensions;
-  typedef OutputScalar CoeffReturnType;
-  typedef typename PacketType<OutputScalar, GpuDevice>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
-  enum {
-    IsAligned = false,
-    PacketAccess = false,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, GpuDevice>::Layout,
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const GpuDevice& device) : m_data_buf(NULL), m_impl(op.expression(), device), m_fft(op.fft()) {
-    const typename TensorEvaluator<ArgType, GpuDevice>::Dimensions& input_dims = m_impl.dimensions();
-    for (int i = 0; i < NumDims; ++i) {
-      eigen_assert(input_dims[i] > 0);
-      m_dimensions[i] = input_dims[i];
-    }
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      m_strides[0] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
-      }
-    } else {
-      m_strides[NumDims - 1] = 1;
-      for (int i = NumDims - 2; i >= 0; --i) {
-        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
-      }
-    }
-    m_size = m_dimensions.TotalSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_dimensions;
-  }
-
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* d_data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    if (d_data) {
-      evalToDeviceData(d_data);
-      return false;
-    } else {
-      evalToSelfDataBuf();
-      return true;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromCoords(const array<Index, NumDims> & coords) const {
-    Index result = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      result += coords[i] * m_strides[i];
-    }
-    return result;
-  }
-
-  EIGEN_STRONG_INLINE array<Index, NumDims> getPartialCoordsFromIndex(Index index, Index omitted_dim) const {
-    array<Index, NumDims> partial_m_strides = m_strides;
-    array<Index, NumDims> index_coords;
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (Index i = omitted_dim + 1; i < NumDims; ++i) {
-        partial_m_strides[i] /= m_dimensions[omitted_dim];
-      }
-      for (int i = NumDims - 1; i > 0; --i) {
-        if(omitted_dim == i) {
-        }
-        else {
-          const Index idx = index / partial_m_strides[i];
-          index -= idx * partial_m_strides[i];
-          index_coords[i] = idx;
-        }
-      }
-      index_coords[0] = index;
-    }
-    else {
-      for (Index i = omitted_dim - 1; i >= 0; --i) {
-        partial_m_strides[i] /= m_dimensions[omitted_dim];
-      }
-      for (int i = 0; i < NumDims - 1; ++i) {
-        if(omitted_dim == i) {
-        }
-        else {
-          const Index idx = index / partial_m_strides[i];
-          index -= idx * partial_m_strides[i];
-          index_coords[i] = idx;
-        }
-      }
-      index_coords[NumDims - 1] = index;
-    }
-    // Value of index_coords[omitted_dim] is not determined to this step
-    return index_coords;
-  }
-
-  void evalToSelfDataBuf() {
-    cudaMalloc((void**) &m_data_buf, sizeof(OutputScalar) * m_size);
-    evalToDeviceData(m_data_buf);
-  }
-
-  EIGEN_STRONG_INLINE void evalToDeviceData(OutputScalar* d_data) {
-    ComplexScalar* data_buf;
-    cudaMalloc((void**) &data_buf, sizeof(ComplexScalar) * m_size);
-
-    int block_size = 128;
-    int grid_size = m_size / block_size + 1;
-
-    copyValues<InputScalar, RealScalar, ComplexScalar, TensorEvaluator<ArgType, GpuDevice> > <<<grid_size, block_size>>>(data_buf, m_impl, m_size);
-
-    for (int i = 0; i < m_fft.size(); ++i) {
-      int dim = m_fft[i];
-      eigen_assert(dim >= 0 && dim < NumDims);
-      int line_len = m_dimensions[dim];
-      ComplexScalar* line_buf;
-      cudaMalloc((void**) &line_buf, sizeof(ComplexScalar) * line_len);
-
-      cufftHandle plan;
-      cufftPlan1d(&plan, line_len, CUFFT_C2C, 1);
-
-      for (Index partial_index = 0; partial_index < m_size/line_len; ++partial_index) {
-        array<Index, NumDims> coords = getPartialCoordsFromIndex(partial_index, dim);
-        // get data into line_buf
-        int block_size = 128;
-        int grid_size = line_len / block_size + 1;
-        fillLineBuf<ComplexScalar, Index, NumDims> <<<grid_size, block_size>>>(line_buf, data_buf, line_len, coords, m_strides, dim);
-
-        if(FFTDir == Eigen::FFT_FORWARD) {
-          cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(line_buf), reinterpret_cast<cufftComplex*>(line_buf), CUFFT_FORWARD);
-        }
-        else {
-          cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(line_buf), reinterpret_cast<cufftComplex*>(line_buf), CUFFT_INVERSE);
-        }
-        // write back
-        RealScalar div_factor = (FFTDir == FFT_FORWARD) ? 1.0 : line_len;
-        writebackLineBuf<ComplexScalar, RealScalar, Index, NumDims> <<<grid_size, block_size>>>(line_buf, data_buf, line_len, coords, m_strides, dim, div_factor);
-        cudaDeviceSynchronize();
-
-      }
-      cufftDestroy(plan);
-      cudaFree(line_buf);
-    }
-    writeToDeviceData<OutputScalar, RealScalar, ComplexScalar, FFTResultType>()(d_data, data_buf, m_size);
-    cudaFree(data_buf);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if(m_data_buf != NULL) cudaFree(m_data_buf);
-    m_impl.cleanup();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
-    return m_data_buf[index];
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
-    return internal::ploadt<PacketReturnType, LoadMode>(m_data_buf + index);
-  }
-
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data_buf; }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
-  costPerCoeff(bool vectorized) const {
-    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
-  }
-
- protected:
-  int m_size;
-  const FFT& m_fft;
-  Dimensions m_dimensions;
-  array<Index, NumDims> m_strides;
-  TensorEvaluator<ArgType, GpuDevice> m_impl;
-  OutputScalar* m_data_buf;
-
-};
-#endif
-
 }  // end namespace Eigen
-#endif //EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 9382403..ca39bb8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h

@@ -33,20 +33,24 @@
     typedef typename internal::traits<Self>::StorageKind StorageKind;
     typedef typename internal::traits<Self>::Index Index;
     typedef Scalar_ Scalar;
-    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
     static const int Options = Options_;
 
     enum {
-      IsAligned = bool(EIGEN_ALIGN),
+      IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
       PacketAccess = (internal::packet_traits<Scalar>::size > 1),
       BlockAccess = false,
+      PreferBlockAccess = false,
       Layout = Options_ & RowMajor ? RowMajor : ColMajor,
       CoordAccess = true,
       RawAccess = true
-   };
+    };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   typedef Dimensions_ Dimensions;
   static const std::size_t NumIndices = Dimensions::count;
@@ -55,7 +59,7 @@
   TensorStorage<Scalar, Dimensions, Options> m_storage;
 
   public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                      rank()                   const { return NumIndices; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                    rank()                   const { return NumIndices; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                    dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&        dimensions()             const { return m_storage.dimensions(); }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index                    size()                   const { return m_storage.size(); }
@@ -68,13 +72,13 @@
     inline Self& base()             { return *this; }
     inline const Self& base() const { return *this; }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return coeff(array<Index, NumIndices>{firstIndex, otherIndices...});
+      return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
     }
 #endif
 
@@ -86,26 +90,27 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& coeff() const
-    {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return m_storage.data()[0];
-    }
-
-    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_storage.data()[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeff() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return m_storage.data()[0];
+    }
+
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return coeffRef(array<Index, NumIndices>{firstIndex, otherIndices...});
+      return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
     }
 #endif
 
@@ -117,29 +122,75 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& coeffRef()
-    {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return m_storage.data()[0];
-    }
-
-    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_storage.data()[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return m_storage.data()[0];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return this->operator()(array<Index, NumIndices>{firstIndex, otherIndices...});
+      return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    {
+      if (Options&RowMajor) {
+        const Index index = i1 + i0 * m_storage.dimensions()[1];
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + i1 * m_storage.dimensions()[0];
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    {
+      if (Options&RowMajor) {
+         const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+         return m_storage.data()[index];
+      } else {
+         const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    {
+      if (Options&RowMajor) {
+        const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    {
+      if (Options&RowMajor) {
+        const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+        return m_storage.data()[index];
+      }
     }
 #endif
 
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
     {
@@ -148,13 +199,6 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()() const
-    {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return coeff();
-    }
-
-    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
@@ -162,6 +206,13 @@
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()() const
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeff();
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
     {
       // The bracket operator is only for vectors, use the parenthesis operator instead.
@@ -169,13 +220,58 @@
       return coeff(index);
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return operator()(array<Index, NumIndices>{firstIndex, otherIndices...});
+      return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+    }
+#else
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    {
+       if (Options&RowMajor) {
+         const Index index = i1 + i0 * m_storage.dimensions()[1];
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + i1 * m_storage.dimensions()[0];
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    {
+       if (Options&RowMajor) {
+         const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+        return m_storage.data()[index];
+      } else {
+         const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    {
+      if (Options&RowMajor) {
+        const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+        return m_storage.data()[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    {
+      if (Options&RowMajor) {
+        const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+        return m_storage.data()[index];
+      } else {
+        const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+        return m_storage.data()[index];
+      }
     }
 #endif
 
@@ -187,13 +283,6 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()()
-    {
-      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return coeffRef();
-    }
-
-    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator()(Index index)
     {
       eigen_assert(index >= 0 && index < size());
@@ -201,6 +290,13 @@
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()()
+    {
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return coeffRef();
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator[](Index index)
     {
       // The bracket operator is only for vectors, use the parenthesis operator instead
@@ -208,7 +304,11 @@
       return coeffRef(index);
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() { }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorFixedSize()
+      : m_storage()
+    {
+    }
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
@@ -216,7 +316,7 @@
     {
     }
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
       : m_storage(other.m_storage)
     {
@@ -231,29 +331,20 @@
       Assign assign(*this, other.derived());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
+    EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other)
     {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
+      typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
     }
 
+    // FIXME: check that the dimensions of other match the dimensions of *this.
+    // Unfortunately this isn't possible yet when the rhs is an expression.
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
+
+
   protected:
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index ddabb64..e800ded 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h

@@ -32,9 +32,10 @@
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 
   enum {
-    Flags = 0,
+    Flags = 0
   };
 };
 
@@ -55,7 +56,7 @@
 
 
 template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -76,51 +77,113 @@
     typename XprType::Nested m_xpr;
 };
 
+namespace internal {
+template <typename Device, typename CoeffReturnType>
+struct non_integral_type_placement_new{
+  template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
+   // Initialize non-trivially constructible types.
+    if (!internal::is_arithmetic<CoeffReturnType>::value) {
+      for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
+    }
+}
+};
 
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
+// SYCL does not support non-integral types 
+// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices 
+// no matching function for call to 'operator new'
+template <typename CoeffReturnType>
+struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
+  template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
+}
+};
+} // end namespace internal
+
+template<typename ArgType_, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
 {
+  typedef const typename internal::remove_all<ArgType_>::type ArgType;
   typedef TensorForcedEvalOp<ArgType> XprType;
   typedef typename ArgType::Scalar Scalar;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
-
-  enum {
-    IsAligned = true,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = true
-  };
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
-  { }
-
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<CoeffReturnType>::value,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = true
+  };
+
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_op(op.expression()),
+      m_device(device), m_buffer(NULL)
+  { }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
-    const Index numValues = internal::array_prod(m_impl.dimensions());
-    m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
-    // Should initialize the memory in case we're dealing with non POD types.
-    if (!internal::is_arithmetic<CoeffReturnType>::value) {
-      for (Index i = 0; i < numValues; ++i) {
-        new(m_buffer+i) CoeffReturnType();
-      }
-    }
-    typedef TensorEvalToOp<const ArgType> EvalTo;
-    EvalTo evalToTmp(m_buffer, m_op);
-    const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
-    const bool BlockAccess = false;
-    internal::TensorExecutor<const EvalTo, Device, PacketAccess, BlockAccess>::run(evalToTmp, m_device);
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    const Index numValues =  internal::array_prod(m_impl.dimensions());
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+
+   internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
+
+    typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    internal::TensorExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        run(evalToTmp, m_device);
+
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_device.deallocate(m_buffer);
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    const Index numValues = internal::array_prod(m_impl.dimensions());
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
+        numValues * sizeof(CoeffReturnType)));
+    typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
+        EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
+                             std::move(done));
+    internal::TensorAsyncExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        decltype(on_done),
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        runAsync(evalToTmp, m_device, std::move(on_done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_device.deallocate_temp(m_buffer);
     m_buffer = NULL;
   }
 
@@ -135,17 +198,37 @@
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_buffer != NULL);
+    return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EvaluatorPointerType data() const { return m_buffer; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_buffer.bind(cgh);
+    m_impl.bind(cgh);
+  }
+#endif
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_buffer;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_buffer;
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index b6ac725..246ebe4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h

@@ -12,28 +12,62 @@
 
 namespace Eigen {
 
+// MakePointer class is used as a container of the address space of the pointer
+// on the host and on the device. From the host side it generates the T* pointer
+// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
+// T* m_data on the host. It is always called on the device.
+// Specialisation of MakePointer class for creating the sycl buffer with
+// map_allocator.
+template<typename T> struct MakePointer {
+  typedef T* Type;
+  typedef const T* ConstType;
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
+  return const_cast<T*>(data);
+}
+
+// The StorageMemory class is a container of the device specific pointer
+// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression
+// is a device-agnostic type and need MakePointer class for type conversion,
+// the TensorEvaluator class can be specialized for a device, hence it is possible
+// to construct different types of temproray storage memory in TensorEvaluator
+// for different devices by specializing the following StorageMemory class.
+template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
+
+namespace internal{
+template<typename A, typename B> struct Pointer_type_promotion {
+  static const bool val=false;
+};
+template<typename A> struct Pointer_type_promotion<A, A> {
+  static const bool val = true;
+};
+template<typename A, typename B> struct TypeConversion {
+  typedef A* type;
+};
+}
+
+
+template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
 template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
-template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
 template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
 
 template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
 template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
 template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
 template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
-template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
 template<typename XprType> class TensorIndexTupleOp;
 template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
 template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
-template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
 template<typename TargetType, typename XprType> class TensorConversionOp;
 template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
 template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
-template<typename IFFT, typename XprType, int ResultType> class TensorIFFTOp;
-template<typename DFT, typename XprType, int ResultType> class TensorDFTOp;
-template<typename IDFT, typename XprType, int ResultType> class TensorIDFTOp;
 template<typename PatchDim, typename XprType> class TensorPatchOp;
 template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
@@ -43,7 +77,6 @@
 template<typename XprType> class TensorLayoutSwapOp;
 template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
 template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
-template<typename XprType> class TensorTrueIndicesOp;
 template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
 template<typename Shuffle, typename XprType> class TensorShufflingOp;
 template<typename Strides, typename XprType> class TensorStridingOp;
@@ -52,21 +85,51 @@
 template<typename Generator, typename XprType> class TensorGeneratorOp;
 template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
 template<typename Op, typename XprType> class TensorScanOp;
+template<typename Dims, typename XprType> class TensorTraceOp;
 
 template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
 
-template<typename XprType> class TensorEvalToOp;
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
 template<typename XprType> class TensorForcedEvalOp;
 
 template<typename ExpressionType, typename DeviceType> class TensorDevice;
+template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
 template<typename Derived, typename Device> struct TensorEvaluator;
 
-class DefaultDevice;
-class ThreadPoolDevice;
-class GpuDevice;
+struct NoOpOutputKernel;
 
-enum DFTResultType {
+struct DefaultDevice;
+struct ThreadPoolDevice;
+struct GpuDevice;
+struct SyclDevice;
+
+#ifdef EIGEN_USE_SYCL
+
+template <typename T> struct MakeSYCLPointer {
+  typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
+};
+
+template <typename T>
+EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
+constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
+  return data;
+}
+
+template <typename T>
+struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
+template <typename T>
+struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
+
+namespace TensorSycl {
+namespace internal{
+template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
+}
+}
+#endif
+
+
+enum FFTResultType {
   RealPart = 0,
   ImagPart = 1,
   BothParts = 2
@@ -77,7 +140,9 @@
     FFT_REVERSE = 1
 };
 
+
 namespace internal {
+
 template <typename Device, typename Expression>
 struct IsVectorizable {
   static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
@@ -89,15 +154,36 @@
                             TensorEvaluator<Expression, GpuDevice>::IsAligned;
 };
 
+// Tiled evaluation strategy.
+enum TiledEvaluation {
+  Off = 0,    // tiled evaluation is not supported
+  On = 1,     // still work in progress (see TensorBlock.h)
+};
+
 template <typename Device, typename Expression>
 struct IsTileable {
-  static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
+  // Check that block evaluation is supported and it's a preferred option (at
+  // least one sub-expression has much faster block evaluation, e.g.
+  // broadcasting).
+  static const bool BlockAccess =
+      TensorEvaluator<Expression, Device>::BlockAccess &&
+      TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+  static const TiledEvaluation value =
+      BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
 };
 
 template <typename Expression, typename Device,
-          bool Vectorizable = IsVectorizable<Device, Expression>::value,
-          bool Tileable = IsTileable<Device, Expression>::value>
+          bool Vectorizable      = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
 class TensorExecutor;
+
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorAsyncExecutor;
+
+
 }  // end namespace internal
 
 }  // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index eb422da..d963032 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h

@@ -15,32 +15,58 @@
 
 
 /** \internal
+ * \brief Template functor to compute the modulo between an array and a scalar.
+ */
+template <typename Scalar>
+struct scalar_mod_op {
+  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  const Scalar m_divisor;
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod_op<Scalar> >
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
+
+
+/** \internal
  * \brief Template functor to compute the modulo between 2 arrays.
  */
 template <typename Scalar>
 struct scalar_mod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
+template <typename Scalar>
+struct scalar_fmod_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return numext::fmod(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
+         PacketAccess = false };
+};
 
 template<typename Reducer, typename Device>
 struct reducer_traits {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasAdd;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_sum_op<T> sum_op;
     *accum = sum_op(*accum, t);
@@ -67,7 +93,8 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return saccum + predux(vaccum);
+    internal::scalar_sum_op<T> sum_op;
+    return sum_op(saccum, predux(vaccum));
   }
 };
 
@@ -75,16 +102,15 @@
 struct reducer_traits<SumReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd,
+    IsStateful = false,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
   };
 };
 
-
 template <typename T> struct MeanReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
-  static const bool IsStateful = true;
-
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
@@ -107,15 +133,20 @@
     return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
-    return accum / scalarCount_;
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(accum, T(scalarCount_));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
-    return pdiv(vaccum, pset1<Packet>(packetCount_));
+    return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
+    internal::scalar_sum_op<T> sum_op;
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(
+        sum_op(saccum, predux(vaccum)),
+        T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
   }
 
   protected:
@@ -127,80 +158,53 @@
 struct reducer_traits<MeanReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd &&
+                   PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
+    IsStateful = true,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
   };
 };
 
 
-struct AndReducer
-{
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
-    *accum = *accum && t;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
-    return true;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
-    return accum;
-  }
-};
-
-struct OrReducer {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
-    *accum = *accum || t;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
-    return false;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
-    return accum;
-  }
-};
-
 template <typename T, bool IsMax = true, bool IsInteger = true>
 struct MinMaxBottomValue {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::lowest();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, true, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return -Eigen::NumTraits<T>::infinity();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::highest();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::infinity();
   }
 };
 
-template <typename T> struct MaxReducer
-{
-  static const bool PacketAccess = packet_traits<T>::HasMax;
-  static const bool IsStateful = false;
 
+template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer
+{
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t > *accum) { *accum = t; }
+    scalar_max_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
-    (*accum) = pmax<Packet>(*accum, p);
+    scalar_max_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+    return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -215,34 +219,34 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return numext::maxi(saccum, predux_max(vaccum));
+    scalar_max_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
   }
 };
 
-template <typename T, typename Device>
-struct reducer_traits<MaxReducer<T>, Device> {
+template <typename T, typename Device, int NaNPropagation>
+    struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMax
+    PacketAccess = PacketType<T, Device>::HasMax,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation!=PropagateFast)
   };
 };
 
-
-template <typename T> struct MinReducer
+template <typename T, int NaNPropagation=PropagateFast> struct MinReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMin;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t < *accum) { *accum = t; }
+    scalar_min_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
-    (*accum) = pmin<Packet>(*accum, p);
+    scalar_min_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+    return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -257,32 +261,31 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return numext::mini(saccum, predux_min(vaccum));
+    scalar_min_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
   }
 };
 
-template <typename T, typename Device>
-struct reducer_traits<MinReducer<T>, Device> {
+template <typename T, typename Device, int NaNPropagation>
+    struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMin
+    PacketAccess = PacketType<T, Device>::HasMin,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation!=PropagateFast)
   };
 };
 
-
 template <typename T> struct ProdReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMul;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    (*accum) *= t;
+    internal::scalar_product_op<T> prod_op;
+    (*accum) = prod_op(*accum, t);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmul<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     internal::scalar_cast_op<int, T> conv;
     return conv(1);
@@ -300,7 +303,8 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return saccum * predux_mul(vaccum);
+    internal::scalar_product_op<T> prod_op;
+    return prod_op(saccum, predux_mul(vaccum));
   }
 };
 
@@ -308,7 +312,113 @@
 struct reducer_traits<ProdReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::MulCost,
-    PacketAccess = PacketType<T, Device>::HasMul
+    PacketAccess = PacketType<T, Device>::HasMul,
+    IsStateful = false,
+    IsExactlyAssociative = true
+  };
+};
+
+
+struct AndReducer
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
+    *accum = *accum && t;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
+    return accum;
+  }
+};
+
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
+  };
+};
+
+
+struct OrReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
+    *accum = *accum || t;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
+    return false;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
+    return accum;
+  }
+};
+
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
+  };
+};
+
+// Argmin/Argmax reducers.  Returns the first occurrence if multiple locations
+// contain the same min/max value.
+template <typename T> struct ArgMaxTupleReducer
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    if (t.second < accum->second) {
+      return;
+    } else if (t.second > accum->second || accum->first > t.first ) {
+      *accum = t;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return T(0, NumTraits<typename T::second_type>::lowest());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
+    return accum;
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
+  };
+};
+
+
+template <typename T> struct ArgMinTupleReducer
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
+    if (t.second > accum->second) {
+      return;
+    } else if (t.second < accum->second || accum->first > t.first) {
+      *accum = t;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return T(0, NumTraits<typename T::second_type>::highest());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
+    return accum;
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
@@ -320,15 +430,18 @@
 
   EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
                                       const array<T, NumDims>& std_devs)
-      : m_means(means) {
-    for (int i = 0; i < NumDims; ++i) {
+      : m_means(means)
+  {
+    EIGEN_UNROLL_LOOP
+    for (size_t i = 0; i < NumDims; ++i) {
       m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
     }
   }
 
-  T operator()(const array<Index, NumDims>& coordinates) const {
+  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
-    for (int i = 0; i < NumDims; ++i) {
+    EIGEN_UNROLL_LOOP
+    for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];
       tmp += offset * offset / m_two_sigmas[i];
     }
@@ -350,57 +463,24 @@
   };
 };
 
-template <typename T> struct ArgMaxTupleReducer
-{
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t.second > accum->second) { *accum = t; }
+template <typename Scalar>
+struct scalar_clamp_op {
+  EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    return numext::mini(numext::maxi(x, m_min), m_max);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return T(0, NumTraits<typename T::second_type>::lowest());
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x) const {
+    return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
-    return accum;
-  }
+  const Scalar m_min;
+  const Scalar m_max;
 };
-
-
-template <typename T, typename Device>
-struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
-  enum {
-    Cost = NumTraits<T>::AddCost,
-    PacketAccess = false
-  };
-};
-
-
-template <typename T> struct ArgMinTupleReducer
-{
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
-    if (t.second < accum->second) { *accum = t; }
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return T(0, NumTraits<typename T::second_type>::highest());
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
-    return accum;
-  }
-};
-
-
-template <typename T, typename Device>
-struct reducer_traits<ArgMinTupleReducer<T>, Device> {
-  enum {
-    Cost = NumTraits<T>::AddCost,
-    PacketAccess = false
-  };
-};
-
+template<typename Scalar>
+struct functor_traits<scalar_clamp_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
 
 } // end namespace internal
 } // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 47175bc..174bf06 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h

@@ -12,7 +12,7 @@
 
 namespace Eigen {
 
-/** \class TensorGenerator
+/** \class TensorGeneratorOp
   * \ingroup CXX11_Tensor_Module
   *
   * \brief Tensor generator class.
@@ -25,13 +25,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Generator, typename XprType>
@@ -55,10 +55,8 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
@@ -88,46 +86,60 @@
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
   typedef typename XprType::Scalar Scalar;
-
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = true,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_generator(op.generator())
-  {
-    TensorEvaluator<ArgType, Device> impl(op.expression(), device);
-    m_dimensions = impl.dimensions();
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
 
-    if (NumDims > 0) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_strides[0] = 1;
-        for (int i = 1; i < NumDims; ++i) {
-          m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
-        }
-      } else {
-        m_strides[NumDims - 1] = 1;
-        for (int i = NumDims - 2; i >= 0; --i) {
-          m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
-        }
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      :  m_device(device), m_generator(op.generator())
+  {
+    TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
+    m_dimensions = argImpl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
@@ -140,11 +152,11 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -152,38 +164,136 @@
     return rslt;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    // TODO(rmlarsen): This is just a placeholder. Define interface to make generators return their cost.
-    return TensorOpCost(
-        0, 0, TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.firstLevelCacheSize();
+    // TODO(ezhulenev): Generator should have a cost.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+        target_size);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    static const bool is_col_major =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    // Compute spatial coordinates for the first block element.
+    array<Index, NumDims> coords;
+    extract_coordinates(desc.offset(), coords);
+    array<Index, NumDims> initial_coords = coords;
+
+    // Offset in the output block buffer.
+    Index offset = 0;
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = is_col_major ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
+      it[i].span = it[i].stride * (it[i].size - 1);
+      it[i].count = 0;
+    }
+    eigen_assert(it[0].stride == 1);
+
+    // Prepare storage for the materialized generator result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    static const int packet_size = PacketType<CoeffReturnType, Device>::size;
+
+    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+    const Index inner_dim_size = it[0].size;
+    const Index inner_dim_vectorized = inner_dim_size - packet_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      Index i = 0;
+      // Generate data for the vectorized part of the inner-most dimension.
+      for (; i <= inner_dim_vectorized; i += packet_size) {
+        for (Index j = 0; j < packet_size; ++j) {
+          array<Index, NumDims> j_coords = coords;  // Break loop dependence.
+          j_coords[inner_dim] += j;
+          *(block_buffer + offset + i + j) = m_generator(j_coords);
+        }
+        coords[inner_dim] += packet_size;
+      }
+      // Finalize non-vectorized part of the inner-most dimension.
+      for (; i < inner_dim_size; ++i) {
+        *(block_buffer + offset + i) = m_generator(coords);
+        coords[inner_dim]++;
+      }
+      coords[inner_dim] = initial_coords[inner_dim];
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if (NumDims == 1) break;
+
+      // Update offset.
+      for (i = 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          offset += it[i].stride;
+          coords[is_col_major ? i : NumDims - 1 - i]++;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        coords[is_col_major ? i : NumDims - 1 - i] =
+            initial_coords[is_col_major ? i : NumDims - 1 - i];
+        offset -= it[i].span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool) const {
+    // TODO(rmlarsen): This is just a placeholder. Define interface to make
+    // generators return their cost.
+    return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
+                                  TensorOpCost::MulCost<Scalar>());
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType  data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
-    if (NumDims > 0) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        for (int i = NumDims - 1; i > 0; --i) {
-          const Index idx = index / m_strides[i];
-          index -= idx * m_strides[i];
-          coords[i] = idx;
-        }
-        coords[0] = index;
-      } else {
-        for (int i = 0; i < NumDims - 1; ++i) {
-          const Index idx = index / m_strides[i];
-          index -= idx * m_strides[i];
-          coords[i] = idx;
-        }
-        coords[NumDims-1] = index;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
       }
+      coords[0] = index;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[NumDims-1] = index;
     }
   }
 
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fast_strides;
   Generator m_generator;
 };
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
new file mode 100644
index 0000000..cb53ce2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h

@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
+// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
+// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by 
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and 
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+// gpu_assert can be overridden
+#ifndef gpu_assert
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
+#endif
+
+#endif // gpu_assert
+
+#endif  // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
new file mode 100644
index 0000000..1d142f2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h

@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+
+#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t 
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 53dc0b0..a901c5d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h

@@ -13,38 +13,61 @@
 namespace Eigen {
 
 namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
-    : significant_decimals_default_impl<std::string, true>
-{};
-}
 
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      static const int layout = Tensor::Layout;
+      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+      os << matrix;
+    }
+  }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+      os << array;
+    }
+  }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    os << tensor.coeff(0);
+  }
+};
+}
 
 template <typename T>
 std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+  typedef typename Evaluator::Dimensions Dimensions;
+
   // Evaluate the expression if needed
   TensorForcedEvalOp<const T> eval = expr.eval();
-  TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+  Evaluator tensor(eval, DefaultDevice());
   tensor.evalSubExprsIfNeeded(NULL);
 
-  typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
-  typedef typename T::Index Index;
-  typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
-  const Index total_size = internal::array_prod(tensor.dimensions());
-
-  // Print the tensor as a 1d vector or a 2d matrix.
+  // Print the result
   static const int rank = internal::array_size<Dimensions>::value;
-  if (rank == 0) {
-    os << tensor.coeff(0);
-  } else if (rank == 1) {
-    Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
-    os << array;
-  } else {
-    const Index first_dim = tensor.dimensions()[0];
-    static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
-    Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
-    os << matrix;
-  }
+  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
 
   // Cleanup.
   tensor.cleanup();

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index ac3a63b..dd51850 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h

@@ -27,18 +27,19 @@
   * patch_cols, and 1 for all the additional dimensions.
   */
 namespace internal {
+
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
 {
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -79,8 +80,8 @@
       Scalar* dst_data, const Index src_index) {
     const Impl& impl = self.impl();
     const Index packet_size = internal::unpacket_traits<Packet>::size;
-    const Index vectorized_size = (num_coeff_to_copy / packet_size) *
-        packet_size;
+    const Index vectorized_size =
+        (num_coeff_to_copy / packet_size) * packet_size;
     for (Index i = 0; i < vectorized_size; i += packet_size) {
       Packet p = impl.template packet<Unaligned>(src_index + i);
       internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
@@ -101,8 +102,8 @@
       const Index dst_index, Scalar* dst_data) {
     const Index packet_size = internal::unpacket_traits<Packet>::size;
     const Packet padded_packet = internal::pset1<Packet>(padding_value);
-    const Index vectorized_size = (num_coeff_to_pad / packet_size) *
-        packet_size;
+    const Index vectorized_size =
+        (num_coeff_to_pad / packet_size) * packet_size;
     for (Index i = 0; i < vectorized_size; i += packet_size) {
       internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
                                                    padded_packet);
@@ -120,10 +121,8 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
@@ -133,12 +132,12 @@
                                                            DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                            DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                            PaddingType padding_type, Scalar padding_value)
-      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-        m_padding_type(padding_type), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+                                                           m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex row_strides, DenseIndex col_strides,
@@ -147,13 +146,14 @@
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-        m_padding_left(padding_left), m_padding_right(padding_right),
-        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+                                                           m_padding_left(padding_left), m_padding_right(padding_right),
+                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_rows() const { return m_patch_rows; }
@@ -219,29 +219,33 @@
   static const int NumDims = NumInputDims + 1;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const Index PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
                           Device> Self;
   typedef TensorEvaluator<ArgType, Device> Impl;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = true,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = NumDims == 5,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = false
   };
 
-  typedef typename internal::TensorBlock<Index, Scalar, NumDims, Layout>
-    OutputTensorBlock;
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
+      : m_device(device), m_impl(op.expression(), device)
   {
-    EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     m_paddingValue = op.padding_value();
 
@@ -305,9 +309,15 @@
           // Calculate the padding
           m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
           m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
+          // The padding size calculation for PADDING_SAME has been updated to
+          // be consistent with how TensorFlow extracts its paddings.
+          m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
+          m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
           break;
         default:
           eigen_assert(false && "unexpected padding");
+          m_outputCols=0; // silence the uninitialised warning;
+          m_outputRows=0; //// silence the uninitialised warning;
       }
     }
     eigen_assert(m_outputRows > 0);
@@ -375,20 +385,24 @@
     } else {
       m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
     }
-
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                          device.lastLevelCacheSize() /
-                                          sizeof(Scalar));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -407,7 +421,6 @@
     const Index colIndex = patch2DIndex / m_fastOutputRows;
     const Index colOffset = patchOffset / m_fastColStride;
     const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
-
     const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
     if (inputCol < 0 || inputCol >= m_input_cols_eff ||
         ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
@@ -418,7 +431,6 @@
     const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
     const Index rowOffset = patchOffset - colOffset * m_colStride;
     const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
-
     const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
     if (inputRow < 0 || inputRow >= m_input_rows_eff ||
         ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
@@ -435,7 +447,7 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
@@ -491,223 +503,45 @@
     return packetWithPossibleZero(index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    resources->push_back(internal::TensorOpResourceRequirements(
-        internal::kSkewedInnerDims, m_block_total_size_max));
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
   }
+#endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
-      OutputTensorBlock* output_block) const {
-    typedef typename internal::ImagePatchCopyOp<Self, PacketAccess>
-        ImagePatchCopyOp;
-    typedef typename internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
 
-    // Calculate loop limits and various input/output dim sizes.
-    const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes();
-    const bool col_major =
-        static_cast<int>(Layout) == static_cast<int>(ColMajor);
-    const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1];
-    const Index output_depth_dim_size = m_dimensions[
-        col_major ? 0 : NumDims - 1];
-    const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2];
-    const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2];
-    const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3];
-    const Index block_col_stride = row_dim_size * depth_dim_size;
-    const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4];
-    const Index outer_dim_size = block_sizes.TotalSize() /
-        (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size);
-
-    const Index patch_size = row_dim_size * col_dim_size * depth_dim_size;
-    const Index batch_size = patch_size * patch_index_dim_size;
-
-    Index output_index = output_block->first_coeff_index();
-
-    // Loop through outer dimensions.
-    for (Index outer_dim_index = 0;
-         outer_dim_index < outer_dim_size;
-         ++outer_dim_index) {
-      const Index outer_output_base_index = outer_dim_index * batch_size;
-      // Find the offset of the element wrt the location of the first element.
-      const Index patchIndexStart = output_index / m_fastPatchStride;
-      const Index patchOffset =
-          (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth;
-      const Index colOffsetStart = patchOffset / m_fastColStride;
-      // Other ways to index this element.
-      const Index otherIndex = (NumDims == 4) ?
-          0 : output_index / m_fastOtherStride;
-      const Index patch2DIndexStart = (NumDims == 4) ?
-          0 : (output_index - otherIndex * m_otherStride) / m_fastPatchStride;
-      // Calculate starting depth index.
-      const Index depth = output_index - (output_index / m_fastOutputDepth) *
-          output_depth_dim_size;
-      const Index patch_input_base_index = depth + otherIndex *
-          m_patchInputStride;
-
-      // Loop through patches.
-      for (Index patch_index_dim_index = 0;
-           patch_index_dim_index < patch_index_dim_size;
-           ++patch_index_dim_index) {
-        const Index patch_output_base_index = outer_output_base_index +
-            patch_index_dim_index * patch_size;
-        // Patch index corresponding to the passed in index.
-        const Index patchIndex = patchIndexStart + patch_index_dim_index;
-        const Index patch2DIndex = (NumDims == 4) ?
-            patchIndex : patch2DIndexStart + patch_index_dim_index;
-        const Index colIndex = patch2DIndex / m_fastOutputRows;
-        const Index input_col_base = colIndex * m_col_strides;
-        const Index row_offset_base = (patch2DIndex - colIndex * m_outputRows) *
-            m_row_strides - m_rowPaddingTop;
-
-        // Loop through columns.
-        for (Index col_dim_index = 0;
-             col_dim_index < col_dim_size;
-             ++col_dim_index) {
-          const Index col_output_base_index = patch_output_base_index +
-              col_dim_index * block_col_stride;
-
-          // Calculate col index in the input original tensor.
-          Index colOffset = colOffsetStart + col_dim_index;
-          Index inputCol = input_col_base + colOffset * m_in_col_strides -
-              m_colPaddingLeft;
-          Index origInputCol = (m_col_inflate_strides == 1) ?
-              inputCol : ((inputCol >= 0) ?
-                          (inputCol / m_fastInflateColStride) : 0);
-
-          bool pad_column = false;
-          if (inputCol < 0 || inputCol >= m_input_cols_eff ||
-              ((m_col_inflate_strides != 1) &&
-               (inputCol != origInputCol * m_col_inflate_strides))) {
-            pad_column = true;
-          }
-
-          const Index col_input_base_index = patch_input_base_index +
-              origInputCol * m_colInputStride;
-          const Index input_row_base = row_offset_base +
-              ((patchOffset + col_dim_index * output_row_dim_size) -
-               colOffset * m_colStride) * m_in_row_strides;
-          // Loop through rows.
-          for (Index row_dim_index = 0;
-               row_dim_index < row_dim_size;
-               ++row_dim_index) {
-            const Index output_base_index = col_output_base_index +
-                row_dim_index * depth_dim_size;
-            bool pad_row = false;
-            Index inputIndex;
-            if (!pad_column) {
-              Index inputRow = input_row_base + row_dim_index *
-                  m_in_row_strides;
-              Index origInputRow = (m_row_inflate_strides == 1) ?
-                  inputRow : ((inputRow >= 0) ?
-                              (inputRow / m_fastInflateRowStride) : 0);
-              if (inputRow < 0 || inputRow >= m_input_rows_eff ||
-                  ((m_row_inflate_strides != 1) &&
-                   (inputRow != origInputRow * m_row_inflate_strides))) {
-                pad_row = true;
-              } else {
-                inputIndex = col_input_base_index + origInputRow *
-                    m_rowInputStride;
-              }
-            }
-            // Copy (or pad) along depth dimension.
-            if (pad_column || pad_row) {
-              ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue),
-                                       output_base_index, output_block->data());
-            } else {
-              ImagePatchCopyOp::Run(*this, depth_dim_size,
-                                    output_base_index, output_block->data(),
-                                    inputIndex);
-            }
-          }
-        }
-      }
-      output_index += m_otherStride;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
-  Index rowPaddingTop() const { return m_rowPaddingTop; }
-  Index colPaddingLeft() const { return m_colPaddingLeft; }
-  Index outputRows() const { return m_outputRows; }
-  Index outputCols() const { return m_outputCols; }
-  Index userRowStride() const { return m_row_strides; }
-  Index userColStride() const { return m_col_strides; }
-  Index userInRowStride() const { return m_in_row_strides; }
-  Index userInColStride() const { return m_in_col_strides; }
-  Index rowInflateStride() const { return m_row_inflate_strides; }
-  Index colInflateStride() const { return m_col_inflate_strides; }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    // Location of the first element of the patch.
-    // ColMajor
-    // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches
-    // RowMajor
-    // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d
-    const Index patch2DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 1];
-
-    array<Index, NumDims-1> inputCoords;
-    Index input_col_idx = patch2DIndex / m_fastInputColsEff;
-    Index inputCol = input_col_idx  + coords[1] * m_in_row_strides - m_rowPaddingTop;
-    Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft;
-    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
-    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputCoords[0] = coords[0];  // depth
-      inputCoords[1] = origInputCol;
-      inputCoords[2] = origInputRow;
-      inputCoords[3] = coords[4];  // batch
-    } else {
-      inputCoords[3] = coords[4];  // depth
-      inputCoords[2] = origInputCol;
-      inputCoords[1] = origInputRow;
-      inputCoords[0] = coords[0];  // batch
-    }
-    // If the computed coordinates are outside the original image perimeter, return 0.
-    if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff ||
-        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) ||
-        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      return m_impl.coeff(inputCoords);
-    } else {
-      Index inputIndex;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        inputIndex =
-          inputCoords[3] * m_patchInputStride +
-          inputCoords[2] * m_colInputStride +
-          inputCoords[1] * m_rowInputStride +
-          inputCoords[0];
-      } else {
-        inputIndex =
-          inputCoords[1] * m_patchInputStride +
-          inputCoords[2] * m_colInputStride +
-          inputCoords[3] * m_rowInputStride +
-          inputCoords[4];
-      }
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
     // We conservatively estimate the cost for the code path where the computed
     // index is inside the original image and
     // TensorEvaluator<ArgType, Device>::CoordAccess is false.
     const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
                                 6 * TensorOpCost::MulCost<Index>() +
                                 8 * TensorOpCost::MulCost<Index>();
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -758,8 +592,8 @@
   internal::TensorIntDivisor<Index> m_fastOutputDepth;
 
   Scalar m_paddingValue;
-  std::size_t m_block_total_size_max;
 
+  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 8aefbff..2d8c7b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h

@@ -10,25 +10,8 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
 #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
 
-namespace Eigen {
 
-// Can't use std::pairs on cuda devices
-template <typename Index> struct IndexPair {
-  constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
-  constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) {}
-
-  EIGEN_DEVICE_FUNC void set(IndexPair<Index> val) {
-    first = val.first;
-    second = val.second;
-  }
-
-  Index first;
-  Index second;
-};
-
-}  // end namespace Eigen
-
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
 
 #define EIGEN_HAS_INDEX_LIST
 
@@ -54,125 +37,230 @@
   * \sa Tensor
   */
 
-template <DenseIndex n>
+template <Index n>
 struct type2index {
-  static const DenseIndex value = n;
-  constexpr operator DenseIndex() const { return n; }
-  void set(DenseIndex val) {
+  static const Index value = n;
+  EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
+  EIGEN_DEVICE_FUNC void set(Index val) {
     eigen_assert(val == n);
   }
 };
 
 // This can be used with IndexPairList to get compile-time constant pairs,
 // such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct type2indexpair {
-  static const DenseIndex first = f;
-  static const DenseIndex second = s;
+  static const Index first = f;
+  static const Index second = s;
 
-  constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
-    return IndexPair<DenseIndex>(f, s);
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
+    return IndexPair<Index>(f, s);
   }
 
-  EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+  EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
     eigen_assert(val.first == f);
     eigen_assert(val.second == s);
   }
 };
 
+
+template<Index n> struct NumTraits<type2index<n> >
+{
+  typedef Index Real;
+  enum {
+    IsComplex = 0,
+    RequireInitialization = false,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
+};
+
 namespace internal {
-
 template <typename T>
-void update_value(T& val, DenseIndex new_val) {
-  val = new_val;
+EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
+  val = internal::convert_index<T>(new_val);
 }
-template <DenseIndex n>
-void update_value(type2index<n>& val, DenseIndex new_val) {
+template <Index n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
   val.set(new_val);
 }
 
 template <typename T>
-void update_value(T& val, IndexPair<DenseIndex> new_val) {
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
   val = new_val;
 }
-template <DenseIndex f, DenseIndex s>
-void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
+template <Index f, Index s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
   val.set(new_val);
 }
 
+
 template <typename T>
 struct is_compile_time_constant {
   static constexpr bool value = false;
 };
 
-// Next four are is_compile_time_constant for type2index.
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<type2index<idx> > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<const type2index<idx> > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<type2index<idx>& > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<const type2index<idx>& > {
   static constexpr bool value = true;
 };
 
-// Next four are is_compile_time_constant for type2indexpair.
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<const type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<const type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
 
-template <DenseIndex Idx, typename ValueT>
+
+template<typename... T>
+struct IndexTuple;
+
+template<typename T, typename... O>
+struct IndexTuple<T, O...> {
+  EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { }
+  EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { }
+
+  constexpr static int count = 1 + sizeof...(O);
+  T head;
+  IndexTuple<O...> others;
+  typedef T Head;
+  typedef IndexTuple<O...> Other;
+};
+
+template<typename T>
+  struct IndexTuple<T> {
+  EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { }
+  EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { }
+
+  constexpr static int count = 1;
+  T head;
+  typedef T Head;
+};
+
+
+template<int N, typename... T>
+struct IndexTupleExtractor;
+
+template<int N, typename T, typename... O>
+struct IndexTupleExtractor<N, T, O...> {
+
+  typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType;
+
+  EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+    return IndexTupleExtractor<N-1, O...>::get_val(val.others);
+  }
+
+  EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+    return IndexTupleExtractor<N-1, O...>::get_val(val.others);
+  }
+  template <typename V>
+  EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+    IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val);
+  }
+
+};
+
+template<typename T, typename... O>
+  struct IndexTupleExtractor<0, T, O...> {
+
+  typedef T ValType;
+
+  EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+    return val.head;
+  }
+  EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+    return val.head;
+  }
+  template <typename V>
+  EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+    val.head = new_val;
+  }
+};
+
+
+
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
+  return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) {
+  return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <typename T, typename... O>
+  struct array_size<IndexTuple<T, O...> > {
+  static const size_t value = IndexTuple<T, O...>::count;
+};
+template <typename T, typename... O>
+  struct array_size<const IndexTuple<T, O...> > {
+  static const size_t value = IndexTuple<T, O...>::count;
+};
+
+
+
+
+template <Index Idx, typename ValueT>
 struct tuple_coeff {
   template <typename... T>
-  static constexpr ValueT get(const DenseIndex i, const std::tuple<T...>& t) {
-    return (i == Idx ? std::get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
+    //    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+    return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
   }
   template <typename... T>
-  static void set(const DenseIndex i, std::tuple<T...>& t, const ValueT value) {
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
     if (i == Idx) {
-      update_value(std::get<Idx>(t), value);
+      update_value(array_get<Idx>(t), value);
     } else {
       tuple_coeff<Idx-1, ValueT>::set(i, t, value);
     }
   }
 
   template <typename... T>
-  static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) {
-    return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) ||
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
+    return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
         tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
   }
 
   template <typename... T>
-  static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) {
-    return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value &&
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
+    return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
         tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
   }
 
   template <typename... T>
-  static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) {
-    return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value &&
-           is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value &&
-           std::get<Idx>(t) > std::get<Idx-1>(t) &&
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
+    return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           array_get<Idx>(t) > array_get<Idx-1>(t) &&
            tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
   }
 };
@@ -180,96 +268,114 @@
 template <typename ValueT>
 struct tuple_coeff<0, ValueT> {
   template <typename... T>
-  static constexpr ValueT get(const DenseIndex i, const std::tuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
     //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
-    return std::get<0>(t);
+    return array_get<0>(t)/* * (i == 0)*/;
   }
   template <typename... T>
-  static void set(const DenseIndex i, std::tuple<T...>& t, const ValueT value) {
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
     eigen_assert (i == 0);
-    update_value(std::get<0>(t), value);
+    update_value(array_get<0>(t), value);
   }
   template <typename... T>
-  static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) {
-    //    eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
-    return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0);
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
   }
 
   template <typename... T>
-  static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) {
-    return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value;
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
   }
 
   template <typename... T>
-  static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
     return true;
   }
 };
 }  // namespace internal
 
 
+
 template<typename FirstType, typename... OtherTypes>
-struct IndexList : std::tuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
   }
 
-  constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { }
-  constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { }
+  EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+  EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
+  EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
   }
-  constexpr bool all_values_known_statically() const {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
+  EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
   }
 
-  constexpr bool values_statically_known_to_increase() const {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
+  EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
   }
 };
 
-
-template<typename FirstType, typename... OtherTypes>
-constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
-  return std::make_tuple(val1, other_vals...);
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os,
+                         const IndexList<FirstType, OtherTypes...>& dims) {
+  os << "[";
+  for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
 }
 
 template<typename FirstType, typename... OtherTypes>
-struct IndexPairList : std::tuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
+  return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
+}
+
+
+template<typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
   }
 
-  constexpr IndexPairList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { }
-  constexpr IndexPairList() : std::tuple<FirstType, OtherTypes...>() { }
+  EIGEN_DEVICE_FUNC  constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+  EIGEN_DEVICE_FUNC  constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
   }
 };
 
 namespace internal {
 
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
-  size_t result = 1;
-  for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+template<typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+  Index result = 1;
+  EIGEN_UNROLL_LOOP
+  for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
     result *= sizes[i];
   }
   return result;
-};
+}
 
 template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
-  static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+  static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
 };
 template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
-  static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+  static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
 };
 
 template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
@@ -279,215 +385,226 @@
   static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
 };
 
-template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
-  return std::get<n>(a);
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
+  return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
-template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
-  return std::get<n>(a);
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
+  return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
 
 template <typename T>
-struct index_known_statically {
-  constexpr bool operator() (DenseIndex) const {
+struct index_known_statically_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_known_statically<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i) const {
+struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_known_statically<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i) const {
+struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
 
+
 template <typename T>
-struct all_indices_known_statically {
-  constexpr bool operator() () const {
+struct all_indices_known_statically_impl {
+  static constexpr bool run() {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() () const {
+struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() () const {
+struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
   }
 };
 
+
 template <typename T>
-struct indices_statically_known_to_increase {
-  constexpr bool operator() () const {
+struct indices_statically_known_to_increase_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() () const {
-    return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+  struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() () const {
-    return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+  struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
   }
 };
 
+
 template <typename Tx>
-struct index_statically_eq {
-  constexpr bool operator() (DenseIndex, DenseIndex) const {
+struct index_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_statically_eq<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] == value;
+struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] == value;
-  }
-};
-
-template <typename T>
-struct index_statically_ne {
-  constexpr bool operator() (DenseIndex, DenseIndex) const {
-  return false;
-  }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_ne<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] != value;
-  }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] != value;
+struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
 };
 
 
 template <typename T>
-struct index_statically_gt {
-  constexpr bool operator() (DenseIndex, DenseIndex) const {
-  return false;
+struct index_statically_ne_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+    return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_statically_gt<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] > value;
+struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] > value;
+struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
 };
 
+
 template <typename T>
-struct index_statically_lt {
-  constexpr bool operator() (DenseIndex, DenseIndex) const {
-  return false;
+struct index_statically_gt_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+    return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_statically_lt<IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] < value;
+struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexList<FirstType, OtherTypes...>()[i] < value;
+struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
 };
 
+
+
+template <typename T>
+struct index_statically_lt_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) < value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexList<FirstType, OtherTypes...>().get(i) < value);
+  }
+};
+
+
+
 template <typename Tx>
-struct index_pair_first_statically_eq {
-  constexpr bool operator() (DenseIndex, DenseIndex) const {
+struct index_pair_first_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_pair_first_statically_eq<IndexPairList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexPairList<FirstType, OtherTypes...>()[i].first == value;
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
-struct index_pair_first_statically_eq<const IndexPairList<FirstType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &&
-        IndexPairList<FirstType, OtherTypes...>()[i].first == value;
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
 };
 
+
+
 template <typename Tx>
-struct index_pair_second_statically_eq {
-  constexpr bool operator() (DenseIndex, DenseIndex) const {
+struct index_pair_second_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
-template <typename secondType, typename... OtherTypes>
-struct index_pair_second_statically_eq<IndexPairList<secondType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexPairList<secondType, OtherTypes...>().value_known_statically(i) &&
-        IndexPairList<secondType, OtherTypes...>()[i].second == value;
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
 };
 
-template <typename secondType, typename... OtherTypes>
-struct index_pair_second_statically_eq<const IndexPairList<secondType, OtherTypes...> > {
-  constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
-    return IndexPairList<secondType, OtherTypes...>().value_known_statically(i) &&
-        IndexPairList<secondType, OtherTypes...>()[i].second == value;
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
 };
 
+
 }  // end namespace internal
 }  // end namespace Eigen
 
@@ -496,59 +613,126 @@
 namespace Eigen {
 namespace internal {
 
-// No C++11 support
 template <typename T>
-struct index_known_statically {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{
+struct index_known_statically_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
     return false;
   }
 };
 
 template <typename T>
-struct all_indices_known_statically {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const {
+struct all_indices_known_statically_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
     return false;
   }
 };
 
 template <typename T>
-struct indices_statically_known_to_increase {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const {
+struct indices_statically_known_to_increase_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
     return false;
   }
 };
 
 template <typename T>
-struct index_statically_eq {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+struct index_statically_eq_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
-struct index_statically_ne {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+struct index_statically_ne_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
-struct index_statically_gt {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+struct index_statically_gt_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
-struct index_statically_lt {
-  EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+struct index_statically_lt_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+    return false;
+  }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+    return false;
+  }
+};
+
+
+
 }  // end namespace internal
 }  // end namespace Eigen
 
 #endif
 
+
+namespace Eigen {
+namespace internal {
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
+  return index_known_statically_impl<T>::run(i);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
+  return all_indices_known_statically_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
+  return indices_statically_known_to_increase_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
+  return index_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
+  return index_statically_ne_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
+  return index_statically_gt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
+  return index_statically_lt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
+  return index_pair_first_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
+  return index_pair_second_statically_eq_impl<T>::run(i, value);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+
 #endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index f80dc18..c5cb61a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h

@@ -25,13 +25,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
@@ -53,10 +53,8 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorInflationOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
@@ -86,19 +84,26 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_strides(op.strides())
   {
     m_dimensions = m_impl.dimensions();
@@ -132,11 +137,11 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -147,29 +152,31 @@
     eigen_assert(index < dimensions().TotalSize());
     *inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
-        if (idx != (idx / m_fastStrides[i]) * m_strides[i]) {
+        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
           return false;
         }
         *inputIndex += idx / m_strides[i] * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
-      if (index != (index / m_fastStrides[0]) * m_strides[0]) {
+      if (index != index / m_fastStrides[0] * m_strides[0]) {
         return false;
       }
       *inputIndex += index / m_strides[0];
       return true;
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
-        if (idx != (idx / m_fastStrides[i]) * m_strides[i]) {
+        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
           return false;
         }
         *inputIndex += idx / m_strides[i] * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
-      if (index != (index / m_fastStrides[NumDims-1]) * m_strides[NumDims-1]) {
+      if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) {
         return false;
       }
       *inputIndex += index / m_strides[NumDims - 1];
@@ -192,12 +199,12 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
     PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -217,7 +224,14 @@
                         compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 375c763..26a3818 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h

@@ -10,7 +10,7 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
 #define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
 
 #include <initializer_list>
 
@@ -32,7 +32,7 @@
                   Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
                   const InitList& vals) {
     int i = 0;
-    for (auto v : vals) {
+    for (const auto& v : vals) {
       (*indices)[traits<Derived>::NumDimensions - N] = i++;
       Initializer<Derived, N - 1>::run(tensor, indices, v);
     }
@@ -48,7 +48,7 @@
                   const InitList& vals) {
     int i = 0;
     // There is likely a faster way to do that than iterating.
-    for (auto v : vals) {
+    for (const auto& v : vals) {
       (*indices)[traits<Derived>::NumDimensions - 1] = i++;
       tensor.coeffRef(*indices) = v;
     }
@@ -56,17 +56,17 @@
 };
 
 template <typename Derived>
-struct Initializer<Derived, Dynamic> {
-  typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
+struct Initializer<Derived, 0> {
+  typedef typename traits<Derived>::Scalar InitList;
 
   static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
-                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
-                  const InitList& vals) {
-    // Static initialization not implemented for VarDims tensors.
-    eigen_assert(false);
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*,
+                  const InitList& v) {
+    tensor.coeffRef(0) = v;
   }
 };
 
+
 template <typename Derived, int N>
 void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
                        const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
@@ -79,4 +79,4 @@
 
 #endif  // EIGEN_HAS_VARIADIC_TEMPLATES
 
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index b103636..6d5cce4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h

@@ -21,7 +21,7 @@
   * \brief Fast integer division by a constant.
   *
   * See the paper from Granlund and Montgomery for explanation.
-  *   (at http://dx.doi.org/10.1145/773473.178249)
+  *   (at https://doi.org/10.1145/773473.178249)
   *
   * \sa Tensor
   */
@@ -29,43 +29,71 @@
 namespace internal {
 
 namespace {
+
   // Note: result is undefined if val == 0
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
   {
-#ifdef __CUDA_ARCH__
-    if (sizeof(T) == 8) {
-      return __clzll(val);
-    }
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clz(val);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC
-    DWORD leading_zeros = 0;
-    if (sizeof(T) == 8) {
-      _BitScanReverse64(&leading_zero, val);
-    }
-    else {
-      _BitScanReverse(&leading_zero, val);
-    }
+    unsigned long index;
+    _BitScanReverse(&index, val);
+    return 31 - index;
 #else
-    if (sizeof(T) == 8) {
-      return __builtin_clzl(static_cast<uint64_t>(val));
-    }
+    EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
     return __builtin_clz(static_cast<uint32_t>(val));
 #endif
   }
 
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+  {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    return __clzll(val);
+#elif defined(SYCL_DEVICE_ONLY)
+    return static_cast<int>(cl::sycl::clz(val));
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+    unsigned long index;
+    _BitScanReverse64(&index, val);
+    return 63 - index;
+#elif EIGEN_COMP_MSVC
+    // MSVC's _BitScanReverse64 is not available for 32bits builds.
+    unsigned int lo = (unsigned int)(val&0xffffffff);
+    unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+    int n;
+    if(hi==0)
+      n = 32 + count_leading_zeros<unsigned int>(lo);
+    else
+      n = count_leading_zeros<unsigned int>(hi);
+    return n;
+#else
+    EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return __builtin_clzll(static_cast<uint64_t>(val));
+#endif
+  }
+
+  template <typename T>
+  struct UnsignedTraits {
+    typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
+  };
 
   template <typename T>
   struct DividerTraits {
-    typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
+    typedef typename UnsignedTraits<T>::type type;
     static const int N = sizeof(T) * 8;
   };
 
-
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umulhi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
 #else
     return (static_cast<uint64_t>(a) * b) >> 32;
 #endif
@@ -73,9 +101,11 @@
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umul64hi(a, b);
-#elif defined(__SIZEOF_INT128__)
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
+#elif EIGEN_HAS_BUILTIN_INT128
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
 #else
@@ -87,23 +117,24 @@
   struct DividerHelper {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
       EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
+      return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
     }
   };
 
   template <typename T>
   struct DividerHelper<64, T> {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
-      return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
+#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+      return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
-      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+                                               - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+                                               + TensorUInt128<static_val<0>, static_val<1> >(1);
       return static_cast<uint64_t>(result);
 #endif
     }
   };
-
 }
 
 
@@ -121,14 +152,14 @@
   // the __uint128_t type.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
     const int N = DividerTraits<T>::N;
-    eigen_assert(divider < NumTraits<UnsignedType>::highest()/2);
+    eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2);
     eigen_assert(divider > 0);
 
     // fast ln2
     const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
     int log_div = N - leading_zeros;
     // if divider is a power of two then log_div is 1 more than it should be.
-    if ((1ull << (log_div-1)) == divider)
+    if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider))
       log_div--;
 
     multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
@@ -136,11 +167,11 @@
     shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
-  // Must have 0 <= numerator. On platforms that dont support the __uint128_t
+  // Must have 0 <= numerator. On platforms that don't support the __uint128_t
   // type numerator should also be less than 2^32-1.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
-    eigen_assert(numerator < NumTraits<UnsignedType>::highest()/2);
-    eigen_assert(numerator >= 0);
+    eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
+    //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
 
     UnsignedType t1 = muluh(multiplier, numerator);
     UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
@@ -157,6 +188,7 @@
 
 // Optimized version for signed 32 bit integers.
 // Derived from Hacker's Delight.
+// Only works for divisors strictly greater than one
 template <>
 class TensorIntDivisor<int32_t, true> {
  public:
@@ -171,8 +203,10 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return (__umulhi(magic, n) >> shift);
+#elif defined(SYCL_DEVICE_ONLY)
+    return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
 #else
     uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
     return (static_cast<uint32_t>(v >> 32) >> shift);
@@ -222,6 +256,7 @@
   return divisor.divide(numerator);
 }
 
+
 } // end namespace internal
 } // end namespace Eigen
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 9d5548b..80106c1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h

@@ -40,13 +40,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
-  static const int Layout = (static_cast<int>(traits<XprType>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor;
+  static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename XprType>
@@ -69,45 +69,24 @@
 class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
-  typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+    typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
-      : m_xpr(expr) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+        : m_xpr(expr) {}
 
-  EIGEN_DEVICE_FUNC
-  const typename internal::remove_all<typename XprType::Nested>::type&
-  expression() const { return m_xpr; }
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
-  {
-    typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(
-        assign, DefaultDevice());
-    return *this;
-  }
-
-  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
-  {
-    typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(
-        assign, DefaultDevice());
-    return *this;
-  }
-
- protected:
-  typename XprType::Nested m_xpr;
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
+  protected:
+    typename XprType::Nested m_xpr;
 };
 
 
@@ -124,15 +103,17 @@
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
-    Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) ==
-              static_cast<int>(ColMajor))
-                 ? RowMajor
-                 : ColMajor,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     for(int i = 0; i < NumDims; ++i) {
@@ -140,16 +121,25 @@
     }
   }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -168,7 +158,9 @@
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
@@ -190,23 +182,25 @@
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
-    Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) ==
-              static_cast<int>(ColMajor))
-                 ? RowMajor
-                 : ColMajor,
-    CoordAccess = false,  // to be implemented
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+    CoordAccess = false  // to be implemented
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
   }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
new file mode 100644
index 0000000..73ff3d2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h

@@ -0,0 +1,98 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+
+
+/** use this macro in sfinae selection in templated functions
+ *
+ *   template<typename T,
+ *            typename std::enable_if< isBanana<T>::value , int >::type = 0
+ *   >
+ *   void foo(){}
+ *
+ *   becomes =>
+ *
+ *   template<typename TopoType,
+ *           SFINAE_ENABLE_IF( isBanana<T>::value )
+ *   >
+ *   void foo(){}
+ */
+
+// SFINAE requires variadic templates
+#if !defined(EIGEN_GPUCC)
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  // SFINAE doesn't work for gcc <= 4.7
+  #ifdef EIGEN_COMP_GNUC
+    #if EIGEN_GNUC_AT_LEAST(4,8)
+      #define EIGEN_HAS_SFINAE
+    #endif
+  #else
+    #define EIGEN_HAS_SFINAE
+  #endif
+#endif
+#endif
+
+#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
+    typename internal::enable_if< ( __condition__ ) , int >::type = 0
+
+// Define a macro to use a reference on the host but a value on the device
+#if defined(SYCL_DEVICE_ONLY)
+  #define EIGEN_DEVICE_REF
+#else
+  #define EIGEN_DEVICE_REF &
+#endif
+
+// Define a macro for catching SYCL exceptions if exceptions are enabled
+#define EIGEN_SYCL_TRY_CATCH(X) \
+  do { \
+    EIGEN_TRY {X;} \
+    EIGEN_CATCH(const cl::sycl::exception& e) { \
+      EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
+                                       std::string(__FILE__) + ":" + \
+                                       std::to_string(__LINE__) + "\n" + \
+                                       e.what())); \
+    } \
+  } while (false)
+
+// Define a macro if local memory flags are unset or one of them is set
+// Setting both flags is the same as unsetting them
+#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
+     (defined(EIGEN_SYCL_LOCAL_MEM) &&  defined(EIGEN_SYCL_NO_LOCAL_MEM))
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#endif
+
+#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
+#else
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+#endif
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments.
+ * With C++11 or later this also default-implements the copy-constructor
+ */
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived)  \
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
+
+#endif

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 911213d..6834c97 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h

@@ -12,33 +12,57 @@
 
 namespace Eigen {
 
+// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
+
 /** \class TensorMap
   * \ingroup CXX11_Tensor_Module
   *
   * \brief A tensor expression mapping an existing array of data.
   *
   */
-
-template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> >
+/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
+/// It is added due to the fact that for our device compiler `T*` is not allowed.
+/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`.
+/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` .
+/// Therefore, by adding the default value, we managed to convert the type and it does not break any
+/// existing code as its default value is `T*`.
+template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
 {
   public:
-    typedef TensorMap<PlainObjectType, Options_> Self;
-    typedef typename PlainObjectType::Base Base;
-    typedef typename Eigen::internal::nested<Self>::type Nested;
-    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+    typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
+    typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
+  #ifdef EIGEN_USE_SYCL
+    typedef  typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
+  #else
+     typedef typename Eigen::internal::nested<Self>::type Nested;
+  #endif
+   typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
 
-  /*    typedef typename internal::conditional<
-                         bool(internal::is_lvalue<PlainObjectType>::value),
-                         Scalar *,
-                         const Scalar *>::type
-                     PointerType;*/
-    typedef Scalar* PointerType;
-    typedef PointerType PointerArgType;
+    typedef typename MakePointer_<Scalar>::Type PointerType;
+    typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
+
+    // WARN: PointerType still can be a pointer to const (const Scalar*), for
+    // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
+    // expression should be illegal, but adding this restriction is not possible
+    // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
+    typedef typename internal::conditional<
+        bool(internal::is_lvalue<PlainObjectType>::value),
+        PointerType,      // use simple pointer in lvalue expressions
+        PointerConstType  // use const pointer in rvalue expressions
+        >::type StoragePointerType;
+
+    // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
+    // we should return a reference to const from operator() (and others), even
+    // if TensorMap itself is not const.
+    typedef typename internal::conditional<
+        bool(internal::is_lvalue<PlainObjectType>::value),
+        Scalar&,
+        const Scalar&
+        >::type StorageRefType;
 
     static const int Options = Options_;
 
@@ -46,56 +70,54 @@
     typedef typename PlainObjectType::Dimensions Dimensions;
 
     enum {
-      IsAligned = ((int(Options_) & Aligned) == Aligned),
-      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-      BlockAccess = false,
+      IsAligned = ((int(Options_)&Aligned)==Aligned),
       Layout = PlainObjectType::Layout,
       CoordAccess = true,
       RawAccess = true
     };
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
       EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
       EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
       EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
       EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
     template <typename Dimensions>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
@@ -112,12 +134,12 @@
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar* data() { return m_data; }
+    EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; }
+    EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -130,38 +152,39 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()() const
+    EIGEN_STRONG_INLINE StorageRefType operator()() const
     {
-      EIGEN_STATIC_ASSERT(NumIndices == 0 || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
-      eigen_assert(rank() == 0);
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
     {
-      static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_data[index];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    {
+      EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       if (PlainObjectType::Options&RowMajor) {
-        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{firstIndex, otherIndices...});
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
       } else {
-        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{firstIndex, otherIndices...});
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
       }
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      return m_data[index];
-    }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
     {
       if (PlainObjectType::Options&RowMajor) {
-        const Index index = i1 + i0 * m_dimensions[0];
+        const Index index = i1 + i0 * m_dimensions[1];
         return m_data[index];
       } else {
         const Index index = i0 + i1 * m_dimensions[0];
@@ -169,10 +192,10 @@
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
     {
       if (PlainObjectType::Options&RowMajor) {
-         const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0);
+         const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
          return m_data[index];
       } else {
          const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
@@ -180,7 +203,7 @@
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -191,7 +214,7 @@
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -204,7 +227,7 @@
 #endif
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -217,41 +240,40 @@
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()()
+    EIGEN_STRONG_INLINE StorageRefType operator()()
     {
-      static_assert(NumIndices == 0 || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
-      eigen_internal_assert(rank() == 0);
+      EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
     {
-      static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
-      const std::size_t NumDims = sizeof...(otherIndices) + 1;
+      eigen_internal_assert(index >= 0 && index < size());
+      return m_data[index];
+    }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    {
+      static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+       eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
+      const std::size_t NumDims = sizeof...(otherIndices) + 2;
       if (PlainObjectType::Options&RowMajor) {
-        const array<Index, NumDims> dims = {firstIndex, otherIndices...};
-        const Index index = m_dimensions.IndexOfRowMajor(dims);
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
       } else {
-        const array<Index, NumDims> dims = {firstIndex, otherIndices...};
-        const Index index = m_dimensions.IndexOfColMajor(dims);
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
       }
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index index)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      return m_data[index];
-    }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
     {
        if (PlainObjectType::Options&RowMajor) {
-         const Index index = i1 + i0 * m_dimensions[0];
+         const Index index = i1 + i0 * m_dimensions[1];
         return m_data[index];
       } else {
         const Index index = i0 + i1 * m_dimensions[0];
@@ -259,10 +281,10 @@
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
     {
        if (PlainObjectType::Options&RowMajor) {
-         const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0);
+         const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
         return m_data[index];
       } else {
          const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
@@ -270,7 +292,7 @@
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -281,7 +303,7 @@
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -293,26 +315,10 @@
     }
 #endif
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
-    {
-      typedef TensorAssignOp<Self, const Self> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Self& operator=(const OtherDerived& other)
-    {
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
 
   private:
-    Scalar* m_data;
+    StoragePointerType m_data;
     Dimensions m_dimensions;
 };
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index aef1138..b90a1dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h

@@ -37,6 +37,14 @@
   return static_cast<T>((x + y - 1) / y);
 }
 
+template <size_t n> struct max_n_1 {
+  static const size_t size = n;
+};
+template <> struct max_n_1<0> {
+  static const size_t size = 1;
+};
+
+
 // Default packet types
 template <typename Scalar, typename Device>
 struct PacketType : internal::packet_traits<Scalar> {
@@ -44,11 +52,13 @@
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
-template <>
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+typedef ulonglong2 Packet4h2;
+template<>
 struct PacketType<half, GpuDevice> {
-  typedef half2 type;
-  static const int size = 2;
+  typedef Packet4h2 type;
+  static const int size = 8;
   enum {
     HasAdd    = 1,
     HasSub    = 1,
@@ -76,11 +86,125 @@
 };
 #endif
 
+#if defined(EIGEN_USE_SYCL)
 
-#if defined(EIGEN_HAS_CONSTEXPR)
-#define EIGEN_CONSTEXPR constexpr
-#else
-#define EIGEN_CONSTEXPR
+namespace TensorSycl {
+namespace internal {
+
+template <typename Index, Index A, Index B> struct PlusOp {
+  static constexpr Index Value = A + B;
+};
+
+template <typename Index, Index A, Index B> struct DivOp {
+  static constexpr Index Value = A / B;
+};
+
+template <typename Index, Index start, Index end, Index step,
+          template <class Indx, Indx...> class StepOp>
+struct static_for {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
+    op(start);
+    static_for<Index, StepOp<Index, start, step>::Value, end, step,
+               StepOp>::loop(op);
+  }
+};
+template <typename Index, Index end, Index step,
+          template <class Indx, Indx...> class StepOp>
+struct static_for<Index, end, end, step, StepOp> {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
+};
+
+template <typename OutScalar, typename Device, bool Vectorizable>
+struct Vectorise {
+  static const int PacketSize = 1;
+  typedef OutScalar PacketReturnType;
+};
+
+template <typename OutScalar, typename Device>
+struct Vectorise<OutScalar, Device, true> {
+  static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
+  typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
+};
+
+static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
+  return ((((x) + (y)-1) / (y)) * (y));
+}
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <>
+  struct PacketType<half, SyclDevice> {
+  typedef half type;
+  static const int size = 1;
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasArg    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasBlend  = 0
+  };
+};
+template <typename Scalar>
+struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
+  typedef Scalar type;
+  typedef Scalar half;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
+  };
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0
+  };
+
+};
+
+template <typename Scalar>
+struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
+
+#ifndef EIGEN_DONT_VECTORIZE_SYCL
+#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
+template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
+{\
+  typedef typename internal::packet_traits<Type>::type type;\
+  typedef typename internal::packet_traits<Type>::half half;\
+};
+
+
+PACKET_TYPE(const, float, 1, 4, SyclDevice)
+PACKET_TYPE(, float, 1, 4, SyclDevice)
+PACKET_TYPE(const, float, 1, 4, const SyclDevice)
+PACKET_TYPE(, float, 1, 4, const SyclDevice)
+
+PACKET_TYPE(const, double, 0, 2, SyclDevice)
+PACKET_TYPE(, double, 0, 2, SyclDevice)
+PACKET_TYPE(const, double, 0, 2, const SyclDevice)
+PACKET_TYPE(, double, 0, 2, const SyclDevice)
+#undef PACKET_TYPE
+
+template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
+template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
+#endif
 #endif
 
 // Tuple mimics std::pair but works on e.g. nvcc.
@@ -99,14 +223,6 @@
   Tuple(const U& f, const V& s) : first(f), second(s) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Tuple& operator= (const Tuple& rhs) {
-    if (&rhs == this) return *this;
-    first = rhs.first;
-    second = rhs.second;
-    return *this;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void swap(Tuple& rhs) {
     using numext::swap;
     swap(first, rhs.first);
@@ -126,7 +242,69 @@
   return !(x == y);
 }
 
-#undef EIGEN_CONSTEXPR
+
+// Can't use std::pairs on cuda devices
+template <typename Idx> struct IndexPair {
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+
+  EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+    first = val.first;
+    second = val.second;
+  }
+
+  Idx first;
+  Idx second;
+};
+
+
+#ifdef EIGEN_HAS_SFINAE
+namespace internal {
+
+  template<typename IndexType, typename Index, Index... Is>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
+    return { idx[Is]... };
+  }
+  template<typename IndexType, typename Index>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
+    return array<Index, 0>();
+  }
+
+  /** Make an array (for index/dimensions) out of a custom index */
+  template<typename Index, std::size_t NumIndices, typename IndexType>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+    return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
+  }
+
+
+  template <typename B, typename D>
+  struct is_base_of
+  {
+
+    typedef char (&yes)[1];
+    typedef char (&no)[2];
+
+    template <typename BB, typename DD>
+    struct Host
+    {
+      operator BB*() const;
+      operator DD*();
+    };
+
+    template<typename T>
+    static yes check(D*, T);
+    static no check(B*, int);
+
+    static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes);
+  };
+
+}
+#endif
+
+
 
 }  // namespace Eigen
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 0562530..b3f00f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h

@@ -25,19 +25,19 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<NewDimensions>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename NewDimensions, typename XprType>
 struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
 {
-  typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+  typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename NewDimensions, typename XprType>
@@ -54,11 +54,9 @@
 class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
 {
   public:
+  typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
   typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
@@ -73,26 +71,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -107,75 +86,84 @@
   typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
   typedef NewDimensions Dimensions;
 
-  enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    // TODO(andydavis) Re-enable BlockAccess when the performance issue
-    // with block-based reshape is resolved.
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+
+  static const int NumOutputDims = internal::array_size<Dimensions>::value;
+  static const int NumInputDims  = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
+  enum ReshapingKind {
+    // We do not use layout information to determine reshaping kind.
+    // Depending on the layout `N` can be inner or outer dimension.
+    OneByN = 0,  // expr.reshape(1, N)
+    NByOne = 1,  // expr.reshape(N, 1)
+    Runtime = 2  // Reshape dimensions are dynamic (specified at runtime).
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  // clang-format off
+  static const ReshapingKind kind =
+#if defined(EIGEN_HAS_INDEX_LIST)
+        (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+      : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+      : Runtime;
+#else
+        Runtime;
+#endif
+  // clang-format on
+
+  enum {
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    // For trivial reshapes with raw access to underlying data we will provide
+    // zero overhead block access.
+    // TODO(ezhulenev): Consider adding block access without raw access?
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess &&
+                        NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef
+      typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
+                                                 Layout, Index>
+          TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dimensions(op.dimensions())
   {
     // The total size of the reshaped tensor must be equal to the total size
     // of the input tensor.
     eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
-
-    if (BlockAccess) {
-      const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
-          m_impl.dimensions();
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_outputStrides[0] = 1;
-        for (int i = 1; i < NumOutputDims; ++i) {
-          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
-        }
-        m_inputStrides[0] = 1;
-        for (int i = 1; i < NumInputDims; ++i) {
-          m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
-        }
-      } else {
-#ifdef __CUDACC__
-        // TODO(andydavis) Remove the following line of code when associated
-        // nvcc bug b/22973013 is fixed.
-        for (int i = 0; i < 1; ++i) {}
-#endif
-        m_outputStrides[NumOutputDims - 1] = 1;
-        for (int i = NumOutputDims - 2; i >= 0; --i) {
-          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
-        }
-        m_inputStrides[NumInputDims - 1] = 1;
-        for (int i = NumInputDims - 2; i >= 0; --i) {
-          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
-        }
-      }
-    }
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const std::size_t NumOutputDims =
-      internal::array_size<Dimensions>::value;
-  static const std::size_t NumInputDims = internal::array_size<
-    typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-  typedef typename internal::TensorBlock<
-    Index, typename internal::remove_const<Scalar>::type, NumOutputDims, Layout>
-  OutputTensorBlock;
-  typedef typename internal::TensorBlock<
-    Index, typename internal::remove_const<Scalar>::type, NumInputDims, Layout>
-  InputTensorBlock;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -190,148 +178,60 @@
     return m_impl.template packet<LoadMode>(index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    m_impl.getResourceRequirements(resources);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return m_impl.costPerCoeff(vectorized);
   }
 
-  // TODO(andydavis) Reduce the overhead of this function.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
-      OutputTensorBlock* output_block) const {
-    // Calculate output block unit-stride inner dimension length.
-    const DSizes<Index, NumOutputDims>& output_block_sizes =
-        output_block->block_sizes();
-    Index output_inner_dim_size = 1;
-    Index output_outer_dim_start = NumOutputDims;
-    for (Index i = 0; i < NumOutputDims; ++i) {
-      const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-          ? i : NumOutputDims - i - 1;
-      output_inner_dim_size *= output_block_sizes[dim];
-      if (output_block_sizes[dim] < m_dimensions[dim]) {
-        output_outer_dim_start = i + 1;
-        break;
-      }
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
 
-    // Initialize output block iterator state.
-    struct BlockIteratorState {
-      Index stride;
-      Index span;
-      Index size;
-      Index count;
-    };
-    array<BlockIteratorState, NumOutputDims> block_iter_state;
+  // required in block(OutputTensorBlock* output_block) const
+  // For C++03 compatibility this must be defined outside the method
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
 
-    for (Index i = 0; i < NumOutputDims; ++i) {
-      const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-          ? i : NumOutputDims - i - 1;
-      block_iter_state[i].size = output_block_sizes[dim];
-      block_iter_state[i].stride = m_outputStrides[dim];
-      block_iter_state[i].span =
-          block_iter_state[i].stride * (block_iter_state[i].size - 1);
-      block_iter_state[i].count = 0;
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) ||
+                 (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
 
-    const Index output_outer_dim_size = output_block_sizes.TotalSize() /
-        output_inner_dim_size;
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
-        m_impl.dimensions();
-
-    Index index = output_block->first_coeff_index();
-    for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
-      Index inner_idx = 0;
-      while (inner_idx < output_inner_dim_size) {
-        // Calculate input coords based on 'index'.
-        array<Index, NumInputDims> input_coords;
-        Index idx = index;
-        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-          for (int i = NumInputDims - 1; i > 0; --i) {
-            input_coords[i] = idx / m_inputStrides[i];
-            idx -= input_coords[i] * m_inputStrides[i];
-          }
-          input_coords[0] = idx;
-        } else {
-          for (int i = 0; i < NumInputDims - 1; ++i) {
-            input_coords[i] = idx / m_inputStrides[i];
-            idx -= input_coords[i] * m_inputStrides[i];
-          }
-          input_coords[NumInputDims - 1] = idx;
-        }
-
-        // Calculate target input block shape, using at most
-        // 'output_inner_dim_size' coefficients along the input block's inner
-        // dimensions.
-        DSizes<Index, NumInputDims> input_block_sizes;
-        Index num_to_allocate = output_inner_dim_size - inner_idx;
-        for (Index i = 0; i < NumInputDims; ++i) {
-          const Index dim =
-              static_cast<int>(Layout) == static_cast<int>(ColMajor)
-              ? i : NumInputDims - i - 1;
-          input_block_sizes[dim] = numext::mini(
-              num_to_allocate, (static_cast<Index>(input_dims[dim]) -
-                                input_coords[dim]));
-          if (input_coords[dim] == 0) {
-            num_to_allocate /= input_block_sizes[dim];
-          } else {
-            num_to_allocate = 1;
-          }
-        }
-
-        // Calculate input block strides.
-        DSizes<Index, NumInputDims> input_block_strides;
-        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-          input_block_strides[0] = 1;
-          for (int i = 1; i < NumInputDims; ++i) {
-            input_block_strides[i] = input_block_strides[i - 1] *
-                input_block_sizes[i - 1];
-          }
-        } else {
-          input_block_strides[NumInputDims - 1] = 1;
-          for (int i = NumInputDims - 2; i >= 0; --i) {
-            input_block_strides[i] = input_block_strides[i + 1] *
-                input_block_sizes[i + 1];
-          }
-        }
-
-        // Instantiate and read input block from input tensor.
-        InputTensorBlock input_block(index, input_block_sizes,
-                                     input_block_strides, m_inputStrides,
-                                     output_block->data() + outer_idx *
-                                     output_inner_dim_size + inner_idx);
-
-        m_impl.block(&input_block);
-
-        const Index input_block_total_size = input_block_sizes.TotalSize();
-        index += input_block_total_size;
-        inner_idx += input_block_total_size;
-      }
-      eigen_assert(inner_idx == output_inner_dim_size);
-      index -= output_inner_dim_size;
-      // Update index.
-      for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
-        if (++block_iter_state[i].count < block_iter_state[i].size) {
-          index += block_iter_state[i].stride;
-          break;
-        }
-        block_iter_state[i].count = 0;
-        index -= block_iter_state[i].span;
-      }
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlock(internal::TensorBlockKind::kView,
+                           m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might be also
+      // a view, or it might be a block materialized in the temporary buffer.
+      return TensorBlock::materialize(m_impl.data(), m_dimensions, desc,
+                                        scratch);
     }
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+  #endif
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
-  DSizes<Index, NumOutputDims> m_outputStrides;
-  DSizes<Index, NumInputDims> m_inputStrides;
 };
 
 
@@ -346,32 +246,56 @@
   typedef NewDimensions Dimensions;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
+      TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
   }
+
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(this->m_impl.data() != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<
+        Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(this->dimensions()),
+                                  this->m_impl.data(), desc.offset()),
+        block.expr());
+  }
 };
 
 
@@ -388,19 +312,19 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
 struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
 {
-  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
@@ -417,11 +341,9 @@
 class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
 {
   public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
@@ -438,26 +360,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -466,6 +369,39 @@
 };
 
 
+// Fixme: figure out the exact threshold
+namespace {
+template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const {
+    const bool prefer_block_evaluation = BlockAccess && total > 32*1024;
+    return !prefer_block_evaluation && contiguous > threshold_;
+  }
+
+ private:
+  Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess>  {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+};
+#endif
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_SYCL
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess>  {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+};
+#endif
+
+}
+
 // Eval as rvalue
 template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
@@ -473,24 +409,56 @@
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets and sizes.
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess &&
+                        // FIXME: Temporary workaround for bug in slicing of bool tensors.
+                        !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  // Tensor slicing does not change the block type.
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
+    m_is_identity = true;
     for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
-      eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
+      eigen_assert(m_impl.dimensions()[i] >=
+                   op.sizes()[i] + op.startIndices()[i]);
+      if (m_impl.dimensions()[i] != op.sizes()[i] ||
+          op.startIndices()[i] != 0) {
+        m_is_identity = false;
+      }
     }
 
+    // No strides for scalars.
+    if (NumDims == 0) return;
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -499,11 +467,11 @@
         m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
       }
 
-      // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+     // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     } else {
       m_inputStrides[NumDims-1] = 1;
@@ -511,33 +479,21 @@
         m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
       }
 
+     // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed.
       m_outputStrides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     }
-
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                          device.lastLevelCacheSize() /
-                                          sizeof(Scalar));
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef Sizes Dimensions;
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout>
-    TensorBlock;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
-    if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data && m_impl.data()) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
+        && data && m_impl.data()) {
       Index contiguous_values = 1;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         for (int i = 0; i < NumDims; ++i) {
@@ -555,11 +511,12 @@
         }
       }
       // Use memcpy if it's going to be faster than using the regular evaluation.
-      if (contiguous_values > m_device.memcpyThreshold()) {
-        Scalar* src = (Scalar*)m_impl.data();
-        for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+      const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
+      if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
+        EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
+        for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
           Index offset = srcCoeff(i);
-          m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+          m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
         }
         return false;
       }
@@ -567,25 +524,42 @@
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-        eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
+
+    if (m_is_identity) {
+      return m_impl.template packet<LoadMode>(index);
+    }
 
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -597,6 +571,7 @@
       inputIndices[0] += (indices[0] + m_offsets[0]);
       inputIndices[1] += (indices[1] + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -613,9 +588,10 @@
       return rslt;
     }
     else {
-      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -624,29 +600,29 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    resources->push_back(internal::TensorOpResourceRequirements(
-        internal::kSkewedInnerDims, m_block_total_size_max));
-    m_impl.getResourceRequirements(resources);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
-      TensorBlock* output_block) const {
-    TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
-                            output_block->block_sizes(),
-                            output_block->block_strides(),
-                            m_inputStrides,
-                            output_block->data());
-    m_impl.block(&input_block);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
-    Scalar* result = m_impl.data();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+    TensorBlock block = m_impl.block(arg_desc, scratch);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+    return block;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
     if (result) {
       Index offset = 0;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -680,12 +656,19 @@
     }
     return NULL;
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -693,6 +676,7 @@
       }
       inputIndex += (index + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -707,10 +691,10 @@
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
+  bool m_is_identity;
   const StartIndices m_offsets;
-  std::size_t m_block_total_size_max;
 };
 
 
@@ -723,40 +707,55 @@
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef Sizes Dimensions;
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout>
-    TensorBlock;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    if (this->m_is_identity) {
+      this->m_impl.template writePacket<StoreMode>(index, x);
+      return;
+    }
+
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -768,6 +767,7 @@
       inputIndices[0] += (indices[0] + this->m_offsets[0]);
       inputIndices[1] += (indices[1] + this->m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -783,25 +783,22 @@
       this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
     }
     else {
-      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
     }
   }
 
+  template<typename TensorBlock>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlock& block) {
-    this->m_impl.writeBlock(
-        TensorBlock(this->srcCoeff(block.first_coeff_index()),
-                    block.block_sizes(),
-                    block.block_strides(),
-                    this->m_inputStrides,
-                    const_cast<ScalarNonConst*>(block.data())));
-
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+    this->m_impl.writeBlock(arg_desc, block);
   }
 };
 
@@ -811,19 +808,19 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -839,14 +836,12 @@
 class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorStridingSlicingOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base;
+  typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef typename Eigen::internal::nested<TensorStridingSlicingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorStridingSlicingOp>::Index Index;
+  typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+  typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
     const XprType& expr, const StartIndices& startIndices,
@@ -865,26 +860,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -899,6 +875,13 @@
 {
   typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
   static const int NumDims = internal::array_size<Strides>::value;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef Strides Dimensions;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
@@ -906,47 +889,58 @@
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_strides(op.strides())
   {
-    auto clamp = [](Index value, Index min, Index max){
-      return numext::maxi(min,numext::mini(max,value));
-    };
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
-    DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
-    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+    DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
+    for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
-      if(m_strides[i]>0){
-        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
-        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
-      }else{ 
-        /* implies m_strides[i]<0 by assert */
-        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
-        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      if (m_strides[i] > 0) {
+        startIndicesClamped[i] =
+            clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] =
+            clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      } else {
+        /* implies m_strides[i] < 0 by assert */
+        startIndicesClamped[i] =
+            clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] =
+            clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
       }
       m_startIndices[i] = startIndicesClamped[i];
     }
 
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+    const InputDimensions& input_dims = m_impl.dimensions();
 
-    // check for degenerate intervals and compute output tensor shape
-    bool degenerate = false;;
-    for(int i = 0; i < NumDims; i++){
+    // compute output tensor shape
+    m_is_identity = true;
+    for (int i = 0; i < NumDims; i++) {
       Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
-      if(interval == 0 || (interval<0 != m_strides[i]<0)){
+      if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
         m_dimensions[i] = 0;
-        degenerate = true;
-      }else{
-        m_dimensions[i] = interval / m_strides[i]
-                          + (interval % m_strides[i] != 0 ? 1 : 0);
+      } else {
+        m_dimensions[i] =
+            (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
         eigen_assert(m_dimensions[i] >= 0);
       }
+      if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
+        m_is_identity = false;
+      }
     }
+
     Strides output_dims = m_dimensions;
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -963,8 +957,7 @@
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
-        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     } else {
       m_inputStrides[NumDims-1] = m_strides[NumDims-1];
@@ -979,79 +972,58 @@
       m_outputStrides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
-        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     }
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                          device.lastLevelCacheSize() /
-                                          sizeof(Scalar));
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef Strides Dimensions;
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout>
-    TensorBlock;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
-    // TODO(aselle): implement memcpy option
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
-  {
-    // TODO(aselle): implement packet access
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    resources->push_back(internal::TensorOpResourceRequirements(
-        internal::kSkewedInnerDims, m_block_total_size_max));
-    m_impl.getResourceRequirements(resources);
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
-      TensorBlock* output_block) const {
-      // TODO(aselle): implement block access
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    return NULL;
   }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
-    return nullptr;
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
   }
-
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
         index -= idx * m_outputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -1061,17 +1033,24 @@
     return inputIndex;
   }
 
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef SYCL_DEVICE_ONLY
+    return numext::maxi(min, numext::mini(max,value));
+#else
+    return cl::sycl::clamp(value, min, max);
+#endif
+  }
+
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
-  // TODO(aselle): could remove m_startIndices by constructin m_inputStrides differently
+  const Device EIGEN_DEVICE_REF m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
-  std::size_t m_block_total_size_max;
 };
 
 // Eval as lvalue
@@ -1087,37 +1066,33 @@
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Strides Dimensions;
-  typedef internal::TensorBlock<Index, ScalarNonConst, NumDims, Layout>
-    TensorBlock;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
-  }
-
-  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketReturnType& x)
-  {
-    // TODO(aselle): implement packet storage
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlock& block) {
-    // TODO(aselle): implement write block
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 };
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 64ad5e3..ee44382 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h

@@ -25,13 +25,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PaddingDimensions, typename XprType>
@@ -55,16 +55,13 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims,
-                                                        const Scalar padding_value)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
       : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
 
     EIGEN_DEVICE_FUNC
@@ -93,21 +90,40 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
-    RawAccess = false
+    IsAligned         = true,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = true,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
   {
+    // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
+    // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
+    // of 1 element first and then pad.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
     // Compute dimensions
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
@@ -115,74 +131,49 @@
     }
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
       m_outputStrides[0] = 1;
-      if (NumDims > 0) {
-        m_inputStrides[0] = 1;
-        for (int i = 1; i < NumDims; ++i) {
-          m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
-          m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
-        }
-        m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
       }
+      m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
     } else {
+      m_inputStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims] = 1;
-      if (NumDims > 0) {
-        m_inputStrides[NumDims - 1] = 1;
-        for (int i = NumDims - 2; i >= 0; --i) {
-          m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
-          m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
-        }
-        m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+        m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
       }
+      m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
-      Index index, int dim_index) const {
-#if defined(EIGEN_HAS_INDEX_LIST)
-    return (!internal::index_pair_first_statically_eq<PaddingDimensions>()(dim_index, 0) &&
-            index < m_padding[dim_index].first) ||
-        (!internal::index_pair_second_statically_eq<PaddingDimensions>()(dim_index, 0) &&
-         index >= m_dimensions[dim_index] - m_padding[dim_index].second);
-#else
-    return (index < m_padding[dim_index].first) ||
-           (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
-#endif
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
-      int dim_index) const {
-#if defined(EIGEN_HAS_INDEX_LIST)
-    return internal::index_pair_first_statically_eq<PaddingDimensions>()(dim_index, 0);
-#else
-    return false;
-#endif
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
-      int dim_index) const {
-#if defined(EIGEN_HAS_INDEX_LIST)
-    return internal::index_pair_second_statically_eq<PaddingDimensions>()(dim_index, 0);
-#else
-    return false;
-#endif
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -191,13 +182,12 @@
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
-      if (NumDims > 0) {
-        if (isPaddingAtIndexForDim(index, 0)) {
-          return m_paddingValue;
-        }
-        inputIndex += (index - m_padding[0].first);
+      if (isPaddingAtIndexForDim(index, 0)) {
+        return m_paddingValue;
       }
+      inputIndex += (index - m_padding[0].first);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -206,12 +196,10 @@
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i+1];
       }
-      if (NumDims > 0) {
-        if (isPaddingAtIndexForDim(index, NumDims - 1)) {
-          return m_paddingValue;
-        }
-        inputIndex += (index - m_padding[NumDims-1].first);
+      if (isPaddingAtIndexForDim(index, NumDims-1)) {
+        return m_paddingValue;
       }
+      inputIndex += (index - m_padding[NumDims-1].first);
     }
     return m_impl.coeff(inputIndex);
   }
@@ -225,21 +213,335 @@
     return packetRowMajor(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i)
         updateCostPerDimension(cost, i, i == 0);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i)
         updateCostPerDimension(cost, i, i == NumDims - 1);
     }
     return cost;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // If one of the dimensions is zero, return empty block view.
+    if (desc.size() == 0) {
+      return TensorBlock(internal::TensorBlockKind::kView, NULL,
+                           desc.dimensions());
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[inner_dim_idx] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets, it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides =
+        internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const Index input_inner_dim_size =
+        static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+                           output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input.
+        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
+                                             output_inner_pad_before_size),
+                     Index(0)));
+
+    eigen_assert(output_inner_copy_size >= 0);
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size -
+         output_inner_pad_before_size);
+
+    // Sanity check, sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size +
+                  output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Prepare storage for the materialized padding result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+
+    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
+    // single logical inner dimension.
+
+    // When possible we squeeze writes for the innermost (only if non-padded)
+    // dimension with the first padded dimension. This allows to reduce the
+    // number of calls to LinCopy and better utilize vector instructions.
+    const bool squeeze_writes =
+        NumDims > 1 &&
+        // inner dimension is not padded
+        (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
+        // and equal to the block inner dimension
+        (input_inner_dim_size == output_inner_dim_size);
+
+    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
+
+    // Maximum coordinate on a squeeze dimension that we can write to.
+    const Index squeeze_max_coord =
+        squeeze_writes ? numext::mini(
+                             // max non-padded element in the input
+                             static_cast<Index>(m_dimensions[squeeze_dim] -
+                                                m_padding[squeeze_dim].second),
+                             // max element in the output buffer
+                             static_cast<Index>(output_offsets[squeeze_dim] +
+                                                desc.dimension(squeeze_dim)))
+                       : static_cast<Index>(0);
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size;) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill single innermost dimension with padding value.
+        size += output_inner_dim_size;
+
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+            typename LinCopy::Src(0, 0, &m_paddingValue),
+            output_inner_dim_size);
+
+
+      } else if (squeeze_writes) {
+        // Squeeze multiple reads from innermost dimensions.
+        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
+        size += output_inner_dim_size * squeeze_num;
+
+        // Copy `squeeze_num` inner dimensions from input to output.
+        LinCopy::template Run<LinCopy::Kind::Linear>(
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+            typename LinCopy::Src(input_offset, 1, m_impl.data()),
+            output_inner_dim_size * squeeze_num);
+
+        // Update iteration state for only `squeeze_num - 1` processed inner
+        // dimensions, because we have another iteration state update at the end
+        // of the loop that will update iteration state for the last inner
+        // processed dimension.
+        it[0].count += (squeeze_num - 1);
+        input_offset += it[0].input_stride * (squeeze_num - 1);
+        output_offset += it[0].output_stride * (squeeze_num - 1);
+        output_coord[squeeze_dim] += (squeeze_num - 1);
+
+      } else {
+        // Single read from innermost dimension.
+        size += output_inner_dim_size;
+
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(in, 1, m_impl.data()),
+              output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size +
+                            output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
  private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0),
+          size(0),
+          input_stride(0),
+          input_span(0),
+          output_stride(0),
+          output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
+      Index index, int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index < m_padding[dim_index].first) ||
+        (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+         index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#else
+    return (index < m_padding[dim_index].first) ||
+           (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
+      int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+    EIGEN_UNUSED_VARIABLE(dim_index);
+    return false;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
+      int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+    EIGEN_UNUSED_VARIABLE(dim_index);
+    return false;
+#endif
+  }
+
+
   void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
     const double in = static_cast<double>(m_impl.dimensions()[i]);
     const double out = in + m_padding[i].first + m_padding[i].second;
@@ -262,29 +564,28 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index initialIndex = index;
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index first = index;
-      const Index last = index + packetSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
       const Index lastPaddedRight = m_outputStrides[i+1];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) &&
-               first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -296,60 +597,53 @@
       }
     }
 
-    const Index last = index + packetSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
 
-    if (NumDims > 0) {
-      const Index lastPaddedLeft = m_padding[0].first;
-      const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
-      const Index lastPaddedRight = m_outputStrides[1];
-
-      if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
-        // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(m_paddingValue);
-      }
-      else if (!isRightPaddingCompileTimeZero(0) &&
-               first >= firstPaddedRight && last < lastPaddedRight) {
-        // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(m_paddingValue);
-      }
-      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
-        // all the coefficient are between the 2 padding zones.
-        inputIndex += (index - m_padding[0].first);
-        return m_impl.template packet<Unaligned>(inputIndex);
-      }
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-
+    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    }
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficient are between the 2 padding zones.
+      inputIndex += (index - m_padding[0].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
     // Every other case
     return packetWithPossibleZero(initialIndex);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index initialIndex = index;
     Index inputIndex = 0;
-
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
-      const Index first = index;
-      const Index last = index + packetSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
       const Index lastPaddedRight = m_outputStrides[i];
 
-      if (isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (isRightPaddingCompileTimeZero(i) &&
-               first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i+1];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -361,40 +655,34 @@
       }
     }
 
-    const Index last = index + packetSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[NumDims-1].first;
+    const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
+    const Index lastPaddedRight = m_outputStrides[NumDims-1];
 
-    if (NumDims > 0) {
-      const Index lastPaddedLeft = m_padding[NumDims-1].first;
-      const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
-      const Index lastPaddedRight = m_outputStrides[NumDims-1];
-
-      if (isLeftPaddingCompileTimeZero(NumDims-1) &&
-          last < lastPaddedLeft) {
-        // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(m_paddingValue);
-      }
-      else if (isRightPaddingCompileTimeZero(NumDims-1) &&
-               first >= firstPaddedRight && last < lastPaddedRight) {
-        // all the coefficient are in the padding zone.
-        return internal::pset1<PacketReturnType>(m_paddingValue);
-      }
-      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
-        // all the coefficient are between the 2 padding zones.
-        inputIndex += (index - m_padding[NumDims-1].first);
-        return m_impl.template packet<Unaligned>(inputIndex);
-      }
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    }
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficient are between the 2 padding zones.
+      inputIndex += (index - m_padding[NumDims-1].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
     // Every other case
     return packetWithPossibleZero(initialIndex);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
     PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -408,6 +696,8 @@
   PaddingDimensions m_padding;
 
   Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index f5ea43f..413d25d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h

@@ -25,13 +25,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PatchDim, typename XprType>
@@ -55,10 +55,8 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
@@ -89,19 +87,27 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
+    CoordAccess = false,
     RawAccess = false
-  };
+ };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     Index num_patches = 1;
@@ -146,12 +152,12 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -164,6 +170,7 @@
     Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -172,6 +179,7 @@
         inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -187,12 +195,11 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
-    Index indices[2] = {index, index + packetSize - 1};
+    Index indices[2] = {index, index + PacketSize - 1};
     Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
                              indices[1] / m_outputStrides[output_stride_index]};
     Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
@@ -200,6 +207,7 @@
 
     Index inputIndices[2] = {0, 0};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -215,6 +223,7 @@
         inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -233,15 +242,16 @@
     inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
     inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
 
-    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
       PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
       return rslt;
     }
     else {
-      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
-      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
-      for (int i = 1; i < packetSize-1; ++i) {
+      values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -249,58 +259,6 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0;
-    // Location of the first element of the patch.
-    const Index patchIndex = coords[patch_coord_idx];
-
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      array<Index, NumDims-1> inputCoords;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        for (int i = NumDims - 2; i > 0; --i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i];
-          inputCoords[i] = coords[i] + patchIdx;
-        }
-      } else {
-        for (int i = 0; i < NumDims - 2; ++i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i+1];
-          inputCoords[i] = coords[i+1] + patchIdx;
-        }
-      }
-      Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1;
-      inputCoords[0] = (patchIndex + coords[coords_idx]);
-      return m_impl.coeff(inputCoords);
-    }
-    else {
-      Index inputIndex = 0;
-      if (Layout == ColMajor) {
-        for (int i = NumDims - 2; i > 0; --i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i];
-          inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
-        }
-      } else {
-        for (int i = 0; i < NumDims - 2; ++i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i+1];
-          inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
-        }
-      }
-      Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1;
-      inputIndex += (patchIndex + coords[coords_idx]);
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
                                            TensorOpCost::MulCost<Index>() +
@@ -309,6 +267,15 @@
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh); 
+  }
+#endif
+
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
@@ -316,6 +283,7 @@
   array<Index, NumDims-1> m_patchStrides;
 
   TensorEvaluator<ArgType, Device> m_impl;
+
 };
 
 } // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 601a8f0..37c1d1c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -16,45 +17,25 @@
 namespace {
 
 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
-  assert(threadIdx.z == 0);
-  return clock64() +
-      blockIdx.x * blockDim.x + threadIdx.x +
-      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
-
-#elif defined _WIN32
-  // TODO(bsteiner): Add a pseudo random number to the mix, e.g.
-  // https://msdn.microsoft.com/en-us/library/398ax69y.aspx
-  SYSTEMTIME st;
-  GetSystemTime(&st);
-  return st.wSecond + 1000 * st.wMilliseconds;
-
-#elif defined __APPLE__
-  // TODO(bsteiner): Add a pseudo random number to the mix, e.g.
-  // https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random
-  return mach_absolute_time();
-
+  gpu_assert(threadIdx.z == 0);
+  return blockIdx.x * blockDim.x + threadIdx.x 
+         + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
 #else
-  // Augment the current time with pseudo random number generation
-  // to ensure that we get different seeds if we try to generate seeds
-  // faster than the clock resolution.
-  timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
-  uint64_t rnd1 = ::random() ^ ts.tv_nsec;
-  uint64_t rnd2 = ::random() ^ ts.tv_nsec;
-  return (rnd1 << 32 | rnd2);
+  // Rely on Eigen's random implementation.
+  return random<uint64_t>();
 #endif
 }
 
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
   // TODO: Unify with the implementation in the non blocking thread pool.
   uint64_t current = *state;
   // Update the internal state
-  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
   // Generate the random output (using the PCG-XSH-RS scheme)
-  return (current ^ (current >> 22)) >> (22 + (current >> 61));
+  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
 }
 
 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
@@ -66,34 +47,42 @@
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state) {
-  unsigned rnd = PCG_XSH_RS_generator(state);
+T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
   return static_cast<T>(rnd);
 }
 
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
-  Eigen::half result;
-  // Generate 10 random bits for the mantissa
-  unsigned rnd = PCG_XSH_RS_generator(state);
-  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
-  // Set the exponent
-  result.x |= (static_cast<uint16_t>(15) << 10);
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
+  // Generate 10 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
   // Return the final result
   return result - Eigen::half(1.0f);
 }
 
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
+
+  // Generate 7 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+  // Return the final result
+  return result - Eigen::bfloat16(1.0f);
+}
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state) {
+float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint32_t raw;
     float fp;
   } internal;
   internal result;
   // Generate 23 random bits for the mantissa mantissa
-  const unsigned rnd = PCG_XSH_RS_generator(state);
+  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
   result.raw = rnd & 0x7fffffu;
   // Set the exponent
   result.raw |= (static_cast<uint32_t>(127) << 23);
@@ -102,7 +91,7 @@
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state) {
+double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint64_t raw;
     double dp;
@@ -111,9 +100,9 @@
   result.raw = 0;
   // Generate 52 random bits for the mantissa
   // First generate the upper 20 bits
-  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
   // The generate the lower 32 bits
-  unsigned rnd2 = PCG_XSH_RS_generator(state);
+  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
   // Set the exponent
   result.raw |= (static_cast<uint64_t>(1023) << 52);
@@ -122,14 +111,14 @@
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
-  return std::complex<float>(RandomToTypeUniform<float>(state),
-                             RandomToTypeUniform<float>(state));
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
+  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
+                             RandomToTypeUniform<float>(state, stream));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
-  return std::complex<double>(RandomToTypeUniform<double>(state),
-                              RandomToTypeUniform<double>(state));
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
+  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
+                              RandomToTypeUniform<double>(state, stream));
 }
 
 template <typename T> class UniformRandomGenerator {
@@ -140,34 +129,68 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefor, we need two step to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
+    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
+    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
+    // similar to CUDA Therefore, the thread Id injection is not available at this stage.
+    //However when the operator() is called the thread ID will be avilable. So inside the opeator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    //to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       const UniformRandomGenerator& other) {
     m_state = other.m_state;
+    #ifdef EIGEN_USE_SYCL
+     m_exec_once =other.m_exec_once;
+    #endif
   }
 
   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    uint64_t local_state = m_state + i;
-    T result = RandomToTypeUniform<T>(&local_state);
-    m_state = local_state;
+    #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    T result = RandomToTypeUniform<T>(&m_state, i);
     return result;
   }
 
-  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::packet_traits<T>::type packetOp(Index i) const {
-    const int packetSize = internal::packet_traits<T>::size;
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    uint64_t local_state = m_state + i;
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = RandomToTypeUniform<T>(&local_state);
+      #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeUniform<T>(&m_state, i);
     }
-    m_state = local_state;
-    return internal::pload<typename internal::packet_traits<T>::type>(values);
+    return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
+  #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 template <typename Scalar>
@@ -183,14 +206,14 @@
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state) {
+T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
   // Use the ratio of uniform method to generate numbers following a normal
   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
   // details.
   T u, v, q;
   do {
-    u = RandomToTypeUniform<T>(state);
-    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+    u = RandomToTypeUniform<T>(state, stream);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
     const T x = u - T(0.449871);
     const T y = numext::abs(v) + T(0.386595);
     q = x*x + y * (T(0.196)*y - T(0.25472)*x);
@@ -201,14 +224,14 @@
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
-  return std::complex<float>(RandomToTypeNormal<float>(state),
-                             RandomToTypeNormal<float>(state));
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
+  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
+                             RandomToTypeNormal<float>(state, stream));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
-  return std::complex<double>(RandomToTypeNormal<double>(state),
-                              RandomToTypeNormal<double>(state));
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
+  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
+                              RandomToTypeNormal<double>(state, stream));
 }
 
 
@@ -219,34 +242,64 @@
   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefor, we need two steps to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Therefore, the thread Id injection is not available at this stage. However when the operator()
+    //is called the thread ID will be avilable. So inside the opeator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    //to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
       const NormalRandomGenerator& other) {
     m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once=other.m_exec_once;
+#endif
   }
 
  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    uint64_t local_state = m_state + i;
-    T result = RandomToTypeNormal<T>(&local_state);
-    m_state = local_state;
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
+    T result = RandomToTypeNormal<T>(&m_state, i);
     return result;
   }
 
-  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::packet_traits<T>::type packetOp(Index i) const {
-    const int packetSize = internal::packet_traits<T>::size;
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    uint64_t local_state = m_state + i;
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = RandomToTypeNormal<T>(&local_state);
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
     }
-    m_state = local_state;
-    return internal::pload<typename internal::packet_traits<T>::type>(values);
+    #endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeNormal<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
+   #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index da25026..583f462 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,18 +11,20 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 
-// Kernel friend declarations may or not need the __global__ annotation
-// depending on the compiler.
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
 #ifndef KERNEL_FRIEND
-#if defined(__clang__) && defined(__CUDA__)
-#define KERNEL_FRIEND friend __global__
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
 #else
 #define KERNEL_FRIEND friend
 #endif
 #endif
 
+
 namespace Eigen {
 
+
 /** \class TensorReduction
   * \ingroup CXX11_Tensor_Module
   *
@@ -30,48 +33,69 @@
   */
 
 namespace internal {
-template<typename Op, typename Dims, typename XprType>
-struct traits<TensorReductionOp<Op, Dims, XprType> >
+  template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
+  struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
  : traits<XprType>
 {
-  typedef typename traits<XprType>::Scalar Scalar;
-  typedef typename traits<XprType>::StorageKind StorageKind;
-  typedef typename traits<XprType>::Index Index;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::Scalar Scalar;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
 };
 
-template<typename Op, typename Dims, typename XprType>
-struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
 {
-  typedef const TensorReductionOp<Op, Dims, XprType>& type;
+  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
 };
 
-template<typename Op, typename Dims, typename XprType>
-struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
 {
-  typedef TensorReductionOp<Op, Dims, XprType> type;
+  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
 };
 
 
-
-template <typename InputDims, typename OutputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
-static void partition_dims(const InputDims& input_dims,
-                           const array<bool, internal::array_size<InputDims>::value>& reduced,
-                           OutputDims* output_dims, ReducedDims* reduced_dims) {
-  const int NumInputDims = internal::array_size<InputDims>::value;
-  int outputIndex = 0;
-  int reduceIndex = 0;
-  for (int i = 0; i < NumInputDims; ++i) {
-    if (OutputDims::count == 0 || reduced[i]) {
-      (*reduced_dims)[reduceIndex] = input_dims[i];
-      ++reduceIndex;
-    } else {
-      (*output_dims)[outputIndex] = input_dims[i];
-      ++outputIndex;
+template <typename OutputDims> struct DimInitializer {
+  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
+  static void run(const InputDims& input_dims,
+                  const array<bool, internal::array_size<InputDims>::value>& reduced,
+                  OutputDims* output_dims, ReducedDims* reduced_dims) {
+    const int NumInputDims = internal::array_size<InputDims>::value;
+    int outputIndex = 0;
+    int reduceIndex = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (reduced[i]) {
+        (*reduced_dims)[reduceIndex] = input_dims[i];
+        ++reduceIndex;
+      } else {
+        (*output_dims)[outputIndex] = input_dims[i];
+        ++outputIndex;
+      }
     }
   }
-}
+};
 
+template <> struct DimInitializer<Sizes<> > {
+  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
+  static void run(const InputDims& input_dims, const array<bool, Rank>&,
+                  Sizes<>*, array<Index, Rank>* reduced_dims) {
+    const int NumInputDims = internal::array_size<InputDims>::value;
+    for (int i = 0; i < NumInputDims; ++i) {
+      (*reduced_dims)[i] = input_dims[i];
+    }
+  }
+};
 
 
 template <typename ReducedDims, int NumTensorDims, int Layout>
@@ -83,35 +107,33 @@
   static const bool value = false;
 };
 
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
-// The use of the tmp1, tmp2, tmp3 intermediate variables is needed for nvcc 7
-// to compile the code below. NVidia is working on a fix.
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
 template <typename ReducedDims, int NumTensorDims>
 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_eq<ReducedDims>()(0, 0);
-  static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
   static const bool value = tmp1 & tmp2 & tmp3;
 };
 template <typename ReducedDims, int NumTensorDims>
 struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value);
-  static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
   static const bool value = tmp1 & tmp2 & tmp3;
 
 };
 template <typename ReducedDims, int NumTensorDims>
 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_gt<ReducedDims>()(0, 0);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
   static const bool value = tmp1 & tmp2;
 
 };
 template <typename ReducedDims, int NumTensorDims>
 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
   static const bool value = tmp1 & tmp2;
 };
 #endif
@@ -120,7 +142,7 @@
 template <int DimIndex, typename Self, typename Op>
 struct GenericDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
-    EIGEN_STATIC_ASSERT(DimIndex >= 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
       GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -128,13 +150,24 @@
   }
 };
 template <typename Self, typename Op>
-struct GenericDimReducer<-1, Self, Op> {
+struct GenericDimReducer<0, Self, Op> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
-    reducer.reduce(self.m_impl.coeff(firstIndex), accum);
+    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+      reducer.reduce(self.m_impl.coeff(input), accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<-1, Self, Op> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
+    reducer.reduce(self.m_impl.coeff(index), accum);
   }
 };
 
-template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
+                                   !Self::ReducerTraits::IsExactlyAssociative)>
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
     typename Self::CoeffReturnType accum = reducer.initialize();
@@ -146,25 +179,102 @@
 };
 
 template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true> {
+struct InnerMostDimReducer<Self, Op, true, false> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
     const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
+    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
     for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
     }
     typename Self::CoeffReturnType accum = reducer.initialize();
     for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
       reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
     }
-    return reducer.finalizeBoth(accum, p);
+    return reducer.finalizeBoth(accum, paccum);
   }
 };
 
-template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+#if !defined(EIGEN_HIPCC) 
+static const int kLeafSize = 1024;
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, false, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+  reduce(const Self& self, typename Self::Index firstIndex,
+         typename Self::Index numValuesToReduce, Op& reducer) {
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > kLeafSize) {
+      const typename Self::Index half = numValuesToReduce / 2;
+      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
+      reducer.reduce(
+          reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
+          &accum);
+    } else {
+      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+      }
+    }
+    return reducer.finalize(accum);
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+  reduce(const Self& self, typename Self::Index firstIndex,
+         typename Self::Index numValuesToReduce, Op& reducer) {
+    const typename Self::Index packetSize =
+        internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > packetSize * kLeafSize) {
+      // Make sure the split point is aligned on a packet boundary.
+      const typename Self::Index split =
+          packetSize *
+          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
+                packetSize);
+      const typename Self::Index num_left =
+          numext::mini(split - firstIndex, numValuesToReduce);
+      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
+      if (num_left < numValuesToReduce) {
+        reducer.reduce(
+            reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
+      }
+      return reducer.finalize(accum);
+    } else {
+      const typename Self::Index UnrollSize =
+          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
+      const typename Self::Index VectorizedSize =
+          (numValuesToReduce / packetSize) * packetSize;
+      typename Self::PacketReturnType paccum =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+            &paccum2);
+      }
+      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+                                 firstIndex + j), &paccum);
+      }
+      reducer.reducePacket(paccum2, &paccum);
+      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
+           ++j) {
+        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+      }
+      return reducer.finalizeBoth(accum, paccum);
+    }
+  }
+};
+#endif
+ 
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct InnerMostDimPreserver {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
     eigen_assert(false && "should never be called");
   }
 };
@@ -172,7 +282,7 @@
 template <int DimIndex, typename Self, typename Op>
 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    EIGEN_STATIC_ASSERT(DimIndex >= 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
       InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -181,37 +291,42 @@
 };
 
 template <typename Self, typename Op>
-struct InnerMostDimPreserver<-1, Self, Op, true> {
+struct InnerMostDimPreserver<0, Self, Op, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex), accum);
+    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<-1, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+    eigen_assert(false && "should never be called");
   }
 };
 
 // Default full reducer
-template <typename Self, typename Op, typename Device,
-          bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct FullReducer {
   static const bool HasOptimizedImplementation = false;
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer,
-                                    const Device&,
-                                    typename Self::CoeffReturnType* output) {
-    const typename Self::Index num_coeffs =
-        array_prod(self.m_impl.dimensions());
-    *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
-        self, 0, num_coeffs, reducer);
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
+    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
   }
 };
 
 
 #ifdef EIGEN_USE_THREADS
+// Multithreaded full reducers
 template <typename Self, typename Op,
-          bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct FullReducerShard {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                   typename Self::Index numValuesToReduce, Op& reducer,
                   typename Self::CoeffReturnType* output) {
-    *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
         self, firstIndex, numValuesToReduce, reducer);
   }
 };
@@ -219,8 +334,8 @@
 // Multithreaded full reducer
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
-  static const bool HasOptimizedImplementation = !Op::IsStateful;
-  static const int PacketSize =
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
+  static const Index PacketSize =
       unpacket_traits<typename Self::PacketReturnType>::size;
 
   // launch one reducer per thread and accumulate the result.
@@ -248,8 +363,8 @@
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    Barrier barrier(numblocks);
-    FixedSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                   self, i * blocksize, blocksize, reducer,
@@ -275,614 +390,85 @@
 #endif
 
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-// Full reducers for GPU, don't vectorize for now
+// Default inner reducer
+template <typename Self, typename Op, typename Device>
+struct InnerReducer {
+  static const bool HasOptimizedImplementation = false;
 
-// Reducer function that enables multiple cuda thread to safely accumulate at the same
-// output address. It basically reads the current value of the output variable, and
-// attempts to update it with the new value. If in the meantime another cuda thread
-// updated the content of the output address it will try again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if __CUDA_ARCH__ >= 300
-  if (sizeof(T) == 4)
-  {
-    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-    unsigned int newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned int readback;
-    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
   }
-  else if (sizeof(T) == 8) {
-    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
-    unsigned long long newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
+};
+
+// Default outer reducer
+template <typename Self, typename Op, typename Device>
+struct OuterReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
   }
-  else {
-    assert(0 && "Wordsize not supported");
+};
+
+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
   }
+};
+#endif
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+
+
+#if defined(EIGEN_HAS_GPU_FP16)
+template <typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+
+#endif
+
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+#endif
+
+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
+ * with the accumulator and the other for reducing two accumulators.
+ * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
+ * some properties of the input.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if defined(EIGEN_USE_SYCL)
+  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
 #else
-  assert(0 && "Shouldn't be called on unsupported device");
+  typedef typename remove_const<CoeffReturnType>::type type;
 #endif
-}
-
-template <typename T>
-__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
-#if __CUDA_ARCH__ >= 300
-  atomicAdd(output, accum);
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-#if __CUDA_ARCH__ >= 300
-  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-  unsigned int newval = oldval;
-  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-  if (newval == oldval) {
-    return;
-  }
-  unsigned int readback;
-  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-    oldval = readback;
-    newval = oldval;
-    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-#endif
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-    output[i] = val;
-  }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    if (first_index == 0) {
-      *output = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  typename Self::CoeffReturnType accum = reducer.initialize();
-
-  // Process up to NumPerThread*BlockSize coefficient per thread while making sure that we don't go past the last coefficient of the tensor.
-  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
-#pragma unroll 8
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + i;
-    eigen_assert(index < num_coeffs);
-    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
-    reducer.reduce(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reduce(__shfl_down(accum, offset), &accum);
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(output, accum, reducer);
-  }
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(
-    Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
-  eigen_assert(threadIdx.x == 1);
-  if (num_coeffs % 2 != 0) {
-    half last = input.m_impl.coeff(num_coeffs - 1);
-    *scratch = __halves2half2(last, reducer.initialize());
-  } else {
-    *scratch = reducer.template initializePacket<half2>();
-  }
-}
-
-template <typename Self, typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index num_packets = num_coeffs / 2;
-  for (Index i = thread_id; i < num_packets; i += num_threads) {
-    ((half2*)output)[i] = reducer.template initializePacket<half2>();
-  }
-
-  if (thread_id == 0 && num_coeffs & 0x1 != 0) {
-    output[num_coeffs-1] = reducer.initialize();
-  }
-}
-
-template <int BlockSize, int NumPerThread, typename Self, typename Reducer,
-          typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer,
-                                             const Self input,
-                                             Index num_coeffs,
-                                             half* output,
-                                             half2* scratch) {
-  eigen_assert(NumPerThread & 0x1 == 0);
-
-  const Index first_index =
-      blockIdx.x * BlockSize * NumPerThread + 2 * threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the
-  // ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    if (num_coeffs & 0x1 != 0) {
-      half last = input.m_impl.coeff(num_coeffs - 1);
-      *scratch = __halves2half2(last, reducer.initialize());
-    } else {
-      *scratch = reducer.template initializePacket<half2>();
-    }
-  }
-
-  half2 accum = reducer.template initializePacket<half2>();
-  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2,
-                                             NumPerThread * BlockSize / 2);
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + 2 * i;
-    eigen_assert(index + 1 < num_coeffs);
-    half2 val = input.m_impl.template packet<Unaligned>(index);
-    reducer.reducePacket(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(scratch, accum, reducer);
-  }
-
-  __syncthreads();
-
-  if (gridDim.x == 1 && first_index == 0) {
-    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), &tmp);
-    *output = tmp;
-  }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output,
-                                                half2* scratch) {
-  eigen_assert(threadIdx.x == 1);
-  half tmp = __low2half(*scratch);
-  reducer.reduce(__high2half(*scratch), &tmp);
-  *output = tmp;
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher {
-  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on floats and half floats");
-  }
-};
-
-// Launch a full reduction on fp32. Packet access isn't required in this case.
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
-    if (num_blocks > 1) {
-       // We initialize the outputs outside the reduction kernel when we can't
-      // be sure that there  won't be a race conditions between multiple
-      // thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>), 1, 32, 0, device,
-                         reducer.initialize(), 1, output);
-    }
-
-    LAUNCH_CUDA_KERNEL(
-        (FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
-  }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-// Launch a full reduction on fp16. Packet access is required in this case, so make sure we abort if a fp16 optimized reduction is attempted when packet accessors aren't available (note that this would be a programming mistake, in practice we should always fallback to the generic reduction code to handle such cases)
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
-  // Leave the function unimplemented to create a linking error.
-  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index);
-};
-
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    half2* scratch = static_cast<half2*>(device.scratchpad());
-
-    if (num_blocks > 1) {
-      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
-    }
-
-    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
-
-    if (num_blocks > 1) {
-      // TODO(bsteiner): refactor the code to avoid the need for 3 kernel launches.
-      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
-                         1, 1, 0, device, reducer, output, scratch);
-    }
-  }
-};
-#endif
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-// Unfortunately nvidia doesn't support well exotic types such as complex,
-// so reduce the scope of the optimized version of the code to the simple case
-// of floats and half floats.
-#ifdef EIGEN_HAS_CUDA_FP16
-  static const bool HasOptimizedImplementation =
-      !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
-        reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation =
-      !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
-#endif
-
-  template <typename OutputType>
-  static void run(const Self& self, Op& reducer, const GpuDevice& device,
-                  OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
-    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output,
-                                         num_coeffs);
-  }
-};
-
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                         typename Self::CoeffReturnType* output) {
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
-  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    // We don't need a __syncthreads() here, because we don't write to output
-    // until after a __syncthreads().
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = i / input_col_blocks;
-
-    if (row < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
-
-      float reduced_val = reducer.initialize();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
-        if (last_col >= num_coeffs_to_reduce) {
-          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) {
-            const float val =input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            reducer.reduce(val, &reduced_val);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k);
-            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
-          }
-        }
-      }
-
-      __syncthreads();
-
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-      }
-
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        atomicReduce(&(output[row]), reduced_val, reducer);
-      }
-    }
-
-    __syncthreads();
-  }
-}
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher {
-  // Leave the function unimplemented to create a linking error.
-  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index);
-};
-
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs in the reduction kernel itself when we don't have to worry
-      // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-  }
-};
-
-
-template <typename Self, typename Op>
-struct InnerGpuReducer {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
-
-
-  template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
-  }
-
-  template <typename OutputType>
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-    InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
-  }
-};
-
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                         typename Self::CoeffReturnType* output) {
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
-  for (Index i = thread_id; i < max_iter; i += num_threads) {
-    const Index input_col = i % num_preserved_coeffs;
-    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
-    typename Self::CoeffReturnType reduced_val = reducer.initialize();
-    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
-    for (Index j = input_row; j < max_row; j++) {
-      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
-      reducer.reduce(val, &reduced_val);
-    }
-    atomicReduce(&(output[input_col]), reduced_val, reducer);
-  }
-}
-
-
-template <typename Self, typename Op>
-struct OuterGpuReducer {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
-
-  template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
-  }
-
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
-    const int block_size = 256;
-    const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs in the reduction kernel itself when we don't have to worry
-      // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-  }
-};
-
-#endif
-
-
-template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
-class BlockReducer {
- public:
-  typedef typename Self::Index Index;
-  typedef typename Self::Scalar Scalar;
-  typedef typename Self::CoeffReturnType CoeffReturnType;
-  typedef typename Self::PacketReturnType PacketReturnType;
-  explicit BlockReducer(const Op& reducer) : op_(reducer) {
-    accum_ = op_.initialize();
-  }
-  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
-    for (Index i = 0; i < num_values_to_reduce; ++i) {
-      op_.reduce(data[index + i], &accum_);
-    }
-  }
-  CoeffReturnType Finalize() {
-    return op_.finalize(accum_);
-  }
-  PacketReturnType FinalizePacket() {
-    // TODO(andydavis) This function should not be called for Scalar
-    // reductions: clean this up or add an assert here.
-    return PacketReturnType();
-  }
-
- private:
-  CoeffReturnType accum_;
-  Op op_;
-};
-
-template <typename Self, typename Op>
-class BlockReducer<Self, Op, true> {
- public:
-  typedef typename Self::Index Index;
-  typedef typename Self::Scalar Scalar;
-  typedef typename Self::CoeffReturnType CoeffReturnType;
-  typedef typename Self::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
-  explicit BlockReducer(const Op& reducer) : op_(reducer) {
-    vaccum_ = op_.template initializePacket<PacketReturnType>();
-    accum_ = op_.initialize();
-  }
-  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
-    const Index vectorized_size = (num_values_to_reduce / PacketSize) *
-        PacketSize;
-    for (Index i = 0; i < vectorized_size; i += PacketSize) {
-      op_.reducePacket(internal::ploadt<PacketReturnType, Unaligned>(
-          &data[index + i]), &vaccum_);
-    }
-    for (Index i = vectorized_size; i < num_values_to_reduce; ++i) {
-      op_.reduce(data[index + i], &accum_);
-    }
-  }
-  CoeffReturnType Finalize() {
-    return op_.finalizeBoth(accum_, vaccum_);
-  }
-  PacketReturnType FinalizePacket() {
-    return op_.finalizePacket(vaccum_);
-  }
-
- private:
-  PacketReturnType vaccum_;
-  CoeffReturnType accum_;
-  Op op_;
 };
 
 }  // end namespace internal
 
 
-template <typename Op, typename Dims, typename XprType>
-class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
+template <typename Op, typename Dims, typename XprType,  template <class> class MakePointer_>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
   public:
     typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
     typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
@@ -911,61 +497,77 @@
     const Op m_reducer;
 };
 
+template<typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
 
 // Eval as rvalue
-template<typename Op, typename Dims, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
-  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
+  typedef internal::reducer_traits<Op, Device> ReducerTraits;
+  typedef Dims ReducedDims;
+  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
+  typedef ArgType ChildType;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
   static const int NumInputDims = internal::array_size<InputDimensions>::value;
   static const int NumReducedDims = internal::array_size<Dims>::value;
-  EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
   static const int NumOutputDims = NumInputDims - NumReducedDims;
-  typedef DSizes<Index, NumOutputDims> Dimensions;
+  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+    // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
 
   enum {
     IsAligned = false,
-    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  typedef typename internal::TensorBlock<Index, ScalarNonConst, NumOutputDims,
-                                         Layout> OutputTensorBlock;
-  typedef typename internal::TensorBlock<Index, ScalarNonConst, NumInputDims,
-                                         Layout> InputTensorBlock;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
-  static const bool RunningFullReduction = (NumInputDims==NumReducedDims);
+  static const bool RunningFullReduction = (NumOutputDims==0);
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
   {
+    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Build the bitmap indicating if an input dimension is reduced or not.
     for (int i = 0; i < NumInputDims; ++i) {
-      m_reduced_dim[i] = false;
+      m_reduced[i] = false;
     }
     for (int i = 0; i < NumReducedDims; ++i) {
       eigen_assert(op.dims()[i] >= 0);
       eigen_assert(op.dims()[i] < NumInputDims);
-      m_reduced_dim[op.dims()[i]] = true;
+      m_reduced[op.dims()[i]] = true;
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-    internal::partition_dims(input_dims, m_reduced_dim, &m_dimensions, &m_reducedDims);
+    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
 
     // Precompute output strides.
     if (NumOutputDims > 0) {
@@ -986,148 +588,191 @@
 
     // Precompute input strides.
     if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_inputStrides[0] = 1;
+        input_strides[0] = 1;
         for (int i = 1; i < NumInputDims; ++i) {
-          m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+          input_strides[i] = input_strides[i-1] * input_dims[i-1];
         }
       } else {
-        m_inputStrides[NumInputDims - 1] = 1;
+        input_strides.back() = 1;
         for (int i = NumInputDims - 2; i >= 0; --i) {
-          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      int outputIndex = 0;
+      int reduceIndex = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if (m_reduced[i]) {
+          m_reducedStrides[reduceIndex] = input_strides[i];
+          ++reduceIndex;
+        } else {
+          m_preservedStrides[outputIndex] = input_strides[i];
+          m_output_to_input_dim_map[outputIndex] = i;
+          ++outputIndex;
         }
       }
     }
 
-    int outputIndex = 0;
-    int reduceIndex = 0;
-    for (int i = 0; i < NumInputDims; ++i) {
-      if (m_reduced_dim[i]) {
-        m_reducedStrides[reduceIndex] = m_inputStrides[i];
-        ++reduceIndex;
-      } else {
-        m_preservedStrides[outputIndex] = m_inputStrides[i];
-        m_output_to_input_dim_map[outputIndex] = i;
-        ++outputIndex;
-      }
+    // Special case for full reductions
+    if (NumOutputDims == 0) {
+      m_preservedStrides[0] = internal::array_prod(input_dims);
     }
 
-    m_numValuesToReduce
-        = NumOutputDims == 0 ? internal::array_prod(input_dims)
-        : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
-            ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
-
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                        device.lastLevelCacheSize() /
-                                        sizeof(Scalar));
+    m_numValuesToReduce =
+        NumOutputDims == 0
+            ? internal::array_prod(input_dims)
+            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                  ? m_preservedStrides[0]
+                  : m_preservedStrides[NumOutputDims - 1];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-
+  EIGEN_STRONG_INLINE
+  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
     // Use the FullReducer if possible.
-    if (RunningFullReduction &&
+    if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
         internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         !RunningOnGPU)) {
+         !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
-        // Make sure the data is aligned.
-        eigen_assert((reinterpret_cast<size_t>(m_result) & 0x3) == 0);
+        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
         data = m_result;
         need_assign = true;
       }
-      else if ((reinterpret_cast<size_t>(data) & 0x3) != 0) {
-        // The data isn't aligned, so revert to the slow code path
-        return true;
-      }
       Op reducer(m_reducer);
       internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
       return need_assign;
     }
-    // Attempt to use an optimized reduction. This requires that the data is aligned.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-    else if (RunningOnGPU && ((reinterpret_cast<size_t>(data) & 0x3) == 0) && (m_device.majorDeviceVersion() >= 3)) {
+
+    // Attempt to use an optimized reduction.
+    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-          reducing_inner_dims &= m_reduced_dim[i];
+          reducing_inner_dims &= m_reduced[i];
         } else {
-          reducing_inner_dims &= m_reduced_dim[NumInputDims - 1 - i];
+          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
         }
       }
-      if (internal::InnerGpuReducer<Self, Op>::HasOptimizedImplementation &&
+      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
           (reducing_inner_dims || ReducingInnerMostDims)) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-        if (num_values_to_reduce <= 32) {
-          // It's faster to call the usual codepath if we have fewer than warpSize
-          // values to reduce.
-          return true;
-        }
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-        bool need_assign = false;
         if (!data) {
-          m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-          data = m_result;
-          need_assign = true;
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
         }
         Op reducer(m_reducer);
-        internal::InnerGpuReducer<Self, Op>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-        return need_assign;
+        // For SYCL this if always return false
+        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate_temp(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
       }
 
       bool preserving_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-          preserving_inner_dims &= m_reduced_dim[NumInputDims - 1 - i];
+          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
         } else {
-          preserving_inner_dims &= m_reduced_dim[i];
+          preserving_inner_dims &= m_reduced[i];
         }
       }
-      if (internal::OuterGpuReducer<Self, Op>::HasOptimizedImplementation &&
-          (preserving_inner_dims)) {
+      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
+          preserving_inner_dims) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-        if (num_values_to_reduce <= 32) {
-          // It's faster to call the usual codepath if we have fewer than warpSize
-          // values to reduce.
-          return true;
-        }
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-        bool need_assign = false;
         if (!data) {
-          m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-          data = m_result;
-          need_assign = true;
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
         }
         Op reducer(m_reducer);
-        internal::OuterGpuReducer<Self, Op>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-        return need_assign;
+        // For SYCL this if always return false
+        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate_temp(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
       }
+      #if defined(EIGEN_USE_SYCL)
+      // If there is no Optimised version for SYCL, the reduction expression 
+      // must break into two subexpression and use the SYCL generic Reducer on the device.
+      if(RunningOnSycl) {
+         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+         if (!data) {
+           data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+           m_result = data;
+         }
+         Op reducer(m_reducer);
+         internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+         return (m_result != NULL);
+       }
+      #endif
     }
-#endif
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_impl.cleanup();
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE
+      void
+      evalSubExprsIfNeededAsync(EvaluatorPointerType data,
+                                EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
+      done(evalSubExprsIfNeededCommon(data));
+    });
+  }
+#endif
 
+  EIGEN_STRONG_INLINE
+  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return evalSubExprsIfNeededCommon(data);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
     if (m_result) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
+      m_result = NULL;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if ((RunningFullReduction || RunningOnGPU) && m_result) {
+    if (( RunningFullReduction || RunningOnGPU) && m_result ) {
       return *(m_result + index);
     }
     Op reducer(m_reducer);
-    if (ReducingInnerMostDims) {
+    if (ReducingInnerMostDims || RunningFullReduction) {
+      const Index num_values_to_reduce =
+        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
       return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
-                                                             m_numValuesToReduce, reducer);
+                                                             num_values_to_reduce, reducer);
     } else {
       typename Self::CoeffReturnType accum = reducer.initialize();
       internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
@@ -1139,19 +784,19 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
 
     if (RunningOnGPU && m_result) {
       return internal::pload<PacketReturnType>(m_result + index);
     }
 
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     if (ReducingInnerMostDims) {
-      const Index num_values_to_reduce = m_numValuesToReduce;
+      const Index num_values_to_reduce =
+        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
       const Index firstIndex = firstInput(index);
-      for (Index i = 0; i < packetSize; ++i) {
+      for (Index i = 0; i < PacketSize; ++i) {
         Op reducer(m_reducer);
         values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                     num_values_to_reduce, reducer);
@@ -1160,18 +805,18 @@
       const Index firstIndex = firstInput(index);
       const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
       // TBD: extend this the the n innermost dimensions that we preserve.
-      if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
+      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
         Op reducer(m_reducer);
         typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
         internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
         return reducer.finalizePacket(accum);
       } else {
-        for (int i = 0; i < packetSize; ++i) {
+        for (int i = 0; i < PacketSize; ++i) {
           values[i] = coeff(index + i);
         }
       }
     } else {
-      for (int i = 0; i < packetSize; ++i) {
+      for (int i = 0; i < PacketSize; ++i) {
         values[i] = coeff(index + i);
       }
     }
@@ -1179,13 +824,6 @@
     return rslt;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    resources->push_back(internal::TensorOpResourceRequirements(
-        internal::kSkewedInnerDims, m_block_total_size_max));
-    m_impl.getResourceRequirements(resources);
-  }
-
   // Must be called after evalSubExprsIfNeeded().
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     if (RunningFullReduction && m_result) {
@@ -1198,271 +836,46 @@
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
-      OutputTensorBlock* output_block) const {
-    // Special case full reductions to avoid input block copy below.
-    if (NumInputDims == NumReducedDims) {
-      eigen_assert(output_block->first_coeff_index() == 0);
-      eigen_assert(output_block->block_sizes().TotalSize() == 1);
-      Op reducer(m_reducer);
-      output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
-          *this, 0, m_numValuesToReduce, reducer);
-      return;
-    }
-
-    // Calculate input tensor 'slice' required to reduce output block coeffs.
-    DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
-    for (int i = 0; i < NumOutputDims; ++i) {
-      // Clip preserved input dimensions by output block size.
-      input_slice_sizes[m_output_to_input_dim_map[i]] =
-          output_block->block_sizes()[i];
-    }
-
-    // Shard input tensor slice into blocks (because it could be large if we
-    // need to reduce along several dimensions to calculate required output
-    // coefficients).
-    const Index max_coeff_count =
-        numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
-                   input_slice_sizes.TotalSize());
-
-    // Calculate max output shard size needed to keep working set of reducers
-    // in L1, while leaving enough space for reducer overhead and 'PacketSize'
-    // reductions.
-    DSizes<Index, NumInputDims> target_input_block_sizes;
-    CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
-                                   &target_input_block_sizes);
-    // Calculate indices for first preserved dimension.
-    const Index first_preserved_dim_output_index =
-        static_cast<int>(Layout) == static_cast<int>(ColMajor) ?
-        0 : NumOutputDims - 1;
-    const Index first_preserved_dim_input_index = m_output_to_input_dim_map[
-        first_preserved_dim_output_index];
-    const bool inner_most_dim_preserved = first_preserved_dim_input_index ==
-        (static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 :
-         NumInputDims - 1) | PreservingInnerMostDims;
-
-    // Calculate output block inner/outer dimension sizes.
-    const Index output_block_inner_dim_size = output_block->block_sizes()[
-        first_preserved_dim_output_index];
-    const Index output_block_outer_dim_size =
-        output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
-    // Calculate shard size for first preserved dimension.
-    const Index output_shard_size = target_input_block_sizes[
-        first_preserved_dim_input_index];
-    const Index num_output_shards =
-        (output_block_inner_dim_size + output_shard_size - 1) /
-        output_shard_size;
-
-    // Initialize 'tensor_slice_offsets' from input coords of output index.
-    DSizes<Index, NumInputDims> tensor_slice_offsets;
-    GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
-                                 &tensor_slice_offsets);
-
-    // Store tensor slice offset in first preserved dimension to be used
-    // to update tensor slice extents in loop below.
-    const Index first_preserved_dim_offset_start = tensor_slice_offsets[
-        first_preserved_dim_input_index];
-
-    array<BlockIteratorState, NumOutputDims> block_iter_state;
-
-    // Initialize state used to iterate through output coefficients
-    // and update 'tensor_slice_offsets' in outer preserved dims.
-    for (int i = 0; i < NumOutputDims - 1; ++i) {
-      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-          ? i + 1 : NumOutputDims - i - 2;
-      block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
-      block_iter_state[i].output_size = output_block->block_sizes()[dim];
-      block_iter_state[i].output_count = 0;
-    }
-
-    // Allocate input block memory.
-    ScalarNonConst* input_block_data = static_cast<ScalarNonConst*>(
-        m_device.allocate(max_coeff_count * sizeof(Scalar)));
-    // Allocate reducer memory.
-    const bool packet_reductions_enabled = (Self::InputPacketAccess &
-                                            Op::PacketAccess);
-    const Index num_reducers =
-        (inner_most_dim_preserved && packet_reductions_enabled) ?
-        (output_shard_size / PacketSize + output_shard_size % PacketSize +
-         PacketSize) : output_shard_size;
-    typedef internal::BlockReducer<Self, Op> BlockReducer;
-    BlockReducer* reducers = static_cast<BlockReducer*>(
-        m_device.allocate(num_reducers * sizeof(BlockReducer)));
-
-    InputDimensions input_tensor_dims(m_impl.dimensions());
-    for (Index output_outer_index = 0;
-         output_outer_index < output_block_outer_dim_size;
-         ++output_outer_index) {
-      for (Index output_shard_index = 0;
-           output_shard_index < num_output_shards;
-           ++output_shard_index) {
-        // Initialize 'tensor_slice_extents' for this output shard.
-        DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
-        for (int i = 0; i < NumInputDims; ++i) {
-          if (i == first_preserved_dim_input_index) {
-            // Clip first preserved dim size to output shard size.
-            tensor_slice_extents[i] = numext::mini(
-                output_shard_size,
-                input_slice_sizes[i] - (tensor_slice_offsets[i] -
-                                        first_preserved_dim_offset_start));
-
-          } else if (!m_reduced_dim[i]) {
-            // Clip outer preserved dims to size 1, so that we reduce a
-            // contiguous set of output coefficients.
-            tensor_slice_extents[i] = 1;
-          }
-        }
-
-        // Intialize output coefficient reducers.
-        for (int i = 0; i < num_reducers; ++i) {
-          new (&reducers[i]) BlockReducer(m_reducer);
-        }
-
-        typedef internal::TensorSliceBlockMapper<
-          Index, ScalarNonConst, NumInputDims, Layout> TensorSliceBlockMapper;
-
-        // TODO(andydavis) Consider removing 'input_block_stride_order' if we
-        // find that scattered reads are not worth supporting in
-        // TensorSliceBlockMapper.
-        TensorSliceBlockMapper block_mapper(
-            input_tensor_dims, tensor_slice_offsets, tensor_slice_extents,
-            target_input_block_sizes, DimensionList<Index, NumInputDims>());
-
-        const Index num_outputs_to_update = tensor_slice_extents[
-            first_preserved_dim_input_index];
-        const Index preserved_dim_vector_reducer_count =
-            (inner_most_dim_preserved && packet_reductions_enabled) ?
-            num_outputs_to_update / PacketSize: 0;
-        const Index preserved_dim_vector_coeff_count =
-            inner_most_dim_preserved ? preserved_dim_vector_reducer_count *
-            PacketSize : 0;
-        const Index preserved_dim_reducer_limit =
-            (inner_most_dim_preserved && packet_reductions_enabled) ?
-          (preserved_dim_vector_reducer_count +
-           num_outputs_to_update % PacketSize) : num_outputs_to_update;
-
-        const Index total_block_count = block_mapper.total_block_count();
-        for (Index b = 0; b < total_block_count; ++b) {
-          InputTensorBlock input_block = block_mapper.GetBlockForIndex(
-              b, input_block_data);
-          // Read.
-          m_impl.block(&input_block);
-
-          Index num_values_to_reduce = 1;
-          for (Index i = 0; i < NumInputDims; ++i) {
-            if (m_reduced_dim[i]) {
-              num_values_to_reduce *= input_block.block_sizes()[i];
-            }
-          }
-          // Reduce.
-          if (inner_most_dim_preserved) {
-            const Index input_outer_dim_size =
-                input_block.block_sizes().TotalSize() / num_outputs_to_update;
-            for (Index input_outer_dim_index = 0;
-                 input_outer_dim_index < input_outer_dim_size;
-                 ++input_outer_dim_index) {
-              const Index input_outer_dim_base = input_outer_dim_index *
-                  num_outputs_to_update;
-              for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
-                reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
-                                   PacketSize, input_block.data());
-              }
-              const Index scalar_reducer_base = input_outer_dim_base +
-                  preserved_dim_vector_coeff_count;
-              for (Index i = preserved_dim_vector_reducer_count;
-                   i < preserved_dim_reducer_limit; ++i) {
-                reducers[i].Reduce(scalar_reducer_base + i -
-                                   preserved_dim_vector_reducer_count,
-                                   1,
-                                   input_block.data());
-              }
-            }
-          } else {
-            for (Index i = 0; i < num_outputs_to_update; ++i) {
-              reducers[i].Reduce(i * num_values_to_reduce,
-                                 num_values_to_reduce,
-                                 input_block.data());
-            }
-          }
-        }
-
-        // Finalize all reducers for this output shard.
-        const Index output_base_index =
-            output_outer_index * output_block_inner_dim_size +
-            output_shard_index * output_shard_size;
-        if (inner_most_dim_preserved) {
-          EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-          for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
-            const Index reducer_base = output_base_index + i * PacketSize;
-            internal::pstore<CoeffReturnType, PacketReturnType>(
-                values, reducers[i].FinalizePacket());
-            for (Index j = 0; j < PacketSize; ++j) {
-              output_block->data()[reducer_base + j] = values[j];
-            }
-          }
-          const Index scalar_reducer_base = output_base_index +
-              preserved_dim_vector_coeff_count;
-
-          for (Index i = preserved_dim_vector_reducer_count;
-               i < preserved_dim_reducer_limit; ++i) {
-            output_block->data()[
-                scalar_reducer_base + i - preserved_dim_vector_reducer_count] =
-                reducers[i].Finalize();
-          }
-        } else {
-          for (int i = 0; i < num_outputs_to_update; ++i) {
-            output_block->data()[output_base_index + i] =
-                reducers[i].Finalize();
-          }
-        }
-
-        // Update 'tensor_slice_offsets' by num outputs for this output shard.
-        tensor_slice_offsets[first_preserved_dim_input_index] +=
-            num_outputs_to_update;
-      }
-      // Update slice offset for inner preserved dim.
-      tensor_slice_offsets[first_preserved_dim_input_index] -=
-          output_block_inner_dim_size;
-      // Update slice offsets for remaining output dims.
-      for (int i = 0; i < NumOutputDims - 1; ++i) {
-        BlockIteratorState& b = block_iter_state[i];
-        if (++b.output_count < b.output_size) {
-          ++tensor_slice_offsets[b.input_dim];
-          break;
-        }
-        b.output_count = 0;
-        tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
-      }
-    }
-
-    // Free memory.
-    m_device.deallocate(input_block_data);
-    m_device.deallocate(reducers);
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_result.bind(cgh);
   }
-
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+#endif
 
   private:
   template <int, typename, typename> friend struct internal::GenericDimReducer;
-  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
+  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
   template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
   template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
 #ifdef EIGEN_USE_THREADS
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
-#ifdef EIGEN_HAS_CUDA_FP16
-  template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half*);
-  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+#if defined(EIGEN_HAS_GPU_FP16)
+  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
+  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+#endif
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
-  template <int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-  template <int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+#if defined(EIGEN_USE_SYCL)
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
+ // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
+ template <typename, typename, typename> friend struct internal::GenericReducer;
 #endif
 
+
+  template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
   struct BlockIteratorState {
     Index input_dim;
     Index output_size;
@@ -1473,114 +886,52 @@
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
     if (ReducingInnerMostDims) {
-      return index * m_numValuesToReduce;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        return index * m_preservedStrides[0];
+      } else {
+        return index * m_preservedStrides[NumPreservedStrides - 1];
+      }
     }
+    // TBD: optimize the case where we preserve the innermost dimensions.
     Index startInput = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumOutputDims - 1; i > 0; --i) {
         // This is index_i in the output tensor.
-        const Index idx = index / m_fastOutputStrides[i];
+        const Index idx = index / m_outputStrides[i];
         startInput += idx * m_preservedStrides[i];
         index -= idx * m_outputStrides[i];
       }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[0] == 1);
+        startInput += index;
+      } else {
+        startInput += index * m_preservedStrides[0];
+      }
     } else {
       for (int i = 0; i < NumOutputDims - 1; ++i) {
         // This is index_i in the output tensor.
-        const Index idx = index / m_fastOutputStrides[i];
+        const Index idx = index / m_outputStrides[i];
         startInput += idx * m_preservedStrides[i];
         index -= idx * m_outputStrides[i];
       }
-    }
-    if (PreservingInnerMostDims) {
-      eigen_assert(m_numValuesToReduce == 1);
-      startInput += index;
-    } else {
-      startInput += index * m_numValuesToReduce;
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
+        startInput += index;
+      } else {
+        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
+      }
     }
     return startInput;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex(
-      Index index,
-      DSizes<Index, NumInputDims>* coords) const {
-    for (int i = 0; i < NumInputDims; ++i) {
-      (*coords)[i] = 0;
-    }
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = NumOutputDims - 1; i > 0; --i) {
-        const Index idx = index / m_fastOutputStrides[i];
-        (*coords)[m_output_to_input_dim_map[i]] = idx;
-        index -= idx * m_outputStrides[i];
-      }
-      (*coords)[m_output_to_input_dim_map[0]] = index;
-    } else {
-      for (int i = 0; i < NumOutputDims - 1; ++i) {
-        const Index idx = index / m_fastOutputStrides[i];
-        (*coords)[m_output_to_input_dim_map[i]] = idx;
-        index -= idx * m_outputStrides[i];
-      }
-      (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape(
-      const Index max_coeff_count,
-      const DSizes<Index, NumInputDims>& input_slice_sizes,
-      DSizes<Index, NumInputDims>* target_input_block_sizes) const {
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-    typedef internal::BlockReducer<Self, Op> BlockReducer;
-    // TODO(andydavis) Compute reducer overhead correctly for the case where
-    // we are preserving the inner most dimension, and a single reducer
-    // reduces a packet's worth of output coefficients.
-    const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar);
-
-    Index coeff_to_allocate = max_coeff_count;
-    bool first_preserved_dim_allocated = false;
-    bool first_reduced_dim_allocated = false;
-    for (int i = 0; i < NumInputDims; ++i) {
-      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
-          ? i : NumInputDims - i - 1;
-      (*target_input_block_sizes)[dim] = 1;
-      if (m_reduced_dim[dim]) {
-        // TODO(andydavis) Consider allocating to multiple reduced dimensions.
-        // Watch out for cases where reduced dimensions are not contiguous,
-        // which induces scattered reads.
-        if (!first_reduced_dim_allocated) {
-          (*target_input_block_sizes)[dim] = numext::mini(input_slice_sizes[dim],
-                                                        coeff_to_allocate);
-          coeff_to_allocate /= (*target_input_block_sizes)[dim];
-          first_reduced_dim_allocated = true;
-        }
-      } else if (!first_preserved_dim_allocated) {
-        // TODO(andydavis) Include output block size in this L1 working set
-        // calculation.
-        const Index allocated = max_coeff_count - coeff_to_allocate;
-        const Index alloc_size = numext::maxi(static_cast<Index>(1),
-                                            coeff_to_allocate /
-                                            reducer_overhead);
-        (*target_input_block_sizes)[dim] = numext::mini(input_slice_sizes[dim],
-                                                      alloc_size);
-        coeff_to_allocate = numext::maxi(
-            static_cast<Index>(1),
-            coeff_to_allocate / ((*target_input_block_sizes)[dim] *
-                                 reducer_overhead));
-        first_preserved_dim_allocated = true;
-      }
-    }
-  }
-
   // Bitmap indicating if an input dimension is reduced or not.
-  array<bool, NumInputDims> m_reduced_dim;
+  array<bool, NumInputDims> m_reduced;
   // Dimensions of the output of the operation.
   Dimensions m_dimensions;
-  // Precomputed strides for the input tensor.
-  array<Index, NumInputDims> m_inputStrides;
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
-  // Subset of strides of the input tensor for the non-reduced dimensions.
-  // Indexed by output dimensions.
-  array<Index, NumOutputDims> m_preservedStrides;
+  array<Index, NumPreservedStrides> m_preservedStrides;
   // Map from output to input dimension index.
   array<Index, NumOutputDims> m_output_to_input_dim_map;
   // How many values go into each reduction
@@ -1600,15 +951,46 @@
   Op m_reducer;
 
   // For full reductions
-#ifdef EIGEN_USE_GPU
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
+  static const bool RunningOnSycl = false;
+#elif defined(EIGEN_USE_SYCL)
+static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
+static const bool RunningOnGPU = false;
 #else
   static const bool RunningOnGPU = false;
+  static const bool RunningOnSycl = false;
 #endif
-  CoeffReturnType* m_result;
-  std::size_t m_block_total_size_max;
+  EvaluatorPointerType m_result;
 
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+  //Therefore the coeff function should be overridden by for SYCL kernel
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+  //Therefore the packet function should be overridden by for SYCL kernel
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+  }
 };
 
 } // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 0b3bc7c..68780cd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h

@@ -1,751 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-
-#if defined(EIGEN_USE_GPU)
-
-namespace Eigen {
-namespace internal {
-
-template <typename OutExpr, typename InExpr, typename Op, typename Indices,
-          bool Tileable>
-class TensorExecutor<
-    const TensorAssignOp<
-        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-    GpuDevice, false, Tileable> {
- public:
-  typedef const TensorAssignOp<
-      OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
-      Expression;
-  static void run(const Expression& expr, const GpuDevice& device);
-};
-
-template <typename OutExpr, typename InExpr, typename Op, typename Indices,
-          bool Tileable>
-class TensorExecutor<
-    const TensorAssignOp<
-        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-    GpuDevice, true, Tileable> {
- public:
-  typedef const TensorAssignOp<
-      OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
-      Expression;
-  static void run(const Expression& expr, const GpuDevice& device);
-};
-
-template <typename InExpr, typename Op, typename Indices, bool Tileable>
-class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
-                         Op, const Indices, const InExpr> >,
-                     GpuDevice, false, Tileable> {
- public:
-  typedef const TensorEvalToOp<
-      const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
-  static void run(const Expression& expr, const GpuDevice& device);
-};
-
-template <typename InExpr, typename Op, typename Indices, bool Tileable>
-class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
-                         Op, const Indices, const InExpr> >,
-                     GpuDevice, true, Tileable> {
- public:
-  typedef const TensorEvalToOp<
-      const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
-  static void run(const Expression& expr, const GpuDevice& device);
-};
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#if defined(__CUDACC__)
-
-namespace Eigen {
-
-namespace internal {
-
-namespace {
-
-// Initialize output[0..size-1] with val
-template <typename Output>
-__global__ void InitVector(const float val, int size, Output output) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = idx; i < size; i += gridDim.x * blockDim.x) {
-    output.coeffRef(i) = val;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// Column Reduction kernels
-// -----------------------------------------------------------------------------
-template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
-          typename Output, typename Reducer>
-__global__ void ColumnReduceKernel(Reducer reducer, const Input input, int rows,
-                                   int cols, Output output) {
-  assert(blockDim.x == BLOCK_DIM);
-  assert(blockDim.y == 1);
-  assert(blockDim.z == 1);
-
-  assert(gridDim.x == GRID_DIM);
-  assert(gridDim.y == 1);
-  assert(gridDim.z == 1);
-
-  typedef typename Input::Index Index;
-
-  const Index num_input_points = divup(rows, NUM_PER_THREAD) * cols;
-  const int bx = blockIdx.x;
-  const int tx = threadIdx.x;
-
-  for (Index i = bx * BLOCK_DIM + tx; i < num_input_points;
-       i += BLOCK_DIM * GRID_DIM) {
-    const Index input_col = i % cols;
-    const Index input_row_begin =
-        ((i / cols) % divup(rows, NUM_PER_THREAD)) * NUM_PER_THREAD;
-    float reduced_val = reducer.bottom_value();
-    for (int j = 0; j < NUM_PER_THREAD; ++j) {
-      float val = ((input_col < cols) && (input_row_begin + j < rows))
-                      ? input.coeff((input_row_begin + j) * cols + input_col)
-                      : reducer.bottom_value();
-      reduced_val = reducer(reduced_val, val);
-    }
-#if __CUDA_ARCH__ >= 300
-    reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val);
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
 #endif
-  }
-}
 
-template <typename Input, typename Output, typename Reducer>
-void ColumnReduceCuda(Reducer reducer, const GpuDevice& device,
-                      const Input input, int rows, int cols, Output output) {
-  const int block_size = 256;
-  const int grid_size = 128;
-  const int num_per_thread = 16;
-  LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device, reducer.bottom_value(),
-                     cols, output);
-  LAUNCH_CUDA_KERNEL(
-      (ColumnReduceKernel<grid_size, block_size, num_per_thread, Input, Output, Reducer>), grid_size,
-      block_size, 0, device, reducer, input, rows, cols, output);
-}
-
-// -----------------------------------------------------------------------------
-// Row Reduction kernel
-// -----------------------------------------------------------------------------
-
-// Template parameter to RowReduceKernel derived from the kernel's block and
-// grid dimensions.
-enum RowReduceDims {
-  ONE_WARP_PER_ROW,    // gridDim.x == 1 && blockDim.x == warpSize
-  ONE_BLOCK_PER_ROW,   // gridDim.x == 1 && blockDim.x > warpSize
-  MANY_BLOCKS_PER_ROW  // gridDim.x > 1
-};
-
-// Reduces along a matrix's rows to produce a column vector.
-template <RowReduceDims Dims, bool MultipleRowsPerThread, typename Input,
-          typename Output, typename Reducer>
-__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows,
-                                int cols, Output output) {
-  typedef typename Input::Index Index;
-
-  // Keep this in sync with the constant in RowReduceCuda.
-  constexpr int ElemsPerRow = 8;
-
-  // All threads in a warp must have the same threadIdx.y so they operate on the
-  // same row.
-  assert(blockDim.x % warpSize == 0);
-  // This is a 2D kernel.
-  assert(blockDim.z == 1);
-  assert(gridDim.z == 1);
-
-  // Dims should be entirely determined by our block and grid dimensions.
-  if (blockDim.x == warpSize) {
-    assert(Dims == ONE_WARP_PER_ROW);
-    assert(gridDim.x == 1);
-  } else if (gridDim.x == 1) {
-    assert(Dims == ONE_BLOCK_PER_ROW);
-  } else {
-    assert(Dims == MANY_BLOCKS_PER_ROW);
-  }
-
-  // The MultipleRowsPerThread template parameter should be entirely determined
-  // by our block/grid dims and the number of rows.
-  assert(MultipleRowsPerThread == (gridDim.y * blockDim.y < rows));
-
-  const Index first_row = blockIdx.y * blockDim.y + threadIdx.y;
-  const Index row_step = blockDim.y * gridDim.y;
-#pragma nounroll
-  for (Index row = first_row; row < rows; row += row_step) {
-    // We have to be careful how we tell the compiler that we only make one trip
-    // through the loop if MultipleRowsPerThread is false: If Index is unsigned,
-    // the compiler won't assume that e.g. first_row + 1 does not overflow.  But
-    // clang is smart enough to understand this.
-    if (!MultipleRowsPerThread && row != first_row)
-      break;
-
-    float val = reducer.bottom_value();
-    Index col_start = ElemsPerRow * (blockIdx.x * blockDim.x + threadIdx.x);
-    if (col_start + ElemsPerRow - 1 < cols) {
-      // Manually unrolling this loop and doing a tree reduction instead of a
-      // straight-line reduction doesn't have a noticable performance impact on
-      // Kepler.
-#pragma unroll 8
-      for (Index col = col_start; col < col_start + ElemsPerRow; ++col) {
-        val = reducer(input.coeff(row * cols + col), val);
-      }
-    }
-    else {
-#pragma nounroll
-      for (Index col = col_start; col < cols; ++col) {
-        val = reducer(input.coeff(row * cols + col), val);
-      }
-    }
-
-    val = reducer(__shfl_down(val, 16), val);
-    val = reducer(__shfl_down(val, 8), val);
-    val = reducer(__shfl_down(val, 4), val);
-    val = reducer(__shfl_down(val, 2), val);
-    val = reducer(__shfl_down(val, 1), val);
-
-    const int warp_id = threadIdx.x & (warpSize - 1);
-
-    // At this point, thread 0 in each warp has the reduced value for all threads
-    // in the warp.  Time for us to write that value into output.
-    //
-    // If ONE_WARP_PER_ROW, only one warp wants to write to output[row], so we
-    // just write directly into output.  If ONE_BLOCK_PER_ROW, we initialize
-    // output and then atomically reduce into it.  And if MANY_BLOCKS_PER_ROW, we
-    // just do the atomic reduction -- our output was already initialized before
-    // this kernel started.
-    //
-    // (We could use shared memory to do the ONE_BLOCK_PER_ROW reduction, but the
-    // proper atomic reduction seems to be faster, at least on Kepler.)
-    if (Dims == ONE_WARP_PER_ROW) {
-      if (warp_id == 0) output.coeffRef(row) = val;
-    } else {
-      if (Dims == ONE_BLOCK_PER_ROW) {
-        output.coeffRef(row) = reducer.bottom_value();
-        __syncthreads();
-      }
-      if (warp_id == 0) {
-#if __CUDA_ARCH__ >= 300
-        reducer.atomic_reduce(&output.coeffRef(row), val);
-#else
-        assert(false && "This kernel requires sm_30 or higher.");
-#endif
-      }
-    }
-  }
-}
-
-template <RowReduceDims Dims, typename Input, typename Output, typename Reducer>
-void RowReduceCudaLaunchHelper(dim3 grid_size, dim3 block_size, Reducer reducer,
-                               const GpuDevice& device, const Input input,
-                               int rows, int cols, Output output) {
-  if (rows <= grid_size.y * block_size.y) {
-    LAUNCH_CUDA_KERNEL(
-        (RowReduceKernel<Dims, /* MultipleRowsPerThread = */ false, Input,
-                         Output, Reducer>),
-        grid_size, block_size, 0, device, reducer, input, rows, cols, output);
-  } else {
-    LAUNCH_CUDA_KERNEL(
-        (RowReduceKernel<Dims, /* MultipleRowsPerThread = */ true, Input,
-                         Output, Reducer>),
-        grid_size, block_size, 0, device, reducer, input, rows, cols, output);
-  }
-}
-
-template <typename Input, typename Output, typename Reducer>
-void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input,
-                   int rows, int cols, Output output) {
-  // Each thread in the kernel processes ElemsPerRow elements per row.  This
-  // value must be kept in sync with the constant inside RowReduceKernel.
-  constexpr int ElemsPerRow = 8;
-
-  // Maximum width and height of our grid of thread blocks.  (sm_30 has a
-  // smaller limit on the x dimension, but this kernel requires sm_30+.)
-  constexpr uint32 MaxGridXDim = (static_cast<uint32>(1) << 31) - 1;
-  constexpr int32 MaxGridYDim = (static_cast<uint32>(1) << 16) - 1;
-
-  // The maximum number of warps we'll put in a thread block.
-  constexpr int MaxWarpsPerBlock = 4;
-
-  constexpr int warp_size = 32;  // gcudacc doesn't define warpSize on the host.
-
-  // We choose block_size.x so one block contains a whole row, if possible.  If
-  // each row is processed by a single block, then we can avoid launching an
-  // InitVector kernel and instead initialize our output within the reduction
-  // kernel.  If each row is processed by a single *warp*, we can further avoid
-  // doing an inter-warp reduction.
-  const int block_x_warps =
-      min(MaxWarpsPerBlock, divup(cols, ElemsPerRow * warp_size));
-  const int block_y = MaxWarpsPerBlock / block_x_warps;
-  assert(block_y > 0);
-  const dim3 block_size = dim3(block_x_warps * warp_size, block_y, 1);
-
-  // TODO(jlebar): Consider swapping the meaning of our block indices, such that
-  // the block at (x, y) actually processes the data at (y, x).
-  //
-  // Right now very large inputs cause us to process multiple rows per thread,
-  // but one thread never processes process more than ElemsPerRow elements in a
-  // particular row.  Given the option, we would rather reverse this and let a
-  // process touch more elements in just one row, because we have to do an
-  // intra- and possibly inter-warp reduction for each row a thread touches.
-  //
-  // Since the grid's max x dimension is much larger than its max y dimension,
-  // letting the x dimension correspond to our row index would let us size our
-  // grid so that one thread never has to process more than one row.  But at the
-  // moment this doesn't seem to matter because nobody seems to be reducing huge
-  // inputs.
-  //
-  // Note that if we do this, we could only switch the meaning of our *block*
-  // indices.  The thread indices would have to remain as-is, in order to ensure
-  // that all threads in a warp touch the same row.
-
-  const int grid_x = divup<int>(cols, ElemsPerRow * block_size.x);
-  // grid_x > MaxGridXDim implies that we have an input with at least
-  //   WarpSize * MaxWarpsPerBlock * ElemsPerRow * MaxGridXDim = 2^41 - 1
-  // elements.  Until we have GPUs with 2TB of memory, we don't need to worry
-  // about this.
-  assert(grid_x <= MaxGridXDim && "Unsupported giant input.");
-
-  const int orig_grid_y = divup<int>(rows, block_size.y);
-  int grid_y = numext::mini(orig_grid_y, MaxGridYDim);
-
-  const dim3 grid_size(grid_x, grid_y, 1);
-
-  if (block_size.x == warp_size) {
-    RowReduceCudaLaunchHelper<ONE_WARP_PER_ROW>(
-        grid_size, block_size, reducer, device, input, rows, cols, output);
-  } else if (grid_size.x == 1) {
-    RowReduceCudaLaunchHelper<ONE_BLOCK_PER_ROW>(
-        grid_size, block_size, reducer, device, input, rows, cols, output);
-  } else {
-    // We only need to initialize our output if multiple blocks operate on the
-    // same row.  Otherwise, the reduction kernel will handle the initialization
-    // itself.
-    LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device,
-                       reducer.bottom_value(), rows, output);
-    RowReduceCudaLaunchHelper<MANY_BLOCKS_PER_ROW>(
-        grid_size, block_size, reducer, device, input, rows, cols, output);
-  }
-}
-
-// Provides arbitrary sum reductions, applying a function across the
-// right argument being reduced prior to summing
-template <typename F>
-struct FnSumReducer {
-  __host__ __device__ FnSumReducer(F f) : f_(f) {}
-  __host__ __device__ float bottom_value() { return 0.0f; }
-  __device__ float operator()(float x, float y) const { return x + f_(y); }
-  __device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); }
-
-  F f_;
-};
-
-// Identity is used for the basic SumReduction
-struct Identity {
-  __device__ float operator()(float x) const { return x; }
-};
-
-struct CudaSumReducer : FnSumReducer<Identity> {
-  __host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {}
-};
-
-struct CudaMaxReducer {
-  // nvcc doesn't recognize numeric_limits<float>::lowest for some reason.
-  CudaMaxReducer() {
-    bottom_value_ = -3.40282347E+38F;  // std::numeric_limits<float>::lowest();
-  }
-  __host__ __device__ float bottom_value() { return bottom_value_; }
-  __device__ float operator()(float x, float y) const { return fmax(x, y); }
-
-  // This is equivalent to atomicMax(x, y), but CUDA does not have atomicMax for
-  // float data type. Instead, this atomically compares-and-swaps the old value
-  // at x with y. If the old value returned by the CAS operation was already
-  // larger than y, or what was read before, it declares success and finishes,
-  // otherwise repeats the procedure.
-  __device__ void atomic_reduce(float* x, float y) {
-    unsigned int old_val = *reinterpret_cast<unsigned int*>(x);
-    while (*reinterpret_cast<float*>(&old_val) < y) {
-      unsigned int current_val =
-          atomicCAS(reinterpret_cast<unsigned int*>(x), old_val,
-                    *reinterpret_cast<unsigned int*>(&y));
-      if (old_val == current_val) {
-        break;
-      }
-      old_val = current_val;
-    }
-  }
-  float bottom_value_;
-};
-
-}  // end namespace
-
-template <typename Op>
-struct IsFloatSumReduction {
-  static const bool value = false;
-};
-
-template <>
-struct IsFloatSumReduction<SumReducer<float> > {
-  static const bool value = true;
-};
-
-template <typename Op>
-struct IsFloatMaxReduction {
-  static const bool value = false;
-};
-
-template <>
-struct IsFloatMaxReduction<MaxReducer<float> > {
-  static const bool value = true;
-};
-
-template <typename Op>
-struct SumOrMaxOfFloat {
-  static const bool value =
-      IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
-};
-
-enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED };
-
-template <typename Op, typename Expr, typename ReductionExpr>
-ReductionType GetReductionType(const Expr& expr,
-                               const ReductionExpr& reduction_expr,
-                               const GpuDevice& device, std::size_t* rows,
-                               std::size_t* cols) {
-  typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr;
-  typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr;
-
-  if (device.majorDeviceVersion() < 3) {
-    return UNOPTIMIZED;
-  }
-  const EvalExpr eval_expr(expr, device);
-
-  // We only have fast reductions for sum/max of float.
-  if (!SumOrMaxOfFloat<Op>::value) {
-    return UNOPTIMIZED;
-  }
-
-  if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) {
-    return UNOPTIMIZED;
-  }
-
-  if (ReductionEvalExpr::NumReducedDims > 1) {
-    return UNOPTIMIZED;
-  }
-
-  const std::size_t total_size = array_prod(eval_expr.dimensions());
-  if (total_size == 0) {
-    return UNOPTIMIZED;
-  }
-
-  const int dim = reduction_expr.dims()[0];
-  if (static_cast<int>(ReductionEvalExpr::Layout) ==
-      static_cast<int>(RowMajor)) {
-    if (dim == ReductionEvalExpr::NumInputDims - 1) {
-      *rows = total_size /
-              eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
-      *cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
-      if (*cols < 32) return UNOPTIMIZED;
-      return ROW_REDUCE;
-      //      return UNOPTIMIZED;
-    } else if (dim == 0) {
-      *rows = eval_expr.dimensions()[0];
-      *cols = total_size / eval_expr.dimensions()[0];
-      if (*rows < 32) return UNOPTIMIZED;
-      return COL_REDUCE;
-      //return UNOPTIMIZED;
-    }
-  } else if (static_cast<int>(ReductionEvalExpr::Layout) ==
-             static_cast<int>(ColMajor)) {
-    if (dim == ReductionEvalExpr::NumInputDims - 1) {
-      *rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
-      *cols = total_size /
-              eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
-      if (*rows < 32) return UNOPTIMIZED;
-      return COL_REDUCE;
-      //return UNOPTIMIZED;
-    } else if (dim == 0) {
-      *rows = total_size / eval_expr.dimensions()[0];
-      *cols = eval_expr.dimensions()[0];
-      if (*cols < 32) return UNOPTIMIZED;
-      return ROW_REDUCE;
-      //return UNOPTIMIZED;
-    }
-  }
-  return UNOPTIMIZED;
-}
-
-template <typename Expression, typename Index, bool Vectorizable>
-struct LaunchKernel;
-
-template <typename Expression, typename Index>
-struct LaunchKernel<Expression, Index, true> {
-  static void launch(int num_blocks, int block_size, const GpuDevice& device,
-                     const TensorEvaluator<Expression, GpuDevice>& evaluator,
-                     Index size) {
-    LAUNCH_CUDA_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
-        num_blocks, block_size, 0, device, evaluator, size);
-  }
-};
-
-template <typename Expression, typename Index>
-struct LaunchKernel<Expression, Index, false> {
-  static void launch(int num_blocks, int block_size, const GpuDevice& device,
-                     const TensorEvaluator<Expression, GpuDevice>& evaluator,
-                     Index size) {
-    LAUNCH_CUDA_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
-        num_blocks, block_size, 0, device, evaluator, size);
-  }
-};
-
-template <typename F, typename LHS, typename RHS, bool Compatible>
-struct LaunchRowReduce;
-
-template <typename F, typename LHS, typename RHS>
-struct LaunchRowReduce<F, LHS, RHS, true> {
-  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
-                     std::size_t cols, LHS output) {
-    RowReduceCuda(F(), device, input, rows, cols, output);
-  }
-};
-
-template <typename F, typename LHS, typename RHS>
-struct LaunchRowReduce<F, LHS, RHS, false> {
-  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
-                     std::size_t cols, LHS output) {}
-};
-
-template <typename F, typename LHS, typename RHS, bool Compatible>
-struct LaunchColReduce;
-
-template <typename F, typename LHS, typename RHS>
-struct LaunchColReduce<F, LHS, RHS, true> {
-  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
-                     std::size_t cols, LHS output) {
-    ColumnReduceCuda(F(), device, input, rows, cols, output);
-  }
-};
-
-template <typename F, typename LHS, typename RHS>
-struct LaunchColReduce<F, LHS, RHS, false> {
-  static void launch(const GpuDevice& device, RHS input, std::size_t rows,
-                     std::size_t cols, LHS output) {}
-};
-
-template <typename Expression, typename Device, bool Vectorizable>
-class TensorAssignExecutorHelper;
-
-template <typename OutExpr, typename InExpr, typename Op, typename Indices,
-          bool Vectorizable>
-class TensorAssignExecutorHelper<
-    const TensorAssignOp<
-      OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-    GpuDevice, Vectorizable> {
- public:
-  typedef const TensorAssignOp<
-    OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
-    Expression;
-
-  typedef typename Expression::Index Index;
-  typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval;
-  typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
-  static inline void run(const Expression& expr, const GpuDevice& device) {
-    std::size_t rows, cols;
-    const ReductionType reduction_type =
-        GetReductionType<Op>(expr.rhsExpression().expression(),
-                             expr.rhsExpression(), device, &rows, &cols);
-    if (reduction_type == UNOPTIMIZED) {
-      TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-      if (needs_assign) {
-        const int num_blocks = device.getNumCudaMultiProcessors() *
-                               device.maxCudaThreadsPerMultiProcessor() /
-                               device.maxCudaThreadsPerBlock();
-        const int block_size = device.maxCudaThreadsPerBlock();
-        const Index size = array_prod(evaluator.dimensions());
-        LaunchKernel<Expression, Index, Vectorizable>::launch(
-            num_blocks, block_size, device, evaluator, size);
-      }
-      evaluator.cleanup();
-    } else {
-      LHSEval output(expr.lhsExpression(), device);
-      RHSEval input(expr.rhsExpression().expression(), device);
-      bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL);
-      bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
-      if (lhs_needs_assign && rhs_needs_assign) {
-        const bool Compatible =
-            IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
-        if (reduction_type == ROW_REDUCE) {
-          if (IsFloatSumReduction<Op>::value) {
-            LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          } else if (IsFloatMaxReduction<Op>::value) {
-            LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          } else {
-            // Unsupported reduction type
-            assert(false && "Unsupported reduction function for ROW_REDUCE");
-          }
-        } else {
-          if (IsFloatSumReduction<Op>::value) {
-            LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          } else if (IsFloatMaxReduction<Op>::value) {
-            LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          } else {
-            // Unsupported reduction type
-            assert(false && "Unsupported reduction function for COL_REDUCE");
-          }
-        }
-      }
-      input.cleanup();
-      output.cleanup();
-    }
-  }
-};
-
-template <typename OutExpr, typename InExpr, typename Op, typename Indices,
-          bool Tileable>
-inline void TensorExecutor<
-    const TensorAssignOp<
-        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-    GpuDevice, false, Tileable>::run(const Expression& expr,
-                                     const GpuDevice& device) {
-  TensorAssignExecutorHelper<
-      const TensorAssignOp<
-          OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-      GpuDevice, false>::run(expr, device);
-}
-
-template <typename OutExpr, typename InExpr, typename Op, typename Indices,
-          bool Tileable>
-inline void TensorExecutor<
-    const TensorAssignOp<
-        OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-    GpuDevice, true, Tileable>::run(const Expression& expr,
-                                    const GpuDevice& device) {
-  TensorAssignExecutorHelper<
-      const TensorAssignOp<
-          OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
-      GpuDevice, true>::run(expr, device);
-}
-
-template <typename T, typename Index>
-struct PtrWrapper {
-  EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {}
-  EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); }
-  T* m_ptr;
-};
-
-template <typename Expression, typename Device, bool Vectorizable>
-class TensorEvalToExecutorHelper;
-
-template <typename InExpr, typename Op, typename Indices, bool Vectorizable>
-class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
-                                     Op, const Indices, const InExpr> >,
-                                 GpuDevice, Vectorizable> {
- public:
-  typedef const TensorEvalToOp<const TensorReductionOp<
-      Op, const Indices, const InExpr> > Expression;
-  typedef typename Expression::Index Index;
-  typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
-
-  static inline void run(const Expression& expr, const GpuDevice& device) {
-    std::size_t rows, cols;
-    const ReductionType reduction_type =
-        GetReductionType<Op>(expr.expression().expression(), expr.expression(),
-                             device, &rows, &cols);
-    if (reduction_type == UNOPTIMIZED) {
-      TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-      if (needs_assign) {
-        const int num_blocks = device.getNumCudaMultiProcessors() *
-                               device.maxCudaThreadsPerMultiProcessor() /
-                               device.maxCudaThreadsPerBlock();
-        const int block_size = device.maxCudaThreadsPerBlock();
-        const Index size = array_prod(evaluator.dimensions());
-        LaunchKernel<Expression, Index, Vectorizable>::launch(
-            num_blocks, block_size, device, evaluator, size);
-      }
-      evaluator.cleanup();
-    } else {
-      typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar;
-      PtrWrapper<Scalar, Index> output(expr.buffer());
-      TensorEvaluator<const InExpr, GpuDevice> input(
-          expr.expression().expression(), device);
-      typedef PtrWrapper<Scalar, Index> LHSEval;
-      typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
-      bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
-      if (rhs_needs_assign) {
-        const bool Compatible =
-            IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
-        if (reduction_type == ROW_REDUCE) {
-          if (IsFloatSumReduction<Op>::value) {
-            LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          } else if (IsFloatMaxReduction<Op>::value) {
-            LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          }
-        } else {
-          if (IsFloatSumReduction<Op>::value) {
-            LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          } else if (IsFloatMaxReduction<Op>::value) {
-            LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
-                            Compatible>::launch(device, input, rows, cols,
-                                                output);
-          }
-        }
-      }
-      input.cleanup();
-    }
-  }
-};
-
-template <typename InExpr, typename Op, typename Indices, bool Tileable>
-inline void
-TensorExecutor<const TensorEvalToOp<
-                   const TensorReductionOp<Op, const Indices, const InExpr> >,
-               GpuDevice, false, Tileable>::run(const Expression& expr,
-                                                const GpuDevice& device) {
-  TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
-                                 Op, const Indices, const InExpr> >,
-                             GpuDevice, false>::run(expr, device);
-}
-
-template <typename InExpr, typename Op, typename Indices, bool Tileable>
-inline void
-TensorExecutor<const TensorEvalToOp<
-                   const TensorReductionOp<Op, const Indices, const InExpr> >,
-               GpuDevice, true, Tileable>::run(const Expression& expr,
-                                               const GpuDevice& device) {
-  TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
-                                 Op, const Indices, const InExpr> >,
-                             GpuDevice, true>::run(expr, device);
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // __CUDACC__
-#endif  // EIGEN_USE_GPU
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#include "TensorReductionGpu.h"

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 0000000..315ccc1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h

@@ -0,0 +1,973 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple gpu thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another gpu thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    gpu_assert(0 && "Wordsize not supported");
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
+#ifdef EIGEN_GPU_COMPILE_PHASE
+// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
+template <typename R>
+__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
+  half2* houtput=reinterpret_cast<half2*>(output);
+  half2* haccum=reinterpret_cast<half2*>(&accum);
+  for(int i=0;i<4;++i){
+    atomicReduce(houtput+i,*(haccum+i),reducer);
+  }
+}
+#endif  // EIGEN_GPU_COMPILE_PHASE
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  atomicAdd(output, accum);
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      }
+      else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+        while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_HIPCC)
+    // use std::is_floating_point to determine the type of reduced_val 
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
+    // and list the float and int versions of __shfl_down as the candidate functions. 
+    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
+    } else {
+      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
+    }
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+  #endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+#if defined(EIGEN_HIPCC)
+    __threadfence_system();
+#endif
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
+    Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
+  eigen_assert(blockDim.x == 1);
+  eigen_assert(gridDim.x == 1);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  Index packet_remainder =
+      num_coeffs % Index(unpacket_traits<packet_type>::size);
+  if (packet_remainder != 0) {
+    half2* h2scratch = reinterpret_cast<half2*>(scratch);
+    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
+      *h2scratch =
+          __halves2half2(input.coeff(i), input.coeff(i + 1));
+      h2scratch++;
+    }
+    if ((num_coeffs & 1) != 0) {
+      half lastCoeff = input.coeff(num_coeffs - 1);
+      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
+    }
+  } else {
+    packet_type reduce = reducer.template initializePacket<packet_type>();
+    internal::pstoreu(scratch, reduce);
+  }
+}
+
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+
+  const Index num_packets =
+      num_coeffs / Index(unpacket_traits<PacketType>::size);
+  PacketType* p_output = reinterpret_cast<PacketType*>(output);
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    p_output[i] = reducer.template initializePacket<PacketType>();
+  }
+  Index packet_remainder =
+      num_coeffs % Index(unpacket_traits<PacketType>::size);
+  if (thread_id < packet_remainder) {
+    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
+    Reducer reducer, const Self input, Index num_coeffs,
+    half* output, half* scratch) {
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  eigen_assert(NumPerThread % packet_width == 0);
+  const Index first_index =
+      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      int rem = num_coeffs % packet_width;
+      if (rem != 0) {
+        half2* p_scratch = reinterpret_cast<half2*>(scratch);
+        pstoreu(scratch, reducer.template initializePacket<PacketType>());
+        for (int i = 0; i < rem / 2; i++) {
+          *p_scratch = __halves2half2(
+              input.coeff(num_coeffs - packet_width + 2 * i),
+              input.coeff(num_coeffs - packet_width + 2 * i + 1));
+          p_scratch++;
+        }
+        if ((num_coeffs & 1) != 0) {
+          half last = input.coeff(num_coeffs - 1);
+          *p_scratch = __halves2half2(last, reducer.initialize());
+        }
+      } else {
+        PacketType reduce = reducer.template initializePacket<PacketType>();
+        pstoreu(scratch, reduce);
+      }
+    }
+    __syncthreads();
+  }
+
+  PacketType accum = reducer.template initializePacket<PacketType>();
+  const Index max_iter =
+      numext::mini<Index>((num_coeffs - first_index) / packet_width,
+                          NumPerThread * BlockSize / packet_width);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + packet_width * i;
+    eigen_assert(index + packet_width < num_coeffs);
+    PacketType val = input.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_HIPCC)
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+      union { int i; half2 h; } wka_in, wka_out;
+      wka_in.h = hacc[i];
+      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+      hr[i] = wka_out.h;
+    }
+    reducer.reducePacket(r1, &accum);
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down(hacc[i], offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+  #else
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+
+  #endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
+  }
+
+  __syncthreads();
+  half2* rv1 = reinterpret_cast<half2*>(scratch);
+  if (packet_width > 2) {
+    reducer.reducePacket(rv1[2], rv1);
+    reducer.reducePacket(rv1[3], rv1 + 1);
+    reducer.reducePacket(rv1[1], rv1);
+  }
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      half tmp = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &tmp);
+      *output = tmp;
+    }
+  }
+}
+
+template <typename Op>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  if (unpacket_traits<packet_type>::size == 1) {
+    *output = *scratch;
+  } else {
+    half2* pscratch = reinterpret_cast<half2*>(scratch);
+    half tmp = __float2half(0.f);
+    for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
+      reducer.reduce(__low2half(*pscratch), &tmp);
+      reducer.reduce(__high2half(*pscratch), &tmp);
+      pscratch++;
+    }
+    *output = tmp;
+  }
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    gpu_assert(false && "Should only be called on doubles, floats and half floats");
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+
+    typedef typename Self::Index Index;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    half* scratch = static_cast<half*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+  }
+};
+
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                         typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_HIPCC)
+        // use std::is_floating_point to determine the type of reduced_val 
+       // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
+       // and list the float and int versions of __shfl_down as the candidate functions. 
+        if (std::is_floating_point<Type>::value) {
+          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+        } else {
+          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+        }
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      #else
+        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+      #endif
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        atomicReduce(&(output[row]), reduced_val, reducer);
+      }
+    }
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  const int unroll_times = 16 / packet_width;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = packet_width * thread_id;
+    for (; i + packet_width <= num_preserved_coeffs;
+         i += packet_width * num_threads) {
+      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
+      *poutput = reducer.template initializePacket<PacketType>();
+    }
+    if (i < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin =
+          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
+      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col =
+            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + packet_width <= num_coeffs_to_reduce;
+               col += blockDim.x) {
+            const PacketType val1 = input.m_impl.template packet<Unaligned>(
+                row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const PacketType val2 = input.m_impl.template packet<Unaligned>(
+                (row + 1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            PacketType r1 = reducer.template initializePacket<PacketType>();
+            PacketType r2 = reducer.template initializePacket<PacketType>();
+            half2* hr1 = reinterpret_cast<half2*>(&r1);
+            half2* hr2 = reinterpret_cast<half2*>(&r2);
+            while (col + 1 < num_coeffs_to_reduce) {
+              *hr1 = __halves2half2(
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
+              *hr2 = __halves2half2(
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
+                                     1));
+              hr1++;
+              hr2++;
+              col += 2;
+            }
+            if (col < num_coeffs_to_reduce) {
+              // Peel;
+              const half last1 =
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+              *hr1 = __halves2half2(last1, reducer.initialize());
+              const half last2 =
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
+              *hr2 = __halves2half2(last2, reducer.initialize());
+            }
+            reducer.reducePacket(r1, &reduced_val1);
+            reducer.reducePacket(r2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+                                     row * num_coeffs_to_reduce + col),
+                                 &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+                                     (row + 1) * num_coeffs_to_reduce + col),
+                                 &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_HIPCC)
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+	  union { int i; half2 h; } wka_in1, wka_out1;
+	  wka_in1.h = rv1[i];
+	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+	  hr1[i] = wka_out1.h;
+
+	  union { int i; half2 h; } wka_in2, wka_out2;
+	  wka_in2.h = rv2[i];
+	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+	  hr2[i] = wka_out2.h;
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
+          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+      #else
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] =
+              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
+          hr2[i] =
+              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+
+      #endif
+      }
+      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+      half2 val;
+      if (packet_width > 2) {
+        reducer.reducePacket(rv1[2], rv1);
+        reducer.reducePacket(rv1[3], rv1 + 1);
+        reducer.reducePacket(rv1[1], rv1);
+        reducer.reducePacket(rv2[2], rv2);
+        reducer.reducePacket(rv2[3], rv2 + 1);
+        reducer.reducePacket(rv2[1], rv2);
+      }
+      half val1 = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &val1);
+      half val2 = __low2half(*rv2);
+      reducer.reduce(__high2half(*rv2), &val2);
+      val = __halves2half2(val1, val2);
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        half* loc = output + row;
+        atomicReduce((half2*)loc, val, reducer);
+      }
+    }
+  }
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+    return true;
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+    return true;
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet, revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = /*256*/128;
+    const int num_per_thread = /*128*/64;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+  }
+};
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                     typename Self::CoeffReturnType* output) {
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  // Do the reduction.
+  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+  for (Index i = thread_id; i < max_iter; i += num_threads) {
+    const Index input_col = i % num_preserved_coeffs;
+    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+    typename Self::CoeffReturnType reduced_val = reducer.initialize();
+    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+    for (Index j = input_row; j < max_row; j++) {
+      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+      reducer.reduce(val, &reduced_val);
+    }
+    atomicReduce(&(output[input_col]), reduced_val, reducer);
+  }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats.
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+  template <typename Device, typename OutputType>
+  static
+    #if !defined(EIGEN_HIPCC)
+    // FIXME :  leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
+    //          (in the cxx11_tensor_reduction_gpu test)
+    //
+    // terminate called after throwing an instance of 'std::runtime_error'
+    //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
+    //
+    // don't know why this happens (and why is it a runtime error instead of a compile time error)
+    //
+    // this will be fixed by HIP PR#457
+    EIGEN_DEVICE_FUNC
+    #endif
+    bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    return true;
+  }
+
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 16;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs in the reduction kernel itself when we don't have to worry
+      // about race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumGpuMultiProcessors() *
+                             device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
new file mode 100644
index 0000000..474eba0
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h

@@ -0,0 +1,582 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorReductionSycl.h
+ *
+ * \brief:
+ *  This is the specialization of the reduction operation. Two phase reduction approach 
+ * is used since the GPU does not have Global Synchronization for global memory among 
+ * different work-group/thread block. To solve the problem, we need to create two kernels 
+ * to reduce the data, where the first kernel reduce the data locally and each local 
+ * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
+ * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. 
+ * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
+ *
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+  typedef Op type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &) {
+    return accumulator;
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+                                                                           const Index &scale) {
+    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+    return quotient_op(accumulator, CoeffReturnType(scale));
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &scale) {
+    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+  }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+          Index local_range>
+struct SecondStepFullReducer {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+  typedef typename OpDef::type Op;
+  LocalAccessor scratch;
+  InputAccessor aI;
+  OutputAccessor outAcc;
+  Op op;
+  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    // Our empirical research shows that the best performance will be achieved
+    // when there is only one element per thread to reduce in the second step.
+    // in this step the second step reduction time is almost negligible.
+    // Hence, in the second step of reduction the input size is fixed to the
+    // local size, thus, there is only one element read per thread. The
+    // algorithm must be changed if the number of reduce per thread in the
+    // second step is greater than 1. Otherwise, the result will be wrong.
+    const Index localid = itemID.get_local_id(0);
+    auto aInPtr = aI.get_pointer() + localid;
+    auto aOutPtr = outAcc.get_pointer();
+    CoeffReturnType *scratchptr = scratch.get_pointer();
+    CoeffReturnType accumulator = *aInPtr;
+
+    scratchptr[localid] = op.finalize(accumulator);
+    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratchptr[localid + offset], &accumulator);
+        scratchptr[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) *aOutPtr = op.finalize(accumulator);
+  }
+};
+
+// Full reduction first phase. In this version the vectorization is true and the reduction accept 
+// any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ). 
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+      OpDef;
+
+  typedef typename OpDef::type Op;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::PacketReturnType PacketReturnType;
+  typedef
+      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
+                                              PacketReturnType, CoeffReturnType>::type OutType;
+  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  LocalAccessor scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType final_output;
+  Index rng;
+  Op op;
+
+  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+                             Index rng_, OpType op_)
+      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) {
+    auto output_ptr = final_output.get_pointer();
+    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    Index start = Evaluator::PacketSize * globalid;
+    // vectorizable parts
+    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+    for (Index i = start; i < VectorizedRange; i += step) {
+      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+    }
+    globalid += VectorizedRange;
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.template reducePacket<PacketReturnType>(
+          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+              evaluator.impl().coeff(i), op.initialize()),
+          &packetAccumulator);
+    }
+    scratch[localid] = packetAccumulator =
+        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+    // reduction parts // Local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] =
+          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+    }
+  }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) {
+    auto output_ptr = final_output.get_pointer();
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    // vectorizable parts
+    CoeffReturnType accumulator = op.initialize();
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.reduce(evaluator.impl().coeff(i), &accumulator);
+    }
+    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+    // reduction parts. the local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratch[localid + offset], &accumulator);
+        scratch[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  template <typename Scratch>
+  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+                       Index range_, Index num_values_to_reduce_)
+      : evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        functor(OpDef::get_op(functor_)),
+        range(range_),
+        num_values_to_reduce(num_values_to_reduce_) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    auto output_accessor_ptr = output_accessor.get_pointer();
+    /// const cast added as a naive solution to solve the qualifier drop error
+    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+    if (globalid < range) {
+      CoeffReturnType accum = functor.initialize();
+      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+          evaluator, evaluator.firstInput(globalid), functor, &accum);
+      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+    }
+  }
+
+ private:
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op functor;
+  Index range;
+  Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAcc;
+  ScratchAcc scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op op;
+  const Index preserve_elements_num_groups;
+  const Index reduce_elements_num_groups;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+      : scratch(scratch_),
+        evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        preserve_elements_num_groups(preserve_elements_num_groups_),
+        reduce_elements_num_groups(reduce_elements_num_groups_),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+                                                                 CoeffReturnType &accumulator) {
+    if (globalPId >= num_coeffs_to_preserve) {
+      return;
+    }
+    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
+    Index localOffset = globalRId;
+
+    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+    const Index per_thread_global_stride =
+        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+      localOffset += per_thread_local_stride;
+      global_offset += per_thread_global_stride;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const Index linearLocalThreadId = itemID.get_local_id(0);
+    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+                                                           : itemID.get_group(0) / reduce_elements_num_groups;
+    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+                                                           : itemID.get_group(0) % reduce_elements_num_groups;
+
+    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+    auto scratchPtr = scratch.get_pointer().get();
+    auto outPtr =
+        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+    CoeffReturnType accumulator = op.initialize();
+
+    element_wise_reduce(globalRId, globalPId, accumulator);
+
+    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+        accumulator;
+    if (rt == reduction_dim::inner_most) {
+      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    }
+
+    /* Apply the reduction operation between the current local
+     * id and the one on the other half of the vector. */
+    auto out_scratch_ptr =
+        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    if (rt == reduction_dim::inner_most) {
+      accumulator = *out_scratch_ptr;
+    }
+    // The Local LocalThreadSizeR is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+      if (rLocalThreadId < offset) {
+        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+        // The result has already been divided for mean reducer in the
+        // previous reduction so no need to divide furthermore
+        *out_scratch_ptr = op.finalize(accumulator);
+      }
+      /* All threads collectively read from global memory into local.
+       * The barrier ensures all threads' IO is resolved before
+       * execution continues (strictly speaking, all threads within
+       * a single work-group - there is no co-ordination between
+       * work-groups, only work-items). */
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+
+    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+      outPtr[globalPId] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAccessor;
+  InputAccessor input_accessor;
+  OutputAccessor output_accessor;
+  Op op;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+                                                                   OutputAccessor output_accessor_, OpType op_,
+                                                                   const Index num_coeffs_to_preserve_,
+                                                                   const Index num_coeffs_to_reduce_)
+      : input_accessor(input_accessor_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const Index globalId = itemID.get_global_id(0);
+
+    if (globalId >= num_coeffs_to_preserve) return;
+
+    auto in_ptr = input_accessor.get_pointer() + globalId;
+
+    OutScalar accumulator = op.initialize();
+// num_coeffs_to_reduce is not bigger that 256
+    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+      op.reduce(*in_ptr, &accumulator);
+      in_ptr += num_coeffs_to_preserve;
+    }
+    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+  }
+};  // namespace internal
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
+  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
+  static EIGEN_CONSTEXPR bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::Storage Storage;
+  typedef typename Self::Index Index;
+  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+      PannelParameters;
+
+  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+    // getPowerOfTwo makes sure local range is power of 2 and <=
+    // maxSyclThreadPerBlock this will help us to avoid extra check on the
+    // kernel
+    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+
+    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // In this step, we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 64
+    // elemnts individually, we get better performance. However, this can change
+    // on different platforms. In this step we force the code not to be
+    // morthan step reduction: Our empirical research shows that for inner_most
+    // dim reducer, it is better to have 8 group in a reduce dimension for sizes
+    // > 1024 to achieve the best performance.
+    const Index reductionPerThread = 64;
+    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+    const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+    EIGEN_CONSTEXPR Index scratchSize =
+        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (rNumGroups > 1) {
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+          num_coeffs_to_reduce);
+
+      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+          SecondStepPartialReductionKernel;
+
+      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+          temp_accessor, output,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
+          reducer, num_coeffs_to_preserve, rNumGroups);
+
+      self.device().deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+          num_coeffs_to_reduce);
+    }
+    return false;
+  }
+};
+}  // namespace internal
+}  // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
+    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // In this step we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 512
+    // elemnts individually, we get better performance.
+    const Index reductionPerThread = 2048;
+    // const Index num_work_group =
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+    // 1
+    // ? local_range
+    // : 1);
+    const Index global_range = num_work_group * local_range;
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+                                                                      local_range, inputSize, reducer);
+
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, Index, local_range>
+          GenericRKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, data,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
+          reducer);
+
+      dev.deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+                                                                      reducer);
+    }
+  }
+};
+// vectorizable inner_most most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+
+// ArmgMax uses this kernel for partial reduction//
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
+template <typename Self, typename Op>
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    typename Self::Index range, GRange, tileSize;
+    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
+
+    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
+    return false;
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index eba0ab9..a27d364 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h

@@ -31,7 +31,7 @@
   int refCount() const { return m_refcount; }
 
  private:
-  // No copy, no assigment;
+  // No copy, no assignment;
   TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
   TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
 
@@ -44,6 +44,9 @@
  public:
   //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
   typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef  TensorEvaluator<Expr, Device> EvalType;
 
   TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
     m_dims = m_impl.dimensions();
@@ -79,6 +82,8 @@
  public:
   typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
   typedef typename Base::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
   }
@@ -125,7 +130,6 @@
     typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
     typedef Scalar* PointerType;
@@ -138,21 +142,21 @@
       IsAligned = false,
       PacketAccess = false,
       BlockAccess = false,
+      PreferBlockAccess = false,
       Layout = PlainObjectType::Layout,
       CoordAccess = false,  // to be implemented
       RawAccess = false
     };
 
+    //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
+    typedef internal::TensorBlockNotImplemented TensorBlock;
+    //===------------------------------------------------------------------===//
+
     EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
     }
 
     template <typename Expression>
-    EIGEN_STRONG_INLINE TensorRef(Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
-      m_evaluator->incrRefCount();
-    }
-
-    template <typename Expression>
-    EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, const Expression, DefaultDevice>(expr, DefaultDevice())) {
+    EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
       m_evaluator->incrRefCount();
     }
 
@@ -173,11 +177,6 @@
       m_evaluator->incrRefCount();
     }
 
-    TensorRef(TensorRef& other) : m_evaluator(other.m_evaluator) {
-      eigen_assert(m_evaluator->refCount() > 0);
-      m_evaluator->incrRefCount();
-    }
-
     TensorRef& operator = (const TensorRef& other) {
       if (this != &other) {
         unrefEvaluator();
@@ -205,19 +204,19 @@
       return m_evaluator->coeff(index);
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
     {
-      const std::size_t NumIndices = (sizeof...(otherIndices) + 1);
-      const array<Index, NumIndices> indices{{firstIndex, otherIndices...}};
+      const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+      const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
       return coeff(indices);
     }
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
     {
-      const std::size_t NumIndices = (sizeof...(otherIndices) + 1);
-      const array<Index, NumIndices> indices{{firstIndex, otherIndices...}};
+      const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+      const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
       return coeffRef(indices);
     }
 #else
@@ -307,7 +306,7 @@
       Index index = 0;
       if (PlainObjectType::Options & RowMajor) {
         index += indices[0];
-        for (int i = 1; i < NumIndices; ++i) {
+        for (size_t i = 1; i < NumIndices; ++i) {
           index = index * dims[i] + indices[i];
         }
       } else {
@@ -325,7 +324,7 @@
       Index index = 0;
       if (PlainObjectType::Options & RowMajor) {
         index += indices[0];
-        for (int i = 1; i < NumIndices; ++i) {
+        for (size_t i = 1; i < NumIndices; ++i) {
           index = index * dims[i] + indices[i];
         }
       } else {
@@ -369,31 +368,37 @@
 {
   typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Packet Packet;
   typedef typename Derived::Scalar CoeffReturnType;
-  typedef typename Derived::Packet PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorRef<Derived>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
       : m_ref(m)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_ref.coeff(index);
@@ -403,13 +408,9 @@
     return m_ref.coeffRef(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return TensorOpCost(0, 0, 0);
-  }
-
-protected:
+ protected:
   TensorRef<Derived> m_ref;
 };
 
@@ -420,9 +421,8 @@
 {
   typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Packet Packet;
   typedef typename Derived::Scalar CoeffReturnType;
-  typedef typename Derived::Packet PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
 
   typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
@@ -431,10 +431,15 @@
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index bcd35bf..586ce68 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h

@@ -25,13 +25,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename ReverseDimensions, typename XprType>
@@ -54,19 +54,18 @@
                                           XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
-                                                                    StorageKind;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+    typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors>Base;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+                                                                      StorageKind;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
       const XprType& expr, const ReverseDimensions& reverse_dims)
-      : m_xpr(expr), m_reverse_dims(reverse_dims) {}
+      : m_xpr(expr), m_reverse_dims(reverse_dims) { }
 
     EIGEN_DEVICE_FUNC
     const ReverseDimensions& reverse() const { return m_reverse_dims; }
@@ -75,26 +74,8 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
@@ -111,33 +92,56 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = NumDims > 0,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
+    // Reversing a scalar isn't supported yet. It would be a no-op anyway.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
     // Compute strides
     m_dimensions = m_impl.dimensions();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       m_strides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
@@ -145,11 +149,20 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -158,8 +171,9 @@
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -172,8 +186,9 @@
         inputIndex += index;
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -198,21 +213,145 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // TODO(ndjaitly): write a better packing routine that uses
     // local structure.
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type
-                                                            values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
+                                                            values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
     PacketReturnType rslt = internal::pload<PacketReturnType>(values);
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    // Block evaluation reads underlying memory in reverse order, and default
+    // cost model does not properly catch this in bytes stored/loaded.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+               target_size)
+        .addCostPerCoeff({0, 0, 24});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // TODO(ezhulenev): If underlying tensor expression supports and prefers
+    // block evaluation we must use it. Currently we use coeff and packet
+    // access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    // Offset in the output block.
+    Index block_offset = 0;
+
+    // Offset in the input Tensor.
+    Index input_offset = reverseIndex(desc.offset());
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      it[i].block_stride =
+          i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].size = it[effective_inner_dim].size * it[i].size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].size;
+
+    // Prepare storage for the materialized reverse result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+      // worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -226,13 +365,42 @@
            TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          reverse(false),
+          block_stride(0),
+          block_span(0),
+          input_stride(0),
+          input_span(0) {}
+
+    Index size;
+    Index count;
+    bool reverse;
+    Index block_stride;
+    Index block_span;
+    Index input_stride;
+    Index input_span;
+  };
 };
 
 // Eval as lvalue
@@ -252,38 +420,43 @@
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device) {}
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
-    return this->m_impl.coeffRef(Base::reverseIndex(index));
+    return this->m_impl.coeffRef(this->reverseIndex(index));
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x) {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // This code is pilfered from TensorMorphing.h
-    EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+    EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
-    for (int i = 0; i < packetSize; ++i) {
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
-
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 9d399fe..beae854 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h

@@ -24,6 +24,7 @@
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Op, typename XprType>
@@ -34,7 +35,7 @@
 
 template<typename Op, typename XprType>
 struct nested<TensorScanOp<Op, XprType>, 1,
-              typename eval<TensorScanOp<Op, XprType> >::type>
+            typename eval<TensorScanOp<Op, XprType> >::type>
 {
   typedef TensorScanOp<Op, XprType> type;
 };
@@ -76,181 +77,264 @@
   const bool m_exclusive;
 };
 
-template <typename Self, typename Reducer, typename Device>
-struct ScanLauncher;
 
-// Eval as rvalue
-template <typename Op, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+namespace internal {
 
-  typedef TensorScanOp<Op, ArgType> XprType;
-  typedef typename XprType::Index Index;
-  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-  typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
-
-  enum {
-    IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = true
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : m_impl(op.expression(), device),
-        m_device(device),
-        m_exclusive(op.exclusive()),
-        m_accumulator(op.accumulator()),
-        m_size(m_impl.dimensions()[op.axis()]),
-        m_stride(1),
-        m_output(NULL) {
-
-    // Accumulating a scalar isn't supported.
-    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
-
-    // Compute stride of scan axis
-    const Dimensions& dims = m_impl.dimensions();
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = 0; i < op.axis(); ++i) {
-        m_stride = m_stride * dims[i];
+template <typename Self>
+EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
+                                      typename Self::CoeffReturnType* data) {
+  // Compute the scan along the axis, starting at the given offset
+  typename Self::CoeffReturnType accum = self.accumulator().initialize();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
       }
     } else {
-      for (int i = NumDims - 1; i > op.axis(); --i) {
-        m_stride = m_stride * dims[i];
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
       }
     }
   }
+}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_impl.dimensions();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
-    return m_stride;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
-    return m_size;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
-    return m_accumulator;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
-    return m_exclusive;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
-    return m_impl;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
-    return m_device;
-  }
-
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    ScanLauncher<Self, Op, Device> launcher;
-    if (data) {
-      launcher(*this, data);
-      return false;
+template <typename Self>
+EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
+                                      typename Self::CoeffReturnType* data) {
+  using Scalar = typename Self::CoeffReturnType;
+  using Packet = typename Self::PacketReturnType;
+  // Compute the scan along the axis, starting at the calculated offset
+  Packet accum = self.accumulator().template initializePacket<Packet>();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
     }
-
-    const Index total_size = internal::array_prod(dimensions());
-    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
-    launcher(*this, m_output);
-    return true;
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
-    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
-  {
-    return m_output;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-  {
-    return m_output[index];
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
-    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_output != NULL) {
-      m_device.deallocate(m_output);
-      m_output = NULL;
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
     }
-    m_impl.cleanup();
   }
+}
 
-protected:
-  TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
-  const bool m_exclusive;
-  Op m_accumulator;
-  const Index m_size;
-  Index m_stride;
-  CoeffReturnType* m_output;
+template <typename Self, bool Vectorize, bool Parallel>
+struct ReduceBlock {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
 };
 
-// CPU implementation of scan
-// TODO(ibab) This single-threaded implementation should be parallelized,
-// at least by running multiple scans at the same time.
-template <typename Self, typename Reducer, typename Device>
+// Specialization for vectorized reduction.
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index idx2 = 0;
+    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
+      // Calculate the starting offset for the packet scan
+      Index offset = idx1 + idx2;
+      ReducePacket(self, offset, data);
+    }
+    for (; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Single-threaded CPU implementation of scan
+template <typename Self, typename Reducer, typename Device,
+          bool Vectorize =
+              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
+               internal::reducer_traits<Reducer, Device>::PacketAccess)>
 struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType *data) {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
     Index total_size = internal::array_prod(self.dimensions());
 
-    // For each coefficient of the output buffer, find the offset of the coefficient
-    // in the output buffer that is located at index 0 on the scan axis.
-    // We use 2 loops to iterate over the coefficient space: the loop indexed by idx2
-    // goes over the dimensions [0, scan_axis[, and the one indexed by idx1 iterates
-    // over the dimensions [scan_axis+1, num_input_dims[.
+    // We fix the index along the scan axis to 0 and perform a
+    // scan per remaining entry. The iteration is split into two nested
+    // loops to avoid an integer division by keeping track of each idx1 and
+    // idx2.
     for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
-      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
-        // Calculate the starting offset for the scan
-        Index offset = idx1 + idx2;
-
-        // Compute the scan along the axis, starting at the calculated offset
-        typename Self::CoeffReturnType accum = self.accumulator().initialize();
-        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-          Index curr = offset + idx3 * self.stride();
-
-          if (self.exclusive()) {
-            data[curr] = self.accumulator().finalize(accum);
-            self.accumulator().reduce(self.inner().coeff(curr), &accum);
-          } else {
-            self.accumulator().reduce(self.inner().coeff(curr), &accum);
-            data[curr] = self.accumulator().finalize(accum);
-          }
-        }
-      }
+      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
+      block_reducer(self, idx1, data);
     }
   }
 };
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#ifdef EIGEN_USE_THREADS
+
+// Adjust block_size to avoid false sharing of cachelines among
+// threads. Currently set to twice the cache line size on Intel and ARM
+// processors.
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
+  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
+  const Index items_per_cacheline =
+      numext::maxi<Index>(1, kBlockAlignment / item_size);
+  return items_per_cacheline * divup(block_size, items_per_cacheline);
+}
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index num_scalars = self.stride();
+    Index num_packets = 0;
+    if (self.stride() >= PacketSize) {
+      num_packets = self.stride() / PacketSize;
+      self.device().parallelFor(
+          num_packets,
+        TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
+                     16 * PacketSize * self.size(), true, PacketSize),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index packet = first; packet < last; ++packet) {
+            const Index idx2 = packet * PacketSize;
+            ReducePacket(self, idx1 + idx2, data);
+          }
+        });
+      num_scalars -= num_packets * PacketSize;
+    }
+    self.device().parallelFor(
+        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index scalar = first; scalar < last; ++scalar) {
+            const Index idx2 = num_packets * PacketSize + scalar;
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    self.device().parallelFor(
+        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index idx2 = first; idx2 < last; ++idx2) {
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+// Specialization for multi-threaded execution.
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index inner_block_size = self.stride() * self.size();
+    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
+
+    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
+        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
+      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
+      launcher(self, data);
+      return;
+    }
+
+    if (parallelize_by_outer_blocks) {
+      // Parallelize over outer blocks.
+      const Index num_outer_blocks = total_size / inner_block_size;
+      self.device().parallelFor(
+          num_outer_blocks,
+          TensorOpCost(inner_block_size, inner_block_size,
+                       16 * PacketSize * inner_block_size, Vectorize,
+                       PacketSize),
+          [=](Index blk_size) {
+            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
+          },
+          [&](Index first, Index last) {
+            for (Index idx1 = first; idx1 < last; ++idx1) {
+              ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
+              block_reducer(self, idx1 * inner_block_size, data);
+            }
+          });
+    } else {
+      // Parallelize over inner packets/scalars dimensions when the reduction
+      // axis is not an inner dimension.
+      ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
+      for (Index idx1 = 0; idx1 < total_size;
+           idx1 += self.stride() * self.size()) {
+        block_reducer(self, idx1, data);
+      }
+    }
+  }
+};
+#endif  // EIGEN_USE_THREADS
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 
 // GPU implementation of scan
 // TODO(ibab) This placeholder implementation performs multiple scans in
 // parallel, but it would be better to use a parallel scan algorithm and
 // optimize memory access.
 template <typename Self, typename Reducer>
-__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
   // Compute offset as in the CPU version
   Index val = threadIdx.x + blockIdx.x * blockDim.x;
   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
@@ -273,16 +357,171 @@
 
 }
 
-template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, GpuDevice> {
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
   void operator()(const Self& self, typename Self::CoeffReturnType* data) {
      Index total_size = internal::array_prod(self.dimensions());
      Index num_blocks = (total_size / self.size() + 63) / 64;
      Index block_size = 64;
-     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+
+     LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
   }
 };
-#endif  // EIGEN_USE_GPU && __CUDACC__
+#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)
+
+}  // namespace internal
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+
+  typedef TensorScanOp<Op, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef const ArgType ChildTypeNoConst;
+  typedef const ArgType ChildType;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_exclusive(op.exclusive()),
+        m_accumulator(op.accumulator()),
+        m_size(m_impl.dimensions()[op.axis()]),
+        m_stride(1), m_consume_dim(op.axis()),
+        m_output(NULL) {
+
+    // Accumulating a scalar isn't supported.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+    // Compute stride of scan axis
+    const Dimensions& dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < op.axis(); ++i) {
+        m_stride = m_stride * dims[i];
+      }
+    } else {
+      // dims can only be indexed through unsigned integers,
+      // so let's use an unsigned type to let the compiler knows.
+      // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
+      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
+      for (unsigned int i = NumDims - 1; i > axis; --i) {
+        m_stride = m_stride * dims[i];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+    return m_stride;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
+    return m_consume_dim;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+    return m_size;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+    return m_accumulator;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+    return m_exclusive;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+    return m_impl;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+    return m_device;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    internal::ScanLauncher<Self, Op, Device> launcher;
+    if (data) {
+      launcher(*this, data);
+      return false;
+    }
+
+    const Index total_size = internal::array_prod(dimensions());
+    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
+    launcher(*this, m_output);
+    return true;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
+  {
+    return m_output;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_output[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output) {
+      m_device.deallocate_temp(m_output);
+      m_output = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_output.bind(cgh);
+  }
+#endif
+protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
+  const bool m_exclusive;
+  Op m_accumulator;
+  const Index m_size;
+  Index m_stride;
+  Index m_consume_dim;
+  EvaluatorPointerType m_output;
+};
 
 }  // end namespace Eigen
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
new file mode 100644
index 0000000..7f68ecb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h

@@ -0,0 +1,513 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ *  Tensor Scan Sycl implement the extend  version of
+ * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
+ * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
+ * the size of the tensor. In the first kernel (ScanKernelFunctor), each
+ * threads within the work-group individually reduces the allocated elements per
+ * thread in order to reduces the total number of blocks. In the next step all
+ * thread within the work-group will reduce the associated blocks into the
+ * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
+ * buffer is given as an input and all the threads within a work-group scan and
+ * reduces the boundaries between the blocks (generated from the previous
+ * kernel). and write the data on the temporary buffer. If the second kernel is
+ * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
+ * adjust the final result into the output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
+ * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
+ *1, no. 1 (2008): 1-17.
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
+#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
+#endif
+
+template <typename index_t>
+struct ScanParameters {
+  // must be power of 2
+  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+  const index_t total_size;
+  const index_t non_scan_size;
+  const index_t scan_size;
+  const index_t non_scan_stride;
+  const index_t scan_stride;
+  const index_t panel_threads;
+  const index_t group_threads;
+  const index_t block_threads;
+  const index_t elements_per_group;
+  const index_t elements_per_block;
+  const index_t loop_range;
+
+  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
+                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
+                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
+      : total_size(total_size_),
+        non_scan_size(non_scan_size_),
+        scan_size(scan_size_),
+        non_scan_stride(non_scan_stride_),
+        scan_stride(scan_stride_),
+        panel_threads(panel_threads_),
+        group_threads(group_threads_),
+        block_threads(block_threads_),
+        elements_per_group(elements_per_group_),
+        elements_per_block(elements_per_block_),
+        loop_range(loop_range_) {}
+};
+
+enum class scan_step { first, second };
+template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
+          scan_step stp>
+struct ScanKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+
+  LocalAccessor scratch;
+  Evaluator dev_eval;
+  OutAccessor out_accessor;
+  OutAccessor temp_accessor;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  const bool inclusive;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
+                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
+                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
+                                                          const bool inclusive_)
+      : scratch(scratch_),
+        dev_eval(dev_eval_),
+        out_accessor(out_accessor_),
+        temp_accessor(temp_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_),
+        inclusive(inclusive_) {}
+
+  template <scan_step sst = stp, typename Input>
+  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+      EIGEN_STRONG_INLINE
+      read(const Input &inpt, Index global_id) {
+    return inpt.coeff(global_id);
+  }
+
+  template <scan_step sst = stp, typename Input>
+  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+      EIGEN_STRONG_INLINE
+      read(const Input &inpt, Index global_id) {
+    return inpt[global_id];
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+    inclusive_op();
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  first_step_inclusive_Operation(InclusiveOp) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_accessor.get_pointer();
+    auto tmp_ptr = temp_accessor.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+      // we put one element per packet in scratch_mem
+      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
+      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
+      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
+      CoeffReturnType inclusive_scan;
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
+                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
+                           (global_id < scanParameters.total_size))
+                              ? read(dev_eval, global_id)
+                              : accumulator.initialize();
+        next_elements += scanParameters.scan_stride;
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
+        }
+      });
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        Index private_offset = 1;
+        // build sum in place up the tree
+        EIGEN_UNROLL_LOOP
+        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+          private_offset *= 2;
+        }
+        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+            private_scan[PacketSize - 1 + packetIndex];
+        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
+        // traverse down tree & build scan
+        EIGEN_UNROLL_LOOP
+        for (Index d = 1; d < PacketSize; d *= 2) {
+          private_offset >>= 1;
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[ai] = private_scan[bi];
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+        }
+      }
+
+      Index offset = 1;
+      // build sum in place up the tree
+      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch_ptr[ai], &accum);
+          accumulator.reduce(scratch_ptr[bi], &accum);
+          scratch_ptr[bi] = accumulator.finalize(accum);
+        }
+        offset *= 2;
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // next step optimisation
+      if (local_id == 0) {
+        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+                                    scanParameters.non_scan_size +
+                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+                                block_id;
+          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
+        }
+        // clear the last element
+        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      }
+      // traverse down tree & build scan
+      for (Index d = 1; d < scratch_stride; d *= 2) {
+        offset >>= 1;
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch_ptr[ai], &accum);
+          accumulator.reduce(scratch_ptr[bi], &accum);
+          scratch_ptr[ai] = scratch_ptr[bi];
+          scratch_ptr[bi] = accumulator.finalize(accum);
+        }
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        EIGEN_UNROLL_LOOP
+        for (Index i = 0; i < PacketSize; i++) {
+          CoeffReturnType accum = private_scan[packetIndex + i];
+          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+          private_scan[packetIndex + i] = accumulator.finalize(accum);
+        }
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+          private_scan[0] = accumulator.finalize(inclusive_scan);
+        }
+      });
+      next_elements = 0;
+      // right the first set of private param
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+          out_ptr[global_id] = private_scan[private_id];
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }  // end for loop
+  }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  InAccessor in_accessor;
+  OutAccessor out_accessor;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+                                                                    OutAccessor out_accessor_,
+                                                                    const ScanParameters<Index> scanParameters_,
+                                                                    Op accumulator_)
+      : in_accessor(in_accessor_),
+        out_accessor(out_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto in_ptr = in_accessor.get_pointer();
+    auto out_ptr = out_accessor.get_pointer();
+
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
+
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
+      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
+      CoeffReturnType adjust_val = in_ptr[in_id];
+
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          CoeffReturnType accum = adjust_val;
+          accumulator.reduce(out_ptr[global_id], &accum);
+          out_ptr[global_id] = accumulator.finalize(accum);
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }
+  }
+};
+
+template <typename Index>
+struct ScanInfo {
+  const Index &total_size;
+  const Index &scan_size;
+  const Index &panel_size;
+  const Index &non_scan_size;
+  const Index &scan_stride;
+  const Index &non_scan_stride;
+
+  Index max_elements_per_block;
+  Index block_size;
+  Index panel_threads;
+  Index group_threads;
+  Index block_threads;
+  Index elements_per_group;
+  Index elements_per_block;
+  Index loop_range;
+  Index global_range;
+  Index local_range;
+  const Eigen::SyclDevice &dev;
+  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
+                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
+                               const Eigen::SyclDevice &dev_)
+      : total_size(total_size_),
+        scan_size(scan_size_),
+        panel_size(panel_size_),
+        non_scan_size(non_scan_size_),
+        scan_stride(scan_stride_),
+        non_scan_stride(non_scan_stride_),
+        dev(dev_) {
+    // must be power of 2
+    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
+                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
+
+    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
+
+    elements_per_group =
+        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
+    const Index elements_per_panel = elements_per_group * non_scan_size;
+    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
+    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
+    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
+    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
+    block_size = elements_per_group / elements_per_block;
+#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
+    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
+#else
+    const Index max_threads = panel_threads * panel_size;
+#endif
+    global_range = roundUp(max_threads, local_range);
+    loop_range = Index(
+        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
+  }
+  inline ScanParameters<Index> get_scan_parameter() {
+    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+  }
+  inline cl::sycl::nd_range<1> get_thread_range() {
+    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+  }
+};
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+                                                           Reducer &accumulator, const Index total_size,
+                                                           const Index scan_size, const Index panel_size,
+                                                           const Index non_scan_size, const Index scan_stride,
+                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+        AdjustFuctor;
+    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+                                                                      scan_info.max_elements_per_block,
+                                                                      scan_info.get_scan_parameter(), accumulator);
+  }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+                                             const Index total_size, const Index scan_size, const Index panel_size,
+                                             const Index non_scan_size, const Index scan_stride,
+                                             const Index non_scan_stride, const bool inclusive,
+                                             const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+    CoeffReturnType *temp_pointer =
+        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+        scan_info.get_scan_parameter(), accumulator, inclusive);
+
+    if (scan_info.block_size > 1) {
+      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+          non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+          non_scan_stride, dev);
+    }
+    dev.deallocate_temp(temp_pointer);
+  }
+};
+
+}  // namespace internal
+}  // namespace TensorSycl
+namespace internal {
+template <typename Self, typename Reducer, bool vectorize>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
+  typedef typename Self::Index Index;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::Storage Storage;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  void operator()(Self &self, EvaluatorPointerType data) {
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index scan_size = self.size();
+    const Index scan_stride = self.stride();
+    // this is the scan op (can be sum or ...)
+    auto accumulator = self.accumulator();
+    auto inclusive = !self.exclusive();
+    auto consume_dim = self.consume_dim();
+    auto dev = self.device();
+
+    auto dims = self.inner().dimensions();
+
+    Index non_scan_size = 1;
+    Index panel_size = 1;
+    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < consume_dim; i++) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
+        panel_size *= dims[i];
+      }
+    } else {
+      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim - 1; i >= 0; i--) {
+        panel_size *= dims[i];
+      }
+    }
+    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
+    auto eval_impl = self.inner();
+    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
+        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
+        inclusive, dev);
+  }
+};
+} // namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 6c4eebc..e5e5efd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h

@@ -25,13 +25,13 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Shuffle, typename XprType>
@@ -54,17 +54,16 @@
 class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
-      : m_xpr(expr), m_shuffle(shuffle) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
+      : m_xpr(expr), m_shuffle(shfl) {}
 
     EIGEN_DEVICE_FUNC
     const Shuffle& shufflePermutation() const { return m_shuffle; }
@@ -73,25 +72,8 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
+
 
   protected:
     typename XprType::Nested m_xpr;
@@ -103,50 +85,65 @@
 template<typename Shuffle, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  typedef typename internal::TensorBlock<
-    Index, typename internal::remove_const<Scalar>::type, NumDims,
-    TensorEvaluator<ArgType, Device>::Layout> TensorBlock;
-  typedef typename internal::TensorBlockReader<
-    Index, typename internal::remove_const<Scalar>::type, NumDims,
-    TensorEvaluator<ArgType, Device>::Layout, PacketAccess> TensorBlockReader;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device),
+        m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();
+    m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
+      m_shuffle[i] = static_cast<int>(shuffle[i]);
       m_dimensions[i] = input_dims[shuffle[i]];
       m_inverseShuffle[shuffle[i]] = i;
+      if (m_is_identity && shuffle[i] != i) {
+        m_is_identity = false;
+      }
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       m_unshuffledInputStrides[0] = 1;
       m_outputStrides[0] = 1;
+
       for (int i = 1; i < NumDims; ++i) {
         m_unshuffledInputStrides[i] =
             m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
       }
     } else {
       m_unshuffledInputStrides[NumDims - 1] = 1;
@@ -155,161 +152,159 @@
         m_unshuffledInputStrides[i] =
             m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
       }
     }
 
     for (int i = 0; i < NumDims; ++i) {
       m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
     }
-
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                        device.firstLevelCacheSize() /
-                                        sizeof(Scalar));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
+  template <int LoadMode, typename Self, bool ImplPacketAccess>
+  struct PacketLoader {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = self.coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template<int LoadMode, typename Self>
+  struct PacketLoader<LoadMode, Self, true> {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      if (self.m_is_identity) {
+        return self.m_impl.template packet<LoadMode>(index);
+      } else {
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        EIGEN_UNROLL_LOOP
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = self.coeff(index + i);
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  };
+
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) {
-      values[i] = coeff(index+i);
-    }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+        eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    resources->push_back(internal::TensorOpResourceRequirements(
-        internal::kUniformAllDims, m_block_total_size_max));
-    m_impl.getResourceRequirements(resources);
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const int inner_dim =
+        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
-      TensorBlock* output_block) const {
-    if (m_impl.data() != NULL) {
-      // Fast path: we have direct access to the data, so shuffle as we read.
-      TensorBlockReader::Run(output_block,
-                             srcCoeff(output_block->first_coeff_index()),
-                             m_inverseShuffle,
-                             m_unshuffledInputStrides,
-                             m_impl.data());
-      return;
-    }
+    const size_t target_size = m_device.firstLevelCacheSize();
+    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
 
-    // Slow path: read unshuffled block from the input and shuffle in-place.
-    // Initialize input block sizes using input-to-output shuffle map.
-    DSizes<Index, NumDims> input_block_sizes;
-    for (Index i = 0; i < NumDims; ++i) {
-      input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
-    }
-
-    // Calculate input block strides.
-    DSizes<Index, NumDims> input_block_strides;
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      input_block_strides[0] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        input_block_strides[i] = input_block_strides[i - 1] *
-            input_block_sizes[i - 1];
-      }
+    // Shuffled inner dimensions leads to a random memory access, which is not
+    // captured by default cost model bytes loaded/stored. We add this cost
+    // explicitly. The number of cycles picked based on the benchmarks.
+    // TODO(ezhulenev): This number was picked based on a very questionable
+    // benchmarks, add benchmarks that are representative of real workloads.
+    using BlockRequirements = internal::TensorBlockResourceRequirements;
+    if (inner_dim_shuffled) {
+      return BlockRequirements::uniform<Scalar>(target_size)
+          .addCostPerCoeff({0, 0, NumDims * 28});
     } else {
-      input_block_strides[NumDims - 1] = 1;
-      for (int i = NumDims - 2; i >= 0; --i) {
-        input_block_strides[i] = input_block_strides[i + 1] *
-            input_block_sizes[i + 1];
-      }
+      return BlockRequirements::skewed<Scalar>(target_size);
     }
+  }
 
-    // Read input block.
-    TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
-                            input_block_sizes,
-                            input_block_strides,
-                            m_unshuffledInputStrides,
-                            output_block->data());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    assert(m_impl.data() != NULL);
 
-    m_impl.block(&input_block);
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
 
-    // Naive In-place shuffle: random IO but block size is O(L1 cache size).
-    // TODO(andydavis) Improve the performance of this in-place shuffle.
-    const Index total_size = input_block_sizes.TotalSize();
-    std::vector<bool> bitmap(total_size, false);
-    ScalarNonConst* data = const_cast<ScalarNonConst*>(output_block->data());
-    const DSizes<Index, NumDims>& output_block_strides =
-        output_block->block_strides();
-    for (Index input_index = 0; input_index < total_size; ++input_index) {
-      if (bitmap[input_index]) {
-        // Coefficient at this index has already been shuffled.
-        continue;
-      }
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(
+            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
 
-      Index output_index = GetBlockOutputIndex(input_index,
-                                               input_block_strides,
-                                               output_block_strides);
-      if (output_index == input_index) {
-        // Coefficient already in place.
-        bitmap[output_index] = true;
-        continue;
-      }
+    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
 
-      // The following loop starts at 'input_index', and shuffles
-      // coefficients into their shuffled location at 'output_index'.
-      // It skips through the array shuffling coefficients by following
-      // the shuffle cycle starting and ending a 'start_index'.
-      ScalarNonConst evicted_value;
-      ScalarNonConst shuffled_value = data[input_index];
-      do {
-        evicted_value = data[output_index];
-        data[output_index] = shuffled_value;
-        shuffled_value = evicted_value;
-        bitmap[output_index] = true;
-        output_index = GetBlockOutputIndex(output_index,
-                                           input_block_strides,
-                                           output_block_strides);
-      } while (output_index != input_index);
+    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
+                         block_storage.data());
 
-      data[output_index] = shuffled_value;
-      bitmap[output_index] = true;
-    }
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    return block_storage.AsTensorMaterializedBlock();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                            2 * TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>());
     return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
       Index input_index,
       const DSizes<Index, NumDims>& input_block_strides,
-      const DSizes<Index, NumDims>& output_block_strides) const {
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
     Index output_index = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -317,7 +312,7 @@
           output_block_strides[m_inverseShuffle[0]];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -346,13 +341,16 @@
   }
 
   Dimensions m_dimensions;
-  array<Index, NumDims> m_inverseShuffle;
+  bool m_is_identity;
+  array<int, NumDims> m_shuffle;
+  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_unshuffledInputStrides;
+
+  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
-  std::size_t m_block_total_size_max;
 };
 
 
@@ -369,24 +367,25 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  typedef typename internal::TensorBlock<
-    Index, typename internal::remove_const<Scalar>::type, NumDims,
-    TensorEvaluator<ArgType, Device>::Layout> TensorBlock;
-  typedef typename internal::TensorBlockWriter<
-    Index, typename internal::remove_const<Scalar>::type, NumDims,
-    TensorEvaluator<ArgType, Device>::Layout, PacketAccess> TensorBlockWriter;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device)
   { }
 
@@ -398,21 +397,71 @@
   template <int StoreMode> EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
 
+  template <typename TensorBlock>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlock& block) {
+      const TensorBlockDesc& desc, const TensorBlock& block) {
     eigen_assert(this->m_impl.data() != NULL);
-    TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
-                           this->m_inverseShuffle,
-                           this->m_unshuffledInputStrides, this->m_impl.data());
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const Scalar* block_buffer = block.data();
+
+    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+    // expression with coefficient and packet access as `src`.
+    void* mem = NULL;
+    if (block_buffer == NULL) {
+      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+              buf),
+          block.expr());
+
+      block_buffer = buf;
+    }
+
+    // Read from block.
+    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
+                         block_buffer);
+
+    // Write to the output buffer.
+    typename TensorBlockIO::Dimensions output_strides(
+        this->m_unshuffledInputStrides);
+    typename TensorBlockIO::Dimensions output_dimensions;
+    for (int i = 0; i < NumDims; ++i) {
+      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+    }
+    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
+                         this->srcCoeff(desc.offset()));
+
+    // Reorder dimensions according to the shuffle.
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+    }
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    // Deallocate temporary buffer used for the block materialization.
+    if (mem != NULL) this->m_device.deallocate(mem);
   }
 };
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 48de490..5ff0880 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h

@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -30,23 +31,23 @@
   *
   * \sa Tensor
   */
-template<typename T, typename Dimensions, int Options_> class TensorStorage;
+template<typename T, typename Dimensions, int Options> class TensorStorage;
 
 
 // Pure fixed-size storage
-template<typename T, int Options_, typename FixedDimensions>
-class TensorStorage<T, FixedDimensions, Options_>
+template<typename T, typename FixedDimensions, int Options_>
+class TensorStorage
 {
  private:
   static const std::size_t Size = FixedDimensions::total_size;
 
-  EIGEN_ALIGN_DEFAULT T m_data[Size];
-  FixedDimensions m_dimensions;
+  // Allocate an array of size at least one to prevent compiler warnings.
+  static const std::size_t MinSize = max_n_1<Size>::size;
+  EIGEN_ALIGN_MAX T m_data[MinSize];
 
  public:
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE TensorStorage() {
-    EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE)
   }
 
   EIGEN_DEVICE_FUNC
@@ -54,16 +55,19 @@
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T *data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
+  static EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
+  {
+    static const FixedDimensions* singleton_dimensions = new FixedDimensions();
+    return *singleton_dimensions;
+  }
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+  EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
 };
 
-
 // pure dynamic
-template<typename T, int Options_, typename IndexType, int NumIndices_>
+template<typename T, typename IndexType, int NumIndices_, int Options_>
 class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
 {
   public:
@@ -71,18 +75,24 @@
     typedef DSizes<IndexType, NumIndices_> Dimensions;
     typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
 
-    EIGEN_DEVICE_FUNC TensorStorage()
-      : m_data(NumIndices_ ? 0 : internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1))
-      , m_dimensions() {}
-
+    EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
+      if (NumIndices_ == 0) {
+	m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
+      }
+    }
     EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(NumIndices_ ? 0 : internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1))
-      , m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
-
+      : m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
     EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
         : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
       { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    template <typename... DenseIndex>
+    EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
+      m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
+    }
+#endif
+
     EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
       : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
       , m_dimensions(other.m_dimensions)
@@ -98,6 +108,20 @@
       return *this;
     }
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage()
+    {
+      *this = std::move(other);
+    }
+    
+    EIGEN_DEVICE_FUNC Self& operator=(Self&& other)
+    {
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_dimensions, other.m_dimensions);
+      return *this;
+    }
+#endif
+
     EIGEN_DEVICE_FUNC  ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
     EIGEN_DEVICE_FUNC  void swap(Self& other)
     { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
@@ -112,9 +136,12 @@
         internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
         if (size)
           m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
-        else
+        else if (NumIndices_ == 0) {
+	  m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
+	}
+	else 
           m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       }
       m_dimensions = nbDimensions;
     }
@@ -129,119 +156,6 @@
   Dimensions m_dimensions;
 };
 
-
-// pure dynamic
-template<typename T, int Options_>
-class TensorStorage<T, VSizes<DenseIndex>, Options_>
-{
-    T* m_data;
-    VSizes<DenseIndex> m_dimensions;
-    typedef TensorStorage<T, VSizes<DenseIndex>, Options_> Self_;
-
-  public:
-    EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {}
-
-    template <DenseIndex NumDims>
-    EIGEN_DEVICE_FUNC TensorStorage(const array<DenseIndex, NumDims>& dimensions)
-      {
-        m_dimensions.resize(NumDims);
-        for (int i = 0; i < NumDims; ++i) {
-          m_dimensions[i] = dimensions[i];
-        }
-        const DenseIndex size = array_prod(dimensions);
-        m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size);
-        EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
-      }
-
-    EIGEN_DEVICE_FUNC TensorStorage(const std::vector<DenseIndex>& dimensions)
-        : m_dimensions(dimensions)
-      {
-        const DenseIndex size = internal::array_prod(dimensions);
-        m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size);
-        EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
-      }
-
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    TensorStorage(IndexTypes... dimensions) {
-      const int NumDims = sizeof...(dimensions);
-      m_dimensions.resize(NumDims);
-      const array<DenseIndex, NumDims> dim{{dimensions...}};
-      DenseIndex size = 1;
-      for (int i = 0; i < NumDims; ++i) {
-        size *= dim[i];
-        m_dimensions[i] = dim[i];
-      }
-      m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size);
-      EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
-    }
-#endif
-
-    EIGEN_DEVICE_FUNC TensorStorage(const Self_& other)
-      : m_data(internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(internal::array_prod(other.m_dimensions)))
-      , m_dimensions(other.m_dimensions)
-    {
-      internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
-    }
-
-    EIGEN_DEVICE_FUNC Self_& operator=(const Self_& other)
-    {
-      if (this != &other) {
-        Self_ tmp(other);
-        this->swap(tmp);
-      }
-      return *this;
-    }
-
-    EIGEN_DEVICE_FUNC ~TensorStorage()
-    {
-      internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, internal::array_prod(m_dimensions));
-    }
-
-    EIGEN_DEVICE_FUNC void swap(Self_& other)
-    { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const VSizes<DenseIndex>& dimensions() const { return m_dimensions; }
-
-    template <typename NewDimensions> EIGEN_DEVICE_FUNC
-    void resize(DenseIndex size, const NewDimensions& nbDimensions)
-    {
-      const DenseIndex currentSz = internal::array_prod(m_dimensions);
-      if(size != currentSz)
-      {
-        internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, currentSz);
-        if (size)
-          m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size);
-        else
-          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
-      }
-      m_dimensions.resize(internal::array_size<NewDimensions>::value);
-      for (int i = 0; i < internal::array_size<NewDimensions>::value; ++i) {
-        m_dimensions[i] = nbDimensions[i];
-      }
-    }
-    EIGEN_DEVICE_FUNC void resize(DenseIndex size, const std::vector<DenseIndex>& nbDimensions)
-    {
-      const DenseIndex currentSz = internal::array_prod(m_dimensions);
-      if(size != currentSz)
-      {
-        internal::conditional_managed_delete_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(m_data, currentSz);
-        if (size)
-          m_data = internal::conditional_managed_new_auto<T,(Options_&DontAlign)==0,(Options_&AllocateUVM)>(size);
-        else
-          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
-      }
-      m_dimensions = nbDimensions;
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
-};
-
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 10be2f2..2f62a66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h

@@ -25,19 +25,19 @@
 {
   typedef typename XprType::Scalar Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
 struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingOp<Strides, XprType>& type;
+  typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename Strides, typename XprType>
@@ -54,16 +54,15 @@
 class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+    typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
       : m_xpr(expr), m_dims(dims) {}
 
     EIGEN_DEVICE_FUNC
@@ -73,26 +72,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -110,24 +90,31 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
-      m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+      m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -152,13 +139,14 @@
     }
   }
 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -170,12 +158,13 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -187,6 +176,7 @@
       inputIndices[0] += indices[0] * m_inputStrides[0];
       inputIndices[1] += indices[1] * m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -201,10 +191,12 @@
     if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
       PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
       return rslt;
-    } else {
-      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    }
+    else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -217,23 +209,30 @@
     double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
                                            TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>()) +
-                           TensorOpCost::MulCost<Index>();
+        TensorOpCost::MulCost<Index>();
     if (vectorized) {
       compute_cost *= 2;  // packet() computes two indices
     }
     const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
     return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
-           // Computation is not vectorized per se, but it is done once per packet.
-           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+        // Computation is not vectorized per se, but it is done once per packet.
+        TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -241,6 +240,7 @@
       }
       inputIndex += index * m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -257,7 +257,6 @@
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
-
 // Eval as lvalue
 template<typename Strides, typename ArgType, typename Device>
 struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -265,24 +264,28 @@
 {
   typedef TensorStridingOp<Strides, ArgType> XprType;
   typedef TensorEvaluator<const XprType, Device> Base;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  //  typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  //  typedef DSizes<Index, NumDims> Dimensions;
 
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device) { }
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -291,12 +294,13 @@
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
 
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -308,6 +312,7 @@
       inputIndices[0] += indices[0] * this->m_inputStrides[0];
       inputIndices[1] += indices[1] * this->m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -321,11 +326,13 @@
     }
     if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
       this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
-    } else {
-      EIGEN_ALIGN_DEFAULT Scalar values[PacketSize];
+    }
+    else {
+      EIGEN_ALIGN_MAX Scalar values[PacketSize];
       internal::pstore<Scalar, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
new file mode 100644
index 0000000..926ecdd
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h

@@ -0,0 +1,303 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+
+namespace Eigen {
+
+/** \class TensorTrace
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor Trace class.
+  *
+  *
+  */
+
+namespace internal {
+template<typename Dims, typename XprType>
+struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Dims, typename XprType>
+struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
+{
+  typedef const TensorTraceOp<Dims, XprType>& type;
+};
+
+template<typename Dims, typename XprType>
+struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
+{
+  typedef TensorTraceOp<Dims, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Dims, typename XprType>
+class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
+      : m_xpr(expr), m_dims(dims) {
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Dims& dims() const { return m_dims; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Dims m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
+{
+  typedef TensorTraceOp<Dims, ArgType> XprType;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumReducedDims = internal::array_size<Dims>::value;
+  static const int NumOutputDims = NumInputDims - NumReducedDims;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumOutputDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
+  {
+
+    EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    for (int i = 0; i < NumInputDims; ++i) {
+      m_reduced[i] = false;
+    }
+
+    const Dims& op_dims = op.dims();
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op_dims[i] >= 0);
+      eigen_assert(op_dims[i] < NumInputDims);
+      m_reduced[op_dims[i]] = true;
+    }
+
+    // All the dimensions should be distinct to compute the trace
+    int num_distinct_reduce_dims = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        ++num_distinct_reduce_dims;
+      }
+    }
+
+    eigen_assert(num_distinct_reduce_dims == NumReducedDims);
+
+    // Compute the dimensions of the result.
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    int output_index = 0;
+    int reduced_index = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        m_reducedDims[reduced_index] = input_dims[i];
+        if (reduced_index > 0) {
+          // All the trace dimensions must have the same size
+          eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
+        }
+        ++reduced_index;
+      }
+      else {
+        m_dimensions[output_index] = input_dims[i];
+        ++output_index;
+      }
+    }
+
+    if (NumReducedDims != 0) {
+      m_traceDim = m_reducedDims[0];
+    }
+
+    // Compute the output strides
+    if (NumOutputDims > 0) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        }
+      }
+      else {
+        m_outputStrides.back() = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        }
+      }
+    }
+
+    // Compute the input strides
+    if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        input_strides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+        }
+      }
+      else {
+        input_strides.back() = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      output_index = 0;
+      reduced_index = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if(m_reduced[i]) {
+          m_reducedStrides[reduced_index] = input_strides[i];
+          ++reduced_index;
+        }
+        else {
+          m_preservedStrides[output_index] = input_strides[i];
+          ++output_index;
+        }
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_dimensions;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Initialize the result
+    CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
+    Index index_stride = 0;
+    for (int i = 0; i < NumReducedDims; ++i) {
+      index_stride += m_reducedStrides[i];
+    }
+
+    // If trace is requested along all dimensions, starting index would be 0
+    Index cur_index = 0;
+    if (NumOutputDims != 0)
+      cur_index = firstInput(index);
+    for (Index i = 0; i < m_traceDim; ++i) {
+        result += m_impl.coeff(cur_index);
+        cur_index += index_stride;
+    }
+
+    return result;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+        values[i] = coeff(index + i);
+    }
+    PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
+    return result;
+  }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
+ protected:
+  // Given the output index, finds the first index in the input tensor used to compute the trace
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[0];
+    }
+    else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[NumOutputDims - 1];
+    }
+    return startInput;
+  }
+
+  Dimensions m_dimensions;
+  TensorEvaluator<ArgType, Device> m_impl;
+  // Initialize the size of the trace dimension
+  Index m_traceDim;
+  const Device EIGEN_DEVICE_REF m_device;
+  array<bool, NumInputDims> m_reduced;
+  array<Index, NumReducedDims> m_reducedDims;
+  array<Index, NumOutputDims> m_outputStrides;
+  array<Index, NumReducedDims> m_reducedStrides;
+  array<Index, NumOutputDims> m_preservedStrides;
+};
+
+
+} // End namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 75894d6..4f7fd34 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h

@@ -20,27 +20,27 @@
   enum {
     is_dynamic_size_storage = 1,
 
-    aligned_bit =
+    is_aligned =
     (
         ((Options&DontAlign)==0) && (
-#if EIGEN_ALIGN_STATICALLY
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
             (!is_dynamic_size_storage)
 #else
             0
 #endif
-            ||
-#if EIGEN_ALIGN
+            |
+#if EIGEN_MAX_ALIGN_BYTES>0
             is_dynamic_size_storage
 #else
             0
 #endif
       )
-    ) ? AlignedBit : 0,
-    packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
+     ),
+    packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
   };
 
   public:
-    enum { ret = packet_access_bit | aligned_bit};
+    enum { ret = packet_access_bit };
 };
 
 
@@ -54,8 +54,12 @@
   static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
   enum {
     Options = Options_,
-    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit),
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
   };
+  template <typename T> struct MakePointer {
+    typedef T* Type;
+  };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -69,12 +73,17 @@
   static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
   enum {
     Options = Options_,
-    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit),
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
   };
+  template <typename T> struct MakePointer {
+    typedef T* Type;
+  };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
-template<typename PlainObjectType, int Options_>
-struct traits<TensorMap<PlainObjectType, Options_> >
+
+template<typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
   : public traits<PlainObjectType>
 {
   typedef traits<PlainObjectType> BaseTraits;
@@ -85,8 +94,14 @@
   static const int Layout = BaseTraits::Layout;
   enum {
     Options = Options_,
-    Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+    Flags = BaseTraits::Flags
   };
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 template<typename PlainObjectType>
@@ -101,107 +116,101 @@
   static const int Layout = BaseTraits::Layout;
   enum {
     Options = BaseTraits::Options,
-    Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+    Flags = BaseTraits::Flags
   };
+  typedef typename BaseTraits::PointerType PointerType;
 };
 
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
-template<typename PlainObjectType, int Options>
-struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense>
+template<typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
-template<typename PlainObjectType, int Options>
-struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
+template<typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
-
-template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
-struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, 1, typename eval<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >::type>
+// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
+template<typename T, int n=1, typename PlainObject = void> struct nested
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef typename ref_selector<T>::type type;
 };
 
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
-struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_>, 1, typename eval<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >::type>
+struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
-struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, 1, typename eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >::type>
+struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
-struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, 1, typename eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >::type>
+struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 
-template <typename PlainObjectType, int Options>
-struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
+template <typename PlainObjectType>
+struct nested<TensorRef<PlainObjectType> >
 {
-  typedef const TensorMap<PlainObjectType, Options>& type;
-};
-
-template <typename PlainObjectType, int Options>
-struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
-{
-  typedef const TensorMap<PlainObjectType, Options>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType>
-struct nested<TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type>
+struct nested<const TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
-};
-
-template <typename PlainObjectType>
-struct nested<const TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type>
-{
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 }  // end namespace internal
@@ -247,7 +256,7 @@
 // Pc=0.
 typedef enum {
   PADDING_VALID = 1,
-  PADDING_SAME = 2,
+  PADDING_SAME = 2
 } PaddingType;
 
 }  // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 070f0b5..afbcba4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h

@@ -13,14 +13,17 @@
 namespace Eigen {
 namespace internal {
 
+
 template <uint64_t n>
 struct static_val {
   static const uint64_t value = n;
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_UNUSED_VARIABLE(v);
     eigen_assert(v == n);
   }
 };
@@ -52,12 +55,12 @@
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   explicit TensorUInt128(const T& x) : high(0), low(x) {
-    eigen_assert(x <= NumTraits<LOW>::highest());
+    eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
     eigen_assert(x >= 0);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { }
+  TensorUInt128(HIGH y, LOW x) : high(y), low(x) { }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
     return low;
@@ -73,21 +76,21 @@
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
-  return (lhs.high == rhs.high) & (lhs.low == rhs.low);
+  return (lhs.high == rhs.high) && (lhs.low == rhs.low);
 }
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
-  return (lhs.high != rhs.high) | (lhs.low != rhs.low);
+  return (lhs.high != rhs.high) || (lhs.low != rhs.low);
 }
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   if (lhs.high != rhs.high) {
     return lhs.high > rhs.high;
@@ -97,7 +100,7 @@
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   if (lhs.high != rhs.high) {
     return lhs.high < rhs.high;
@@ -107,7 +110,7 @@
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
   if (result.low < rhs.low) {
@@ -118,7 +121,7 @@
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
   if (result.low > lhs.low) {
@@ -129,8 +132,8 @@
 
 
 template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   // Split each 128-bit integer into 4 32-bit integers, and then do the
   // multiplications by hand as follow:
@@ -159,7 +162,7 @@
   // Compute the low 32 bits of low
   uint64_t acc = d * h;
   uint64_t low = acc & LOW;
-  // Compute the high 32 bits of low. Add a carry every time we wrap around
+  //  Compute the high 32 bits of low. Add a carry every time we wrap around
   acc >>= 32LL;
   uint64_t carry = 0;
   uint64_t acc2 = acc + c * h;
@@ -204,8 +207,8 @@
 }
 
 template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
     return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 669505c..0beb9ff 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h

@@ -22,18 +22,20 @@
   * dimensions.
   */
 namespace internal {
+
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
 {
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef traits<XprType> XprTraits;
-  typedef typename packet_traits<Scalar>::type Packet;
   typedef typename XprTraits::StorageKind StorageKind;
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+
 };
 
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -55,10 +57,8 @@
 {
   public:
   typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Packet Packet;
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
@@ -68,12 +68,12 @@
                                                             DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                             DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                             PaddingType padding_type, Scalar padding_value)
-      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-        m_padding_type(padding_type), m_padding_value(padding_value) {}
+                                                            : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                            m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+                                                            m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                            m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                            m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+                                                            m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
@@ -83,13 +83,13 @@
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-        m_padding_left(padding_left), m_padding_right(padding_right),
-        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+                                                           m_padding_left(padding_left), m_padding_right(padding_right),
+                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_planes() const { return m_patch_planes; }
@@ -175,22 +175,29 @@
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-  static const Index PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = NumDims == 6,
+    CoordAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+ m_impl(op.expression(), device)
   {
-    EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     m_paddingValue = op.padding_value();
 
@@ -230,9 +237,9 @@
     m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
 
     if (op.padding_explicit()) {
-      m_outputPlanes = Eigen::divup(m_input_planes_eff + static_cast<Index>(op.padding_top_z() + op.padding_bottom_z()) - m_patch_planes_eff + 1, m_plane_strides);
-      m_outputRows = Eigen::divup(m_input_rows_eff + static_cast<Index>(op.padding_top() + op.padding_bottom()) - m_patch_rows_eff + 1, m_row_strides);
-      m_outputCols = Eigen::divup(m_input_cols_eff + static_cast<Index>(op.padding_left() + op.padding_right()) - m_patch_cols_eff + 1, m_col_strides);
+      m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+      m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+      m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
       m_planePaddingTop = op.padding_top_z();
       m_rowPaddingTop = op.padding_top();
       m_colPaddingLeft = op.padding_left();
@@ -240,20 +247,20 @@
       // Computing padding from the type
       switch (op.padding_type()) {
         case PADDING_VALID:
-          m_outputPlanes = Eigen::divup(m_input_planes_eff - m_patch_planes_eff + 1, m_plane_strides);
-          m_outputRows = Eigen::divup(m_input_rows_eff - m_patch_rows_eff + 1, m_row_strides);
-          m_outputCols = Eigen::divup(m_input_cols_eff - m_patch_cols_eff + 1, m_col_strides);
+          m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+          m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
           m_planePaddingTop = 0;
           m_rowPaddingTop = 0;
           m_colPaddingLeft = 0;
           break;
         case PADDING_SAME: {
-          m_outputPlanes = Eigen::divup(m_input_planes_eff, m_plane_strides);
-          m_outputRows = Eigen::divup(m_input_rows_eff, m_row_strides);
-          m_outputCols = Eigen::divup(m_input_cols_eff, m_col_strides);
-          const Index dz = numext::maxi<DenseIndex>(0, (m_outputPlanes  - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff);
-          const Index dy = numext::maxi<DenseIndex>(0, (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff);
-          const Index dx = numext::maxi<DenseIndex>(0, (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff);
+          m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
+          m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+          const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
+          const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
+          const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
           m_planePaddingTop = dz / 2;
           m_rowPaddingTop = dy / 2;
           m_colPaddingLeft = dx / 2;
@@ -325,6 +332,7 @@
 
     // Fast representations of different variables.
     m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+
     m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
     m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
     m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
@@ -344,12 +352,12 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -411,7 +419,7 @@
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
@@ -505,103 +513,38 @@
     return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  Index planePaddingTop() const { return m_planePaddingTop; }
-  Index rowPaddingTop() const { return m_rowPaddingTop; }
-  Index colPaddingLeft() const { return m_colPaddingLeft; }
-  Index outputPlanes() const { return m_outputPlanes; }
-  Index outputRows() const { return m_outputRows; }
-  Index outputCols() const { return m_outputCols; }
-  Index userPlaneStride() const { return m_plane_strides; }
-  Index userRowStride() const { return m_row_strides; }
-  Index userColStride() const { return m_col_strides; }
-  Index userInPlaneStride() const { return m_in_plane_strides; }
-  Index userInRowStride() const { return m_in_row_strides; }
-  Index userInColStride() const { return m_in_col_strides; }
-  Index planeInflateStride() const { return m_plane_inflate_strides; }
-  Index rowInflateStride() const { return m_row_inflate_strides; }
-  Index colInflateStride() const { return m_col_inflate_strides; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    // ColMajor
-    //   0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches
-    // RowMajor
-    //   0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth
-    const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1];
-    const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2];
-    const Index rowOffset= coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3];
-    const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4];
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
 
-    array<Index, NumDims-1> inputCoords;
-
-    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
-    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
-    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
-        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
-    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
-    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
-        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows;
-    const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
-    const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
-    if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
-        ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputCoords[0] = coords[0];  // depth
-      inputCoords[1] = origInputPlane;
-      inputCoords[2] = origInputRow;
-      inputCoords[3] = origInputCol;
-      inputCoords[4] = coords[5];  // batch
-    } else {
-      inputCoords[4] = coords[5];  // depth
-      inputCoords[3] = origInputPlane;
-      inputCoords[2] = origInputRow;
-      inputCoords[1] = origInputCol;
-      inputCoords[0] = coords[0];  // batch
-    }
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      return m_impl.coeff(inputCoords);
-    } else {
-      Index inputIndex;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        inputIndex =
-          inputCoords[4] * m_otherInputStride +
-          inputCoords[3] * m_colInputStride +
-          inputCoords[2] * m_rowInputStride +
-          inputCoords[1] * m_planeInputStride +
-          inputCoords[0];
-      } else {
-        inputIndex =
-          inputCoords[0] * m_otherInputStride +
-          inputCoords[1] * m_colInputStride +
-          inputCoords[2] * m_rowInputStride +
-          inputCoords[3] * m_planeInputStride +
-          inputCoords[4];
-      }
-      return m_impl.coeff(inputIndex);
-    }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
   }
-
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -611,7 +554,7 @@
 
   Dimensions m_dimensions;
 
-  // Parameters passed to the costructor.
+  // Parameters passed to the constructor.
   Index m_plane_strides;
   Index m_row_strides;
   Index m_col_strides;
@@ -676,6 +619,8 @@
   Scalar m_paddingValue;
 
   TensorEvaluator<ArgType, Device> m_impl;
+
+
 };
 
 

diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
index 50633e2..bc4f202 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h

@@ -33,7 +33,7 @@
     template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
     inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const
     {
-      eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices.");
+      eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
       for (std::size_t i = 0; i < size(); i++)
         initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...);
       return initial;
@@ -42,7 +42,7 @@
     template<typename Op, typename RV, typename Index, typename... Args>
     inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const
     {
-      eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices.");
+      eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
       for (std::size_t i = 0; i < size(); i++)
         initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
       return initial;
@@ -55,7 +55,7 @@
     inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
     {
       static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
-      return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{firstIndex, otherIndices...});
+      return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
     }
 
     template<typename Tensor_>
@@ -90,7 +90,7 @@
     template<typename Index, std::size_t N, int... n>
     inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
     {
-      return std::array<Index, N>{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... };
+      return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
     }
 
     template<typename Index>

diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
index 255c344..942293b 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h

@@ -217,7 +217,7 @@
     inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
     {
       static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
-      return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{firstIndex, otherIndices...});
+      return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
     }
 
     template<typename Tensor_>

diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
index 0fe0b7c..54bf9db 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h

@@ -17,7 +17,7 @@
 namespace group_theory {
 
 /** \internal
-  * \file CXX11/Tensor/util/TemplateGroupTheory.h
+  * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
   * This file contains C++ templates that implement group theory algorithms.
   *
   * The algorithms allow for a compile-time analysis of finite groups.
@@ -167,7 +167,9 @@
   typename elements,
   bool dont_add_current_element   // = false
 >
-struct dimino_first_step_elements_helper :
+struct dimino_first_step_elements_helper
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  : // recursive inheritance is too difficult for Doxygen
   public dimino_first_step_elements_helper<
     Multiply,
     Equality,
@@ -187,6 +189,7 @@
   typename elements
 >
 struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
+#endif // EIGEN_PARSED_BY_DOXYGEN
 {
   typedef elements type;
   constexpr static int global_flags = Equality<current_element, id>::global_flags;
@@ -241,7 +244,7 @@
   * multiplying all elements in the given subgroup with the new
   * coset representative. Note that the first element of the
   * subgroup is always the identity element, so the first element of
-  * ther result of this template is going to be the coset
+  * the result of this template is going to be the coset
   * representative itself.
   *
   * Note that this template accepts an additional boolean parameter

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
new file mode 100644
index 0000000..e4c59dc
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h

@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+
+#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
+#define EIGEN_CXX11_THREADPOOL_BARRIER_H
+
+namespace Eigen {
+
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_plain_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
+
+  void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      // Clear the lowest bit (waiter flag) and check that the original state
+      // value was not zero. If it was zero, it means that notify was called
+      // more times than the original count.
+      eigen_plain_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_plain_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
+    std::unique_lock<std::mutex> l(mu_);
+    while (!notified_) {
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;
+};
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1){};
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
new file mode 100644
index 0000000..4549aa0
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h

@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+
+namespace Eigen {
+
+// EventCount allows to wait for arbitrary predicates in non-blocking
+// algorithms. Think of condition variable, but wait predicate does not need to
+// be protected by a mutex. Usage:
+// Waiting thread does:
+//
+//   if (predicate)
+//     return act();
+//   EventCount::Waiter& w = waiters[my_index];
+//   ec.Prewait(&w);
+//   if (predicate) {
+//     ec.CancelWait(&w);
+//     return act();
+//   }
+//   ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+//   predicate = true;
+//   ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceding predicate check has
+// failed.
+//
+// Algorithm outline:
+// There are two main variables: predicate (managed by user) and state_.
+// Operation closely resembles Dekker mutual algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// Waiting thread sets state_ then checks predicate, Notifying thread sets
+// predicate then checks state_. Due to seq_cst fences in between these
+// operations it is guaranteed than either waiter will see predicate change
+// and won't block, or notifying thread will see state_ change and will unblock
+// the waiter, or both. But it can't happen that both threads don't see each
+// other changes, which would lead to deadlock.
+class EventCount {
+ public:
+  class Waiter;
+
+  EventCount(MaxSizeVector<Waiter>& waiters)
+      : state_(kStackMask), waiters_(waiters) {
+    eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
+  }
+
+  ~EventCount() {
+    // Ensure there are no waiters.
+    eigen_plain_assert(state_.load() == kStackMask);
+  }
+
+  // Prewait prepares for waiting.
+  // After calling Prewait, the thread must re-check the wait predicate
+  // and then call either CancelWait or CommitWait.
+  void Prewait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state);
+      uint64_t newstate = state + kWaiterInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_seq_cst))
+        return;
+    }
+  }
+
+  // CommitWait commits waiting after Prewait.
+  void CommitWait(Waiter* w) {
+    eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
+    w->state = Waiter::kNotSignaled;
+    const uint64_t me = (w - &waiters_[0]) | w->epoch;
+    uint64_t state = state_.load(std::memory_order_seq_cst);
+    for (;;) {
+      CheckState(state, true);
+      uint64_t newstate;
+      if ((state & kSignalMask) != 0) {
+        // Consume the signal and return immidiately.
+        newstate = state - kWaiterInc - kSignalInc;
+      } else {
+        // Remove this thread from pre-wait counter and add to the waiter stack.
+        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
+        w->next.store(state & (kStackMask | kEpochMask),
+                      std::memory_order_relaxed);
+      }
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel)) {
+        if ((state & kSignalMask) == 0) {
+          w->epoch += kEpochInc;
+          Park(w);
+        }
+        return;
+      }
+    }
+  }
+
+  // CancelWait cancels effects of the previous Prewait call.
+  void CancelWait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state, true);
+      uint64_t newstate = state - kWaiterInc;
+      // We don't know if the thread was also notified or not,
+      // so we should not consume a signal unconditionaly.
+      // Only if number of waiters is equal to number of signals,
+      // we know that the thread was notified and we must take away the signal.
+      if (((state & kWaiterMask) >> kWaiterShift) ==
+          ((state & kSignalMask) >> kSignalShift))
+        newstate -= kSignalInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel))
+        return;
+    }
+  }
+
+  // Notify wakes one or all waiting threads.
+  // Must be called after changing the associated wait predicate.
+  void Notify(bool notifyAll) {
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    uint64_t state = state_.load(std::memory_order_acquire);
+    for (;;) {
+      CheckState(state);
+      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+      // Easy case: no waiters.
+      if ((state & kStackMask) == kStackMask && waiters == signals) return;
+      uint64_t newstate;
+      if (notifyAll) {
+        // Empty wait stack and set signal to number of pre-wait threads.
+        newstate =
+            (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
+      } else if (signals < waiters) {
+        // There is a thread in pre-wait state, unblock it.
+        newstate = state + kSignalInc;
+      } else {
+        // Pop a waiter from list and unpark it.
+        Waiter* w = &waiters_[state & kStackMask];
+        uint64_t next = w->next.load(std::memory_order_relaxed);
+        newstate = (state & (kWaiterMask | kSignalMask)) | next;
+      }
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel)) {
+        if (!notifyAll && (signals < waiters))
+          return;  // unblocked pre-wait thread
+        if ((state & kStackMask) == kStackMask) return;
+        Waiter* w = &waiters_[state & kStackMask];
+        if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
+        Unpark(w);
+        return;
+      }
+    }
+  }
+
+  class Waiter {
+    friend class EventCount;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter
+    // objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
+    std::mutex mu;
+    std::condition_variable cv;
+    uint64_t epoch = 0;
+    unsigned state = kNotSignaled;
+    enum {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+  };
+
+ private:
+  // State_ layout:
+  // - low kWaiterBits is a stack of waiters committed wait
+  //   (indexes in waiters_ array are used as stack elements,
+  //   kStackMask means empty stack).
+  // - next kWaiterBits is count of waiters in prewait state.
+  // - next kWaiterBits is count of pending signals.
+  // - remaining bits are ABA counter for the stack.
+  //   (stored in Waiter node and incremented on push).
+  static const uint64_t kWaiterBits = 14;
+  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
+  static const uint64_t kWaiterShift = kWaiterBits;
+  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+                                      << kWaiterShift;
+  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
+  static const uint64_t kSignalShift = 2 * kWaiterBits;
+  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
+                                      << kSignalShift;
+  static const uint64_t kSignalInc = 1ull << kSignalShift;
+  static const uint64_t kEpochShift = 3 * kWaiterBits;
+  static const uint64_t kEpochBits = 64 - kEpochShift;
+  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+  static const uint64_t kEpochInc = 1ull << kEpochShift;
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
+
+  static void CheckState(uint64_t state, bool waiter = false) {
+    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
+    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+    eigen_plain_assert(waiters >= signals);
+    eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
+    eigen_plain_assert(!waiter || waiters > 0);
+    (void)waiters;
+    (void)signals;
+  }
+
+  void Park(Waiter* w) {
+    std::unique_lock<std::mutex> lock(w->mu);
+    while (w->state != Waiter::kSignaled) {
+      w->state = Waiter::kWaiting;
+      w->cv.wait(lock);
+    }
+  }
+
+  void Unpark(Waiter* w) {
+    for (Waiter* next; w; w = next) {
+      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
+      next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+      unsigned state;
+      {
+        std::unique_lock<std::mutex> lock(w->mu);
+        state = w->state;
+        w->state = Waiter::kSignaled;
+      }
+      // Avoid notifying if it wasn't waiting.
+      if (state == Waiter::kWaiting) w->cv.notify_one();
+    }
+  }
+
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
new file mode 100644
index 0000000..23a2b54
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h

@@ -0,0 +1,486 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+namespace Eigen {
+
+template <typename Environment>
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+  typedef typename Environment::Task Task;
+  typedef RunQueue<Task, 1024> Queue;
+
+  ThreadPoolTempl(int num_threads, Environment env = Environment())
+      : ThreadPoolTempl(num_threads, true, env) {}
+
+  ThreadPoolTempl(int num_threads, bool allow_spinning,
+                  Environment env = Environment())
+      : env_(env),
+        num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        thread_data_(num_threads),
+        all_coprimes_(num_threads),
+        waiters_(num_threads),
+        global_steal_partition_(EncodePartition(0, num_threads_)),
+        blocked_(0),
+        spinning_(0),
+        done_(false),
+        cancelled_(false),
+        ec_(waiters_) {
+    waiters_.resize(num_threads_);
+    // Calculate coprimes of all numbers [1, num_threads].
+    // Coprimes are used for random walks over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a random starting thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a presudo-random permutation of thread
+    // indices).
+    eigen_plain_assert(num_threads_ < kMaxThreads);
+    for (int i = 1; i <= num_threads_; ++i) {
+      all_coprimes_.emplace_back(i);
+      ComputeCoprimes(i, &all_coprimes_.back());
+    }
+#ifndef EIGEN_THREAD_LOCAL
+    init_barrier_.reset(new Barrier(num_threads_));
+#endif
+    thread_data_.resize(num_threads_);
+    for (int i = 0; i < num_threads_; i++) {
+      SetStealPartition(i, EncodePartition(0, num_threads_));
+      thread_data_[i].thread.reset(
+          env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+#ifndef EIGEN_THREAD_LOCAL
+    // Wait for workers to initialize per_thread_map_. Otherwise we might race
+    // with them in Schedule or CurrentThreadId.
+    init_barrier_->Wait();
+#endif
+  }
+
+  ~ThreadPoolTempl() {
+    done_ = true;
+
+    // Now if all threads block without work, they will start exiting.
+    // But note that threads can continue to work arbitrary long,
+    // block, submit new work, unblock and otherwise live full life.
+    if (!cancelled_) {
+      ec_.Notify(true);
+    } else {
+      // Since we were cancelled, there might be entries in the queues.
+      // Empty them to prevent their destructor from asserting.
+      for (size_t i = 0; i < thread_data_.size(); i++) {
+        thread_data_[i].queue.Flush();
+      }
+    }
+    // Join threads explicitly (by destroying) to avoid destruction order within
+    // this class.
+    for (size_t i = 0; i < thread_data_.size(); ++i)
+      thread_data_[i].thread.reset();
+  }
+
+  void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+    eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
+
+    // Pass this information to each thread queue.
+    for (int i = 0; i < num_threads_; i++) {
+      const auto& pair = partitions[i];
+      unsigned start = pair.first, end = pair.second;
+      AssertBounds(start, end);
+      unsigned val = EncodePartition(start, end);
+      SetStealPartition(i, val);
+    }
+  }
+
+  void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
+    ScheduleWithHint(std::move(fn), 0, num_threads_);
+  }
+
+  void ScheduleWithHint(std::function<void()> fn, int start,
+                        int limit) override {
+    Task t = env_.CreateTask(std::move(fn));
+    PerThread* pt = GetPerThread();
+    if (pt->pool == this) {
+      // Worker thread of this pool, push onto the thread's queue.
+      Queue& q = thread_data_[pt->thread_id].queue;
+      t = q.PushFront(std::move(t));
+    } else {
+      // A free-standing thread (or worker of another pool), push onto a random
+      // queue.
+      eigen_plain_assert(start < limit);
+      eigen_plain_assert(limit <= num_threads_);
+      int num_queues = limit - start;
+      int rnd = Rand(&pt->rand) % num_queues;
+      eigen_plain_assert(start + rnd < limit);
+      Queue& q = thread_data_[start + rnd].queue;
+      t = q.PushBack(std::move(t));
+    }
+    // Note: below we touch this after making w available to worker threads.
+    // Strictly speaking, this can lead to a racy-use-after-free. Consider that
+    // Schedule is called from a thread that is neither main thread nor a worker
+    // thread of this pool. Then, execution of w directly or indirectly
+    // completes overall computations, which in turn leads to destruction of
+    // this. We expect that such scenario is prevented by program, that is,
+    // this is kept alive while any threads can potentially be in Schedule.
+    if (!t.f) {
+      ec_.Notify(false);
+    } else {
+      env_.ExecuteTask(t);  // Push failed, execute directly.
+    }
+  }
+
+  void Cancel() EIGEN_OVERRIDE {
+    cancelled_ = true;
+    done_ = true;
+
+    // Let each thread know it's been cancelled.
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
+    for (size_t i = 0; i < thread_data_.size(); i++) {
+      thread_data_[i].thread->OnCancel();
+    }
+#endif
+
+    // Wake up the threads without work to let them exit on their own.
+    ec_.Notify(true);
+  }
+
+  int NumThreads() const EIGEN_FINAL { return num_threads_; }
+
+  int CurrentThreadId() const EIGEN_FINAL {
+    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
+ private:
+  // Create a single atomic<int> that encodes start and limit information for
+  // each thread.
+  // We expect num_threads_ < 65536, so we can store them in a single
+  // std::atomic<unsigned>.
+  // Exposed publicly as static functions so that external callers can reuse
+  // this encode/decode logic for maintaining their own thread-safe copies of
+  // scheduling and steal domain(s).
+  static const int kMaxPartitionBits = 16;
+  static const int kMaxThreads = 1 << kMaxPartitionBits;
+
+  inline unsigned EncodePartition(unsigned start, unsigned limit) {
+    return (start << kMaxPartitionBits) | limit;
+  }
+
+  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  void AssertBounds(int start, int end) {
+    eigen_plain_assert(start >= 0);
+    eigen_plain_assert(start < end);  // non-zero sized partition
+    eigen_plain_assert(end <= num_threads_);
+  }
+
+  inline void SetStealPartition(size_t i, unsigned val) {
+    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
+  }
+
+  inline unsigned GetStealPartition(int i) {
+    return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
+  }
+
+  void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
+    for (int i = 1; i <= N; i++) {
+      unsigned a = i;
+      unsigned b = N;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes->push_back(i);
+      }
+    }
+  }
+
+  typedef typename Environment::EnvThread Thread;
+
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;          // Random generator state.
+    int thread_id;          // Worker thread index in pool.
+#ifndef EIGEN_THREAD_LOCAL
+    // Prevent false sharing.
+    char pad_[128];
+#endif
+  };
+
+  struct ThreadData {
+    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+    std::unique_ptr<Thread> thread;
+    std::atomic<unsigned> steal_partition;
+    Queue queue;
+  };
+
+  Environment env_;
+  const int num_threads_;
+  const bool allow_spinning_;
+  MaxSizeVector<ThreadData> thread_data_;
+  MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
+  MaxSizeVector<EventCount::Waiter> waiters_;
+  unsigned global_steal_partition_;
+  std::atomic<unsigned> blocked_;
+  std::atomic<bool> spinning_;
+  std::atomic<bool> done_;
+  std::atomic<bool> cancelled_;
+  EventCount ec_;
+#ifndef EIGEN_THREAD_LOCAL
+  std::unique_ptr<Barrier> init_barrier_;
+  std::mutex per_thread_map_mutex_;  // Protects per_thread_map_.
+  std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
+#endif
+
+  // Main worker thread loop.
+  void WorkerLoop(int thread_id) {
+#ifndef EIGEN_THREAD_LOCAL
+    std::unique_ptr<PerThread> new_pt(new PerThread());
+    per_thread_map_mutex_.lock();
+    bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
+    eigen_plain_assert(insertOK);
+    EIGEN_UNUSED_VARIABLE(insertOK);
+    per_thread_map_mutex_.unlock();
+    init_barrier_->Notify();
+    init_barrier_->Wait();
+#endif
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->rand = GlobalThreadIdHash();
+    pt->thread_id = thread_id;
+    Queue& q = thread_data_[thread_id].queue;
+    EventCount::Waiter* waiter = &waiters_[thread_id];
+    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
+    // proportional to num_threads_ and we assume that new work is scheduled at
+    // a constant rate, so we set spin_count to 5000 / num_threads_. The
+    // constant was picked based on a fair dice roll, tune it.
+    const int spin_count =
+        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+      // victim queues it might reverse the order in which ops are executed
+      // compared to the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads the single thread
+      // pools tend to be used for.
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        for (int i = 0; i < spin_count && !t.f; i++) {
+          if (!cancelled_.load(std::memory_order_relaxed)) {
+            t = q.PopFront();
+          }
+        }
+        if (!t.f) {
+          if (!WaitForWork(waiter, &t)) {
+            return;
+          }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    } else {
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        if (!t.f) {
+          t = LocalSteal();
+          if (!t.f) {
+            t = GlobalSteal();
+            if (!t.f) {
+              // Leave one thread spinning. This reduces latency.
+              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+                for (int i = 0; i < spin_count && !t.f; i++) {
+                  if (!cancelled_.load(std::memory_order_relaxed)) {
+                    t = GlobalSteal();
+                  } else {
+                    return;
+                  }
+                }
+                spinning_ = false;
+              }
+              if (!t.f) {
+                if (!WaitForWork(waiter, &t)) {
+                  return;
+                }
+              }
+            }
+          }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    }
+  }
+
+  // Steal tries to steal work from other worker threads in the range [start,
+  // limit) in best-effort manner.
+  Task Steal(unsigned start, unsigned limit) {
+    PerThread* pt = GetPerThread();
+    const size_t size = limit - start;
+    unsigned r = Rand(&pt->rand);
+    // Reduce r into [0, size) range, this utilizes trick from
+    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+    unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+    unsigned inc = all_coprimes_[size - 1][index];
+
+    for (unsigned i = 0; i < size; i++) {
+      eigen_plain_assert(start + victim < limit);
+      Task t = thread_data_[start + victim].queue.PopBack();
+      if (t.f) {
+        return t;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return Task();
+  }
+
+  // Steals work within threads belonging to the partition.
+  Task LocalSteal() {
+    PerThread* pt = GetPerThread();
+    unsigned partition = GetStealPartition(pt->thread_id);
+    // If thread steal partition is the same as global partition, there is no
+    // need to go through the steal loop twice.
+    if (global_steal_partition_ == partition) return Task();
+    unsigned start, limit;
+    DecodePartition(partition, &start, &limit);
+    AssertBounds(start, limit);
+
+    return Steal(start, limit);
+  }
+
+  // Steals work from any other thread in the pool.
+  Task GlobalSteal() {
+    return Steal(0, num_threads_);
+  }
+
+
+  // WaitForWork blocks until new work is available (returns true), or if it is
+  // time to exit (returns false). Can optionally return a task to execute in t
+  // (in such case t.f != nullptr on return).
+  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+    eigen_plain_assert(!t->f);
+    // We already did best-effort emptiness check in Steal, so prepare for
+    // blocking.
+    ec_.Prewait();
+    // Now do a reliable emptiness check.
+    int victim = NonEmptyQueueIndex();
+    if (victim != -1) {
+      ec_.CancelWait();
+      if (cancelled_) {
+        return false;
+      } else {
+        *t = thread_data_[victim].queue.PopBack();
+        return true;
+      }
+    }
+    // Number of blocked threads is used as termination condition.
+    // If we are shutting down and all worker threads blocked without work,
+    // that's we are done.
+    blocked_++;
+    // TODO is blocked_ required to be unsigned?
+    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
+      ec_.CancelWait();
+      // Almost done, but need to re-check queues.
+      // Consider that all queues are empty and all worker threads are preempted
+      // right after incrementing blocked_ above. Now a free-standing thread
+      // submits work and calls destructor (which sets done_). If we don't
+      // re-check queues, we will exit leaving the work unexecuted.
+      if (NonEmptyQueueIndex() != -1) {
+        // Note: we must not pop from queues before we decrement blocked_,
+        // otherwise the following scenario is possible. Consider that instead
+        // of checking for emptiness we popped the only element from queues.
+        // Now other worker threads can start exiting, which is bad if the
+        // work item submits other work. So we just check emptiness here,
+        // which ensures that all worker threads exit at the same time.
+        blocked_--;
+        return true;
+      }
+      // Reached stable termination state.
+      ec_.Notify(true);
+      return false;
+    }
+    ec_.CommitWait(waiter);
+    blocked_--;
+    return true;
+  }
+
+  int NonEmptyQueueIndex() {
+    PerThread* pt = GetPerThread();
+    // We intentionally design NonEmptyQueueIndex to steal work from
+    // anywhere in the queue so threads don't block in WaitForWork() forever
+    // when all threads in their partition go to sleep. Steal is still local.
+    const size_t size = thread_data_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      if (!thread_data_[victim].queue.Empty()) {
+        return victim;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return -1;
+  }
+
+  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
+    return std::hash<std::thread::id>()(std::this_thread::get_id());
+  }
+
+  EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+#ifndef EIGEN_THREAD_LOCAL
+    static PerThread dummy;
+    auto it = per_thread_map_.find(GlobalThreadIdHash());
+    if (it == per_thread_map_.end()) {
+      return &dummy;
+    } else {
+      return it->second.get();
+    }
+#else
+    EIGEN_THREAD_LOCAL PerThread per_thread_;
+    PerThread* pt = &per_thread_;
+    return pt;
+#endif
+  }
+
+  static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+    uint64_t current = *state;
+    // Update the internal state
+    *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+    // Generate the random output (using the PCG-XSH-RS scheme)
+    return static_cast<unsigned>((current ^ (current >> 22)) >>
+                                 (22 + (current >> 61)));
+  }
+};
+
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
new file mode 100644
index 0000000..b572ebc
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h

@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque or Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically continuous (can wraparound, but no stray
+// occupied elements). Owner operates on one end of this region, remote thread
+// operates on the other end. Synchronization between these threads
+// (potential consumption of the last element and take up of the last empty
+// element) happens by means of state variable in each element. States are:
+// empty, busy (in process of insertion of removal) and ready. Threads claim
+// elements (empty->busy and ready->busy transitions) by means of a CAS
+// operation. The finishing transition (busy->empty and busy->ready) are done
+// with plain store as the element is exclusively owned by the current thread.
+//
+// Note: we could permit only pointers as elements, then we would not need
+// separate state variable as null/non-null pointer value would serve as state,
+// but that would require malloc/free per operation for large, complex values
+// (and this is designed to store std::function<()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+  RunQueue() : front_(0), back_(0) {
+    // require power-of-two for fast masking
+    eigen_plain_assert((kSize & (kSize - 1)) == 0);
+    eigen_plain_assert(kSize > 2);            // why would you do this?
+    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
+    for (unsigned i = 0; i < kSize; i++)
+      array_[i].state.store(kEmpty, std::memory_order_relaxed);
+  }
+
+  ~RunQueue() { eigen_plain_assert(Size() == 0); }
+
+  // PushFront inserts w at the beginning of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushFront(Work w) {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[front & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return w;
+    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopFront removes and returns the first element in the queue.
+  // If the queue was empty returns default-constructed Work.
+  Work PopFront() {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(front - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    front = ((front - 1) & kMask2) | (front & ~kMask2);
+    front_.store(front, std::memory_order_relaxed);
+    return w;
+  }
+
+  // PushBack adds w at the end of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushBack(Work w) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(back - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return w;
+    back = ((back - 1) & kMask2) | (back & ~kMask2);
+    back_.store(back, std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopBack removes and returns the last elements in the queue.
+  Work PopBack() {
+    if (Empty()) return Work();
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[back & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady ||
+        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+      return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+    return w;
+  }
+
+  // PopBackHalf removes and returns half last elements in the queue.
+  // Returns number of elements removed.
+  unsigned PopBackHalf(std::vector<Work>* result) {
+    if (Empty()) return 0;
+    std::unique_lock<std::mutex> lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    unsigned size = Size();
+    unsigned mid = back;
+    if (size > 1) mid = back + (size - 1) / 2;
+    unsigned n = 0;
+    unsigned start = 0;
+    for (; static_cast<int>(mid - back) >= 0; mid--) {
+      Elem* e = &array_[mid & kMask];
+      uint8_t s = e->state.load(std::memory_order_relaxed);
+      if (n == 0) {
+        if (s != kReady || !e->state.compare_exchange_strong(
+                               s, kBusy, std::memory_order_acquire))
+          continue;
+        start = mid;
+      } else {
+        // Note: no need to store temporal kBusy, we exclusively own these
+        // elements.
+        eigen_plain_assert(s == kReady);
+      }
+      result->push_back(std::move(e->w));
+      e->state.store(kEmpty, std::memory_order_release);
+      n++;
+    }
+    if (n != 0)
+      back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+    return n;
+  }
+
+  // Size returns current queue size.
+  // Can be called by any thread at any time.
+  unsigned Size() const { return SizeOrNotEmpty<true>(); }
+
+  // Empty tests whether container is empty.
+  // Can be called by any thread at any time.
+  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+  // Delete all the elements from the queue.
+  void Flush() {
+    while (!Empty()) {
+      PopFront();
+    }
+  }
+
+ private:
+  static const unsigned kMask = kSize - 1;
+  static const unsigned kMask2 = (kSize << 1) - 1;
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+  enum {
+    kEmpty,
+    kBusy,
+    kReady,
+  };
+  std::mutex mutex_;
+  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+  // front/back, respectively. The remaining bits contain modification counters
+  // that are incremented on Push operations. This allows us to (1) distinguish
+  // between empty and full conditions (if we would use log(kSize) bits for
+  // position, these conditions would be indistinguishable); (2) obtain
+  // consistent snapshot of front_/back_ for Size operation using the
+  // modification counters.
+  std::atomic<unsigned> front_;
+  std::atomic<unsigned> back_;
+  Elem array_[kSize];
+
+  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+  // only whether the size is 0 is guaranteed to be correct.
+  // Can be called by any thread at any time.
+  template<bool NeedSizeEstimate>
+  unsigned SizeOrNotEmpty() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    unsigned front = front_.load(std::memory_order_acquire);
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) {
+        front = front1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        continue;
+      }
+      if (NeedSizeEstimate) {
+        return CalculateSize(front, back);
+      } else {
+        // This value will be 0 if the queue is empty, and undefined otherwise.
+        unsigned maybe_zero = ((front ^ back) & kMask2);
+        // Queue size estimate must agree with maybe zero check on the queue
+        // empty/non-empty state.
+        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+        return maybe_zero;
+      }
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE
+  unsigned CalculateSize(unsigned front, unsigned back) const {
+    int size = (front & kMask2) - (back & kMask2);
+    // Fix overflow.
+    if (size < 0) size += 2 * kSize;
+    // Order of modification in push/pop is crafted to make the queue look
+    // larger than it is during concurrent modifications. E.g. push can
+    // increment size before the corresponding pop has decremented it.
+    // So the computed size can be up to kSize + 1, fix it.
+    if (size > static_cast<int>(kSize)) size = kSize;
+    return static_cast<unsigned>(size);
+  }
+
+  RunQueue(const RunQueue&) = delete;
+  void operator=(const RunQueue&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
new file mode 100644
index 0000000..a05685f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h

@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+
+// Try to come up with a portable way to cancel a thread
+#if EIGEN_OS_GNULINUX
+  #define EIGEN_THREAD_CANCEL(t)                  \
+    pthread_cancel(t.native_handle());
+  #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
+#else
+#define EIGEN_THREAD_CANCEL(t)
+#endif
+
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
new file mode 100644
index 0000000..d94a064
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h

@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+  struct Task {
+    std::function<void()> f;
+  };
+
+  // EnvThread constructor must start the thread,
+  // destructor must join the thread.
+  class EnvThread {
+   public:
+    EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
+    ~EnvThread() { thr_.join(); }
+    // This function is called when the threadpool is cancelled.
+    void OnCancel() { }
+
+   private:
+    std::thread thr_;
+  };
+
+  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
+  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+  void ExecuteTask(const Task& t) { t.f(); }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 0000000..4e68474
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h

@@ -0,0 +1,301 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+#ifdef EIGEN_AVOID_THREAD_LOCAL
+
+#ifdef EIGEN_THREAD_LOCAL
+#undef EIGEN_THREAD_LOCAL
+#endif
+
+#else
+
+#if EIGEN_MAX_CPP_VER >= 11 &&                         \
+    ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
+     __has_feature(cxx_thread_local)                || \
+     (EIGEN_COMP_MSVC >= 1900) )
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+// Disable TLS for Apple and Android builds with older toolchains.
+#if defined(__APPLE__)
+// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
+// __IPHONE_8_0.
+#include <Availability.h>
+#include <TargetConditionals.h>
+#endif
+// Checks whether C++11's `thread_local` storage duration specifier is
+// supported.
+#if defined(__apple_build_version__) &&     \
+    ((__apple_build_version__ < 8000042) || \
+     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
+// Notes: Xcode's clang did not support `thread_local` until version
+// 8, and even then not for all iOS < 9.0.
+#undef EIGEN_THREAD_LOCAL
+
+#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
+// There are platforms for which TLS should not be used even though the compiler
+// makes it seem like it's supported (Android NDK < r12b for example).
+// This is primarily because of linker problems and toolchain misconfiguration:
+// TLS isn't supported until NDK r12b per
+// https://developer.android.com/ndk/downloads/revision_history.html
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif  // __has_include(<android/ndk-version.h>)
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
+    defined(__NDK_MINOR__) &&                                               \
+    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef EIGEN_THREAD_LOCAL
+#endif
+#endif  // defined(__ANDROID__) && defined(__clang__)
+
+#endif  // EIGEN_AVOID_THREAD_LOCAL
+
+namespace Eigen {
+
+namespace internal {
+template <typename T>
+struct ThreadLocalNoOpInitialize {
+  void operator()(T&) const {}
+};
+
+template <typename T>
+struct ThreadLocalNoOpRelease {
+  void operator()(T&) const {}
+};
+
+}  // namespace internal
+
+// Thread local container for elements of type T, that does not use thread local
+// storage. As long as the number of unique threads accessing this storage
+// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
+// use a mutex for synchronization.
+//
+// Type `T` has to be default constructible, and by default each thread will get
+// a default constructed value. It is possible to specify custom `initialize`
+// callable, that will be called lazily from each thread accessing this object,
+// and will be passed a default initialized object of type `T`. Also it's
+// possible to pass a custom `release` callable, that will be invoked before
+// calling ~T().
+//
+// Example:
+//
+//   struct Counter {
+//     int value = 0;
+//   }
+//
+//   Eigen::ThreadLocal<Counter> counter(10);
+//
+//   // Each thread will have access to it's own counter object.
+//   Counter& cnt = counter.local();
+//   cnt++;
+//
+// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
+// std::this_thread::get_id() to identify threads. This value is not guaranteed
+// to be unique except for the life of the thread. A newly created thread may
+// get an OS-specific ID equal to that of an already destroyed thread.
+//
+// Somewhat similar to TBB thread local storage, with similar restrictions:
+// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
+//
+template <typename T,
+          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
+          typename Release = internal::ThreadLocalNoOpRelease<T>>
+class ThreadLocal {
+  // We preallocate default constructed elements in MaxSizedVector.
+  static_assert(std::is_default_constructible<T>::value,
+                "ThreadLocal data type must be default constructible");
+
+ public:
+  explicit ThreadLocal(int capacity)
+      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize)
+      : ThreadLocal(capacity, std::move(initialize),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize, Release release)
+      : initialize_(std::move(initialize)),
+        release_(std::move(release)),
+        capacity_(capacity),
+        data_(capacity_),
+        ptr_(capacity_),
+        filled_records_(0) {
+    eigen_assert(capacity_ >= 0);
+    data_.resize(capacity_);
+    for (int i = 0; i < capacity_; ++i) {
+      ptr_.emplace_back(nullptr);
+    }
+  }
+
+  T& local() {
+    std::thread::id this_thread = std::this_thread::get_id();
+    if (capacity_ == 0) return SpilledLocal(this_thread);
+
+    std::size_t h = std::hash<std::thread::id>()(this_thread);
+    const int start_idx = h % capacity_;
+
+    // NOTE: From the definition of `std::this_thread::get_id()` it is
+    // guaranteed that we never can have concurrent insertions with the same key
+    // to our hash-map like data structure. If we didn't find an element during
+    // the initial traversal, it's guaranteed that no one else could have
+    // inserted it while we are in this function. This allows to massively
+    // simplify out lock-free insert-only hash map.
+
+    // Check if we already have an element for `this_thread`.
+    int idx = start_idx;
+    while (ptr_[idx].load() != nullptr) {
+      ThreadIdAndValue& record = *(ptr_[idx].load());
+      if (record.thread_id == this_thread) return record.value;
+
+      idx += 1;
+      if (idx >= capacity_) idx -= capacity_;
+      if (idx == start_idx) break;
+    }
+
+    // If we are here, it means that we found an insertion point in lookup
+    // table at `idx`, or we did a full traversal and table is full.
+
+    // If lock-free storage is full, fallback on mutex.
+    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
+
+    // We double check that we still have space to insert an element into a lock
+    // free storage. If old value in `filled_records_` is larger than the
+    // records capacity, it means that some other thread added an element while
+    // we were traversing lookup table.
+    int insertion_index =
+        filled_records_.fetch_add(1, std::memory_order_relaxed);
+    if (insertion_index >= capacity_) return SpilledLocal(this_thread);
+
+    // At this point it's guaranteed that we can access to
+    // data_[insertion_index_] without a data race.
+    data_[insertion_index].thread_id = this_thread;
+    initialize_(data_[insertion_index].value);
+
+    // That's the pointer we'll put into the lookup table.
+    ThreadIdAndValue* inserted = &data_[insertion_index];
+
+    // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
+    ThreadIdAndValue* empty = nullptr;
+
+    // Now we have to find an insertion point into the lookup table. We start
+    // from the `idx` that was identified as an insertion point above, it's
+    // guaranteed that we will have an empty record somewhere in a lookup table
+    // (because we created a record in the `data_`).
+    const int insertion_idx = idx;
+
+    do {
+      // Always start search from the original insertion candidate.
+      idx = insertion_idx;
+      while (ptr_[idx].load() != nullptr) {
+        idx += 1;
+        if (idx >= capacity_) idx -= capacity_;
+        // If we did a full loop, it means that we don't have any free entries
+        // in the lookup table, and this means that something is terribly wrong.
+        eigen_assert(idx != insertion_idx);
+      }
+      // Atomic CAS of the pointer guarantees that any other thread, that will
+      // follow this pointer will see all the mutations in the `data_`.
+    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
+
+    return inserted->value;
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  void ForEach(std::function<void(std::thread::id, T&)> f) {
+    // Reading directly from `data_` is unsafe, because only CAS to the
+    // record in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      f(record->thread_id, record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      f(kv.first, kv.second);
+    }
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  ~ThreadLocal() {
+    // Reading directly from `data_` is unsafe, because only CAS to the record
+    // in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      release_(record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      release_(kv.second);
+    }
+  }
+
+ private:
+  struct ThreadIdAndValue {
+    std::thread::id thread_id;
+    T value;
+  };
+
+  // Use unordered map guarded by a mutex when lock free storage is full.
+  T& SpilledLocal(std::thread::id this_thread) {
+    std::unique_lock<std::mutex> lock(mu_);
+
+    auto it = per_thread_map_.find(this_thread);
+    if (it == per_thread_map_.end()) {
+      auto result = per_thread_map_.emplace(this_thread, T());
+      eigen_assert(result.second);
+      initialize_((*result.first).second);
+      return (*result.first).second;
+    } else {
+      return it->second;
+    }
+  }
+
+  Initialize initialize_;
+  Release release_;
+  const int capacity_;
+
+  // Storage that backs lock-free lookup table `ptr_`. Records stored in this
+  // storage contiguously starting from index 0.
+  MaxSizeVector<ThreadIdAndValue> data_;
+
+  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
+  // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
+  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
+
+  // Number of records stored in the `data_`.
+  std::atomic<int> filled_records_;
+
+  // We fallback on per thread map if lock-free storage is full. In practice
+  // this should never happen, if `capacity_` is a reasonable estimate of the
+  // number of threads running in a system.
+  std::mutex mu_;  // Protects per_thread_map_.
+  std::unordered_map<std::thread::id, T> per_thread_map_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
new file mode 100644
index 0000000..25030dc
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h

@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+  // Submits a closure to be run by a thread in the pool.
+  virtual void Schedule(std::function<void()> fn) = 0;
+
+  // Submits a closure to be run by threads in the range [start, end) in the
+  // pool.
+  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
+                                int /*end*/) {
+    // Just defer to Schedule in case sub-classes aren't interested in
+    // overriding this functionality.
+    Schedule(fn);
+  }
+
+  // If implemented, stop processing the closures that have been enqueued.
+  // Currently running closures may still be processed.
+  // If not implemented, does nothing.
+  virtual void Cancel() {}
+
+  // Returns the number of threads in the pool.
+  virtual int NumThreads() const = 0;
+
+  // Returns a logical thread index between 0 and NumThreads() - 1 if called
+  // from one of the threads in the pool. Returns -1 otherwise.
+  virtual int CurrentThreadId() const = 0;
+
+  virtual ~ThreadPoolInterface() {}
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
new file mode 100644
index 0000000..a859c7b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h

@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H

diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
new file mode 100644
index 0000000..f662dee
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h

@@ -0,0 +1,538 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11META_H
+#define EIGEN_CXX11META_H
+
+#include <vector>
+#include "EmulateArray.h"
+
+#include "CXX11Workarounds.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file CXX11/util/CXX11Meta.h
+  * This file contains generic metaprogramming classes which are not specifically related to Eigen.
+  * This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
+  */
+
+template<typename... tt>
+struct type_list { constexpr static int count = sizeof...(tt); };
+
+template<typename t, typename... tt>
+struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
+
+template<typename T, T... nn>
+struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
+
+template<typename T, T n, T... nn>
+struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+/* numeric list constructors
+ *
+ * equivalencies:
+ *     constructor                                              result
+ *     typename gen_numeric_list<int, 5>::type                  numeric_list<int, 0,1,2,3,4>
+ *     typename gen_numeric_list_reversed<int, 5>::type         numeric_list<int, 4,3,2,1,0>
+ *     typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
+ *     typename gen_numeric_list_repeated<int, 0, 5>::type      numeric_list<int, 0,0,0,0,0>
+ */
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list                     : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
+template<typename T, T start, T... ii>                    struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed                     : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
+template<typename T, T start, T... ii>                    struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair                           : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
+template<typename T, T a, T b, T start, T... ii>                    struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated                 : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
+template<typename T, T V, T... nn>                struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
+
+/* list manipulation: concatenate */
+
+template<class a, class b> struct concat;
+
+template<typename... as, typename... bs> struct concat<type_list<as...>,       type_list<bs...>>        { typedef type_list<as..., bs...> type; };
+template<typename T, T... as, T... bs>   struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
+
+template<typename... p> struct mconcat;
+template<typename a>                             struct mconcat<a>           { typedef a type; };
+template<typename a, typename b>                 struct mconcat<a, b>        : concat<a, b> {};
+template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
+
+/* list manipulation: extract slices */
+
+template<int n, typename x> struct take;
+template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
+template<int n>                             struct take<n, type_list<>>         { typedef type_list<> type; };
+template<typename a, typename... as>        struct take<0, type_list<a, as...>> { typedef type_list<> type; };
+template<>                                  struct take<0, type_list<>>         { typedef type_list<> type; };
+
+template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
+// XXX The following breaks in gcc-11, and is invalid anyways.
+// template<typename T, int n>               struct take<n, numeric_list<T>>           { typedef numeric_list<T> type; };
+template<typename T, T a, T... as>        struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
+template<typename T>                      struct take<0, numeric_list<T>>           { typedef numeric_list<T> type; };
+
+template<typename T, int n, T... ii>      struct h_skip_helper_numeric;
+template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
+template<typename T, T i, T... ii>        struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
+template<typename T, int n>               struct h_skip_helper_numeric<T, n>           { typedef numeric_list<T> type; };
+template<typename T>                      struct h_skip_helper_numeric<T, 0>           { typedef numeric_list<T> type; };
+
+template<int n, typename... tt>             struct h_skip_helper_type;
+template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
+template<typename t, typename... tt>        struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
+template<int n>                             struct h_skip_helper_type<n>           { typedef type_list<> type; };
+template<>                                  struct h_skip_helper_type<0>           { typedef type_list<> type; };
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+template<int n>
+struct h_skip {
+  template<typename T, T... ii>
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  template<typename... tt>
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+};
+
+template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
+
+template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
+
+/* list manipulation: retrieve single element from list */
+
+template<int n, typename x> struct get;
+
+template<int n, typename a, typename... as>               struct get<n, type_list<a, as...>>   : get<n-1, type_list<as...>> {};
+template<typename a, typename... as>                      struct get<0, type_list<a, as...>>   { typedef a type; };
+
+template<typename T, int n, T a, T... as>                        struct get<n, numeric_list<T, a, as...>>   : get<n-1, numeric_list<T, as...>> {};
+template<typename T, T a, T... as>                               struct get<0, numeric_list<T, a, as...>>   { constexpr static T value = a; };
+
+template<std::size_t n, typename T, T a, T... as> constexpr T       array_get(const numeric_list<T, a, as...>&) {
+   return get<(int)n, numeric_list<T, a, as...>>::value;
+}
+
+/* always get type, regardless of dummy; good for parameter pack expansion */
+
+template<typename T, T dummy, typename t> struct id_numeric  { typedef t type; };
+template<typename dummy, typename t>      struct id_type     { typedef t type; };
+
+/* equality checking, flagged version */
+
+template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
+
+/* apply_op to list */
+
+template<
+  bool from_left, // false
+  template<typename, typename> class op,
+  typename additional_param,
+  typename... values
+>
+struct h_apply_op_helper                                        { typedef type_list<typename op<values, additional_param>::type...> type; };
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename... values
+>
+struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
+
+template<
+  bool from_left,
+  template<typename, typename> class op,
+  typename additional_param
+>
+struct h_apply_op
+{
+  template<typename... values>
+  constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
+  { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
+};
+
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename a
+>
+struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
+
+template<
+  template<typename, typename> class op,
+  typename additional_param,
+  typename a
+>
+struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
+
+/* see if an element is in a list */
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  bool last_check_positive = false
+>
+struct contained_in_list;
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list
+>
+struct contained_in_list<test, check_against, h_list, true>
+{
+  constexpr static bool value = true;
+};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename a,
+  typename... as
+>
+struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
+
+template<
+  template<typename, typename> class test,
+  typename check_against
+  EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)
+>
+struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; };
+
+/* see if an element is in a list and check for global flags */
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  int default_flags = 0,
+  bool last_check_positive = false,
+  int last_check_flags = default_flags
+>
+struct contained_in_list_gf;
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename h_list,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
+{
+  constexpr static bool value = true;
+  constexpr static int global_flags = last_check_flags;
+};
+
+template<
+  template<typename, typename> class test,
+  typename check_against,
+  typename a,
+  typename... as,
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
+
+template<
+  template<typename, typename> class test,
+  typename check_against
+  EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
+  int default_flags,
+  int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
+
+/* generic reductions */
+
+template<
+  typename Reducer,
+  typename... Ts
+> struct reduce;
+
+template<
+  typename Reducer
+> struct reduce<Reducer>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
+};
+
+template<
+  typename Reducer,
+  typename A
+> struct reduce<Reducer, A>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
+};
+
+template<
+  typename Reducer,
+  typename A,
+  typename... Ts
+> struct reduce<Reducer, A, Ts...>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+    return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
+  }
+};
+
+/* generic binary operations */
+
+struct sum_op           {
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
+  static constexpr int Identity = 0;
+};
+struct product_op       {
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
+  static constexpr int Identity = 1;
+};
+
+struct logical_and_op   { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
+struct logical_or_op    { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
+
+struct equal_op         { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
+struct not_equal_op     { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
+struct lesser_op        { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
+struct lesser_equal_op  { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
+struct greater_op       { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
+struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
+
+/* generic unary operations */
+
+struct not_op                { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a)      { return !a;      } };
+struct negation_op           { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a)      { return -a;      } };
+struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
+
+
+/* reductions for lists */
+
+// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
+// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
+// does...
+template<typename... Ts>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+{
+  return reduce<product_op, Ts...>::run(ts...);
+}
+
+template<typename... Ts>
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+{
+  return reduce<sum_op, Ts...>::run(ts...);
+}
+
+/* reverse arrays */
+
+template<typename Array, int... n>
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
+{
+  return {{array_get<sizeof...(n) - n - 1>(arr)...}};
+}
+
+template<typename T, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
+{
+  return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
+}
+
+
+/* generic array reductions */
+
+// can't reuse standard reduce() interface above because Intel's Compiler
+// *really* doesn't like it, so we just reimplement the stuff
+// (start from N - 1 and work down to 0 because specialization for
+// n == N - 1 also doesn't work in Intel's compiler, so it goes into
+// an infinite loop)
+template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
+struct h_array_reduce {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  {
+    return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
+  }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+struct h_array_reduce<Reducer, T, N, 0>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
+  {
+    return array_get<0>(arr);
+  }
+};
+
+template<typename Reducer, typename T>
+struct h_array_reduce<Reducer, T, 0>
+{
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
+  {
+    return identity;
+  }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+{
+  return h_array_reduce<Reducer, T, N>::run(arr, identity);
+}
+
+/* standard array reductions */
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+{
+  return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
+}
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+{
+  return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
+/* zip an array */
+
+template<typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+{
+  return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
+}
+
+template<typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+{
+  return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* zip an array and reduce the result */
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+{
+  return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+{
+  return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array */
+
+template<typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+{
+  return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
+}
+
+template<typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+{
+  return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array and reduce */
+
+template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+{
+  return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
+}
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+{
+  return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* repeat a value n times (and make an array out of it
+ * usage:
+ *   array<int, 16> = repeat<16>(42);
+ */
+
+template<int n>
+struct h_repeat
+{
+  template<typename t, int... ii>
+  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
+  {
+    return {{ typename id_numeric<int, ii, t>::type(v)... }};
+  }
+};
+
+template<int n, typename t>
+constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
+
+/* instantiate a class by a C-style array */
+template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
+struct h_instantiate_by_c_array;
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
+  }
+};
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
+  }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
+{
+  static InstType run(ArrType* arr, Ps... args)
+  {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
+InstType instantiate_by_c_array(ArrType* arr)
+{
+  return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11META_H

diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
new file mode 100644
index 0000000..056736c
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h

@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11WORKAROUNDS_H
+#define EIGEN_CXX11WORKAROUNDS_H
+
+/* COMPATIBILITY CHECKS
+ * (so users of compilers that are too old get some realistic error messages)
+ */
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
+#error Intel Compiler only supports required C++ features since version 13.1.
+// note that most stuff in principle works with 13.0 but when combining
+// some features, at some point 13.0 will just fail with an internal assertion
+#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
+// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
+// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
+// it sees. Unfortunately, that is still not our #error directive, but at least the output is
+// short enough the user has a chance to see that the compiler version is not sufficient for
+// the funky template mojo we use.
+#pragma GCC diagnostic error "-Wfatal-errors"
+#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
+#endif
+
+/* Check that the compiler at least claims to support C++11. It might not be sufficient
+ * because the compiler may not implement it correctly, but at least we'll know.
+ * On the other hand, visual studio still doesn't claim to support C++11 although it's
+ * compliant enugh for our purpose.
+ */
+#if (EIGEN_COMP_CXXVER < 11)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic error "-Wfatal-errors"
+#endif
+#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+/* std::get is only constexpr in C++14, not yet in C++11
+ */
+
+
+template<std::size_t I_, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
+
+/* Suppose you have a template of the form
+ * template<typename T> struct X;
+ * And you want to specialize it in such a way:
+ *    template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
+ *    template<>                            struct X<Foo<>>          { ::: };
+ * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
+ * g++ can only match templates called with parameter packs if the number of template
+ * arguments is not a fixed size (so inside the first specialization, referencing
+ * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following:
+ *    template<typename S...> struct X<Foo<S...>> { ::: }:
+ * as an additional (!) specialization, which will then only match the empty case.
+ * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
+ * so we have to create a workaround for this.
+ */
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)    mt... n
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)   , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n)        n...
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)       , n...
+#else
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11WORKAROUNDS_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */

diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
new file mode 100644
index 0000000..834b20b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h

@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_ARRAY_H
+#define EIGEN_EMULATE_ARRAY_H
+
+
+
+// The array class is only available starting with cxx11. Emulate our own here
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
+// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
+
+namespace Eigen {
+template <typename T, size_t n> class array {
+ public:
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& front() { return values[0]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  static std::size_t size() { return n; }
+
+  T values[n];
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array() { }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v) {
+    EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
+    EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
+    EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
+    EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
+    EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(
+      const T& v1, const T& v2, const T& v3, const T& v4,
+      const T& v5, const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+    values[7] = v8;
+  }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
+    eigen_assert(l.size() == n);
+    internal::smart_copy(l.begin(), l.end(), values);
+  }
+#endif
+};
+
+
+// Specialize array for zero size
+template <typename T> class array<T, 0> {
+ public:
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& operator[] (size_t) {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& front() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& front() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& back() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& back() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array() : dummy() { }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
+    eigen_assert(l.size() == 0);
+  }
+#endif
+
+ private:
+  T dummy;
+};
+
+// Comparison operator
+// Todo: implement !=, <, <=, >,  and >=
+template<class T, std::size_t N>
+EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
+  for (std::size_t i = 0; i < N; ++i) {
+    if (lhs[i] != rhs[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+namespace internal {
+template<std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
+  return a[I_];
+}
+template<std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
+  return a[I_];
+}
+
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+  enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+  enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<const array<T,N> > {
+  enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+  enum { value = N };
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#else
+
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
+#include <array>
+namespace Eigen {
+
+template <typename T, std::size_t N> using array = std::array<T, N>;
+
+namespace internal {
+/* std::get is only constexpr in C++14, not yet in C++11
+ *     - libstdc++ from version 4.7 onwards has it nevertheless,
+ *                                          so use that
+ *     - libstdc++ older versions: use _M_instance directly
+ *     - libc++ all versions so far: use __elems_ directly
+ *     - all other libs: use std::get to be portable, but
+ *                       this may not be constexpr
+ */
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
+#define STD_GET_ARR_HACK             a._M_instance[I_]
+#elif defined(_LIBCPP_VERSION)
+#define STD_GET_ARR_HACK             a.__elems_[I_]
+#else
+#define STD_GET_ARR_HACK             std::template get<I_, T, N>(a)
+#endif
+
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+
+#undef STD_GET_ARR_HACK
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif
+
+#endif  // EIGEN_EMULATE_ARRAY_H

diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
new file mode 100644
index 0000000..277ab14
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h

@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+  * \ingroup Core
+  *
+  * \brief The MaxSizeVector class.
+  *
+  * The %MaxSizeVector provides a subset of std::vector functionality.
+  *
+  * The goal is to provide basic std::vector operations when using
+  * std::vector is not an option (e.g. on GPU or when compiling using
+  * FMA/AVX, as this can cause either compilation failures or illegal
+  * instruction failures).
+  *
+  * Beware: The constructors are not API compatible with these of
+  * std::vector.
+  */
+template <typename T>
+class MaxSizeVector {
+  static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0),
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+  }
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n),
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+    size_t i = 0;
+    EIGEN_TRY
+    {
+      for(; i < size_; ++i) { new (&data_[i]) T(init); }
+    }
+    EIGEN_CATCH(...)
+    {
+      // Construction failed, destruct in reverse order:
+      for(; (i+1) > 0; --i) { data_[i-1].~T(); }
+      internal::handmade_aligned_free(data_);
+      EIGEN_THROW;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  ~MaxSizeVector() {
+    for (size_t i = size_; i > 0; --i) {
+      data_[i-1].~T();
+    }
+    internal::handmade_aligned_free(data_);
+  }
+
+  void resize(size_t n) {
+    eigen_assert(n <= reserve_);
+    for (; size_ < n; ++size_) {
+      new (&data_[size_]) T;
+    }
+    for (; size_ > n; --size_) {
+      data_[size_-1].~T();
+    }
+    eigen_assert(size_ == n);
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(t);
+  }
+
+  // For C++03 compatibility this only takes one argument
+  template<class X>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void emplace_back(const X& x) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(x);
+  }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& operator[] (size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& operator[] (size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void pop_back() {
+    eigen_assert(size_ > 0);
+    data_[--size_].~T();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H

diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
new file mode 100644
index 0000000..f8f1c5d
--- /dev/null
+++ b/unsupported/Eigen/EulerAngles

@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup EulerAngles_Module EulerAngles module
+  * \brief This module provides generic euler angles rotation.
+  *
+  * Euler angles are a way to represent 3D rotation.
+  *
+  * In order to use this module in your code, include this header:
+  * \code
+  * #include <unsupported/Eigen/EulerAngles>
+  * \endcode
+  *
+  * See \ref EulerAngles for more information.
+  *
+  */
+
+}
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H

diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 2a640b6..c8c311a 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT

@@ -12,11 +12,8 @@
 
 #include <complex>
 #include <vector>
-using ::std::vector;
 #include <map>
-using ::std::map;
-using ::std::multimap;
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 
 /**
@@ -71,6 +68,8 @@
   */
  
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #  include <fftw3.h>
@@ -132,8 +131,6 @@
   const T_SrcMat & m_src;
   T_FftIfc & m_ifc;
   Index m_nfft;
-private:
-  fft_fwd_proxy& operator=(const fft_fwd_proxy&);
 };
 
 template<typename T_SrcMat,typename T_FftIfc> 
@@ -152,8 +149,6 @@
   const T_SrcMat & m_src;
   T_FftIfc & m_ifc;
   Index m_nfft;
-private:
-  fft_inv_proxy& operator=(const fft_inv_proxy&);
 };
 
 
@@ -292,6 +287,7 @@
     void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
     {
       typedef typename ComplexDerived::Scalar src_type;
+      typedef typename ComplexDerived::RealScalar real_type;
       typedef typename OutputDerived::Scalar dst_type;
       const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
@@ -332,9 +328,9 @@
             tmp.head(nhead) = src.head(nhead);
             tmp.tail(ntail) = src.tail(ntail);
             if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
-              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*src_type(.5);
+              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*real_type(.5);
             }else{ // expanding -- split the old Nyquist bin into two halves
-              tmp(nhead) = src(nhead) * src_type(.5);
+              tmp(nhead) = src(nhead) * real_type(.5);
               tmp(tmp.size()-nhead) = tmp(nhead);
             }
           }
@@ -417,5 +413,7 @@
 }
 
 }
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/unsupported/Eigen/IterativeSolvers b/unsupported/Eigen/IterativeSolvers
new file mode 100644
index 0000000..a3f58d6
--- /dev/null
+++ b/unsupported/Eigen/IterativeSolvers

@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
+#define EIGEN_ITERATIVE_SOLVERS_MODULE_H
+
+#include "../../Eigen/Sparse"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
+
+
+/**
+  * \defgroup IterativeLinearSolvers_Module Iterative solvers module
+  * This module aims to provide various iterative linear and non linear solver algorithms.
+  * It currently provides:
+  *  - a constrained conjugate gradient
+  *  - a Householder GMRES implementation
+  *  - an IDR(s) implementation
+  *  - a DGMRES implementation
+  *  - a MINRES implementation
+  *
+  * \code
+  * #include <unsupported/Eigen/IterativeSolvers>
+  * \endcode
+  */
+
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#ifndef EIGEN_MPL2_ONLY
+#include "src/IterativeSolvers/IterationController.h"
+#include "src/IterativeSolvers/ConstrainedConjGrad.h"
+#endif
+
+#include "src/IterativeSolvers/IncompleteLU.h"
+#include "src/IterativeSolvers/GMRES.h"
+#include "src/IterativeSolvers/DGMRES.h"
+//#include "src/IterativeSolvers/SSORPreconditioner.h"
+#include "src/IterativeSolvers/MINRES.h"
+#include "src/IterativeSolvers/IDRS.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+#endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H

diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index c932c06..5f5afb8 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct

@@ -13,6 +13,8 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
 namespace Eigen {
 
 /**

diff --git a/unsupported/Eigen/LevenbergMarquardt b/unsupported/Eigen/LevenbergMarquardt
new file mode 100644
index 0000000..1090505
--- /dev/null
+++ b/unsupported/Eigen/LevenbergMarquardt

@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE
+#define EIGEN_LEVENBERGMARQUARDT_MODULE
+
+// #include <vector>
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
+
+#include "../../Eigen/SparseQR"
+
+/**
+  * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
+  *
+  * \code
+  * #include </Eigen/LevenbergMarquardt>
+  * \endcode
+  *
+  * 
+  */
+
+#include "../../Eigen/SparseCore"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#include "src/LevenbergMarquardt/LMqrsolv.h"
+#include "src/LevenbergMarquardt/LMcovar.h"
+#include "src/LevenbergMarquardt/LMpar.h"
+
+#endif
+
+#include "src/LevenbergMarquardt/LevenbergMarquardt.h"
+#include "src/LevenbergMarquardt/LMonestep.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_LEVENBERGMARQUARDT_MODULE

diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
new file mode 100644
index 0000000..c4ea4ec
--- /dev/null
+++ b/unsupported/Eigen/MPRealSupport

@@ -0,0 +1,213 @@
+// This file is part of a joint effort between Eigen, a lightweight C++ template library
+// for linear algebra, and MPFR C++, a C++ interface to MPFR library (http://www.holoborodko.com/pavel/)
+//
+// Copyright (C) 2010-2012 Pavel Holoborodko <pavel@holoborodko.com>
+// Copyright (C) 2010 Konstantin Holoborodko <konstantin@holoborodko.com>
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MPREALSUPPORT_MODULE_H
+#define EIGEN_MPREALSUPPORT_MODULE_H
+
+#include "../../Eigen/Core"
+#include <mpreal.h>
+
+namespace Eigen {
+  
+/**
+  * \defgroup MPRealSupport_Module MPFRC++ Support module
+  * \code
+  * #include <Eigen/MPRealSupport>
+  * \endcode
+  *
+  * This module provides support for multi precision floating point numbers
+  * via the <a href="http://www.holoborodko.com/pavel/mpfr">MPFR C++</a>
+  * library which itself is built upon <a href="http://www.mpfr.org/">MPFR</a>/<a href="http://gmplib.org/">GMP</a>.
+  *
+  * \warning MPFR C++ is licensed under the GPL.
+  *
+  * You can find a copy of MPFR C++ that is known to be compatible in the unsupported/test/mpreal folder.
+  *
+  * Here is an example:
+  *
+\code
+#include <iostream>
+#include <Eigen/MPRealSupport>
+#include <Eigen/LU>
+using namespace mpfr;
+using namespace Eigen;
+int main()
+{
+  // set precision to 256 bits (double has only 53 bits)
+  mpreal::set_default_prec(256);
+  // Declare matrix and vector types with multi-precision scalar type
+  typedef Matrix<mpreal,Dynamic,Dynamic>  MatrixXmp;
+  typedef Matrix<mpreal,Dynamic,1>        VectorXmp;
+
+  MatrixXmp A = MatrixXmp::Random(100,100);
+  VectorXmp b = VectorXmp::Random(100);
+
+  // Solve Ax=b using LU
+  VectorXmp x = A.lu().solve(b);
+  std::cout << "relative error: " << (A*x - b).norm() / b.norm() << std::endl;
+  return 0;
+}
+\endcode
+  *
+  */
+	
+  template<> struct NumTraits<mpfr::mpreal>
+    : GenericNumTraits<mpfr::mpreal>
+  {
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = HugeCost,
+      AddCost  = HugeCost,
+      MulCost  = HugeCost
+    };
+
+    typedef mpfr::mpreal Real;
+    typedef mpfr::mpreal NonInteger;
+    
+    static inline Real highest  (long Precision = mpfr::mpreal::get_default_prec()) { return  mpfr::maxval(Precision); }
+    static inline Real lowest   (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
+
+    // Constants
+    static inline Real Pi      (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_pi(Precision);        }
+    static inline Real Euler   (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_euler(Precision);     }
+    static inline Real Log2    (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_log2(Precision);      }
+    static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_catalan(Precision);   }
+
+    static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::machine_epsilon(Precision); }
+    static inline Real epsilon (const Real& x)                                      { return mpfr::machine_epsilon(x); }
+
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+    static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec())  { return std::numeric_limits<Real>::digits10(Precision); }
+    static inline int digits10 (const Real& x)                                      { return std::numeric_limits<Real>::digits10(x); }
+    
+    static inline int digits ()               { return std::numeric_limits<Real>::digits(); }
+    static inline int digits (const Real& x)  { return std::numeric_limits<Real>::digits(x); }
+#endif
+
+    static inline Real dummy_precision()
+    {
+      mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
+      return mpfr::machine_epsilon(weak_prec);
+    }
+  };
+
+  namespace internal {
+
+  template<> inline mpfr::mpreal random<mpfr::mpreal>()
+  {
+    return mpfr::random();
+  }
+
+  template<> inline mpfr::mpreal random<mpfr::mpreal>(const mpfr::mpreal& a, const mpfr::mpreal& b)
+  {
+    return a + (b-a) * random<mpfr::mpreal>();
+  }
+
+  inline bool isMuchSmallerThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+  {
+    return mpfr::abs(a) <= mpfr::abs(b) * eps;
+  }
+
+  inline bool isApprox(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+  {
+    return mpfr::isEqualFuzzy(a,b,eps);
+  }
+
+  inline bool isApproxOrLessThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+  {
+    return a <= b || mpfr::isEqualFuzzy(a,b,eps);
+  }
+
+  template<> inline long double cast<mpfr::mpreal,long double>(const mpfr::mpreal& x)
+  { return x.toLDouble(); }
+
+  template<> inline double cast<mpfr::mpreal,double>(const mpfr::mpreal& x)
+  { return x.toDouble(); }
+
+  template<> inline long cast<mpfr::mpreal,long>(const mpfr::mpreal& x)
+  { return x.toLong(); }
+
+  template<> inline int cast<mpfr::mpreal,int>(const mpfr::mpreal& x)
+  { return int(x.toLong()); }
+
+  // Specialize GEBP kernel and traits for mpreal (no need for peeling, nor complicated stuff)
+  // This also permits to directly call mpfr's routines and avoid many temporaries produced by mpreal
+    template<>
+    class gebp_traits<mpfr::mpreal, mpfr::mpreal, false, false>
+    {
+    public:
+      typedef mpfr::mpreal ResScalar;
+      enum {
+        Vectorizable = false,
+        LhsPacketSize = 1,
+        RhsPacketSize = 1,
+        ResPacketSize = 1,
+        NumberOfRegisters = 1,
+        nr = 1,
+        mr = 1,
+        LhsProgress = 1,
+        RhsProgress = 1
+      };
+      typedef ResScalar LhsPacket;
+      typedef ResScalar RhsPacket;
+      typedef ResScalar ResPacket;
+      typedef LhsPacket LhsPacket4Packing;
+      
+    };
+
+
+
+    template<typename Index, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs>
+    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,DataMapper,1,1,ConjugateLhs,ConjugateRhs>
+    {
+      typedef mpfr::mpreal mpreal;
+
+      EIGEN_DONT_INLINE
+      void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, 
+                      Index rows, Index depth, Index cols, const mpreal& alpha,
+                      Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
+      {
+        if(rows==0 || cols==0 || depth==0)
+          return;
+
+        mpreal  acc1(0,mpfr_get_prec(blockA[0].mpfr_srcptr())),
+                tmp (0,mpfr_get_prec(blockA[0].mpfr_srcptr()));
+
+        if(strideA==-1) strideA = depth;
+        if(strideB==-1) strideB = depth;
+
+        for(Index i=0; i<rows; ++i)
+        {
+          for(Index j=0; j<cols; ++j)
+          {
+            const mpreal *A = blockA + i*strideA + offsetA;
+            const mpreal *B = blockB + j*strideB + offsetB;
+            
+            acc1 = 0;
+            for(Index k=0; k<depth; k++)
+            {
+              mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_srcptr(), B[k].mpfr_srcptr(), mpreal::get_default_rnd());
+              mpfr_add(acc1.mpfr_ptr(), acc1.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
+            }
+            
+            mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_srcptr(), alpha.mpfr_srcptr(), mpreal::get_default_rnd());
+            mpfr_add(res(i,j).mpfr_ptr(), res(i,j).mpfr_srcptr(), acc1.mpfr_srcptr(),  mpreal::get_default_rnd());
+          }
+        }
+      }
+    };
+  } // end namespace internal
+}
+
+#endif // EIGEN_MPREALSUPPORT_MODULE_H

diff --git a/unsupported/Eigen/MatrixFunctions b/unsupported/Eigen/MatrixFunctions
index dfdc3ca..20c23d1 100644
--- a/unsupported/Eigen/MatrixFunctions
+++ b/unsupported/Eigen/MatrixFunctions

@@ -13,11 +13,10 @@
 
 #include <cfloat>
 #include <list>
-using ::std::list;
 
-#include <Eigen/Core>
-#include <Eigen/LU>
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/Core"
+#include "../../Eigen/LU"
+#include "../../Eigen/Eigenvalues"
 
 /**
   * \defgroup MatrixFunctions_Module Matrix functions module
@@ -54,12 +53,16 @@
   *
   */
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #include "src/MatrixFunctions/MatrixExponential.h"
 #include "src/MatrixFunctions/MatrixFunction.h"
 #include "src/MatrixFunctions/MatrixSquareRoot.h"
 #include "src/MatrixFunctions/MatrixLogarithm.h"
 #include "src/MatrixFunctions/MatrixPower.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 
 /** 
 \page matrixbaseextra_page
@@ -83,7 +86,9 @@
 \param[in]  M  a square matrix.
 \returns  expression representing \f$ \cos(M) \f$.
 
-This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
+This function computes the matrix cosine. Use ArrayBase::cos() for computing the entry-wise cosine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
 
 \sa \ref matrixbase_sin "sin()" for an example.
 
@@ -124,6 +129,9 @@
 initial condition \f$ y(0) = y_0 \f$ is given by
 \f$ y(t) = \exp(M) y_0 \f$.
 
+The matrix exponential is different from applying the exp function to all the entries in the matrix.
+Use ArrayBase::exp() if you want to do the latter.
+
 The cost of the computation is approximately \f$ 20 n^3 \f$ for
 matrices of size \f$ n \f$. The number 20 depends weakly on the
 norm of the matrix.
@@ -157,8 +165,8 @@
 \include MatrixExponential.cpp
 Output: \verbinclude MatrixExponential.out
 
-\note \p M has to be a matrix of \c float, \c double, \c long double
-\c complex<float>, \c complex<double>, or \c complex<long double> .
+\note \p M has to be a matrix of \c float, \c double, `long double`
+\c complex<float>, \c complex<double>, or `complex<long double>` .
 
 
 \subsection matrixbase_log MatrixBase::log()
@@ -178,6 +186,9 @@
 multiple solutions; this function returns a matrix whose eigenvalues
 have imaginary part in the interval \f$ (-\pi,\pi] \f$.
 
+The matrix logarithm is different from applying the log function to all the entries in the matrix.
+Use ArrayBase::log() if you want to do the latter.
+
 In the real case, the matrix \f$ M \f$ should be invertible and
 it should have no eigenvalues which are real and negative (pairs of
 complex conjugate eigenvalues are allowed). In the complex case, it
@@ -212,9 +223,8 @@
 \include MatrixLogarithm.cpp
 Output: \verbinclude MatrixLogarithm.out
 
-\note \p M has to be a matrix of \c float, \c double, <tt>long
-double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
-double> .
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or `complex<long double>`.
 
 \sa MatrixBase::exp(), MatrixBase::matrixFunction(), 
     class MatrixLogarithmAtomic, MatrixBase::sqrt().
@@ -233,7 +243,8 @@
 
 The matrix power \f$ M^p \f$ is defined as \f$ \exp(p \log(M)) \f$,
 where exp denotes the matrix exponential, and log denotes the matrix
-logarithm.
+logarithm. This is different from raising all the entries in the matrix
+to the p-th power. Use ArrayBase::pow() if you want to do the latter.
 
 If \p p is complex, the scalar type of \p M should be the type of \p
 p . \f$ M^p \f$ simply evaluates into \f$ \exp(p \log(M)) \f$.
@@ -268,15 +279,6 @@
 against inaccurate result, e.g. \code
 #include <unsupported/Eigen/MatrixFunctions>
 #include <iostream>
-using ::std::cout;
-using ::std::cin;
-using ::std::cerr;
-using ::std::ios;
-using ::std::endl;
-using ::std::iostream;
-using ::std::ios_base;
-using ::std::ostream;
-using ::std::istream;
 
 int main()
 {
@@ -327,9 +329,9 @@
 \include MatrixPower_optimal.cpp
 Output: \verbinclude MatrixPower_optimal.out
 
-\note \p M has to be a matrix of \c float, \c double, <tt>long
-double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
-double> .
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or
+\c complex<long double> .
 
 \sa MatrixBase::exp(), MatrixBase::log(), class MatrixPower.
 
@@ -401,7 +403,9 @@
 \param[in]  M  a square matrix.
 \returns  expression representing \f$ \sin(M) \f$.
 
-This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
+This function computes the matrix sine. Use ArrayBase::sin() for computing the entry-wise sine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
 
 Example: \include MatrixSine.cpp
 Output: \verbinclude MatrixSine.out
@@ -438,7 +442,9 @@
 
 The matrix square root of \f$ M \f$ is the matrix \f$ M^{1/2} \f$
 whose square is the original matrix; so if \f$ S = M^{1/2} \f$ then
-\f$ S^2 = M \f$. 
+\f$ S^2 = M \f$. This is different from taking the square root of all
+the entries in the matrix; use ArrayBase::sqrt() if you want to do the
+latter.
 
 In the <b>real case</b>, the matrix \f$ M \f$ should be invertible and
 it should have no eigenvalues which are real and negative (pairs of
@@ -495,3 +501,4 @@
 */
 
 #endif // EIGEN_MATRIX_FUNCTIONS
+

diff --git a/unsupported/Eigen/MoreVectorization b/unsupported/Eigen/MoreVectorization
new file mode 100644
index 0000000..7662b47
--- /dev/null
+++ b/unsupported/Eigen/MoreVectorization

@@ -0,0 +1,24 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREVECTORIZATION_MODULE_H
+#define EIGEN_MOREVECTORIZATION_MODULE_H
+
+#include "../../Eigen/Core"
+
+namespace Eigen {
+
+/**
+  * \defgroup MoreVectorization More vectorization module
+  */
+
+}
+
+#include "src/MoreVectorization/MathFunctions.h"
+
+#endif // EIGEN_MOREVECTORIZATION_MODULE_H

diff --git a/unsupported/Eigen/NonLinearOptimization b/unsupported/Eigen/NonLinearOptimization
new file mode 100644
index 0000000..961f192
--- /dev/null
+++ b/unsupported/Eigen/NonLinearOptimization

@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE
+#define EIGEN_NONLINEAROPTIMIZATION_MODULE
+
+#include <vector>
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
+
+/**
+  * \defgroup NonLinearOptimization_Module Non linear optimization module
+  *
+  * \code
+  * #include <unsupported/Eigen/NonLinearOptimization>
+  * \endcode
+  *
+  * This module provides implementation of two important algorithms in non linear
+  * optimization. In both cases, we consider a system of non linear functions. Of
+  * course, this should work, and even work very well if those functions are
+  * actually linear. But if this is so, you should probably better use other
+  * methods more fitted to this special case.
+  *
+  * One algorithm allows to find a least-squares solution of such a system
+  * (Levenberg-Marquardt algorithm) and the second one is used to find 
+  * a zero for the system (Powell hybrid "dogleg" method).
+  *
+  * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
+  * Minpack is a very famous, old, robust and well renowned package, written in
+  * fortran. Those implementations have been carefully tuned, tested, and used
+  * for several decades.
+  *
+  * The original fortran code was automatically translated using f2c (http://en.wikipedia.org/wiki/F2c) in C,
+  * then c++, and then cleaned by several different authors.
+  * The last one of those cleanings being our starting point : 
+  * http://devernay.free.fr/hacks/cminpack.html
+  * 
+  * Finally, we ported this code to Eigen, creating classes and API
+  * coherent with Eigen. When possible, we switched to Eigen
+  * implementation, such as most linear algebra (vectors, matrices, stable norms).
+  *
+  * Doing so, we were very careful to check the tests we setup at the very
+  * beginning, which ensure that the same results are found.
+  *
+  * \section Tests Tests
+  * 
+  * The tests are placed in the file unsupported/test/NonLinear.cpp.
+  * 
+  * There are two kinds of tests : those that come from examples bundled with cminpack.
+  * They guaranty we get the same results as the original algorithms (value for 'x',
+  * for the number of evaluations of the function, and for the number of evaluations
+  * of the Jacobian if ever).
+  * 
+  * Other tests were added by myself at the very beginning of the 
+  * process and check the results for Levenberg-Marquardt using the reference data 
+  * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
+  * carefully checked that the same results were obtained when modifying the
+  * code. Please note that we do not always get the exact same decimals as they do,
+  * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
+  * which is 64 bits on most platforms (x86 and amd64, at least).
+  * I've performed those tests on several other implementations of Levenberg-Marquardt, and
+  * (c)minpack performs VERY well compared to those, both in accuracy and speed.
+  * 
+  * The documentation for running the tests is on the wiki
+  * http://eigen.tuxfamily.org/index.php?title=Tests
+  * 
+  * \section API API: overview of methods
+  * 
+  * Both algorithms needs a functor computing the Jacobian. It can be computed by
+  * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
+  * differences (see \ref NumericalDiff_Module). For instance:
+  *\code
+  * MyFunc func;
+  * NumericalDiff<MyFunc> func_with_num_diff(func);
+  * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
+  * \endcode
+  * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
+  * you.
+  * 
+  * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and 
+  * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original 
+  * minpack package that you probably should NOT use until you are porting a code that
+  * was previously using minpack. They just define a 'simple' API with default values 
+  * for some parameters.
+  * 
+  * All algorithms are provided using two APIs :
+  *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : 
+  * this way the caller have control over the steps
+  *     - one where the user just calls a method (optimize() or solve()) which will 
+  * handle the loop: init + loop until a stop condition is met. Those are provided for
+  *  convenience.
+  * 
+  * As an example, the method LevenbergMarquardt::minimize() is 
+  * implemented as follow: 
+  * \code
+  * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
+  * {
+  *     Status status = minimizeInit(x, mode);
+  *     do {
+  *         status = minimizeOneStep(x, mode);
+  *     } while (status==Running);
+  *     return status;
+  * }
+  * \endcode
+  * 
+  * \section examples Examples
+  * 
+  * The easiest way to understand how to use this module is by looking at the many examples in the file
+  * unsupported/test/NonLinearOptimization.cpp.
+  */
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#include "src/NonLinearOptimization/qrsolv.h"
+#include "src/NonLinearOptimization/r1updt.h"
+#include "src/NonLinearOptimization/r1mpyq.h"
+#include "src/NonLinearOptimization/rwupdt.h"
+#include "src/NonLinearOptimization/fdjac1.h"
+#include "src/NonLinearOptimization/lmpar.h"
+#include "src/NonLinearOptimization/dogleg.h"
+#include "src/NonLinearOptimization/covar.h"
+
+#include "src/NonLinearOptimization/chkder.h"
+
+#endif
+
+#include "src/NonLinearOptimization/HybridNonLinearSolver.h"
+#include "src/NonLinearOptimization/LevenbergMarquardt.h"
+
+
+#endif // EIGEN_NONLINEAROPTIMIZATION_MODULE

diff --git a/unsupported/Eigen/NumericalDiff b/unsupported/Eigen/NumericalDiff
new file mode 100644
index 0000000..0668f96
--- /dev/null
+++ b/unsupported/Eigen/NumericalDiff

@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMERICALDIFF_MODULE
+#define EIGEN_NUMERICALDIFF_MODULE
+
+#include "../../Eigen/Core"
+
+namespace Eigen {
+
+/**
+  * \defgroup NumericalDiff_Module Numerical differentiation module
+  *
+  * \code
+  * #include <unsupported/Eigen/NumericalDiff>
+  * \endcode
+  *
+  * See http://en.wikipedia.org/wiki/Numerical_differentiation
+  *
+  * Warning : this should NOT be confused with automatic differentiation, which
+  * is a different method and has its own module in Eigen : \ref
+  * AutoDiff_Module.
+  *
+  * Currently only "Forward" and "Central" schemes are implemented. Those
+  * are basic methods, and there exist some more elaborated way of
+  * computing such approximates. They are implemented using both
+  * proprietary and free software, and usually requires linking to an
+  * external library. It is very easy for you to write a functor
+  * using such software, and the purpose is quite orthogonal to what we
+  * want to achieve with Eigen.
+  *
+  * This is why we will not provide wrappers for every great numerical
+  * differentiation software that exist, but should rather stick with those
+  * basic ones, that still are useful for testing.
+  *
+  * Also, the \ref NonLinearOptimization_Module needs this in order to
+  * provide full features compatibility with the original (c)minpack
+  * package.
+  *
+  */
+}
+
+//@{
+
+#include "src/NumericalDiff/NumericalDiff.h"
+
+//@}
+
+
+#endif // EIGEN_NUMERICALDIFF_MODULE

diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
new file mode 100644
index 0000000..f8c2130
--- /dev/null
+++ b/unsupported/Eigen/OpenGLSupport

@@ -0,0 +1,322 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_OPENGL_MODULE
+#define EIGEN_OPENGL_MODULE
+
+#include "../../Eigen/Geometry"
+
+#if defined(__APPLE_CC__)
+  #include <OpenGL/gl.h>
+#else
+  #include <GL/gl.h>
+#endif
+
+namespace Eigen {
+
+/**
+  * \defgroup OpenGLSUpport_Module OpenGL Support module
+  *
+  * This module provides wrapper functions for a couple of OpenGL functions
+  * which simplify the way to pass Eigen's object to openGL.
+  * Here is an example:
+  * 
+  * \code
+  * // You need to add path_to_eigen/unsupported to your include path.
+  * #include <Eigen/OpenGLSupport>
+  * // ...
+  * Vector3f x, y;
+  * Matrix3f rot;
+  * 
+  * glVertex(y + x * rot);
+  * 
+  * Quaternion q;
+  * glRotate(q);
+  * 
+  * // ...
+  * \endcode
+  *
+  */
+//@{
+
+#define EIGEN_GL_FUNC_DECLARATION(FUNC)                                                                             \
+namespace internal {                                                                                                \
+  template< typename XprType,                                                                                       \
+            typename Scalar = typename XprType::Scalar,                                                             \
+            int Rows = XprType::RowsAtCompileTime,                                                                  \
+            int Cols = XprType::ColsAtCompileTime,                                                                  \
+            bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags&LinearAccessBit)                         \
+                              && bool(XprType::Flags&DirectAccessBit)                                               \
+                              && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                      \
+                                                                                                                    \
+  template<typename XprType, typename Scalar, int Rows, int Cols>                                                   \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> {                                     \
+    inline static void run(const XprType& p) {                                                                      \
+      EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(p); }       \
+  };                                                                                                                \
+}                                                                                                                   \
+                                                                                                                    \
+template<typename Derived> inline void FUNC(const Eigen::DenseBase<Derived>& p) {                                   \
+  EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(p.derived());                                        \
+}
+
+
+#define EIGEN_GL_FUNC_SPECIALIZATION_MAT(FUNC,SCALAR,ROWS,COLS,SUFFIX)                                              \
+namespace internal {                                                                                                \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> {      \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
+  };                                                                                                                \
+}
+
+  
+#define EIGEN_GL_FUNC_SPECIALIZATION_VEC(FUNC,SCALAR,SIZE,SUFFIX)                                                   \
+namespace internal {                                                                                                \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> {         \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
+  };                                                                                                                \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> {         \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
+  };                                                                                                                \
+}
+
+  
+EIGEN_GL_FUNC_DECLARATION       (glVertex)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION       (glTexCoord)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION       (glColor)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION       (glNormal)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,int,    3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,short,  3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,double, 3,3dv)
+
+inline void glScale2fv(const float*  v) { glScalef(v[0], v[1], 1.f);  }
+inline void glScale2dv(const double* v) { glScaled(v[0], v[1], 1.0);  }
+inline void glScale3fv(const float*  v) { glScalef(v[0], v[1], v[2]); }
+inline void glScale3dv(const double* v) { glScaled(v[0], v[1], v[2]); }
+
+EIGEN_GL_FUNC_DECLARATION       (glScale)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 3,3dv)
+
+template<typename Scalar> void glScale(const UniformScaling<Scalar>& s)  { glScale(Matrix<Scalar,3,1>::Constant(s.factor())); }
+
+inline void glTranslate2fv(const float*  v) { glTranslatef(v[0], v[1], 0.f);  }
+inline void glTranslate2dv(const double* v) { glTranslated(v[0], v[1], 0.0);  }
+inline void glTranslate3fv(const float*  v) { glTranslatef(v[0], v[1], v[2]); }
+inline void glTranslate3dv(const double* v) { glTranslated(v[0], v[1], v[2]); }
+
+EIGEN_GL_FUNC_DECLARATION       (glTranslate)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float,  2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float,  3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 3,3dv)
+
+template<typename Scalar> void glTranslate(const Translation<Scalar,2>& t)  { glTranslate(t.vector()); }
+template<typename Scalar> void glTranslate(const Translation<Scalar,3>& t)  { glTranslate(t.vector()); }
+
+EIGEN_GL_FUNC_DECLARATION       (glMultMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,float,  4,4,f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,double, 4,4,d)
+
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Affine>& t)        { glMultMatrix(t.matrix()); }
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Projective>& t)    { glMultMatrix(t.matrix()); }
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,AffineCompact>& t) { glMultMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+
+EIGEN_GL_FUNC_DECLARATION       (glLoadMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,float,  4,4,f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,double, 4,4,d)
+
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t)        { glLoadMatrix(t.matrix()); }
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Projective>& t)    { glLoadMatrix(t.matrix()); }
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompact>& t) { glLoadMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+
+inline void glRotate(const Rotation2D<float>& rot)
+{
+  glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
+}
+inline void glRotate(const Rotation2D<double>& rot)
+{
+  glRotated(rot.angle()*180.0/double(EIGEN_PI), 0.0, 0.0, 1.0);
+}
+
+template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
+{  
+  Transform<typename Derived::Scalar,3,Projective> tr(rot);
+  glMultMatrix(tr.matrix());
+}
+
+#define EIGEN_GL_MAKE_CONST_const const
+#define EIGEN_GL_MAKE_CONST__ 
+#define EIGEN_GL_EVAL(X) X
+
+#define EIGEN_GL_FUNC1_DECLARATION(FUNC,ARG1,CONST)                                                                             \
+namespace internal {                                                                                                            \
+  template< typename XprType,                                                                                                   \
+            typename Scalar = typename XprType::Scalar,                                                                         \
+            int Rows = XprType::RowsAtCompileTime,                                                                              \
+            int Cols = XprType::ColsAtCompileTime,                                                                              \
+            bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags&LinearAccessBit)                                     \
+                              && bool(XprType::Flags&DirectAccessBit)                                                           \
+                              && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>                           \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                                  \
+                                                                                                                                \
+  template<typename XprType, typename Scalar, int Rows, int Cols>                                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> {                                                 \
+    inline static void run(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) {                                      \
+      EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(a,p); }                 \
+  };                                                                                                                            \
+}                                                                                                                               \
+                                                                                                                                \
+template<typename Derived> inline void FUNC(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) Eigen::DenseBase<Derived>& p) {   \
+  EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(a,p.derived());                                                  \
+}
+
+
+#define EIGEN_GL_FUNC1_SPECIALIZATION_MAT(FUNC,ARG1,CONST,SCALAR,ROWS,COLS,SUFFIX)                                              \
+namespace internal {                                                                                                            \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> {                  \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
+  }; \
+}
+
+  
+#define EIGEN_GL_FUNC1_SPECIALIZATION_VEC(FUNC,ARG1,CONST,SCALAR,SIZE,SUFFIX)                                                   \
+namespace internal {                                                                                                            \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> {                     \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
+  };                                                                                                                            \
+  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> {                     \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
+  };                                                                                                                            \
+}
+
+EIGEN_GL_FUNC1_DECLARATION       (glGet,GLenum,_)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,float,  4,4,Floatv)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev)
+
+// glUniform API
+
+#ifdef GL_VERSION_2_0
+
+inline void glUniform2fv_ei  (GLint loc, const float* v)         { glUniform2fv(loc,1,v); }
+inline void glUniform2iv_ei  (GLint loc, const int* v)           { glUniform2iv(loc,1,v); }
+
+inline void glUniform3fv_ei  (GLint loc, const float* v)         { glUniform3fv(loc,1,v); }
+inline void glUniform3iv_ei  (GLint loc, const int* v)           { glUniform3iv(loc,1,v); }
+
+inline void glUniform4fv_ei  (GLint loc, const float* v)         { glUniform4fv(loc,1,v); }
+inline void glUniform4iv_ei  (GLint loc, const int* v)           { glUniform4iv(loc,1,v); }
+
+inline void glUniformMatrix2fv_ei  (GLint loc, const float* v)         { glUniformMatrix2fv(loc,1,false,v); }
+inline void glUniformMatrix3fv_ei  (GLint loc, const float* v)         { glUniformMatrix3fv(loc,1,false,v); }
+inline void glUniformMatrix4fv_ei  (GLint loc, const float* v)         { glUniformMatrix4fv(loc,1,false,v); }
+
+
+EIGEN_GL_FUNC1_DECLARATION       (glUniform,GLint,const)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        2,2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          2,2iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        3,3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          3,3iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        4,4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          4,4iv_ei)
+
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,2,Matrix2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,3,Matrix3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,4,Matrix4fv_ei)
+
+#endif
+
+#ifdef GL_VERSION_2_1
+
+inline void glUniformMatrix2x3fv_ei(GLint loc, const float* v)         { glUniformMatrix2x3fv(loc,1,false,v); }
+inline void glUniformMatrix3x2fv_ei(GLint loc, const float* v)         { glUniformMatrix3x2fv(loc,1,false,v); }
+inline void glUniformMatrix2x4fv_ei(GLint loc, const float* v)         { glUniformMatrix2x4fv(loc,1,false,v); }
+inline void glUniformMatrix4x2fv_ei(GLint loc, const float* v)         { glUniformMatrix4x2fv(loc,1,false,v); }
+inline void glUniformMatrix3x4fv_ei(GLint loc, const float* v)         { glUniformMatrix3x4fv(loc,1,false,v); }
+inline void glUniformMatrix4x3fv_ei(GLint loc, const float* v)         { glUniformMatrix4x3fv(loc,1,false,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,3,Matrix2x3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,2,Matrix3x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,4,Matrix2x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,2,Matrix4x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,4,Matrix3x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,3,Matrix4x3fv_ei)
+
+#endif
+
+#ifdef GL_VERSION_3_0
+
+inline void glUniform2uiv_ei (GLint loc, const unsigned int* v)  { glUniform2uiv(loc,1,v); }
+inline void glUniform3uiv_ei (GLint loc, const unsigned int* v)  { glUniform3uiv(loc,1,v); }
+inline void glUniform4uiv_ei (GLint loc, const unsigned int* v)  { glUniform4uiv(loc,1,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei)
+
+#endif
+
+#ifdef GL_ARB_gpu_shader_fp64
+inline void glUniform2dv_ei  (GLint loc, const double* v)        { glUniform2dv(loc,1,v); }
+inline void glUniform3dv_ei  (GLint loc, const double* v)        { glUniform3dv(loc,1,v); }
+inline void glUniform4dv_ei  (GLint loc, const double* v)        { glUniform4dv(loc,1,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       2,2dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       3,3dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       4,4dv_ei)
+#endif
+
+
+//@}
+
+}
+
+#endif // EIGEN_OPENGL_MODULE

diff --git a/unsupported/Eigen/Polynomials b/unsupported/Eigen/Polynomials
new file mode 100644
index 0000000..32ce2a2
--- /dev/null
+++ b/unsupported/Eigen/Polynomials

@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIALS_MODULE_H
+#define EIGEN_POLYNOMIALS_MODULE_H
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/Eigenvalues"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
+#if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
+  #ifndef EIGEN_HIDE_HEAVY_CODE
+  #define EIGEN_HIDE_HEAVY_CODE
+  #endif
+#elif defined EIGEN_HIDE_HEAVY_CODE
+  #undef EIGEN_HIDE_HEAVY_CODE
+#endif
+
+/**
+  * \defgroup Polynomials_Module Polynomials module
+  * \brief This module provides a QR based polynomial solver.
+	*
+  * To use this module, add
+  * \code
+  * #include <unsupported/Eigen/Polynomials>
+  * \endcode
+	* at the start of your source file.
+  */
+
+#include "src/Polynomials/PolynomialUtils.h"
+#include "src/Polynomials/Companion.h"
+#include "src/Polynomials/PolynomialSolver.h"
+
+/**
+	\page polynomials Polynomials defines functions for dealing with polynomials
+	and a QR based polynomial solver.
+	\ingroup Polynomials_Module
+
+	The remainder of the page documents first the functions for evaluating, computing
+	polynomials, computing estimates about polynomials and next the QR based polynomial
+	solver.
+
+	\section polynomialUtils convenient functions to deal with polynomials
+	\subsection roots_to_monicPolynomial
+	The function
+	\code
+	void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+	\endcode
+	computes the coefficients \f$ a_i \f$ of
+
+	\f$ p(x) = a_0 + a_{1}x + ... + a_{n-1}x^{n-1} + x^n \f$
+
+	where \f$ p \f$ is known through its roots i.e. \f$ p(x) = (x-r_1)(x-r_2)...(x-r_n) \f$.
+
+	\subsection poly_eval
+	The function
+	\code
+	T poly_eval( const Polynomials& poly, const T& x )
+	\endcode
+	evaluates a polynomial at a given point using stabilized H&ouml;rner method.
+
+	The following code: first computes the coefficients in the monomial basis of the monic polynomial that has the provided roots;
+	then, it evaluates the computed polynomial, using a stabilized H&ouml;rner method.
+
+	\include PolynomialUtils1.cpp
+  Output: \verbinclude PolynomialUtils1.out
+
+	\subsection Cauchy bounds
+	The function
+	\code
+	Real cauchy_max_bound( const Polynomial& poly )
+	\endcode
+	provides a maximum bound (the Cauchy one: \f$C(p)\f$) for the absolute value of a root of the given polynomial i.e.
+	\f$ \forall r_i \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
+	\f$ |r_i| \le C(p) = \sum_{k=0}^{d} \left | \frac{a_k}{a_d} \right | \f$
+	The leading coefficient \f$ p \f$: should be non zero \f$a_d \neq 0\f$.
+
+
+	The function
+	\code
+	Real cauchy_min_bound( const Polynomial& poly )
+	\endcode
+	provides a minimum bound (the Cauchy one: \f$c(p)\f$) for the absolute value of a non zero root of the given polynomial i.e.
+	\f$ \forall r_i \neq 0 \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
+	\f$ |r_i| \ge c(p) = \left( \sum_{k=0}^{d} \left | \frac{a_k}{a_0} \right | \right)^{-1} \f$
+
+
+
+
+	\section QR polynomial solver class
+	Computes the complex roots of a polynomial by computing the eigenvalues of the associated companion matrix with the QR algorithm.
+	
+	The roots of \f$ p(x) = a_0 + a_1 x + a_2 x^2 + a_{3} x^3 + x^4 \f$ are the eigenvalues of
+	\f$
+	\left [
+	\begin{array}{cccc}
+	0 & 0 &  0 & a_0 \\
+	1 & 0 &  0 & a_1 \\
+	0 & 1 &  0 & a_2 \\
+	0 & 0 &  1 & a_3
+	\end{array} \right ]
+	\f$
+
+	However, the QR algorithm is not guaranteed to converge when there are several eigenvalues with same modulus.
+
+	Therefore the current polynomial solver is guaranteed to provide a correct result only when the complex roots \f$r_1,r_2,...,r_d\f$ have distinct moduli i.e.
+	
+	\f$ \forall i,j \in [1;d],~ \| r_i \| \neq \| r_j \| \f$.
+
+	With 32bit (float) floating types this problem shows up frequently.
+  However, almost always, correct accuracy is reached even in these cases for 64bit
+  (double) floating types and small polynomial degree (<20).
+
+	\include PolynomialSolver1.cpp
+	
+	In the above example:
+	
+	-# a simple use of the polynomial solver is shown;
+	-# the accuracy problem with the QR algorithm is presented: a polynomial with almost conjugate roots is provided to the solver.
+	Those roots have almost same module therefore the QR algorithm failed to converge: the accuracy
+	of the last root is bad;
+	-# a simple way to circumvent the problem is shown: use doubles instead of floats.
+
+  Output: \verbinclude PolynomialSolver1.out
+*/
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_POLYNOMIALS_MODULE_H

diff --git a/unsupported/Eigen/Skyline b/unsupported/Eigen/Skyline
new file mode 100644
index 0000000..ebdf143
--- /dev/null
+++ b/unsupported/Eigen/Skyline

@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINE_MODULE_H
+#define EIGEN_SKYLINE_MODULE_H
+
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+
+/**
+ *  \defgroup Skyline_Module Skyline module
+ *
+ *
+ *
+ *
+ */
+
+#include "src/Skyline/SkylineUtil.h"
+#include "src/Skyline/SkylineMatrixBase.h"
+#include "src/Skyline/SkylineStorage.h"
+#include "src/Skyline/SkylineMatrix.h"
+#include "src/Skyline/SkylineInplaceLU.h"
+#include "src/Skyline/SkylineProduct.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SKYLINE_MODULE_H

diff --git a/unsupported/Eigen/SparseExtra b/unsupported/Eigen/SparseExtra
new file mode 100644
index 0000000..ba5cbd6
--- /dev/null
+++ b/unsupported/Eigen/SparseExtra

@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_EXTRA_MODULE_H
+#define EIGEN_SPARSE_EXTRA_MODULE_H
+
+#include "../../Eigen/Sparse"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include <vector>
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <fstream>
+#include <sstream>
+
+#ifdef EIGEN_GOOGLEHASH_SUPPORT
+  #include <google/dense_hash_map>
+  #include <google/sparse_hash_map>
+#endif
+
+/**
+  * \defgroup SparseExtra_Module SparseExtra module
+  *
+  * This module contains some experimental features extending the sparse module.
+  *
+  * \code
+  * #include <Eigen/SparseExtra>
+  * \endcode
+  */
+
+
+#include "src/SparseExtra/DynamicSparseMatrix.h"
+#include "src/SparseExtra/BlockOfDynamicSparseMatrix.h"
+#include "src/SparseExtra/RandomSetter.h"
+
+#include "src/SparseExtra/MarketIO.h"
+
+#if !defined(_WIN32)
+#include <dirent.h>
+#include "src/SparseExtra/MatrixMarketIterator.h"
+#endif
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSE_EXTRA_MODULE_H

diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
index 7c7493c..f6a2460 100644
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions

@@ -10,6 +10,8 @@
 #ifndef EIGEN_SPECIALFUNCTIONS_MODULE
 #define EIGEN_SPECIALFUNCTIONS_MODULE
 
+#include <math.h>
+
 #include "../../Eigen/Core"
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
@@ -27,12 +29,29 @@
   * - erfc
   * - lgamma
   * - igamma
+  * - igamma_der_a
+  * - gamma_sample_der_alpha
   * - igammac
   * - digamma
+  * - ndtri
   * - polygamma
   * - zeta
   * - betainc
   *
+  * Bessel Functions
+  * - bessel_i0
+  * - bessel_i0e
+  * - bessel_i1
+  * - bessel_i1e
+  * - bessel_j0
+  * - bessel_j1
+  * - bessel_k0
+  * - bessel_k0e
+  * - bessel_k1
+  * - bessel_k1e
+  * - bessel_y0
+  * - bessel_y1
+  *
   * \code
   * #include <unsupported/Eigen/SpecialFunctions>
   * \endcode
@@ -41,14 +60,37 @@
 
 }
 
+#include "src/SpecialFunctions/BesselFunctionsImpl.h"
+#include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
+#include "src/SpecialFunctions/BesselFunctionsHalf.h"
+#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
+#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
+#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
 #include "src/SpecialFunctions/SpecialFunctionsImpl.h"
-#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#if defined(EIGEN_HIPCC)
+#include "src/SpecialFunctions/HipVectorCompatibility.h"
+#endif
+#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h"
 #include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
 #include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
 #include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
 
-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512
+  #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+  #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/SpecialFunctions/arch/NEON/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
 #endif
 
 namespace Eigen {

diff --git a/unsupported/Eigen/Splines b/unsupported/Eigen/Splines
new file mode 100644
index 0000000..2ca5813
--- /dev/null
+++ b/unsupported/Eigen/Splines

@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINES_MODULE_H
+#define EIGEN_SPLINES_MODULE_H
+
+namespace Eigen 
+{
+/**
+  * \defgroup Splines_Module Spline and spline fitting module
+  *
+  * This module provides a simple multi-dimensional spline class while
+  * offering most basic functionality to fit a spline to point sets.
+  *
+  * \code
+  * #include <unsupported/Eigen/Splines>
+  * \endcode
+  */
+}
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "src/Splines/SplineFwd.h"
+#include "src/Splines/Spline.h"
+#include "src/Splines/SplineFitting.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPLINES_MODULE_H

diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
new file mode 100644
index 0000000..33b6c39
--- /dev/null
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h

@@ -0,0 +1,108 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_JACOBIAN_H
+#define EIGEN_AUTODIFF_JACOBIAN_H
+
+namespace Eigen
+{
+
+template<typename Functor> class AutoDiffJacobian : public Functor
+{
+public:
+  AutoDiffJacobian() : Functor() {}
+  AutoDiffJacobian(const Functor& f) : Functor(f) {}
+
+  // forward constructors
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... T>
+  AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
+#else
+  template<typename T0>
+  AutoDiffJacobian(const T0& a0) : Functor(a0) {}
+  template<typename T0, typename T1>
+  AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
+  template<typename T0, typename T1, typename T2>
+  AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
+#endif
+
+  typedef typename Functor::InputType InputType;
+  typedef typename Functor::ValueType ValueType;
+  typedef typename ValueType::Scalar Scalar;
+
+  enum {
+    InputsAtCompileTime = InputType::RowsAtCompileTime,
+    ValuesAtCompileTime = ValueType::RowsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
+  typedef typename JacobianType::Index Index;
+
+  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
+  typedef AutoDiffScalar<DerivativeType> ActiveScalar;
+
+  typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
+  typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  // Some compilers don't accept variadic parameters after a default parameter,
+  // i.e., we can't just write _jac=0 but we need to overload operator():
+  EIGEN_STRONG_INLINE
+  void operator() (const InputType& x, ValueType* v) const
+  {
+      this->operator()(x, v, 0);
+  }
+  template<typename... ParamsType>
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
+                   const ParamsType&... Params) const
+#else
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+#endif
+  {
+    eigen_assert(v!=0);
+
+    if (!_jac)
+    {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+      Functor::operator()(x, v, Params...);
+#else
+      Functor::operator()(x, v);
+#endif
+      return;
+    }
+
+    JacobianType& jac = *_jac;
+
+    ActiveInput ax = x.template cast<ActiveScalar>();
+    ActiveValue av(jac.rows());
+
+    if(InputsAtCompileTime==Dynamic)
+      for (Index j=0; j<jac.rows(); j++)
+        av[j].derivatives().resize(x.rows());
+
+    for (Index i=0; i<jac.cols(); i++)
+      ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    Functor::operator()(ax, &av, Params...);
+#else
+    Functor::operator()(ax, &av);
+#endif
+
+    for (Index i=0; i<jac.rows(); i++)
+    {
+      (*v)[i] = av[i].value();
+      jac.row(i) = av[i].derivatives();
+    }
+  }
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_JACOBIAN_H

diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
new file mode 100755
index 0000000..0f166e3
--- /dev/null
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h

@@ -0,0 +1,730 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_SCALAR_H
+#define EIGEN_AUTODIFF_SCALAR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename A, typename B>
+struct make_coherent_impl {
+  static void run(A&, B&) {}
+};
+
+// resize a to match b is a.size()==0, and conversely.
+template<typename A, typename B>
+void make_coherent(const A& a, const B&b)
+{
+  make_coherent_impl<A,B>::run(a.const_cast_derived(), b.const_cast_derived());
+}
+
+template<typename DerivativeType, bool Enable> struct auto_diff_special_op;
+
+} // end namespace internal
+
+template<typename DerivativeType> class AutoDiffScalar;
+
+template<typename NewDerType>
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
+  return AutoDiffScalar<NewDerType>(value,der);
+}
+
+/** \class AutoDiffScalar
+  * \brief A scalar type replacement with automatic differentiation capability
+  *
+  * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type
+  *                 as well as the number of derivatives to compute are determined from this type.
+  *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
+  *                 if the number of derivatives is not known at compile time, and/or, the number
+  *                 of derivatives is large.
+  *                 Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a
+  *                 existing vector into an AutoDiffScalar.
+  *                 Finally, DerivativeType can also be any Eigen compatible expression.
+  *
+  * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
+  * template mechanism.
+  *
+  * It supports the following list of global math function:
+  *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+  *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+  *  - internal::conj, internal::real, internal::imag, numext::abs2.
+  *
+  * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+  * in that case, the expression template mechanism only occurs at the top Matrix level,
+  * while derivatives are computed right away.
+  *
+  */
+
+template<typename DerivativeType>
+class AutoDiffScalar
+  : public internal::auto_diff_special_op
+            <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+                                          typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value>
+{
+  public:
+    typedef internal::auto_diff_special_op
+            <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+                       typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value> Base;
+    typedef typename internal::remove_all<DerivativeType>::type DerType;
+    typedef typename internal::traits<DerType>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real Real;
+
+    using Base::operator+;
+    using Base::operator*;
+
+    /** Default constructor without any initialization. */
+    AutoDiffScalar() {}
+
+    /** Constructs an active scalar from its \a value,
+        and initializes the \a nbDer derivatives such that it corresponds to the \a derNumber -th variable */
+    AutoDiffScalar(const Scalar& value, int nbDer, int derNumber)
+      : m_value(value), m_derivatives(DerType::Zero(nbDer))
+    {
+      m_derivatives.coeffRef(derNumber) = Scalar(1);
+    }
+
+    /** Conversion from a scalar constant to an active scalar.
+      * The derivatives are set to zero. */
+    /*explicit*/ AutoDiffScalar(const Real& value)
+      : m_value(value)
+    {
+      if(m_derivatives.size()>0)
+        m_derivatives.setZero();
+    }
+
+    /** Constructs an active scalar from its \a value and derivatives \a der */
+    AutoDiffScalar(const Scalar& value, const DerType& der)
+      : m_value(value), m_derivatives(der)
+    {}
+
+    template<typename OtherDerType>
+    AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    , typename internal::enable_if<
+            internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value
+        &&  internal::is_convertible<OtherDerType,DerType>::value , void*>::type = 0
+#endif
+    )
+      : m_value(other.value()), m_derivatives(other.derivatives())
+    {}
+
+    friend  std::ostream & operator << (std::ostream & s, const AutoDiffScalar& a)
+    {
+      return s << a.value();
+    }
+
+    AutoDiffScalar(const AutoDiffScalar& other)
+      : m_value(other.value()), m_derivatives(other.derivatives())
+    {}
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar& operator=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      m_value = other.value();
+      m_derivatives = other.derivatives();
+      return *this;
+    }
+
+    inline AutoDiffScalar& operator=(const AutoDiffScalar& other)
+    {
+      m_value = other.value();
+      m_derivatives = other.derivatives();
+      return *this;
+    }
+
+    inline AutoDiffScalar& operator=(const Scalar& other)
+    {
+      m_value = other;
+      if(m_derivatives.size()>0)
+        m_derivatives.setZero();
+      return *this;
+    }
+
+//     inline operator const Scalar& () const { return m_value; }
+//     inline operator Scalar& () { return m_value; }
+
+    inline const Scalar& value() const { return m_value; }
+    inline Scalar& value() { return m_value; }
+
+    inline const DerType& derivatives() const { return m_derivatives; }
+    inline DerType& derivatives() { return m_derivatives; }
+
+    inline bool operator< (const Scalar& other) const  { return m_value <  other; }
+    inline bool operator<=(const Scalar& other) const  { return m_value <= other; }
+    inline bool operator> (const Scalar& other) const  { return m_value >  other; }
+    inline bool operator>=(const Scalar& other) const  { return m_value >= other; }
+    inline bool operator==(const Scalar& other) const  { return m_value == other; }
+    inline bool operator!=(const Scalar& other) const  { return m_value != other; }
+
+    friend inline bool operator< (const Scalar& a, const AutoDiffScalar& b) { return a <  b.value(); }
+    friend inline bool operator<=(const Scalar& a, const AutoDiffScalar& b) { return a <= b.value(); }
+    friend inline bool operator> (const Scalar& a, const AutoDiffScalar& b) { return a >  b.value(); }
+    friend inline bool operator>=(const Scalar& a, const AutoDiffScalar& b) { return a >= b.value(); }
+    friend inline bool operator==(const Scalar& a, const AutoDiffScalar& b) { return a == b.value(); }
+    friend inline bool operator!=(const Scalar& a, const AutoDiffScalar& b) { return a != b.value(); }
+
+    template<typename OtherDerType> inline bool operator< (const AutoDiffScalar<OtherDerType>& b) const  { return m_value <  b.value(); }
+    template<typename OtherDerType> inline bool operator<=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value <= b.value(); }
+    template<typename OtherDerType> inline bool operator> (const AutoDiffScalar<OtherDerType>& b) const  { return m_value >  b.value(); }
+    template<typename OtherDerType> inline bool operator>=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value >= b.value(); }
+    template<typename OtherDerType> inline bool operator==(const AutoDiffScalar<OtherDerType>& b) const  { return m_value == b.value(); }
+    template<typename OtherDerType> inline bool operator!=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value != b.value(); }
+
+    inline const AutoDiffScalar<DerType&> operator+(const Scalar& other) const
+    {
+      return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+    }
+
+    friend inline const AutoDiffScalar<DerType&> operator+(const Scalar& a, const AutoDiffScalar& b)
+    {
+      return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+    }
+
+//     inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+//     {
+//       return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+//     }
+
+//     friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar& b)
+//     {
+//       return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+//     }
+
+    inline AutoDiffScalar& operator+=(const Scalar& other)
+    {
+      value() += other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >
+    operator+(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >(
+        m_value + other.value(),
+        m_derivatives + other.derivatives());
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar&
+    operator+=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      (*this) = (*this) + other;
+      return *this;
+    }
+
+    inline const AutoDiffScalar<DerType&> operator-(const Scalar& b) const
+    {
+      return AutoDiffScalar<DerType&>(m_value - b, m_derivatives);
+    }
+
+    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+    operator-(const Scalar& a, const AutoDiffScalar& b)
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+            (a - b.value(), -b.derivatives());
+    }
+
+    inline AutoDiffScalar& operator-=(const Scalar& other)
+    {
+      value() -= other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >
+    operator-(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >(
+        m_value - other.value(),
+        m_derivatives - other.derivatives());
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar&
+    operator-=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      *this = *this - other;
+      return *this;
+    }
+
+    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+    operator-() const
+    {
+      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >(
+        -m_value,
+        -m_derivatives);
+    }
+
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+    operator*(const Scalar& other) const
+    {
+      return MakeAutoDiffScalar(m_value * other, m_derivatives * other);
+    }
+
+    friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+    operator*(const Scalar& other, const AutoDiffScalar& a)
+    {
+      return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);
+    }
+
+//     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator*(const Real& other) const
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         m_value * other,
+//         (m_derivatives * other));
+//     }
+//
+//     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator*(const Real& other, const AutoDiffScalar& a)
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         a.value() * other,
+//         a.derivatives() * other);
+//     }
+
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+    operator/(const Scalar& other) const
+    {
+      return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other)));
+    }
+
+    friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+    operator/(const Scalar& other, const AutoDiffScalar& a)
+    {
+      return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+    }
+
+//     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator/(const Real& other) const
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         m_value / other,
+//         (m_derivatives * (Real(1)/other)));
+//     }
+//
+//     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+//     operator/(const Real& other, const AutoDiffScalar& a)
+//     {
+//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+//         other / a.value(),
+//         a.derivatives() * (-Real(1)/other));
+//     }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(
+        CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) >
+    operator/(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return MakeAutoDiffScalar(
+        m_value / other.value(),
+          ((m_derivatives * other.value()) - (other.derivatives() * m_value))
+        * (Scalar(1)/(other.value()*other.value())));
+    }
+
+    template<typename OtherDerType>
+    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+        const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product),
+        const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > >
+    operator*(const AutoDiffScalar<OtherDerType>& other) const
+    {
+      internal::make_coherent(m_derivatives, other.derivatives());
+      return MakeAutoDiffScalar(
+        m_value * other.value(),
+        (m_derivatives * other.value()) + (other.derivatives() * m_value));
+    }
+
+    inline AutoDiffScalar& operator*=(const Scalar& other)
+    {
+      *this = *this * other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar& operator*=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      *this = *this * other;
+      return *this;
+    }
+
+    inline AutoDiffScalar& operator/=(const Scalar& other)
+    {
+      *this = *this / other;
+      return *this;
+    }
+
+    template<typename OtherDerType>
+    inline AutoDiffScalar& operator/=(const AutoDiffScalar<OtherDerType>& other)
+    {
+      *this = *this / other;
+      return *this;
+    }
+
+  protected:
+    Scalar m_value;
+    DerType m_derivatives;
+
+};
+
+namespace internal {
+
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, true>
+//   : auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
+//                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value>
+{
+  typedef typename remove_all<DerivativeType>::type DerType;
+  typedef typename traits<DerType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
+
+//   typedef auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
+//                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
+
+//   using Base::operator+;
+//   using Base::operator+=;
+//   using Base::operator-;
+//   using Base::operator-=;
+//   using Base::operator*;
+//   using Base::operator*=;
+
+  const AutoDiffScalar<DerivativeType>& derived() const { return *static_cast<const AutoDiffScalar<DerivativeType>*>(this); }
+  AutoDiffScalar<DerivativeType>& derived() { return *static_cast<AutoDiffScalar<DerivativeType>*>(this); }
+
+
+  inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+  {
+    return AutoDiffScalar<DerType&>(derived().value() + other, derived().derivatives());
+  }
+
+  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<DerivativeType>& b)
+  {
+    return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+  }
+
+  inline AutoDiffScalar<DerivativeType>& operator+=(const Real& other)
+  {
+    derived().value() += other;
+    return derived();
+  }
+
+
+  inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >
+  operator*(const Real& other) const
+  {
+    return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >(
+      derived().value() * other,
+      derived().derivatives() * other);
+  }
+
+  friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
+  operator*(const Real& other, const AutoDiffScalar<DerivativeType>& a)
+  {
+    return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
+      a.value() * other,
+      a.derivatives() * other);
+  }
+
+  inline AutoDiffScalar<DerivativeType>& operator*=(const Scalar& other)
+  {
+    *this = *this * other;
+    return derived();
+  }
+};
+
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, false>
+{
+  void operator*() const;
+  void operator-() const;
+  void operator+() const;
+};
+
+template<typename BinOp, typename A, typename B, typename RefType>
+void make_coherent_expression(CwiseBinaryOp<BinOp,A,B> xpr, const RefType &ref)
+{
+  make_coherent(xpr.const_cast_derived().lhs(), ref);
+  make_coherent(xpr.const_cast_derived().rhs(), ref);
+}
+
+template<typename UnaryOp, typename A, typename RefType>
+void make_coherent_expression(const CwiseUnaryOp<UnaryOp,A> &xpr, const RefType &ref)
+{
+  make_coherent(xpr.nestedExpression().const_cast_derived(), ref);
+}
+
+// needed for compilation only
+template<typename UnaryOp, typename A, typename RefType>
+void make_coherent_expression(const CwiseNullaryOp<UnaryOp,A> &, const RefType &)
+{}
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols, typename B>
+struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>, B> {
+  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
+  static void run(A& a, B& b) {
+    if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
+    {
+      a.resize(b.size());
+      a.setZero();
+    }
+    else if (B::SizeAtCompileTime==Dynamic && a.size()!=0 && b.size()==0)
+    {
+      make_coherent_expression(b,a);
+    }
+  }
+};
+
+template<typename A, typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
+struct make_coherent_impl<A, Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+  typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
+  static void run(A& a, B& b) {
+    if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
+    {
+      b.resize(a.size());
+      b.setZero();
+    }
+    else if (A::SizeAtCompileTime==Dynamic && b.size()!=0 && a.size()==0)
+    {
+      make_coherent_expression(a,b);
+    }
+  }
+};
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols,
+         typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
+struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,
+                          Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
+  typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
+  static void run(A& a, B& b) {
+    if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
+    {
+      a.resize(b.size());
+      a.setZero();
+    }
+    else if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
+    {
+      b.resize(a.size());
+      b.setZero();
+    }
+  }
+};
+
+} // end namespace internal
+
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp>
+{
+  typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp>
+{
+  typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+
+// The following is an attempt to let Eigen's known about expression template, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+//   enum { Defined = 1 };
+//   typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+//   enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+//   typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
+
+#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
+  template<typename DerType> \
+  inline const Eigen::AutoDiffScalar< \
+  EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
+  FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
+    using namespace Eigen; \
+    typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+    EIGEN_UNUSED_VARIABLE(sizeof(Scalar)); \
+    CODE; \
+  }
+
+template<typename DerType>
+struct CleanedUpDerType {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> type;
+};
+
+template<typename DerType>
+inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x)  { return x; }
+template<typename DerType>
+inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  { return x; }
+template<typename DerType>
+inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x <= y ? ADS(x) : ADS(y));
+}
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x >= y ? ADS(x) : ADS(y));
+}
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x < y ? ADS(x) : ADS(y));
+}
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x > y ? ADS(x) : ADS(y));
+}
+template<typename DerType>
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() < y.value() ? x : y);
+}
+template<typename DerType>
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() >= y.value() ? x : y);
+}
+
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
+  using std::abs;
+  return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
+  using numext::abs2;
+  return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
+  using std::sqrt;
+  Scalar sqrtx = sqrt(x.value());
+  return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
+  using std::cos;
+  using std::sin;
+  return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
+  using std::sin;
+  using std::cos;
+  return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
+  using std::exp;
+  Scalar expx = exp(x.value());
+  return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
+  using std::log;
+  return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+
+template<typename DerType>
+inline const Eigen::AutoDiffScalar<
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) >
+pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
+{
+  using namespace Eigen;
+  using std::pow;
+  return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1)));
+}
+
+
+template<typename DerTypeA,typename DerTypeB>
+inline const AutoDiffScalar<Matrix<typename internal::traits<typename internal::remove_all<DerTypeA>::type>::Scalar,Dynamic,1> >
+atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
+{
+  using std::atan2;
+  typedef typename internal::traits<typename internal::remove_all<DerTypeA>::type>::Scalar Scalar;
+  typedef AutoDiffScalar<Matrix<Scalar,Dynamic,1> > PlainADS;
+  PlainADS ret;
+  ret.value() = atan2(a.value(), b.value());
+  
+  Scalar squared_hypot = a.value() * a.value() + b.value() * b.value();
+  
+  // if (squared_hypot==0) the derivation is undefined and the following results in a NaN:
+  ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) / squared_hypot;
+
+  return ret;
+}
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
+  using std::tan;
+  using std::cos;
+  return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
+  using std::sqrt;
+  using std::asin;
+  return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+  
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
+  using std::sqrt;
+  using std::acos;
+  return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh,
+  using std::cosh;
+  using std::tanh;
+  return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh,
+  using std::sinh;
+  using std::cosh;
+  return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh,
+  using std::sinh;
+  using std::cosh;
+  return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));)
+
+#undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
+
+template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
+  : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real >
+{
+  typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
+  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime,
+                                0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real;
+  typedef AutoDiffScalar<DerType> NonInteger;
+  typedef AutoDiffScalar<DerType> Nested;
+  typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
+  enum{
+    RequireInitialization = 1
+  };
+};
+
+}
+
+namespace std {
+
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T> >
+  : public numeric_limits<typename T::Scalar> {};
+
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T&> >
+  : public numeric_limits<typename T::Scalar> {};
+
+}  // namespace std
+
+#endif // EIGEN_AUTODIFF_SCALAR_H

diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h b/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
new file mode 100644
index 0000000..8c2d048
--- /dev/null
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h

@@ -0,0 +1,220 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_VECTOR_H
+#define EIGEN_AUTODIFF_VECTOR_H
+
+namespace Eigen {
+
+/* \class AutoDiffScalar
+  * \brief A scalar type replacement with automatic differentation capability
+  *
+  * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f)
+  *
+  * This class represents a scalar value while tracking its respective derivatives.
+  *
+  * It supports the following list of global math function:
+  *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+  *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+  *  - internal::conj, internal::real, internal::imag, numext::abs2.
+  *
+  * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+  * in that case, the expression template mechanism only occurs at the top Matrix level,
+  * while derivatives are computed right away.
+  *
+  */
+template<typename ValueType, typename JacobianType>
+class AutoDiffVector
+{
+  public:
+    //typedef typename internal::traits<ValueType>::Scalar Scalar;
+    typedef typename internal::traits<ValueType>::Scalar BaseScalar;
+    typedef AutoDiffScalar<Matrix<BaseScalar,JacobianType::RowsAtCompileTime,1> > ActiveScalar;
+    typedef ActiveScalar Scalar;
+    typedef AutoDiffScalar<typename JacobianType::ColXpr> CoeffType;
+    typedef typename JacobianType::Index Index;
+
+    inline AutoDiffVector() {}
+
+    inline AutoDiffVector(const ValueType& values)
+      : m_values(values)
+    {
+      m_jacobian.setZero();
+    }
+
+
+    CoeffType operator[] (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+    const CoeffType operator[] (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+    CoeffType operator() (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+    const CoeffType operator() (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+    CoeffType coeffRef(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+    const CoeffType coeffRef(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+    Index size() const { return m_values.size(); }
+
+    // FIXME here we could return an expression of the sum
+    Scalar sum() const { /*std::cerr << "sum \n\n";*/ /*std::cerr << m_jacobian.rowwise().sum() << "\n\n";*/ return Scalar(m_values.sum(), m_jacobian.rowwise().sum()); }
+
+
+    inline AutoDiffVector(const ValueType& values, const JacobianType& jac)
+      : m_values(values), m_jacobian(jac)
+    {}
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+      : m_values(other.values()), m_jacobian(other.jacobian())
+    {}
+
+    inline AutoDiffVector(const AutoDiffVector& other)
+      : m_values(other.values()), m_jacobian(other.jacobian())
+    {}
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector& operator=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+    {
+      m_values = other.values();
+      m_jacobian = other.jacobian();
+      return *this;
+    }
+
+    inline AutoDiffVector& operator=(const AutoDiffVector& other)
+    {
+      m_values = other.values();
+      m_jacobian = other.jacobian();
+      return *this;
+    }
+
+    inline const ValueType& values() const { return m_values; }
+    inline ValueType& values() { return m_values; }
+
+    inline const JacobianType& jacobian() const { return m_jacobian; }
+    inline JacobianType& jacobian() { return m_jacobian; }
+
+    template<typename OtherValueType,typename OtherJacobianType>
+    inline const AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >
+    operator+(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+    {
+      return AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >(
+        m_values + other.values(),
+        m_jacobian + other.jacobian());
+    }
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector&
+    operator+=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+    {
+      m_values += other.values();
+      m_jacobian += other.jacobian();
+      return *this;
+    }
+
+    template<typename OtherValueType,typename OtherJacobianType>
+    inline const AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >
+    operator-(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+    {
+      return AutoDiffVector<
+        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
+        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >(
+          m_values - other.values(),
+          m_jacobian - other.jacobian());
+    }
+
+    template<typename OtherValueType, typename OtherJacobianType>
+    inline AutoDiffVector&
+    operator-=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+    {
+      m_values -= other.values();
+      m_jacobian -= other.jacobian();
+      return *this;
+    }
+
+    inline const AutoDiffVector<
+      typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+      typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >
+    operator-() const
+    {
+      return AutoDiffVector<
+        typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+        typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >(
+          -m_values,
+          -m_jacobian);
+    }
+
+    inline const AutoDiffVector<
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>
+    operator*(const BaseScalar& other) const
+    {
+      return AutoDiffVector<
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
+          m_values * other,
+          m_jacobian * other);
+    }
+
+    friend inline const AutoDiffVector<
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >
+    operator*(const Scalar& other, const AutoDiffVector& v)
+    {
+      return AutoDiffVector<
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
+          v.values() * other,
+          v.jacobian() * other);
+    }
+
+//     template<typename OtherValueType,typename OtherJacobianType>
+//     inline const AutoDiffVector<
+//       CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+//       CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+//         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+//         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >
+//     operator*(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+//     {
+//       return AutoDiffVector<
+//         CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+//         CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+//           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+//           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >(
+//             m_values.cwise() * other.values(),
+//             (m_jacobian * other.values()) + (m_values * other.jacobian()));
+//     }
+
+    inline AutoDiffVector& operator*=(const Scalar& other)
+    {
+      m_values *= other;
+      m_jacobian *= other;
+      return *this;
+    }
+
+    template<typename OtherValueType,typename OtherJacobianType>
+    inline AutoDiffVector& operator*=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+    {
+      *this = *this * other;
+      return *this;
+    }
+
+  protected:
+    ValueType m_values;
+    JacobianType m_jacobian;
+
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_VECTOR_H

diff --git a/unsupported/Eigen/src/BVH/BVAlgorithms.h b/unsupported/Eigen/src/BVH/BVAlgorithms.h
new file mode 100644
index 0000000..994c8af
--- /dev/null
+++ b/unsupported/Eigen/src/BVH/BVAlgorithms.h

@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BVALGORITHMS_H
+#define EIGEN_BVALGORITHMS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename BVH, typename Intersector>
+bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::Index root)
+{
+  typedef typename BVH::Index Index;
+  typedef typename BVH::VolumeIterator VolIter;
+  typedef typename BVH::ObjectIterator ObjIter;
+
+  VolIter vBegin = VolIter(), vEnd = VolIter();
+  ObjIter oBegin = ObjIter(), oEnd = ObjIter();
+
+  std::vector<Index> todo(1, root);
+
+  while(!todo.empty()) {
+    tree.getChildren(todo.back(), vBegin, vEnd, oBegin, oEnd);
+    todo.pop_back();
+
+    for(; vBegin != vEnd; ++vBegin) //go through child volumes
+      if(intersector.intersectVolume(tree.getVolume(*vBegin)))
+        todo.push_back(*vBegin);
+
+    for(; oBegin != oEnd; ++oBegin) //go through child objects
+      if(intersector.intersectObject(*oBegin))
+        return true; //intersector said to stop query
+  }
+  return false;
+}
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+template<typename Volume1, typename Object1, typename Object2, typename Intersector>
+struct intersector_helper1
+{
+  intersector_helper1(const Object2 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
+  bool intersectVolume(const Volume1 &vol) { return intersector.intersectVolumeObject(vol, stored); }
+  bool intersectObject(const Object1 &obj) { return intersector.intersectObjectObject(obj, stored); }
+  Object2 stored;
+  Intersector &intersector;
+private:
+  intersector_helper1& operator=(const intersector_helper1&);
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Intersector>
+struct intersector_helper2
+{
+  intersector_helper2(const Object1 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
+  bool intersectVolume(const Volume2 &vol) { return intersector.intersectObjectVolume(stored, vol); }
+  bool intersectObject(const Object2 &obj) { return intersector.intersectObjectObject(stored, obj); }
+  Object1 stored;
+  Intersector &intersector;
+private:
+  intersector_helper2& operator=(const intersector_helper2&);
+};
+
+} // end namespace internal
+
+/**  Given a BVH, runs the query encapsulated by \a intersector.
+  *  The Intersector type must provide the following members: \code
+     bool intersectVolume(const BVH::Volume &volume) //returns true if volume intersects the query
+     bool intersectObject(const BVH::Object &object) //returns true if the search should terminate immediately
+  \endcode
+  */
+template<typename BVH, typename Intersector>
+void BVIntersect(const BVH &tree, Intersector &intersector)
+{
+  internal::intersect_helper(tree, intersector, tree.getRootIndex());
+}
+
+/**  Given two BVH's, runs the query on their Cartesian product encapsulated by \a intersector.
+  *  The Intersector type must provide the following members: \code
+     bool intersectVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) //returns true if product of volumes intersects the query
+     bool intersectVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2) //returns true if the volume-object product intersects the query
+     bool intersectObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2) //returns true if the volume-object product intersects the query
+     bool intersectObjectObject(const BVH1::Object &o1, const BVH2::Object &o2) //returns true if the search should terminate immediately
+  \endcode
+  */
+template<typename BVH1, typename BVH2, typename Intersector>
+void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector) //TODO: tandem descent when it makes sense
+{
+  typedef typename BVH1::Index Index1;
+  typedef typename BVH2::Index Index2;
+  typedef internal::intersector_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Intersector> Helper1;
+  typedef internal::intersector_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Intersector> Helper2;
+  typedef typename BVH1::VolumeIterator VolIter1;
+  typedef typename BVH1::ObjectIterator ObjIter1;
+  typedef typename BVH2::VolumeIterator VolIter2;
+  typedef typename BVH2::ObjectIterator ObjIter2;
+
+  VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+  ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+  VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+  ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+
+  std::vector<std::pair<Index1, Index2> > todo(1, std::make_pair(tree1.getRootIndex(), tree2.getRootIndex()));
+
+  while(!todo.empty()) {
+    tree1.getChildren(todo.back().first, vBegin1, vEnd1, oBegin1, oEnd1);
+    tree2.getChildren(todo.back().second, vBegin2, vEnd2, oBegin2, oEnd2);
+    todo.pop_back();
+
+    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+      const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        if(intersector.intersectVolumeVolume(vol1, tree2.getVolume(*vCur2)))
+          todo.push_back(std::make_pair(*vBegin1, *vCur2));
+      }
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        Helper1 helper(*oCur2, intersector);
+        if(internal::intersect_helper(tree1, helper, *vBegin1))
+          return; //intersector said to stop query
+      }
+    }
+
+    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Helper2 helper(*oBegin1, intersector);
+        if(internal::intersect_helper(tree2, helper, *vCur2))
+          return; //intersector said to stop query
+      }
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        if(intersector.intersectObjectObject(*oBegin1, *oCur2))
+          return; //intersector said to stop query
+      }
+    }
+  }
+}
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar minimize_helper(const BVH &tree, Minimizer &minimizer, typename BVH::Index root, typename Minimizer::Scalar minimum)
+{
+  typedef typename Minimizer::Scalar Scalar;
+  typedef typename BVH::Index Index;
+  typedef std::pair<Scalar, Index> QueueElement; //first element is priority
+  typedef typename BVH::VolumeIterator VolIter;
+  typedef typename BVH::ObjectIterator ObjIter;
+
+  VolIter vBegin = VolIter(), vEnd = VolIter();
+  ObjIter oBegin = ObjIter(), oEnd = ObjIter();
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+  todo.push(std::make_pair(Scalar(), root));
+
+  while(!todo.empty()) {
+    tree.getChildren(todo.top().second, vBegin, vEnd, oBegin, oEnd);
+    todo.pop();
+
+    for(; oBegin != oEnd; ++oBegin) //go through child objects
+      minimum = (std::min)(minimum, minimizer.minimumOnObject(*oBegin));
+
+    for(; vBegin != vEnd; ++vBegin) { //go through child volumes
+      Scalar val = minimizer.minimumOnVolume(tree.getVolume(*vBegin));
+      if(val < minimum)
+        todo.push(std::make_pair(val, *vBegin));
+    }
+  }
+
+  return minimum;
+}
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+
+template<typename Volume1, typename Object1, typename Object2, typename Minimizer>
+struct minimizer_helper1
+{
+  typedef typename Minimizer::Scalar Scalar;
+  minimizer_helper1(const Object2 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+  Scalar minimumOnVolume(const Volume1 &vol) { return minimizer.minimumOnVolumeObject(vol, stored); }
+  Scalar minimumOnObject(const Object1 &obj) { return minimizer.minimumOnObjectObject(obj, stored); }
+  Object2 stored;
+  Minimizer &minimizer;
+private:
+  minimizer_helper1& operator=(const minimizer_helper1&);
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Minimizer>
+struct minimizer_helper2
+{
+  typedef typename Minimizer::Scalar Scalar;
+  minimizer_helper2(const Object1 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+  Scalar minimumOnVolume(const Volume2 &vol) { return minimizer.minimumOnObjectVolume(stored, vol); }
+  Scalar minimumOnObject(const Object2 &obj) { return minimizer.minimumOnObjectObject(stored, obj); }
+  Object1 stored;
+  Minimizer &minimizer;
+private:
+  minimizer_helper2& operator=(const minimizer_helper2&);
+};
+
+} // end namespace internal
+
+/**  Given a BVH, runs the query encapsulated by \a minimizer.
+  *  \returns the minimum value.
+  *  The Minimizer type must provide the following members: \code
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+     Scalar minimumOnVolume(const BVH::Volume &volume)
+     Scalar minimumOnObject(const BVH::Object &object)
+  \endcode
+  */
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH &tree, Minimizer &minimizer)
+{
+  return internal::minimize_helper(tree, minimizer, tree.getRootIndex(), (std::numeric_limits<typename Minimizer::Scalar>::max)());
+}
+
+/**  Given two BVH's, runs the query on their cartesian product encapsulated by \a minimizer.
+  *  \returns the minimum value.
+  *  The Minimizer type must provide the following members: \code
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+     Scalar minimumOnVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2)
+     Scalar minimumOnVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2)
+     Scalar minimumOnObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
+     Scalar minimumOnObjectObject(const BVH1::Object &o1, const BVH2::Object &o2)
+  \endcode
+  */
+template<typename BVH1, typename BVH2, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Minimizer &minimizer)
+{
+  typedef typename Minimizer::Scalar Scalar;
+  typedef typename BVH1::Index Index1;
+  typedef typename BVH2::Index Index2;
+  typedef internal::minimizer_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Minimizer> Helper1;
+  typedef internal::minimizer_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Minimizer> Helper2;
+  typedef std::pair<Scalar, std::pair<Index1, Index2> > QueueElement; //first element is priority
+  typedef typename BVH1::VolumeIterator VolIter1;
+  typedef typename BVH1::ObjectIterator ObjIter1;
+  typedef typename BVH2::VolumeIterator VolIter2;
+  typedef typename BVH2::ObjectIterator ObjIter2;
+
+  VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+  ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+  VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+  ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+  Scalar minimum = (std::numeric_limits<Scalar>::max)();
+  todo.push(std::make_pair(Scalar(), std::make_pair(tree1.getRootIndex(), tree2.getRootIndex())));
+
+  while(!todo.empty()) {
+    tree1.getChildren(todo.top().second.first, vBegin1, vEnd1, oBegin1, oEnd1);
+    tree2.getChildren(todo.top().second.second, vBegin2, vEnd2, oBegin2, oEnd2);
+    todo.pop();
+
+    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        minimum = (std::min)(minimum, minimizer.minimumOnObjectObject(*oBegin1, *oCur2));
+      }
+
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Helper2 helper(*oBegin1, minimizer);
+        minimum = (std::min)(minimum, internal::minimize_helper(tree2, helper, *vCur2, minimum));
+      }
+    }
+
+    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+      const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        Helper1 helper(*oCur2, minimizer);
+        minimum = (std::min)(minimum, internal::minimize_helper(tree1, helper, *vBegin1, minimum));
+      }
+
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Scalar val = minimizer.minimumOnVolumeVolume(vol1, tree2.getVolume(*vCur2));
+        if(val < minimum)
+          todo.push(std::make_pair(val, std::make_pair(*vBegin1, *vCur2)));
+      }
+    }
+  }
+  return minimum;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BVALGORITHMS_H

diff --git a/unsupported/Eigen/src/BVH/KdBVH.h b/unsupported/Eigen/src/BVH/KdBVH.h
new file mode 100644
index 0000000..2d5b76a
--- /dev/null
+++ b/unsupported/Eigen/src/BVH/KdBVH.h

@@ -0,0 +1,223 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef KDBVH_H_INCLUDED
+#define KDBVH_H_INCLUDED
+
+namespace Eigen { 
+
+namespace internal {
+
+//internal pair class for the BVH--used instead of std::pair because of alignment
+template<typename Scalar, int Dim>
+struct vector_int_pair
+{
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+
+  vector_int_pair(const VectorType &v, int i) : first(v), second(i) {}
+
+  VectorType first;
+  int second;
+};
+
+//these templates help the tree initializer get the bounding boxes either from a provided
+//iterator range or using bounding_box in a unified way
+template<typename ObjectList, typename VolumeList, typename BoxIter>
+struct get_boxes_helper {
+  void operator()(const ObjectList &objects, BoxIter boxBegin, BoxIter boxEnd, VolumeList &outBoxes)
+  {
+    outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
+    eigen_assert(outBoxes.size() == objects.size());
+    EIGEN_ONLY_USED_FOR_DEBUG(objects);
+  }
+};
+
+template<typename ObjectList, typename VolumeList>
+struct get_boxes_helper<ObjectList, VolumeList, int> {
+  void operator()(const ObjectList &objects, int, int, VolumeList &outBoxes)
+  {
+    outBoxes.reserve(objects.size());
+    for(int i = 0; i < (int)objects.size(); ++i)
+      outBoxes.push_back(bounding_box(objects[i]));
+  }
+};
+
+} // end namespace internal
+
+
+/** \class KdBVH
+ *  \brief A simple bounding volume hierarchy based on AlignedBox
+ *
+ *  \param _Scalar The underlying scalar type of the bounding boxes
+ *  \param _Dim The dimension of the space in which the hierarchy lives
+ *  \param _Object The object type that lives in the hierarchy.  It must have value semantics.  Either bounding_box(_Object) must
+ *                 be defined and return an AlignedBox<_Scalar, _Dim> or bounding boxes must be provided to the tree initializer.
+ *
+ *  This class provides a simple (as opposed to optimized) implementation of a bounding volume hierarchy analogous to a Kd-tree.
+ *  Given a sequence of objects, it computes their bounding boxes, constructs a Kd-tree of their centers
+ *  and builds a BVH with the structure of that Kd-tree.  When the elements of the tree are too expensive to be copied around,
+ *  it is useful for _Object to be a pointer.
+ */
+template<typename _Scalar, int _Dim, typename _Object> class KdBVH
+{
+public:
+  enum { Dim = _Dim };
+  typedef _Object Object;
+  typedef std::vector<Object, aligned_allocator<Object> > ObjectList;
+  typedef _Scalar Scalar;
+  typedef AlignedBox<Scalar, Dim> Volume;
+  typedef std::vector<Volume, aligned_allocator<Volume> > VolumeList;
+  typedef int Index;
+  typedef const int *VolumeIterator; //the iterators are just pointers into the tree's vectors
+  typedef const Object *ObjectIterator;
+
+  KdBVH() {}
+
+  /** Given an iterator range over \a Object references, constructs the BVH.  Requires that bounding_box(Object) return a Volume. */
+  template<typename Iter> KdBVH(Iter begin, Iter end) { init(begin, end, 0, 0); } //int is recognized by init as not being an iterator type
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes, constructs the BVH */
+  template<typename OIter, typename BIter> KdBVH(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) { init(begin, end, boxBegin, boxEnd); }
+
+  /** Given an iterator range over \a Object references, constructs the BVH, overwriting whatever is in there currently.
+    * Requires that bounding_box(Object) return a Volume. */
+  template<typename Iter> void init(Iter begin, Iter end) { init(begin, end, 0, 0); }
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes,
+    * constructs the BVH, overwriting whatever is in there currently. */
+  template<typename OIter, typename BIter> void init(OIter begin, OIter end, BIter boxBegin, BIter boxEnd)
+  {
+    objects.clear();
+    boxes.clear();
+    children.clear();
+
+    objects.insert(objects.end(), begin, end);
+    int n = static_cast<int>(objects.size());
+
+    if(n < 2)
+      return; //if we have at most one object, we don't need any internal nodes
+
+    VolumeList objBoxes;
+    VIPairList objCenters;
+
+    //compute the bounding boxes depending on BIter type
+    internal::get_boxes_helper<ObjectList, VolumeList, BIter>()(objects, boxBegin, boxEnd, objBoxes);
+
+    objCenters.reserve(n);
+    boxes.reserve(n - 1);
+    children.reserve(2 * n - 2);
+
+    for(int i = 0; i < n; ++i)
+      objCenters.push_back(VIPair(objBoxes[i].center(), i));
+
+    build(objCenters, 0, n, objBoxes, 0); //the recursive part of the algorithm
+
+    ObjectList tmp(n);
+    tmp.swap(objects);
+    for(int i = 0; i < n; ++i)
+      objects[i] = tmp[objCenters[i].second];
+  }
+
+  /** \returns the index of the root of the hierarchy */
+  inline Index getRootIndex() const { return (int)boxes.size() - 1; }
+
+  /** Given an \a index of a node, on exit, \a outVBegin and \a outVEnd range over the indices of the volume children of the node
+    * and \a outOBegin and \a outOEnd range over the object children of the node */
+  EIGEN_STRONG_INLINE void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+                                       ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+  { //inlining this function should open lots of optimization opportunities to the compiler
+    if(index < 0) {
+      outVBegin = outVEnd;
+      if(!objects.empty())
+        outOBegin = &(objects[0]);
+      outOEnd = outOBegin + objects.size(); //output all objects--necessary when the tree has only one object
+      return;
+    }
+
+    int numBoxes = static_cast<int>(boxes.size());
+
+    int idx = index * 2;
+    if(children[idx + 1] < numBoxes) { //second index is always bigger
+      outVBegin = &(children[idx]);
+      outVEnd = outVBegin + 2;
+      outOBegin = outOEnd;
+    }
+    else if(children[idx] >= numBoxes) { //if both children are objects
+      outVBegin = outVEnd;
+      outOBegin = &(objects[children[idx] - numBoxes]);
+      outOEnd = outOBegin + 2;
+    } else { //if the first child is a volume and the second is an object
+      outVBegin = &(children[idx]);
+      outVEnd = outVBegin + 1;
+      outOBegin = &(objects[children[idx + 1] - numBoxes]);
+      outOEnd = outOBegin + 1;
+    }
+  }
+
+  /** \returns the bounding box of the node at \a index */
+  inline const Volume &getVolume(Index index) const
+  {
+    return boxes[index];
+  }
+
+private:
+  typedef internal::vector_int_pair<Scalar, Dim> VIPair;
+  typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+  struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
+  {
+    VectorComparator(int inDim) : dim(inDim) {}
+    inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
+    int dim;
+  };
+
+  //Build the part of the tree between objects[from] and objects[to] (not including objects[to]).
+  //This routine partitions the objCenters in [from, to) along the dimension dim, recursively constructs
+  //the two halves, and adds their parent node.  TODO: a cache-friendlier layout
+  void build(VIPairList &objCenters, int from, int to, const VolumeList &objBoxes, int dim)
+  {
+    eigen_assert(to - from > 1);
+    if(to - from == 2) {
+      boxes.push_back(objBoxes[objCenters[from].second].merged(objBoxes[objCenters[from + 1].second]));
+      children.push_back(from + (int)objects.size() - 1); //there are objects.size() - 1 tree nodes
+      children.push_back(from + (int)objects.size());
+    }
+    else if(to - from == 3) {
+      int mid = from + 2;
+      std::nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+                        objCenters.begin() + to, VectorComparator(dim)); //partition
+      build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+      int idx1 = (int)boxes.size() - 1;
+      boxes.push_back(boxes[idx1].merged(objBoxes[objCenters[mid].second]));
+      children.push_back(idx1);
+      children.push_back(mid + (int)objects.size() - 1);
+    }
+    else {
+      int mid = from + (to - from) / 2;
+      nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+                  objCenters.begin() + to, VectorComparator(dim)); //partition
+      build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+      int idx1 = (int)boxes.size() - 1;
+      build(objCenters, mid, to, objBoxes, (dim + 1) % Dim);
+      int idx2 = (int)boxes.size() - 1;
+      boxes.push_back(boxes[idx1].merged(boxes[idx2]));
+      children.push_back(idx1);
+      children.push_back(idx2);
+    }
+  }
+
+  std::vector<int> children; //children of x are children[2x] and children[2x+1], indices bigger than boxes.size() index into objects.
+  VolumeList boxes;
+  ObjectList objects;
+};
+
+} // end namespace Eigen
+
+#endif //KDBVH_H_INCLUDED

diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
new file mode 100644
index 0000000..0fbd847
--- /dev/null
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h

@@ -0,0 +1,790 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 David Harmon <dharmon@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
+#define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
+
+#include "../../../../Eigen/Dense"
+
+namespace Eigen { 
+
+namespace internal {
+  template<typename Scalar, typename RealScalar> struct arpack_wrapper;
+  template<typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD> struct OP;
+}
+
+
+
+template<typename MatrixType, typename MatrixSolver=SimplicialLLT<MatrixType>, bool BisSPD=false>
+class ArpackGeneralizedSelfAdjointEigenSolver
+{
+public:
+  //typedef typename MatrixSolver::MatrixType MatrixType;
+
+  /** \brief Scalar type for matrices of type \p MatrixType. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+
+  /** \brief Real scalar type for \p MatrixType.
+   *
+   * This is just \c Scalar if #Scalar is real (e.g., \c float or
+   * \c Scalar), and the type of the real part of \c Scalar if #Scalar is
+   * complex.
+   */
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+   *
+   * This is a column vector with entries of type #RealScalar.
+   * The length of the vector is the size of \p nbrEigenvalues.
+   */
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
+
+  /** \brief Default constructor.
+   *
+   * The default constructor is for cases in which the user intends to
+   * perform decompositions via compute().
+   *
+   */
+  ArpackGeneralizedSelfAdjointEigenSolver()
+   : m_eivec(),
+     m_eivalues(),
+     m_isInitialized(false),
+     m_eigenvectorsOk(false),
+     m_nbrConverged(0),
+     m_nbrIterations(0)
+  { }
+
+  /** \brief Constructor; computes generalized eigenvalues of given matrix with respect to another matrix.
+   *
+   * \param[in] A Self-adjoint matrix whose eigenvalues / eigenvectors will
+   *    computed. By default, the upper triangular part is used, but can be changed
+   *    through the template parameter.
+   * \param[in] B Self-adjoint matrix for the generalized eigenvalue problem.
+   * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+   *    Must be less than the size of the input matrix, or an error is returned.
+   * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+   *    respective meanings to find the largest magnitude , smallest magnitude,
+   *    largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+   *    value can contain floating point value in string form, in which case the
+   *    eigenvalues closest to this value will be found.
+   * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+   *    means machine precision.
+   *
+   * This constructor calls compute(const MatrixType&, const MatrixType&, Index, string, int, RealScalar)
+   * to compute the eigenvalues of the matrix \p A with respect to \p B. The eigenvectors are computed if
+   * \p options equals #ComputeEigenvectors.
+   *
+   */
+  ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A, const MatrixType& B,
+                                          Index nbrEigenvalues, std::string eigs_sigma="LM",
+                               int options=ComputeEigenvectors, RealScalar tol=0.0)
+    : m_eivec(),
+      m_eivalues(),
+      m_isInitialized(false),
+      m_eigenvectorsOk(false),
+      m_nbrConverged(0),
+      m_nbrIterations(0)
+  {
+    compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
+  }
+
+  /** \brief Constructor; computes eigenvalues of given matrix.
+   *
+   * \param[in] A Self-adjoint matrix whose eigenvalues / eigenvectors will
+   *    computed. By default, the upper triangular part is used, but can be changed
+   *    through the template parameter.
+   * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+   *    Must be less than the size of the input matrix, or an error is returned.
+   * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+   *    respective meanings to find the largest magnitude , smallest magnitude,
+   *    largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+   *    value can contain floating point value in string form, in which case the
+   *    eigenvalues closest to this value will be found.
+   * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+   *    means machine precision.
+   *
+   * This constructor calls compute(const MatrixType&, Index, string, int, RealScalar)
+   * to compute the eigenvalues of the matrix \p A. The eigenvectors are computed if
+   * \p options equals #ComputeEigenvectors.
+   *
+   */
+
+  ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A,
+                                          Index nbrEigenvalues, std::string eigs_sigma="LM",
+                               int options=ComputeEigenvectors, RealScalar tol=0.0)
+    : m_eivec(),
+      m_eivalues(),
+      m_isInitialized(false),
+      m_eigenvectorsOk(false),
+      m_nbrConverged(0),
+      m_nbrIterations(0)
+  {
+    compute(A, nbrEigenvalues, eigs_sigma, options, tol);
+  }
+
+
+  /** \brief Computes generalized eigenvalues / eigenvectors of given matrix using the external ARPACK library.
+   *
+   * \param[in]  A  Selfadjoint matrix whose eigendecomposition is to be computed.
+   * \param[in]  B  Selfadjoint matrix for generalized eigenvalues.
+   * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+   *    Must be less than the size of the input matrix, or an error is returned.
+   * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+   *    respective meanings to find the largest magnitude , smallest magnitude,
+   *    largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+   *    value can contain floating point value in string form, in which case the
+   *    eigenvalues closest to this value will be found.
+   * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+   *    means machine precision.
+   *
+   * \returns    Reference to \c *this
+   *
+   * This function computes the generalized eigenvalues of \p A with respect to \p B using ARPACK.  The eigenvalues()
+   * function can be used to retrieve them.  If \p options equals #ComputeEigenvectors,
+   * then the eigenvectors are also computed and can be retrieved by
+   * calling eigenvectors().
+   *
+   */
+  ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A, const MatrixType& B,
+                                                   Index nbrEigenvalues, std::string eigs_sigma="LM",
+                                        int options=ComputeEigenvectors, RealScalar tol=0.0);
+  
+  /** \brief Computes eigenvalues / eigenvectors of given matrix using the external ARPACK library.
+   *
+   * \param[in]  A  Selfadjoint matrix whose eigendecomposition is to be computed.
+   * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+   *    Must be less than the size of the input matrix, or an error is returned.
+   * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+   *    respective meanings to find the largest magnitude , smallest magnitude,
+   *    largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+   *    value can contain floating point value in string form, in which case the
+   *    eigenvalues closest to this value will be found.
+   * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+   *    means machine precision.
+   *
+   * \returns    Reference to \c *this
+   *
+   * This function computes the eigenvalues of \p A using ARPACK.  The eigenvalues()
+   * function can be used to retrieve them.  If \p options equals #ComputeEigenvectors,
+   * then the eigenvectors are also computed and can be retrieved by
+   * calling eigenvectors().
+   *
+   */
+  ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A,
+                                                   Index nbrEigenvalues, std::string eigs_sigma="LM",
+                                        int options=ComputeEigenvectors, RealScalar tol=0.0);
+
+
+  /** \brief Returns the eigenvectors of given matrix.
+   *
+   * \returns  A const reference to the matrix whose columns are the eigenvectors.
+   *
+   * \pre The eigenvectors have been computed before.
+   *
+   * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+   * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
+   * eigenvectors are normalized to have (Euclidean) norm equal to one. If
+   * this object was used to solve the eigenproblem for the selfadjoint
+   * matrix \f$ A \f$, then the matrix returned by this function is the
+   * matrix \f$ V \f$ in the eigendecomposition \f$ A V = D V \f$.
+   * For the generalized eigenproblem, the matrix returned is the solution \f$ A V = D B V \f$
+   *
+   * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
+   *
+   * \sa eigenvalues()
+   */
+  const Matrix<Scalar, Dynamic, Dynamic>& eigenvectors() const
+  {
+    eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec;
+  }
+
+  /** \brief Returns the eigenvalues of given matrix.
+   *
+   * \returns A const reference to the column vector containing the eigenvalues.
+   *
+   * \pre The eigenvalues have been computed before.
+   *
+   * The eigenvalues are repeated according to their algebraic multiplicity,
+   * so there are as many eigenvalues as rows in the matrix. The eigenvalues
+   * are sorted in increasing order.
+   *
+   * Example: \include SelfAdjointEigenSolver_eigenvalues.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_eigenvalues.out
+   *
+   * \sa eigenvectors(), MatrixBase::eigenvalues()
+   */
+  const Matrix<Scalar, Dynamic, 1>& eigenvalues() const
+  {
+    eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
+    return m_eivalues;
+  }
+
+  /** \brief Computes the positive-definite square root of the matrix.
+   *
+   * \returns the positive-definite square root of the matrix
+   *
+   * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+   * have been computed before.
+   *
+   * The square root of a positive-definite matrix \f$ A \f$ is the
+   * positive-definite matrix whose square equals \f$ A \f$. This function
+   * uses the eigendecomposition \f$ A = V D V^{-1} \f$ to compute the
+   * square root as \f$ A^{1/2} = V D^{1/2} V^{-1} \f$.
+   *
+   * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
+   *
+   * \sa operatorInverseSqrt(),
+   *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+   */
+  Matrix<Scalar, Dynamic, Dynamic> operatorSqrt() const
+  {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+  }
+
+  /** \brief Computes the inverse square root of the matrix.
+   *
+   * \returns the inverse positive-definite square root of the matrix
+   *
+   * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+   * have been computed before.
+   *
+   * This function uses the eigendecomposition \f$ A = V D V^{-1} \f$ to
+   * compute the inverse square root as \f$ V D^{-1/2} V^{-1} \f$. This is
+   * cheaper than first computing the square root with operatorSqrt() and
+   * then its inverse with MatrixBase::inverse().
+   *
+   * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
+   *
+   * \sa operatorSqrt(), MatrixBase::inverse(),
+   *     \ref MatrixFunctions_Module "MatrixFunctions Module"
+   */
+  Matrix<Scalar, Dynamic, Dynamic> operatorInverseSqrt() const
+  {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+  }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  ComputationInfo info() const
+  {
+    eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
+    return m_info;
+  }
+
+  size_t getNbrConvergedEigenValues() const
+  { return m_nbrConverged; }
+
+  size_t getNbrIterations() const
+  { return m_nbrIterations; }
+
+protected:
+  Matrix<Scalar, Dynamic, Dynamic> m_eivec;
+  Matrix<Scalar, Dynamic, 1> m_eivalues;
+  ComputationInfo m_info;
+  bool m_isInitialized;
+  bool m_eigenvectorsOk;
+
+  size_t m_nbrConverged;
+  size_t m_nbrIterations;
+};
+
+
+
+
+
+template<typename MatrixType, typename MatrixSolver, bool BisSPD>
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
+    ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>
+::compute(const MatrixType& A, Index nbrEigenvalues,
+          std::string eigs_sigma, int options, RealScalar tol)
+{
+    MatrixType B(0,0);
+    compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
+    
+    return *this;
+}
+
+
+template<typename MatrixType, typename MatrixSolver, bool BisSPD>
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
+    ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>
+::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
+          std::string eigs_sigma, int options, RealScalar tol)
+{
+  eigen_assert(A.cols() == A.rows());
+  eigen_assert(B.cols() == B.rows());
+  eigen_assert(B.rows() == 0 || A.cols() == B.rows());
+  eigen_assert((options &~ (EigVecMask | GenEigMask)) == 0
+            && (options & EigVecMask) != EigVecMask
+            && "invalid option parameter");
+
+  bool isBempty = (B.rows() == 0) || (B.cols() == 0);
+
+  // For clarity, all parameters match their ARPACK name
+  //
+  // Always 0 on the first call
+  //
+  int ido = 0;
+
+  int n = (int)A.cols();
+
+  // User options: "LA", "SA", "SM", "LM", "BE"
+  //
+  char whch[3] = "LM";
+    
+  // Specifies the shift if iparam[6] = { 3, 4, 5 }, not used if iparam[6] = { 1, 2 }
+  //
+  RealScalar sigma = 0.0;
+
+  if (eigs_sigma.length() >= 2 && isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1]))
+  {
+      eigs_sigma[0] = toupper(eigs_sigma[0]);
+      eigs_sigma[1] = toupper(eigs_sigma[1]);
+
+      // In the following special case we're going to invert the problem, since solving
+      // for larger magnitude is much much faster
+      // i.e., if 'SM' is specified, we're going to really use 'LM', the default
+      //
+      if (eigs_sigma.substr(0,2) != "SM")
+      {
+          whch[0] = eigs_sigma[0];
+          whch[1] = eigs_sigma[1];
+      }
+  }
+  else
+  {
+      eigen_assert(false && "Specifying clustered eigenvalues is not yet supported!");
+
+      // If it's not scalar values, then the user may be explicitly
+      // specifying the sigma value to cluster the evs around
+      //
+      sigma = atof(eigs_sigma.c_str());
+
+      // If atof fails, it returns 0.0, which is a fine default
+      //
+  }
+
+  // "I" means normal eigenvalue problem, "G" means generalized
+  //
+  char bmat[2] = "I";
+  if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) || (!isBempty && !BisSPD))
+      bmat[0] = 'G';
+
+  // Now we determine the mode to use
+  //
+  int mode = (bmat[0] == 'G') + 1;
+  if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])))
+  {
+      // We're going to use shift-and-invert mode, and basically find
+      // the largest eigenvalues of the inverse operator
+      //
+      mode = 3;
+  }
+
+  // The user-specified number of eigenvalues/vectors to compute
+  //
+  int nev = (int)nbrEigenvalues;
+
+  // Allocate space for ARPACK to store the residual
+  //
+  Scalar *resid = new Scalar[n];
+
+  // Number of Lanczos vectors, must satisfy nev < ncv <= n
+  // Note that this indicates that nev != n, and we cannot compute
+  // all eigenvalues of a mtrix
+  //
+  int ncv = std::min(std::max(2*nev, 20), n);
+
+  // The working n x ncv matrix, also store the final eigenvectors (if computed)
+  //
+  Scalar *v = new Scalar[n*ncv];
+  int ldv = n;
+
+  // Working space
+  //
+  Scalar *workd = new Scalar[3*n];
+  int lworkl = ncv*ncv+8*ncv; // Must be at least this length
+  Scalar *workl = new Scalar[lworkl];
+
+  int *iparam= new int[11];
+  iparam[0] = 1; // 1 means we let ARPACK perform the shifts, 0 means we'd have to do it
+  iparam[2] = std::max(300, (int)std::ceil(2*n/std::max(ncv,1)));
+  iparam[6] = mode; // The mode, 1 is standard ev problem, 2 for generalized ev, 3 for shift-and-invert
+
+  // Used during reverse communicate to notify where arrays start
+  //
+  int *ipntr = new int[11]; 
+
+  // Error codes are returned in here, initial value of 0 indicates a random initial
+  // residual vector is used, any other values means resid contains the initial residual
+  // vector, possibly from a previous run
+  //
+  int info = 0;
+
+  Scalar scale = 1.0;
+  //if (!isBempty)
+  //{
+  //Scalar scale = B.norm() / std::sqrt(n);
+  //scale = std::pow(2, std::floor(std::log(scale+1)));
+  ////M /= scale;
+  //for (size_t i=0; i<(size_t)B.outerSize(); i++)
+  //    for (typename MatrixType::InnerIterator it(B, i); it; ++it)
+  //        it.valueRef() /= scale;
+  //}
+
+  MatrixSolver OP;
+  if (mode == 1 || mode == 2)
+  {
+      if (!isBempty)
+          OP.compute(B);
+  }
+  else if (mode == 3)
+  {
+      if (sigma == 0.0)
+      {
+          OP.compute(A);
+      }
+      else
+      {
+          // Note: We will never enter here because sigma must be 0.0
+          //
+          if (isBempty)
+          {
+            MatrixType AminusSigmaB(A);
+            for (Index i=0; i<A.rows(); ++i)
+                AminusSigmaB.coeffRef(i,i) -= sigma;
+            
+            OP.compute(AminusSigmaB);
+          }
+          else
+          {
+              MatrixType AminusSigmaB = A - sigma * B;
+              OP.compute(AminusSigmaB);
+          }
+      }
+  }
+ 
+  if (!(mode == 1 && isBempty) && !(mode == 2 && isBempty) && OP.info() != Success)
+      std::cout << "Error factoring matrix" << std::endl;
+
+  do
+  {
+    internal::arpack_wrapper<Scalar, RealScalar>::saupd(&ido, bmat, &n, whch, &nev, &tol, resid, 
+                                                        &ncv, v, &ldv, iparam, ipntr, workd, workl,
+                                                        &lworkl, &info);
+
+    if (ido == -1 || ido == 1)
+    {
+      Scalar *in  = workd + ipntr[0] - 1;
+      Scalar *out = workd + ipntr[1] - 1;
+
+      if (ido == 1 && mode != 2)
+      {
+          Scalar *out2 = workd + ipntr[2] - 1;
+          if (isBempty || mode == 1)
+            Matrix<Scalar, Dynamic, 1>::Map(out2, n) = Matrix<Scalar, Dynamic, 1>::Map(in, n);
+          else
+            Matrix<Scalar, Dynamic, 1>::Map(out2, n) = B * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+          
+          in = workd + ipntr[2] - 1;
+      }
+
+      if (mode == 1)
+      {
+        if (isBempty)
+        {
+          // OP = A
+          //
+          Matrix<Scalar, Dynamic, 1>::Map(out, n) = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+        }
+        else
+        {
+          // OP = L^{-1}AL^{-T}
+          //
+          internal::OP<MatrixSolver, MatrixType, Scalar, BisSPD>::applyOP(OP, A, n, in, out);
+        }
+      }
+      else if (mode == 2)
+      {
+        if (ido == 1)
+          Matrix<Scalar, Dynamic, 1>::Map(in, n)  = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+        
+        // OP = B^{-1} A
+        //
+        Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
+      }
+      else if (mode == 3)
+      {
+        // OP = (A-\sigmaB)B (\sigma could be 0, and B could be I)
+        // The B * in is already computed and stored at in if ido == 1
+        //
+        if (ido == 1 || isBempty)
+          Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
+        else
+          Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(B * Matrix<Scalar, Dynamic, 1>::Map(in, n));
+      }
+    }
+    else if (ido == 2)
+    {
+      Scalar *in  = workd + ipntr[0] - 1;
+      Scalar *out = workd + ipntr[1] - 1;
+
+      if (isBempty || mode == 1)
+        Matrix<Scalar, Dynamic, 1>::Map(out, n) = Matrix<Scalar, Dynamic, 1>::Map(in, n);
+      else
+        Matrix<Scalar, Dynamic, 1>::Map(out, n) = B * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+    }
+  } while (ido != 99);
+
+  if (info == 1)
+    m_info = NoConvergence;
+  else if (info == 3)
+    m_info = NumericalIssue;
+  else if (info < 0)
+    m_info = InvalidInput;
+  else if (info != 0)
+    eigen_assert(false && "Unknown ARPACK return value!");
+  else
+  {
+    // Do we compute eigenvectors or not?
+    //
+    int rvec = (options & ComputeEigenvectors) == ComputeEigenvectors;
+
+    // "A" means "All", use "S" to choose specific eigenvalues (not yet supported in ARPACK))
+    //
+    char howmny[2] = "A"; 
+
+    // if howmny == "S", specifies the eigenvalues to compute (not implemented in ARPACK)
+    //
+    int *select = new int[ncv];
+
+    // Final eigenvalues
+    //
+    m_eivalues.resize(nev, 1);
+
+    internal::arpack_wrapper<Scalar, RealScalar>::seupd(&rvec, howmny, select, m_eivalues.data(), v, &ldv,
+                                                        &sigma, bmat, &n, whch, &nev, &tol, resid, &ncv,
+                                                        v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info);
+
+    if (info == -14)
+      m_info = NoConvergence;
+    else if (info != 0)
+      m_info = InvalidInput;
+    else
+    {
+      if (rvec)
+      {
+        m_eivec.resize(A.rows(), nev);
+        for (int i=0; i<nev; i++)
+          for (int j=0; j<n; j++)
+            m_eivec(j,i) = v[i*n+j] / scale;
+      
+        if (mode == 1 && !isBempty && BisSPD)
+          internal::OP<MatrixSolver, MatrixType, Scalar, BisSPD>::project(OP, n, nev, m_eivec.data());
+
+        m_eigenvectorsOk = true;
+      }
+
+      m_nbrIterations = iparam[2];
+      m_nbrConverged  = iparam[4];
+
+      m_info = Success;
+    }
+
+    delete[] select;
+  }
+
+  delete[] v;
+  delete[] iparam;
+  delete[] ipntr;
+  delete[] workd;
+  delete[] workl;
+  delete[] resid;
+
+  m_isInitialized = true;
+
+  return *this;
+}
+
+
+// Single precision
+//
+extern "C" void ssaupd_(int *ido, char *bmat, int *n, char *which,
+    int *nev, float *tol, float *resid, int *ncv,
+    float *v, int *ldv, int *iparam, int *ipntr,
+    float *workd, float *workl, int *lworkl,
+    int *info);
+
+extern "C" void sseupd_(int *rvec, char *All, int *select, float *d,
+    float *z, int *ldz, float *sigma, 
+    char *bmat, int *n, char *which, int *nev,
+    float *tol, float *resid, int *ncv, float *v,
+    int *ldv, int *iparam, int *ipntr, float *workd,
+    float *workl, int *lworkl, int *ierr);
+
+// Double precision
+//
+extern "C" void dsaupd_(int *ido, char *bmat, int *n, char *which,
+    int *nev, double *tol, double *resid, int *ncv,
+    double *v, int *ldv, int *iparam, int *ipntr,
+    double *workd, double *workl, int *lworkl,
+    int *info);
+
+extern "C" void dseupd_(int *rvec, char *All, int *select, double *d,
+    double *z, int *ldz, double *sigma, 
+    char *bmat, int *n, char *which, int *nev,
+    double *tol, double *resid, int *ncv, double *v,
+    int *ldv, int *iparam, int *ipntr, double *workd,
+    double *workl, int *lworkl, int *ierr);
+
+
+namespace internal {
+
+template<typename Scalar, typename RealScalar> struct arpack_wrapper
+{
+  static inline void saupd(int *ido, char *bmat, int *n, char *which,
+      int *nev, RealScalar *tol, Scalar *resid, int *ncv,
+      Scalar *v, int *ldv, int *iparam, int *ipntr,
+      Scalar *workd, Scalar *workl, int *lworkl, int *info)
+  { 
+    EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
+  }
+
+  static inline void seupd(int *rvec, char *All, int *select, Scalar *d,
+      Scalar *z, int *ldz, RealScalar *sigma,
+      char *bmat, int *n, char *which, int *nev,
+      RealScalar *tol, Scalar *resid, int *ncv, Scalar *v,
+      int *ldv, int *iparam, int *ipntr, Scalar *workd,
+      Scalar *workl, int *lworkl, int *ierr)
+  {
+    EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
+  }
+};
+
+template <> struct arpack_wrapper<float, float>
+{
+  static inline void saupd(int *ido, char *bmat, int *n, char *which,
+      int *nev, float *tol, float *resid, int *ncv,
+      float *v, int *ldv, int *iparam, int *ipntr,
+      float *workd, float *workl, int *lworkl, int *info)
+  {
+    ssaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
+  }
+
+  static inline void seupd(int *rvec, char *All, int *select, float *d,
+      float *z, int *ldz, float *sigma,
+      char *bmat, int *n, char *which, int *nev,
+      float *tol, float *resid, int *ncv, float *v,
+      int *ldv, int *iparam, int *ipntr, float *workd,
+      float *workl, int *lworkl, int *ierr)
+  {
+    sseupd_(rvec, All, select, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
+        workd, workl, lworkl, ierr);
+  }
+};
+
+template <> struct arpack_wrapper<double, double>
+{
+  static inline void saupd(int *ido, char *bmat, int *n, char *which,
+      int *nev, double *tol, double *resid, int *ncv,
+      double *v, int *ldv, int *iparam, int *ipntr,
+      double *workd, double *workl, int *lworkl, int *info)
+  {
+    dsaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
+  }
+
+  static inline void seupd(int *rvec, char *All, int *select, double *d,
+      double *z, int *ldz, double *sigma,
+      char *bmat, int *n, char *which, int *nev,
+      double *tol, double *resid, int *ncv, double *v,
+      int *ldv, int *iparam, int *ipntr, double *workd,
+      double *workl, int *lworkl, int *ierr)
+  {
+    dseupd_(rvec, All, select, d, v, ldv, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
+        workd, workl, lworkl, ierr);
+  }
+};
+
+
+template<typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD>
+struct OP
+{
+    static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out);
+    static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs);
+};
+
+template<typename MatrixSolver, typename MatrixType, typename Scalar>
+struct OP<MatrixSolver, MatrixType, Scalar, true>
+{
+  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out)
+{
+    // OP = L^{-1} A L^{-T}  (B = LL^T)
+    //
+    // First solve L^T out = in
+    //
+    Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.matrixU().solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
+    Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.permutationPinv() * Matrix<Scalar, Dynamic, 1>::Map(out, n);
+
+    // Then compute out = A out
+    //
+    Matrix<Scalar, Dynamic, 1>::Map(out, n) = A * Matrix<Scalar, Dynamic, 1>::Map(out, n);
+
+    // Then solve L out = out
+    //
+    Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.permutationP() * Matrix<Scalar, Dynamic, 1>::Map(out, n);
+    Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.matrixL().solve(Matrix<Scalar, Dynamic, 1>::Map(out, n));
+}
+
+  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs)
+{
+    // Solve L^T out = in
+    //
+    Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) = OP.matrixU().solve(Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k));
+    Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) = OP.permutationPinv() * Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k);
+}
+
+};
+
+template<typename MatrixSolver, typename MatrixType, typename Scalar>
+struct OP<MatrixSolver, MatrixType, Scalar, false>
+{
+  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out)
+{
+    eigen_assert(false && "Should never be in here...");
+}
+
+  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs)
+{
+    eigen_assert(false && "Should never be in here...");
+}
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARPACKSELFADJOINTEIGENSOLVER_H
+

diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 0000000..22088eb
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt

@@ -0,0 +1,6 @@
+file(GLOB Eigen_EulerAngles_SRCS "*.h")
+
+install(FILES
+  ${Eigen_EulerAngles_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
+  )

diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 0000000..e43cdb7
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h

@@ -0,0 +1,355 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+  /** \class EulerAngles
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+    *
+    * Euler rotation is a set of three rotation of three angles over three fixed axes, defined by the EulerSystem given as a template parameter.
+    * 
+    * Here is how intrinsic Euler angles works:
+    *  - first, rotate the axes system over the alpha axis in angle alpha
+    *  - then, rotate the axes system over the beta axis(which was rotated in the first stage) in angle beta
+    *  - then, rotate the axes system over the gamma axis(which was rotated in the two stages above) in angle gamma
+    *
+    * \note This class support only intrinsic Euler angles for simplicity,
+    *  see EulerSystem how to easily overcome this for extrinsic systems.
+    *
+    * ### Rotation representation and conversions ###
+    *
+    * It has been proved(see Wikipedia link below) that every rotation can be represented
+    *  by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
+    * Therefore, you can convert from Eigen rotation and to them
+    *  (including rotation matrices, which is not called "rotations" by Eigen design).
+    *
+    * Euler angles usually used for:
+    *  - convenient human representation of rotation, especially in interactive GUI.
+    *  - gimbal systems and robotics
+    *  - efficient encoding(i.e. 3 floats only) of rotation for network protocols.
+    *
+    * However, Euler angles are slow comparing to quaternion or matrices,
+    *  because their unnatural math definition, although it's simple for human.
+    * To overcome this, this class provide easy movement from the math friendly representation
+    *  to the human friendly representation, and vise-versa.
+    *
+    * All the user need to do is a safe simple C++ type conversion,
+    *  and this class take care for the math.
+    * Additionally, some axes related computation is done in compile time.
+    *
+    * #### Euler angles ranges in conversions ####
+    * Rotations representation as EulerAngles are not single (unlike matrices),
+    *  and even have infinite EulerAngles representations.<BR>
+    * For example, add or subtract 2*PI from either angle of EulerAngles
+    *  and you'll get the same rotation.
+    * This is the general reason for infinite representation,
+    *  but it's not the only general reason for not having a single representation.
+    *
+    * When converting rotation to EulerAngles, this class convert it to specific ranges
+    * When converting some rotation to EulerAngles, the rules for ranges are as follow:
+    * - If the rotation we converting from is an EulerAngles
+    *  (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
+    * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+    *   As for Beta angle:
+    *    - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+    *    - otherwise:
+    *      - If the beta axis is positive, the beta angle will be in the range [0, PI]
+    *      - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+    *
+    * \sa EulerAngles(const MatrixBase<Derived>&)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&)
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerAngles exist for float and double scalar,
+    *  in a form of EulerAngles{A}{B}{C}{scalar},
+    *  e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+    *
+    * Only for positive axes{+x,+y,+z} Euler systems are have convenient typedef.
+    * If you need negative axes{-x,-y,-z}, it is recommended to create you own typedef with
+    *  a word that represent what you need.
+    *
+    * ### Example ###
+    *
+    * \include EulerAngles.cpp
+    * Output: \verbinclude EulerAngles.out
+    *
+    * ### Additional reading ###
+    *
+    * If you're want to get more idea about how Euler system work in Eigen see EulerSystem.
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _Scalar the scalar type, i.e. the type of the angles.
+    *
+    * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+    */
+  template <typename _Scalar, class _System>
+  class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+  {
+    public:
+      typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base;
+      
+      /** the scalar type of the angles */
+      typedef _Scalar Scalar;
+      typedef typename NumTraits<Scalar>::Real RealScalar;
+      
+      /** the EulerSystem to use, which represents the axes of rotation. */
+      typedef _System System;
+    
+      typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */
+      typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3 dimension vector type */
+      typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+      typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */
+      
+      /** \returns the axis vector of the first (alpha) rotation */
+      static Vector3 AlphaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1);
+        return System::IsAlphaOpposite ? -u : u;
+      }
+      
+      /** \returns the axis vector of the second (beta) rotation */
+      static Vector3 BetaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1);
+        return System::IsBetaOpposite ? -u : u;
+      }
+      
+      /** \returns the axis vector of the third (gamma) rotation */
+      static Vector3 GammaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1);
+        return System::IsGammaOpposite ? -u : u;
+      }
+
+    private:
+      Vector3 m_angles;
+
+    public:
+      /** Default constructor without initialization. */
+      EulerAngles() {}
+      /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
+      EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
+        m_angles(alpha, beta, gamma) {}
+      
+      // TODO: Test this constructor
+      /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
+      explicit EulerAngles(const Scalar* data) : m_angles(data) {}
+      
+      /** Constructs and initializes an EulerAngles from either:
+        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+        *  - a 3D vector expression representing Euler angles.
+        *
+        * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
+        *  Alpha and gamma angles will be in the range [-PI, PI].<BR>
+        *  As for Beta angle:
+        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+        *   - otherwise:
+        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+       */
+      template<typename Derived>
+      explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; }
+      
+      /** Constructs and initialize Euler angles from a rotation \p rot.
+        *
+        * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
+        *  angles ranges are __undefined__.
+        *  Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+        *  As for Beta angle:
+        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+        *   - otherwise:
+        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+      */
+      template<typename Derived>
+      EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); }
+      
+      /*EulerAngles(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
+
+      /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+      const Vector3& angles() const { return m_angles; }
+      /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+      Vector3& angles() { return m_angles; }
+
+      /** \returns The value of the first angle. */
+      Scalar alpha() const { return m_angles[0]; }
+      /** \returns A read-write reference to the angle of the first angle. */
+      Scalar& alpha() { return m_angles[0]; }
+
+      /** \returns The value of the second angle. */
+      Scalar beta() const { return m_angles[1]; }
+      /** \returns A read-write reference to the angle of the second angle. */
+      Scalar& beta() { return m_angles[1]; }
+
+      /** \returns The value of the third angle. */
+      Scalar gamma() const { return m_angles[2]; }
+      /** \returns A read-write reference to the angle of the third angle. */
+      Scalar& gamma() { return m_angles[2]; }
+
+      /** \returns The Euler angles rotation inverse (which is as same as the negative),
+        *  (-alpha, -beta, -gamma).
+      */
+      EulerAngles inverse() const
+      {
+        EulerAngles res;
+        res.m_angles = -m_angles;
+        return res;
+      }
+
+      /** \returns The Euler angles rotation negative (which is as same as the inverse),
+        *  (-alpha, -beta, -gamma).
+      */
+      EulerAngles operator -() const
+      {
+        return inverse();
+      }
+      
+      /** Set \c *this from either:
+        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+        *  - a 3D vector expression representing Euler angles.
+        *
+        * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
+        *  angles ranges output.
+      */
+      template<class Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& other)
+      {
+        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value),
+         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+        
+        internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
+        return *this;
+      }
+
+      // TODO: Assign and construct from another EulerAngles (with different system)
+      
+      /** Set \c *this from a rotation.
+        *
+        * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
+        *  angles ranges output.
+      */
+      template<typename Derived>
+      EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+        System::CalcEulerAngles(*this, rot.toRotationMatrix());
+        return *this;
+      }
+      
+      /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+        * determined by \a prec.
+        *
+        * \sa MatrixBase::isApprox() */
+      bool isApprox(const EulerAngles& other,
+        const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+      { return angles().isApprox(other.angles(), prec); }
+
+      /** \returns an equivalent 3x3 rotation matrix. */
+      Matrix3 toRotationMatrix() const
+      {
+        // TODO: Calc it faster
+        return static_cast<QuaternionType>(*this).toRotationMatrix();
+      }
+
+      /** Convert the Euler angles to quaternion. */
+      operator QuaternionType() const
+      {
+        return
+          AngleAxisType(alpha(), AlphaAxisVector()) *
+          AngleAxisType(beta(), BetaAxisVector())   *
+          AngleAxisType(gamma(), GammaAxisVector());
+      }
+      
+      friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles)
+      {
+        s << eulerAngles.angles().transpose();
+        return s;
+      }
+      
+      /** \returns \c *this with scalar type casted to \a NewScalarType */
+      template <typename NewScalarType>
+      EulerAngles<NewScalarType, System> cast() const
+      {
+        EulerAngles<NewScalarType, System> e;
+        e.angles() = angles().template cast<NewScalarType>();
+        return e;
+      }
+  };
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
+
+  namespace internal
+  {
+    template<typename _Scalar, class _System>
+    struct traits<EulerAngles<_Scalar, _System> >
+    {
+      typedef _Scalar Scalar;
+    };
+    
+    // set from a rotation matrix
+    template<class System, class Other>
+    struct eulerangles_assign_impl<System,Other,3,3>
+    {
+      typedef typename Other::Scalar Scalar;
+      static void run(EulerAngles<Scalar, System>& e, const Other& m)
+      {
+        System::CalcEulerAngles(e, m);
+      }
+    };
+    
+    // set from a vector of Euler angles
+    template<class System, class Other>
+    struct eulerangles_assign_impl<System,Other,3,1>
+    {
+      typedef typename Other::Scalar Scalar;
+      static void run(EulerAngles<Scalar, System>& e, const Other& vec)
+      {
+        e.angles() = vec;
+      }
+    };
+  }
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H

diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 0000000..2a833b0
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h

@@ -0,0 +1,305 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+  // Forward declarations
+  template <typename _Scalar, class _System>
+  class EulerAngles;
+  
+  namespace internal
+  {
+    // TODO: Add this trait to the Eigen internal API?
+    template <int Num, bool IsPositive = (Num > 0)>
+    struct Abs
+    {
+      enum { value = Num };
+    };
+  
+    template <int Num>
+    struct Abs<Num, false>
+    {
+      enum { value = -Num };
+    };
+
+    template <int Axis>
+    struct IsValidAxis
+    {
+      enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
+    };
+    
+    template<typename System,
+            typename Other,
+            int OtherRows=Other::RowsAtCompileTime,
+            int OtherCols=Other::ColsAtCompileTime>
+    struct eulerangles_assign_impl;
+  }
+  
+  #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
+  
+  /** \brief Representation of a fixed signed rotation axis for EulerSystem.
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * Values here represent:
+    *  - The axis of the rotation: X, Y or Z.
+    *  - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-)
+    *
+    * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z}
+    *
+    * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}.
+    */
+  enum EulerAxis
+  {
+    EULER_X = 1, /*!< the X axis */
+    EULER_Y = 2, /*!< the Y axis */
+    EULER_Z = 3  /*!< the Z axis */
+  };
+  
+  /** \class EulerSystem
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a fixed Euler rotation system.
+    *
+    * This meta-class goal is to represent the Euler system in compilation time, for EulerAngles.
+    *
+    * You can use this class to get two things:
+    *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
+    *
+    * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
+    * This meta-class store constantly those signed axes. (see \ref EulerAxis)
+    *
+    * ### Types of Euler systems ###
+    *
+    * All and only valid 3 dimension Euler rotation over standard
+    *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
+    *  - all axes X, Y, Z in each valid order (see below what order is valid)
+    *  - rotation over the axis is supported both over the positive and negative directions.
+    *  - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
+    *
+    * Since EulerSystem support both positive and negative directions,
+    *  you may call this rotation distinction in other names:
+    *  - _right handed_ or _left handed_
+    *  - _counterclockwise_ or _clockwise_
+    *
+    * Notice all axed combination are valid, and would trigger a static assertion.
+    * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
+    * This yield two and only two classes:
+    *  - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
+    *     and the second is different, e.g. {X,Y,X}
+    *
+    * ### Intrinsic vs extrinsic Euler systems ###
+    *
+    * Only intrinsic Euler systems are supported for simplicity.
+    *  If you want to use extrinsic Euler systems,
+    *   just use the equal intrinsic opposite order for axes and angles.
+    *  I.e axes (A,B,C) becomes (C,B,A), and angles (a,b,c) becomes (c,b,a).
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems),
+    *  in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+    *
+    * ### Additional reading ###
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _AlphaAxis the first fixed EulerAxis
+    *
+    * \tparam _BetaAxis the second fixed EulerAxis
+    *
+    * \tparam _GammaAxis the third fixed EulerAxis
+    */
+  template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+  class EulerSystem
+  {
+    public:
+    // It's defined this way and not as enum, because I think
+    //  that enum is not guerantee to support negative numbers
+    
+    /** The first rotation axis */
+    static const int AlphaAxis = _AlphaAxis;
+    
+    /** The second rotation axis */
+    static const int BetaAxis = _BetaAxis;
+    
+    /** The third rotation axis */
+    static const int GammaAxis = _GammaAxis;
+
+    enum
+    {
+      AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+      BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
+      GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+      
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+
+      // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
+      // by Z, or Z is followed by X; otherwise it is odd.
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
+
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
+    };
+    
+    private:
+    
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value,
+      ALPHA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value,
+      BETA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value,
+      GAMMA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+      ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+      BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+    static const int
+      // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. 
+      // They are used in this class converters.
+      // They are always different from each other, and their possible values are: 0, 1, or 2.
+      I_ = AlphaAxisAbs - 1,
+      J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    ;
+    
+    // TODO: Get @mat parameter in form that avoids double evaluation.
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sqrt;
+      
+      typedef typename Derived::Scalar Scalar;
+
+      const Scalar plusMinus = IsEven? 1 : -1;
+      const Scalar minusPlus = IsOdd?  1 : -1;
+
+      const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2);
+      res[1] = atan2(plusMinus * mat(I_,K_), Rsum);
+
+      // There is a singularity when cos(beta) == 0
+      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0
+        res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_));
+        res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_));
+      }
+      else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1
+        Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma
+        Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma)
+        Scalar alphaPlusMinusGamma = atan2(spos, cpos);
+        res[0] = alphaPlusMinusGamma;
+        res[2] = 0;
+      }
+      else {// cos(beta) == 0 and sin(beta) == -1
+        Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma)
+        Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_);               // 2*cos(alpha + minusPlus*gamma)
+        Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
+        res[0] = alphaMinusPlusBeta;
+        res[2] = 0;
+      }
+    }
+
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res,
+                                    const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sqrt;
+
+      typedef typename Derived::Scalar Scalar;
+
+      const Scalar plusMinus = IsEven? 1 : -1;
+      const Scalar minusPlus = IsOdd?  1 : -1;
+
+      const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2);
+
+      res[1] = atan2(Rsum, mat(I_, I_));
+
+      // There is a singularity when sin(beta) == 0
+      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0
+        res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_));
+        res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_));
+      }
+      else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1
+        Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma)
+        Scalar cpos = mat(J_, J_) + mat(K_, K_);                         // 2*cos(alpha + gamma)
+        res[0] = atan2(spos, cpos);
+        res[2] = 0;
+      }
+      else {// sin(beta) == 0 and cos(beta) == -1
+        Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma)
+        Scalar cneg = mat(J_, J_) - mat(K_, K_);                         // 2*cos(alpha - gamma)
+        res[0] = atan2(sneg, cneg);
+        res[2] = 0;
+      }
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles_imp(
+        res.angles(), mat,
+        typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+      if (IsAlphaOpposite)
+        res.alpha() = -res.alpha();
+        
+      if (IsBetaOpposite)
+        res.beta() = -res.beta();
+        
+      if (IsGammaOpposite)
+        res.gamma() = -res.gamma();
+    }
+    
+    template <typename _Scalar, class _System>
+    friend class Eigen::EulerAngles;
+    
+    template<typename System,
+            typename Other,
+            int OtherRows,
+            int OtherCols>
+    friend struct internal::eulerangles_assign_impl;
+  };
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z)
+}
+
+#endif // EIGEN_EULERSYSTEM_H

diff --git a/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
index d49aa17..1c2cd24 100644
--- a/unsupported/Eigen/src/FFT/ei_fftw_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_fftw_impl.h

@@ -231,6 +231,8 @@
   protected:
       typedef fftw_plan<Scalar> PlanData;
 
+      typedef Eigen::numext::int64_t int64_t;
+
       typedef std::map<int64_t,PlanData> PlanMap;
 
       PlanMap m_plans;
@@ -257,5 +259,3 @@
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e..430953a 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h

@@ -25,16 +25,47 @@
   std::vector<Complex> m_scratchBuf;
   bool m_inverse;
 
-  inline
-    void make_twiddles(int nfft,bool inverse)
+  inline void make_twiddles(int nfft, bool inverse)
+  {
+    using numext::sin;
+    using numext::cos;
+    m_inverse = inverse;
+    m_twiddles.resize(nfft);
+    double phinc =  0.25 * double(EIGEN_PI) / nfft;
+    Scalar flip = inverse ? Scalar(1) : Scalar(-1);
+    m_twiddles[0] = Complex(Scalar(1), Scalar(0));
+    if ((nfft&1)==0)
+      m_twiddles[nfft/2] = Complex(Scalar(-1), Scalar(0));
+    int i=1;
+    for (;i*8<nfft;++i)
     {
-      using std::acos;
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
+      Scalar c = Scalar(cos(i*8*phinc));
+      Scalar s = Scalar(sin(i*8*phinc));
+      m_twiddles[i] = Complex(c, s*flip);
+      m_twiddles[nfft-i] = Complex(c, -s*flip);
     }
+    for (;i*4<nfft;++i)
+    {
+      Scalar c = Scalar(cos((2*nfft-8*i)*phinc));
+      Scalar s = Scalar(sin((2*nfft-8*i)*phinc));
+      m_twiddles[i] = Complex(s, c*flip);
+      m_twiddles[nfft-i] = Complex(s, -c*flip);
+    }
+    for (;i*8<3*nfft;++i)
+    {
+      Scalar c = Scalar(cos((8*i-2*nfft)*phinc));
+      Scalar s = Scalar(sin((8*i-2*nfft)*phinc));
+      m_twiddles[i] = Complex(-s, c*flip);
+      m_twiddles[nfft-i] = Complex(-s, -c*flip);
+    }
+    for (;i*2<nfft;++i)
+    {
+      Scalar c = Scalar(cos((4*nfft-8*i)*phinc));
+      Scalar s = Scalar(sin((4*nfft-8*i)*phinc));
+      m_twiddles[i] = Complex(-c, s*flip);
+      m_twiddles[nfft-i] = Complex(-c, -s*flip);
+    }
+  }
 
   void factorize(int nfft)
   {
@@ -316,8 +347,8 @@
 
         // use optimized mode for even real
         fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-        Complex dc = dst[0].real() +  dst[0].imag();
-        Complex nyquist = dst[0].real() -  dst[0].imag();
+        Complex dc(dst[0].real() +  dst[0].imag());
+        Complex nyquist(dst[0].real() -  dst[0].imag());
         int k;
         for ( k=1;k <= ncfft2 ; ++k ) {
           Complex fpk = dst[k];
@@ -416,5 +447,3 @@
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */

diff --git a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
new file mode 100644
index 0000000..e7d70f3
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h

@@ -0,0 +1,187 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+/* NOTE The functions of this file have been adapted from the GMM++ library */
+
+//========================================================================
+//
+// Copyright (C) 2002-2007 Yves Renard
+//
+// This file is a part of GETFEM++
+//
+// Getfem++ is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; version 2.1 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Lesser General Public License for more details.
+// You should have received a copy of the GNU Lesser General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301,
+// USA.
+//
+//========================================================================
+
+#include "../../../../Eigen/src/Core/util/NonMPL2.h"
+
+#ifndef EIGEN_CONSTRAINEDCG_H
+#define EIGEN_CONSTRAINEDCG_H
+
+#include "../../../../Eigen/Core"
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \ingroup IterativeLinearSolvers_Module
+  * Compute the pseudo inverse of the non-square matrix C such that
+  * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method.
+  *
+  * This function is internally used by constrained_cg.
+  */
+template <typename CMatrix, typename CINVMatrix>
+void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
+{
+  // optimisable : copie de la ligne, precalcul de C * trans(C).
+  typedef typename CMatrix::Scalar Scalar;
+  typedef typename CMatrix::Index Index;
+  // FIXME use sparse vectors ?
+  typedef Matrix<Scalar,Dynamic,1> TmpVec;
+
+  Index rows = C.rows(), cols = C.cols();
+
+  TmpVec d(rows), e(rows), l(cols), p(rows), q(rows), r(rows);
+  Scalar rho, rho_1, alpha;
+  d.setZero();
+
+  typedef Triplet<double> T;
+  std::vector<T> tripletList;
+    
+  for (Index i = 0; i < rows; ++i)
+  {
+    d[i] = 1.0;
+    rho = 1.0;
+    e.setZero();
+    r = d;
+    p = d;
+
+    while (rho >= 1e-38)
+    { /* conjugate gradient to compute e             */
+      /* which is the i-th row of inv(C * trans(C))  */
+      l = C.transpose() * p;
+      q = C * l;
+      alpha = rho / p.dot(q);
+      e +=  alpha * p;
+      r += -alpha * q;
+      rho_1 = rho;
+      rho = r.dot(r);
+      p = (rho/rho_1) * p + r;
+    }
+
+    l = C.transpose() * e; // l is the i-th row of CINV
+    // FIXME add a generic "prune/filter" expression for both dense and sparse object to sparse
+    for (Index j=0; j<l.size(); ++j)
+      if (l[j]<1e-15)
+	tripletList.push_back(T(i,j,l(j)));
+
+	
+    d[i] = 0.0;
+  }
+  CINV.setFromTriplets(tripletList.begin(), tripletList.end());
+}
+
+
+
+/** \ingroup IterativeLinearSolvers_Module
+  * Constrained conjugate gradient
+  *
+  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
+  */
+template<typename TMatrix, typename CMatrix,
+         typename VectorX, typename VectorB, typename VectorF>
+void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x,
+                       const VectorB& b, const VectorF& f, IterationController &iter)
+{
+  using std::sqrt;
+  typedef typename TMatrix::Scalar Scalar;
+  typedef typename TMatrix::Index Index;
+  typedef Matrix<Scalar,Dynamic,1>  TmpVec;
+
+  Scalar rho = 1.0, rho_1, lambda, gamma;
+  Index xSize = x.size();
+  TmpVec  p(xSize), q(xSize), q2(xSize),
+          r(xSize), old_z(xSize), z(xSize),
+          memox(xSize);
+  std::vector<bool> satured(C.rows());
+  p.setZero();
+  iter.setRhsNorm(sqrt(b.dot(b))); // gael vect_sp(PS, b, b)
+  if (iter.rhsNorm() == 0.0) iter.setRhsNorm(1.0);
+
+  SparseMatrix<Scalar,RowMajor> CINV(C.rows(), C.cols());
+  pseudo_inverse(C, CINV);
+
+  while(true)
+  {
+    // computation of residual
+    old_z = z;
+    memox = x;
+    r = b;
+    r += A * -x;
+    z = r;
+    bool transition = false;
+    for (Index i = 0; i < C.rows(); ++i)
+    {
+      Scalar al = C.row(i).dot(x) - f.coeff(i);
+      if (al >= -1.0E-15)
+      {
+        if (!satured[i])
+        {
+          satured[i] = true;
+          transition = true;
+        }
+        Scalar bb = CINV.row(i).dot(z);
+        if (bb > 0.0)
+          // FIXME: we should allow that: z += -bb * C.row(i);
+          for (typename CMatrix::InnerIterator it(C,i); it; ++it)
+            z.coeffRef(it.index()) -= bb*it.value();
+      }
+      else
+        satured[i] = false;
+    }
+
+    // descent direction
+    rho_1 = rho;
+    rho = r.dot(z);
+
+    if (iter.finished(rho)) break;
+    if (transition || iter.first()) gamma = 0.0;
+    else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1);
+    p = z + gamma*p;
+
+    ++iter;
+    // one dimensionnal optimization
+    q = A * p;
+    lambda = rho / q.dot(p);
+    for (Index i = 0; i < C.rows(); ++i)
+    {
+      if (!satured[i])
+      {
+        Scalar bb = C.row(i).dot(p) - f[i];
+        if (bb > 0.0)
+          lambda = (std::min)(lambda, (f.coeff(i)-C.row(i).dot(x)) / bb);
+      }
+    }
+    x += lambda * p;
+    memox -= x;
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONSTRAINEDCG_H

diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
new file mode 100644
index 0000000..5ae011b
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h

@@ -0,0 +1,511 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DGMRES_H
+#define EIGEN_DGMRES_H
+
+#include "../../../../Eigen/Eigenvalues"
+
+namespace Eigen { 
+  
+template< typename _MatrixType,
+          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class DGMRES;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<DGMRES<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+/** \brief Computes a permutation vector to have a sorted sequence
+  * \param vec The vector to reorder.
+  * \param perm gives the sorted sequence on output. Must be initialized with 0..n-1
+  * \param ncut Put  the ncut smallest elements at the end of the vector
+  * WARNING This is an expensive sort, so should be used only 
+  * for small size vectors
+  * TODO Use modified QuickSplit or std::nth_element to get the smallest values 
+  */
+template <typename VectorType, typename IndexType>
+void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
+{
+  eigen_assert(vec.size() == perm.size());
+  bool flag; 
+  for (Index k  = 0; k < ncut; k++)
+  {
+    flag = false;
+    for (Index j = 0; j < vec.size()-1; j++)
+    {
+      if ( vec(perm(j)) < vec(perm(j+1)) )
+      {
+        std::swap(perm(j),perm(j+1)); 
+        flag = true;
+      }
+      if (!flag) break; // The vector is in sorted order
+    }
+  }
+}
+
+}
+/**
+ * \ingroup IterativeLinearSolvers_Module
+ * \brief A Restarted GMRES with deflation.
+ * This class implements a modification of the GMRES solver for
+ * sparse linear systems. The basis is built with modified 
+ * Gram-Schmidt. At each restart, a few approximated eigenvectors
+ * corresponding to the smallest eigenvalues are used to build a
+ * preconditioner for the next cycle. This preconditioner 
+ * for deflation can be combined with any other preconditioner, 
+ * the IncompleteLUT for instance. The preconditioner is applied 
+ * at right of the matrix and the combination is multiplicative.
+ * 
+ * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ * Typical usage :
+ * \code
+ * SparseMatrix<double> A;
+ * VectorXd x, b; 
+ * //Fill A and b ...
+ * DGMRES<SparseMatrix<double> > solver;
+ * solver.set_restart(30); // Set restarting value
+ * solver.setEigenv(1); // Set the number of eigenvalues to deflate
+ * solver.compute(A);
+ * x = solver.solve(b);
+ * \endcode
+ * 
+ * DGMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * References :
+ * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
+ *  Algebraic Solvers for Linear Systems Arising from Compressible
+ *  Flows, Computers and Fluids, In Press,
+ *  https://doi.org/10.1016/j.compfluid.2012.03.023   
+ * [2] K. Burrage and J. Erhel, On the performance of various 
+ * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
+ * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES 
+ *  preconditioned by deflation,J. Computational and Applied
+ *  Mathematics, 69(1996), 303-318. 
+
+ * 
+ */
+template< typename _MatrixType, typename _Preconditioner>
+class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
+{
+    typedef IterativeSolverBase<DGMRES> Base;
+    using Base::matrix;
+    using Base::m_error;
+    using Base::m_iterations;
+    using Base::m_info;
+    using Base::m_isInitialized;
+    using Base::m_tolerance; 
+  public:
+    using Base::_solve_impl;
+    using Base::_solve_with_guess_impl;
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef _Preconditioner Preconditioner;
+    typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix; 
+    typedef Matrix<RealScalar,Dynamic,Dynamic> DenseRealMatrix; 
+    typedef Matrix<Scalar,Dynamic,1> DenseVector;
+    typedef Matrix<RealScalar,Dynamic,1> DenseRealVector; 
+    typedef Matrix<std::complex<RealScalar>, Dynamic, 1> ComplexVector;
+ 
+    
+  /** Default constructor. */
+  DGMRES() : Base(),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  template<typename MatrixDerived>
+  explicit DGMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+
+  ~DGMRES() {}
+  
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  {
+    EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+    
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    
+    dgmres(matrix(), b, x, Base::m_preconditioner);
+  }
+
+  /** 
+   * Get the restart value
+    */
+  Index restart() { return m_restart; }
+  
+  /** 
+   * Set the restart value (default is 30)  
+   */
+  void set_restart(const Index restart) { m_restart=restart; }
+  
+  /** 
+   * Set the number of eigenvalues to deflate at each restart 
+   */
+  void setEigenv(const Index neig) 
+  {
+    m_neig = neig;
+    if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
+  }
+  
+  /** 
+   * Get the size of the deflation subspace size
+   */ 
+  Index deflSize() {return m_r; }
+  
+  /**
+   * Set the maximum size of the deflation subspace
+   */
+  void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; }
+  
+  protected:
+    // DGMRES algorithm 
+    template<typename Rhs, typename Dest>
+    void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
+    // Perform one cycle of GMRES
+    template<typename Dest>
+    Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const; 
+    // Compute data to use for deflation 
+    Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
+    // Apply deflation to a vector
+    template<typename RhsType, typename DestType>
+    Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
+    ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
+    ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
+    // Init data for deflation
+    void dgmresInitDeflation(Index& rows) const; 
+    mutable DenseMatrix m_V; // Krylov basis vectors
+    mutable DenseMatrix m_H; // Hessenberg matrix 
+    mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
+    mutable Index m_restart; // Maximum size of the Krylov subspace
+    mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
+    mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
+    mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
+    mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
+    mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart
+    mutable Index m_r; // Current number of deflated eigenvalues, size of m_U
+    mutable Index m_maxNeig; // Maximum number of eigenvalues to deflate
+    mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
+    mutable bool m_isDeflAllocated;
+    mutable bool m_isDeflInitialized;
+    
+    //Adaptive strategy 
+    mutable RealScalar m_smv; // Smaller multiple of the remaining number of steps allowed
+    mutable bool m_force; // Force the use of deflation at each restart
+    
+}; 
+/** 
+ * \brief Perform several cycles of restarted GMRES with modified Gram Schmidt, 
+ * 
+ * A right preconditioner is used combined with deflation.
+ * 
+ */
+template< typename _MatrixType, typename _Preconditioner>
+template<typename Rhs, typename Dest>
+void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
+              const Preconditioner& precond) const
+{
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  RealScalar normRhs = rhs.norm();
+  if(normRhs <= considerAsZero) 
+  {
+    x.setZero();
+    m_error = 0;
+    return;
+  }
+
+  //Initialization
+  m_isDeflInitialized = false;
+  Index n = mat.rows(); 
+  DenseVector r0(n); 
+  Index nbIts = 0; 
+  m_H.resize(m_restart+1, m_restart);
+  m_Hes.resize(m_restart, m_restart);
+  m_V.resize(n,m_restart+1);
+  //Initial residual vector and initial norm
+  if(x.squaredNorm()==0) 
+    x = precond.solve(rhs);
+  r0 = rhs - mat * x; 
+  RealScalar beta = r0.norm(); 
+  
+  m_error = beta/normRhs; 
+  if(m_error < m_tolerance)
+    m_info = Success; 
+  else
+    m_info = NoConvergence;
+  
+  // Iterative process
+  while (nbIts < m_iterations && m_info == NoConvergence)
+  {
+    dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); 
+    
+    // Compute the new residual vector for the restart 
+    if (nbIts < m_iterations && m_info == NoConvergence) {
+      r0 = rhs - mat * x;
+      beta = r0.norm();
+    }
+  }
+} 
+
+/**
+ * \brief Perform one restart cycle of DGMRES
+ * \param mat The coefficient matrix
+ * \param precond The preconditioner
+ * \param x the new approximated solution
+ * \param r0 The initial residual vector
+ * \param beta The norm of the residual computed so far
+ * \param normRhs The norm of the right hand side vector
+ * \param nbIts The number of iterations
+ */
+template< typename _MatrixType, typename _Preconditioner>
+template<typename Dest>
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const
+{
+  //Initialization 
+  DenseVector g(m_restart+1); // Right hand side of the least square problem
+  g.setZero();  
+  g(0) = Scalar(beta); 
+  m_V.col(0) = r0/beta; 
+  m_info = NoConvergence; 
+  std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
+  Index it = 0; // Number of inner iterations 
+  Index n = mat.rows();
+  DenseVector tv1(n), tv2(n);  //Temporary vectors
+  while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
+  {    
+    // Apply preconditioner(s) at right
+    if (m_isDeflInitialized )
+    {
+      dgmresApplyDeflation(m_V.col(it), tv1); // Deflation
+      tv2 = precond.solve(tv1); 
+    }
+    else
+    {
+      tv2 = precond.solve(m_V.col(it)); // User's selected preconditioner
+    }
+    tv1 = mat * tv2; 
+   
+    // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
+    Scalar coef; 
+    for (Index i = 0; i <= it; ++i)
+    { 
+      coef = tv1.dot(m_V.col(i));
+      tv1 = tv1 - coef * m_V.col(i); 
+      m_H(i,it) = coef; 
+      m_Hes(i,it) = coef; 
+    }
+    // Normalize the vector 
+    coef = tv1.norm(); 
+    m_V.col(it+1) = tv1/coef;
+    m_H(it+1, it) = coef;
+//     m_Hes(it+1,it) = coef; 
+    
+    // FIXME Check for happy breakdown 
+    
+    // Update Hessenberg matrix with Givens rotations
+    for (Index i = 1; i <= it; ++i) 
+    {
+      m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
+    }
+    // Compute the new plane rotation 
+    gr[it].makeGivens(m_H(it, it), m_H(it+1,it)); 
+    // Apply the new rotation
+    m_H.col(it).applyOnTheLeft(it,it+1,gr[it].adjoint());
+    g.applyOnTheLeft(it,it+1, gr[it].adjoint()); 
+    
+    beta = std::abs(g(it+1));
+    m_error = beta/normRhs; 
+    // std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
+    it++; nbIts++; 
+    
+    if (m_error < m_tolerance)
+    {
+      // The method has converged
+      m_info = Success;
+      break;
+    }
+  }
+  
+  // Compute the new coefficients by solving the least square problem
+//   it++;
+  //FIXME  Check first if the matrix is singular ... zero diagonal
+  DenseVector nrs(m_restart); 
+  nrs = m_H.topLeftCorner(it,it).template triangularView<Upper>().solve(g.head(it)); 
+  
+  // Form the new solution
+  if (m_isDeflInitialized)
+  {
+    tv1 = m_V.leftCols(it) * nrs; 
+    dgmresApplyDeflation(tv1, tv2); 
+    x = x + precond.solve(tv2);
+  }
+  else
+    x = x + precond.solve(m_V.leftCols(it) * nrs); 
+  
+  // Go for a new cycle and compute data for deflation
+  if(nbIts < m_iterations && m_info == NoConvergence && m_neig > 0 && (m_r+m_neig) < m_maxNeig)
+    dgmresComputeDeflationData(mat, precond, it, m_neig); 
+  return 0; 
+  
+}
+
+
+template< typename _MatrixType, typename _Preconditioner>
+void DGMRES<_MatrixType, _Preconditioner>::dgmresInitDeflation(Index& rows) const
+{
+  m_U.resize(rows, m_maxNeig);
+  m_MU.resize(rows, m_maxNeig); 
+  m_T.resize(m_maxNeig, m_maxNeig);
+  m_lambdaN = 0.0; 
+  m_isDeflAllocated = true; 
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const ComplexSchur<DenseMatrix>& schurofH) const
+{
+  return schurofH.matrixT().diagonal();
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur<DenseMatrix>& schurofH) const
+{
+  const DenseMatrix& T = schurofH.matrixT();
+  Index it = T.rows();
+  ComplexVector eig(it);
+  Index j = 0;
+  while (j < it-1)
+  {
+    if (T(j+1,j) ==Scalar(0))
+    {
+      eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0)); 
+      j++; 
+    }
+    else
+    {
+      eig(j) = std::complex<RealScalar>(T(j,j),T(j+1,j)); 
+      eig(j+1) = std::complex<RealScalar>(T(j,j+1),T(j+1,j+1));
+      j++;
+    }
+  }
+  if (j < it-1) eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0));
+  return eig;
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
+{
+  // First, find the Schur form of the Hessenberg matrix H
+  typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH; 
+  bool computeU = true;
+  DenseMatrix matrixQ(it,it); 
+  matrixQ.setIdentity();
+  schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU); 
+  
+  ComplexVector eig(it);
+  Matrix<StorageIndex,Dynamic,1>perm(it);
+  eig = this->schurValues(schurofH);
+  
+  // Reorder the absolute values of Schur values
+  DenseRealVector modulEig(it); 
+  for (Index j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
+  perm.setLinSpaced(it,0,internal::convert_index<StorageIndex>(it-1));
+  internal::sortWithPermutation(modulEig, perm, neig);
+  
+  if (!m_lambdaN)
+  {
+    m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
+  }
+  //Count the real number of extracted eigenvalues (with complex conjugates)
+  Index nbrEig = 0; 
+  while (nbrEig < neig)
+  {
+    if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++; 
+    else nbrEig += 2; 
+  }
+  // Extract the  Schur vectors corresponding to the smallest Ritz values
+  DenseMatrix Sr(it, nbrEig); 
+  Sr.setZero();
+  for (Index j = 0; j < nbrEig; j++)
+  {
+    Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
+  }
+  
+  // Form the Schur vectors of the initial matrix using the Krylov basis
+  DenseMatrix X; 
+  X = m_V.leftCols(it) * Sr;
+  if (m_r)
+  {
+   // Orthogonalize X against m_U using modified Gram-Schmidt
+   for (Index j = 0; j < nbrEig; j++)
+     for (Index k =0; k < m_r; k++)
+      X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k); 
+  }
+  
+  // Compute m_MX = A * M^-1 * X
+  Index m = m_V.rows();
+  if (!m_isDeflAllocated) 
+    dgmresInitDeflation(m); 
+  DenseMatrix MX(m, nbrEig);
+  DenseVector tv1(m);
+  for (Index j = 0; j < nbrEig; j++)
+  {
+    tv1 = mat * X.col(j);
+    MX.col(j) = precond.solve(tv1);
+  }
+  
+  //Update m_T = [U'MU U'MX; X'MU X'MX]
+  m_T.block(m_r, m_r, nbrEig, nbrEig) = X.transpose() * MX; 
+  if(m_r)
+  {
+    m_T.block(0, m_r, m_r, nbrEig) = m_U.leftCols(m_r).transpose() * MX; 
+    m_T.block(m_r, 0, nbrEig, m_r) = X.transpose() * m_MU.leftCols(m_r);
+  }
+  
+  // Save X into m_U and m_MX in m_MU
+  for (Index j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+  // Increase the size of the invariant subspace
+  m_r += nbrEig; 
+  
+  // Factorize m_T into m_luT
+  m_luT.compute(m_T.topLeftCorner(m_r, m_r));
+  
+  //FIXME CHeck if the factorization was correctly done (nonsingular matrix)
+  m_isDeflInitialized = true;
+  return 0; 
+}
+template<typename _MatrixType, typename _Preconditioner>
+template<typename RhsType, typename DestType>
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
+{
+  DenseVector x1 = m_U.leftCols(m_r).transpose() * x; 
+  y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);
+  return 0; 
+}
+
+} // end namespace Eigen
+#endif 

diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
new file mode 100644
index 0000000..ff91209
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h

@@ -0,0 +1,335 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012, 2014 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GMRES_H
+#define EIGEN_GMRES_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**
+* Generalized Minimal Residual Algorithm based on the
+* Arnoldi algorithm implemented with Householder reflections.
+*
+* Parameters:
+*  \param mat       matrix of linear system of equations
+*  \param rhs       right hand side vector of linear system of equations
+*  \param x         on input: initial guess, on output: solution
+*  \param precond   preconditioner used
+*  \param iters     on input: maximum number of iterations to perform
+*                   on output: number of iterations performed
+*  \param restart   number of iterations for a restart
+*  \param tol_error on input: relative residual tolerance
+*                   on output: residuum achieved
+*
+* \sa IterativeMethods::bicgstab()
+*
+*
+* For references, please see:
+*
+* Saad, Y. and Schultz, M. H.
+* GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
+* SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
+*
+* Saad, Y.
+* Iterative Methods for Sparse Linear Systems.
+* Society for Industrial and Applied Mathematics, Philadelphia, 2003.
+*
+* Walker, H. F.
+* Implementations of the GMRES method.
+* Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
+*
+* Walker, H. F.
+* Implementation of the GMRES Method using Householder Transformations.
+* SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
+*
+*/
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond,
+    Index &iters, const Index &restart, typename Dest::RealScalar & tol_error) {
+
+  using std::sqrt;
+  using std::abs;
+
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix < Scalar, Dynamic, 1 > VectorType;
+  typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
+
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  if(rhs.norm() <= considerAsZero) 
+  {
+    x.setZero();
+    tol_error = 0;
+    return true;
+  }
+
+  RealScalar tol = tol_error;
+  const Index maxIters = iters;
+  iters = 0;
+
+  const Index m = mat.rows();
+
+  // residual and preconditioned residual
+  VectorType p0 = rhs - mat*x;
+  VectorType r0 = precond.solve(p0);
+
+  const RealScalar r0Norm = r0.norm();
+
+  // is initial guess already good enough?
+  if(r0Norm == 0)
+  {
+    tol_error = 0;
+    return true;
+  }
+
+  // storage for Hessenberg matrix and Householder data
+  FMatrixType H   = FMatrixType::Zero(m, restart + 1);
+  VectorType w    = VectorType::Zero(restart + 1);
+  VectorType tau  = VectorType::Zero(restart + 1);
+
+  // storage for Jacobi rotations
+  std::vector < JacobiRotation < Scalar > > G(restart);
+  
+  // storage for temporaries
+  VectorType t(m), v(m), workspace(m), x_new(m);
+
+  // generate first Householder vector
+  Ref<VectorType> H0_tail = H.col(0).tail(m - 1);
+  RealScalar beta;
+  r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+  w(0) = Scalar(beta);
+  
+  for (Index k = 1; k <= restart; ++k)
+  {
+    ++iters;
+
+    v = VectorType::Unit(m, k - 1);
+
+    // apply Householder reflections H_{1} ... H_{k-1} to v
+    // TODO: use a HouseholderSequence
+    for (Index i = k - 1; i >= 0; --i) {
+      v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+    }
+
+    // apply matrix M to v:  v = mat * v;
+    t.noalias() = mat * v;
+    v = precond.solve(t);
+
+    // apply Householder reflections H_{k-1} ... H_{1} to v
+    // TODO: use a HouseholderSequence
+    for (Index i = 0; i < k; ++i) {
+      v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+    }
+
+    if (v.tail(m - k).norm() != 0.0)
+    {
+      if (k <= restart)
+      {
+        // generate new Householder vector
+        Ref<VectorType> Hk_tail = H.col(k).tail(m - k - 1);
+        v.tail(m - k).makeHouseholder(Hk_tail, tau.coeffRef(k), beta);
+
+        // apply Householder reflection H_{k} to v
+        v.tail(m - k).applyHouseholderOnTheLeft(Hk_tail, tau.coeffRef(k), workspace.data());
+      }
+    }
+
+    if (k > 1)
+    {
+      for (Index i = 0; i < k - 1; ++i)
+      {
+        // apply old Givens rotations to v
+        v.applyOnTheLeft(i, i + 1, G[i].adjoint());
+      }
+    }
+
+    if (k<m && v(k) != (Scalar) 0)
+    {
+      // determine next Givens rotation
+      G[k - 1].makeGivens(v(k - 1), v(k));
+
+      // apply Givens rotation to v and w
+      v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+      w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+    }
+
+    // insert coefficients into upper matrix triangle
+    H.col(k-1).head(k) = v.head(k);
+
+    tol_error = abs(w(k)) / r0Norm;
+    bool stop = (k==m || tol_error < tol || iters == maxIters);
+
+    if (stop || k == restart)
+    {
+      // solve upper triangular system
+      Ref<VectorType> y = w.head(k);
+      H.topLeftCorner(k, k).template triangularView <Upper>().solveInPlace(y);
+
+      // use Horner-like scheme to calculate solution vector
+      x_new.setZero();
+      for (Index i = k - 1; i >= 0; --i)
+      {
+        x_new(i) += y(i);
+        // apply Householder reflection H_{i} to x_new
+        x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+      }
+
+      x += x_new;
+
+      if(stop)
+      {
+        return true;
+      }
+      else
+      {
+        k=0;
+
+        // reset data for restart
+        p0.noalias() = rhs - mat*x;
+        r0 = precond.solve(p0);
+
+        // clear Hessenberg matrix and Householder data
+        H.setZero();
+        w.setZero();
+        tau.setZero();
+
+        // generate first Householder vector
+        r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+        w(0) = Scalar(beta);
+      }
+    }
+  }
+
+  return false;
+
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class GMRES;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<GMRES<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A GMRES solver for sparse square problems
+  *
+  * This class allows to solve for A.x = b sparse linear problems using a generalized minimal
+  * residual method. The vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  *
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+  * \code
+  * int n = 10000;
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double> A(n,n);
+  * // fill A and b
+  * GMRES<SparseMatrix<double> > solver(A);
+  * x = solver.solve(b);
+  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
+  * std::cout << "estimated error: " << solver.error()      << std::endl;
+  * // update b, and solve again
+  * x = solver.solve(b);
+  * \endcode
+  *
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  * 
+  * GMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class GMRES : public IterativeSolverBase<GMRES<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<GMRES> Base;
+  using Base::matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+
+private:
+  Index m_restart;
+
+public:
+  using Base::_solve_impl;
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  GMRES() : Base(), m_restart(30) {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    *
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    *
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  template<typename MatrixDerived>
+  explicit GMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30) {}
+
+  ~GMRES() {}
+
+  /** Get the number of iterations after that a restart is performed.
+    */
+  Index get_restart() { return m_restart; }
+
+  /** Set the number of iterations after that a restart is performed.
+    *  \param restart   number of iterations for a restarti, default is 30.
+    */
+  void set_restart(const Index restart) { m_restart=restart; }
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error);
+    m_info = (!ret) ? NumericalIssue
+          : m_error <= Base::m_tolerance ? Success
+          : NoConvergence;
+  }
+
+protected:
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_GMRES_H

diff --git a/unsupported/Eigen/src/IterativeSolvers/IDRS.h b/unsupported/Eigen/src/IterativeSolvers/IDRS.h
new file mode 100755
index 0000000..90d20fa
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/IDRS.h

@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_IDRS_H
+#define EIGEN_IDRS_H
+
+namespace Eigen
+{
+
+	namespace internal
+	{
+		/**     \internal Low-level Induced Dimension Reduction algoritm
+		        \param A The matrix A
+		        \param b The right hand side vector b
+		        \param x On input and initial solution, on output the computed solution.
+		        \param precond A preconditioner being able to efficiently solve for an
+		                  approximation of Ax=b (regardless of b)
+		        \param iter On input the max number of iteration, on output the number of performed iterations.
+		        \param relres On input the tolerance error, on output an estimation of the relative error.
+		        \param S On input Number of the dimension of the shadow space.
+				\param smoothing switches residual smoothing on.
+				\param angle small omega lead to faster convergence at the expense of numerical stability
+				\param replacement switches on a residual replacement strategy to increase accuracy of residual at the expense of more Mat*vec products
+		        \return false in the case of numerical issue, for example a break down of IDRS.
+		*/
+		template<typename Vector, typename RealScalar>
+		typename Vector::Scalar omega(const Vector& t, const Vector& s, RealScalar angle)
+		{
+			using numext::abs;
+			typedef typename Vector::Scalar Scalar;
+			const RealScalar ns = s.norm();
+			const RealScalar nt = t.norm();
+			const Scalar ts = t.dot(s);
+			const RealScalar rho = abs(ts / (nt * ns));
+
+			if (rho < angle) {
+				if (ts == Scalar(0)) {
+					return Scalar(0);
+				}
+				// Original relation for om is given by
+				// om = om * angle / rho;
+				// To alleviate potential (near) division by zero this can be rewritten as
+				// om = angle * (ns / nt) * (ts / abs(ts)) = angle * (ns / nt) * sgn(ts)
+  				return angle * (ns / nt) * (ts / abs(ts));
+			}
+			return ts / (nt * nt);
+		}
+
+		template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+		bool idrs(const MatrixType& A, const Rhs& b, Dest& x, const Preconditioner& precond,
+			Index& iter,
+			typename Dest::RealScalar& relres, Index S, bool smoothing, typename Dest::RealScalar angle, bool replacement)
+		{
+			typedef typename Dest::RealScalar RealScalar;
+			typedef typename Dest::Scalar Scalar;
+			typedef Matrix<Scalar, Dynamic, 1> VectorType;
+			typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+			const Index N = b.size();
+			S = S < x.rows() ? S : x.rows();
+			const RealScalar tol = relres;
+			const Index maxit = iter;
+
+			Index replacements = 0;
+			bool trueres = false;
+
+			FullPivLU<DenseMatrixType> lu_solver;
+
+			DenseMatrixType P;
+			{
+				HouseholderQR<DenseMatrixType> qr(DenseMatrixType::Random(N, S));
+			    P = (qr.householderQ() * DenseMatrixType::Identity(N, S));
+			}
+
+			const RealScalar normb = b.norm();
+
+			if (internal::isApprox(normb, RealScalar(0)))
+			{
+				//Solution is the zero vector
+				x.setZero();
+				iter = 0;
+				relres = 0;
+				return true;
+			}
+			 // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf
+			 // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon).
+			 // With epsilon the
+             // relative machine precision. The factor tol/epsilon corresponds to the size of a
+             // finite precision number that is so large that the absolute round-off error in
+             // this number, when propagated through the process, makes it impossible to
+             // achieve the required accuracy.The factor C accounts for the accumulation of
+             // round-off errors. This parameter has beenset to 10−3.
+			 // mp is epsilon/C
+			 // 10^3 * eps is very conservative, so normally no residual replacements will take place. 
+			 // It only happens if things go very wrong. Too many restarts may ruin the convergence.
+			const RealScalar mp = RealScalar(1e3) * NumTraits<Scalar>::epsilon();
+
+
+
+			//Compute initial residual
+			const RealScalar tolb = tol * normb; //Relative tolerance
+			VectorType r = b - A * x;
+
+			VectorType x_s, r_s;
+
+			if (smoothing)
+			{
+				x_s = x;
+				r_s = r;
+			}
+
+			RealScalar normr = r.norm();
+
+			if (normr <= tolb)
+			{
+				//Initial guess is a good enough solution
+				iter = 0;
+				relres = normr / normb;
+				return true;
+			}
+
+			DenseMatrixType G = DenseMatrixType::Zero(N, S);
+			DenseMatrixType U = DenseMatrixType::Zero(N, S);
+			DenseMatrixType M = DenseMatrixType::Identity(S, S);
+			VectorType t(N), v(N);
+			Scalar om = 1.;
+
+			//Main iteration loop, guild G-spaces:
+			iter = 0;
+
+			while (normr > tolb && iter < maxit)
+			{
+				//New right hand size for small system:
+				VectorType f = (r.adjoint() * P).adjoint();
+
+				for (Index k = 0; k < S; ++k)
+				{
+					//Solve small system and make v orthogonal to P:
+					//c = M(k:s,k:s)\f(k:s);
+					lu_solver.compute(M.block(k , k , S -k, S - k ));
+					VectorType c = lu_solver.solve(f.segment(k , S - k ));
+					//v = r - G(:,k:s)*c;
+					v = r - G.rightCols(S - k ) * c;
+					//Preconditioning
+					v = precond.solve(v);
+
+					//Compute new U(:,k) and G(:,k), G(:,k) is in space G_j
+					U.col(k) = U.rightCols(S - k ) * c + om * v;
+					G.col(k) = A * U.col(k );
+
+					//Bi-Orthogonalise the new basis vectors:
+					for (Index i = 0; i < k-1 ; ++i)
+					{
+						//alpha =  ( P(:,i)'*G(:,k) )/M(i,i);
+						Scalar alpha = P.col(i ).dot(G.col(k )) / M(i, i );
+						G.col(k ) = G.col(k ) - alpha * G.col(i );
+						U.col(k ) = U.col(k ) - alpha * U.col(i );
+					}
+
+					//New column of M = P'*G  (first k-1 entries are zero)
+					//M(k:s,k) = (G(:,k)'*P(:,k:s))';
+					M.block(k , k , S - k , 1) = (G.col(k ).adjoint() * P.rightCols(S - k )).adjoint();
+
+					if (internal::isApprox(M(k,k), Scalar(0)))
+					{
+						return false;
+					}
+
+					//Make r orthogonal to q_i, i = 0..k-1
+					Scalar beta = f(k ) / M(k , k );
+					r = r - beta * G.col(k );
+					x = x + beta * U.col(k );
+					normr = r.norm();
+
+					if (replacement && normr > tolb / mp)
+					{
+						trueres = true;
+					}
+
+					//Smoothing:
+					if (smoothing)
+					{
+						t = r_s - r;
+						//gamma is a Scalar, but the conversion is not allowed
+						Scalar gamma = t.dot(r_s) / t.norm();
+						r_s = r_s - gamma * t;
+						x_s = x_s - gamma * (x_s - x);
+						normr = r_s.norm();
+					}
+
+					if (normr < tolb || iter == maxit)
+					{
+						break;
+					}
+
+					//New f = P'*r (first k  components are zero)
+					if (k < S-1)
+					{
+						f.segment(k + 1, S - (k + 1) ) = f.segment(k + 1 , S - (k + 1)) - beta * M.block(k + 1 , k , S - (k + 1), 1);
+					}
+				}//end for
+
+				if (normr < tolb || iter == maxit)
+				{
+					break;
+				}
+
+				//Now we have sufficient vectors in G_j to compute residual in G_j+1
+				//Note: r is already perpendicular to P so v = r
+				//Preconditioning
+				v = r;
+				v = precond.solve(v);
+
+				//Matrix-vector multiplication:
+				t = A * v;
+
+				//Computation of a new omega
+				om = internal::omega(t, r, angle);
+
+				if (om == RealScalar(0.0))
+				{
+					return false;
+				}
+
+				r = r - om * t;
+				x = x + om * v;
+				normr = r.norm();
+
+				if (replacement && normr > tolb / mp)
+				{
+					trueres = true;
+				}
+
+				//Residual replacement?
+				if (trueres && normr < normb)
+				{
+					r = b - A * x;
+					trueres = false;
+					replacements++;
+				}
+
+				//Smoothing:
+				if (smoothing)
+				{
+					t = r_s - r;
+					Scalar gamma = t.dot(r_s) /t.norm();
+					r_s = r_s - gamma * t;
+					x_s = x_s - gamma * (x_s - x);
+					normr = r_s.norm();
+				}
+
+				iter++;
+
+			}//end while
+
+			if (smoothing)
+			{
+				x = x_s;
+			}
+			relres=normr/normb;
+			return true;
+		}
+
+	}  // namespace internal
+
+	template <typename _MatrixType, typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+	class IDRS;
+
+	namespace internal
+	{
+
+		template <typename _MatrixType, typename _Preconditioner>
+		struct traits<Eigen::IDRS<_MatrixType, _Preconditioner> >
+		{
+			typedef _MatrixType MatrixType;
+			typedef _Preconditioner Preconditioner;
+		};
+
+	}  // namespace internal
+
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief The Induced Dimension Reduction method (IDR(s)) is a short-recurrences Krylov method for sparse square problems.
+  *
+  * This class allows to solve for A.x = b sparse linear problems. The vectors x and b can be either dense or sparse.
+  * he Induced Dimension Reduction method, IDR(), is a robust and efficient short-recurrence Krylov subspace method for
+  * solving large nonsymmetric systems of linear equations.
+  *
+  * For indefinite systems IDR(S) outperforms both BiCGStab and BiCGStab(L). Additionally, IDR(S) can handle matrices
+  * with complex eigenvalues more efficiently than BiCGStab.
+  *
+  * Many problems that do not converge for BiCGSTAB converge for IDR(s) (for larger values of s). And if both methods 
+  * converge the convergence for IDR(s) is typically much faster for difficult systems (for example indefinite problems). 
+  *
+  * IDR(s) is a limited memory finite termination method. In exact arithmetic it converges in at most N+N/s iterations,
+  * with N the system size.  It uses a fixed number of 4+3s vector. In comparison, BiCGSTAB terminates in 2N iterations 
+  * and uses 7 vectors. GMRES terminates in at most N iterations, and uses I+3 vectors, with I the number of iterations. 
+  * Restarting GMRES limits the memory consumption, but destroys the finite termination property.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  *
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  *
+  * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+  * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  *
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  *
+  * IDR(s) can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+	template <typename _MatrixType, typename _Preconditioner>
+	class IDRS : public IterativeSolverBase<IDRS<_MatrixType, _Preconditioner> >
+	{
+
+		public:
+			typedef _MatrixType MatrixType;
+			typedef typename MatrixType::Scalar Scalar;
+			typedef typename MatrixType::RealScalar RealScalar;
+			typedef _Preconditioner Preconditioner;
+
+		private:
+			typedef IterativeSolverBase<IDRS> Base;
+			using Base::m_error;
+			using Base::m_info;
+			using Base::m_isInitialized;
+			using Base::m_iterations;
+			using Base::matrix;
+			Index m_S;
+			bool m_smoothing;
+			RealScalar m_angle;
+			bool m_residual;
+
+		public:
+			/** Default constructor. */
+			IDRS(): m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {}
+
+			/**     Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+			        This constructor is a shortcut for the default constructor followed
+			        by a call to compute().
+
+			        \warning this class stores a reference to the matrix A as well as some
+			        precomputed values that depend on it. Therefore, if \a A is changed
+			        this class becomes invalid. Call compute() to update it with the new
+			        matrix A, or modify a copy of A.
+			*/
+			template <typename MatrixDerived>
+			explicit IDRS(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_S(4), m_smoothing(false),
+															   m_angle(RealScalar(0.7)), m_residual(false) {}
+
+
+			/** \internal */
+			/**     Loops over the number of columns of b and does the following:
+			                1. sets the tolerence and maxIterations
+			                2. Calls the function that has the core solver routine
+			*/
+			template <typename Rhs, typename Dest>
+			void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+			{
+				m_iterations = Base::maxIterations();
+				m_error = Base::m_tolerance;
+
+				bool ret = internal::idrs(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_S,m_smoothing,m_angle,m_residual);
+
+				m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
+			}
+
+			/** Sets the parameter S, indicating the dimension of the shadow space. Default is 4*/
+			void setS(Index S)
+			{
+				if (S < 1)
+				{
+					S = 4;
+				}
+
+				m_S = S;
+			}
+
+			/** Switches off and on smoothing.
+			Residual smoothing results in monotonically decreasing residual norms at
+			the expense of two extra vectors of storage and a few extra vector
+			operations. Although monotonic decrease of the residual norms is a
+			desirable property, the rate of convergence of the unsmoothed process and
+			the smoothed process is basically the same. Default is off */
+			void setSmoothing(bool smoothing)
+			{
+				m_smoothing=smoothing;
+			}
+
+			/** The angle must be a real scalar. In IDR(s), a value for the
+			iteration parameter omega must be chosen in every s+1th step. The most
+			natural choice is to select a value to minimize the norm of the next residual.
+			This corresponds to the parameter omega = 0. In practice, this may lead to
+			values of omega that are so small that the other iteration parameters
+			cannot be computed with sufficient accuracy. In such cases it is better to
+			increase the value of omega sufficiently such that a compromise is reached
+			between accurate computations and reduction of the residual norm. The
+			parameter angle =0.7 (”maintaining the convergence strategy”)
+			results in such a compromise. */
+			void setAngle(RealScalar angle)
+			{
+				m_angle=angle;
+			}
+
+			/** The parameter replace is a logical that determines whether a
+			residual replacement strategy is employed to increase the accuracy of the
+			solution. */
+			void setResidualUpdate(bool update)
+			{
+				m_residual=update;
+			}
+
+	};
+
+}  // namespace Eigen
+
+#endif /* EIGEN_IDRS_H */

diff --git a/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h b/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
new file mode 100644
index 0000000..7d08c35
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h

@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_LU_H
+#define EIGEN_INCOMPLETE_LU_H
+
+namespace Eigen { 
+
+template <typename _Scalar>
+class IncompleteLU : public SparseSolverBase<IncompleteLU<_Scalar> >
+{
+  protected:
+    typedef SparseSolverBase<IncompleteLU<_Scalar> > Base;
+    using Base::m_isInitialized;
+    
+    typedef _Scalar Scalar;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef typename Vector::Index Index;
+    typedef SparseMatrix<Scalar,RowMajor> FactorType;
+
+  public:
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+    IncompleteLU() {}
+
+    template<typename MatrixType>
+    IncompleteLU(const MatrixType& mat)
+    {
+      compute(mat);
+    }
+
+    Index rows() const { return m_lu.rows(); }
+    Index cols() const { return m_lu.cols(); }
+
+    template<typename MatrixType>
+    IncompleteLU& compute(const MatrixType& mat)
+    {
+      m_lu = mat;
+      int size = mat.cols();
+      Vector diag(size);
+      for(int i=0; i<size; ++i)
+      {
+        typename FactorType::InnerIterator k_it(m_lu,i);
+        for(; k_it && k_it.index()<i; ++k_it)
+        {
+          int k = k_it.index();
+          k_it.valueRef() /= diag(k);
+
+          typename FactorType::InnerIterator j_it(k_it);
+          typename FactorType::InnerIterator kj_it(m_lu, k);
+          while(kj_it && kj_it.index()<=k) ++kj_it;
+          for(++j_it; j_it; )
+          {
+            if(kj_it.index()==j_it.index())
+            {
+              j_it.valueRef() -= k_it.value() * kj_it.value();
+              ++j_it;
+              ++kj_it;
+            }
+            else if(kj_it.index()<j_it.index()) ++kj_it;
+            else                                ++j_it;
+          }
+        }
+        if(k_it && k_it.index()==i) diag(i) = k_it.value();
+        else                        diag(i) = 1;
+      }
+      m_isInitialized = true;
+      return *this;
+    }
+
+    template<typename Rhs, typename Dest>
+    void _solve_impl(const Rhs& b, Dest& x) const
+    {
+      x = m_lu.template triangularView<UnitLower>().solve(b);
+      x = m_lu.template triangularView<Upper>().solve(x);
+    }
+
+  protected:
+    FactorType m_lu;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_INCOMPLETE_LU_H

diff --git a/unsupported/Eigen/src/IterativeSolvers/IterationController.h b/unsupported/Eigen/src/IterativeSolvers/IterationController.h
new file mode 100644
index 0000000..a116e09
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/IterationController.h

@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+/* NOTE The class IterationController has been adapted from the iteration
+ *      class of the GMM++ and ITL libraries.
+ */
+
+//=======================================================================
+// Copyright (C) 1997-2001
+// Authors: Andrew Lumsdaine <lums@osl.iu.edu> 
+//          Lie-Quan Lee     <llee@osl.iu.edu>
+//
+// This file is part of the Iterative Template Library
+//
+// You should have received a copy of the License Agreement for the
+// Iterative Template Library along with the software;  see the
+// file LICENSE.  
+//
+// Permission to modify the code and to distribute modified code is
+// granted, provided the text of this NOTICE is retained, a notice that
+// the code was modified is included with the above COPYRIGHT NOTICE and
+// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
+// file is distributed with the modified code.
+//
+// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
+// By way of example, but not limitation, Licensor MAKES NO
+// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
+// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
+// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
+// OR OTHER RIGHTS.
+//=======================================================================
+
+//========================================================================
+//
+// Copyright (C) 2002-2007 Yves Renard
+//
+// This file is a part of GETFEM++
+//
+// Getfem++ is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; version 2.1 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Lesser General Public License for more details.
+// You should have received a copy of the GNU Lesser General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301,
+// USA.
+//
+//========================================================================
+
+#include "../../../../Eigen/src/Core/util/NonMPL2.h"
+
+#ifndef EIGEN_ITERATION_CONTROLLER_H
+#define EIGEN_ITERATION_CONTROLLER_H
+
+namespace Eigen { 
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \class IterationController
+  *
+  * \brief Controls the iterations of the iterative solvers
+  *
+  * This class has been adapted from the iteration class of GMM++ and ITL libraries.
+  *
+  */
+class IterationController
+{
+  protected :
+    double m_rhsn;        ///< Right hand side norm
+    size_t m_maxiter;     ///< Max. number of iterations
+    int m_noise;          ///< if noise > 0 iterations are printed
+    double m_resmax;      ///< maximum residual
+    double m_resminreach, m_resadd;
+    size_t m_nit;         ///< iteration number
+    double m_res;         ///< last computed residual
+    bool m_written;
+    void (*m_callback)(const IterationController&);
+  public :
+
+    void init()
+    {
+      m_nit = 0; m_res = 0.0; m_written = false;
+      m_resminreach = 1E50; m_resadd = 0.0;
+      m_callback = 0;
+    }
+
+    IterationController(double r = 1.0E-8, int noi = 0, size_t mit = size_t(-1))
+      : m_rhsn(1.0), m_maxiter(mit), m_noise(noi), m_resmax(r) { init(); }
+
+    void operator ++(int) { m_nit++; m_written = false; m_resadd += m_res; }
+    void operator ++() { (*this)++; }
+
+    bool first() { return m_nit == 0; }
+
+    /* get/set the "noisyness" (verbosity) of the solvers */
+    int noiseLevel() const { return m_noise; }
+    void setNoiseLevel(int n) { m_noise = n; }
+    void reduceNoiseLevel() { if (m_noise > 0) m_noise--; }
+
+    double maxResidual() const { return m_resmax; }
+    void setMaxResidual(double r) { m_resmax = r; }
+
+    double residual() const { return m_res; }
+
+    /* change the user-definable callback, called after each iteration */
+    void setCallback(void (*t)(const IterationController&))
+    {
+      m_callback = t;
+    }
+
+    size_t iteration() const { return m_nit; }
+    void setIteration(size_t i) { m_nit = i; }
+
+    size_t maxIterarions() const { return m_maxiter; }
+    void setMaxIterations(size_t i) { m_maxiter = i; }
+
+    double rhsNorm() const { return m_rhsn; }
+    void setRhsNorm(double r) { m_rhsn = r; }
+
+    bool converged() const { return m_res <= m_rhsn * m_resmax; }
+    bool converged(double nr)
+    {
+      using std::abs;
+      m_res = abs(nr); 
+      m_resminreach = (std::min)(m_resminreach, m_res);
+      return converged();
+    }
+    template<typename VectorType> bool converged(const VectorType &v)
+    { return converged(v.squaredNorm()); }
+
+    bool finished(double nr)
+    {
+      if (m_callback) m_callback(*this);
+      if (m_noise > 0 && !m_written)
+      {
+        converged(nr);
+        m_written = true;
+      }
+      return (m_nit >= m_maxiter || converged(nr));
+    }
+    template <typename VectorType>
+    bool finished(const MatrixBase<VectorType> &v)
+    { return finished(double(v.squaredNorm())); }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_ITERATION_CONTROLLER_H

diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
new file mode 100644
index 0000000..5db454d
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h

@@ -0,0 +1,267 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_MINRES_H_
+#define EIGEN_MINRES_H_
+
+
+namespace Eigen {
+    
+    namespace internal {
+        
+        /** \internal Low-level MINRES algorithm
+         * \param mat The matrix A
+         * \param rhs The right hand side vector b
+         * \param x On input and initial solution, on output the computed solution.
+         * \param precond A right preconditioner being able to efficiently solve for an
+         *                approximation of Ax=b (regardless of b)
+         * \param iters On input the max number of iteration, on output the number of performed iterations.
+         * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+         */
+        template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+        EIGEN_DONT_INLINE
+        void minres(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                    const Preconditioner& precond, Index& iters,
+                    typename Dest::RealScalar& tol_error)
+        {
+            using std::sqrt;
+            typedef typename Dest::RealScalar RealScalar;
+            typedef typename Dest::Scalar Scalar;
+            typedef Matrix<Scalar,Dynamic,1> VectorType;
+
+            // Check for zero rhs
+            const RealScalar rhsNorm2(rhs.squaredNorm());
+            if(rhsNorm2 == 0)
+            {
+                x.setZero();
+                iters = 0;
+                tol_error = 0;
+                return;
+            }
+            
+            // initialize
+            const Index maxIters(iters);  // initialize maxIters to iters
+            const Index N(mat.cols());    // the size of the matrix
+            const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2)
+            
+            // Initialize preconditioned Lanczos
+            VectorType v_old(N); // will be initialized inside loop
+            VectorType v( VectorType::Zero(N) ); //initialize v
+            VectorType v_new(rhs-mat*x); //initialize v_new
+            RealScalar residualNorm2(v_new.squaredNorm());
+            VectorType w(N); // will be initialized inside loop
+            VectorType w_new(precond.solve(v_new)); // initialize w_new
+//            RealScalar beta; // will be initialized inside loop
+            RealScalar beta_new2(v_new.dot(w_new));
+            eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+            RealScalar beta_new(sqrt(beta_new2));
+            const RealScalar beta_one(beta_new);
+            // Initialize other variables
+            RealScalar c(1.0); // the cosine of the Givens rotation
+            RealScalar c_old(1.0);
+            RealScalar s(0.0); // the sine of the Givens rotation
+            RealScalar s_old(0.0); // the sine of the Givens rotation
+            VectorType p_oold(N); // will be initialized in loop
+            VectorType p_old(VectorType::Zero(N)); // initialize p_old=0
+            VectorType p(p_old); // initialize p=0
+            RealScalar eta(1.0);
+                        
+            iters = 0; // reset iters
+            while ( iters < maxIters )
+            {
+                // Preconditioned Lanczos
+                /* Note that there are 4 variants on the Lanczos algorithm. These are
+                 * described in Paige, C. C. (1972). Computational variants of
+                 * the Lanczos method for the eigenproblem. IMA Journal of Applied
+                 * Mathematics, 10(3), 373-381. The current implementation corresponds 
+                 * to the case A(2,7) in the paper. It also corresponds to 
+                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+                 * Systems, 2003 p.173. For the preconditioned version see 
+                 * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
+                 */
+                const RealScalar beta(beta_new);
+                v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
+                v_new /= beta_new; // overwrite v_new for next iteration
+                w_new /= beta_new; // overwrite w_new for next iteration
+                v = v_new; // update
+                w = w_new; // update
+                v_new.noalias() = mat*w - beta*v_old; // compute v_new
+                const RealScalar alpha = v_new.dot(w);
+                v_new -= alpha*v; // overwrite v_new
+                w_new = precond.solve(v_new); // overwrite w_new
+                beta_new2 = v_new.dot(w_new); // compute beta_new
+                eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+                beta_new = sqrt(beta_new2); // compute beta_new
+                
+                // Givens rotation
+                const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
+                const RealScalar r3 =s_old*beta; // s, s_old, c and c_old are still from previous iteration
+                const RealScalar r1_hat=c*alpha-c_old*s*beta;
+                const RealScalar r1 =sqrt( std::pow(r1_hat,2) + std::pow(beta_new,2) );
+                c_old = c; // store for next iteration
+                s_old = s; // store for next iteration
+                c=r1_hat/r1; // new cosine
+                s=beta_new/r1; // new sine
+                
+                // Update solution
+                p_oold = p_old;
+                p_old = p;
+                p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
+                x += beta_one*c*eta*p;
+                
+                /* Update the squared residual. Note that this is the estimated residual.
+                The real residual |Ax-b|^2 may be slightly larger */
+                residualNorm2 *= s*s;
+                
+                if ( residualNorm2 < threshold2)
+                {
+                    break;
+                }
+                
+                eta=-s*eta; // update eta
+                iters++; // increment iteration number (for output purposes)
+            }
+            
+            /* Compute error. Note that this is the estimated error. The real 
+             error |Ax-b|/|b| may be slightly larger */
+            tol_error = std::sqrt(residualNorm2 / rhsNorm2);
+        }
+        
+    }
+    
+    template< typename _MatrixType, int _UpLo=Lower,
+    typename _Preconditioner = IdentityPreconditioner>
+    class MINRES;
+    
+    namespace internal {
+        
+        template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+        struct traits<MINRES<_MatrixType,_UpLo,_Preconditioner> >
+        {
+            typedef _MatrixType MatrixType;
+            typedef _Preconditioner Preconditioner;
+        };
+        
+    }
+    
+    /** \ingroup IterativeLinearSolvers_Module
+     * \brief A minimal residual solver for sparse symmetric problems
+     *
+     * This class allows to solve for A.x = b sparse linear problems using the MINRES algorithm
+     * of Paige and Saunders (1975). The sparse matrix A must be symmetric (possibly indefinite).
+     * The vectors x and b can be either dense or sparse.
+     *
+     * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+     * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
+     *               Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
+     * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+     *
+     * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+     * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+     * and NumTraits<Scalar>::epsilon() for the tolerance.
+     *
+     * This class can be used as the direct solver classes. Here is a typical usage example:
+     * \code
+     * int n = 10000;
+     * VectorXd x(n), b(n);
+     * SparseMatrix<double> A(n,n);
+     * // fill A and b
+     * MINRES<SparseMatrix<double> > mr;
+     * mr.compute(A);
+     * x = mr.solve(b);
+     * std::cout << "#iterations:     " << mr.iterations() << std::endl;
+     * std::cout << "estimated error: " << mr.error()      << std::endl;
+     * // update b, and solve again
+     * x = mr.solve(b);
+     * \endcode
+     *
+     * By default the iterations start with x=0 as an initial guess of the solution.
+     * One can control the start using the solveWithGuess() method.
+     *
+     * MINRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+     *
+     * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+     */
+    template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+    class MINRES : public IterativeSolverBase<MINRES<_MatrixType,_UpLo,_Preconditioner> >
+    {
+        
+        typedef IterativeSolverBase<MINRES> Base;
+        using Base::matrix;
+        using Base::m_error;
+        using Base::m_iterations;
+        using Base::m_info;
+        using Base::m_isInitialized;
+    public:
+        using Base::_solve_impl;
+        typedef _MatrixType MatrixType;
+        typedef typename MatrixType::Scalar Scalar;
+        typedef typename MatrixType::RealScalar RealScalar;
+        typedef _Preconditioner Preconditioner;
+        
+        enum {UpLo = _UpLo};
+        
+    public:
+        
+        /** Default constructor. */
+        MINRES() : Base() {}
+        
+        /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+         *
+         * This constructor is a shortcut for the default constructor followed
+         * by a call to compute().
+         *
+         * \warning this class stores a reference to the matrix A as well as some
+         * precomputed values that depend on it. Therefore, if \a A is changed
+         * this class becomes invalid. Call compute() to update it with the new
+         * matrix A, or modify a copy of A.
+         */
+        template<typename MatrixDerived>
+        explicit MINRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+        
+        /** Destructor. */
+        ~MINRES(){}
+
+        /** \internal */
+        template<typename Rhs,typename Dest>
+        void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+        {
+            typedef typename Base::MatrixWrapper MatrixWrapper;
+            typedef typename Base::ActualMatrixType ActualMatrixType;
+            enum {
+              TransposeInput  =   (!MatrixWrapper::MatrixFree)
+                              &&  (UpLo==(Lower|Upper))
+                              &&  (!MatrixType::IsRowMajor)
+                              &&  (!NumTraits<Scalar>::IsComplex)
+            };
+            typedef typename internal::conditional<TransposeInput,Transpose<const ActualMatrixType>, ActualMatrixType const&>::type RowMajorWrapper;
+            EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
+            typedef typename internal::conditional<UpLo==(Lower|Upper),
+                                                  RowMajorWrapper,
+                                                  typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
+                                            >::type SelfAdjointWrapper;
+
+            m_iterations = Base::maxIterations();
+            m_error = Base::m_tolerance;
+            RowMajorWrapper row_mat(matrix());
+            internal::minres(SelfAdjointWrapper(row_mat), b, x,
+                             Base::m_preconditioner, m_iterations, m_error);
+            m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+        }
+        
+    protected:
+        
+    };
+
+} // end namespace Eigen
+
+#endif // EIGEN_MINRES_H

diff --git a/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
new file mode 100644
index 0000000..9b3eb53
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/Scaling.h

@@ -0,0 +1,193 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam@inria.fr
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERSCALING_H
+#define EIGEN_ITERSCALING_H
+
+namespace Eigen {
+
+/**
+  * \ingroup IterativeSolvers_Module
+  * \brief iterative scaling algorithm to equilibrate rows and column norms in matrices
+  * 
+  * This class can be used as a preprocessing tool to accelerate the convergence of iterative methods 
+  * 
+  * This feature is  useful to limit the pivoting amount during LU/ILU factorization
+  * The  scaling strategy as presented here preserves the symmetry of the problem
+  * NOTE It is assumed that the matrix does not have empty row or column, 
+  * 
+  * Example with key steps 
+  * \code
+  * VectorXd x(n), b(n);
+  * SparseMatrix<double> A;
+  * // fill A and b;
+  * IterScaling<SparseMatrix<double> > scal; 
+  * // Compute the left and right scaling vectors. The matrix is equilibrated at output
+  * scal.computeRef(A); 
+  * // Scale the right hand side
+  * b = scal.LeftScaling().cwiseProduct(b); 
+  * // Now, solve the equilibrated linear system with any available solver
+  * 
+  * // Scale back the computed solution
+  * x = scal.RightScaling().cwiseProduct(x); 
+  * \endcode
+  * 
+  * \tparam _MatrixType the type of the matrix. It should be a real square sparsematrix
+  * 
+  * References : D. Ruiz and B. Ucar, A Symmetry Preserving Algorithm for Matrix Scaling, INRIA Research report RR-7552
+  * 
+  * \sa \ref IncompleteLUT 
+  */
+template<typename _MatrixType>
+class IterScaling
+{
+  public:
+    typedef _MatrixType MatrixType; 
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    
+  public:
+    IterScaling() { init(); }
+    
+    IterScaling(const MatrixType& matrix)
+    {
+      init();
+      compute(matrix);
+    }
+    
+    ~IterScaling() { }
+    
+    /** 
+     * Compute the left and right diagonal matrices to scale the input matrix @p mat
+     * 
+     * FIXME This algorithm will be modified such that the diagonal elements are permuted on the diagonal. 
+     * 
+     * \sa LeftScaling() RightScaling()
+     */
+    void compute (const MatrixType& mat)
+    {
+      using std::abs;
+      int m = mat.rows(); 
+      int n = mat.cols();
+      eigen_assert((m>0 && m == n) && "Please give a non - empty matrix");
+      m_left.resize(m); 
+      m_right.resize(n);
+      m_left.setOnes();
+      m_right.setOnes();
+      m_matrix = mat;
+      VectorXd Dr, Dc, DrRes, DcRes; // Temporary Left and right scaling vectors
+      Dr.resize(m); Dc.resize(n);
+      DrRes.resize(m); DcRes.resize(n);
+      double EpsRow = 1.0, EpsCol = 1.0;
+      int its = 0; 
+      do
+      { // Iterate until the infinite norm of each row and column is approximately 1
+        // Get the maximum value in each row and column
+        Dr.setZero(); Dc.setZero();
+        for (int k=0; k<m_matrix.outerSize(); ++k)
+        {
+          for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
+          {
+            if ( Dr(it.row()) < abs(it.value()) )
+              Dr(it.row()) = abs(it.value());
+            
+            if ( Dc(it.col()) < abs(it.value()) )
+              Dc(it.col()) = abs(it.value());
+          }
+        }
+        for (int i = 0; i < m; ++i) 
+        {
+          Dr(i) = std::sqrt(Dr(i));
+        }
+        for (int i = 0; i < n; ++i) 
+        {
+          Dc(i) = std::sqrt(Dc(i));
+        }
+        // Save the scaling factors 
+        for (int i = 0; i < m; ++i) 
+        {
+          m_left(i) /= Dr(i);
+        }
+        for (int i = 0; i < n; ++i) 
+        {
+          m_right(i) /= Dc(i);
+        }
+        // Scale the rows and the columns of the matrix
+        DrRes.setZero(); DcRes.setZero(); 
+        for (int k=0; k<m_matrix.outerSize(); ++k)
+        {
+          for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
+          {
+            it.valueRef() = it.value()/( Dr(it.row()) * Dc(it.col()) );
+            // Accumulate the norms of the row and column vectors   
+            if ( DrRes(it.row()) < abs(it.value()) )
+              DrRes(it.row()) = abs(it.value());
+            
+            if ( DcRes(it.col()) < abs(it.value()) )
+              DcRes(it.col()) = abs(it.value());
+          }
+        }  
+        DrRes.array() = (1-DrRes.array()).abs();
+        EpsRow = DrRes.maxCoeff();
+        DcRes.array() = (1-DcRes.array()).abs();
+        EpsCol = DcRes.maxCoeff();
+        its++;
+      }while ( (EpsRow >m_tol || EpsCol > m_tol) && (its < m_maxits) );
+      m_isInitialized = true;
+    }
+    /** Compute the left and right vectors to scale the vectors
+     * the input matrix is scaled with the computed vectors at output
+     * 
+     * \sa compute()
+     */
+    void computeRef (MatrixType& mat)
+    {
+      compute (mat);
+      mat = m_matrix;
+    }
+    /** Get the vector to scale the rows of the matrix 
+     */
+    VectorXd& LeftScaling()
+    {
+      return m_left;
+    }
+    
+    /** Get the vector to scale the columns of the matrix 
+     */
+    VectorXd& RightScaling()
+    {
+      return m_right;
+    }
+    
+    /** Set the tolerance for the convergence of the iterative scaling algorithm
+     */
+    void setTolerance(double tol)
+    {
+      m_tol = tol; 
+    }
+      
+  protected:
+    
+    void init()
+    {
+      m_tol = 1e-10;
+      m_maxits = 5;
+      m_isInitialized = false;
+    }
+    
+    MatrixType m_matrix;
+    mutable ComputationInfo m_info; 
+    bool m_isInitialized; 
+    VectorXd m_left; // Left scaling vector
+    VectorXd m_right; // m_right scaling vector
+    double m_tol; 
+    int m_maxits; // Maximum number of iterations allowed
+};
+}
+#endif

diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index b8f2cba..6a9b0be 100644
--- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h

@@ -31,7 +31,6 @@
   protected:
     typedef typename Traits::Lhs Lhs;
     typedef typename Traits::Rhs Rhs;
-    typedef typename Traits::Index Index;
 
   public:
     /*! \brief Constructor. */
@@ -134,7 +133,6 @@
 template<typename Dest>
 void KroneckerProduct<Lhs,Rhs>::evalTo(Dest& dst) const
 {
-  typedef typename Base::Index Index;
   const int BlockRows = Rhs::RowsAtCompileTime,
             BlockCols = Rhs::ColsAtCompileTime;
   const Index Br = m_B.rows(),
@@ -148,22 +146,33 @@
 template<typename Dest>
 void KroneckerProductSparse<Lhs,Rhs>::evalTo(Dest& dst) const
 {
-  typedef typename Base::Index Index;
-  const Index Br = m_B.rows(),
-              Bc = m_B.cols();
+  Index Br = m_B.rows(), Bc = m_B.cols();
   dst.resize(this->rows(), this->cols());
   dst.resizeNonZeros(0);
   
+  // 1 - evaluate the operands if needed:
+  typedef typename internal::nested_eval<Lhs,Dynamic>::type Lhs1;
+  typedef typename internal::remove_all<Lhs1>::type Lhs1Cleaned;
+  const Lhs1 lhs1(m_A);
+  typedef typename internal::nested_eval<Rhs,Dynamic>::type Rhs1;
+  typedef typename internal::remove_all<Rhs1>::type Rhs1Cleaned;
+  const Rhs1 rhs1(m_B);
+    
+  // 2 - construct respective iterators
+  typedef Eigen::InnerIterator<Lhs1Cleaned> LhsInnerIterator;
+  typedef Eigen::InnerIterator<Rhs1Cleaned> RhsInnerIterator;
+  
   // compute number of non-zeros per innervectors of dst
   {
+    // TODO VectorXi is not necessarily big enough!
     VectorXi nnzA = VectorXi::Zero(Dest::IsRowMajor ? m_A.rows() : m_A.cols());
     for (Index kA=0; kA < m_A.outerSize(); ++kA)
-      for (typename Lhs::InnerIterator itA(m_A,kA); itA; ++itA)
+      for (LhsInnerIterator itA(lhs1,kA); itA; ++itA)
         nnzA(Dest::IsRowMajor ? itA.row() : itA.col())++;
       
     VectorXi nnzB = VectorXi::Zero(Dest::IsRowMajor ? m_B.rows() : m_B.cols());
     for (Index kB=0; kB < m_B.outerSize(); ++kB)
-      for (typename Rhs::InnerIterator itB(m_B,kB); itB; ++itB)
+      for (RhsInnerIterator itB(rhs1,kB); itB; ++itB)
         nnzB(Dest::IsRowMajor ? itB.row() : itB.col())++;
     
     Matrix<int,Dynamic,Dynamic,ColMajor> nnzAB = nnzB * nnzA.transpose();
@@ -174,12 +183,12 @@
   {
     for (Index kB=0; kB < m_B.outerSize(); ++kB)
     {
-      for (typename Lhs::InnerIterator itA(m_A,kA); itA; ++itA)
+      for (LhsInnerIterator itA(lhs1,kA); itA; ++itA)
       {
-        for (typename Rhs::InnerIterator itB(m_B,kB); itB; ++itB)
+        for (RhsInnerIterator itB(rhs1,kB); itB; ++itB)
         {
-          const Index i = itA.row() * Br + itB.row(),
-                      j = itA.col() * Bc + itB.col();
+          Index i = itA.row() * Br + itB.row(),
+                j = itA.col() * Bc + itB.col();
           dst.insert(i,j) = itA.value() * itB.value();
         }
       }
@@ -194,15 +203,14 @@
 {
   typedef typename remove_all<_Lhs>::type Lhs;
   typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_index_type<typename Lhs::Index, typename Rhs::Index>::type Index;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
     Rows = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
     Cols = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
     MaxRows = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
-    MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
-    CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + NumTraits<Scalar>::MulCost
+    MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret
   };
 
   typedef Matrix<Scalar,Rows,Cols> ReturnType;
@@ -214,9 +222,9 @@
   typedef MatrixXpr XprKind;
   typedef typename remove_all<_Lhs>::type Lhs;
   typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename Lhs::Index, typename Rhs::Index>::type Index;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind;
+  typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
     LhsFlags = Lhs::Flags,
@@ -227,15 +235,15 @@
     MaxRowsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
     MaxColsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
 
-    EvalToRowMajor = (LhsFlags & RhsFlags & RowMajorBit),
+    EvalToRowMajor = (int(LhsFlags) & int(RhsFlags) & RowMajorBit),
     RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
 
-    Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit,
-    CoeffReadCost = Dynamic
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & RemovedBits)
+          | EvalBeforeNestingBit,
+    CoeffReadCost = HugeCost
   };
 
-  typedef SparseMatrix<Scalar> ReturnType;
+  typedef SparseMatrix<Scalar, 0, StorageIndex> ReturnType;
 };
 
 } // end namespace internal

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt b/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt
new file mode 100644
index 0000000..ae7984d
--- /dev/null
+++ b/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt

@@ -0,0 +1,52 @@
+Minpack Copyright Notice (1999) University of Chicago.  All rights reserved
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions of source code must retain the above
+copyright notice, this list of conditions and the following
+disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials
+provided with the distribution.
+
+3. The end-user documentation included with the
+redistribution, if any, must include the following
+acknowledgment:
+
+   "This product includes software developed by the
+   University of Chicago, as Operator of Argonne National
+   Laboratory.
+
+Alternately, this acknowledgment may appear in the software
+itself, if and wherever such third-party acknowledgments
+normally appear.
+
+4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
+WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
+UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
+THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
+OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
+OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
+USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
+THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
+DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
+UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
+BE CORRECTED.
+
+5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
+HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
+ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
+INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
+ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
+PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
+SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
+(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
+EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
+POSSIBILITY OF SUCH LOSS OR DAMAGES.
+

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h b/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
new file mode 100644
index 0000000..b75bea2
--- /dev/null
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h

@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMCOVAR_H
+#define EIGEN_LMCOVAR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void covar(
+        Matrix< Scalar, Dynamic, Dynamic > &r,
+        const VectorXi& ipvt,
+        Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
+{
+    using std::abs;
+    /* Local variables */
+    Index i, j, k, l, ii, jj;
+    bool sing;
+    Scalar temp;
+
+    /* Function Body */
+    const Index n = r.cols();
+    const Scalar tolr = tol * abs(r(0,0));
+    Matrix< Scalar, Dynamic, 1 > wa(n);
+    eigen_assert(ipvt.size()==n);
+
+    /* form the inverse of r in the full upper triangle of r. */
+    l = -1;
+    for (k = 0; k < n; ++k)
+        if (abs(r(k,k)) > tolr) {
+            r(k,k) = 1. / r(k,k);
+            for (j = 0; j <= k-1; ++j) {
+                temp = r(k,k) * r(j,k);
+                r(j,k) = 0.;
+                r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
+            }
+            l = k;
+        }
+
+    /* form the full upper triangle of the inverse of (r transpose)*r */
+    /* in the full upper triangle of r. */
+    for (k = 0; k <= l; ++k) {
+        for (j = 0; j <= k-1; ++j)
+            r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
+        r.col(k).head(k+1) *= r(k,k);
+    }
+
+    /* form the full lower triangle of the covariance matrix */
+    /* in the strict lower triangle of r and in wa. */
+    for (j = 0; j < n; ++j) {
+        jj = ipvt[j];
+        sing = j > l;
+        for (i = 0; i <= j; ++i) {
+            if (sing)
+                r(i,j) = 0.;
+            ii = ipvt[i];
+            if (ii > jj)
+                r(ii,jj) = r(i,j);
+            if (ii < jj)
+                r(jj,ii) = r(i,j);
+        }
+        wa[jj] = r(j,j);
+    }
+
+    /* symmetrize the covariance matrix in r. */
+    r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
+    r.diagonal() = wa;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMCOVAR_H

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h b/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h
new file mode 100644
index 0000000..25b32ec
--- /dev/null
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h

@@ -0,0 +1,202 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMONESTEP_H
+#define EIGEN_LMONESTEP_H
+
+namespace Eigen {
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
+{
+  using std::abs;
+  using std::sqrt;
+  RealScalar temp, temp1,temp2; 
+  RealScalar ratio; 
+  RealScalar pnorm, xnorm, fnorm1, actred, dirder, prered;
+  eigen_assert(x.size()==n); // check the caller is not cheating us
+
+  temp = 0.0; xnorm = 0.0;
+  /* calculate the jacobian matrix. */
+  Index df_ret = m_functor.df(x, m_fjac);
+  if (df_ret<0)
+      return LevenbergMarquardtSpace::UserAsked;
+  if (df_ret>0)
+      // numerical diff, we evaluated the function df_ret times
+      m_nfev += df_ret;
+  else m_njev++;
+
+  /* compute the qr factorization of the jacobian. */
+  for (int j = 0; j < x.size(); ++j)
+    m_wa2(j) = m_fjac.col(j).blueNorm();
+  QRSolver qrfac(m_fjac);
+  if(qrfac.info() != Success) {
+    m_info = NumericalIssue;
+    return LevenbergMarquardtSpace::ImproperInputParameters;
+  }
+  // Make a copy of the first factor with the associated permutation
+  m_rfactor = qrfac.matrixR();
+  m_permutation = (qrfac.colsPermutation());
+
+  /* on the first iteration and if external scaling is not used, scale according */
+  /* to the norms of the columns of the initial jacobian. */
+  if (m_iter == 1) {
+      if (!m_useExternalScaling)
+          for (Index j = 0; j < n; ++j)
+              m_diag[j] = (m_wa2[j]==0.)? 1. : m_wa2[j];
+
+      /* on the first iteration, calculate the norm of the scaled x */
+      /* and initialize the step bound m_delta. */
+      xnorm = m_diag.cwiseProduct(x).stableNorm();
+      m_delta = m_factor * xnorm;
+      if (m_delta == 0.)
+          m_delta = m_factor;
+  }
+
+  /* form (q transpose)*m_fvec and store the first n components in */
+  /* m_qtf. */
+  m_wa4 = m_fvec;
+  m_wa4 = qrfac.matrixQ().adjoint() * m_fvec; 
+  m_qtf = m_wa4.head(n);
+
+  /* compute the norm of the scaled gradient. */
+  m_gnorm = 0.;
+  if (m_fnorm != 0.)
+      for (Index j = 0; j < n; ++j)
+          if (m_wa2[m_permutation.indices()[j]] != 0.)
+              m_gnorm = (std::max)(m_gnorm, abs( m_rfactor.col(j).head(j+1).dot(m_qtf.head(j+1)/m_fnorm) / m_wa2[m_permutation.indices()[j]]));
+
+  /* test for convergence of the gradient norm. */
+  if (m_gnorm <= m_gtol) {
+    m_info = Success;
+    return LevenbergMarquardtSpace::CosinusTooSmall;
+  }
+
+  /* rescale if necessary. */
+  if (!m_useExternalScaling)
+      m_diag = m_diag.cwiseMax(m_wa2);
+
+  do {
+    /* determine the levenberg-marquardt parameter. */
+    internal::lmpar2(qrfac, m_diag, m_qtf, m_delta, m_par, m_wa1);
+
+    /* store the direction p and x + p. calculate the norm of p. */
+    m_wa1 = -m_wa1;
+    m_wa2 = x + m_wa1;
+    pnorm = m_diag.cwiseProduct(m_wa1).stableNorm();
+
+    /* on the first iteration, adjust the initial step bound. */
+    if (m_iter == 1)
+        m_delta = (std::min)(m_delta,pnorm);
+
+    /* evaluate the function at x + p and calculate its norm. */
+    if ( m_functor(m_wa2, m_wa4) < 0)
+        return LevenbergMarquardtSpace::UserAsked;
+    ++m_nfev;
+    fnorm1 = m_wa4.stableNorm();
+
+    /* compute the scaled actual reduction. */
+    actred = -1.;
+    if (Scalar(.1) * fnorm1 < m_fnorm)
+        actred = 1. - numext::abs2(fnorm1 / m_fnorm);
+
+    /* compute the scaled predicted reduction and */
+    /* the scaled directional derivative. */
+    m_wa3 = m_rfactor.template triangularView<Upper>() * (m_permutation.inverse() *m_wa1);
+    temp1 = numext::abs2(m_wa3.stableNorm() / m_fnorm);
+    temp2 = numext::abs2(sqrt(m_par) * pnorm / m_fnorm);
+    prered = temp1 + temp2 / Scalar(.5);
+    dirder = -(temp1 + temp2);
+
+    /* compute the ratio of the actual to the predicted */
+    /* reduction. */
+    ratio = 0.;
+    if (prered != 0.)
+        ratio = actred / prered;
+
+    /* update the step bound. */
+    if (ratio <= Scalar(.25)) {
+        if (actred >= 0.)
+            temp = RealScalar(.5);
+        if (actred < 0.)
+            temp = RealScalar(.5) * dirder / (dirder + RealScalar(.5) * actred);
+        if (RealScalar(.1) * fnorm1 >= m_fnorm || temp < RealScalar(.1))
+            temp = Scalar(.1);
+        /* Computing MIN */
+        m_delta = temp * (std::min)(m_delta, pnorm / RealScalar(.1));
+        m_par /= temp;
+    } else if (!(m_par != 0. && ratio < RealScalar(.75))) {
+        m_delta = pnorm / RealScalar(.5);
+        m_par = RealScalar(.5) * m_par;
+    }
+
+    /* test for successful iteration. */
+    if (ratio >= RealScalar(1e-4)) {
+        /* successful iteration. update x, m_fvec, and their norms. */
+        x = m_wa2;
+        m_wa2 = m_diag.cwiseProduct(x);
+        m_fvec = m_wa4;
+        xnorm = m_wa2.stableNorm();
+        m_fnorm = fnorm1;
+        ++m_iter;
+    }
+
+    /* tests for convergence. */
+    if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1. && m_delta <= m_xtol * xnorm)
+    {
+       m_info = Success;
+      return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+    }
+    if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1.) 
+    {
+      m_info = Success;
+      return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+    }
+    if (m_delta <= m_xtol * xnorm)
+    {
+      m_info = Success;
+      return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+    }
+
+    /* tests for termination and stringent tolerances. */
+    if (m_nfev >= m_maxfev) 
+    {
+      m_info = NoConvergence;
+      return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+    }
+    if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+    {
+      m_info = Success;
+      return LevenbergMarquardtSpace::FtolTooSmall;
+    }
+    if (m_delta <= NumTraits<Scalar>::epsilon() * xnorm) 
+    {
+      m_info = Success;
+      return LevenbergMarquardtSpace::XtolTooSmall;
+    }
+    if (m_gnorm <= NumTraits<Scalar>::epsilon())
+    {
+      m_info = Success;
+      return LevenbergMarquardtSpace::GtolTooSmall;
+    }
+
+  } while (ratio < Scalar(1e-4));
+
+  return LevenbergMarquardtSpace::Running;
+}
+
+  
+} // end namespace Eigen
+
+#endif // EIGEN_LMONESTEP_H

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h b/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
new file mode 100644
index 0000000..9a48365
--- /dev/null
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h

@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMPAR_H
+#define EIGEN_LMPAR_H
+
+namespace Eigen {
+
+namespace internal {
+  
+  template <typename QRSolver, typename VectorType>
+    void lmpar2(
+    const QRSolver &qr,
+    const VectorType  &diag,
+    const VectorType  &qtb,
+    typename VectorType::Scalar m_delta,
+    typename VectorType::Scalar &par,
+    VectorType  &x)
+
+  {
+    using std::sqrt;
+    using std::abs;
+    typedef typename QRSolver::MatrixType MatrixType;
+    typedef typename QRSolver::Scalar Scalar;
+//    typedef typename QRSolver::StorageIndex StorageIndex;
+
+    /* Local variables */
+    Index j;
+    Scalar fp;
+    Scalar parc, parl;
+    Index iter;
+    Scalar temp, paru;
+    Scalar gnorm;
+    Scalar dxnorm;
+    
+    // Make a copy of the triangular factor. 
+    // This copy is modified during call the qrsolv
+    MatrixType s;
+    s = qr.matrixR();
+
+    /* Function Body */
+    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+    const Index n = qr.matrixR().cols();
+    eigen_assert(n==diag.size());
+    eigen_assert(n==qtb.size());
+
+    VectorType  wa1, wa2;
+
+    /* compute and store in x the gauss-newton direction. if the */
+    /* jacobian is rank-deficient, obtain a least squares solution. */
+
+    //    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+    const Index rank = qr.rank(); // use a threshold
+    wa1 = qtb;
+    wa1.tail(n-rank).setZero();
+    //FIXME There is no solve in place for sparse triangularView
+    wa1.head(rank) = s.topLeftCorner(rank,rank).template triangularView<Upper>().solve(qtb.head(rank));
+
+    x = qr.colsPermutation()*wa1;
+
+    /* initialize the iteration counter. */
+    /* evaluate the function at the origin, and test */
+    /* for acceptance of the gauss-newton direction. */
+    iter = 0;
+    wa2 = diag.cwiseProduct(x);
+    dxnorm = wa2.blueNorm();
+    fp = dxnorm - m_delta;
+    if (fp <= Scalar(0.1) * m_delta) {
+      par = 0;
+      return;
+    }
+
+    /* if the jacobian is not rank deficient, the newton */
+    /* step provides a lower bound, parl, for the zero of */
+    /* the function. otherwise set this bound to zero. */
+    parl = 0.;
+    if (rank==n) {
+      wa1 = qr.colsPermutation().inverse() *  diag.cwiseProduct(wa2)/dxnorm;
+      s.topLeftCorner(n,n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+      temp = wa1.blueNorm();
+      parl = fp / m_delta / temp / temp;
+    }
+
+    /* calculate an upper bound, paru, for the zero of the function. */
+    for (j = 0; j < n; ++j)
+      wa1[j] = s.col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
+
+    gnorm = wa1.stableNorm();
+    paru = gnorm / m_delta;
+    if (paru == 0.)
+      paru = dwarf / (std::min)(m_delta,Scalar(0.1));
+
+    /* if the input par lies outside of the interval (parl,paru), */
+    /* set par to the closer endpoint. */
+    par = (std::max)(par,parl);
+    par = (std::min)(par,paru);
+    if (par == 0.)
+      par = gnorm / dxnorm;
+
+    /* beginning of an iteration. */
+    while (true) {
+      ++iter;
+
+      /* evaluate the function at the current value of par. */
+      if (par == 0.)
+        par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+      wa1 = sqrt(par)* diag;
+
+      VectorType sdiag(n);
+      lmqrsolv(s, qr.colsPermutation(), wa1, qtb, x, sdiag);
+
+      wa2 = diag.cwiseProduct(x);
+      dxnorm = wa2.blueNorm();
+      temp = fp;
+      fp = dxnorm - m_delta;
+
+      /* if the function is small enough, accept the current value */
+      /* of par. also test for the exceptional cases where parl */
+      /* is zero or the number of iterations has reached 10. */
+      if (abs(fp) <= Scalar(0.1) * m_delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+        break;
+
+      /* compute the newton correction. */
+      wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
+      // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+      for (j = 0; j < n; ++j) {
+        wa1[j] /= sdiag[j];
+        temp = wa1[j];
+        for (Index i = j+1; i < n; ++i)
+          wa1[i] -= s.coeff(i,j) * temp;
+      }
+      temp = wa1.blueNorm();
+      parc = fp / m_delta / temp / temp;
+
+      /* depending on the sign of the function, update parl or paru. */
+      if (fp > 0.)
+        parl = (std::max)(parl,par);
+      if (fp < 0.)
+        paru = (std::min)(paru,par);
+
+      /* compute an improved estimate for par. */
+      par = (std::max)(parl,par+parc);
+    }
+    if (iter == 0)
+      par = 0.;
+    return;
+  }
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMPAR_H

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
new file mode 100644
index 0000000..1234858
--- /dev/null
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h

@@ -0,0 +1,188 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMQRSOLV_H
+#define EIGEN_LMQRSOLV_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar,int Rows, int Cols, typename PermIndex>
+void lmqrsolv(
+  Matrix<Scalar,Rows,Cols> &s,
+  const PermutationMatrix<Dynamic,Dynamic,PermIndex> &iPerm,
+  const Matrix<Scalar,Dynamic,1> &diag,
+  const Matrix<Scalar,Dynamic,1> &qtb,
+  Matrix<Scalar,Dynamic,1> &x,
+  Matrix<Scalar,Dynamic,1> &sdiag)
+{
+    /* Local variables */
+    Index i, j, k;
+    Scalar temp;
+    Index n = s.cols();
+    Matrix<Scalar,Dynamic,1>  wa(n);
+    JacobiRotation<Scalar> givens;
+
+    /* Function Body */
+    // the following will only change the lower triangular part of s, including
+    // the diagonal, though the diagonal is restored afterward
+
+    /*     copy r and (q transpose)*b to preserve input and initialize s. */
+    /*     in particular, save the diagonal elements of r in x. */
+    x = s.diagonal();
+    wa = qtb;
+    
+   
+    s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
+    /*     eliminate the diagonal matrix d using a givens rotation. */
+    for (j = 0; j < n; ++j) {
+
+        /*        prepare the row of d to be eliminated, locating the */
+        /*        diagonal element using p from the qr factorization. */
+        const PermIndex l = iPerm.indices()(j);
+        if (diag[l] == 0.)
+            break;
+        sdiag.tail(n-j).setZero();
+        sdiag[j] = diag[l];
+
+        /*        the transformations to eliminate the row of d */
+        /*        modify only a single element of (q transpose)*b */
+        /*        beyond the first n, which is initially zero. */
+        Scalar qtbpj = 0.;
+        for (k = j; k < n; ++k) {
+            /*           determine a givens rotation which eliminates the */
+            /*           appropriate element in the current row of d. */
+            givens.makeGivens(-s(k,k), sdiag[k]);
+
+            /*           compute the modified diagonal element of r and */
+            /*           the modified element of ((q transpose)*b,0). */
+            s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
+            temp = givens.c() * wa[k] + givens.s() * qtbpj;
+            qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+            wa[k] = temp;
+
+            /*           accumulate the transformation in the row of s. */
+            for (i = k+1; i<n; ++i) {
+                temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
+                sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
+                s(i,k) = temp;
+            }
+        }
+    }
+  
+    /*     solve the triangular system for z. if the system is */
+    /*     singular, then obtain a least squares solution. */
+    Index nsing;
+    for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
+
+    wa.tail(n-nsing).setZero();
+    s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+  
+    // restore
+    sdiag = s.diagonal();
+    s.diagonal() = x;
+
+    /* permute the components of z back to components of x. */
+    x = iPerm * wa; 
+}
+
+template <typename Scalar, int _Options, typename Index>
+void lmqrsolv(
+  SparseMatrix<Scalar,_Options,Index> &s,
+  const PermutationMatrix<Dynamic,Dynamic> &iPerm,
+  const Matrix<Scalar,Dynamic,1> &diag,
+  const Matrix<Scalar,Dynamic,1> &qtb,
+  Matrix<Scalar,Dynamic,1> &x,
+  Matrix<Scalar,Dynamic,1> &sdiag)
+{
+  /* Local variables */
+  typedef SparseMatrix<Scalar,RowMajor,Index> FactorType;
+    Index i, j, k, l;
+    Scalar temp;
+    Index n = s.cols();
+    Matrix<Scalar,Dynamic,1>  wa(n);
+    JacobiRotation<Scalar> givens;
+
+    /* Function Body */
+    // the following will only change the lower triangular part of s, including
+    // the diagonal, though the diagonal is restored afterward
+
+    /*     copy r and (q transpose)*b to preserve input and initialize R. */
+    wa = qtb;
+    FactorType R(s);
+    // Eliminate the diagonal matrix d using a givens rotation
+    for (j = 0; j < n; ++j)
+    {
+      // Prepare the row of d to be eliminated, locating the 
+      // diagonal element using p from the qr factorization
+      l = iPerm.indices()(j);
+      if (diag(l) == Scalar(0)) 
+        break; 
+      sdiag.tail(n-j).setZero();
+      sdiag[j] = diag[l];
+      // the transformations to eliminate the row of d
+      // modify only a single element of (q transpose)*b
+      // beyond the first n, which is initially zero. 
+      
+      Scalar qtbpj = 0; 
+      // Browse the nonzero elements of row j of the upper triangular s
+      for (k = j; k < n; ++k)
+      {
+        typename FactorType::InnerIterator itk(R,k);
+        for (; itk; ++itk){
+          if (itk.index() < k) continue;
+          else break;
+        }
+        //At this point, we have the diagonal element R(k,k)
+        // Determine a givens rotation which eliminates 
+        // the appropriate element in the current row of d
+        givens.makeGivens(-itk.value(), sdiag(k));
+        
+        // Compute the modified diagonal element of r and 
+        // the modified element of ((q transpose)*b,0).
+        itk.valueRef() = givens.c() * itk.value() + givens.s() * sdiag(k);
+        temp = givens.c() * wa(k) + givens.s() * qtbpj; 
+        qtbpj = -givens.s() * wa(k) + givens.c() * qtbpj;
+        wa(k) = temp;
+        
+        // Accumulate the transformation in the remaining k row/column of R
+        for (++itk; itk; ++itk)
+        {
+          i = itk.index();
+          temp = givens.c() *  itk.value() + givens.s() * sdiag(i);
+          sdiag(i) = -givens.s() * itk.value() + givens.c() * sdiag(i);
+          itk.valueRef() = temp;
+        }
+      }
+    }
+    
+    // Solve the triangular system for z. If the system is 
+    // singular, then obtain a least squares solution
+    Index nsing;
+    for(nsing = 0; nsing<n && sdiag(nsing) !=0; nsing++) {}
+    
+    wa.tail(n-nsing).setZero();
+//     x = wa; 
+    wa.head(nsing) = R.topLeftCorner(nsing,nsing).template triangularView<Upper>().solve/*InPlace*/(wa.head(nsing));
+    
+    sdiag = R.diagonal();
+    // Permute the components of z back to components of x
+    x = iPerm * wa; 
+}
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMQRSOLV_H

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
new file mode 100644
index 0000000..62561da
--- /dev/null
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h

@@ -0,0 +1,396 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+//
+// The algorithm of this class initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT_H
+#define EIGEN_LEVENBERGMARQUARDT_H
+
+
+namespace Eigen {
+namespace LevenbergMarquardtSpace {
+    enum Status {
+        NotStarted = -2,
+        Running = -1,
+        ImproperInputParameters = 0,
+        RelativeReductionTooSmall = 1,
+        RelativeErrorTooSmall = 2,
+        RelativeErrorAndReductionTooSmall = 3,
+        CosinusTooSmall = 4,
+        TooManyFunctionEvaluation = 5,
+        FtolTooSmall = 6,
+        XtolTooSmall = 7,
+        GtolTooSmall = 8,
+        UserAsked = 9
+    };
+}
+
+template <typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct DenseFunctor
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+  typedef ColPivHouseholderQR<JacobianType> QRSolver;
+  const int m_inputs, m_values;
+
+  DenseFunctor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  DenseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+  //int operator()(const InputType &x, ValueType& fvec) { }
+  // should be defined in derived classes
+  
+  //int df(const InputType &x, JacobianType& fjac) { }
+  // should be defined in derived classes
+};
+
+template <typename _Scalar, typename _Index>
+struct SparseFunctor
+{
+  typedef _Scalar Scalar;
+  typedef _Index Index;
+  typedef Matrix<Scalar,Dynamic,1> InputType;
+  typedef Matrix<Scalar,Dynamic,1> ValueType;
+  typedef SparseMatrix<Scalar, ColMajor, Index> JacobianType;
+  typedef SparseQR<JacobianType, COLAMDOrdering<int> > QRSolver;
+  enum {
+    InputsAtCompileTime = Dynamic,
+    ValuesAtCompileTime = Dynamic
+  };
+  
+  SparseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+  
+  const int m_inputs, m_values;
+  //int operator()(const InputType &x, ValueType& fvec) { }
+  // to be defined in the functor
+  
+  //int df(const InputType &x, JacobianType& fjac) { }
+  // to be defined in the functor if no automatic differentiation
+  
+};
+namespace internal {
+template <typename QRSolver, typename VectorType>
+void lmpar2(const QRSolver &qr, const VectorType  &diag, const VectorType  &qtb,
+	    typename VectorType::Scalar m_delta, typename VectorType::Scalar &par,
+	    VectorType  &x);
+    }
+/**
+  * \ingroup NonLinearOptimization_Module
+  * \brief Performs non linear optimization over a non-linear function,
+  * using a variant of the Levenberg Marquardt algorithm.
+  *
+  * Check wikipedia for more information.
+  * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+  */
+template<typename _FunctorType>
+class LevenbergMarquardt : internal::no_assignment_operator
+{
+  public:
+    typedef _FunctorType FunctorType;
+    typedef typename FunctorType::QRSolver QRSolver;
+    typedef typename FunctorType::JacobianType JacobianType;
+    typedef typename JacobianType::Scalar Scalar;
+    typedef typename JacobianType::RealScalar RealScalar; 
+    typedef typename QRSolver::StorageIndex PermIndex;
+    typedef Matrix<Scalar,Dynamic,1> FVectorType;
+    typedef PermutationMatrix<Dynamic,Dynamic,int> PermutationType;
+  public:
+    LevenbergMarquardt(FunctorType& functor) 
+    : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
+      m_isInitialized(false),m_info(InvalidInput)
+    {
+      resetParameters();
+      m_useExternalScaling=false; 
+    }
+    
+    LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+    LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+    LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+    LevenbergMarquardtSpace::Status lmder1(
+      FVectorType  &x, 
+      const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+    );
+    static LevenbergMarquardtSpace::Status lmdif1(
+            FunctorType &functor,
+            FVectorType  &x,
+            Index *nfev,
+            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            );
+    
+    /** Sets the default parameters */
+    void resetParameters() 
+    {
+      using std::sqrt;        
+
+      m_factor = 100.; 
+      m_maxfev = 400; 
+      m_ftol = sqrt(NumTraits<RealScalar>::epsilon());
+      m_xtol = sqrt(NumTraits<RealScalar>::epsilon());
+      m_gtol = 0. ; 
+      m_epsfcn = 0. ;
+    }
+    
+    /** Sets the tolerance for the norm of the solution vector*/
+    void setXtol(RealScalar xtol) { m_xtol = xtol; }
+    
+    /** Sets the tolerance for the norm of the vector function*/
+    void setFtol(RealScalar ftol) { m_ftol = ftol; }
+    
+    /** Sets the tolerance for the norm of the gradient of the error vector*/
+    void setGtol(RealScalar gtol) { m_gtol = gtol; }
+    
+    /** Sets the step bound for the diagonal shift */
+    void setFactor(RealScalar factor) { m_factor = factor; }    
+    
+    /** Sets the error precision  */
+    void setEpsilon (RealScalar epsfcn) { m_epsfcn = epsfcn; }
+    
+    /** Sets the maximum number of function evaluation */
+    void setMaxfev(Index maxfev) {m_maxfev = maxfev; }
+    
+    /** Use an external Scaling. If set to true, pass a nonzero diagonal to diag() */
+    void setExternalScaling(bool value) {m_useExternalScaling  = value; }
+    
+    /** \returns the tolerance for the norm of the solution vector */
+    RealScalar xtol() const {return m_xtol; }
+    
+    /** \returns the tolerance for the norm of the vector function */
+    RealScalar ftol() const {return m_ftol; }
+    
+    /** \returns the tolerance for the norm of the gradient of the error vector */
+    RealScalar gtol() const {return m_gtol; }
+    
+    /** \returns the step bound for the diagonal shift */
+    RealScalar factor() const {return m_factor; }
+    
+    /** \returns the error precision */
+    RealScalar epsilon() const {return m_epsfcn; }
+    
+    /** \returns the maximum number of function evaluation */
+    Index maxfev() const {return m_maxfev; }
+    
+    /** \returns a reference to the diagonal of the jacobian */
+    FVectorType& diag() {return m_diag; }
+    
+    /** \returns the number of iterations performed */
+    Index iterations() { return m_iter; }
+    
+    /** \returns the number of functions evaluation */
+    Index nfev() { return m_nfev; }
+    
+    /** \returns the number of jacobian evaluation */
+    Index njev() { return m_njev; }
+    
+    /** \returns the norm of current vector function */
+    RealScalar fnorm() {return m_fnorm; }
+    
+    /** \returns the norm of the gradient of the error */
+    RealScalar gnorm() {return m_gnorm; }
+    
+    /** \returns the LevenbergMarquardt parameter */
+    RealScalar lm_param(void) { return m_par; }
+    
+    /** \returns a reference to the  current vector function 
+     */
+    FVectorType& fvec() {return m_fvec; }
+    
+    /** \returns a reference to the matrix where the current Jacobian matrix is stored
+     */
+    JacobianType& jacobian() {return m_fjac; }
+    
+    /** \returns a reference to the triangular matrix R from the QR of the jacobian matrix.
+     * \sa jacobian()
+     */
+    JacobianType& matrixR() {return m_rfactor; }
+    
+    /** the permutation used in the QR factorization
+     */
+    PermutationType permutation() {return m_permutation; }
+    
+    /** 
+     * \brief Reports whether the minimization was successful
+     * \returns \c Success if the minimization was successful,
+     *         \c NumericalIssue if a numerical problem arises during the 
+     *          minimization process, for example during the QR factorization
+     *         \c NoConvergence if the minimization did not converge after 
+     *          the maximum number of function evaluation allowed
+     *          \c InvalidInput if the input matrix is invalid
+     */
+    ComputationInfo info() const
+    {
+      
+      return m_info;
+    }
+  private:
+    JacobianType m_fjac; 
+    JacobianType m_rfactor; // The triangular matrix R from the QR of the jacobian matrix m_fjac
+    FunctorType &m_functor;
+    FVectorType m_fvec, m_qtf, m_diag; 
+    Index n;
+    Index m; 
+    Index m_nfev;
+    Index m_njev; 
+    RealScalar m_fnorm; // Norm of the current vector function
+    RealScalar m_gnorm; //Norm of the gradient of the error 
+    RealScalar m_factor; //
+    Index m_maxfev; // Maximum number of function evaluation
+    RealScalar m_ftol; //Tolerance in the norm of the vector function
+    RealScalar m_xtol; // 
+    RealScalar m_gtol; //tolerance of the norm of the error gradient
+    RealScalar m_epsfcn; //
+    Index m_iter; // Number of iterations performed
+    RealScalar m_delta;
+    bool m_useExternalScaling;
+    PermutationType m_permutation;
+    FVectorType m_wa1, m_wa2, m_wa3, m_wa4; //Temporary vectors
+    RealScalar m_par;
+    bool m_isInitialized; // Check whether the minimization step has been called
+    ComputationInfo m_info; 
+};
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::minimize(FVectorType  &x)
+{
+    LevenbergMarquardtSpace::Status status = minimizeInit(x);
+    if (status==LevenbergMarquardtSpace::ImproperInputParameters) {
+      m_isInitialized = true;
+      return status;
+    }
+    do {
+//       std::cout << " uv " << x.transpose() << "\n";
+        status = minimizeOneStep(x);
+    } while (status==LevenbergMarquardtSpace::Running);
+     m_isInitialized = true;
+     return status;
+}
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType  &x)
+{
+    n = x.size();
+    m = m_functor.values();
+
+    m_wa1.resize(n); m_wa2.resize(n); m_wa3.resize(n);
+    m_wa4.resize(m);
+    m_fvec.resize(m);
+    //FIXME Sparse Case : Allocate space for the jacobian
+    m_fjac.resize(m, n);
+//     m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
+    if (!m_useExternalScaling)
+        m_diag.resize(n);
+    eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+    m_qtf.resize(n);
+
+    /* Function Body */
+    m_nfev = 0;
+    m_njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || m < n || m_ftol < 0. || m_xtol < 0. || m_gtol < 0. || m_maxfev <= 0 || m_factor <= 0.){
+      m_info = InvalidInput;
+      return LevenbergMarquardtSpace::ImproperInputParameters;
+    }
+
+    if (m_useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (m_diag[j] <= 0.) 
+            {
+              m_info = InvalidInput;
+              return LevenbergMarquardtSpace::ImproperInputParameters;
+            }
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    m_nfev = 1;
+    if ( m_functor(x, m_fvec) < 0)
+        return LevenbergMarquardtSpace::UserAsked;
+    m_fnorm = m_fvec.stableNorm();
+
+    /*     initialize levenberg-marquardt parameter and iteration counter. */
+    m_par = 0.;
+    m_iter = 1;
+
+    return LevenbergMarquardtSpace::NotStarted;
+}
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::lmder1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+    m = m_functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    resetParameters();
+    m_ftol = tol;
+    m_xtol = tol;
+    m_maxfev = 100*(n+1);
+
+    return minimize(x);
+}
+
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::lmdif1(
+        FunctorType &functor,
+        FVectorType  &x,
+        Index *nfev,
+        const Scalar tol
+        )
+{
+    Index n = x.size();
+    Index m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    NumericalDiff<FunctorType> numDiff(functor);
+    // embedded LevenbergMarquardt
+    LevenbergMarquardt<NumericalDiff<FunctorType> > lm(numDiff);
+    lm.setFtol(tol);
+    lm.setXtol(tol);
+    lm.setMaxfev(200*(n+1));
+
+    LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+    if (nfev)
+        * nfev = lm.nfev();
+    return info;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEVENBERGMARQUARDT_H

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
new file mode 100644
index 0000000..02284b0
--- /dev/null
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h

@@ -0,0 +1,441 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009, 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_EXPONENTIAL
+#define EIGEN_MATRIX_EXPONENTIAL
+
+#include "StemFunction.h"
+
+namespace Eigen {
+namespace internal {
+
+/** \brief Scaling operator.
+ *
+ * This struct is used by CwiseUnaryOp to scale a matrix by \f$ 2^{-s} \f$.
+ */
+template <typename RealScalar>
+struct MatrixExponentialScalingOp
+{
+  /** \brief Constructor.
+   *
+   * \param[in] squarings  The integer \f$ s \f$ in this document.
+   */
+  MatrixExponentialScalingOp(int squarings) : m_squarings(squarings) { }
+
+
+  /** \brief Scale a matrix coefficient.
+   *
+   * \param[in,out] x  The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+   */
+  inline const RealScalar operator() (const RealScalar& x) const
+  {
+    using std::ldexp;
+    return ldexp(x, -m_squarings);
+  }
+
+  typedef std::complex<RealScalar> ComplexScalar;
+
+  /** \brief Scale a matrix coefficient.
+   *
+   * \param[in,out] x  The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+   */
+  inline const ComplexScalar operator() (const ComplexScalar& x) const
+  {
+    using std::ldexp;
+    return ComplexScalar(ldexp(x.real(), -m_squarings), ldexp(x.imag(), -m_squarings));
+  }
+
+  private:
+    int m_squarings;
+};
+
+/** \brief Compute the (3,3)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V)
+{
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatA>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (5,5)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V)
+{
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (7,7)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V)
+{
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType tmp = b[7] * A6 + b[5] * A4 + b[3] * A2 
+    + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+
+}
+
+/** \brief Compute the (9,9)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V)
+{
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
+                          2162160.L, 110880.L, 3960.L, 90.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType A8 = A6 * A2;
+  const MatrixType tmp = b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 
+    + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (13,13)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V)
+{
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L,
+                          1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L,
+                          33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  V = b[13] * A6 + b[11] * A4 + b[9] * A2; // used for temporary storage
+  MatrixType tmp = A6 * V;
+  tmp += b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  tmp = b[12] * A6 + b[10] * A4 + b[8] * A2;
+  V.noalias() = A6 * tmp;
+  V += b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (17,17)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ *
+ *  This function activates only if your long double is double-double or quadruple.
+ */
+#if LDBL_MANT_DIG > 64
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V)
+{
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L,
+                          100610229646136770560000.L, 15720348382208870400000.L,
+                          1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L,
+                          595373117923584000.L, 27563570274240000.L, 1060137318240000.L,
+                          33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L,
+                          46512.L, 306.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType A8 = A4 * A4;
+  V = b[17] * A8 + b[15] * A6 + b[13] * A4 + b[11] * A2; // used for temporary storage
+  MatrixType tmp = A8 * V;
+  tmp += b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 
+    + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  tmp = b[16] * A8 + b[14] * A6 + b[12] * A4 + b[10] * A2;
+  V.noalias() = tmp * A8;
+  V += b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 
+    + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+#endif
+
+template <typename MatrixType, typename RealScalar = typename NumTraits<typename traits<MatrixType>::Scalar>::Real>
+struct matrix_exp_computeUV
+{
+  /** \brief Compute Pad&eacute; approximant to the exponential.
+    *
+    * Computes \c U, \c V and \c squarings such that \f$ (V+U)(V-U)^{-1} \f$ is a Pad&eacute;
+    * approximant of \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$, where \f$ M \f$
+    * denotes the matrix \c arg. The degree of the Pad&eacute; approximant and the value of squarings
+    * are chosen such that the approximation error is no more than the round-off error.
+    */
+  static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings);
+};
+
+template <typename MatrixType>
+struct matrix_exp_computeUV<MatrixType, float>
+{
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  {
+    using std::frexp;
+    using std::pow;
+    const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+    if (l1norm < 4.258730016922831e-001f) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 1.880152677804762e+000f) {
+      matrix_exp_pade5(arg, U, V);
+    } else {
+      const float maxnorm = 3.925724783138660f;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<float>(squarings));
+      matrix_exp_pade7(A, U, V);
+    }
+  }
+};
+
+template <typename MatrixType>
+struct matrix_exp_computeUV<MatrixType, double>
+{
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  {
+    using std::frexp;
+    using std::pow;
+    const RealScalar l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+    if (l1norm < 1.495585217958292e-002) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 2.539398330063230e-001) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 9.504178996162932e-001) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.097847961257068e+000) {
+      matrix_exp_pade9(arg, U, V);
+    } else {
+      const RealScalar maxnorm = 5.371920351148152;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<RealScalar>(squarings));
+      matrix_exp_pade13(A, U, V);
+    }
+  }
+};
+  
+template <typename MatrixType>
+struct matrix_exp_computeUV<MatrixType, long double>
+{
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  {
+#if   LDBL_MANT_DIG == 53   // double precision
+    matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
+  
+#else
+  
+    using std::frexp;
+    using std::pow;
+    const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+  
+#if LDBL_MANT_DIG <= 64   // extended precision
+  
+    if (l1norm < 4.1968497232266989671e-003L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 1.1848116734693823091e-001L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 5.5170388480686700274e-001L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 1.3759868875587845383e+000L) {
+      matrix_exp_pade9(arg, U, V);
+    } else {
+      const long double maxnorm = 4.0246098906697353063L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+      matrix_exp_pade13(A, U, V);
+    }
+  
+#elif LDBL_MANT_DIG <= 106  // double-double
+  
+    if (l1norm < 3.2787892205607026992947488108213e-005L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 6.4467025060072760084130906076332e-003L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 6.8988028496595374751374122881143e-002L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.7339737518502231741495857201670e-001L) {
+      matrix_exp_pade9(arg, U, V);
+    } else if (l1norm < 1.3203382096514474905666448850278e+000L) {
+      matrix_exp_pade13(arg, U, V);
+    } else {
+      const long double maxnorm = 3.2579440895405400856599663723517L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+      matrix_exp_pade17(A, U, V);
+    }
+  
+#elif LDBL_MANT_DIG <= 113  // quadruple precision
+  
+    if (l1norm < 1.639394610288918690547467954466970e-005L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 4.253237712165275566025884344433009e-003L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 5.125804063165764409885122032933142e-002L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.170000765161155195453205651889853e-001L) {
+      matrix_exp_pade9(arg, U, V);
+    } else if (l1norm < 1.125358383453143065081397882891878e+000L) {
+      matrix_exp_pade13(arg, U, V);
+    } else {
+      const long double maxnorm = 2.884233277829519311757165057717815L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+      matrix_exp_pade17(A, U, V);
+    }
+  
+#else
+  
+    // this case should be handled in compute()
+    eigen_assert(false && "Bug in MatrixExponential"); 
+  
+#endif
+#endif  // LDBL_MANT_DIG
+  }
+};
+
+template<typename T> struct is_exp_known_type : false_type {};
+template<> struct is_exp_known_type<float> : true_type {};
+template<> struct is_exp_known_type<double> : true_type {};
+#if LDBL_MANT_DIG <= 113
+template<> struct is_exp_known_type<long double> : true_type {};
+#endif
+
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type
+{
+  typedef typename ArgType::PlainObject MatrixType;
+  MatrixType U, V;
+  int squarings;
+  matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
+  MatrixType numer = U + V;
+  MatrixType denom = -U + V;
+  result = denom.partialPivLu().solve(numer);
+  for (int i=0; i<squarings; i++)
+    result *= result;   // undo scaling by repeated squaring
+}
+
+
+/* Computes the matrix exponential
+ *
+ * \param arg    argument of matrix exponential (should be plain object)
+ * \param result variable in which result will be stored
+ */
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default
+{
+  typedef typename ArgType::PlainObject MatrixType;
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename std::complex<RealScalar> ComplexScalar;
+  result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
+}
+
+} // end namespace Eigen::internal
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix exponential of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix exponential.
+  *
+  * This class holds the argument to the matrix exponential until it is assigned or evaluated for
+  * some other reason (so the argument should not be changed in the meantime). It is the return type
+  * of MatrixBase::exp() and most of the time this is the only way it is used.
+  */
+template<typename Derived> struct MatrixExponentialReturnValue
+: public ReturnByValue<MatrixExponentialReturnValue<Derived> >
+{
+  public:
+    /** \brief Constructor.
+      *
+      * \param src %Matrix (expression) forming the argument of the matrix exponential.
+      */
+    MatrixExponentialReturnValue(const Derived& src) : m_src(src) { }
+
+    /** \brief Compute the matrix exponential.
+      *
+      * \param result the matrix exponential of \p src in the constructor.
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      const typename internal::nested_eval<Derived, 10>::type tmp(m_src);
+      internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type<typename Derived::RealScalar>());
+    }
+
+    Index rows() const { return m_src.rows(); }
+    Index cols() const { return m_src.cols(); }
+
+  protected:
+    const typename internal::ref_selector<Derived>::type m_src;
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixExponentialReturnValue<Derived> >
+{
+  typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+template <typename Derived>
+const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const
+{
+  eigen_assert(rows() == cols());
+  return MatrixExponentialReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_EXPONENTIAL

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
new file mode 100644
index 0000000..cc12ab6
--- /dev/null
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h

@@ -0,0 +1,569 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_FUNCTION_H
+#define EIGEN_MATRIX_FUNCTION_H
+
+#include "StemFunction.h"
+
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \brief Maximum distance allowed between eigenvalues to be considered "close". */
+static const float matrix_function_separation = 0.1f;
+
+/** \ingroup MatrixFunctions_Module
+  * \class MatrixFunctionAtomic
+  * \brief Helper class for computing matrix functions of atomic matrices.
+  *
+  * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
+  */
+template <typename MatrixType>
+class MatrixFunctionAtomic 
+{
+  public:
+
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename stem_function<Scalar>::type StemFunction;
+
+    /** \brief Constructor
+      * \param[in]  f  matrix function to compute.
+      */
+    MatrixFunctionAtomic(StemFunction f) : m_f(f) { }
+
+    /** \brief Compute matrix function of atomic matrix
+      * \param[in]  A  argument of matrix function, should be upper triangular and atomic
+      * \returns  f(A), the matrix function evaluated at the given matrix
+      */
+    MatrixType compute(const MatrixType& A);
+
+  private:
+    StemFunction* m_f;
+};
+
+template <typename MatrixType>
+typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
+{
+  typedef typename plain_col_type<MatrixType>::type VectorType;
+  Index rows = A.rows();
+  const MatrixType N = MatrixType::Identity(rows, rows) - A;
+  VectorType e = VectorType::Ones(rows);
+  N.template triangularView<Upper>().solveInPlace(e);
+  return e.cwiseAbs().maxCoeff();
+}
+
+template <typename MatrixType>
+MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
+{
+  // TODO: Use that A is upper triangular
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index rows = A.rows();
+  Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
+  MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
+  RealScalar mu = matrix_function_compute_mu(Ashifted);
+  MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
+  MatrixType P = Ashifted;
+  MatrixType Fincr;
+  for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary
+    Fincr = m_f(avgEival, static_cast<int>(s)) * P;
+    F += Fincr;
+    P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted;
+
+    // test whether Taylor series converged
+    const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
+    const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
+    if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
+      RealScalar delta = 0;
+      RealScalar rfactorial = 1;
+      for (Index r = 0; r < rows; r++) {
+        RealScalar mx = 0;
+        for (Index i = 0; i < rows; i++)
+          mx = (std::max)(mx, std::abs(m_f(Ashifted(i, i) + avgEival, static_cast<int>(s+r))));
+        if (r != 0)
+          rfactorial *= RealScalar(r);
+        delta = (std::max)(delta, mx / rfactorial);
+      }
+      const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
+      if (mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm) // series converged
+        break;
+    }
+  }
+  return F;
+}
+
+/** \brief Find cluster in \p clusters containing some value 
+  * \param[in] key Value to find
+  * \returns Iterator to cluster containing \p key, or \c clusters.end() if no cluster in \p m_clusters
+  * contains \p key.
+  */
+template <typename Index, typename ListOfClusters>
+typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOfClusters& clusters)
+{
+  typename std::list<Index>::iterator j;
+  for (typename ListOfClusters::iterator i = clusters.begin(); i != clusters.end(); ++i) {
+    j = std::find(i->begin(), i->end(), key);
+    if (j != i->end())
+      return i;
+  }
+  return clusters.end();
+}
+
+/** \brief Partition eigenvalues in clusters of ei'vals close to each other
+  * 
+  * \param[in]  eivals    Eigenvalues
+  * \param[out] clusters  Resulting partition of eigenvalues
+  *
+  * The partition satisfies the following two properties:
+  * # Any eigenvalue in a certain cluster is at most matrix_function_separation() away from another eigenvalue
+  *   in the same cluster.
+  * # The distance between two eigenvalues in different clusters is more than matrix_function_separation().  
+  * The implementation follows Algorithm 4.1 in the paper of Davies and Higham.
+  */
+template <typename EivalsType, typename Cluster>
+void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
+{
+  typedef typename EivalsType::RealScalar RealScalar;
+  for (Index i=0; i<eivals.rows(); ++i) {
+    // Find cluster containing i-th ei'val, adding a new cluster if necessary
+    typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
+    if (qi == clusters.end()) {
+      Cluster l;
+      l.push_back(i);
+      clusters.push_back(l);
+      qi = clusters.end();
+      --qi;
+    }
+
+    // Look for other element to add to the set
+    for (Index j=i+1; j<eivals.rows(); ++j) {
+      if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation)
+          && std::find(qi->begin(), qi->end(), j) == qi->end()) {
+        typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
+        if (qj == clusters.end()) {
+          qi->push_back(j);
+        } else {
+          qi->insert(qi->end(), qj->begin(), qj->end());
+          clusters.erase(qj);
+        }
+      }
+    }
+  }
+}
+
+/** \brief Compute size of each cluster given a partitioning */
+template <typename ListOfClusters, typename Index>
+void matrix_function_compute_cluster_size(const ListOfClusters& clusters, Matrix<Index, Dynamic, 1>& clusterSize)
+{
+  const Index numClusters = static_cast<Index>(clusters.size());
+  clusterSize.setZero(numClusters);
+  Index clusterIndex = 0;
+  for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+    clusterSize[clusterIndex] = cluster->size();
+    ++clusterIndex;
+  }
+}
+
+/** \brief Compute start of each block using clusterSize */
+template <typename VectorType>
+void matrix_function_compute_block_start(const VectorType& clusterSize, VectorType& blockStart)
+{
+  blockStart.resize(clusterSize.rows());
+  blockStart(0) = 0;
+  for (Index i = 1; i < clusterSize.rows(); i++) {
+    blockStart(i) = blockStart(i-1) + clusterSize(i-1);
+  }
+}
+
+/** \brief Compute mapping of eigenvalue indices to cluster indices */
+template <typename EivalsType, typename ListOfClusters, typename VectorType>
+void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
+{
+  eivalToCluster.resize(eivals.rows());
+  Index clusterIndex = 0;
+  for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+    for (Index i = 0; i < eivals.rows(); ++i) {
+      if (std::find(cluster->begin(), cluster->end(), i) != cluster->end()) {
+        eivalToCluster[i] = clusterIndex;
+      }
+    }
+    ++clusterIndex;
+  }
+}
+
+/** \brief Compute permutation which groups ei'vals in same cluster together */
+template <typename DynVectorType, typename VectorType>
+void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
+{
+  DynVectorType indexNextEntry = blockStart;
+  permutation.resize(eivalToCluster.rows());
+  for (Index i = 0; i < eivalToCluster.rows(); i++) {
+    Index cluster = eivalToCluster[i];
+    permutation[i] = indexNextEntry[cluster];
+    ++indexNextEntry[cluster];
+  }
+}  
+
+/** \brief Permute Schur decomposition in U and T according to permutation */
+template <typename VectorType, typename MatrixType>
+void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
+{
+  for (Index i = 0; i < permutation.rows() - 1; i++) {
+    Index j;
+    for (j = i; j < permutation.rows(); j++) {
+      if (permutation(j) == i) break;
+    }
+    eigen_assert(permutation(j) == i);
+    for (Index k = j-1; k >= i; k--) {
+      JacobiRotation<typename MatrixType::Scalar> rotation;
+      rotation.makeGivens(T(k, k+1), T(k+1, k+1) - T(k, k));
+      T.applyOnTheLeft(k, k+1, rotation.adjoint());
+      T.applyOnTheRight(k, k+1, rotation);
+      U.applyOnTheRight(k, k+1, rotation);
+      std::swap(permutation.coeffRef(k), permutation.coeffRef(k+1));
+    }
+  }
+}
+
+/** \brief Compute block diagonal part of matrix function.
+  *
+  * This routine computes the matrix function applied to the block diagonal part of \p T (which should be
+  * upper triangular), with the blocking given by \p blockStart and \p clusterSize. The matrix function of
+  * each diagonal block is computed by \p atomic. The off-diagonal parts of \p fT are set to zero.
+  */
+template <typename MatrixType, typename AtomicType, typename VectorType>
+void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
+{ 
+  fT.setZero(T.rows(), T.cols());
+  for (Index i = 0; i < clusterSize.rows(); ++i) {
+    fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
+      = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
+  }
+}
+
+/** \brief Solve a triangular Sylvester equation AX + XB = C 
+  *
+  * \param[in]  A  the matrix A; should be square and upper triangular
+  * \param[in]  B  the matrix B; should be square and upper triangular
+  * \param[in]  C  the matrix C; should have correct size.
+  *
+  * \returns the solution X.
+  *
+  * If A is m-by-m and B is n-by-n, then both C and X are m-by-n.  The (i,j)-th component of the Sylvester
+  * equation is
+  * \f[ 
+  *     \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}. 
+  * \f]
+  * This can be re-arranged to yield:
+  * \f[ 
+  *     X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij}
+  *     - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr).
+  * \f]
+  * It is assumed that A and B are such that the numerator is never zero (otherwise the Sylvester equation
+  * does not have a unique solution). In that case, these equations can be evaluated in the order 
+  * \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
+  */
+template <typename MatrixType>
+MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const MatrixType& B, const MatrixType& C)
+{
+  eigen_assert(A.rows() == A.cols());
+  eigen_assert(A.isUpperTriangular());
+  eigen_assert(B.rows() == B.cols());
+  eigen_assert(B.isUpperTriangular());
+  eigen_assert(C.rows() == A.rows());
+  eigen_assert(C.cols() == B.rows());
+
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index m = A.rows();
+  Index n = B.rows();
+  MatrixType X(m, n);
+
+  for (Index i = m - 1; i >= 0; --i) {
+    for (Index j = 0; j < n; ++j) {
+
+      // Compute AX = \sum_{k=i+1}^m A_{ik} X_{kj}
+      Scalar AX;
+      if (i == m - 1) {
+	AX = 0; 
+      } else {
+	Matrix<Scalar,1,1> AXmatrix = A.row(i).tail(m-1-i) * X.col(j).tail(m-1-i);
+	AX = AXmatrix(0,0);
+      }
+
+      // Compute XB = \sum_{k=1}^{j-1} X_{ik} B_{kj}
+      Scalar XB;
+      if (j == 0) {
+	XB = 0; 
+      } else {
+	Matrix<Scalar,1,1> XBmatrix = X.row(i).head(j) * B.col(j).head(j);
+	XB = XBmatrix(0,0);
+      }
+
+      X(i,j) = (C(i,j) - AX - XB) / (A(i,i) + B(j,j));
+    }
+  }
+  return X;
+}
+
+/** \brief Compute part of matrix function above block diagonal.
+  *
+  * This routine completes the computation of \p fT, denoting a matrix function applied to the triangular
+  * matrix \p T. It assumes that the block diagonal part of \p fT has already been computed. The part below
+  * the diagonal is zero, because \p T is upper triangular.
+  */
+template <typename MatrixType, typename VectorType>
+void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
+{ 
+  typedef internal::traits<MatrixType> Traits;
+  typedef typename MatrixType::Scalar Scalar;
+  static const int Options = MatrixType::Options;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+
+  for (Index k = 1; k < clusterSize.rows(); k++) {
+    for (Index i = 0; i < clusterSize.rows() - k; i++) {
+      // compute (i, i+k) block
+      DynMatrixType A = T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i));
+      DynMatrixType B = -T.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k));
+      DynMatrixType C = fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
+        * T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k));
+      C -= T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k))
+        * fT.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k));
+      for (Index m = i + 1; m < i + k; m++) {
+        C += fT.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m))
+          * T.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k));
+        C -= T.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m))
+          * fT.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k));
+      }
+      fT.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k))
+        = matrix_function_solve_triangular_sylvester(A, B, C);
+    }
+  }
+}
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Class for computing matrix functions.
+  * \tparam  MatrixType  type of the argument of the matrix function,
+  *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  AtomicType  type for computing matrix function of atomic blocks.
+  * \tparam  IsComplex   used internally to select correct specialization.
+  *
+  * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
+  * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
+  * computation of the matrix function on every block corresponding to these clusters to an object of type
+  * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
+  * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
+  *
+  * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
+  */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct matrix_function_compute
+{  
+    /** \brief Compute the matrix function.
+      *
+      * \param[in]  A       argument of matrix function, should be a square matrix.
+      * \param[in]  atomic  class for computing matrix function of atomic blocks.
+      * \param[out] result  the function \p f applied to \p A, as
+      * specified in the constructor.
+      *
+      * See MatrixBase::matrixFunction() for details on how this computation
+      * is implemented.
+      */
+    template <typename AtomicType, typename ResultType> 
+    static void run(const MatrixType& A, AtomicType& atomic, ResultType &result);    
+};
+
+/** \internal \ingroup MatrixFunctions_Module 
+  * \brief Partial specialization of MatrixFunction for real matrices
+  *
+  * This converts the real matrix to a complex matrix, compute the matrix function of that matrix, and then
+  * converts the result back to a real matrix.
+  */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 0>
+{  
+  template <typename MatA, typename AtomicType, typename ResultType>
+  static void run(const MatA& A, AtomicType& atomic, ResultType &result)
+  {
+    typedef internal::traits<MatrixType> Traits;
+    typedef typename Traits::Scalar Scalar;
+    static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime;
+    static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime;
+
+    typedef std::complex<Scalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix;
+
+    ComplexMatrix CA = A.template cast<ComplexScalar>();
+    ComplexMatrix Cresult;
+    matrix_function_compute<ComplexMatrix>::run(CA, atomic, Cresult);
+    result = Cresult.real();
+  }
+};
+
+/** \internal \ingroup MatrixFunctions_Module 
+  * \brief Partial specialization of MatrixFunction for complex matrices
+  */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 1>
+{
+  template <typename MatA, typename AtomicType, typename ResultType>
+  static void run(const MatA& A, AtomicType& atomic, ResultType &result)
+  {
+    typedef internal::traits<MatrixType> Traits;
+    
+    // compute Schur decomposition of A
+    const ComplexSchur<MatrixType> schurOfA(A);
+    eigen_assert(schurOfA.info()==Success);
+    MatrixType T = schurOfA.matrixT();
+    MatrixType U = schurOfA.matrixU();
+
+    // partition eigenvalues into clusters of ei'vals "close" to each other
+    std::list<std::list<Index> > clusters; 
+    matrix_function_partition_eigenvalues(T.diagonal(), clusters);
+
+    // compute size of each cluster
+    Matrix<Index, Dynamic, 1> clusterSize;
+    matrix_function_compute_cluster_size(clusters, clusterSize);
+
+    // blockStart[i] is row index at which block corresponding to i-th cluster starts 
+    Matrix<Index, Dynamic, 1> blockStart; 
+    matrix_function_compute_block_start(clusterSize, blockStart);
+
+    // compute map so that eivalToCluster[i] = j means that i-th ei'val is in j-th cluster 
+    Matrix<Index, Dynamic, 1> eivalToCluster;
+    matrix_function_compute_map(T.diagonal(), clusters, eivalToCluster);
+
+    // compute permutation which groups ei'vals in same cluster together 
+    Matrix<Index, Traits::RowsAtCompileTime, 1> permutation;
+    matrix_function_compute_permutation(blockStart, eivalToCluster, permutation);
+
+    // permute Schur decomposition
+    matrix_function_permute_schur(permutation, U, T);
+
+    // compute result
+    MatrixType fT; // matrix function applied to T
+    matrix_function_compute_block_atomic(T, atomic, blockStart, clusterSize, fT);
+    matrix_function_compute_above_diagonal(T, blockStart, clusterSize, fT);
+    result = U * (fT.template triangularView<Upper>() * U.adjoint());
+  }
+};
+
+} // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix function of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix function.
+  *
+  * This class holds the argument to the matrix function until it is assigned or evaluated for some other
+  * reason (so the argument should not be changed in the meantime). It is the return type of
+  * matrixBase::matrixFunction() and related functions and most of the time this is the only way it is used.
+  */
+template<typename Derived> class MatrixFunctionReturnValue
+: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
+{
+  public:
+    typedef typename Derived::Scalar Scalar;
+    typedef typename internal::stem_function<Scalar>::type StemFunction;
+
+  protected:
+    typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+  public:
+
+    /** \brief Constructor.
+      *
+      * \param[in] A  %Matrix (expression) forming the argument of the matrix function.
+      * \param[in] f  Stem function for matrix function under consideration.
+      */
+    MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) { }
+
+    /** \brief Compute the matrix function.
+      *
+      * \param[out] result \p f applied to \p A, where \p f and \p A are as in the constructor.
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
+      typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
+      typedef internal::traits<NestedEvalTypeClean> Traits;
+      typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+
+      typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
+      AtomicType atomic(m_f);
+
+      internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
+    }
+
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    const DerivedNested m_A;
+    StemFunction *m_f;
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixFunctionReturnValue<Derived> >
+{
+  typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+
+/********** MatrixBase methods **********/
+
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
+{
+  eigen_assert(rows() == cols());
+  return MatrixFunctionReturnValue<Derived>(derived(), f);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sin<ComplexScalar>);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cos<ComplexScalar>);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sinh<ComplexScalar>);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
+{
+  eigen_assert(rows() == cols());
+  typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cosh<ComplexScalar>);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION_H

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
new file mode 100644
index 0000000..e917013
--- /dev/null
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h

@@ -0,0 +1,373 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_LOGARITHM
+#define EIGEN_MATRIX_LOGARITHM
+
+namespace Eigen { 
+
+namespace internal { 
+
+template <typename Scalar>
+struct matrix_log_min_pade_degree 
+{
+  static const int value = 3;
+};
+
+template <typename Scalar>
+struct matrix_log_max_pade_degree 
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static const int value = std::numeric_limits<RealScalar>::digits<= 24?  5:  // single precision
+                           std::numeric_limits<RealScalar>::digits<= 53?  7:  // double precision
+                           std::numeric_limits<RealScalar>::digits<= 64?  8:  // extended precision
+                           std::numeric_limits<RealScalar>::digits<=106? 10:  // double-double
+                                                                         11;  // quadruple precision
+};
+
+/** \brief Compute logarithm of 2x2 triangular matrix. */
+template <typename MatrixType>
+void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  using std::abs;
+  using std::ceil;
+  using std::imag;
+  using std::log;
+
+  Scalar logA00 = log(A(0,0));
+  Scalar logA11 = log(A(1,1));
+
+  result(0,0) = logA00;
+  result(1,0) = Scalar(0);
+  result(1,1) = logA11;
+
+  Scalar y = A(1,1) - A(0,0);
+  if (y==Scalar(0))
+  {
+    result(0,1) = A(0,1) / A(0,0);
+  }
+  else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+  {
+    result(0,1) = A(0,1) * (logA11 - logA00) / y;
+  }
+  else
+  {
+    // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
+    RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y;
+  }
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */
+inline int matrix_log_get_pade_degree(float normTminusI)
+{
+  const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1,
+            5.3149729967117310e-1 };
+  const int minPadeDegree = matrix_log_min_pade_degree<float>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<float>::value;
+  int degree = minPadeDegree;
+  for (; degree <= maxPadeDegree; ++degree) 
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+      break;
+  return degree;
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = double) */
+inline int matrix_log_get_pade_degree(double normTminusI)
+{
+  const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2,
+            1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 };
+  const int minPadeDegree = matrix_log_min_pade_degree<double>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<double>::value;
+  int degree = minPadeDegree;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+      break;
+  return degree;
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */
+inline int matrix_log_get_pade_degree(long double normTminusI)
+{
+#if   LDBL_MANT_DIG == 53         // double precision
+  const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L,
+            1.1352802267628681e-1L, 1.8662860613541288e-1L, 2.642960831111435e-1L };
+#elif LDBL_MANT_DIG <= 64         // extended precision
+  const long double maxNormForPade[] = { 5.48256690357782863103e-3L /* degree = 3 */, 2.34559162387971167321e-2L,
+            5.84603923897347449857e-2L, 1.08486423756725170223e-1L, 1.68385767881294446649e-1L,
+            2.32777776523703892094e-1L };
+#elif LDBL_MANT_DIG <= 106        // double-double
+  const long double maxNormForPade[] = { 8.58970550342939562202529664318890e-5L /* degree = 3 */,
+            9.34074328446359654039446552677759e-4L, 4.26117194647672175773064114582860e-3L,
+            1.21546224740281848743149666560464e-2L, 2.61100544998339436713088248557444e-2L,
+            4.66170074627052749243018566390567e-2L, 7.32585144444135027565872014932387e-2L,
+            1.05026503471351080481093652651105e-1L };
+#else                             // quadruple precision
+  const long double maxNormForPade[] = { 4.7419931187193005048501568167858103e-5L /* degree = 3 */,
+            5.8853168473544560470387769480192666e-4L, 2.9216120366601315391789493628113520e-3L,
+            8.8415758124319434347116734705174308e-3L, 1.9850836029449446668518049562565291e-2L,
+            3.6688019729653446926585242192447447e-2L, 5.9290962294020186998954055264528393e-2L,
+            8.6998436081634343903250580992127677e-2L, 1.1880960220216759245467951592883642e-1L };
+#endif
+  const int minPadeDegree = matrix_log_min_pade_degree<long double>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<long double>::value;
+  int degree = minPadeDegree;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+      break;
+  return degree;
+}
+
+/* \brief Compute Pade approximation to matrix logarithm */
+template <typename MatrixType>
+void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree)
+{
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  const int minPadeDegree = 3;
+  const int maxPadeDegree = 11;
+  assert(degree >= minPadeDegree && degree <= maxPadeDegree);
+  // FIXME this creates float-conversion-warnings if these are enabled.
+  // Either manually convert each value, or disable the warning locally
+  const RealScalar nodes[][maxPadeDegree] = { 
+    { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,  // degree 3
+      0.8872983346207416885179265399782400L }, 
+    { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L,  // degree 4
+      0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L },
+    { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L,  // degree 5
+      0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
+      0.9530899229693319963988134391496965L },
+    { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L,  // degree 6
+      0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
+      0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L },
+    { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L,  // degree 7
+      0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
+      0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
+      0.9745539561713792622630948420239256L },
+    { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L,  // degree 8
+      0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
+      0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
+      0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L },
+    { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L,  // degree 9
+      0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
+      0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
+      0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
+      0.9840801197538130449177881014518364L },
+    { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L,  // degree 10
+      0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
+      0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
+      0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
+      0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L },
+    { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L,  // degree 11
+      0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
+      0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
+      0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
+      0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
+      0.9891143290730284964019690005614287L } };
+
+  const RealScalar weights[][maxPadeDegree] = { 
+    { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L,  // degree 3
+      0.2777777777777777777777777777777778L },
+    { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L,  // degree 4
+      0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L },
+    { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L,  // degree 5
+      0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
+      0.1184634425280945437571320203599587L },
+    { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L,  // degree 6
+      0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
+      0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L },
+    { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L,  // degree 7
+      0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
+      0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
+      0.0647424830844348466353057163395410L },
+    { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L,  // degree 8
+      0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
+      0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
+      0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L },
+    { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L,  // degree 9
+      0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
+      0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
+      0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
+      0.0406371941807872059859460790552618L },
+    { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L,  // degree 10
+      0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
+      0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
+      0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
+      0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L },
+    { 0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L,  // degree 11
+      0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
+      0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
+      0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
+      0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
+      0.0278342835580868332413768602212743L } };
+
+  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+  result.setZero(T.rows(), T.rows());
+  for (int k = 0; k < degree; ++k) {
+    RealScalar weight = weights[degree-minPadeDegree][k];
+    RealScalar node = nodes[degree-minPadeDegree][k];
+    result += weight * (MatrixType::Identity(T.rows(), T.rows()) + node * TminusI)
+                       .template triangularView<Upper>().solve(TminusI);
+  }
+} 
+
+/** \brief Compute logarithm of triangular matrices with size > 2. 
+  * \details This uses a inverse scale-and-square algorithm. */
+template <typename MatrixType>
+void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  using std::pow;
+
+  int numberOfSquareRoots = 0;
+  int numberOfExtraSquareRoots = 0;
+  int degree;
+  MatrixType T = A, sqrtT;
+
+  const int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+  const RealScalar maxNormForPade = RealScalar(
+                                    maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
+                                    maxPadeDegree<= 7? 2.6429608311114350e-1L:                    // double precision
+                                    maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
+                                    maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
+                                                       1.1880960220216759245467951592883642e-1L); // quadruple precision
+
+  while (true) {
+    RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
+    if (normTminusI < maxNormForPade) {
+      degree = matrix_log_get_pade_degree(normTminusI);
+      int degree2 = matrix_log_get_pade_degree(normTminusI / RealScalar(2));
+      if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) 
+        break;
+      ++numberOfExtraSquareRoots;
+    }
+    matrix_sqrt_triangular(T, sqrtT);
+    T = sqrtT.template triangularView<Upper>();
+    ++numberOfSquareRoots;
+  }
+
+  matrix_log_compute_pade(result, T, degree);
+  result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible
+}
+
+/** \ingroup MatrixFunctions_Module
+  * \class MatrixLogarithmAtomic
+  * \brief Helper class for computing matrix logarithm of atomic matrices.
+  *
+  * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
+  *
+  * \sa class MatrixFunctionAtomic, MatrixBase::log()
+  */
+template <typename MatrixType>
+class MatrixLogarithmAtomic
+{
+public:
+  /** \brief Compute matrix logarithm of atomic matrix
+    * \param[in]  A  argument of matrix logarithm, should be upper triangular and atomic
+    * \returns  The logarithm of \p A.
+    */
+  MatrixType compute(const MatrixType& A);
+};
+
+template <typename MatrixType>
+MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A)
+{
+  using std::log;
+  MatrixType result(A.rows(), A.rows());
+  if (A.rows() == 1)
+    result(0,0) = log(A(0,0));
+  else if (A.rows() == 2)
+    matrix_log_compute_2x2(A, result);
+  else
+    matrix_log_compute_big(A, result);
+  return result;
+}
+
+} // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix logarithm of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix function.
+  *
+  * This class holds the argument to the matrix function until it is
+  * assigned or evaluated for some other reason (so the argument
+  * should not be changed in the meantime). It is the return type of
+  * MatrixBase::log() and most of the time this is the only way it
+  * is used.
+  */
+template<typename Derived> class MatrixLogarithmReturnValue
+: public ReturnByValue<MatrixLogarithmReturnValue<Derived> >
+{
+public:
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Index Index;
+
+protected:
+  typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+public:
+
+  /** \brief Constructor.
+    *
+    * \param[in]  A  %Matrix (expression) forming the argument of the matrix logarithm.
+    */
+  explicit MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { }
+  
+  /** \brief Compute the matrix logarithm.
+    *
+    * \param[out]  result  Logarithm of \c A, where \c A is as specified in the constructor.
+    */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const
+  {
+    typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+    typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
+    typedef internal::traits<DerivedEvalTypeClean> Traits;
+    typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+    typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
+    AtomicType atomic;
+    
+    internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
+  }
+
+  Index rows() const { return m_A.rows(); }
+  Index cols() const { return m_A.cols(); }
+  
+private:
+  const DerivedNested m_A;
+};
+
+namespace internal {
+  template<typename Derived>
+  struct traits<MatrixLogarithmReturnValue<Derived> >
+  {
+    typedef typename Derived::PlainObject ReturnType;
+  };
+}
+
+
+/********** MatrixBase method **********/
+
+
+template <typename Derived>
+const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
+{
+  eigen_assert(rows() == cols());
+  return MatrixLogarithmReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_LOGARITHM

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
new file mode 100644
index 0000000..d7672d7
--- /dev/null
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h

@@ -0,0 +1,705 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_POWER
+#define EIGEN_MATRIX_POWER
+
+namespace Eigen {
+
+template<typename MatrixType> class MatrixPower;
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix.
+ *
+ * \tparam MatrixType  type of the base, a matrix.
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixPower::operator() and related functions and most of the
+ * time this is the only way it is used.
+ */
+/* TODO This class is only used by MatrixPower, so it should be nested
+ * into MatrixPower, like MatrixPower::ReturnValue. However, my
+ * compiler complained about unused template parameter in the
+ * following declaration in namespace internal.
+ *
+ * template<typename MatrixType>
+ * struct traits<MatrixPower<MatrixType>::ReturnValue>;
+ */
+template<typename MatrixType>
+class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParenthesesReturnValue<MatrixType> >
+{
+  public:
+    typedef typename MatrixType::RealScalar RealScalar;
+
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] pow  %MatrixPower storing the base.
+     * \param[in] p    scalar, the exponent of the matrix power.
+     */
+    MatrixPowerParenthesesReturnValue(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p)
+    { }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[out] result
+     */
+    template<typename ResultType>
+    inline void evalTo(ResultType& result) const
+    { m_pow.compute(result, m_p); }
+
+    Index rows() const { return m_pow.rows(); }
+    Index cols() const { return m_pow.cols(); }
+
+  private:
+    MatrixPower<MatrixType>& m_pow;
+    const RealScalar m_p;
+};
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType  type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing triangular real/complex matrices
+ * raised to a power in the interval \f$ (-1, 1) \f$.
+ *
+ * \note Currently this class is only used by MatrixPower. One may
+ * insist that this be nested into MatrixPower. This class is here to
+ * facilitate future development of triangular matrix functions.
+ */
+template<typename MatrixType>
+class MatrixPowerAtomic : internal::noncopyable
+{
+  private:
+    enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
+    };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef std::complex<RealScalar> ComplexScalar;
+    typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
+
+    const MatrixType& m_A;
+    RealScalar m_p;
+
+    void computePade(int degree, const MatrixType& IminusT, ResultType& res) const;
+    void compute2x2(ResultType& res, RealScalar p) const;
+    void computeBig(ResultType& res) const;
+    static int getPadeDegree(float normIminusT);
+    static int getPadeDegree(double normIminusT);
+    static int getPadeDegree(long double normIminusT);
+    static ComplexScalar computeSuperDiag(const ComplexScalar&, const ComplexScalar&, RealScalar p);
+    static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p);
+
+  public:
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] T  the base of the matrix power.
+     * \param[in] p  the exponent of the matrix power, should be in
+     * \f$ (-1, 1) \f$.
+     *
+     * The class stores a reference to T, so it should not be changed
+     * (or destroyed) before evaluation. Only the upper triangular
+     * part of T is read.
+     */
+    MatrixPowerAtomic(const MatrixType& T, RealScalar p);
+    
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[out] res  \f$ A^p \f$ where A and p are specified in the
+     * constructor.
+     */
+    void compute(ResultType& res) const;
+};
+
+template<typename MatrixType>
+MatrixPowerAtomic<MatrixType>::MatrixPowerAtomic(const MatrixType& T, RealScalar p) :
+  m_A(T), m_p(p)
+{
+  eigen_assert(T.rows() == T.cols());
+  eigen_assert(p > -1 && p < 1);
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute(ResultType& res) const
+{
+  using std::pow;
+  switch (m_A.rows()) {
+    case 0:
+      break;
+    case 1:
+      res(0,0) = pow(m_A(0,0), m_p);
+      break;
+    case 2:
+      compute2x2(res, m_p);
+      break;
+    default:
+      computeBig(res);
+  }
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const
+{
+  int i = 2*degree;
+  res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT;
+
+  for (--i; i; --i) {
+    res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
+	.solve((i==1 ? -m_p : i&1 ? (-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval();
+  }
+  res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
+}
+
+// This function assumes that res has the correct size (see bug 614)
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute2x2(ResultType& res, RealScalar p) const
+{
+  using std::abs;
+  using std::pow;
+  res.coeffRef(0,0) = pow(m_A.coeff(0,0), p);
+
+  for (Index i=1; i < m_A.cols(); ++i) {
+    res.coeffRef(i,i) = pow(m_A.coeff(i,i), p);
+    if (m_A.coeff(i-1,i-1) == m_A.coeff(i,i))
+      res.coeffRef(i-1,i) = p * pow(m_A.coeff(i,i), p-1);
+    else if (2*abs(m_A.coeff(i-1,i-1)) < abs(m_A.coeff(i,i)) || 2*abs(m_A.coeff(i,i)) < abs(m_A.coeff(i-1,i-1)))
+      res.coeffRef(i-1,i) = (res.coeff(i,i)-res.coeff(i-1,i-1)) / (m_A.coeff(i,i)-m_A.coeff(i-1,i-1));
+    else
+      res.coeffRef(i-1,i) = computeSuperDiag(m_A.coeff(i,i), m_A.coeff(i-1,i-1), p);
+    res.coeffRef(i-1,i) *= m_A.coeff(i-1,i);
+  }
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
+{
+  using std::ldexp;
+  const int digits = std::numeric_limits<RealScalar>::digits;
+  const RealScalar maxNormForPade = RealScalar(
+                                    digits <=  24? 4.3386528e-1L                            // single precision
+                                  : digits <=  53? 2.789358995219730e-1L                    // double precision
+                                  : digits <=  64? 2.4471944416607995472e-1L                // extended precision
+                                  : digits <= 106? 1.1016843812851143391275867258512e-1L    // double-double
+                                  :                9.134603732914548552537150753385375e-2L); // quadruple precision
+  MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
+  RealScalar normIminusT;
+  int degree, degree2, numberOfSquareRoots = 0;
+  bool hasExtraSquareRoot = false;
+
+  for (Index i=0; i < m_A.cols(); ++i)
+    eigen_assert(m_A(i,i) != RealScalar(0));
+
+  while (true) {
+    IminusT = MatrixType::Identity(m_A.rows(), m_A.cols()) - T;
+    normIminusT = IminusT.cwiseAbs().colwise().sum().maxCoeff();
+    if (normIminusT < maxNormForPade) {
+      degree = getPadeDegree(normIminusT);
+      degree2 = getPadeDegree(normIminusT/2);
+      if (degree - degree2 <= 1 || hasExtraSquareRoot)
+	break;
+      hasExtraSquareRoot = true;
+    }
+    matrix_sqrt_triangular(T, sqrtT);
+    T = sqrtT.template triangularView<Upper>();
+    ++numberOfSquareRoots;
+  }
+  computePade(degree, IminusT, res);
+
+  for (; numberOfSquareRoots; --numberOfSquareRoots) {
+    compute2x2(res, ldexp(m_p, -numberOfSquareRoots));
+    res = res.template triangularView<Upper>() * res;
+  }
+  compute2x2(res, m_p);
+}
+  
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(float normIminusT)
+{
+  const float maxNormForPade[] = { 2.8064004e-1f /* degree = 3 */ , 4.3386528e-1f };
+  int degree = 3;
+  for (; degree <= 4; ++degree)
+    if (normIminusT <= maxNormForPade[degree - 3])
+      break;
+  return degree;
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(double normIminusT)
+{
+  const double maxNormForPade[] = { 1.884160592658218e-2 /* degree = 3 */ , 6.038881904059573e-2, 1.239917516308172e-1,
+      1.999045567181744e-1, 2.789358995219730e-1 };
+  int degree = 3;
+  for (; degree <= 7; ++degree)
+    if (normIminusT <= maxNormForPade[degree - 3])
+      break;
+  return degree;
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
+{
+#if   LDBL_MANT_DIG == 53
+  const int maxPadeDegree = 7;
+  const double maxNormForPade[] = { 1.884160592658218e-2L /* degree = 3 */ , 6.038881904059573e-2L, 1.239917516308172e-1L,
+      1.999045567181744e-1L, 2.789358995219730e-1L };
+#elif LDBL_MANT_DIG <= 64
+  const int maxPadeDegree = 8;
+  const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
+      6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
+#elif LDBL_MANT_DIG <= 106
+  const int maxPadeDegree = 10;
+  const double maxNormForPade[] = { 1.0007161601787493236741409687186e-4L /* degree = 3 */ ,
+      1.0007161601787493236741409687186e-3L, 4.7069769360887572939882574746264e-3L, 1.3220386624169159689406653101695e-2L,
+      2.8063482381631737920612944054906e-2L, 4.9625993951953473052385361085058e-2L, 7.7367040706027886224557538328171e-2L,
+      1.1016843812851143391275867258512e-1L };
+#else
+  const int maxPadeDegree = 10;
+  const double maxNormForPade[] = { 5.524506147036624377378713555116378e-5L /* degree = 3 */ ,
+      6.640600568157479679823602193345995e-4L, 3.227716520106894279249709728084626e-3L,
+      9.619593944683432960546978734646284e-3L, 2.134595382433742403911124458161147e-2L,
+      3.908166513900489428442993794761185e-2L, 6.266780814639442865832535460550138e-2L,
+      9.134603732914548552537150753385375e-2L };
+#endif
+  int degree = 3;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normIminusT <= maxNormForPade[degree - 3])
+      break;
+  return degree;
+}
+
+template<typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::ComplexScalar
+MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p)
+{
+  using std::ceil;
+  using std::exp;
+  using std::log;
+  using std::sinh;
+
+  ComplexScalar logCurr = log(curr);
+  ComplexScalar logPrev = log(prev);
+  RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber);
+  return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
+}
+
+template<typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::RealScalar
+MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev, RealScalar p)
+{
+  using std::exp;
+  using std::log;
+  using std::sinh;
+
+  RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2);
+  return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev);
+}
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType  type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing real/complex matrices raised to
+ * an arbitrary real power. Meanwhile, it saves the result of Schur
+ * decomposition if an non-integral power has even been calculated.
+ * Therefore, if you want to compute multiple (>= 2) matrix powers
+ * for the same matrix, using the class directly is more efficient than
+ * calling MatrixBase::pow().
+ *
+ * Example:
+ * \include MatrixPower_optimal.cpp
+ * Output: \verbinclude MatrixPower_optimal.out
+ */
+template<typename MatrixType>
+class MatrixPower : internal::noncopyable
+{
+  private:
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+
+  public:
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] A  the base of the matrix power.
+     *
+     * The class stores a reference to A, so it should not be changed
+     * (or destroyed) before evaluation.
+     */
+    explicit MatrixPower(const MatrixType& A) :
+      m_A(A),
+      m_conditionNumber(0),
+      m_rank(A.cols()),
+      m_nulls(0)
+    { eigen_assert(A.rows() == A.cols()); }
+
+    /**
+     * \brief Returns the matrix power.
+     *
+     * \param[in] p  exponent, a real scalar.
+     * \return The expression \f$ A^p \f$, where A is specified in the
+     * constructor.
+     */
+    const MatrixPowerParenthesesReturnValue<MatrixType> operator()(RealScalar p)
+    { return MatrixPowerParenthesesReturnValue<MatrixType>(*this, p); }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[in]  p    exponent, a real scalar.
+     * \param[out] res  \f$ A^p \f$ where A is specified in the
+     * constructor.
+     */
+    template<typename ResultType>
+    void compute(ResultType& res, RealScalar p);
+    
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    typedef std::complex<RealScalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0,
+              MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> ComplexMatrix;
+
+    /** \brief Reference to the base of matrix power. */
+    typename MatrixType::Nested m_A;
+
+    /** \brief Temporary storage. */
+    MatrixType m_tmp;
+
+    /** \brief Store the result of Schur decomposition. */
+    ComplexMatrix m_T, m_U;
+    
+    /** \brief Store fractional power of m_T. */
+    ComplexMatrix m_fT;
+
+    /**
+     * \brief Condition number of m_A.
+     *
+     * It is initialized as 0 to avoid performing unnecessary Schur
+     * decomposition, which is the bottleneck.
+     */
+    RealScalar m_conditionNumber;
+
+    /** \brief Rank of m_A. */
+    Index m_rank;
+    
+    /** \brief Rank deficiency of m_A. */
+    Index m_nulls;
+
+    /**
+     * \brief Split p into integral part and fractional part.
+     *
+     * \param[in]  p        The exponent.
+     * \param[out] p        The fractional part ranging in \f$ (-1, 1) \f$.
+     * \param[out] intpart  The integral part.
+     *
+     * Only if the fractional part is nonzero, it calls initialize().
+     */
+    void split(RealScalar& p, RealScalar& intpart);
+
+    /** \brief Perform Schur decomposition for fractional power. */
+    void initialize();
+
+    template<typename ResultType>
+    void computeIntPower(ResultType& res, RealScalar p);
+
+    template<typename ResultType>
+    void computeFracPower(ResultType& res, RealScalar p);
+
+    template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+    static void revertSchur(
+        Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+        const ComplexMatrix& T,
+        const ComplexMatrix& U);
+
+    template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+    static void revertSchur(
+        Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+        const ComplexMatrix& T,
+        const ComplexMatrix& U);
+};
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::compute(ResultType& res, RealScalar p)
+{
+  using std::pow;
+  switch (cols()) {
+    case 0:
+      break;
+    case 1:
+      res(0,0) = pow(m_A.coeff(0,0), p);
+      break;
+    default:
+      RealScalar intpart;
+      split(p, intpart);
+
+      res = MatrixType::Identity(rows(), cols());
+      computeIntPower(res, intpart);
+      if (p) computeFracPower(res, p);
+  }
+}
+
+template<typename MatrixType>
+void MatrixPower<MatrixType>::split(RealScalar& p, RealScalar& intpart)
+{
+  using std::floor;
+  using std::pow;
+
+  intpart = floor(p);
+  p -= intpart;
+
+  // Perform Schur decomposition if it is not yet performed and the power is
+  // not an integer.
+  if (!m_conditionNumber && p)
+    initialize();
+
+  // Choose the more stable of intpart = floor(p) and intpart = ceil(p).
+  if (p > RealScalar(0.5) && p > (1-p) * pow(m_conditionNumber, p)) {
+    --p;
+    ++intpart;
+  }
+}
+
+template<typename MatrixType>
+void MatrixPower<MatrixType>::initialize()
+{
+  const ComplexSchur<MatrixType> schurOfA(m_A);
+  JacobiRotation<ComplexScalar> rot;
+  ComplexScalar eigenvalue;
+
+  m_fT.resizeLike(m_A);
+  m_T = schurOfA.matrixT();
+  m_U = schurOfA.matrixU();
+  m_conditionNumber = m_T.diagonal().array().abs().maxCoeff() / m_T.diagonal().array().abs().minCoeff();
+
+  // Move zero eigenvalues to the bottom right corner.
+  for (Index i = cols()-1; i>=0; --i) {
+    if (m_rank <= 2)
+      return;
+    if (m_T.coeff(i,i) == RealScalar(0)) {
+      for (Index j=i+1; j < m_rank; ++j) {
+        eigenvalue = m_T.coeff(j,j);
+        rot.makeGivens(m_T.coeff(j-1,j), eigenvalue);
+        m_T.applyOnTheRight(j-1, j, rot);
+        m_T.applyOnTheLeft(j-1, j, rot.adjoint());
+        m_T.coeffRef(j-1,j-1) = eigenvalue;
+        m_T.coeffRef(j,j) = RealScalar(0);
+        m_U.applyOnTheRight(j-1, j, rot);
+      }
+      --m_rank;
+    }
+  }
+
+  m_nulls = rows() - m_rank;
+  if (m_nulls) {
+    eigen_assert(m_T.bottomRightCorner(m_nulls, m_nulls).isZero()
+        && "Base of matrix power should be invertible or with a semisimple zero eigenvalue.");
+    m_fT.bottomRows(m_nulls).fill(RealScalar(0));
+  }
+}
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::computeIntPower(ResultType& res, RealScalar p)
+{
+  using std::abs;
+  using std::fmod;
+  RealScalar pp = abs(p);
+
+  if (p<0) 
+    m_tmp = m_A.inverse();
+  else     
+    m_tmp = m_A;
+
+  while (true) {
+    if (fmod(pp, 2) >= 1)
+      res = m_tmp * res;
+    pp /= 2;
+    if (pp < 1)
+      break;
+    m_tmp *= m_tmp;
+  }
+}
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::computeFracPower(ResultType& res, RealScalar p)
+{
+  Block<ComplexMatrix,Dynamic,Dynamic> blockTp(m_fT, 0, 0, m_rank, m_rank);
+  eigen_assert(m_conditionNumber);
+  eigen_assert(m_rank + m_nulls == rows());
+
+  MatrixPowerAtomic<ComplexMatrix>(m_T.topLeftCorner(m_rank, m_rank), p).compute(blockTp);
+  if (m_nulls) {
+    m_fT.topRightCorner(m_rank, m_nulls) = m_T.topLeftCorner(m_rank, m_rank).template triangularView<Upper>()
+        .solve(blockTp * m_T.topRightCorner(m_rank, m_nulls));
+  }
+  revertSchur(m_tmp, m_fT, m_U);
+  res = m_tmp * res;
+}
+
+template<typename MatrixType>
+template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(
+    Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+    const ComplexMatrix& T,
+    const ComplexMatrix& U)
+{ res.noalias() = U * (T.template triangularView<Upper>() * U.adjoint()); }
+
+template<typename MatrixType>
+template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(
+    Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+    const ComplexMatrix& T,
+    const ComplexMatrix& U)
+{ res.noalias() = (U * (T.template triangularView<Upper>() * U.adjoint())).real(); }
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived  type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template<typename Derived>
+class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Derived> >
+{
+  public:
+    typedef typename Derived::PlainObject PlainObject;
+    typedef typename Derived::RealScalar RealScalar;
+
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] A  %Matrix (expression), the base of the matrix power.
+     * \param[in] p  real scalar, the exponent of the matrix power.
+     */
+    MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p)
+    { }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
+     * constructor.
+     */
+    template<typename ResultType>
+    inline void evalTo(ResultType& result) const
+    { MatrixPower<PlainObject>(m_A.eval()).compute(result, m_p); }
+
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    const Derived& m_A;
+    const RealScalar m_p;
+};
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived  type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template<typename Derived>
+class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerReturnValue<Derived> >
+{
+  public:
+    typedef typename Derived::PlainObject PlainObject;
+    typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
+
+    /**
+     * \brief Constructor.
+     *
+     * \param[in] A  %Matrix (expression), the base of the matrix power.
+     * \param[in] p  complex scalar, the exponent of the matrix power.
+     */
+    MatrixComplexPowerReturnValue(const Derived& A, const ComplexScalar& p) : m_A(A), m_p(p)
+    { }
+
+    /**
+     * \brief Compute the matrix power.
+     *
+     * Because \p p is complex, \f$ A^p \f$ is simply evaluated as \f$
+     * \exp(p \log(A)) \f$.
+     *
+     * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
+     * constructor.
+     */
+    template<typename ResultType>
+    inline void evalTo(ResultType& result) const
+    { result = (m_p * m_A.log()).exp(); }
+
+    Index rows() const { return m_A.rows(); }
+    Index cols() const { return m_A.cols(); }
+
+  private:
+    const Derived& m_A;
+    const ComplexScalar m_p;
+};
+
+namespace internal {
+
+template<typename MatrixPowerType>
+struct traits< MatrixPowerParenthesesReturnValue<MatrixPowerType> >
+{ typedef typename MatrixPowerType::PlainObject ReturnType; };
+
+template<typename Derived>
+struct traits< MatrixPowerReturnValue<Derived> >
+{ typedef typename Derived::PlainObject ReturnType; };
+
+template<typename Derived>
+struct traits< MatrixComplexPowerReturnValue<Derived> >
+{ typedef typename Derived::PlainObject ReturnType; };
+
+}
+
+template<typename Derived>
+const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(const RealScalar& p) const
+{ return MatrixPowerReturnValue<Derived>(derived(), p); }
+
+template<typename Derived>
+const MatrixComplexPowerReturnValue<Derived> MatrixBase<Derived>::pow(const std::complex<RealScalar>& p) const
+{ return MatrixComplexPowerReturnValue<Derived>(derived(), p); }
+
+} // namespace Eigen
+
+#endif // EIGEN_MATRIX_POWER

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
new file mode 100644
index 0000000..e363e77
--- /dev/null
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h

@@ -0,0 +1,368 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_SQUARE_ROOT
+#define EIGEN_MATRIX_SQUARE_ROOT
+
+namespace Eigen { 
+
+namespace internal {
+
+// pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
+// post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
+{
+  // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
+  //       in EigenSolver. If we expose it, we could call it directly from here.
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,2,2> block = T.template block<2,2>(i,i);
+  EigenSolver<Matrix<Scalar,2,2> > es(block);
+  sqrtT.template block<2,2>(i,i)
+    = (es.eigenvectors() * es.eigenvalues().cwiseSqrt().asDiagonal() * es.eigenvectors().inverse()).real();
+}
+
+// pre:  block structure of T is such that (i,j) is a 1x1 block,
+//       all blocks of sqrtT to left of and below (i,j) are correct
+// post: sqrtT(i,j) has the correct value
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
+  sqrtT.coeffRef(i,j) = (T.coeff(i,j) - tmp) / (sqrtT.coeff(i,i) + sqrtT.coeff(j,j));
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
+  if (j-i > 1)
+    rhs -= sqrtT.block(i, i+1, 1, j-i-1) * sqrtT.block(i+1, j, j-i-1, 2);
+  Matrix<Scalar,2,2> A = sqrtT.coeff(i,i) * Matrix<Scalar,2,2>::Identity();
+  A += sqrtT.template block<2,2>(j,j).transpose();
+  sqrtT.template block<1,2>(i,j).transpose() = A.fullPivLu().solve(rhs.transpose());
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
+  if (j-i > 2)
+    rhs -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 1);
+  Matrix<Scalar,2,2> A = sqrtT.coeff(j,j) * Matrix<Scalar,2,2>::Identity();
+  A += sqrtT.template block<2,2>(i,i);
+  sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
+}
+
+// solves the equation A X + X B = C where all matrices are 2-by-2
+template <typename MatrixType>
+void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,4,4> coeffMatrix = Matrix<Scalar,4,4>::Zero();
+  coeffMatrix.coeffRef(0,0) = A.coeff(0,0) + B.coeff(0,0);
+  coeffMatrix.coeffRef(1,1) = A.coeff(0,0) + B.coeff(1,1);
+  coeffMatrix.coeffRef(2,2) = A.coeff(1,1) + B.coeff(0,0);
+  coeffMatrix.coeffRef(3,3) = A.coeff(1,1) + B.coeff(1,1);
+  coeffMatrix.coeffRef(0,1) = B.coeff(1,0);
+  coeffMatrix.coeffRef(0,2) = A.coeff(0,1);
+  coeffMatrix.coeffRef(1,0) = B.coeff(0,1);
+  coeffMatrix.coeffRef(1,3) = A.coeff(0,1);
+  coeffMatrix.coeffRef(2,0) = A.coeff(1,0);
+  coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
+  coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
+  coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
+
+  Matrix<Scalar,4,1> rhs;
+  rhs.coeffRef(0) = C.coeff(0,0);
+  rhs.coeffRef(1) = C.coeff(0,1);
+  rhs.coeffRef(2) = C.coeff(1,0);
+  rhs.coeffRef(3) = C.coeff(1,1);
+
+  Matrix<Scalar,4,1> result;
+  result = coeffMatrix.fullPivLu().solve(rhs);
+
+  X.coeffRef(0,0) = result.coeff(0);
+  X.coeffRef(0,1) = result.coeff(1);
+  X.coeffRef(1,0) = result.coeff(2);
+  X.coeffRef(1,1) = result.coeff(3);
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+  if (j-i > 2)
+    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+  Matrix<Scalar,2,2> X;
+  matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+  sqrtT.template block<2,2>(i,j) = X;
+}
+
+// pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
+// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT)
+{
+  using std::sqrt;
+  const Index size = T.rows();
+  for (Index i = 0; i < size; i++) {
+    if (i == size - 1 || T.coeff(i+1, i) == 0) {
+      eigen_assert(T(i,i) >= 0);
+      sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i));
+    }
+    else {
+      matrix_sqrt_quasi_triangular_2x2_diagonal_block(T, i, sqrtT);
+      ++i;
+    }
+  }
+}
+
+// pre:  T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
+// post: sqrtT is the square root of T.
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT)
+{
+  const Index size = T.rows();
+  for (Index j = 1; j < size; j++) {
+      if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
+	continue;
+    for (Index i = j-1; i >= 0; i--) {
+      if (i > 0 && T.coeff(i, i-1) != 0)  // if T(i-1:i, i-1:i) is a 2-by-2 block
+	continue;
+      bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0);
+      bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0);
+      if (iBlockIs2x2 && jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(T, i, j, sqrtT);
+      else if (iBlockIs2x2 && !jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(T, i, j, sqrtT);
+      else if (!iBlockIs2x2 && jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(T, i, j, sqrtT);
+      else if (!iBlockIs2x2 && !jBlockIs2x2) 
+        matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(T, i, j, sqrtT);
+    }
+  }
+}
+
+} // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Compute matrix square root of quasi-triangular matrix.
+  *
+  * \tparam  MatrixType  type of \p arg, the argument of matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  ResultType  type of \p result, where result is to be stored.
+  * \param[in]  arg      argument of matrix square root.
+  * \param[out] result   matrix square root of upper Hessenberg part of \p arg.
+  *
+  * This function computes the square root of the upper quasi-triangular matrix stored in the upper
+  * Hessenberg part of \p arg.  Only the upper Hessenberg part of \p result is updated, the rest is
+  * not touched.  See MatrixBase::sqrt() for details on how this computation is implemented.
+  *
+  * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+  */
+template <typename MatrixType, typename ResultType> 
+void matrix_sqrt_quasi_triangular(const MatrixType &arg, ResultType &result)
+{
+  eigen_assert(arg.rows() == arg.cols());
+  result.resize(arg.rows(), arg.cols());
+  internal::matrix_sqrt_quasi_triangular_diagonal(arg, result);
+  internal::matrix_sqrt_quasi_triangular_off_diagonal(arg, result);
+}
+
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Compute matrix square root of triangular matrix.
+  *
+  * \tparam  MatrixType  type of \p arg, the argument of matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  * \tparam  ResultType  type of \p result, where result is to be stored.
+  * \param[in]  arg      argument of matrix square root.
+  * \param[out] result   matrix square root of upper triangular part of \p arg.
+  *
+  * Only the upper triangular part (including the diagonal) of \p result is updated, the rest is not
+  * touched.  See MatrixBase::sqrt() for details on how this computation is implemented.
+  *
+  * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+  */
+template <typename MatrixType, typename ResultType> 
+void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
+{
+  using std::sqrt;
+  typedef typename MatrixType::Scalar Scalar;
+
+  eigen_assert(arg.rows() == arg.cols());
+
+  // Compute square root of arg and store it in upper triangular part of result
+  // This uses that the square root of triangular matrices can be computed directly.
+  result.resize(arg.rows(), arg.cols());
+  for (Index i = 0; i < arg.rows(); i++) {
+    result.coeffRef(i,i) = sqrt(arg.coeff(i,i));
+  }
+  for (Index j = 1; j < arg.cols(); j++) {
+    for (Index i = j-1; i >= 0; i--) {
+      // if i = j-1, then segment has length 0 so tmp = 0
+      Scalar tmp = (result.row(i).segment(i+1,j-i-1) * result.col(j).segment(i+1,j-i-1)).value();
+      // denominator may be zero if original matrix is singular
+      result.coeffRef(i,j) = (arg.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j));
+    }
+  }
+}
+
+
+namespace internal {
+
+/** \ingroup MatrixFunctions_Module
+  * \brief Helper struct for computing matrix square roots of general matrices.
+  * \tparam  MatrixType  type of the argument of the matrix square root,
+  *                      expected to be an instantiation of the Matrix class template.
+  *
+  * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt()
+  */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct matrix_sqrt_compute
+{
+  /** \brief Compute the matrix square root
+    *
+    * \param[in]  arg     matrix whose square root is to be computed.
+    * \param[out] result  square root of \p arg.
+    *
+    * See MatrixBase::sqrt() for details on how this computation is implemented.
+    */
+  template <typename ResultType> static void run(const MatrixType &arg, ResultType &result);    
+};
+
+
+// ********** Partial specialization for real matrices **********
+
+template <typename MatrixType>
+struct matrix_sqrt_compute<MatrixType, 0>
+{
+  typedef typename MatrixType::PlainObject PlainType;
+  template <typename ResultType>
+  static void run(const MatrixType &arg, ResultType &result)
+  {
+    eigen_assert(arg.rows() == arg.cols());
+
+    // Compute Schur decomposition of arg
+    const RealSchur<PlainType> schurOfA(arg);
+    const PlainType& T = schurOfA.matrixT();
+    const PlainType& U = schurOfA.matrixU();
+    
+    // Compute square root of T
+    PlainType sqrtT = PlainType::Zero(arg.rows(), arg.cols());
+    matrix_sqrt_quasi_triangular(T, sqrtT);
+    
+    // Compute square root of arg
+    result = U * sqrtT * U.adjoint();
+  }
+};
+
+
+// ********** Partial specialization for complex matrices **********
+
+template <typename MatrixType>
+struct matrix_sqrt_compute<MatrixType, 1>
+{
+  typedef typename MatrixType::PlainObject PlainType;
+  template <typename ResultType>
+  static void run(const MatrixType &arg, ResultType &result)
+  {
+    eigen_assert(arg.rows() == arg.cols());
+
+    // Compute Schur decomposition of arg
+    const ComplexSchur<PlainType> schurOfA(arg);
+    const PlainType& T = schurOfA.matrixT();
+    const PlainType& U = schurOfA.matrixU();
+    
+    // Compute square root of T
+    PlainType sqrtT;
+    matrix_sqrt_triangular(T, sqrtT);
+    
+    // Compute square root of arg
+    result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
+  }
+};
+
+} // end namespace internal
+
+/** \ingroup MatrixFunctions_Module
+  *
+  * \brief Proxy for the matrix square root of some matrix (expression).
+  *
+  * \tparam Derived  Type of the argument to the matrix square root.
+  *
+  * This class holds the argument to the matrix square root until it
+  * is assigned or evaluated for some other reason (so the argument
+  * should not be changed in the meantime). It is the return type of
+  * MatrixBase::sqrt() and most of the time this is the only way it is
+  * used.
+  */
+template<typename Derived> class MatrixSquareRootReturnValue
+: public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
+{
+  protected:
+    typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+  public:
+    /** \brief Constructor.
+      *
+      * \param[in]  src  %Matrix (expression) forming the argument of the
+      * matrix square root.
+      */
+    explicit MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { }
+
+    /** \brief Compute the matrix square root.
+      *
+      * \param[out]  result  the matrix square root of \p src in the
+      * constructor.
+      */
+    template <typename ResultType>
+    inline void evalTo(ResultType& result) const
+    {
+      typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+      typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
+      DerivedEvalType tmp(m_src);
+      internal::matrix_sqrt_compute<DerivedEvalTypeClean>::run(tmp, result);
+    }
+
+    Index rows() const { return m_src.rows(); }
+    Index cols() const { return m_src.cols(); }
+
+  protected:
+    const DerivedNested m_src;
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixSquareRootReturnValue<Derived> >
+{
+  typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+template <typename Derived>
+const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
+{
+  eigen_assert(rows() == cols());
+  return MatrixSquareRootReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION

diff --git a/unsupported/Eigen/src/MatrixFunctions/StemFunction.h b/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
new file mode 100644
index 0000000..7604df9
--- /dev/null
+++ b/unsupported/Eigen/src/MatrixFunctions/StemFunction.h

@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STEM_FUNCTION
+#define EIGEN_STEM_FUNCTION
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \brief The exponential function (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_exp(Scalar x, int)
+{
+  using std::exp;
+  return exp(x);
+}
+
+/** \brief Cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cos(Scalar x, int n)
+{
+  using std::cos;
+  using std::sin;
+  Scalar res;
+
+  switch (n % 4) {
+  case 0: 
+    res = std::cos(x);
+    break;
+  case 1:
+    res = -std::sin(x);
+    break;
+  case 2:
+    res = -std::cos(x);
+    break;
+  case 3:
+    res = std::sin(x);
+    break;
+  }
+  return res;
+}
+
+/** \brief Sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sin(Scalar x, int n)
+{
+  using std::cos;
+  using std::sin;
+  Scalar res;
+
+  switch (n % 4) {
+  case 0:
+    res = std::sin(x);
+    break;
+  case 1:
+    res = std::cos(x);
+    break;
+  case 2:
+    res = -std::sin(x);
+    break;
+  case 3:
+    res = -std::cos(x);
+    break;
+  }
+  return res;
+}
+
+/** \brief Hyperbolic cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cosh(Scalar x, int n)
+{
+  using std::cosh;
+  using std::sinh;
+  Scalar res;
+  
+  switch (n % 2) {
+  case 0:
+    res = std::cosh(x);
+    break;
+  case 1:
+    res = std::sinh(x);
+    break;
+  }
+  return res;
+}
+	
+/** \brief Hyperbolic sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sinh(Scalar x, int n)
+{
+  using std::cosh;
+  using std::sinh;
+  Scalar res;
+  
+  switch (n % 2) {
+  case 0:
+    res = std::sinh(x);
+    break;
+  case 1:
+    res = std::cosh(x);
+    break;
+  }
+  return res;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_STEM_FUNCTION

diff --git a/unsupported/Eigen/src/MoreVectorization/MathFunctions.h b/unsupported/Eigen/src/MoreVectorization/MathFunctions.h
new file mode 100644
index 0000000..63cb28d
--- /dev/null
+++ b/unsupported/Eigen/src/MoreVectorization/MathFunctions.h

@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
+#define EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal \returns the arcsin of \a a (coeff-wise) */
+template<typename Packet> inline static Packet pasin(Packet a) { return std::asin(a); }
+
+#ifdef EIGEN_VECTORIZE_SSE
+
+template<> EIGEN_DONT_INLINE Packet4f pasin(Packet4f x)
+{
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5);
+  _EIGEN_DECLARE_CONST_Packet4f(3half, 1.5);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+  _EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654);
+  _EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2);
+  _EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1);
+
+  Packet4f a = pabs(x);//got the absolute value
+
+  Packet4f sign_bit= _mm_and_ps(x, p4f_sign_mask);//extracted the sign bit
+
+  Packet4f z1,z2;//will need them during computation    
+
+
+//will compute the two branches for asin
+//so first compare with half
+
+  Packet4f branch_mask= _mm_cmpgt_ps(a, p4f_half);//this is to select which branch to take
+//both will be taken, and finally results will be merged
+//the branch for values >0.5
+
+    {
+//the core series expansion 
+    z1=pmadd(p4f_minus_half,a,p4f_half);
+    Packet4f x1=psqrt(z1);
+    Packet4f s1=pmadd(p4f_asin1, z1, p4f_asin2);
+    Packet4f s2=pmadd(s1, z1, p4f_asin3);
+    Packet4f s3=pmadd(s2,z1, p4f_asin4);
+    Packet4f s4=pmadd(s3,z1, p4f_asin5);
+    Packet4f temp=pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd
+    z1=pmadd(temp,x1,x1);
+    z1=padd(z1,z1);
+    z1=psub(p4f_pi_over_2,z1);
+    }
+
+    {
+//the core series expansion 
+    Packet4f x2=a;
+    z2=pmul(x2,x2);
+    Packet4f s1=pmadd(p4f_asin1, z2, p4f_asin2);
+    Packet4f s2=pmadd(s1, z2, p4f_asin3);
+    Packet4f s3=pmadd(s2,z2, p4f_asin4);
+    Packet4f s4=pmadd(s3,z2, p4f_asin5);
+    Packet4f temp=pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd
+    z2=pmadd(temp,x2,x2);
+    }
+
+/* select the correct result from the two branch evaluations */
+  z1  = _mm_and_ps(branch_mask, z1);
+  z2  = _mm_andnot_ps(branch_mask, z2);
+  Packet4f z  = _mm_or_ps(z1,z2);
+
+/* update the sign */
+  return _mm_xor_ps(z, sign_bit);
+}
+
+#endif // EIGEN_VECTORIZE_SSE
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H

diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
new file mode 100644
index 0000000..07c5ef0
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h

@@ -0,0 +1,601 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HYBRIDNONLINEARSOLVER_H
+#define EIGEN_HYBRIDNONLINEARSOLVER_H
+
+namespace Eigen { 
+
+namespace HybridNonLinearSolverSpace { 
+    enum Status {
+        Running = -1,
+        ImproperInputParameters = 0,
+        RelativeErrorTooSmall = 1,
+        TooManyFunctionEvaluation = 2,
+        TolTooSmall = 3,
+        NotMakingProgressJacobian = 4,
+        NotMakingProgressIterations = 5,
+        UserAsked = 6
+    };
+}
+
+/**
+  * \ingroup NonLinearOptimization_Module
+  * \brief Finds a zero of a system of n
+  * nonlinear functions in n variables by a modification of the Powell
+  * hybrid method ("dogleg").
+  *
+  * The user must provide a subroutine which calculates the
+  * functions. The Jacobian is either provided by the user, or approximated
+  * using a forward-difference method.
+  *
+  */
+template<typename FunctorType, typename Scalar=double>
+class HybridNonLinearSolver
+{
+public:
+    typedef DenseIndex Index;
+
+    HybridNonLinearSolver(FunctorType &_functor)
+        : functor(_functor) { nfev=njev=iter = 0;  fnorm= 0.; useExternalScaling=false;}
+
+    struct Parameters {
+        Parameters()
+            : factor(Scalar(100.))
+            , maxfev(1000)
+            , xtol(numext::sqrt(NumTraits<Scalar>::epsilon()))
+            , nb_of_subdiagonals(-1)
+            , nb_of_superdiagonals(-1)
+            , epsfcn(Scalar(0.)) {}
+        Scalar factor;
+        Index maxfev;   // maximum number of function evaluation
+        Scalar xtol;
+        Index nb_of_subdiagonals;
+        Index nb_of_superdiagonals;
+        Scalar epsfcn;
+    };
+    typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
+    typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
+    /* TODO: if eigen provides a triangular storage, use it here */
+    typedef Matrix< Scalar, Dynamic, Dynamic > UpperTriangularType;
+
+    HybridNonLinearSolverSpace::Status hybrj1(
+            FVectorType  &x,
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    HybridNonLinearSolverSpace::Status solveInit(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solveOneStep(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solve(FVectorType  &x);
+
+    HybridNonLinearSolverSpace::Status hybrd1(
+            FVectorType  &x,
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
+            );
+
+    HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solveNumericalDiffOneStep(FVectorType  &x);
+    HybridNonLinearSolverSpace::Status solveNumericalDiff(FVectorType  &x);
+
+    void resetParameters(void) { parameters = Parameters(); }
+    Parameters parameters;
+    FVectorType  fvec, qtf, diag;
+    JacobianType fjac;
+    UpperTriangularType R;
+    Index nfev;
+    Index njev;
+    Index iter;
+    Scalar fnorm;
+    bool useExternalScaling; 
+private:
+    FunctorType &functor;
+    Index n;
+    Scalar sum;
+    bool sing;
+    Scalar temp;
+    Scalar delta;
+    bool jeval;
+    Index ncsuc;
+    Scalar ratio;
+    Scalar pnorm, xnorm, fnorm1;
+    Index nslow1, nslow2;
+    Index ncfail;
+    Scalar actred, prered;
+    FVectorType wa1, wa2, wa3, wa4;
+
+    HybridNonLinearSolver& operator=(const HybridNonLinearSolver&);
+};
+
+
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::hybrj1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || tol < 0.)
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.maxfev = 100*(n+1);
+    parameters.xtol = tol;
+    diag.setConstant(n, 1.);
+    useExternalScaling = true;
+    return solve(x);
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType  &x)
+{
+    n = x.size();
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
+    fvec.resize(n);
+    qtf.resize(n);
+    fjac.resize(n, n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0. )
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize iteration counter and monitors. */
+    iter = 1;
+    ncsuc = 0;
+    ncfail = 0;
+    nslow1 = 0;
+    nslow2 = 0;
+
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveOneStep(FVectorType  &x)
+{
+    using std::abs;
+    
+    eigen_assert(x.size()==n); // check the caller is not cheating us
+
+    Index j;
+    std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
+
+    jeval = true;
+
+    /* calculate the jacobian matrix. */
+    if ( functor.df(x, fjac) < 0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    ++njev;
+
+    wa2 = fjac.colwise().blueNorm();
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* compute the qr factorization of the jacobian. */
+    HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+
+    /* copy the triangular factor of the qr factorization into r. */
+    R = qrfac.matrixQR();
+
+    /* accumulate the orthogonal factor in fjac. */
+    fjac = qrfac.householderQ();
+
+    /* form (q transpose)*fvec and store in qtf. */
+    qtf = fjac.transpose() * fvec;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    while (true) {
+        /* determine the direction p. */
+        internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return HybridNonLinearSolverSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (fnorm1 < fnorm) /* Computing 2nd power */
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction. */
+        wa3 = R.template triangularView<Upper>()*wa1 + qtf;
+        temp = wa3.stableNorm();
+        prered = 0.;
+        if (temp < fnorm) /* Computing 2nd power */
+            prered = 1. - numext::abs2(temp / fnorm);
+
+        /* compute the ratio of the actual to the predicted reduction. */
+        ratio = 0.;
+        if (prered > 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio < Scalar(.1)) {
+            ncsuc = 0;
+            ++ncfail;
+            delta = Scalar(.5) * delta;
+        } else {
+            ncfail = 0;
+            ++ncsuc;
+            if (ratio >= Scalar(.5) || ncsuc > 1)
+                delta = (std::max)(delta, pnorm / Scalar(.5));
+            if (abs(ratio - 1.) <= Scalar(.1)) {
+                delta = pnorm / Scalar(.5);
+            }
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* determine the progress of the iteration. */
+        ++nslow1;
+        if (actred >= Scalar(.001))
+            nslow1 = 0;
+        if (jeval)
+            ++nslow2;
+        if (actred >= Scalar(.1))
+            nslow2 = 0;
+
+        /* test for convergence. */
+        if (delta <= parameters.xtol * xnorm || fnorm == 0.)
+            return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+        if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+            return HybridNonLinearSolverSpace::TolTooSmall;
+        if (nslow2 == 5)
+            return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+        if (nslow1 == 10)
+            return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+        /* criterion for recalculating jacobian. */
+        if (ncfail == 2)
+            break; // leave inner loop and go for the next outer loop iteration
+
+        /* calculate the rank one modification to the jacobian */
+        /* and update qtf if necessary. */
+        wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
+        wa2 = fjac.transpose() * wa4;
+        if (ratio >= Scalar(1e-4))
+            qtf = wa2;
+        wa2 = (wa2-wa3)/pnorm;
+
+        /* compute the qr factorization of the updated jacobian. */
+        internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+        internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+        internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+        jeval = false;
+    }
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solve(FVectorType  &x)
+{
+    HybridNonLinearSolverSpace::Status status = solveInit(x);
+    if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
+        return status;
+    while (status==HybridNonLinearSolverSpace::Running)
+        status = solveOneStep(x);
+    return status;
+}
+
+
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::hybrd1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || tol < 0.)
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.maxfev = 200*(n+1);
+    parameters.xtol = tol;
+
+    diag.setConstant(n, 1.);
+    useExternalScaling = true;
+    return solveNumericalDiff(x);
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType  &x)
+{
+    n = x.size();
+
+    if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
+    if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
+    qtf.resize(n);
+    fjac.resize(n, n);
+    fvec.resize(n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.nb_of_subdiagonals< 0 || parameters.nb_of_superdiagonals< 0 || parameters.factor <= 0. )
+        return HybridNonLinearSolverSpace::ImproperInputParameters;
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize iteration counter and monitors. */
+    iter = 1;
+    ncsuc = 0;
+    ncfail = 0;
+    nslow1 = 0;
+    nslow2 = 0;
+
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffOneStep(FVectorType  &x)
+{
+    using std::sqrt;
+    using std::abs;
+    
+    assert(x.size()==n); // check the caller is not cheating us
+
+    Index j;
+    std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
+
+    jeval = true;
+    if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
+    if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
+
+    /* calculate the jacobian matrix. */
+    if (internal::fdjac1(functor, x, fvec, fjac, parameters.nb_of_subdiagonals, parameters.nb_of_superdiagonals, parameters.epsfcn) <0)
+        return HybridNonLinearSolverSpace::UserAsked;
+    nfev += (std::min)(parameters.nb_of_subdiagonals+parameters.nb_of_superdiagonals+ 1, n);
+
+    wa2 = fjac.colwise().blueNorm();
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* compute the qr factorization of the jacobian. */
+    HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+
+    /* copy the triangular factor of the qr factorization into r. */
+    R = qrfac.matrixQR();
+
+    /* accumulate the orthogonal factor in fjac. */
+    fjac = qrfac.householderQ();
+
+    /* form (q transpose)*fvec and store in qtf. */
+    qtf = fjac.transpose() * fvec;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    while (true) {
+        /* determine the direction p. */
+        internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return HybridNonLinearSolverSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (fnorm1 < fnorm) /* Computing 2nd power */
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction. */
+        wa3 = R.template triangularView<Upper>()*wa1 + qtf;
+        temp = wa3.stableNorm();
+        prered = 0.;
+        if (temp < fnorm) /* Computing 2nd power */
+            prered = 1. - numext::abs2(temp / fnorm);
+
+        /* compute the ratio of the actual to the predicted reduction. */
+        ratio = 0.;
+        if (prered > 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio < Scalar(.1)) {
+            ncsuc = 0;
+            ++ncfail;
+            delta = Scalar(.5) * delta;
+        } else {
+            ncfail = 0;
+            ++ncsuc;
+            if (ratio >= Scalar(.5) || ncsuc > 1)
+                delta = (std::max)(delta, pnorm / Scalar(.5));
+            if (abs(ratio - 1.) <= Scalar(.1)) {
+                delta = pnorm / Scalar(.5);
+            }
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* determine the progress of the iteration. */
+        ++nslow1;
+        if (actred >= Scalar(.001))
+            nslow1 = 0;
+        if (jeval)
+            ++nslow2;
+        if (actred >= Scalar(.1))
+            nslow2 = 0;
+
+        /* test for convergence. */
+        if (delta <= parameters.xtol * xnorm || fnorm == 0.)
+            return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+        if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+            return HybridNonLinearSolverSpace::TolTooSmall;
+        if (nslow2 == 5)
+            return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+        if (nslow1 == 10)
+            return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+        /* criterion for recalculating jacobian. */
+        if (ncfail == 2)
+            break; // leave inner loop and go for the next outer loop iteration
+
+        /* calculate the rank one modification to the jacobian */
+        /* and update qtf if necessary. */
+        wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
+        wa2 = fjac.transpose() * wa4;
+        if (ratio >= Scalar(1e-4))
+            qtf = wa2;
+        wa2 = (wa2-wa3)/pnorm;
+
+        /* compute the qr factorization of the updated jacobian. */
+        internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+        internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+        internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+        jeval = false;
+    }
+    return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiff(FVectorType  &x)
+{
+    HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x);
+    if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
+        return status;
+    while (status==HybridNonLinearSolverSpace::Running)
+        status = solveNumericalDiffOneStep(x);
+    return status;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_HYBRIDNONLINEARSOLVER_H
+
+//vim: ai ts=4 sts=4 et sw=4

diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
new file mode 100644
index 0000000..fe3b79c
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h

@@ -0,0 +1,657 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT__H
+#define EIGEN_LEVENBERGMARQUARDT__H
+
+namespace Eigen { 
+
+namespace LevenbergMarquardtSpace {
+    enum Status {
+        NotStarted = -2,
+        Running = -1,
+        ImproperInputParameters = 0,
+        RelativeReductionTooSmall = 1,
+        RelativeErrorTooSmall = 2,
+        RelativeErrorAndReductionTooSmall = 3,
+        CosinusTooSmall = 4,
+        TooManyFunctionEvaluation = 5,
+        FtolTooSmall = 6,
+        XtolTooSmall = 7,
+        GtolTooSmall = 8,
+        UserAsked = 9
+    };
+}
+
+
+
+/**
+  * \ingroup NonLinearOptimization_Module
+  * \brief Performs non linear optimization over a non-linear function,
+  * using a variant of the Levenberg Marquardt algorithm.
+  *
+  * Check wikipedia for more information.
+  * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+  */
+template<typename FunctorType, typename Scalar=double>
+class LevenbergMarquardt
+{
+    static Scalar sqrt_epsilon()
+    {
+      using std::sqrt;
+      return sqrt(NumTraits<Scalar>::epsilon());
+    }
+    
+public:
+    LevenbergMarquardt(FunctorType &_functor)
+        : functor(_functor) { nfev = njev = iter = 0;  fnorm = gnorm = 0.; useExternalScaling=false; }
+
+    typedef DenseIndex Index;
+    
+    struct Parameters {
+        Parameters()
+            : factor(Scalar(100.))
+            , maxfev(400)
+            , ftol(sqrt_epsilon())
+            , xtol(sqrt_epsilon())
+            , gtol(Scalar(0.))
+            , epsfcn(Scalar(0.)) {}
+        Scalar factor;
+        Index maxfev;   // maximum number of function evaluation
+        Scalar ftol;
+        Scalar xtol;
+        Scalar gtol;
+        Scalar epsfcn;
+    };
+
+    typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
+    typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
+
+    LevenbergMarquardtSpace::Status lmder1(
+            FVectorType &x,
+            const Scalar tol = sqrt_epsilon()
+            );
+
+    LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+    LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+    LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+
+    static LevenbergMarquardtSpace::Status lmdif1(
+            FunctorType &functor,
+            FVectorType &x,
+            Index *nfev,
+            const Scalar tol = sqrt_epsilon()
+            );
+
+    LevenbergMarquardtSpace::Status lmstr1(
+            FVectorType  &x,
+            const Scalar tol = sqrt_epsilon()
+            );
+
+    LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType  &x);
+    LevenbergMarquardtSpace::Status minimizeOptimumStorageInit(FVectorType  &x);
+    LevenbergMarquardtSpace::Status minimizeOptimumStorageOneStep(FVectorType  &x);
+
+    void resetParameters(void) { parameters = Parameters(); }
+
+    Parameters parameters;
+    FVectorType  fvec, qtf, diag;
+    JacobianType fjac;
+    PermutationMatrix<Dynamic,Dynamic> permutation;
+    Index nfev;
+    Index njev;
+    Index iter;
+    Scalar fnorm, gnorm;
+    bool useExternalScaling; 
+
+    Scalar lm_param(void) { return par; }
+private:
+    
+    FunctorType &functor;
+    Index n;
+    Index m;
+    FVectorType wa1, wa2, wa3, wa4;
+
+    Scalar par, sum;
+    Scalar temp, temp1, temp2;
+    Scalar delta;
+    Scalar ratio;
+    Scalar pnorm, xnorm, fnorm1, actred, dirder, prered;
+
+    LevenbergMarquardt& operator=(const LevenbergMarquardt&);
+};
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmder1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+    m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.ftol = tol;
+    parameters.xtol = tol;
+    parameters.maxfev = 100*(n+1);
+
+    return minimize(x);
+}
+
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x)
+{
+    LevenbergMarquardtSpace::Status status = minimizeInit(x);
+    if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+        return status;
+    do {
+        status = minimizeOneStep(x);
+    } while (status==LevenbergMarquardtSpace::Running);
+    return status;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType  &x)
+{
+    n = x.size();
+    m = functor.values();
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n);
+    wa4.resize(m);
+    fvec.resize(m);
+    fjac.resize(m, n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    qtf.resize(n);
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return LevenbergMarquardtSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize levenberg-marquardt parameter and iteration counter. */
+    par = 0.;
+    iter = 1;
+
+    return LevenbergMarquardtSpace::NotStarted;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType  &x)
+{
+    using std::abs;
+    using std::sqrt;
+
+    eigen_assert(x.size()==n); // check the caller is not cheating us
+
+    /* calculate the jacobian matrix. */
+    Index df_ret = functor.df(x, fjac);
+    if (df_ret<0)
+        return LevenbergMarquardtSpace::UserAsked;
+    if (df_ret>0)
+        // numerical diff, we evaluated the function df_ret times
+        nfev += df_ret;
+    else njev++;
+
+    /* compute the qr factorization of the jacobian. */
+    wa2 = fjac.colwise().blueNorm();
+    ColPivHouseholderQR<JacobianType> qrfac(fjac);
+    fjac = qrfac.matrixQR();
+    permutation = qrfac.colsPermutation();
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (Index j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.)? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* form (q transpose)*fvec and store the first n components in */
+    /* qtf. */
+    wa4 = fvec;
+    wa4.applyOnTheLeft(qrfac.householderQ().adjoint());
+    qtf = wa4.head(n);
+
+    /* compute the norm of the scaled gradient. */
+    gnorm = 0.;
+    if (fnorm != 0.)
+        for (Index j = 0; j < n; ++j)
+            if (wa2[permutation.indices()[j]] != 0.)
+                gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+
+    /* test for convergence of the gradient norm. */
+    if (gnorm <= parameters.gtol)
+        return LevenbergMarquardtSpace::CosinusTooSmall;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    do {
+
+        /* determine the levenberg-marquardt parameter. */
+        internal::lmpar2<Scalar>(qrfac, diag, qtf, delta, par, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return LevenbergMarquardtSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (Scalar(.1) * fnorm1 < fnorm)
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction and */
+        /* the scaled directional derivative. */
+        wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() *wa1);
+        temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+        temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+        prered = temp1 + temp2 / Scalar(.5);
+        dirder = -(temp1 + temp2);
+
+        /* compute the ratio of the actual to the predicted */
+        /* reduction. */
+        ratio = 0.;
+        if (prered != 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio <= Scalar(.25)) {
+            if (actred >= 0.)
+                temp = Scalar(.5);
+            if (actred < 0.)
+                temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+            if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
+                temp = Scalar(.1);
+            /* Computing MIN */
+            delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+            par /= temp;
+        } else if (!(par != 0. && ratio < Scalar(.75))) {
+            delta = pnorm / Scalar(.5);
+            par = Scalar(.5) * par;
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* tests for convergence. */
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+        if (delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+        if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::FtolTooSmall;
+        if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
+            return LevenbergMarquardtSpace::XtolTooSmall;
+        if (gnorm <= NumTraits<Scalar>::epsilon())
+            return LevenbergMarquardtSpace::GtolTooSmall;
+
+    } while (ratio < Scalar(1e-4));
+
+    return LevenbergMarquardtSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmstr1(
+        FVectorType  &x,
+        const Scalar tol
+        )
+{
+    n = x.size();
+    m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    resetParameters();
+    parameters.ftol = tol;
+    parameters.xtol = tol;
+    parameters.maxfev = 100*(n+1);
+
+    return minimizeOptimumStorage(x);
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType  &x)
+{
+    n = x.size();
+    m = functor.values();
+
+    wa1.resize(n); wa2.resize(n); wa3.resize(n);
+    wa4.resize(m);
+    fvec.resize(m);
+    // Only R is stored in fjac. Q is only used to compute 'qtf', which is
+    // Q.transpose()*rhs. qtf will be updated using givens rotation,
+    // instead of storing them in Q.
+    // The purpose it to only use a nxn matrix, instead of mxn here, so
+    // that we can handle cases where m>>n :
+    fjac.resize(n, n);
+    if (!useExternalScaling)
+        diag.resize(n);
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    qtf.resize(n);
+
+    /* Function Body */
+    nfev = 0;
+    njev = 0;
+
+    /*     check the input parameters for errors. */
+    if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    if (useExternalScaling)
+        for (Index j = 0; j < n; ++j)
+            if (diag[j] <= 0.)
+                return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    /*     evaluate the function at the starting point */
+    /*     and calculate its norm. */
+    nfev = 1;
+    if ( functor(x, fvec) < 0)
+        return LevenbergMarquardtSpace::UserAsked;
+    fnorm = fvec.stableNorm();
+
+    /*     initialize levenberg-marquardt parameter and iteration counter. */
+    par = 0.;
+    iter = 1;
+
+    return LevenbergMarquardtSpace::NotStarted;
+}
+
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageOneStep(FVectorType  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    
+    eigen_assert(x.size()==n); // check the caller is not cheating us
+
+    Index i, j;
+    bool sing;
+
+    /* compute the qr factorization of the jacobian matrix */
+    /* calculated one row at a time, while simultaneously */
+    /* forming (q transpose)*fvec and storing the first */
+    /* n components in qtf. */
+    qtf.fill(0.);
+    fjac.fill(0.);
+    Index rownb = 2;
+    for (i = 0; i < m; ++i) {
+        if (functor.df(x, wa3, rownb) < 0) return LevenbergMarquardtSpace::UserAsked;
+        internal::rwupdt<Scalar>(fjac, wa3, qtf, fvec[i]);
+        ++rownb;
+    }
+    ++njev;
+
+    /* if the jacobian is rank deficient, call qrfac to */
+    /* reorder its columns and update the components of qtf. */
+    sing = false;
+    for (j = 0; j < n; ++j) {
+        if (fjac(j,j) == 0.)
+            sing = true;
+        wa2[j] = fjac.col(j).head(j).stableNorm();
+    }
+    permutation.setIdentity(n);
+    if (sing) {
+        wa2 = fjac.colwise().blueNorm();
+        // TODO We have no unit test covering this code path, do not modify
+        // until it is carefully tested
+        ColPivHouseholderQR<JacobianType> qrfac(fjac);
+        fjac = qrfac.matrixQR();
+        wa1 = fjac.diagonal();
+        fjac.diagonal() = qrfac.hCoeffs();
+        permutation = qrfac.colsPermutation();
+        // TODO : avoid this:
+        for(Index ii=0; ii< fjac.cols(); ii++) fjac.col(ii).segment(ii+1, fjac.rows()-ii-1) *= fjac(ii,ii); // rescale vectors
+
+        for (j = 0; j < n; ++j) {
+            if (fjac(j,j) != 0.) {
+                sum = 0.;
+                for (i = j; i < n; ++i)
+                    sum += fjac(i,j) * qtf[i];
+                temp = -sum / fjac(j,j);
+                for (i = j; i < n; ++i)
+                    qtf[i] += fjac(i,j) * temp;
+            }
+            fjac(j,j) = wa1[j];
+        }
+    }
+
+    /* on the first iteration and if external scaling is not used, scale according */
+    /* to the norms of the columns of the initial jacobian. */
+    if (iter == 1) {
+        if (!useExternalScaling)
+            for (j = 0; j < n; ++j)
+                diag[j] = (wa2[j]==0.)? 1. : wa2[j];
+
+        /* on the first iteration, calculate the norm of the scaled x */
+        /* and initialize the step bound delta. */
+        xnorm = diag.cwiseProduct(x).stableNorm();
+        delta = parameters.factor * xnorm;
+        if (delta == 0.)
+            delta = parameters.factor;
+    }
+
+    /* compute the norm of the scaled gradient. */
+    gnorm = 0.;
+    if (fnorm != 0.)
+        for (j = 0; j < n; ++j)
+            if (wa2[permutation.indices()[j]] != 0.)
+                gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+
+    /* test for convergence of the gradient norm. */
+    if (gnorm <= parameters.gtol)
+        return LevenbergMarquardtSpace::CosinusTooSmall;
+
+    /* rescale if necessary. */
+    if (!useExternalScaling)
+        diag = diag.cwiseMax(wa2);
+
+    do {
+
+        /* determine the levenberg-marquardt parameter. */
+        internal::lmpar<Scalar>(fjac, permutation.indices(), diag, qtf, delta, par, wa1);
+
+        /* store the direction p and x + p. calculate the norm of p. */
+        wa1 = -wa1;
+        wa2 = x + wa1;
+        pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+        /* on the first iteration, adjust the initial step bound. */
+        if (iter == 1)
+            delta = (std::min)(delta,pnorm);
+
+        /* evaluate the function at x + p and calculate its norm. */
+        if ( functor(wa2, wa4) < 0)
+            return LevenbergMarquardtSpace::UserAsked;
+        ++nfev;
+        fnorm1 = wa4.stableNorm();
+
+        /* compute the scaled actual reduction. */
+        actred = -1.;
+        if (Scalar(.1) * fnorm1 < fnorm)
+            actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+        /* compute the scaled predicted reduction and */
+        /* the scaled directional derivative. */
+        wa3 = fjac.topLeftCorner(n,n).template triangularView<Upper>() * (permutation.inverse() * wa1);
+        temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+        temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+        prered = temp1 + temp2 / Scalar(.5);
+        dirder = -(temp1 + temp2);
+
+        /* compute the ratio of the actual to the predicted */
+        /* reduction. */
+        ratio = 0.;
+        if (prered != 0.)
+            ratio = actred / prered;
+
+        /* update the step bound. */
+        if (ratio <= Scalar(.25)) {
+            if (actred >= 0.)
+                temp = Scalar(.5);
+            if (actred < 0.)
+                temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+            if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
+                temp = Scalar(.1);
+            /* Computing MIN */
+            delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+            par /= temp;
+        } else if (!(par != 0. && ratio < Scalar(.75))) {
+            delta = pnorm / Scalar(.5);
+            par = Scalar(.5) * par;
+        }
+
+        /* test for successful iteration. */
+        if (ratio >= Scalar(1e-4)) {
+            /* successful iteration. update x, fvec, and their norms. */
+            x = wa2;
+            wa2 = diag.cwiseProduct(x);
+            fvec = wa4;
+            xnorm = wa2.stableNorm();
+            fnorm = fnorm1;
+            ++iter;
+        }
+
+        /* tests for convergence. */
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+        if (delta <= parameters.xtol * xnorm)
+            return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+        /* tests for termination and stringent tolerances. */
+        if (nfev >= parameters.maxfev)
+            return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+        if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+            return LevenbergMarquardtSpace::FtolTooSmall;
+        if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
+            return LevenbergMarquardtSpace::XtolTooSmall;
+        if (gnorm <= NumTraits<Scalar>::epsilon())
+            return LevenbergMarquardtSpace::GtolTooSmall;
+
+    } while (ratio < Scalar(1e-4));
+
+    return LevenbergMarquardtSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorage(FVectorType  &x)
+{
+    LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x);
+    if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+        return status;
+    do {
+        status = minimizeOptimumStorageOneStep(x);
+    } while (status==LevenbergMarquardtSpace::Running);
+    return status;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmdif1(
+        FunctorType &functor,
+        FVectorType  &x,
+        Index *nfev,
+        const Scalar tol
+        )
+{
+    Index n = x.size();
+    Index m = functor.values();
+
+    /* check the input parameters for errors. */
+    if (n <= 0 || m < n || tol < 0.)
+        return LevenbergMarquardtSpace::ImproperInputParameters;
+
+    NumericalDiff<FunctorType> numDiff(functor);
+    // embedded LevenbergMarquardt
+    LevenbergMarquardt<NumericalDiff<FunctorType>, Scalar > lm(numDiff);
+    lm.parameters.ftol = tol;
+    lm.parameters.xtol = tol;
+    lm.parameters.maxfev = 200*(n+1);
+
+    LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+    if (nfev)
+        * nfev = lm.nfev;
+    return info;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEVENBERGMARQUARDT__H
+
+//vim: ai ts=4 sts=4 et sw=4

diff --git a/unsupported/Eigen/src/NonLinearOptimization/chkder.h b/unsupported/Eigen/src/NonLinearOptimization/chkder.h
new file mode 100644
index 0000000..db8ff7d
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/chkder.h

@@ -0,0 +1,66 @@
+#define chkder_log10e 0.43429448190325182765
+#define chkder_factor 100.
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar>
+void chkder(
+        const Matrix< Scalar, Dynamic, 1 >  &x,
+        const Matrix< Scalar, Dynamic, 1 >  &fvec,
+        const Matrix< Scalar, Dynamic, Dynamic > &fjac,
+        Matrix< Scalar, Dynamic, 1 >  &xp,
+        const Matrix< Scalar, Dynamic, 1 >  &fvecp,
+        int mode,
+        Matrix< Scalar, Dynamic, 1 >  &err
+        )
+{
+    using std::sqrt;
+    using std::abs;
+    using std::log;
+    
+    typedef DenseIndex Index;
+
+    const Scalar eps = sqrt(NumTraits<Scalar>::epsilon());
+    const Scalar epsf = chkder_factor * NumTraits<Scalar>::epsilon();
+    const Scalar epslog = chkder_log10e * log(eps);
+    Scalar temp;
+
+    const Index m = fvec.size(), n = x.size();
+
+    if (mode != 2) {
+        /* mode = 1. */
+        xp.resize(n);
+        for (Index j = 0; j < n; ++j) {
+            temp = eps * abs(x[j]);
+            if (temp == 0.)
+                temp = eps;
+            xp[j] = x[j] + temp;
+        }
+    }
+    else {
+        /* mode = 2. */
+        err.setZero(m); 
+        for (Index j = 0; j < n; ++j) {
+            temp = abs(x[j]);
+            if (temp == 0.)
+                temp = 1.;
+            err += temp * fjac.col(j);
+        }
+        for (Index i = 0; i < m; ++i) {
+            temp = 1.;
+            if (fvec[i] != 0. && fvecp[i] != 0. && abs(fvecp[i] - fvec[i]) >= epsf * abs(fvec[i]))
+                temp = eps * abs((fvecp[i] - fvec[i]) / eps - err[i]) / (abs(fvec[i]) + abs(fvecp[i]));
+            err[i] = 1.;
+            if (temp > NumTraits<Scalar>::epsilon() && temp < eps)
+                err[i] = (chkder_log10e * log(temp) - epslog) / epslog;
+            if (temp >= eps)
+                err[i] = 0.;
+        }
+    }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/covar.h b/unsupported/Eigen/src/NonLinearOptimization/covar.h
new file mode 100644
index 0000000..68260d1
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/covar.h

@@ -0,0 +1,70 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void covar(
+        Matrix< Scalar, Dynamic, Dynamic > &r,
+        const VectorXi &ipvt,
+        Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
+{
+    using std::abs;
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j, k, l, ii, jj;
+    bool sing;
+    Scalar temp;
+
+    /* Function Body */
+    const Index n = r.cols();
+    const Scalar tolr = tol * abs(r(0,0));
+    Matrix< Scalar, Dynamic, 1 > wa(n);
+    eigen_assert(ipvt.size()==n);
+
+    /* form the inverse of r in the full upper triangle of r. */
+    l = -1;
+    for (k = 0; k < n; ++k)
+        if (abs(r(k,k)) > tolr) {
+            r(k,k) = 1. / r(k,k);
+            for (j = 0; j <= k-1; ++j) {
+                temp = r(k,k) * r(j,k);
+                r(j,k) = 0.;
+                r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
+            }
+            l = k;
+        }
+
+    /* form the full upper triangle of the inverse of (r transpose)*r */
+    /* in the full upper triangle of r. */
+    for (k = 0; k <= l; ++k) {
+        for (j = 0; j <= k-1; ++j)
+            r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
+        r.col(k).head(k+1) *= r(k,k);
+    }
+
+    /* form the full lower triangle of the covariance matrix */
+    /* in the strict lower triangle of r and in wa. */
+    for (j = 0; j < n; ++j) {
+        jj = ipvt[j];
+        sing = j > l;
+        for (i = 0; i <= j; ++i) {
+            if (sing)
+                r(i,j) = 0.;
+            ii = ipvt[i];
+            if (ii > jj)
+                r(ii,jj) = r(i,j);
+            if (ii < jj)
+                r(jj,ii) = r(i,j);
+        }
+        wa[jj] = r(j,j);
+    }
+
+    /* symmetrize the covariance matrix in r. */
+    r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
+    r.diagonal() = wa;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/dogleg.h b/unsupported/Eigen/src/NonLinearOptimization/dogleg.h
new file mode 100644
index 0000000..80c5d27
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/dogleg.h

@@ -0,0 +1,107 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void dogleg(
+        const Matrix< Scalar, Dynamic, Dynamic >  &qrfac,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Scalar delta,
+        Matrix< Scalar, Dynamic, 1 >  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j;
+    Scalar sum, temp, alpha, bnorm;
+    Scalar gnorm, qnorm;
+    Scalar sgnorm;
+
+    /* Function Body */
+    const Scalar epsmch = NumTraits<Scalar>::epsilon();
+    const Index n = qrfac.cols();
+    eigen_assert(n==qtb.size());
+    eigen_assert(n==x.size());
+    eigen_assert(n==diag.size());
+    Matrix< Scalar, Dynamic, 1 >  wa1(n), wa2(n);
+
+    /* first, calculate the gauss-newton direction. */
+    for (j = n-1; j >=0; --j) {
+        temp = qrfac(j,j);
+        if (temp == 0.) {
+            temp = epsmch * qrfac.col(j).head(j+1).maxCoeff();
+            if (temp == 0.)
+                temp = epsmch;
+        }
+        if (j==n-1)
+            x[j] = qtb[j] / temp;
+        else
+            x[j] = (qtb[j] - qrfac.row(j).tail(n-j-1).dot(x.tail(n-j-1))) / temp;
+    }
+
+    /* test whether the gauss-newton direction is acceptable. */
+    qnorm = diag.cwiseProduct(x).stableNorm();
+    if (qnorm <= delta)
+        return;
+
+    // TODO : this path is not tested by Eigen unit tests
+
+    /* the gauss-newton direction is not acceptable. */
+    /* next, calculate the scaled gradient direction. */
+
+    wa1.fill(0.);
+    for (j = 0; j < n; ++j) {
+        wa1.tail(n-j) += qrfac.row(j).tail(n-j) * qtb[j];
+        wa1[j] /= diag[j];
+    }
+
+    /* calculate the norm of the scaled gradient and test for */
+    /* the special case in which the scaled gradient is zero. */
+    gnorm = wa1.stableNorm();
+    sgnorm = 0.;
+    alpha = delta / qnorm;
+    if (gnorm == 0.)
+        goto algo_end;
+
+    /* calculate the point along the scaled gradient */
+    /* at which the quadratic is minimized. */
+    wa1.array() /= (diag*gnorm).array();
+    // TODO : once unit tests cover this part,:
+    // wa2 = qrfac.template triangularView<Upper>() * wa1;
+    for (j = 0; j < n; ++j) {
+        sum = 0.;
+        for (i = j; i < n; ++i) {
+            sum += qrfac(j,i) * wa1[i];
+        }
+        wa2[j] = sum;
+    }
+    temp = wa2.stableNorm();
+    sgnorm = gnorm / temp / temp;
+
+    /* test whether the scaled gradient direction is acceptable. */
+    alpha = 0.;
+    if (sgnorm >= delta)
+        goto algo_end;
+
+    /* the scaled gradient direction is not acceptable. */
+    /* finally, calculate the point along the dogleg */
+    /* at which the quadratic is minimized. */
+    bnorm = qtb.stableNorm();
+    temp = bnorm / gnorm * (bnorm / qnorm) * (sgnorm / delta);
+    temp = temp - delta / qnorm * numext::abs2(sgnorm / delta) + sqrt(numext::abs2(temp - delta / qnorm) + (1.-numext::abs2(delta / qnorm)) * (1.-numext::abs2(sgnorm / delta)));
+    alpha = delta / qnorm * (1. - numext::abs2(sgnorm / delta)) / temp;
+algo_end:
+
+    /* form appropriate convex combination of the gauss-newton */
+    /* direction and the scaled gradient direction. */
+    temp = (1.-alpha) * (std::min)(sgnorm,delta);
+    x = temp * wa1 + alpha * x;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h b/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h
new file mode 100644
index 0000000..bb7cf26
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h

@@ -0,0 +1,79 @@
+namespace Eigen { 
+
+namespace internal {
+
+template<typename FunctorType, typename Scalar>
+DenseIndex fdjac1(
+        const FunctorType &Functor,
+        Matrix< Scalar, Dynamic, 1 >  &x,
+        Matrix< Scalar, Dynamic, 1 >  &fvec,
+        Matrix< Scalar, Dynamic, Dynamic > &fjac,
+        DenseIndex ml, DenseIndex mu,
+        Scalar epsfcn)
+{
+    using std::sqrt;
+    using std::abs;
+    
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Scalar h;
+    Index j, k;
+    Scalar eps, temp;
+    Index msum;
+    int iflag;
+    Index start, length;
+
+    /* Function Body */
+    const Scalar epsmch = NumTraits<Scalar>::epsilon();
+    const Index n = x.size();
+    eigen_assert(fvec.size()==n);
+    Matrix< Scalar, Dynamic, 1 >  wa1(n);
+    Matrix< Scalar, Dynamic, 1 >  wa2(n);
+
+    eps = sqrt((std::max)(epsfcn,epsmch));
+    msum = ml + mu + 1;
+    if (msum >= n) {
+        /* computation of dense approximate jacobian. */
+        for (j = 0; j < n; ++j) {
+            temp = x[j];
+            h = eps * abs(temp);
+            if (h == 0.)
+                h = eps;
+            x[j] = temp + h;
+            iflag = Functor(x, wa1);
+            if (iflag < 0)
+                return iflag;
+            x[j] = temp;
+            fjac.col(j) = (wa1-fvec)/h;
+        }
+
+    }else {
+        /* computation of banded approximate jacobian. */
+        for (k = 0; k < msum; ++k) {
+            for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
+                wa2[j] = x[j];
+                h = eps * abs(wa2[j]);
+                if (h == 0.) h = eps;
+                x[j] = wa2[j] + h;
+            }
+            iflag = Functor(x, wa1);
+            if (iflag < 0)
+                return iflag;
+            for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
+                x[j] = wa2[j];
+                h = eps * abs(wa2[j]);
+                if (h == 0.) h = eps;
+                fjac.col(j).setZero();
+                start = std::max<Index>(0,j-mu);
+                length = (std::min)(n-1, j+ml) - start + 1;
+                fjac.col(j).segment(start, length) = ( wa1.segment(start, length)-fvec.segment(start, length))/h;
+            }
+        }
+    }
+    return 0;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/lmpar.h b/unsupported/Eigen/src/NonLinearOptimization/lmpar.h
new file mode 100644
index 0000000..4c17d4c
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/lmpar.h

@@ -0,0 +1,298 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void lmpar(
+        Matrix< Scalar, Dynamic, Dynamic > &r,
+        const VectorXi &ipvt,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Scalar delta,
+        Scalar &par,
+        Matrix< Scalar, Dynamic, 1 >  &x)
+{
+    using std::abs;
+    using std::sqrt;
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j, l;
+    Scalar fp;
+    Scalar parc, parl;
+    Index iter;
+    Scalar temp, paru;
+    Scalar gnorm;
+    Scalar dxnorm;
+
+
+    /* Function Body */
+    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+    const Index n = r.cols();
+    eigen_assert(n==diag.size());
+    eigen_assert(n==qtb.size());
+    eigen_assert(n==x.size());
+
+    Matrix< Scalar, Dynamic, 1 >  wa1, wa2;
+
+    /* compute and store in x the gauss-newton direction. if the */
+    /* jacobian is rank-deficient, obtain a least squares solution. */
+    Index nsing = n-1;
+    wa1 = qtb;
+    for (j = 0; j < n; ++j) {
+        if (r(j,j) == 0. && nsing == n-1)
+            nsing = j - 1;
+        if (nsing < n-1)
+            wa1[j] = 0.;
+    }
+    for (j = nsing; j>=0; --j) {
+        wa1[j] /= r(j,j);
+        temp = wa1[j];
+        for (i = 0; i < j ; ++i)
+            wa1[i] -= r(i,j) * temp;
+    }
+
+    for (j = 0; j < n; ++j)
+        x[ipvt[j]] = wa1[j];
+
+    /* initialize the iteration counter. */
+    /* evaluate the function at the origin, and test */
+    /* for acceptance of the gauss-newton direction. */
+    iter = 0;
+    wa2 = diag.cwiseProduct(x);
+    dxnorm = wa2.blueNorm();
+    fp = dxnorm - delta;
+    if (fp <= Scalar(0.1) * delta) {
+        par = 0;
+        return;
+    }
+
+    /* if the jacobian is not rank deficient, the newton */
+    /* step provides a lower bound, parl, for the zero of */
+    /* the function. otherwise set this bound to zero. */
+    parl = 0.;
+    if (nsing >= n-1) {
+        for (j = 0; j < n; ++j) {
+            l = ipvt[j];
+            wa1[j] = diag[l] * (wa2[l] / dxnorm);
+        }
+        // it's actually a triangularView.solveInplace(), though in a weird
+        // way:
+        for (j = 0; j < n; ++j) {
+            Scalar sum = 0.;
+            for (i = 0; i < j; ++i)
+                sum += r(i,j) * wa1[i];
+            wa1[j] = (wa1[j] - sum) / r(j,j);
+        }
+        temp = wa1.blueNorm();
+        parl = fp / delta / temp / temp;
+    }
+
+    /* calculate an upper bound, paru, for the zero of the function. */
+    for (j = 0; j < n; ++j)
+        wa1[j] = r.col(j).head(j+1).dot(qtb.head(j+1)) / diag[ipvt[j]];
+
+    gnorm = wa1.stableNorm();
+    paru = gnorm / delta;
+    if (paru == 0.)
+        paru = dwarf / (std::min)(delta,Scalar(0.1));
+
+    /* if the input par lies outside of the interval (parl,paru), */
+    /* set par to the closer endpoint. */
+    par = (std::max)(par,parl);
+    par = (std::min)(par,paru);
+    if (par == 0.)
+        par = gnorm / dxnorm;
+
+    /* beginning of an iteration. */
+    while (true) {
+        ++iter;
+
+        /* evaluate the function at the current value of par. */
+        if (par == 0.)
+            par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+        wa1 = sqrt(par)* diag;
+
+        Matrix< Scalar, Dynamic, 1 > sdiag(n);
+        qrsolv<Scalar>(r, ipvt, wa1, qtb, x, sdiag);
+
+        wa2 = diag.cwiseProduct(x);
+        dxnorm = wa2.blueNorm();
+        temp = fp;
+        fp = dxnorm - delta;
+
+        /* if the function is small enough, accept the current value */
+        /* of par. also test for the exceptional cases where parl */
+        /* is zero or the number of iterations has reached 10. */
+        if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+            break;
+
+        /* compute the newton correction. */
+        for (j = 0; j < n; ++j) {
+            l = ipvt[j];
+            wa1[j] = diag[l] * (wa2[l] / dxnorm);
+        }
+        for (j = 0; j < n; ++j) {
+            wa1[j] /= sdiag[j];
+            temp = wa1[j];
+            for (i = j+1; i < n; ++i)
+                wa1[i] -= r(i,j) * temp;
+        }
+        temp = wa1.blueNorm();
+        parc = fp / delta / temp / temp;
+
+        /* depending on the sign of the function, update parl or paru. */
+        if (fp > 0.)
+            parl = (std::max)(parl,par);
+        if (fp < 0.)
+            paru = (std::min)(paru,par);
+
+        /* compute an improved estimate for par. */
+        /* Computing MAX */
+        par = (std::max)(parl,par+parc);
+
+        /* end of an iteration. */
+    }
+
+    /* termination. */
+    if (iter == 0)
+        par = 0.;
+    return;
+}
+
+template <typename Scalar>
+void lmpar2(
+        const ColPivHouseholderQR<Matrix< Scalar, Dynamic, Dynamic> > &qr,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Scalar delta,
+        Scalar &par,
+        Matrix< Scalar, Dynamic, 1 >  &x)
+
+{
+    using std::sqrt;
+    using std::abs;
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index j;
+    Scalar fp;
+    Scalar parc, parl;
+    Index iter;
+    Scalar temp, paru;
+    Scalar gnorm;
+    Scalar dxnorm;
+
+
+    /* Function Body */
+    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+    const Index n = qr.matrixQR().cols();
+    eigen_assert(n==diag.size());
+    eigen_assert(n==qtb.size());
+
+    Matrix< Scalar, Dynamic, 1 >  wa1, wa2;
+
+    /* compute and store in x the gauss-newton direction. if the */
+    /* jacobian is rank-deficient, obtain a least squares solution. */
+
+//    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+    const Index rank = qr.rank(); // use a threshold
+    wa1 = qtb;
+    wa1.tail(n-rank).setZero();
+    qr.matrixQR().topLeftCorner(rank, rank).template triangularView<Upper>().solveInPlace(wa1.head(rank));
+
+    x = qr.colsPermutation()*wa1;
+
+    /* initialize the iteration counter. */
+    /* evaluate the function at the origin, and test */
+    /* for acceptance of the gauss-newton direction. */
+    iter = 0;
+    wa2 = diag.cwiseProduct(x);
+    dxnorm = wa2.blueNorm();
+    fp = dxnorm - delta;
+    if (fp <= Scalar(0.1) * delta) {
+        par = 0;
+        return;
+    }
+
+    /* if the jacobian is not rank deficient, the newton */
+    /* step provides a lower bound, parl, for the zero of */
+    /* the function. otherwise set this bound to zero. */
+    parl = 0.;
+    if (rank==n) {
+        wa1 = qr.colsPermutation().inverse() *  diag.cwiseProduct(wa2)/dxnorm;
+        qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+        temp = wa1.blueNorm();
+        parl = fp / delta / temp / temp;
+    }
+
+    /* calculate an upper bound, paru, for the zero of the function. */
+    for (j = 0; j < n; ++j)
+        wa1[j] = qr.matrixQR().col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
+
+    gnorm = wa1.stableNorm();
+    paru = gnorm / delta;
+    if (paru == 0.)
+        paru = dwarf / (std::min)(delta,Scalar(0.1));
+
+    /* if the input par lies outside of the interval (parl,paru), */
+    /* set par to the closer endpoint. */
+    par = (std::max)(par,parl);
+    par = (std::min)(par,paru);
+    if (par == 0.)
+        par = gnorm / dxnorm;
+
+    /* beginning of an iteration. */
+    Matrix< Scalar, Dynamic, Dynamic > s = qr.matrixQR();
+    while (true) {
+        ++iter;
+
+        /* evaluate the function at the current value of par. */
+        if (par == 0.)
+            par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+        wa1 = sqrt(par)* diag;
+
+        Matrix< Scalar, Dynamic, 1 > sdiag(n);
+        qrsolv<Scalar>(s, qr.colsPermutation().indices(), wa1, qtb, x, sdiag);
+
+        wa2 = diag.cwiseProduct(x);
+        dxnorm = wa2.blueNorm();
+        temp = fp;
+        fp = dxnorm - delta;
+
+        /* if the function is small enough, accept the current value */
+        /* of par. also test for the exceptional cases where parl */
+        /* is zero or the number of iterations has reached 10. */
+        if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+            break;
+
+        /* compute the newton correction. */
+        wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
+        // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+        // qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+        for (j = 0; j < n; ++j) {
+            wa1[j] /= sdiag[j];
+            temp = wa1[j];
+            for (Index i = j+1; i < n; ++i)
+                wa1[i] -= s(i,j) * temp;
+        }
+        temp = wa1.blueNorm();
+        parc = fp / delta / temp / temp;
+
+        /* depending on the sign of the function, update parl or paru. */
+        if (fp > 0.)
+            parl = (std::max)(parl,par);
+        if (fp < 0.)
+            paru = (std::min)(paru,par);
+
+        /* compute an improved estimate for par. */
+        par = (std::max)(parl,par+parc);
+    }
+    if (iter == 0)
+        par = 0.;
+    return;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
new file mode 100644
index 0000000..4f2f560
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h

@@ -0,0 +1,91 @@
+namespace Eigen { 
+
+namespace internal {
+
+// TODO : once qrsolv2 is removed, use ColPivHouseholderQR or PermutationMatrix instead of ipvt
+template <typename Scalar>
+void qrsolv(
+        Matrix< Scalar, Dynamic, Dynamic > &s,
+        // TODO : use a PermutationMatrix once lmpar is no more:
+        const VectorXi &ipvt,
+        const Matrix< Scalar, Dynamic, 1 >  &diag,
+        const Matrix< Scalar, Dynamic, 1 >  &qtb,
+        Matrix< Scalar, Dynamic, 1 >  &x,
+        Matrix< Scalar, Dynamic, 1 >  &sdiag)
+
+{
+    typedef DenseIndex Index;
+
+    /* Local variables */
+    Index i, j, k, l;
+    Scalar temp;
+    Index n = s.cols();
+    Matrix< Scalar, Dynamic, 1 >  wa(n);
+    JacobiRotation<Scalar> givens;
+
+    /* Function Body */
+    // the following will only change the lower triangular part of s, including
+    // the diagonal, though the diagonal is restored afterward
+
+    /*     copy r and (q transpose)*b to preserve input and initialize s. */
+    /*     in particular, save the diagonal elements of r in x. */
+    x = s.diagonal();
+    wa = qtb;
+
+    s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
+
+    /*     eliminate the diagonal matrix d using a givens rotation. */
+    for (j = 0; j < n; ++j) {
+
+        /*        prepare the row of d to be eliminated, locating the */
+        /*        diagonal element using p from the qr factorization. */
+        l = ipvt[j];
+        if (diag[l] == 0.)
+            break;
+        sdiag.tail(n-j).setZero();
+        sdiag[j] = diag[l];
+
+        /*        the transformations to eliminate the row of d */
+        /*        modify only a single element of (q transpose)*b */
+        /*        beyond the first n, which is initially zero. */
+        Scalar qtbpj = 0.;
+        for (k = j; k < n; ++k) {
+            /*           determine a givens rotation which eliminates the */
+            /*           appropriate element in the current row of d. */
+            givens.makeGivens(-s(k,k), sdiag[k]);
+
+            /*           compute the modified diagonal element of r and */
+            /*           the modified element of ((q transpose)*b,0). */
+            s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
+            temp = givens.c() * wa[k] + givens.s() * qtbpj;
+            qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+            wa[k] = temp;
+
+            /*           accumulate the transformation in the row of s. */
+            for (i = k+1; i<n; ++i) {
+                temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
+                sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
+                s(i,k) = temp;
+            }
+        }
+    }
+
+    /*     solve the triangular system for z. if the system is */
+    /*     singular, then obtain a least squares solution. */
+    Index nsing;
+    for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
+
+    wa.tail(n-nsing).setZero();
+    s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+
+    // restore
+    sdiag = s.diagonal();
+    s.diagonal() = x;
+
+    /*     permute the components of z back to components of x. */
+    for (j = 0; j < n; ++j) x[ipvt[j]] = wa[j];
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h b/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h
new file mode 100644
index 0000000..36ff700
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h

@@ -0,0 +1,30 @@
+namespace Eigen { 
+
+namespace internal {
+
+// TODO : move this to GivensQR once there's such a thing in Eigen
+
+template <typename Scalar>
+void r1mpyq(DenseIndex m, DenseIndex n, Scalar *a, const std::vector<JacobiRotation<Scalar> > &v_givens, const std::vector<JacobiRotation<Scalar> > &w_givens)
+{
+    typedef DenseIndex Index;
+
+    /*     apply the first set of givens rotations to a. */
+    for (Index j = n-2; j>=0; --j)
+        for (Index i = 0; i<m; ++i) {
+            Scalar temp = v_givens[j].c() * a[i+m*j] - v_givens[j].s() * a[i+m*(n-1)];
+            a[i+m*(n-1)] = v_givens[j].s() * a[i+m*j] + v_givens[j].c() * a[i+m*(n-1)];
+            a[i+m*j] = temp;
+        }
+    /*     apply the second set of givens rotations to a. */
+    for (Index j = 0; j<n-1; ++j)
+        for (Index i = 0; i<m; ++i) {
+            Scalar temp = w_givens[j].c() * a[i+m*j] + w_givens[j].s() * a[i+m*(n-1)];
+            a[i+m*(n-1)] = -w_givens[j].s() * a[i+m*j] + w_givens[j].c() * a[i+m*(n-1)];
+            a[i+m*j] = temp;
+        }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
new file mode 100644
index 0000000..09fc652
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h

@@ -0,0 +1,99 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void r1updt(
+        Matrix< Scalar, Dynamic, Dynamic > &s,
+        const Matrix< Scalar, Dynamic, 1> &u,
+        std::vector<JacobiRotation<Scalar> > &v_givens,
+        std::vector<JacobiRotation<Scalar> > &w_givens,
+        Matrix< Scalar, Dynamic, 1> &v,
+        Matrix< Scalar, Dynamic, 1> &w,
+        bool *sing)
+{
+    typedef DenseIndex Index;
+    const JacobiRotation<Scalar> IdentityRotation = JacobiRotation<Scalar>(1,0);
+
+    /* Local variables */
+    const Index m = s.rows();
+    const Index n = s.cols();
+    Index i, j=1;
+    Scalar temp;
+    JacobiRotation<Scalar> givens;
+
+    // r1updt had a broader usecase, but we don't use it here. And, more
+    // importantly, we can not test it.
+    eigen_assert(m==n);
+    eigen_assert(u.size()==m);
+    eigen_assert(v.size()==n);
+    eigen_assert(w.size()==n);
+
+    /* move the nontrivial part of the last column of s into w. */
+    w[n-1] = s(n-1,n-1);
+
+    /* rotate the vector v into a multiple of the n-th unit vector */
+    /* in such a way that a spike is introduced into w. */
+    for (j=n-2; j>=0; --j) {
+        w[j] = 0.;
+        if (v[j] != 0.) {
+            /* determine a givens rotation which eliminates the */
+            /* j-th element of v. */
+            givens.makeGivens(-v[n-1], v[j]);
+
+            /* apply the transformation to v and store the information */
+            /* necessary to recover the givens rotation. */
+            v[n-1] = givens.s() * v[j] + givens.c() * v[n-1];
+            v_givens[j] = givens;
+
+            /* apply the transformation to s and extend the spike in w. */
+            for (i = j; i < m; ++i) {
+                temp = givens.c() * s(j,i) - givens.s() * w[i];
+                w[i] = givens.s() * s(j,i) + givens.c() * w[i];
+                s(j,i) = temp;
+            }
+        } else
+            v_givens[j] = IdentityRotation;
+    }
+
+    /* add the spike from the rank 1 update to w. */
+    w += v[n-1] * u;
+
+    /* eliminate the spike. */
+    *sing = false;
+    for (j = 0; j < n-1; ++j) {
+        if (w[j] != 0.) {
+            /* determine a givens rotation which eliminates the */
+            /* j-th element of the spike. */
+            givens.makeGivens(-s(j,j), w[j]);
+
+            /* apply the transformation to s and reduce the spike in w. */
+            for (i = j; i < m; ++i) {
+                temp = givens.c() * s(j,i) + givens.s() * w[i];
+                w[i] = -givens.s() * s(j,i) + givens.c() * w[i];
+                s(j,i) = temp;
+            }
+
+            /* store the information necessary to recover the */
+            /* givens rotation. */
+            w_givens[j] = givens;
+        } else
+            v_givens[j] = IdentityRotation;
+
+        /* test for zero diagonal elements in the output s. */
+        if (s(j,j) == 0.) {
+            *sing = true;
+        }
+    }
+    /* move w back into the last column of the output s. */
+    s(n-1,n-1) = w[n-1];
+
+    if (s(j,j) == 0.) {
+        *sing = true;
+    }
+    return;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h b/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h
new file mode 100644
index 0000000..6ebf856
--- /dev/null
+++ b/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h

@@ -0,0 +1,49 @@
+namespace Eigen { 
+
+namespace internal {
+
+template <typename Scalar>
+void rwupdt(
+        Matrix< Scalar, Dynamic, Dynamic >  &r,
+        const Matrix< Scalar, Dynamic, 1>  &w,
+        Matrix< Scalar, Dynamic, 1>  &b,
+        Scalar alpha)
+{
+    typedef DenseIndex Index;
+
+    const Index n = r.cols();
+    eigen_assert(r.rows()>=n);
+    std::vector<JacobiRotation<Scalar> > givens(n);
+
+    /* Local variables */
+    Scalar temp, rowj;
+
+    /* Function Body */
+    for (Index j = 0; j < n; ++j) {
+        rowj = w[j];
+
+        /* apply the previous transformations to */
+        /* r(i,j), i=0,1,...,j-1, and to w(j). */
+        for (Index i = 0; i < j; ++i) {
+            temp = givens[i].c() * r(i,j) + givens[i].s() * rowj;
+            rowj = -givens[i].s() * r(i,j) + givens[i].c() * rowj;
+            r(i,j) = temp;
+        }
+
+        /* determine a givens rotation which eliminates w(j). */
+        givens[j].makeGivens(-r(j,j), rowj);
+
+        if (rowj == 0.)
+            continue; // givens[j] is identity
+
+        /* apply the current transformation to r(j,j), b(j), and alpha. */
+        r(j,j) = givens[j].c() * r(j,j) + givens[j].s() * rowj;
+        temp = givens[j].c() * b[j] + givens[j].s() * alpha;
+        alpha = -givens[j].s() * b[j] + givens[j].c() * alpha;
+        b[j] = temp;
+    }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen

diff --git a/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h b/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
new file mode 100644
index 0000000..ea5d8bc
--- /dev/null
+++ b/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h

@@ -0,0 +1,130 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMERICAL_DIFF_H
+#define EIGEN_NUMERICAL_DIFF_H
+
+namespace Eigen { 
+
+enum NumericalDiffMode {
+    Forward,
+    Central
+};
+
+
+/**
+  * This class allows you to add a method df() to your functor, which will 
+  * use numerical differentiation to compute an approximate of the
+  * derivative for the functor. Of course, if you have an analytical form
+  * for the derivative, you should rather implement df() by yourself.
+  *
+  * More information on
+  * http://en.wikipedia.org/wiki/Numerical_differentiation
+  *
+  * Currently only "Forward" and "Central" scheme are implemented.
+  */
+template<typename _Functor, NumericalDiffMode mode=Forward>
+class NumericalDiff : public _Functor
+{
+public:
+    typedef _Functor Functor;
+    typedef typename Functor::Scalar Scalar;
+    typedef typename Functor::InputType InputType;
+    typedef typename Functor::ValueType ValueType;
+    typedef typename Functor::JacobianType JacobianType;
+
+    NumericalDiff(Scalar _epsfcn=0.) : Functor(), epsfcn(_epsfcn) {}
+    NumericalDiff(const Functor& f, Scalar _epsfcn=0.) : Functor(f), epsfcn(_epsfcn) {}
+
+    // forward constructors
+    template<typename T0>
+        NumericalDiff(const T0& a0) : Functor(a0), epsfcn(0) {}
+    template<typename T0, typename T1>
+        NumericalDiff(const T0& a0, const T1& a1) : Functor(a0, a1), epsfcn(0) {}
+    template<typename T0, typename T1, typename T2>
+        NumericalDiff(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2), epsfcn(0) {}
+
+    enum {
+        InputsAtCompileTime = Functor::InputsAtCompileTime,
+        ValuesAtCompileTime = Functor::ValuesAtCompileTime
+    };
+
+    /**
+      * return the number of evaluation of functor
+     */
+    int df(const InputType& _x, JacobianType &jac) const
+    {
+        using std::sqrt;
+        using std::abs;
+        /* Local variables */
+        Scalar h;
+        int nfev=0;
+        const typename InputType::Index n = _x.size();
+        const Scalar eps = sqrt(((std::max)(epsfcn,NumTraits<Scalar>::epsilon() )));
+        ValueType val1, val2;
+        InputType x = _x;
+        // TODO : we should do this only if the size is not already known
+        val1.resize(Functor::values());
+        val2.resize(Functor::values());
+
+        // initialization
+        switch(mode) {
+            case Forward:
+                // compute f(x)
+                Functor::operator()(x, val1); nfev++;
+                break;
+            case Central:
+                // do nothing
+                break;
+            default:
+                eigen_assert(false);
+        };
+
+        // Function Body
+        for (int j = 0; j < n; ++j) {
+            h = eps * abs(x[j]);
+            if (h == 0.) {
+                h = eps;
+            }
+            switch(mode) {
+                case Forward:
+                    x[j] += h;
+                    Functor::operator()(x, val2);
+                    nfev++;
+                    x[j] = _x[j];
+                    jac.col(j) = (val2-val1)/h;
+                    break;
+                case Central:
+                    x[j] += h;
+                    Functor::operator()(x, val2); nfev++;
+                    x[j] -= 2*h;
+                    Functor::operator()(x, val1); nfev++;
+                    x[j] = _x[j];
+                    jac.col(j) = (val2-val1)/(2*h);
+                    break;
+                default:
+                    eigen_assert(false);
+            };
+        }
+        return nfev;
+    }
+private:
+    Scalar epsfcn;
+
+    NumericalDiff& operator=(const NumericalDiff&);
+};
+
+} // end namespace Eigen
+
+//vim: ai ts=4 sts=4 et sw=4
+#endif // EIGEN_NUMERICAL_DIFF_H
+

diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h
new file mode 100644
index 0000000..59a15b0
--- /dev/null
+++ b/unsupported/Eigen/src/Polynomials/Companion.h

@@ -0,0 +1,280 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPANION_H
+#define EIGEN_COMPANION_H
+
+// This file requires the user to include
+// * Eigen/Core
+// * Eigen/src/PolynomialSolver.h
+
+namespace Eigen { 
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+template<int Size>
+struct decrement_if_fixed_size
+{
+  enum {
+    ret = (Size == Dynamic) ? Dynamic : Size-1 };
+};
+
+#endif
+
+template< typename _Scalar, int _Deg >
+class companion
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+    enum {
+      Deg = _Deg,
+      Deg_1=decrement_if_fixed_size<Deg>::ret
+    };
+
+    typedef _Scalar                                Scalar;
+    typedef typename NumTraits<Scalar>::Real       RealScalar;
+    typedef Matrix<Scalar, Deg, 1>                 RightColumn;
+    //typedef DiagonalMatrix< Scalar, Deg_1, Deg_1 > BottomLeftDiagonal;
+    typedef Matrix<Scalar, Deg_1, 1>               BottomLeftDiagonal;
+
+    typedef Matrix<Scalar, Deg, Deg>               DenseCompanionMatrixType;
+    typedef Matrix< Scalar, _Deg, Deg_1 >          LeftBlock;
+    typedef Matrix< Scalar, Deg_1, Deg_1 >         BottomLeftBlock;
+    typedef Matrix< Scalar, 1, Deg_1 >             LeftBlockFirstRow;
+
+    typedef DenseIndex Index;
+
+  public:
+    EIGEN_STRONG_INLINE const _Scalar operator()(Index row, Index col ) const
+    {
+      if( m_bl_diag.rows() > col )
+      {
+        if( 0 < row ){ return m_bl_diag[col]; }
+        else{ return 0; }
+      }
+      else{ return m_monic[row]; }
+    }
+
+  public:
+    template<typename VectorType>
+    void setPolynomial( const VectorType& poly )
+    {
+      const Index deg = poly.size()-1;
+      m_monic = -poly.head(deg)/poly[deg];
+      m_bl_diag.setOnes(deg-1);
+    }
+
+    template<typename VectorType>
+    companion( const VectorType& poly ){
+      setPolynomial( poly ); }
+
+  public:
+    DenseCompanionMatrixType denseMatrix() const
+    {
+      const Index deg   = m_monic.size();
+      const Index deg_1 = deg-1;
+      DenseCompanionMatrixType companMat(deg,deg);
+      companMat <<
+        ( LeftBlock(deg,deg_1)
+          << LeftBlockFirstRow::Zero(1,deg_1),
+          BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
+        , m_monic;
+      return companMat;
+    }
+
+
+
+  protected:
+    /** Helper function for the balancing algorithm.
+     * \returns true if the row and the column, having colNorm and rowNorm
+     * as norms, are balanced, false otherwise.
+     * colB and rowB are respectively the multipliers for
+     * the column and the row in order to balance them.
+     * */
+    bool balanced( RealScalar colNorm, RealScalar rowNorm,
+        bool& isBalanced, RealScalar& colB, RealScalar& rowB );
+
+    /** Helper function for the balancing algorithm.
+     * \returns true if the row and the column, having colNorm and rowNorm
+     * as norms, are balanced, false otherwise.
+     * colB and rowB are respectively the multipliers for
+     * the column and the row in order to balance them.
+     * */
+    bool balancedR( RealScalar colNorm, RealScalar rowNorm,
+        bool& isBalanced, RealScalar& colB, RealScalar& rowB );
+
+  public:
+    /**
+     * Balancing algorithm from B. N. PARLETT and C. REINSCH (1969)
+     * "Balancing a matrix for calculation of eigenvalues and eigenvectors"
+     * adapted to the case of companion matrices.
+     * A matrix with non zero row and non zero column is balanced
+     * for a certain norm if the i-th row and the i-th column
+     * have same norm for all i.
+     */
+    void balance();
+
+  protected:
+      RightColumn                m_monic;
+      BottomLeftDiagonal         m_bl_diag;
+};
+
+
+
+template< typename _Scalar, int _Deg >
+inline
+bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
+    bool& isBalanced, RealScalar& colB, RealScalar& rowB )
+{
+  if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm 
+      || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){
+    return true;
+  }
+  else
+  {
+    //To find the balancing coefficients, if the radix is 2,
+    //one finds \f$ \sigma \f$ such that
+    // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
+    // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
+    // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
+    const RealScalar radix = RealScalar(2);
+    const RealScalar radix2 = RealScalar(4);
+    
+    rowB = rowNorm / radix;
+    colB = RealScalar(1);
+    const RealScalar s = colNorm + rowNorm;
+
+    // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm
+    RealScalar scout = colNorm;
+    while (scout < rowB)
+    {
+      colB *= radix;
+      scout *= radix2;
+    }
+    
+    // We now have an upper-bound for sigma, try to lower it.
+    // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm
+    scout = colNorm * (colB / radix) * colB;  // Avoid overflow.
+    while (scout >= rowNorm)
+    {
+      colB /= radix;
+      scout /= radix2;
+    }
+
+    // This line is used to avoid insubstantial balancing.
+    if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB)
+    {
+      isBalanced = false;
+      rowB = RealScalar(1) / colB;
+      return false;
+    }
+    else
+    {
+      return true;
+    }
+  }
+}
+
+template< typename _Scalar, int _Deg >
+inline
+bool companion<_Scalar,_Deg>::balancedR( RealScalar colNorm, RealScalar rowNorm,
+    bool& isBalanced, RealScalar& colB, RealScalar& rowB )
+{
+  if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; }
+  else
+  {
+    /**
+     * Set the norm of the column and the row to the geometric mean
+     * of the row and column norm
+     */
+    const RealScalar q = colNorm/rowNorm;
+    if( !isApprox( q, _Scalar(1) ) )
+    {
+      rowB = sqrt( colNorm/rowNorm );
+      colB = RealScalar(1)/rowB;
+
+      isBalanced = false;
+      return false;
+    }
+    else{
+      return true; }
+  }
+}
+
+
+template< typename _Scalar, int _Deg >
+void companion<_Scalar,_Deg>::balance()
+{
+  using std::abs;
+  EIGEN_STATIC_ASSERT( Deg == Dynamic || 1 < Deg, YOU_MADE_A_PROGRAMMING_MISTAKE );
+  const Index deg   = m_monic.size();
+  const Index deg_1 = deg-1;
+
+  bool hasConverged=false;
+  while( !hasConverged )
+  {
+    hasConverged = true;
+    RealScalar colNorm,rowNorm;
+    RealScalar colB,rowB;
+
+    //First row, first column excluding the diagonal
+    //==============================================
+    colNorm = abs(m_bl_diag[0]);
+    rowNorm = abs(m_monic[0]);
+
+    //Compute balancing of the row and the column
+    if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+    {
+      m_bl_diag[0] *= colB;
+      m_monic[0] *= rowB;
+    }
+
+    //Middle rows and columns excluding the diagonal
+    //==============================================
+    for( Index i=1; i<deg_1; ++i )
+    {
+      // column norm, excluding the diagonal
+      colNorm = abs(m_bl_diag[i]);
+
+      // row norm, excluding the diagonal
+      rowNorm = abs(m_bl_diag[i-1]) + abs(m_monic[i]);
+
+      //Compute balancing of the row and the column
+      if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+      {
+        m_bl_diag[i]   *= colB;
+        m_bl_diag[i-1] *= rowB;
+        m_monic[i]     *= rowB;
+      }
+    }
+
+    //Last row, last column excluding the diagonal
+    //============================================
+    const Index ebl = m_bl_diag.size()-1;
+    VectorBlock<RightColumn,Deg_1> headMonic( m_monic, 0, deg_1 );
+    colNorm = headMonic.array().abs().sum();
+    rowNorm = abs( m_bl_diag[ebl] );
+
+    //Compute balancing of the row and the column
+    if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+    {
+      headMonic      *= colB;
+      m_bl_diag[ebl] *= rowB;
+    }
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPANION_H

diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
new file mode 100644
index 0000000..5e0ecbb
--- /dev/null
+++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h

@@ -0,0 +1,428 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIAL_SOLVER_H
+#define EIGEN_POLYNOMIAL_SOLVER_H
+
+namespace Eigen { 
+
+/** \ingroup Polynomials_Module
+ *  \class PolynomialSolverBase.
+ *
+ * \brief Defined to be inherited by polynomial solvers: it provides
+ * convenient methods such as
+ *  - real roots,
+ *  - greatest, smallest complex roots,
+ *  - real roots with greatest, smallest absolute real value,
+ *  - greatest, smallest real roots.
+ *
+ * It stores the set of roots as a vector of complexes.
+ *
+ */
+template< typename _Scalar, int _Deg >
+class PolynomialSolverBase
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+    typedef _Scalar                             Scalar;
+    typedef typename NumTraits<Scalar>::Real    RealScalar;
+    typedef std::complex<RealScalar>            RootType;
+    typedef Matrix<RootType,_Deg,1>             RootsType;
+
+    typedef DenseIndex Index;
+
+  protected:
+    template< typename OtherPolynomial >
+    inline void setPolynomial( const OtherPolynomial& poly ){
+      m_roots.resize(poly.size()-1); }
+
+  public:
+    template< typename OtherPolynomial >
+    inline PolynomialSolverBase( const OtherPolynomial& poly ){
+      setPolynomial( poly() ); }
+
+    inline PolynomialSolverBase(){}
+
+  public:
+    /** \returns the complex roots of the polynomial */
+    inline const RootsType& roots() const { return m_roots; }
+
+  public:
+    /** Clear and fills the back insertion sequence with the real roots of the polynomial
+     * i.e. the real part of the complex roots that have an imaginary part which
+     * absolute value is smaller than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     *
+     * \param[out] bi_seq : the back insertion sequence (stl concept)
+     * \param[in]  absImaginaryThreshold : the maximum bound of the imaginary part of a complex
+     *  number that is considered as real.
+     * */
+    template<typename Stl_back_insertion_sequence>
+    inline void realRoots( Stl_back_insertion_sequence& bi_seq,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      using std::abs;
+      bi_seq.clear();
+      for(Index i=0; i<m_roots.size(); ++i )
+      {
+        if( abs( m_roots[i].imag() ) < absImaginaryThreshold ){
+          bi_seq.push_back( m_roots[i].real() ); }
+      }
+    }
+
+  protected:
+    template<typename squaredNormBinaryPredicate>
+    inline const RootType& selectComplexRoot_withRespectToNorm( squaredNormBinaryPredicate& pred ) const
+    {
+      Index res=0;
+      RealScalar norm2 = numext::abs2( m_roots[0] );
+      for( Index i=1; i<m_roots.size(); ++i )
+      {
+        const RealScalar currNorm2 = numext::abs2( m_roots[i] );
+        if( pred( currNorm2, norm2 ) ){
+          res=i; norm2=currNorm2; }
+      }
+      return m_roots[res];
+    }
+
+  public:
+    /**
+     * \returns the complex root with greatest norm.
+     */
+    inline const RootType& greatestRoot() const
+    {
+      std::greater<RealScalar> greater;
+      return selectComplexRoot_withRespectToNorm( greater );
+    }
+
+    /**
+     * \returns the complex root with smallest norm.
+     */
+    inline const RootType& smallestRoot() const
+    {
+      std::less<RealScalar> less;
+      return selectComplexRoot_withRespectToNorm( less );
+    }
+
+  protected:
+    template<typename squaredRealPartBinaryPredicate>
+    inline const RealScalar& selectRealRoot_withRespectToAbsRealPart(
+        squaredRealPartBinaryPredicate& pred,
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      using std::abs;
+      hasArealRoot = false;
+      Index res=0;
+      RealScalar abs2(0);
+
+      for( Index i=0; i<m_roots.size(); ++i )
+      {
+        if( abs( m_roots[i].imag() ) <= absImaginaryThreshold )
+        {
+          if( !hasArealRoot )
+          {
+            hasArealRoot = true;
+            res = i;
+            abs2 = m_roots[i].real() * m_roots[i].real();
+          }
+          else
+          {
+            const RealScalar currAbs2 = m_roots[i].real() * m_roots[i].real();
+            if( pred( currAbs2, abs2 ) )
+            {
+              abs2 = currAbs2;
+              res = i;
+            }
+          }
+        }
+        else if(!hasArealRoot)
+        {
+          if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
+            res = i;}
+        }
+      }
+      return numext::real_ref(m_roots[res]);
+    }
+
+
+    template<typename RealPartBinaryPredicate>
+    inline const RealScalar& selectRealRoot_withRespectToRealPart(
+        RealPartBinaryPredicate& pred,
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      using std::abs;
+      hasArealRoot = false;
+      Index res=0;
+      RealScalar val(0);
+
+      for( Index i=0; i<m_roots.size(); ++i )
+      {
+        if( abs( m_roots[i].imag() ) <= absImaginaryThreshold )
+        {
+          if( !hasArealRoot )
+          {
+            hasArealRoot = true;
+            res = i;
+            val = m_roots[i].real();
+          }
+          else
+          {
+            const RealScalar curr = m_roots[i].real();
+            if( pred( curr, val ) )
+            {
+              val = curr;
+              res = i;
+            }
+          }
+        }
+        else
+        {
+          if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
+            res = i; }
+        }
+      }
+      return numext::real_ref(m_roots[res]);
+    }
+
+  public:
+    /**
+     * \returns a real root with greatest absolute magnitude.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& absGreatestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::greater<RealScalar> greater;
+      return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold );
+    }
+
+
+    /**
+     * \returns a real root with smallest absolute magnitude.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& absSmallestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::less<RealScalar> less;
+      return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold );
+    }
+
+
+    /**
+     * \returns the real root with greatest value.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& greatestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::greater<RealScalar> greater;
+      return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold );
+    }
+
+
+    /**
+     * \returns the real root with smallest value.
+     * A real root is defined as the real part of a complex root with absolute imaginary
+     * part smallest than absImaginaryThreshold.
+     * absImaginaryThreshold takes the dummy_precision associated
+     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+     * If no real root is found the boolean hasArealRoot is set to false and the real part of
+     * the root with smallest absolute imaginary part is returned instead.
+     *
+     * \param[out] hasArealRoot : boolean true if a real root is found according to the
+     *  absImaginaryThreshold criterion, false otherwise.
+     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+     *  whether or not a root is real.
+     */
+    inline const RealScalar& smallestRealRoot(
+        bool& hasArealRoot,
+        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+    {
+      std::less<RealScalar> less;
+      return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold );
+    }
+
+  protected:
+    RootsType               m_roots;
+};
+
+#define EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( BASE )  \
+  typedef typename BASE::Scalar                 Scalar;       \
+  typedef typename BASE::RealScalar             RealScalar;   \
+  typedef typename BASE::RootType               RootType;     \
+  typedef typename BASE::RootsType              RootsType;
+
+
+
+/** \ingroup Polynomials_Module
+  *
+  * \class PolynomialSolver
+  *
+  * \brief A polynomial solver
+  *
+  * Computes the complex roots of a real polynomial.
+  *
+  * \param _Scalar the scalar type, i.e., the type of the polynomial coefficients
+  * \param _Deg the degree of the polynomial, can be a compile time value or Dynamic.
+  *             Notice that the number of polynomial coefficients is _Deg+1.
+  *
+  * This class implements a polynomial solver and provides convenient methods such as
+  * - real roots,
+  * - greatest, smallest complex roots,
+  * - real roots with greatest, smallest absolute real value.
+  * - greatest, smallest real roots.
+  *
+  * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules.
+  *
+  *
+  * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of
+  * the polynomial to compute its roots.
+  * This supposes that the complex moduli of the roots are all distinct: e.g. there should
+  * be no multiple roots or conjugate roots for instance.
+  * With 32bit (float) floating types this problem shows up frequently.
+  * However, almost always, correct accuracy is reached even in these cases for 64bit
+  * (double) floating types and small polynomial degree (<20).
+  */
+template<typename _Scalar, int _Deg>
+class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
+{
+  public:
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+    typedef PolynomialSolverBase<_Scalar,_Deg>    PS_Base;
+    EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
+
+    typedef Matrix<Scalar,_Deg,_Deg>                 CompanionMatrixType;
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                                          ComplexEigenSolver<CompanionMatrixType>,
+                                          EigenSolver<CompanionMatrixType> >::type EigenSolverType;
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex, Scalar, std::complex<Scalar> >::type ComplexScalar;
+
+  public:
+    /** Computes the complex roots of a new polynomial. */
+    template< typename OtherPolynomial >
+    void compute( const OtherPolynomial& poly )
+    {
+      eigen_assert( Scalar(0) != poly[poly.size()-1] );
+      eigen_assert( poly.size() > 1 );
+      if(poly.size() >  2 )
+      {
+        internal::companion<Scalar,_Deg> companion( poly );
+        companion.balance();
+        m_eigenSolver.compute( companion.denseMatrix() );
+        m_roots = m_eigenSolver.eigenvalues();
+        // cleanup noise in imaginary part of real roots:
+        // if the imaginary part is rather small compared to the real part
+        // and that cancelling the imaginary part yield a smaller evaluation,
+        // then it's safe to keep the real part only.
+        RealScalar coarse_prec = RealScalar(std::pow(4,poly.size()+1))*NumTraits<RealScalar>::epsilon();
+        for(Index i = 0; i<m_roots.size(); ++i)
+        {
+          if( internal::isMuchSmallerThan(numext::abs(numext::imag(m_roots[i])),
+                                          numext::abs(numext::real(m_roots[i])),
+                                          coarse_prec) )
+          {
+            ComplexScalar as_real_root = ComplexScalar(numext::real(m_roots[i]));
+            if(    numext::abs(poly_eval(poly, as_real_root))
+                <= numext::abs(poly_eval(poly, m_roots[i])))
+            {
+              m_roots[i] = as_real_root;
+            }
+          }
+        }
+      }
+      else if(poly.size () == 2)
+      {
+        m_roots.resize(1);
+        m_roots[0] = -poly[0]/poly[1];
+      }
+    }
+
+  public:
+    template< typename OtherPolynomial >
+    inline PolynomialSolver( const OtherPolynomial& poly ){
+      compute( poly ); }
+
+    inline PolynomialSolver(){}
+
+  protected:
+    using                   PS_Base::m_roots;
+    EigenSolverType         m_eigenSolver;
+};
+
+
+template< typename _Scalar >
+class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1>
+{
+  public:
+    typedef PolynomialSolverBase<_Scalar,1>    PS_Base;
+    EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
+
+  public:
+    /** Computes the complex roots of a new polynomial. */
+    template< typename OtherPolynomial >
+    void compute( const OtherPolynomial& poly )
+    {
+      eigen_assert( poly.size() == 2 );
+      eigen_assert( Scalar(0) != poly[1] );
+      m_roots[0] = -poly[0]/poly[1];
+    }
+
+  public:
+    template< typename OtherPolynomial >
+    inline PolynomialSolver( const OtherPolynomial& poly ){
+      compute( poly ); }
+
+    inline PolynomialSolver(){}
+
+  protected:
+    using                   PS_Base::m_roots;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_POLYNOMIAL_SOLVER_H

diff --git a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
new file mode 100644
index 0000000..394e857
--- /dev/null
+++ b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h

@@ -0,0 +1,143 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIAL_UTILS_H
+#define EIGEN_POLYNOMIAL_UTILS_H
+
+namespace Eigen { 
+
+/** \ingroup Polynomials_Module
+ * \returns the evaluation of the polynomial at x using Horner algorithm.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ * \param[in] x : the value to evaluate the polynomial at.
+ *
+ * \note for stability:
+ *   \f$ |x| \le 1 \f$
+ */
+template <typename Polynomials, typename T>
+inline
+T poly_eval_horner( const Polynomials& poly, const T& x )
+{
+  T val=poly[poly.size()-1];
+  for(DenseIndex i=poly.size()-2; i>=0; --i ){
+    val = val*x + poly[i]; }
+  return val;
+}
+
+/** \ingroup Polynomials_Module
+ * \returns the evaluation of the polynomial at x using stabilized Horner algorithm.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ * \param[in] x : the value to evaluate the polynomial at.
+ */
+template <typename Polynomials, typename T>
+inline
+T poly_eval( const Polynomials& poly, const T& x )
+{
+  typedef typename NumTraits<T>::Real Real;
+
+  if( numext::abs2( x ) <= Real(1) ){
+    return poly_eval_horner( poly, x ); }
+  else
+  {
+    T val=poly[0];
+    T inv_x = T(1)/x;
+    for( DenseIndex i=1; i<poly.size(); ++i ){
+      val = val*inv_x + poly[i]; }
+
+    return numext::pow(x,(T)(poly.size()-1)) * val;
+  }
+}
+
+/** \ingroup Polynomials_Module
+ * \returns a maximum bound for the absolute value of any root of the polynomial.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ *
+ *  \pre
+ *   the leading coefficient of the input polynomial poly must be non zero
+ */
+template <typename Polynomial>
+inline
+typename NumTraits<typename Polynomial::Scalar>::Real cauchy_max_bound( const Polynomial& poly )
+{
+  using std::abs;
+  typedef typename Polynomial::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
+
+  eigen_assert( Scalar(0) != poly[poly.size()-1] );
+  const Scalar inv_leading_coeff = Scalar(1)/poly[poly.size()-1];
+  Real cb(0);
+
+  for( DenseIndex i=0; i<poly.size()-1; ++i ){
+    cb += abs(poly[i]*inv_leading_coeff); }
+  return cb + Real(1);
+}
+
+/** \ingroup Polynomials_Module
+ * \returns a minimum bound for the absolute value of any non zero root of the polynomial.
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ */
+template <typename Polynomial>
+inline
+typename NumTraits<typename Polynomial::Scalar>::Real cauchy_min_bound( const Polynomial& poly )
+{
+  using std::abs;
+  typedef typename Polynomial::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
+
+  DenseIndex i=0;
+  while( i<poly.size()-1 && Scalar(0) == poly(i) ){ ++i; }
+  if( poly.size()-1 == i ){
+    return Real(1); }
+
+  const Scalar inv_min_coeff = Scalar(1)/poly[i];
+  Real cb(1);
+  for( DenseIndex j=i+1; j<poly.size(); ++j ){
+    cb += abs(poly[j]*inv_min_coeff); }
+  return Real(1)/cb;
+}
+
+/** \ingroup Polynomials_Module
+ * Given the roots of a polynomial compute the coefficients in the
+ * monomial basis of the monic polynomial with same roots and minimal degree.
+ * If RootVector is a vector of complexes, Polynomial should also be a vector
+ * of complexes.
+ * \param[in] rv : a vector containing the roots of a polynomial.
+ * \param[out] poly : the vector of coefficients of the polynomial ordered
+ *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ *  e.g. \f$ 3 + x^2 \f$ is stored as a vector \f$ [ 3, 0, 1 ] \f$.
+ */
+template <typename RootVector, typename Polynomial>
+void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+{
+
+  typedef typename Polynomial::Scalar Scalar;
+
+  poly.setZero( rv.size()+1 );
+  poly[0] = -rv[0]; poly[1] = Scalar(1);
+  for( DenseIndex i=1; i< rv.size(); ++i )
+  {
+    for( DenseIndex j=i+1; j>0; --j ){ poly[j] = poly[j-1] - rv[i]*poly[j]; }
+    poly[0] = -rv[i]*poly[0];
+  }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_POLYNOMIAL_UTILS_H

diff --git a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
new file mode 100644
index 0000000..6d0370d
--- /dev/null
+++ b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h

@@ -0,0 +1,352 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEINPLACELU_H
+#define EIGEN_SKYLINEINPLACELU_H
+
+namespace Eigen { 
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineInplaceLU
+ *
+ * \brief Inplace LU decomposition of a skyline matrix and associated features
+ *
+ * \param MatrixType the type of the matrix of which we are computing the LU factorization
+ *
+ */
+template<typename MatrixType>
+class SkylineInplaceLU {
+protected:
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::Index Index;
+    
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+
+public:
+
+    /** Creates a LU object and compute the respective factorization of \a matrix using
+     * flags \a flags. */
+    SkylineInplaceLU(MatrixType& matrix, int flags = 0)
+    : /*m_matrix(matrix.rows(), matrix.cols()),*/ m_flags(flags), m_status(0), m_lu(matrix) {
+        m_precision = RealScalar(0.1) * Eigen::dummy_precision<RealScalar > ();
+        m_lu.IsRowMajor ? computeRowMajor() : compute();
+    }
+
+    /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
+     *
+     * Setting a value greater than zero speeds up computation, and yields to an incomplete
+     * factorization with fewer non zero coefficients. Such approximate factors are especially
+     * useful to initialize an iterative solver.
+     *
+     * Note that the exact meaning of this parameter might depends on the actual
+     * backend. Moreover, not all backends support this feature.
+     *
+     * \sa precision() */
+    void setPrecision(RealScalar v) {
+        m_precision = v;
+    }
+
+    /** \returns the current precision.
+     *
+     * \sa setPrecision() */
+    RealScalar precision() const {
+        return m_precision;
+    }
+
+    /** Sets the flags. Possible values are:
+     *  - CompleteFactorization
+     *  - IncompleteFactorization
+     *  - MemoryEfficient
+     *  - one of the ordering methods
+     *  - etc...
+     *
+     * \sa flags() */
+    void setFlags(int f) {
+        m_flags = f;
+    }
+
+    /** \returns the current flags */
+    int flags() const {
+        return m_flags;
+    }
+
+    void setOrderingMethod(int m) {
+        m_flags = m;
+    }
+
+    int orderingMethod() const {
+        return m_flags;
+    }
+
+    /** Computes/re-computes the LU factorization */
+    void compute();
+    void computeRowMajor();
+
+    /** \returns the lower triangular matrix L */
+    //inline const MatrixType& matrixL() const { return m_matrixL; }
+
+    /** \returns the upper triangular matrix U */
+    //inline const MatrixType& matrixU() const { return m_matrixU; }
+
+    template<typename BDerived, typename XDerived>
+    bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x,
+            const int transposed = 0) const;
+
+    /** \returns true if the factorization succeeded */
+    inline bool succeeded(void) const {
+        return m_succeeded;
+    }
+
+protected:
+    RealScalar m_precision;
+    int m_flags;
+    mutable int m_status;
+    bool m_succeeded;
+    MatrixType& m_lu;
+};
+
+/** Computes / recomputes the in place LU decomposition of the SkylineInplaceLU.
+ * using the default algorithm.
+ */
+template<typename MatrixType>
+//template<typename _Scalar>
+void SkylineInplaceLU<MatrixType>::compute() {
+    const size_t rows = m_lu.rows();
+    const size_t cols = m_lu.cols();
+
+    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
+    eigen_assert(!m_lu.IsRowMajor && "LU decomposition does not work with rowMajor Storage");
+
+    for (Index row = 0; row < rows; row++) {
+        const double pivot = m_lu.coeffDiag(row);
+
+        //Lower matrix Columns update
+        const Index& col = row;
+        for (typename MatrixType::InnerLowerIterator lIt(m_lu, col); lIt; ++lIt) {
+            lIt.valueRef() /= pivot;
+        }
+
+        //Upper matrix update -> contiguous memory access
+        typename MatrixType::InnerLowerIterator lIt(m_lu, col);
+        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
+            const double coef = lIt.value();
+
+            uItPivot += (rrow - row - 1);
+
+            //update upper part  -> contiguous memory access
+            for (++uItPivot; uIt && uItPivot;) {
+                uIt.valueRef() -= uItPivot.value() * coef;
+
+                ++uIt;
+                ++uItPivot;
+            }
+            ++lIt;
+        }
+
+        //Upper matrix update -> non contiguous memory access
+        typename MatrixType::InnerLowerIterator lIt3(m_lu, col);
+        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+            const double coef = lIt3.value();
+
+            //update lower part ->  non contiguous memory access
+            for (Index i = 0; i < rrow - row - 1; i++) {
+                m_lu.coeffRefLower(rrow, row + i + 1) -= uItPivot.value() * coef;
+                ++uItPivot;
+            }
+            ++lIt3;
+        }
+        //update diag -> contiguous
+        typename MatrixType::InnerLowerIterator lIt2(m_lu, col);
+        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+
+            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
+            const double coef = lIt2.value();
+
+            uItPivot += (rrow - row - 1);
+            m_lu.coeffRefDiag(rrow) -= uItPivot.value() * coef;
+            ++lIt2;
+        }
+    }
+}
+
+template<typename MatrixType>
+void SkylineInplaceLU<MatrixType>::computeRowMajor() {
+    const size_t rows = m_lu.rows();
+    const size_t cols = m_lu.cols();
+
+    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
+    eigen_assert(m_lu.IsRowMajor && "You're trying to apply rowMajor decomposition on a ColMajor matrix !");
+
+    for (Index row = 0; row < rows; row++) {
+        typename MatrixType::InnerLowerIterator llIt(m_lu, row);
+
+
+        for (Index col = llIt.col(); col < row; col++) {
+            if (m_lu.coeffExistLower(row, col)) {
+                const double diag = m_lu.coeffDiag(col);
+
+                typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+                typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+
+
+                const Index offset = lIt.col() - uIt.row();
+
+
+                Index stop = offset > 0 ? col - lIt.col() : col - uIt.row();
+
+                //#define VECTORIZE
+#ifdef VECTORIZE
+                Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+                Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+
+
+                Scalar newCoeff = m_lu.coeffLower(row, col) - rowVal.dot(colVal);
+#else
+                if (offset > 0) //Skip zero value of lIt
+                    uIt += offset;
+                else //Skip zero values of uIt
+                    lIt += -offset;
+                Scalar newCoeff = m_lu.coeffLower(row, col);
+
+                for (Index k = 0; k < stop; ++k) {
+                    const Scalar tmp = newCoeff;
+                    newCoeff = tmp - lIt.value() * uIt.value();
+                    ++lIt;
+                    ++uIt;
+                }
+#endif
+
+                m_lu.coeffRefLower(row, col) = newCoeff / diag;
+            }
+        }
+
+        //Upper matrix update
+        const Index col = row;
+        typename MatrixType::InnerUpperIterator uuIt(m_lu, col);
+        for (Index rrow = uuIt.row(); rrow < col; rrow++) {
+
+            typename MatrixType::InnerLowerIterator lIt(m_lu, rrow);
+            typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+            const Index offset = lIt.col() - uIt.row();
+
+            Index stop = offset > 0 ? rrow - lIt.col() : rrow - uIt.row();
+
+#ifdef VECTORIZE
+            Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+            Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+
+            Scalar newCoeff = m_lu.coeffUpper(rrow, col) - rowVal.dot(colVal);
+#else
+            if (offset > 0) //Skip zero value of lIt
+                uIt += offset;
+            else //Skip zero values of uIt
+                lIt += -offset;
+            Scalar newCoeff = m_lu.coeffUpper(rrow, col);
+            for (Index k = 0; k < stop; ++k) {
+                const Scalar tmp = newCoeff;
+                newCoeff = tmp - lIt.value() * uIt.value();
+
+                ++lIt;
+                ++uIt;
+            }
+#endif
+            m_lu.coeffRefUpper(rrow, col) = newCoeff;
+        }
+
+
+        //Diag matrix update
+        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+        typename MatrixType::InnerUpperIterator uIt(m_lu, row);
+
+        const Index offset = lIt.col() - uIt.row();
+
+
+        Index stop = offset > 0 ? lIt.size() : uIt.size();
+#ifdef VECTORIZE
+        Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+        Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+        Scalar newCoeff = m_lu.coeffDiag(row) - rowVal.dot(colVal);
+#else
+        if (offset > 0) //Skip zero value of lIt
+            uIt += offset;
+        else //Skip zero values of uIt
+            lIt += -offset;
+        Scalar newCoeff = m_lu.coeffDiag(row);
+        for (Index k = 0; k < stop; ++k) {
+            const Scalar tmp = newCoeff;
+            newCoeff = tmp - lIt.value() * uIt.value();
+            ++lIt;
+            ++uIt;
+        }
+#endif
+        m_lu.coeffRefDiag(row) = newCoeff;
+    }
+}
+
+/** Computes *x = U^-1 L^-1 b
+ *
+ * If \a transpose is set to SvTranspose or SvAdjoint, the solution
+ * of the transposed/adjoint system is computed instead.
+ *
+ * Not all backends implement the solution of the transposed or
+ * adjoint system.
+ */
+template<typename MatrixType>
+template<typename BDerived, typename XDerived>
+bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x, const int transposed) const {
+    const size_t rows = m_lu.rows();
+    const size_t cols = m_lu.cols();
+
+
+    for (Index row = 0; row < rows; row++) {
+        x->coeffRef(row) = b.coeff(row);
+        Scalar newVal = x->coeff(row);
+        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+
+        Index col = lIt.col();
+        while (lIt.col() < row) {
+
+            newVal -= x->coeff(col++) * lIt.value();
+            ++lIt;
+        }
+
+        x->coeffRef(row) = newVal;
+    }
+
+
+    for (Index col = rows - 1; col > 0; col--) {
+        x->coeffRef(col) = x->coeff(col) / m_lu.coeffDiag(col);
+
+        const Scalar x_col = x->coeff(col);
+
+        typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+        uIt += uIt.size()-1;
+
+
+        while (uIt) {
+            x->coeffRef(uIt.row()) -= x_col * uIt.value();
+            //TODO : introduce --operator
+            uIt += -1;
+        }
+
+
+    }
+    x->coeffRef(0) = x->coeff(0) / m_lu.coeffDiag(0);
+
+    return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEINPLACELU_H

diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
new file mode 100644
index 0000000..7c7eace
--- /dev/null
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h

@@ -0,0 +1,862 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEMATRIX_H
+#define EIGEN_SKYLINEMATRIX_H
+
+#include "SkylineStorage.h"
+#include "SkylineMatrixBase.h"
+
+namespace Eigen { 
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineMatrix
+ *
+ * \brief The main skyline matrix class
+ *
+ * This class implements a skyline matrix using the very uncommon storage
+ * scheme.
+ *
+ * \param _Scalar the scalar type, i.e. the type of the coefficients
+ * \param _Options Union of bit flags controlling the storage scheme. Currently the only possibility
+ *                 is RowMajor. The default is 0 which means column-major.
+ *
+ *
+ */
+namespace internal {
+template<typename _Scalar, int _Options>
+struct traits<SkylineMatrix<_Scalar, _Options> > {
+    typedef _Scalar Scalar;
+    typedef Sparse StorageKind;
+
+    enum {
+        RowsAtCompileTime = Dynamic,
+        ColsAtCompileTime = Dynamic,
+        MaxRowsAtCompileTime = Dynamic,
+        MaxColsAtCompileTime = Dynamic,
+        Flags = SkylineBit | _Options,
+        CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    };
+};
+}
+
+template<typename _Scalar, int _Options>
+class SkylineMatrix
+: public SkylineMatrixBase<SkylineMatrix<_Scalar, _Options> > {
+public:
+    EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(SkylineMatrix)
+    EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, +=)
+    EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, -=)
+
+    using Base::IsRowMajor;
+
+protected:
+
+    typedef SkylineMatrix<Scalar, (Flags&~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0) > TransposedSkylineMatrix;
+
+    Index m_outerSize;
+    Index m_innerSize;
+
+public:
+    Index* m_colStartIndex;
+    Index* m_rowStartIndex;
+    SkylineStorage<Scalar> m_data;
+
+public:
+
+    inline Index rows() const {
+        return IsRowMajor ? m_outerSize : m_innerSize;
+    }
+
+    inline Index cols() const {
+        return IsRowMajor ? m_innerSize : m_outerSize;
+    }
+
+    inline Index innerSize() const {
+        return m_innerSize;
+    }
+
+    inline Index outerSize() const {
+        return m_outerSize;
+    }
+
+    inline Index upperNonZeros() const {
+        return m_data.upperSize();
+    }
+
+    inline Index lowerNonZeros() const {
+        return m_data.lowerSize();
+    }
+
+    inline Index upperNonZeros(Index j) const {
+        return m_colStartIndex[j + 1] - m_colStartIndex[j];
+    }
+
+    inline Index lowerNonZeros(Index j) const {
+        return m_rowStartIndex[j + 1] - m_rowStartIndex[j];
+    }
+
+    inline const Scalar* _diagPtr() const {
+        return &m_data.diag(0);
+    }
+
+    inline Scalar* _diagPtr() {
+        return &m_data.diag(0);
+    }
+
+    inline const Scalar* _upperPtr() const {
+        return &m_data.upper(0);
+    }
+
+    inline Scalar* _upperPtr() {
+        return &m_data.upper(0);
+    }
+
+    inline const Scalar* _lowerPtr() const {
+        return &m_data.lower(0);
+    }
+
+    inline Scalar* _lowerPtr() {
+        return &m_data.lower(0);
+    }
+
+    inline const Index* _upperProfilePtr() const {
+        return &m_data.upperProfile(0);
+    }
+
+    inline Index* _upperProfilePtr() {
+        return &m_data.upperProfile(0);
+    }
+
+    inline const Index* _lowerProfilePtr() const {
+        return &m_data.lowerProfile(0);
+    }
+
+    inline Index* _lowerProfilePtr() {
+        return &m_data.lowerProfile(0);
+    }
+
+    inline Scalar coeff(Index row, Index col) const {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+
+        if (outer == inner)
+            return this->m_data.diag(outer);
+
+        if (IsRowMajor) {
+            if (inner > outer) //upper matrix
+            {
+                const Index minOuterIndex = inner - m_data.upperProfile(inner);
+                if (outer >= minOuterIndex)
+                    return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+                else
+                    return Scalar(0);
+            }
+            if (inner < outer) //lower matrix
+            {
+                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+                if (inner >= minInnerIndex)
+                    return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+                else
+                    return Scalar(0);
+            }
+            return m_data.upper(m_colStartIndex[inner] + outer - inner);
+        } else {
+            if (outer > inner) //upper matrix
+            {
+                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+                if (outer <= maxOuterIndex)
+                    return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+                else
+                    return Scalar(0);
+            }
+            if (outer < inner) //lower matrix
+            {
+                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+
+                if (inner <= maxInnerIndex)
+                    return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+                else
+                    return Scalar(0);
+            }
+        }
+    }
+
+    inline Scalar& coeffRef(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+
+        if (outer == inner)
+            return this->m_data.diag(outer);
+
+        if (IsRowMajor) {
+            if (col > row) //upper matrix
+            {
+                const Index minOuterIndex = inner - m_data.upperProfile(inner);
+                eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
+                return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+            }
+            if (col < row) //lower matrix
+            {
+                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+                eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
+                return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+            }
+        } else {
+            if (outer > inner) //upper matrix
+            {
+                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+                eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
+                return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+            }
+            if (outer < inner) //lower matrix
+            {
+                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+                eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
+                return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+            }
+        }
+    }
+
+    inline Scalar coeffDiag(Index idx) const {
+        eigen_assert(idx < outerSize());
+        eigen_assert(idx < innerSize());
+        return this->m_data.diag(idx);
+    }
+
+    inline Scalar coeffLower(Index row, Index col) const {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+            if (inner >= minInnerIndex)
+                return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+            else
+                return Scalar(0);
+
+        } else {
+            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+            if (inner <= maxInnerIndex)
+                return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+            else
+                return Scalar(0);
+        }
+    }
+
+    inline Scalar coeffUpper(Index row, Index col) const {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minOuterIndex = inner - m_data.upperProfile(inner);
+            if (outer >= minOuterIndex)
+                return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+            else
+                return Scalar(0);
+        } else {
+            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+            if (outer <= maxOuterIndex)
+                return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+            else
+                return Scalar(0);
+        }
+    }
+
+    inline Scalar& coeffRefDiag(Index idx) {
+        eigen_assert(idx < outerSize());
+        eigen_assert(idx < innerSize());
+        return this->m_data.diag(idx);
+    }
+
+    inline Scalar& coeffRefLower(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+            eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
+            return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+        } else {
+            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+            eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
+            return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+        }
+    }
+
+    inline bool coeffExistLower(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+            return inner >= minInnerIndex;
+        } else {
+            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+            return inner <= maxInnerIndex;
+        }
+    }
+
+    inline Scalar& coeffRefUpper(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minOuterIndex = inner - m_data.upperProfile(inner);
+            eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
+            return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+        } else {
+            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+            eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
+            return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+        }
+    }
+
+    inline bool coeffExistUpper(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+        eigen_assert(inner != outer);
+
+        if (IsRowMajor) {
+            const Index minOuterIndex = inner - m_data.upperProfile(inner);
+            return outer >= minOuterIndex;
+        } else {
+            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+            return outer <= maxOuterIndex;
+        }
+    }
+
+
+protected:
+
+public:
+    class InnerUpperIterator;
+    class InnerLowerIterator;
+
+    class OuterUpperIterator;
+    class OuterLowerIterator;
+
+    /** Removes all non zeros */
+    inline void setZero() {
+        m_data.clear();
+        memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+        memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+    }
+
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const {
+        return m_data.diagSize() + m_data.upperSize() + m_data.lowerSize();
+    }
+
+    /** Preallocates \a reserveSize non zeros */
+    inline void reserve(Index reserveSize, Index reserveUpperSize, Index reserveLowerSize) {
+        m_data.reserve(reserveSize, reserveUpperSize, reserveLowerSize);
+    }
+
+    /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
+
+     *
+     * \warning This function can be extremely slow if the non zero coefficients
+     * are not inserted in a coherent order.
+     *
+     * After an insertion session, you should call the finalize() function.
+     */
+    EIGEN_DONT_INLINE Scalar & insert(Index row, Index col) {
+        const Index outer = IsRowMajor ? row : col;
+        const Index inner = IsRowMajor ? col : row;
+
+        eigen_assert(outer < outerSize());
+        eigen_assert(inner < innerSize());
+
+        if (outer == inner)
+            return m_data.diag(col);
+
+        if (IsRowMajor) {
+            if (outer < inner) //upper matrix
+            {
+                Index minOuterIndex = 0;
+                minOuterIndex = inner - m_data.upperProfile(inner);
+
+                if (outer < minOuterIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.upperProfile(inner);
+
+                    m_data.upperProfile(inner) = inner - outer;
+
+
+                    const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_colStartIndex[cols()];
+                    const Index start = m_colStartIndex[inner];
+
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
+                    }
+
+                    for (Index innerIdx = cols(); innerIdx > inner; innerIdx--) {
+                        m_colStartIndex[innerIdx] += bandIncrement;
+                    }
+
+                    //zeros new data
+                    memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+
+                    return m_data.upper(m_colStartIndex[inner]);
+                } else {
+                    return m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+                }
+            }
+
+            if (outer > inner) //lower matrix
+            {
+                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+                if (inner < minInnerIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.lowerProfile(outer);
+                    m_data.lowerProfile(outer) = outer - inner;
+
+                    const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_rowStartIndex[rows()];
+                    const Index start = m_rowStartIndex[outer];
+
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
+                    }
+
+                    for (Index innerIdx = rows(); innerIdx > outer; innerIdx--) {
+                        m_rowStartIndex[innerIdx] += bandIncrement;
+                    }
+
+                    //zeros new data
+                    memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+                    return m_data.lower(m_rowStartIndex[outer]);
+                } else {
+                    return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+                }
+            }
+        } else {
+            if (outer > inner) //upper matrix
+            {
+                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+                if (outer > maxOuterIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.upperProfile(inner);
+                    m_data.upperProfile(inner) = outer - inner;
+
+                    const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_rowStartIndex[rows()];
+                    const Index start = m_rowStartIndex[inner + 1];
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
+                    }
+
+                    for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
+                        m_rowStartIndex[innerIdx] += bandIncrement;
+                    }
+                    memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                    return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
+                } else {
+                    return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
+                }
+            }
+
+            if (outer < inner) //lower matrix
+            {
+                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+                if (inner > maxInnerIndex) //The value does not yet exist
+                {
+                    const Index previousProfile = m_data.lowerProfile(outer);
+                    m_data.lowerProfile(outer) = inner - outer;
+
+                    const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
+                    //shift data stored after this new one
+                    const Index stop = m_colStartIndex[cols()];
+                    const Index start = m_colStartIndex[outer + 1];
+
+                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+                        m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
+                    }
+
+                    for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
+                        m_colStartIndex[innerIdx] += bandIncrement;
+                    }
+                    memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+                    return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
+                } else {
+                    return m_data.lower(m_colStartIndex[outer] + (inner - outer));
+                }
+            }
+        }
+    }
+
+    /** Must be called after inserting a set of non zero entries.
+     */
+    inline void finalize() {
+        if (IsRowMajor) {
+            if (rows() > cols())
+                m_data.resize(cols(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
+            else
+                m_data.resize(rows(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
+
+            //            eigen_assert(rows() == cols() && "memory reorganisatrion only works with suare matrix");
+            //
+            //            Scalar* newArray = new Scalar[m_colStartIndex[cols()] + 1 + m_rowStartIndex[rows()] + 1];
+            //            Index dataIdx = 0;
+            //            for (Index row = 0; row < rows(); row++) {
+            //
+            //                const Index nbLowerElts = m_rowStartIndex[row + 1] - m_rowStartIndex[row];
+            //                //                std::cout << "nbLowerElts" << nbLowerElts << std::endl;
+            //                memcpy(newArray + dataIdx, m_data.m_lower + m_rowStartIndex[row], nbLowerElts * sizeof (Scalar));
+            //                m_rowStartIndex[row] = dataIdx;
+            //                dataIdx += nbLowerElts;
+            //
+            //                const Index nbUpperElts = m_colStartIndex[row + 1] - m_colStartIndex[row];
+            //                memcpy(newArray + dataIdx, m_data.m_upper + m_colStartIndex[row], nbUpperElts * sizeof (Scalar));
+            //                m_colStartIndex[row] = dataIdx;
+            //                dataIdx += nbUpperElts;
+            //
+            //
+            //            }
+            //            //todo : don't access m_data profile directly : add an accessor from SkylineMatrix
+            //            m_rowStartIndex[rows()] = m_rowStartIndex[rows()-1] + m_data.lowerProfile(rows()-1);
+            //            m_colStartIndex[cols()] = m_colStartIndex[cols()-1] + m_data.upperProfile(cols()-1);
+            //
+            //            delete[] m_data.m_lower;
+            //            delete[] m_data.m_upper;
+            //
+            //            m_data.m_lower = newArray;
+            //            m_data.m_upper = newArray;
+        } else {
+            if (rows() > cols())
+                m_data.resize(cols(), rows(), cols(), m_rowStartIndex[cols()] + 1, m_colStartIndex[cols()] + 1);
+            else
+                m_data.resize(rows(), rows(), cols(), m_rowStartIndex[rows()] + 1, m_colStartIndex[rows()] + 1);
+        }
+    }
+
+    inline void squeeze() {
+        finalize();
+        m_data.squeeze();
+    }
+
+    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar > ()) {
+        //TODO
+    }
+
+    /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero
+     * \sa resizeNonZeros(Index), reserve(), setZero()
+     */
+    void resize(size_t rows, size_t cols) {
+        const Index diagSize = rows > cols ? cols : rows;
+        m_innerSize = IsRowMajor ? cols : rows;
+
+        eigen_assert(rows == cols && "Skyline matrix must be square matrix");
+
+        if (diagSize % 2) { // diagSize is odd
+            const Index k = (diagSize - 1) / 2;
+
+            m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
+                    2 * k * k + k + 1,
+                    2 * k * k + k + 1);
+
+        } else // diagSize is even
+        {
+            const Index k = diagSize / 2;
+            m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
+                    2 * k * k - k + 1,
+                    2 * k * k - k + 1);
+        }
+
+        if (m_colStartIndex && m_rowStartIndex) {
+            delete[] m_colStartIndex;
+            delete[] m_rowStartIndex;
+        }
+        m_colStartIndex = new Index [cols + 1];
+        m_rowStartIndex = new Index [rows + 1];
+        m_outerSize = diagSize;
+
+        m_data.reset();
+        m_data.clear();
+
+        m_outerSize = diagSize;
+        memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
+        memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+    }
+
+    void resizeNonZeros(Index size) {
+        m_data.resize(size);
+    }
+
+    inline SkylineMatrix()
+    : m_outerSize(-1), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        resize(0, 0);
+    }
+
+    inline SkylineMatrix(size_t rows, size_t cols)
+    : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        resize(rows, cols);
+    }
+
+    template<typename OtherDerived>
+    inline SkylineMatrix(const SkylineMatrixBase<OtherDerived>& other)
+    : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        *this = other.derived();
+    }
+
+    inline SkylineMatrix(const SkylineMatrix & other)
+    : Base(), m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+        *this = other.derived();
+    }
+
+    inline void swap(SkylineMatrix & other) {
+        //EIGEN_DBG_SKYLINE(std::cout << "SkylineMatrix:: swap\n");
+        std::swap(m_colStartIndex, other.m_colStartIndex);
+        std::swap(m_rowStartIndex, other.m_rowStartIndex);
+        std::swap(m_innerSize, other.m_innerSize);
+        std::swap(m_outerSize, other.m_outerSize);
+        m_data.swap(other.m_data);
+    }
+
+    inline SkylineMatrix & operator=(const SkylineMatrix & other) {
+        std::cout << "SkylineMatrix& operator=(const SkylineMatrix& other)\n";
+        if (other.isRValue()) {
+            swap(other.const_cast_derived());
+        } else {
+            resize(other.rows(), other.cols());
+            memcpy(m_colStartIndex, other.m_colStartIndex, (m_outerSize + 1) * sizeof (Index));
+            memcpy(m_rowStartIndex, other.m_rowStartIndex, (m_outerSize + 1) * sizeof (Index));
+            m_data = other.m_data;
+        }
+        return *this;
+    }
+
+    template<typename OtherDerived>
+            inline SkylineMatrix & operator=(const SkylineMatrixBase<OtherDerived>& other) {
+        const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+        if (needToTranspose) {
+            //         TODO
+            //            return *this;
+        } else {
+            // there is no special optimization
+            return SkylineMatrixBase<SkylineMatrix>::operator=(other.derived());
+        }
+    }
+
+    friend std::ostream & operator <<(std::ostream & s, const SkylineMatrix & m) {
+
+        EIGEN_DBG_SKYLINE(
+        std::cout << "upper elements : " << std::endl;
+        for (Index i = 0; i < m.m_data.upperSize(); i++)
+            std::cout << m.m_data.upper(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "upper profile : " << std::endl;
+        for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
+            std::cout << m.m_data.upperProfile(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "lower startIdx : " << std::endl;
+        for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
+            std::cout << (IsRowMajor ? m.m_colStartIndex[i] : m.m_rowStartIndex[i]) << "\t";
+        std::cout << std::endl;
+
+
+        std::cout << "lower elements : " << std::endl;
+        for (Index i = 0; i < m.m_data.lowerSize(); i++)
+            std::cout << m.m_data.lower(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "lower profile : " << std::endl;
+        for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
+            std::cout << m.m_data.lowerProfile(i) << "\t";
+        std::cout << std::endl;
+        std::cout << "lower startIdx : " << std::endl;
+        for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
+            std::cout << (IsRowMajor ? m.m_rowStartIndex[i] : m.m_colStartIndex[i]) << "\t";
+        std::cout << std::endl;
+        );
+        for (Index rowIdx = 0; rowIdx < m.rows(); rowIdx++) {
+            for (Index colIdx = 0; colIdx < m.cols(); colIdx++) {
+                s << m.coeff(rowIdx, colIdx) << "\t";
+            }
+            s << std::endl;
+        }
+        return s;
+    }
+
+    /** Destructor */
+    inline ~SkylineMatrix() {
+        delete[] m_colStartIndex;
+        delete[] m_rowStartIndex;
+    }
+
+    /** Overloaded for performance */
+    Scalar sum() const;
+};
+
+template<typename Scalar, int _Options>
+class SkylineMatrix<Scalar, _Options>::InnerUpperIterator {
+public:
+
+    InnerUpperIterator(const SkylineMatrix& mat, Index outer)
+    : m_matrix(mat), m_outer(outer),
+    m_id(_Options == RowMajor ? mat.m_colStartIndex[outer] : mat.m_rowStartIndex[outer] + 1),
+    m_start(m_id),
+    m_end(_Options == RowMajor ? mat.m_colStartIndex[outer + 1] : mat.m_rowStartIndex[outer + 1] + 1) {
+    }
+
+    inline InnerUpperIterator & operator++() {
+        m_id++;
+        return *this;
+    }
+
+    inline InnerUpperIterator & operator+=(Index shift) {
+        m_id += shift;
+        return *this;
+    }
+
+    inline Scalar value() const {
+        return m_matrix.m_data.upper(m_id);
+    }
+
+    inline Scalar* valuePtr() {
+        return const_cast<Scalar*> (&(m_matrix.m_data.upper(m_id)));
+    }
+
+    inline Scalar& valueRef() {
+        return const_cast<Scalar&> (m_matrix.m_data.upper(m_id));
+    }
+
+    inline Index index() const {
+        return IsRowMajor ? m_outer - m_matrix.m_data.upperProfile(m_outer) + (m_id - m_start) :
+                m_outer + (m_id - m_start) + 1;
+    }
+
+    inline Index row() const {
+        return IsRowMajor ? index() : m_outer;
+    }
+
+    inline Index col() const {
+        return IsRowMajor ? m_outer : index();
+    }
+
+    inline size_t size() const {
+        return m_matrix.m_data.upperProfile(m_outer);
+    }
+
+    inline operator bool() const {
+        return (m_id < m_end) && (m_id >= m_start);
+    }
+
+protected:
+    const SkylineMatrix& m_matrix;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+    const Index m_end;
+};
+
+template<typename Scalar, int _Options>
+class SkylineMatrix<Scalar, _Options>::InnerLowerIterator {
+public:
+
+    InnerLowerIterator(const SkylineMatrix& mat, Index outer)
+    : m_matrix(mat),
+    m_outer(outer),
+    m_id(_Options == RowMajor ? mat.m_rowStartIndex[outer] : mat.m_colStartIndex[outer] + 1),
+    m_start(m_id),
+    m_end(_Options == RowMajor ? mat.m_rowStartIndex[outer + 1] : mat.m_colStartIndex[outer + 1] + 1) {
+    }
+
+    inline InnerLowerIterator & operator++() {
+        m_id++;
+        return *this;
+    }
+
+    inline InnerLowerIterator & operator+=(Index shift) {
+        m_id += shift;
+        return *this;
+    }
+
+    inline Scalar value() const {
+        return m_matrix.m_data.lower(m_id);
+    }
+
+    inline Scalar* valuePtr() {
+        return const_cast<Scalar*> (&(m_matrix.m_data.lower(m_id)));
+    }
+
+    inline Scalar& valueRef() {
+        return const_cast<Scalar&> (m_matrix.m_data.lower(m_id));
+    }
+
+    inline Index index() const {
+        return IsRowMajor ? m_outer - m_matrix.m_data.lowerProfile(m_outer) + (m_id - m_start) :
+                m_outer + (m_id - m_start) + 1;
+        ;
+    }
+
+    inline Index row() const {
+        return IsRowMajor ? m_outer : index();
+    }
+
+    inline Index col() const {
+        return IsRowMajor ? index() : m_outer;
+    }
+
+    inline size_t size() const {
+        return m_matrix.m_data.lowerProfile(m_outer);
+    }
+
+    inline operator bool() const {
+        return (m_id < m_end) && (m_id >= m_start);
+    }
+
+protected:
+    const SkylineMatrix& m_matrix;
+    const Index m_outer;
+    Index m_id;
+    const Index m_start;
+    const Index m_end;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEMATRIX_H

diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
new file mode 100644
index 0000000..b0d5e10
--- /dev/null
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h

@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEMATRIXBASE_H
+#define EIGEN_SKYLINEMATRIXBASE_H
+
+#include "SkylineUtil.h"
+
+namespace Eigen {
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineMatrixBase
+ *
+ * \brief Base class of any skyline matrices or skyline expressions
+ *
+ * \param Derived
+ *
+ */
+template<typename Derived> class SkylineMatrixBase : public EigenBase<Derived> {
+public:
+
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::index<StorageKind>::type Index;
+
+    enum {
+        RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+        /**< The number of rows at compile-time. This is just a copy of the value provided
+         * by the \a Derived type. If a value is not known at compile-time,
+         * it is set to the \a Dynamic constant.
+         * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+        ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+        /**< The number of columns at compile-time. This is just a copy of the value provided
+         * by the \a Derived type. If a value is not known at compile-time,
+         * it is set to the \a Dynamic constant.
+         * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+
+        SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+        internal::traits<Derived>::ColsAtCompileTime>::ret),
+        /**< This is equal to the number of coefficients, i.e. the number of
+         * rows times the number of columns, or to \a Dynamic if this is not
+         * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+        MaxRowsAtCompileTime = RowsAtCompileTime,
+        MaxColsAtCompileTime = ColsAtCompileTime,
+
+        MaxSizeAtCompileTime = (internal::size_at_compile_time<MaxRowsAtCompileTime,
+        MaxColsAtCompileTime>::ret),
+
+        IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
+        /**< This is set to true if either the number of rows or the number of
+         * columns is known at compile-time to be equal to 1. Indeed, in that case,
+         * we are dealing with a column-vector (if there is only one column) or with
+         * a row-vector (if there is only one row). */
+
+        Flags = internal::traits<Derived>::Flags,
+        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+         * constructed from this one. See the \ref flags "list of flags".
+         */
+
+        CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+        /**< This is a rough measure of how expensive it is to read one coefficient from
+         * this expression.
+         */
+
+        IsRowMajor = Flags & RowMajorBit ? 1 : 0
+    };
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is the "real scalar" type; if the \a Scalar type is already real numbers
+     * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
+     * \a Scalar is \a std::complex<T> then RealScalar is \a T.
+     *
+     * \sa class NumTraits
+     */
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    /** type of the equivalent square matrix */
+    typedef Matrix<Scalar, EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime),
+                           EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime) > SquareMatrixType;
+
+    inline const Derived& derived() const {
+        return *static_cast<const Derived*> (this);
+    }
+
+    inline Derived& derived() {
+        return *static_cast<Derived*> (this);
+    }
+
+    inline Derived& const_cast_derived() const {
+        return *static_cast<Derived*> (const_cast<SkylineMatrixBase*> (this));
+    }
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+        return derived().rows();
+    }
+
+    /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
+        return derived().cols();
+    }
+
+    /** \returns the number of coefficients, which is \a rows()*cols().
+     * \sa rows(), cols(), SizeAtCompileTime. */
+    inline EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT {
+        return rows() * cols();
+    }
+
+    /** \returns the number of nonzero coefficients which is in practice the number
+     * of stored coefficients. */
+    inline Index nonZeros() const {
+        return derived().nonZeros();
+    }
+
+    /** \returns the size of the storage major dimension,
+     * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
+    Index outerSize() const {
+        return (int(Flags) & RowMajorBit) ? this->rows() : this->cols();
+    }
+
+    /** \returns the size of the inner dimension according to the storage order,
+     * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
+    Index innerSize() const {
+        return (int(Flags) & RowMajorBit) ? this->cols() : this->rows();
+    }
+
+    bool isRValue() const {
+        return m_isRValue;
+    }
+
+    Derived& markAsRValue() {
+        m_isRValue = true;
+        return derived();
+    }
+
+    SkylineMatrixBase() : m_isRValue(false) {
+        /* TODO check flags */
+    }
+
+    inline Derived & operator=(const Derived& other) {
+        this->operator=<Derived > (other);
+        return derived();
+    }
+
+    template<typename OtherDerived>
+    inline void assignGeneric(const OtherDerived& other) {
+        derived().resize(other.rows(), other.cols());
+        for (Index row = 0; row < rows(); row++)
+            for (Index col = 0; col < cols(); col++) {
+                if (other.coeff(row, col) != Scalar(0))
+                    derived().insert(row, col) = other.coeff(row, col);
+            }
+        derived().finalize();
+    }
+
+    template<typename OtherDerived>
+            inline Derived & operator=(const SkylineMatrixBase<OtherDerived>& other) {
+        //TODO
+    }
+
+    template<typename Lhs, typename Rhs>
+            inline Derived & operator=(const SkylineProduct<Lhs, Rhs, SkylineTimeSkylineProduct>& product);
+
+    friend std::ostream & operator <<(std::ostream & s, const SkylineMatrixBase& m) {
+        s << m.derived();
+        return s;
+    }
+
+    template<typename OtherDerived>
+    const typename SkylineProductReturnType<Derived, OtherDerived>::Type
+    operator*(const MatrixBase<OtherDerived> &other) const;
+
+    /** \internal use operator= */
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& dst) const {
+        dst.setZero();
+        for (Index i = 0; i < rows(); i++)
+            for (Index j = 0; j < rows(); j++)
+                dst(i, j) = derived().coeff(i, j);
+    }
+
+    Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime> toDense() const {
+        return derived();
+    }
+
+    /** \returns the matrix or vector obtained by evaluating this expression.
+     *
+     * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+     * a const reference, in order to avoid a useless copy.
+     */
+    EIGEN_STRONG_INLINE const typename internal::eval<Derived, IsSkyline>::type eval() const {
+        return typename internal::eval<Derived>::type(derived());
+    }
+
+protected:
+    bool m_isRValue;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEMATRIXBASE_H

diff --git a/unsupported/Eigen/src/Skyline/SkylineProduct.h b/unsupported/Eigen/src/Skyline/SkylineProduct.h
new file mode 100644
index 0000000..d9eb814
--- /dev/null
+++ b/unsupported/Eigen/src/Skyline/SkylineProduct.h

@@ -0,0 +1,295 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEPRODUCT_H
+#define EIGEN_SKYLINEPRODUCT_H
+
+namespace Eigen { 
+
+template<typename Lhs, typename Rhs, int ProductMode>
+struct SkylineProductReturnType {
+    typedef const typename internal::nested_eval<Lhs, Rhs::RowsAtCompileTime>::type LhsNested;
+    typedef const typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+
+    typedef SkylineProduct<LhsNested, RhsNested, ProductMode> Type;
+};
+
+template<typename LhsNested, typename RhsNested, int ProductMode>
+struct internal::traits<SkylineProduct<LhsNested, RhsNested, ProductMode> > {
+    // clean the nested types:
+    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
+    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
+    typedef typename _LhsNested::Scalar Scalar;
+
+    enum {
+        LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+        RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+        LhsFlags = _LhsNested::Flags,
+        RhsFlags = _RhsNested::Flags,
+
+        RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+        ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+        InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+        MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+        MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+        EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
+        ResultIsSkyline = ProductMode == SkylineTimeSkylineProduct,
+
+        RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit) | (ResultIsSkyline ? 0 : SkylineBit)),
+
+        Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+        | EvalBeforeAssigningBit
+        | EvalBeforeNestingBit,
+
+        CoeffReadCost = HugeCost
+    };
+
+    typedef typename internal::conditional<ResultIsSkyline,
+            SkylineMatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> >,
+            MatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> > >::type Base;
+};
+
+namespace internal {
+template<typename LhsNested, typename RhsNested, int ProductMode>
+class SkylineProduct : no_assignment_operator,
+public traits<SkylineProduct<LhsNested, RhsNested, ProductMode> >::Base {
+public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(SkylineProduct)
+
+private:
+
+    typedef typename traits<SkylineProduct>::_LhsNested _LhsNested;
+    typedef typename traits<SkylineProduct>::_RhsNested _RhsNested;
+
+public:
+
+    template<typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE SkylineProduct(const Lhs& lhs, const Rhs& rhs)
+    : m_lhs(lhs), m_rhs(rhs) {
+        eigen_assert(lhs.cols() == rhs.rows());
+
+        enum {
+            ProductIsValid = _LhsNested::ColsAtCompileTime == Dynamic
+            || _RhsNested::RowsAtCompileTime == Dynamic
+            || int(_LhsNested::ColsAtCompileTime) == int(_RhsNested::RowsAtCompileTime),
+            AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
+            SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested, _RhsNested)
+        };
+        // note to the lost user:
+        //    * for a dot product use: v1.dot(v2)
+        //    * for a coeff-wise product use: v1.cwise()*v2
+        EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+                INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+                EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+                INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+                EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+    }
+
+    EIGEN_STRONG_INLINE Index rows() const {
+        return m_lhs.rows();
+    }
+
+    EIGEN_STRONG_INLINE Index cols() const {
+        return m_rhs.cols();
+    }
+
+    EIGEN_STRONG_INLINE const _LhsNested& lhs() const {
+        return m_lhs;
+    }
+
+    EIGEN_STRONG_INLINE const _RhsNested& rhs() const {
+        return m_rhs;
+    }
+
+protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+// dense = skyline * dense
+// Note that here we force no inlining and separate the setZero() because GCC messes up otherwise
+
+template<typename Lhs, typename Rhs, typename Dest>
+EIGEN_DONT_INLINE void skyline_row_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
+    typedef typename remove_all<Lhs>::type _Lhs;
+    typedef typename remove_all<Rhs>::type _Rhs;
+    typedef typename traits<Lhs>::Scalar Scalar;
+
+    enum {
+        LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
+        LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
+        ProcessFirstHalf = LhsIsSelfAdjoint
+        && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
+        || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
+        || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
+        ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
+    };
+
+    //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
+    for (Index col = 0; col < rhs.cols(); col++) {
+        for (Index row = 0; row < lhs.rows(); row++) {
+            dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
+        }
+    }
+    //Use matrix lower triangular part
+    for (Index row = 0; row < lhs.rows(); row++) {
+        typename _Lhs::InnerLowerIterator lIt(lhs, row);
+        const Index stop = lIt.col() + lIt.size();
+        for (Index col = 0; col < rhs.cols(); col++) {
+
+            Index k = lIt.col();
+            Scalar tmp = 0;
+            while (k < stop) {
+                tmp +=
+                        lIt.value() *
+                        rhs(k++, col);
+                ++lIt;
+            }
+            dst(row, col) += tmp;
+            lIt += -lIt.size();
+        }
+
+    }
+
+    //Use matrix upper triangular part
+    for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
+        typename _Lhs::InnerUpperIterator uIt(lhs, lhscol);
+        const Index stop = uIt.size() + uIt.row();
+        for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
+
+
+            const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
+            Index k = uIt.row();
+            while (k < stop) {
+                dst(k++, rhscol) +=
+                        uIt.value() *
+                        rhsCoeff;
+                ++uIt;
+            }
+            uIt += -uIt.size();
+        }
+    }
+
+}
+
+template<typename Lhs, typename Rhs, typename Dest>
+EIGEN_DONT_INLINE void skyline_col_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
+    typedef typename remove_all<Lhs>::type _Lhs;
+    typedef typename remove_all<Rhs>::type _Rhs;
+    typedef typename traits<Lhs>::Scalar Scalar;
+
+    enum {
+        LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
+        LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
+        ProcessFirstHalf = LhsIsSelfAdjoint
+        && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
+        || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
+        || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
+        ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
+    };
+
+    //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
+    for (Index col = 0; col < rhs.cols(); col++) {
+        for (Index row = 0; row < lhs.rows(); row++) {
+            dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
+        }
+    }
+
+    //Use matrix upper triangular part
+    for (Index row = 0; row < lhs.rows(); row++) {
+        typename _Lhs::InnerUpperIterator uIt(lhs, row);
+        const Index stop = uIt.col() + uIt.size();
+        for (Index col = 0; col < rhs.cols(); col++) {
+
+            Index k = uIt.col();
+            Scalar tmp = 0;
+            while (k < stop) {
+                tmp +=
+                        uIt.value() *
+                        rhs(k++, col);
+                ++uIt;
+            }
+
+
+            dst(row, col) += tmp;
+            uIt += -uIt.size();
+        }
+    }
+
+    //Use matrix lower triangular part
+    for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
+        typename _Lhs::InnerLowerIterator lIt(lhs, lhscol);
+        const Index stop = lIt.size() + lIt.row();
+        for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
+
+            const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
+            Index k = lIt.row();
+            while (k < stop) {
+                dst(k++, rhscol) +=
+                        lIt.value() *
+                        rhsCoeff;
+                ++lIt;
+            }
+            lIt += -lIt.size();
+        }
+    }
+
+}
+
+template<typename Lhs, typename Rhs, typename ResultType,
+        int LhsStorageOrder = traits<Lhs>::Flags&RowMajorBit>
+        struct skyline_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct skyline_product_selector<Lhs, Rhs, ResultType, RowMajor> {
+    typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+    static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
+        skyline_row_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
+    }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct skyline_product_selector<Lhs, Rhs, ResultType, ColMajor> {
+    typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+    static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
+        skyline_col_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
+    }
+};
+
+} // end namespace internal
+
+// template<typename Derived>
+// template<typename Lhs, typename Rhs >
+// Derived & MatrixBase<Derived>::lazyAssign(const SkylineProduct<Lhs, Rhs, SkylineTimeDenseProduct>& product) {
+//     typedef typename internal::remove_all<Lhs>::type _Lhs;
+//     internal::skyline_product_selector<typename internal::remove_all<Lhs>::type,
+//             typename internal::remove_all<Rhs>::type,
+//             Derived>::run(product.lhs(), product.rhs(), derived());
+// 
+//     return derived();
+// }
+
+// skyline * dense
+
+template<typename Derived>
+template<typename OtherDerived >
+EIGEN_STRONG_INLINE const typename SkylineProductReturnType<Derived, OtherDerived>::Type
+SkylineMatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const {
+
+    return typename SkylineProductReturnType<Derived, OtherDerived>::Type(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEPRODUCT_H

diff --git a/unsupported/Eigen/src/Skyline/SkylineStorage.h b/unsupported/Eigen/src/Skyline/SkylineStorage.h
new file mode 100644
index 0000000..cc7514f
--- /dev/null
+++ b/unsupported/Eigen/src/Skyline/SkylineStorage.h

@@ -0,0 +1,259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINE_STORAGE_H
+#define EIGEN_SKYLINE_STORAGE_H
+
+namespace Eigen { 
+
+/** Stores a skyline set of values in three structures :
+ * The diagonal elements
+ * The upper elements
+ * The lower elements
+ *
+ */
+template<typename Scalar>
+class SkylineStorage {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef SparseIndex Index;
+public:
+
+    SkylineStorage()
+    : m_diag(0),
+    m_lower(0),
+    m_upper(0),
+    m_lowerProfile(0),
+    m_upperProfile(0),
+    m_diagSize(0),
+    m_upperSize(0),
+    m_lowerSize(0),
+    m_upperProfileSize(0),
+    m_lowerProfileSize(0),
+    m_allocatedSize(0) {
+    }
+
+    SkylineStorage(const SkylineStorage& other)
+    : m_diag(0),
+    m_lower(0),
+    m_upper(0),
+    m_lowerProfile(0),
+    m_upperProfile(0),
+    m_diagSize(0),
+    m_upperSize(0),
+    m_lowerSize(0),
+    m_upperProfileSize(0),
+    m_lowerProfileSize(0),
+    m_allocatedSize(0) {
+        *this = other;
+    }
+
+    SkylineStorage & operator=(const SkylineStorage& other) {
+        resize(other.diagSize(), other.m_upperProfileSize, other.m_lowerProfileSize, other.upperSize(), other.lowerSize());
+        memcpy(m_diag, other.m_diag, m_diagSize * sizeof (Scalar));
+        memcpy(m_upper, other.m_upper, other.upperSize() * sizeof (Scalar));
+        memcpy(m_lower, other.m_lower, other.lowerSize() * sizeof (Scalar));
+        memcpy(m_upperProfile, other.m_upperProfile, m_upperProfileSize * sizeof (Index));
+        memcpy(m_lowerProfile, other.m_lowerProfile, m_lowerProfileSize * sizeof (Index));
+        return *this;
+    }
+
+    void swap(SkylineStorage& other) {
+        std::swap(m_diag, other.m_diag);
+        std::swap(m_upper, other.m_upper);
+        std::swap(m_lower, other.m_lower);
+        std::swap(m_upperProfile, other.m_upperProfile);
+        std::swap(m_lowerProfile, other.m_lowerProfile);
+        std::swap(m_diagSize, other.m_diagSize);
+        std::swap(m_upperSize, other.m_upperSize);
+        std::swap(m_lowerSize, other.m_lowerSize);
+        std::swap(m_allocatedSize, other.m_allocatedSize);
+    }
+
+    ~SkylineStorage() {
+        delete[] m_diag;
+        delete[] m_upper;
+        if (m_upper != m_lower)
+            delete[] m_lower;
+        delete[] m_upperProfile;
+        delete[] m_lowerProfile;
+    }
+
+    void reserve(Index size, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
+        Index newAllocatedSize = size + upperSize + lowerSize;
+        if (newAllocatedSize > m_allocatedSize)
+            reallocate(size, upperProfileSize, lowerProfileSize, upperSize, lowerSize);
+    }
+
+    void squeeze() {
+        if (m_allocatedSize > m_diagSize + m_upperSize + m_lowerSize)
+            reallocate(m_diagSize, m_upperProfileSize, m_lowerProfileSize, m_upperSize, m_lowerSize);
+    }
+
+    void resize(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize, float reserveSizeFactor = 0) {
+        if (m_allocatedSize < diagSize + upperSize + lowerSize)
+            reallocate(diagSize, upperProfileSize, lowerProfileSize, upperSize + Index(reserveSizeFactor * upperSize), lowerSize + Index(reserveSizeFactor * lowerSize));
+        m_diagSize = diagSize;
+        m_upperSize = upperSize;
+        m_lowerSize = lowerSize;
+        m_upperProfileSize = upperProfileSize;
+        m_lowerProfileSize = lowerProfileSize;
+    }
+
+    inline Index diagSize() const {
+        return m_diagSize;
+    }
+
+    inline Index upperSize() const {
+        return m_upperSize;
+    }
+
+    inline Index lowerSize() const {
+        return m_lowerSize;
+    }
+
+    inline Index upperProfileSize() const {
+        return m_upperProfileSize;
+    }
+
+    inline Index lowerProfileSize() const {
+        return m_lowerProfileSize;
+    }
+
+    inline Index allocatedSize() const {
+        return m_allocatedSize;
+    }
+
+    inline void clear() {
+        m_diagSize = 0;
+    }
+
+    inline Scalar& diag(Index i) {
+        return m_diag[i];
+    }
+
+    inline const Scalar& diag(Index i) const {
+        return m_diag[i];
+    }
+
+    inline Scalar& upper(Index i) {
+        return m_upper[i];
+    }
+
+    inline const Scalar& upper(Index i) const {
+        return m_upper[i];
+    }
+
+    inline Scalar& lower(Index i) {
+        return m_lower[i];
+    }
+
+    inline const Scalar& lower(Index i) const {
+        return m_lower[i];
+    }
+
+    inline Index& upperProfile(Index i) {
+        return m_upperProfile[i];
+    }
+
+    inline const Index& upperProfile(Index i) const {
+        return m_upperProfile[i];
+    }
+
+    inline Index& lowerProfile(Index i) {
+        return m_lowerProfile[i];
+    }
+
+    inline const Index& lowerProfile(Index i) const {
+        return m_lowerProfile[i];
+    }
+
+    static SkylineStorage Map(Index* upperProfile, Index* lowerProfile, Scalar* diag, Scalar* upper, Scalar* lower, Index size, Index upperSize, Index lowerSize) {
+        SkylineStorage res;
+        res.m_upperProfile = upperProfile;
+        res.m_lowerProfile = lowerProfile;
+        res.m_diag = diag;
+        res.m_upper = upper;
+        res.m_lower = lower;
+        res.m_allocatedSize = res.m_diagSize = size;
+        res.m_upperSize = upperSize;
+        res.m_lowerSize = lowerSize;
+        return res;
+    }
+
+    inline void reset() {
+        memset(m_diag, 0, m_diagSize * sizeof (Scalar));
+        memset(m_upper, 0, m_upperSize * sizeof (Scalar));
+        memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
+        memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
+        memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+    }
+
+    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
+        //TODO
+    }
+
+protected:
+
+    inline void reallocate(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
+
+        Scalar* diag = new Scalar[diagSize];
+        Scalar* upper = new Scalar[upperSize];
+        Scalar* lower = new Scalar[lowerSize];
+        Index* upperProfile = new Index[upperProfileSize];
+        Index* lowerProfile = new Index[lowerProfileSize];
+
+        Index copyDiagSize = (std::min)(diagSize, m_diagSize);
+        Index copyUpperSize = (std::min)(upperSize, m_upperSize);
+        Index copyLowerSize = (std::min)(lowerSize, m_lowerSize);
+        Index copyUpperProfileSize = (std::min)(upperProfileSize, m_upperProfileSize);
+        Index copyLowerProfileSize = (std::min)(lowerProfileSize, m_lowerProfileSize);
+
+        // copy
+        memcpy(diag, m_diag, copyDiagSize * sizeof (Scalar));
+        memcpy(upper, m_upper, copyUpperSize * sizeof (Scalar));
+        memcpy(lower, m_lower, copyLowerSize * sizeof (Scalar));
+        memcpy(upperProfile, m_upperProfile, copyUpperProfileSize * sizeof (Index));
+        memcpy(lowerProfile, m_lowerProfile, copyLowerProfileSize * sizeof (Index));
+
+
+
+        // delete old stuff
+        delete[] m_diag;
+        delete[] m_upper;
+        delete[] m_lower;
+        delete[] m_upperProfile;
+        delete[] m_lowerProfile;
+        m_diag = diag;
+        m_upper = upper;
+        m_lower = lower;
+        m_upperProfile = upperProfile;
+        m_lowerProfile = lowerProfile;
+        m_allocatedSize = diagSize + upperSize + lowerSize;
+        m_upperSize = upperSize;
+        m_lowerSize = lowerSize;
+    }
+
+public:
+    Scalar* m_diag;
+    Scalar* m_upper;
+    Scalar* m_lower;
+    Index* m_upperProfile;
+    Index* m_lowerProfile;
+    Index m_diagSize;
+    Index m_upperSize;
+    Index m_lowerSize;
+    Index m_upperProfileSize;
+    Index m_lowerProfileSize;
+    Index m_allocatedSize;
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINE_STORAGE_H

diff --git a/unsupported/Eigen/src/Skyline/SkylineUtil.h b/unsupported/Eigen/src/Skyline/SkylineUtil.h
new file mode 100644
index 0000000..75eb612
--- /dev/null
+++ b/unsupported/Eigen/src/Skyline/SkylineUtil.h

@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEUTIL_H
+#define EIGEN_SKYLINEUTIL_H
+
+namespace Eigen { 
+
+#ifdef NDEBUG
+#define EIGEN_DBG_SKYLINE(X)
+#else
+#define EIGEN_DBG_SKYLINE(X) X
+#endif
+
+const unsigned int SkylineBit = 0x1200;
+template<typename Lhs, typename Rhs, int ProductMode> class SkylineProduct;
+enum AdditionalProductEvaluationMode {SkylineTimeDenseProduct, SkylineTimeSkylineProduct, DenseTimeSkylineProduct};
+enum {IsSkyline = SkylineBit};
+
+
+#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename OtherDerived> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SkylineMatrixBase<OtherDerived>& other) \
+{ \
+  return Base::operator Op(other.derived()); \
+} \
+EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \
+{ \
+  return Base::operator Op(other); \
+}
+
+#define EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename Other> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
+{ \
+  return Base::operator Op(scalar); \
+}
+
+#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
+  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
+  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
+  EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
+  EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
+
+#define _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, BaseClass) \
+  typedef BaseClass Base; \
+  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; \
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
+  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
+  typedef typename Eigen::internal::index<StorageKind>::type Index; \
+  enum {  Flags = Eigen::internal::traits<Derived>::Flags, };
+
+#define EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived) \
+  _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::SkylineMatrixBase<Derived>)
+
+template<typename Derived> class SkylineMatrixBase;
+template<typename _Scalar, int _Flags = 0> class SkylineMatrix;
+template<typename _Scalar, int _Flags = 0> class DynamicSkylineMatrix;
+template<typename _Scalar, int _Flags = 0> class SkylineVector;
+template<typename _Scalar, int _Flags = 0> class MappedSkylineMatrix;
+
+namespace internal {
+
+template<typename Lhs, typename Rhs> struct skyline_product_mode;
+template<typename Lhs, typename Rhs, int ProductMode = skyline_product_mode<Lhs,Rhs>::value> struct SkylineProductReturnType;
+
+template<typename T> class eval<T,IsSkyline>
+{
+    typedef typename traits<T>::Scalar _Scalar;
+    enum {
+          _Flags = traits<T>::Flags
+    };
+
+  public:
+    typedef SkylineMatrix<_Scalar, _Flags> type;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEUTIL_H

diff --git a/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
new file mode 100644
index 0000000..e9ec746
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h

@@ -0,0 +1,122 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
+#define EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
+
+namespace Eigen { 
+
+#if 0
+
+// NOTE Have to be reimplemented as a specialization of BlockImpl< DynamicSparseMatrix<_Scalar, _Options, _Index>, ... >
+// See SparseBlock.h for an example
+
+
+/***************************************************************************
+* specialisation for DynamicSparseMatrix
+***************************************************************************/
+
+template<typename _Scalar, int _Options, typename _Index, int Size>
+class SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size>
+  : public SparseMatrixBase<SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size> >
+{
+    typedef DynamicSparseMatrix<_Scalar, _Options, _Index> MatrixType;
+  public:
+
+    enum { IsRowMajor = internal::traits<SparseInnerVectorSet>::IsRowMajor };
+
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseInnerVectorSet)
+    class InnerIterator: public MatrixType::InnerIterator
+    {
+      public:
+        inline InnerIterator(const SparseInnerVectorSet& xpr, Index outer)
+          : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+        {}
+        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+      protected:
+        Index m_outer;
+    };
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, Index outerStart, Index outerSize)
+      : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
+    {
+      eigen_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
+    }
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, Index outer)
+      : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size)
+    {
+      eigen_assert(Size!=Dynamic);
+      eigen_assert( (outer>=0) && (outer<matrix.outerSize()) );
+    }
+
+    template<typename OtherDerived>
+    inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit))
+      {
+        // need to transpose => perform a block evaluation followed by a big swap
+        DynamicSparseMatrix<Scalar,IsRowMajor?RowMajorBit:0> aux(other);
+        *this = aux.markAsRValue();
+      }
+      else
+      {
+        // evaluate/copy vector per vector
+        for (Index j=0; j<m_outerSize.value(); ++j)
+        {
+          SparseVector<Scalar,IsRowMajor ? RowMajorBit : 0> aux(other.innerVector(j));
+          m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data());
+        }
+      }
+      return *this;
+    }
+
+    inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other)
+    {
+      return operator=<SparseInnerVectorSet>(other);
+    }
+
+    Index nonZeros() const
+    {
+      Index count = 0;
+      for (Index j=0; j<m_outerSize.value(); ++j)
+        count += m_matrix._data()[m_outerStart+j].size();
+      return count;
+    }
+
+    const Scalar& lastCoeff() const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(SparseInnerVectorSet);
+      eigen_assert(m_matrix.data()[m_outerStart].size()>0);
+      return m_matrix.data()[m_outerStart].vale(m_matrix.data()[m_outerStart].size()-1);
+    }
+
+//     template<typename Sparse>
+//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+//     {
+//       return *this;
+//     }
+
+    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    const typename MatrixType::Nested m_matrix;
+    Index m_outerStart;
+    const internal::variable_if_dynamic<Index, Size> m_outerSize;
+
+};
+
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H

diff --git a/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
new file mode 100644
index 0000000..536a0c3
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h

@@ -0,0 +1,1079 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEBLOCKMATRIX_H
+#define EIGEN_SPARSEBLOCKMATRIX_H
+
+namespace Eigen { 
+/** \ingroup SparseCore_Module
+  *
+  * \class BlockSparseMatrix
+  *
+  * \brief A versatile sparse matrix representation where each element is a block
+  *
+  * This class provides routines to manipulate block sparse matrices stored in a
+  * BSR-like representation. There are two main types :
+  *
+  * 1. All blocks have the same number of rows and columns, called block size
+  * in the following. In this case, if this block size is known at compile time,
+  * it can be given as a template parameter like
+  * \code
+  * BlockSparseMatrix<Scalar, 3, ColMajor> bmat(b_rows, b_cols);
+  * \endcode
+  * Here, bmat is a b_rows x b_cols block sparse matrix
+  * where each coefficient is a 3x3 dense matrix.
+  * If the block size is fixed but will be given at runtime,
+  * \code
+  * BlockSparseMatrix<Scalar, Dynamic, ColMajor> bmat(b_rows, b_cols);
+  * bmat.setBlockSize(block_size);
+  * \endcode
+  *
+  * 2. The second case is for variable-block sparse matrices.
+  * Here each block has its own dimensions. The only restriction is that all the blocks
+  * in a row (resp. a column) should have the same number of rows (resp. of columns).
+  * It is thus required in this case to describe the layout of the matrix by calling
+  * setBlockLayout(rowBlocks, colBlocks).
+  *
+  * In any of the previous case, the matrix can be filled by calling setFromTriplets().
+  * A regular sparse matrix can be converted to a block sparse matrix and vice versa.
+  * It is obviously required to describe the block layout beforehand by calling either
+  * setBlockSize() for fixed-size blocks or setBlockLayout for variable-size blocks.
+  *
+  * \tparam _Scalar The Scalar type
+  * \tparam _BlockAtCompileTime The block layout option. It takes the following values
+  * Dynamic : block size known at runtime
+  * a numeric number : fixed-size block known at compile time
+  */
+template<typename _Scalar, int _BlockAtCompileTime=Dynamic, int _Options=ColMajor, typename _StorageIndex=int> class BlockSparseMatrix;
+
+template<typename BlockSparseMatrixT> class BlockSparseMatrixView;
+
+namespace internal {
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _Index>
+struct traits<BlockSparseMatrix<_Scalar,_BlockAtCompileTime,_Options, _Index> >
+{
+  typedef _Scalar Scalar;
+  typedef _Index Index;
+  typedef Sparse StorageKind; // FIXME Where is it used ??
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    BlockSize = _BlockAtCompileTime,
+    Flags = _Options | NestByRefBit | LvalueBit,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = InnerRandomAccessPattern
+  };
+};
+template<typename BlockSparseMatrixT>
+struct traits<BlockSparseMatrixView<BlockSparseMatrixT> >
+{
+  typedef Ref<Matrix<typename BlockSparseMatrixT::Scalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> > Scalar;
+  typedef Ref<Matrix<typename BlockSparseMatrixT::RealScalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> > RealScalar;
+
+};
+
+// Function object to sort a triplet list
+template<typename Iterator, bool IsColMajor>
+struct TripletComp
+{
+  typedef typename Iterator::value_type Triplet;
+  bool operator()(const Triplet& a, const Triplet& b)
+  { if(IsColMajor)
+      return ((a.col() == b.col() && a.row() < b.row()) || (a.col() < b.col()));
+    else
+      return ((a.row() == b.row() && a.col() < b.col()) || (a.row() < b.row()));
+  }
+};
+} // end namespace internal
+
+
+/* Proxy to view the block sparse matrix as a regular sparse matrix */
+template<typename BlockSparseMatrixT>
+class BlockSparseMatrixView : public SparseMatrixBase<BlockSparseMatrixT>
+{
+  public:
+    typedef Ref<typename BlockSparseMatrixT::BlockScalar> Scalar;
+    typedef Ref<typename BlockSparseMatrixT::BlockRealScalar> RealScalar;
+    typedef typename BlockSparseMatrixT::Index Index;
+    typedef  BlockSparseMatrixT Nested;
+    enum {
+      Flags = BlockSparseMatrixT::Options,
+      Options = BlockSparseMatrixT::Options,
+      RowsAtCompileTime = BlockSparseMatrixT::RowsAtCompileTime,
+      ColsAtCompileTime = BlockSparseMatrixT::ColsAtCompileTime,
+      MaxColsAtCompileTime = BlockSparseMatrixT::MaxColsAtCompileTime,
+      MaxRowsAtCompileTime = BlockSparseMatrixT::MaxRowsAtCompileTime
+    };
+  public:
+    BlockSparseMatrixView(const BlockSparseMatrixT& spblockmat)
+     : m_spblockmat(spblockmat)
+    {}
+
+    Index outerSize() const
+    {
+      return (Flags&RowMajorBit) == 1 ? this->rows() : this->cols();
+    }
+    Index cols() const
+    {
+      return m_spblockmat.blockCols();
+    }
+    Index rows() const
+    {
+      return m_spblockmat.blockRows();
+    }
+    Scalar coeff(Index row, Index col)
+    {
+      return m_spblockmat.coeff(row, col);
+    }
+    Scalar coeffRef(Index row, Index col)
+    {
+      return m_spblockmat.coeffRef(row, col);
+    }
+    // Wrapper to iterate over all blocks
+    class InnerIterator : public BlockSparseMatrixT::BlockInnerIterator
+    {
+      public:
+      InnerIterator(const BlockSparseMatrixView& mat, Index outer)
+          : BlockSparseMatrixT::BlockInnerIterator(mat.m_spblockmat, outer)
+      {}
+
+    };
+
+  protected:
+    const BlockSparseMatrixT& m_spblockmat;
+};
+
+// Proxy to view a regular vector as a block vector
+template<typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorView
+{
+  public:
+    enum {
+      BlockSize = BlockSparseMatrixT::BlockSize,
+      ColsAtCompileTime = VectorType::ColsAtCompileTime,
+      RowsAtCompileTime = VectorType::RowsAtCompileTime,
+      Flags = VectorType::Flags
+    };
+    typedef Ref<const Matrix<typename BlockSparseMatrixT::Scalar, (RowsAtCompileTime==1)? 1 : BlockSize, (ColsAtCompileTime==1)? 1 : BlockSize> >Scalar;
+    typedef typename BlockSparseMatrixT::Index Index;
+  public:
+    BlockVectorView(const BlockSparseMatrixT& spblockmat, const VectorType& vec)
+    : m_spblockmat(spblockmat),m_vec(vec)
+    { }
+    inline Index cols() const
+    {
+      return m_vec.cols();
+    }
+    inline Index size() const
+    {
+      return m_spblockmat.blockRows();
+    }
+    inline Scalar coeff(Index bi) const
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.middleRows(startRow, rowSize);
+    }
+    inline Scalar coeff(Index bi, Index j) const
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.block(startRow, j, rowSize, 1);
+    }
+  protected:
+    const BlockSparseMatrixT& m_spblockmat;
+    const VectorType& m_vec;
+};
+
+template<typename VectorType, typename Index> class BlockVectorReturn;
+
+
+// Proxy to view a regular vector as a block vector
+template<typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorReturn
+{
+  public:
+    enum {
+      ColsAtCompileTime = VectorType::ColsAtCompileTime,
+      RowsAtCompileTime = VectorType::RowsAtCompileTime,
+      Flags = VectorType::Flags
+    };
+    typedef Ref<Matrix<typename VectorType::Scalar, RowsAtCompileTime, ColsAtCompileTime> > Scalar;
+    typedef typename BlockSparseMatrixT::Index Index;
+  public:
+    BlockVectorReturn(const BlockSparseMatrixT& spblockmat, VectorType& vec)
+    : m_spblockmat(spblockmat),m_vec(vec)
+    { }
+    inline Index size() const
+    {
+      return m_spblockmat.blockRows();
+    }
+    inline Scalar coeffRef(Index bi)
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.middleRows(startRow, rowSize);
+    }
+    inline Scalar coeffRef(Index bi, Index j)
+    {
+      Index startRow = m_spblockmat.blockRowsIndex(bi);
+      Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+      return m_vec.block(startRow, j, rowSize, 1);
+    }
+
+  protected:
+    const BlockSparseMatrixT& m_spblockmat;
+    VectorType& m_vec;
+};
+
+// Block version of the sparse dense product
+template<typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct;
+
+namespace internal {
+
+template<typename BlockSparseMatrixT, typename VecType>
+struct traits<BlockSparseTimeDenseProduct<BlockSparseMatrixT, VecType> >
+{
+  typedef Dense StorageKind;
+  typedef MatrixXpr XprKind;
+  typedef typename BlockSparseMatrixT::Scalar Scalar;
+  typedef typename BlockSparseMatrixT::Index Index;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    Flags = 0,
+    CoeffReadCost = internal::traits<BlockSparseMatrixT>::CoeffReadCost
+  };
+};
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct
+  : public ProductBase<BlockSparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
+{
+  public:
+    EIGEN_PRODUCT_PUBLIC_INTERFACE(BlockSparseTimeDenseProduct)
+
+    BlockSparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    {}
+
+    template<typename Dest> void scaleAndAddTo(Dest& dest, const typename Rhs::Scalar& alpha) const
+    {
+      BlockVectorReturn<Lhs,Dest> tmpDest(m_lhs, dest);
+      internal::sparse_time_dense_product( BlockSparseMatrixView<Lhs>(m_lhs),  BlockVectorView<Lhs, Rhs>(m_lhs, m_rhs), tmpDest, alpha);
+    }
+
+  private:
+    BlockSparseTimeDenseProduct& operator=(const BlockSparseTimeDenseProduct&);
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix : public SparseMatrixBase<BlockSparseMatrix<_Scalar,_BlockAtCompileTime, _Options,_StorageIndex> >
+{
+  public:
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef _StorageIndex StorageIndex;
+    typedef typename internal::ref_selector<BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex> >::type Nested;
+
+    enum {
+      Options = _Options,
+      Flags = Options,
+      BlockSize=_BlockAtCompileTime,
+      RowsAtCompileTime = Dynamic,
+      ColsAtCompileTime = Dynamic,
+      MaxRowsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic,
+      IsVectorAtCompileTime = 0,
+      IsColMajor = Flags&RowMajorBit ? 0 : 1
+    };
+    typedef Matrix<Scalar, _BlockAtCompileTime, _BlockAtCompileTime,IsColMajor ? ColMajor : RowMajor> BlockScalar;
+    typedef Matrix<RealScalar, _BlockAtCompileTime, _BlockAtCompileTime,IsColMajor ? ColMajor : RowMajor> BlockRealScalar;
+    typedef typename internal::conditional<_BlockAtCompileTime==Dynamic, Scalar, BlockScalar>::type BlockScalarReturnType;
+    typedef BlockSparseMatrix<Scalar, BlockSize, IsColMajor ? ColMajor : RowMajor, StorageIndex> PlainObject;
+  public:
+    // Default constructor
+    BlockSparseMatrix()
+    : m_innerBSize(0),m_outerBSize(0),m_innerOffset(0),m_outerOffset(0),
+      m_nonzerosblocks(0),m_values(0),m_blockPtr(0),m_indices(0),
+      m_outerIndex(0),m_blockSize(BlockSize)
+    { }
+
+
+    /**
+     * \brief Construct and resize
+     *
+     */
+    BlockSparseMatrix(Index brow, Index bcol)
+      : m_innerBSize(IsColMajor ? brow : bcol),
+        m_outerBSize(IsColMajor ? bcol : brow),
+        m_innerOffset(0),m_outerOffset(0),m_nonzerosblocks(0),
+        m_values(0),m_blockPtr(0),m_indices(0),
+        m_outerIndex(0),m_blockSize(BlockSize)
+    { }
+
+    /**
+     * \brief Copy-constructor
+     */
+    BlockSparseMatrix(const BlockSparseMatrix& other)
+      : m_innerBSize(other.m_innerBSize),m_outerBSize(other.m_outerBSize),
+        m_nonzerosblocks(other.m_nonzerosblocks),m_nonzeros(other.m_nonzeros),
+        m_blockPtr(0),m_blockSize(other.m_blockSize)
+    {
+      // should we allow copying between variable-size blocks and fixed-size blocks ??
+      eigen_assert(m_blockSize == BlockSize && " CAN NOT COPY BETWEEN FIXED-SIZE AND VARIABLE-SIZE BLOCKS");
+
+      std::copy(other.m_innerOffset, other.m_innerOffset+m_innerBSize+1, m_innerOffset);
+      std::copy(other.m_outerOffset, other.m_outerOffset+m_outerBSize+1, m_outerOffset);
+      std::copy(other.m_values, other.m_values+m_nonzeros, m_values);
+
+      if(m_blockSize != Dynamic)
+        std::copy(other.m_blockPtr, other.m_blockPtr+m_nonzerosblocks, m_blockPtr);
+
+      std::copy(other.m_indices, other.m_indices+m_nonzerosblocks, m_indices);
+      std::copy(other.m_outerIndex, other.m_outerIndex+m_outerBSize, m_outerIndex);
+    }
+
+    friend void swap(BlockSparseMatrix& first, BlockSparseMatrix& second)
+    {
+      std::swap(first.m_innerBSize, second.m_innerBSize);
+      std::swap(first.m_outerBSize, second.m_outerBSize);
+      std::swap(first.m_innerOffset, second.m_innerOffset);
+      std::swap(first.m_outerOffset, second.m_outerOffset);
+      std::swap(first.m_nonzerosblocks, second.m_nonzerosblocks);
+      std::swap(first.m_nonzeros, second.m_nonzeros);
+      std::swap(first.m_values, second.m_values);
+      std::swap(first.m_blockPtr, second.m_blockPtr);
+      std::swap(first.m_indices, second.m_indices);
+      std::swap(first.m_outerIndex, second.m_outerIndex);
+      std::swap(first.m_BlockSize, second.m_blockSize);
+    }
+
+    BlockSparseMatrix& operator=(BlockSparseMatrix other)
+    {
+      //Copy-and-swap paradigm ... avoid leaked data if thrown
+      swap(*this, other);
+      return *this;
+    }
+
+    // Destructor
+    ~BlockSparseMatrix()
+    {
+      delete[] m_outerIndex;
+      delete[] m_innerOffset;
+      delete[] m_outerOffset;
+      delete[] m_indices;
+      delete[] m_blockPtr;
+      delete[] m_values;
+    }
+
+
+    /**
+      * \brief Constructor from a sparse matrix
+      *
+      */
+    template<typename MatrixType>
+    inline BlockSparseMatrix(const MatrixType& spmat) : m_blockSize(BlockSize)
+    {
+      EIGEN_STATIC_ASSERT((m_blockSize != Dynamic), THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE);
+
+      *this = spmat;
+    }
+
+    /**
+      * \brief Assignment from a sparse matrix with the same storage order
+      *
+      * Convert from a sparse matrix to block sparse matrix.
+      * \warning Before calling this function, tt is necessary to call
+      * either setBlockLayout() (matrices with variable-size blocks)
+      * or setBlockSize() (for fixed-size blocks).
+      */
+    template<typename MatrixType>
+    inline BlockSparseMatrix& operator=(const MatrixType& spmat)
+    {
+      eigen_assert((m_innerBSize != 0 && m_outerBSize != 0)
+                   && "Trying to assign to a zero-size matrix, call resize() first");
+      eigen_assert(((MatrixType::Options&RowMajorBit) != IsColMajor) && "Wrong storage order");
+      typedef SparseMatrix<bool,MatrixType::Options,typename MatrixType::Index> MatrixPatternType;
+      MatrixPatternType  blockPattern(blockRows(), blockCols());
+      m_nonzeros = 0;
+
+      // First, compute the number of nonzero blocks and their locations
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        // Browse each outer block and compute the structure
+        std::vector<bool> nzblocksFlag(m_innerBSize,false);  // Record the existing blocks
+        blockPattern.startVec(bj);
+        for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j)
+        {
+          typename MatrixType::InnerIterator it_spmat(spmat, j);
+          for(; it_spmat; ++it_spmat)
+          {
+            StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block
+            if(!nzblocksFlag[bi])
+            {
+              // Save the index of this nonzero block
+              nzblocksFlag[bi] = true;
+              blockPattern.insertBackByOuterInnerUnordered(bj, bi) = true;
+              // Compute the total number of nonzeros (including explicit zeros in blocks)
+              m_nonzeros += blockOuterSize(bj) * blockInnerSize(bi);
+            }
+          }
+        } // end current outer block
+      }
+      blockPattern.finalize();
+
+      // Allocate the internal arrays
+      setBlockStructure(blockPattern);
+
+      for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        // Now copy the values
+        for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j)
+        {
+          // Browse the outer block column by column (for column-major matrices)
+          typename MatrixType::InnerIterator it_spmat(spmat, j);
+          for(; it_spmat; ++it_spmat)
+          {
+            StorageIndex idx = 0; // Position of this block in the column block
+            StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block
+            // Go to the inner block where this element belongs to
+            while(bi > m_indices[m_outerIndex[bj]+idx]) ++idx; // Not expensive for ordered blocks
+            StorageIndex idxVal;// Get the right position in the array of values for this element
+            if(m_blockSize == Dynamic)
+            {
+              // Offset from all blocks before ...
+              idxVal =  m_blockPtr[m_outerIndex[bj]+idx];
+              // ... and offset inside the block
+              idxVal += (j - blockOuterIndex(bj)) * blockOuterSize(bj) + it_spmat.index() - m_innerOffset[bi];
+            }
+            else
+            {
+              // All blocks before
+              idxVal = (m_outerIndex[bj] + idx) * m_blockSize * m_blockSize;
+              // inside the block
+              idxVal += (j - blockOuterIndex(bj)) * m_blockSize + (it_spmat.index()%m_blockSize);
+            }
+            // Insert the value
+            m_values[idxVal] = it_spmat.value();
+          } // end of this column
+        } // end of this block
+      } // end of this outer block
+
+      return *this;
+    }
+
+    /**
+      * \brief Set the nonzero block pattern of the matrix
+      *
+      * Given a sparse matrix describing the nonzero block pattern,
+      * this function prepares the internal pointers for values.
+      * After calling this function, any *nonzero* block (bi, bj) can be set
+      * with a simple call to coeffRef(bi,bj).
+      *
+      *
+      * \warning Before calling this function, tt is necessary to call
+      * either setBlockLayout() (matrices with variable-size blocks)
+      * or setBlockSize() (for fixed-size blocks).
+      *
+      * \param blockPattern Sparse matrix of boolean elements describing the block structure
+      *
+      * \sa setBlockLayout() \sa setBlockSize()
+      */
+    template<typename MatrixType>
+    void setBlockStructure(const MatrixType& blockPattern)
+    {
+      resize(blockPattern.rows(), blockPattern.cols());
+      reserve(blockPattern.nonZeros());
+
+      // Browse the block pattern and set up the various pointers
+      m_outerIndex[0] = 0;
+      if(m_blockSize == Dynamic) m_blockPtr[0] = 0;
+      for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        //Browse each outer block
+
+        //First, copy and save the indices of nonzero blocks
+        //FIXME : find a way to avoid this ...
+        std::vector<int> nzBlockIdx;
+        typename MatrixType::InnerIterator it(blockPattern, bj);
+        for(; it; ++it)
+        {
+          nzBlockIdx.push_back(it.index());
+        }
+        std::sort(nzBlockIdx.begin(), nzBlockIdx.end());
+
+        // Now, fill block indices and (eventually) pointers to blocks
+        for(StorageIndex idx = 0; idx < nzBlockIdx.size(); ++idx)
+        {
+          StorageIndex offset = m_outerIndex[bj]+idx; // offset in m_indices
+          m_indices[offset] = nzBlockIdx[idx];
+          if(m_blockSize == Dynamic)
+            m_blockPtr[offset] = m_blockPtr[offset-1] + blockInnerSize(nzBlockIdx[idx]) * blockOuterSize(bj);
+          // There is no blockPtr for fixed-size blocks... not needed !???
+        }
+        // Save the pointer to the next outer block
+        m_outerIndex[bj+1] = m_outerIndex[bj] + nzBlockIdx.size();
+      }
+    }
+
+    /**
+      * \brief Set the number of rows and columns blocks
+      */
+    inline void resize(Index brow, Index bcol)
+    {
+      m_innerBSize = IsColMajor ? brow : bcol;
+      m_outerBSize = IsColMajor ? bcol : brow;
+    }
+
+    /**
+      * \brief set the block size at runtime for fixed-size block layout
+      *
+      * Call this only for fixed-size blocks
+      */
+    inline void setBlockSize(Index blockSize)
+    {
+      m_blockSize = blockSize;
+    }
+
+    /**
+      * \brief Set the row and column block layouts,
+      *
+      * This function set the size of each row and column block.
+      * So this function should be used only for blocks with variable size.
+      * \param rowBlocks : Number of rows per row block
+      * \param colBlocks : Number of columns per column block
+      * \sa resize(), setBlockSize()
+      */
+    inline void setBlockLayout(const VectorXi& rowBlocks, const VectorXi& colBlocks)
+    {
+      const VectorXi& innerBlocks = IsColMajor ? rowBlocks : colBlocks;
+      const VectorXi& outerBlocks = IsColMajor ? colBlocks : rowBlocks;
+      eigen_assert(m_innerBSize == innerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+      eigen_assert(m_outerBSize == outerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+      m_outerBSize = outerBlocks.size();
+      //  starting index of blocks... cumulative sums
+      m_innerOffset = new StorageIndex[m_innerBSize+1];
+      m_outerOffset = new StorageIndex[m_outerBSize+1];
+      m_innerOffset[0] = 0;
+      m_outerOffset[0] = 0;
+      std::partial_sum(&innerBlocks[0], &innerBlocks[m_innerBSize-1]+1, &m_innerOffset[1]);
+      std::partial_sum(&outerBlocks[0], &outerBlocks[m_outerBSize-1]+1, &m_outerOffset[1]);
+
+      // Compute the total number of nonzeros
+      m_nonzeros = 0;
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+        for(StorageIndex bi = 0; bi < m_innerBSize; ++bi)
+          m_nonzeros += outerBlocks[bj] * innerBlocks[bi];
+
+    }
+
+    /**
+      * \brief Allocate the internal array of pointers to blocks and their inner indices
+      *
+      * \note For fixed-size blocks, call setBlockSize() to set the block.
+      * And For variable-size blocks, call setBlockLayout() before using this function
+      *
+      * \param nonzerosblocks Number of nonzero blocks. The total number of nonzeros is
+      * is computed in setBlockLayout() for variable-size blocks
+      * \sa setBlockSize()
+      */
+    inline void reserve(const Index nonzerosblocks)
+    {
+      eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) &&
+          "TRYING TO RESERVE ZERO-SIZE MATRICES, CALL resize() first");
+
+      //FIXME Should free if already allocated
+      m_outerIndex = new StorageIndex[m_outerBSize+1];
+
+      m_nonzerosblocks = nonzerosblocks;
+      if(m_blockSize != Dynamic)
+      {
+        m_nonzeros = nonzerosblocks * (m_blockSize * m_blockSize);
+        m_blockPtr = 0;
+      }
+      else
+      {
+        // m_nonzeros  is already computed in setBlockLayout()
+        m_blockPtr = new StorageIndex[m_nonzerosblocks+1];
+      }
+      m_indices = new StorageIndex[m_nonzerosblocks+1];
+      m_values = new Scalar[m_nonzeros];
+    }
+
+
+    /**
+      * \brief Fill values in a matrix  from a triplet list.
+      *
+      * Each triplet item has a block stored in an Eigen dense matrix.
+      * The InputIterator class should provide the functions row(), col() and value()
+      *
+      * \note For fixed-size blocks, call setBlockSize() before this function.
+      *
+      * FIXME Do not accept duplicates
+      */
+    template<typename InputIterator>
+    void setFromTriplets(const InputIterator& begin, const InputIterator& end)
+    {
+      eigen_assert((m_innerBSize!=0 && m_outerBSize !=0) && "ZERO BLOCKS, PLEASE CALL resize() before");
+
+      /* First, sort the triplet list
+        * FIXME This can be unnecessarily expensive since only the inner indices have to be sorted
+        * The best approach is like in SparseMatrix::setFromTriplets()
+        */
+      internal::TripletComp<InputIterator, IsColMajor> tripletcomp;
+      std::sort(begin, end, tripletcomp);
+
+      /* Count the number of rows and column blocks,
+       * and the number of nonzero blocks per outer dimension
+       */
+      VectorXi rowBlocks(m_innerBSize); // Size of each block row
+      VectorXi colBlocks(m_outerBSize); // Size of each block column
+      rowBlocks.setZero(); colBlocks.setZero();
+      VectorXi nzblock_outer(m_outerBSize); // Number of nz blocks per outer vector
+      VectorXi nz_outer(m_outerBSize); // Number of nz per outer vector...for variable-size blocks
+      nzblock_outer.setZero();
+      nz_outer.setZero();
+      for(InputIterator it(begin); it !=end; ++it)
+      {
+        eigen_assert(it->row() >= 0 && it->row() < this->blockRows() && it->col() >= 0 && it->col() < this->blockCols());
+        eigen_assert((it->value().rows() == it->value().cols() && (it->value().rows() == m_blockSize))
+                     || (m_blockSize == Dynamic));
+
+        if(m_blockSize == Dynamic)
+        {
+          eigen_assert((rowBlocks[it->row()] == 0 || rowBlocks[it->row()] == it->value().rows()) &&
+              "NON CORRESPONDING SIZES FOR ROW BLOCKS");
+          eigen_assert((colBlocks[it->col()] == 0 || colBlocks[it->col()] == it->value().cols()) &&
+              "NON CORRESPONDING SIZES FOR COLUMN BLOCKS");
+          rowBlocks[it->row()] =it->value().rows();
+          colBlocks[it->col()] = it->value().cols();
+        }
+        nz_outer(IsColMajor ? it->col() : it->row()) += it->value().rows() * it->value().cols();
+        nzblock_outer(IsColMajor ? it->col() : it->row())++;
+      }
+      // Allocate member arrays
+      if(m_blockSize == Dynamic) setBlockLayout(rowBlocks, colBlocks);
+      StorageIndex nzblocks = nzblock_outer.sum();
+      reserve(nzblocks);
+
+       // Temporary markers
+      VectorXi block_id(m_outerBSize); // To be used as a block marker during insertion
+
+      // Setup outer index pointers and markers
+      m_outerIndex[0] = 0;
+      if (m_blockSize == Dynamic)  m_blockPtr[0] =  0;
+      for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      {
+        m_outerIndex[bj+1] = m_outerIndex[bj] + nzblock_outer(bj);
+        block_id(bj) = m_outerIndex[bj];
+        if(m_blockSize==Dynamic)
+        {
+          m_blockPtr[m_outerIndex[bj+1]] = m_blockPtr[m_outerIndex[bj]] + nz_outer(bj);
+        }
+      }
+
+      // Fill the matrix
+      for(InputIterator it(begin); it!=end; ++it)
+      {
+        StorageIndex outer = IsColMajor ? it->col() : it->row();
+        StorageIndex inner = IsColMajor ? it->row() : it->col();
+        m_indices[block_id(outer)] = inner;
+        StorageIndex block_size = it->value().rows()*it->value().cols();
+        StorageIndex nz_marker = blockPtr(block_id[outer]);
+        memcpy(&(m_values[nz_marker]), it->value().data(), block_size * sizeof(Scalar));
+        if(m_blockSize == Dynamic)
+        {
+          m_blockPtr[block_id(outer)+1] = m_blockPtr[block_id(outer)] + block_size;
+        }
+        block_id(outer)++;
+      }
+
+      // An alternative when the outer indices are sorted...no need to use an array of markers
+//      for(Index bcol = 0; bcol < m_outerBSize; ++bcol)
+//      {
+//      Index id = 0, id_nz = 0, id_nzblock = 0;
+//      for(InputIterator it(begin); it!=end; ++it)
+//      {
+//        while (id<bcol) // one pass should do the job unless there are empty columns
+//        {
+//          id++;
+//          m_outerIndex[id+1]=m_outerIndex[id];
+//        }
+//        m_outerIndex[id+1] += 1;
+//        m_indices[id_nzblock]=brow;
+//        Index block_size = it->value().rows()*it->value().cols();
+//        m_blockPtr[id_nzblock+1] = m_blockPtr[id_nzblock] + block_size;
+//        id_nzblock++;
+//        memcpy(&(m_values[id_nz]),it->value().data(), block_size*sizeof(Scalar));
+//        id_nz += block_size;
+//      }
+//      while(id < m_outerBSize-1) // Empty columns at the end
+//      {
+//        id++;
+//        m_outerIndex[id+1]=m_outerIndex[id];
+//      }
+//      }
+    }
+
+
+    /**
+      * \returns the number of rows
+      */
+    inline Index rows() const
+    {
+//      return blockRows();
+      return (IsColMajor ? innerSize() : outerSize());
+    }
+
+    /**
+      * \returns the number of cols
+      */
+    inline Index cols() const
+    {
+//      return blockCols();
+      return (IsColMajor ? outerSize() : innerSize());
+    }
+
+    inline Index innerSize() const
+    {
+      if(m_blockSize == Dynamic) return m_innerOffset[m_innerBSize];
+      else return  (m_innerBSize * m_blockSize) ;
+    }
+
+    inline Index outerSize() const
+    {
+      if(m_blockSize == Dynamic) return m_outerOffset[m_outerBSize];
+      else return  (m_outerBSize * m_blockSize) ;
+    }
+    /** \returns the number of rows grouped by blocks */
+    inline Index blockRows() const
+    {
+      return (IsColMajor ? m_innerBSize : m_outerBSize);
+    }
+    /** \returns the number of columns grouped by blocks */
+    inline Index blockCols() const
+    {
+      return (IsColMajor ? m_outerBSize : m_innerBSize);
+    }
+
+    inline Index outerBlocks() const { return m_outerBSize; }
+    inline Index innerBlocks() const { return m_innerBSize; }
+
+    /** \returns the block index where outer belongs to */
+    inline Index outerToBlock(Index outer) const
+    {
+      eigen_assert(outer < outerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+      if(m_blockSize != Dynamic)
+        return (outer / m_blockSize); // Integer division
+
+      StorageIndex b_outer = 0;
+      while(m_outerOffset[b_outer] <= outer) ++b_outer;
+      return b_outer - 1;
+    }
+    /** \returns  the block index where inner belongs to */
+    inline Index innerToBlock(Index inner) const
+    {
+      eigen_assert(inner < innerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+      if(m_blockSize != Dynamic)
+        return (inner / m_blockSize); // Integer division
+
+      StorageIndex b_inner = 0;
+      while(m_innerOffset[b_inner] <= inner) ++b_inner;
+      return b_inner - 1;
+    }
+
+    /**
+      *\returns a reference to the (i,j) block as an Eigen Dense Matrix
+      */
+    Ref<BlockScalar> coeffRef(Index brow, Index bcol)
+    {
+      eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+      eigen_assert(bcol < blockCols() && "BLOCK nzblocksFlagCOLUMN OUT OF BOUNDS");
+
+      StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol);
+      StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+      StorageIndex inner = IsColMajor ? brow : bcol;
+      StorageIndex outer = IsColMajor ? bcol : brow;
+      StorageIndex offset = m_outerIndex[outer];
+      while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner)
+        offset++;
+      if(m_indices[offset] == inner)
+      {
+        return Map<BlockScalar>(&(m_values[blockPtr(offset)]), rsize, csize);
+      }
+      else
+      {
+        //FIXME the block does not exist, Insert it !!!!!!!!!
+        eigen_assert("DYNAMIC INSERTION IS NOT YET SUPPORTED");
+      }
+    }
+
+    /**
+      * \returns the value of the (i,j) block as an Eigen Dense Matrix
+      */
+    Map<const BlockScalar> coeff(Index brow, Index bcol) const
+    {
+      eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+      eigen_assert(bcol < blockCols() && "BLOCK COLUMN OUT OF BOUNDS");
+
+      StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol);
+      StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+      StorageIndex inner = IsColMajor ? brow : bcol;
+      StorageIndex outer = IsColMajor ? bcol : brow;
+      StorageIndex offset = m_outerIndex[outer];
+      while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner) offset++;
+      if(m_indices[offset] == inner)
+      {
+        return Map<const BlockScalar> (&(m_values[blockPtr(offset)]), rsize, csize);
+      }
+      else
+//        return BlockScalar::Zero(rsize, csize);
+        eigen_assert("NOT YET SUPPORTED");
+    }
+
+    // Block Matrix times vector product
+    template<typename VecType>
+    BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType> operator*(const VecType& lhs) const
+    {
+      return BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType>(*this, lhs);
+    }
+
+    /** \returns the number of nonzero blocks */
+    inline Index nonZerosBlocks() const { return m_nonzerosblocks; }
+    /** \returns the total number of nonzero elements, including eventual explicit zeros in blocks */
+    inline Index nonZeros() const { return m_nonzeros; }
+
+    inline BlockScalarReturnType *valuePtr() {return static_cast<BlockScalarReturnType *>(m_values);}
+//    inline Scalar *valuePtr(){ return m_values; }
+    inline StorageIndex *innerIndexPtr() {return m_indices; }
+    inline const StorageIndex *innerIndexPtr() const {return m_indices; }
+    inline StorageIndex *outerIndexPtr() {return m_outerIndex; }
+    inline const StorageIndex* outerIndexPtr() const {return m_outerIndex; }
+
+    /** \brief for compatibility purposes with the SparseMatrix class */
+    inline bool isCompressed() const {return true;}
+    /**
+      * \returns the starting index of the bi row block
+      */
+    inline Index blockRowsIndex(Index bi) const
+    {
+      return IsColMajor ? blockInnerIndex(bi) : blockOuterIndex(bi);
+    }
+
+    /**
+      * \returns the starting index of the bj col block
+      */
+    inline Index blockColsIndex(Index bj) const
+    {
+      return IsColMajor ? blockOuterIndex(bj) : blockInnerIndex(bj);
+    }
+
+    inline Index blockOuterIndex(Index bj) const
+    {
+      return (m_blockSize == Dynamic) ? m_outerOffset[bj] : (bj * m_blockSize);
+    }
+    inline Index blockInnerIndex(Index bi) const
+    {
+      return (m_blockSize == Dynamic) ? m_innerOffset[bi] : (bi * m_blockSize);
+    }
+
+    // Not needed ???
+    inline Index blockInnerSize(Index bi) const
+    {
+      return (m_blockSize == Dynamic) ? (m_innerOffset[bi+1] - m_innerOffset[bi]) : m_blockSize;
+    }
+    inline Index blockOuterSize(Index bj) const
+    {
+      return (m_blockSize == Dynamic) ? (m_outerOffset[bj+1]- m_outerOffset[bj]) : m_blockSize;
+    }
+
+    /**
+      * \brief Browse the matrix by outer index
+      */
+    class InnerIterator; // Browse column by column
+
+    /**
+      * \brief Browse the matrix by block outer index
+      */
+    class BlockInnerIterator; // Browse block by block
+
+    friend std::ostream & operator << (std::ostream & s, const BlockSparseMatrix& m)
+    {
+      for (StorageIndex j = 0; j < m.outerBlocks(); ++j)
+      {
+        BlockInnerIterator itb(m, j);
+        for(; itb; ++itb)
+        {
+          s << "("<<itb.row() << ", " << itb.col() << ")\n";
+          s << itb.value() <<"\n";
+        }
+      }
+      s << std::endl;
+      return s;
+    }
+
+    /**
+      * \returns the starting position of the block \p id in the array of values
+      */
+    Index blockPtr(Index id) const
+    {
+      if(m_blockSize == Dynamic) return m_blockPtr[id];
+      else return id * m_blockSize * m_blockSize;
+      //return blockDynIdx(id, typename internal::conditional<(BlockSize==Dynamic), internal::true_type, internal::false_type>::type());
+    }
+
+
+  protected:
+//    inline Index blockDynIdx(Index id, internal::true_type) const
+//    {
+//      return m_blockPtr[id];
+//    }
+//    inline Index blockDynIdx(Index id, internal::false_type) const
+//    {
+//      return id * BlockSize * BlockSize;
+//    }
+
+    // To be implemented
+    // Insert a block at a particular location... need to make a room for that
+    Map<BlockScalar> insert(Index brow, Index bcol);
+
+    Index m_innerBSize; // Number of block rows
+    Index m_outerBSize; // Number of block columns
+    StorageIndex *m_innerOffset; // Starting index of each inner block (size m_innerBSize+1)
+    StorageIndex *m_outerOffset; // Starting index of each outer block (size m_outerBSize+1)
+    Index m_nonzerosblocks; // Total nonzeros blocks (lower than  m_innerBSize x m_outerBSize)
+    Index m_nonzeros; // Total nonzeros elements
+    Scalar *m_values; //Values stored block column after block column (size m_nonzeros)
+    StorageIndex *m_blockPtr; // Pointer to the beginning of each block in m_values, size m_nonzeroblocks ... null for fixed-size blocks
+    StorageIndex *m_indices; //Inner block indices, size m_nonzerosblocks ... OK
+    StorageIndex *m_outerIndex; // Starting pointer of each block column in m_indices (size m_outerBSize)... OK
+    Index m_blockSize; // Size of a block for fixed-size blocks, otherwise -1
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::BlockInnerIterator
+{
+  public:
+
+    enum{
+      Flags = _Options
+    };
+
+    BlockInnerIterator(const BlockSparseMatrix& mat, const Index outer)
+    : m_mat(mat),m_outer(outer),
+      m_id(mat.m_outerIndex[outer]),
+      m_end(mat.m_outerIndex[outer+1])
+    {
+    }
+
+    inline BlockInnerIterator& operator++() {m_id++; return *this; }
+
+    inline const Map<const BlockScalar> value() const
+    {
+      return Map<const BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]),
+          rows(),cols());
+    }
+    inline Map<BlockScalar> valueRef()
+    {
+      return Map<BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]),
+          rows(),cols());
+    }
+    // Block inner index
+    inline Index index() const {return m_mat.m_indices[m_id]; }
+    inline Index outer() const { return m_outer; }
+    // block row index
+    inline Index row() const  {return index(); }
+    // block column index
+    inline Index col() const {return outer(); }
+    // FIXME Number of rows in the current block
+    inline Index rows() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_innerOffset[index()+1] - m_mat.m_innerOffset[index()]) : m_mat.m_blockSize; }
+    // Number of columns in the current block ...
+    inline Index cols() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_outerOffset[m_outer+1]-m_mat.m_outerOffset[m_outer]) : m_mat.m_blockSize;}
+    inline operator bool() const { return (m_id < m_end); }
+
+  protected:
+    const BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, StorageIndex>& m_mat;
+    const Index m_outer;
+    Index m_id;
+    Index m_end;
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::InnerIterator
+{
+  public:
+    InnerIterator(const BlockSparseMatrix& mat, Index outer)
+    : m_mat(mat),m_outerB(mat.outerToBlock(outer)),m_outer(outer),
+      itb(mat, mat.outerToBlock(outer)),
+      m_offset(outer - mat.blockOuterIndex(m_outerB))
+     {
+        if (itb)
+        {
+          m_id = m_mat.blockInnerIndex(itb.index());
+          m_start = m_id;
+          m_end = m_mat.blockInnerIndex(itb.index()+1);
+        }
+     }
+    inline InnerIterator& operator++()
+    {
+      m_id++;
+      if (m_id >= m_end)
+      {
+        ++itb;
+        if (itb)
+        {
+          m_id = m_mat.blockInnerIndex(itb.index());
+          m_start = m_id;
+          m_end = m_mat.blockInnerIndex(itb.index()+1);
+        }
+      }
+      return *this;
+    }
+    inline const Scalar& value() const
+    {
+      return itb.value().coeff(m_id - m_start, m_offset);
+    }
+    inline Scalar& valueRef()
+    {
+      return itb.valueRef().coeff(m_id - m_start, m_offset);
+    }
+    inline Index index() const { return m_id; }
+    inline Index outer() const {return m_outer; }
+    inline Index col() const {return outer(); }
+    inline Index row() const { return index();}
+    inline operator bool() const
+    {
+      return itb;
+    }
+  protected:
+    const BlockSparseMatrix& m_mat;
+    const Index m_outer;
+    const Index m_outerB;
+    BlockInnerIterator itb; // Iterator through the blocks
+    const Index m_offset; // Position of this column in the block
+    Index m_start; // starting inner index of this block
+    Index m_id; // current inner index in the block
+    Index m_end; // starting inner index of the next block
+
+};
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEBLOCKMATRIX_H

diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
new file mode 100644
index 0000000..42c99e4
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h

@@ -0,0 +1,404 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DYNAMIC_SPARSEMATRIX_H
+#define EIGEN_DYNAMIC_SPARSEMATRIX_H
+
+namespace Eigen { 
+
+/** \deprecated use a SparseMatrix in an uncompressed mode
+  *
+  * \class DynamicSparseMatrix
+  *
+  * \brief A sparse matrix class designed for matrix assembly purpose
+  *
+  * \param _Scalar the scalar type, i.e. the type of the coefficients
+  *
+  * Unlike SparseMatrix, this class provides a much higher degree of flexibility. In particular, it allows
+  * random read/write accesses in log(rho*outer_size) where \c rho is the probability that a coefficient is
+  * nonzero and outer_size is the number of columns if the matrix is column-major and the number of rows
+  * otherwise.
+  *
+  * Internally, the data are stored as a std::vector of compressed vector. The performances of random writes might
+  * decrease as the number of nonzeros per inner-vector increase. In practice, we observed very good performance
+  * till about 100 nonzeros/vector, and the performance remains relatively good till 500 nonzeros/vectors.
+  *
+  * \see SparseMatrix
+  */
+
+namespace internal {
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<DynamicSparseMatrix<_Scalar, _Options, _StorageIndex> >
+{
+  typedef _Scalar Scalar;
+  typedef _StorageIndex StorageIndex;
+  typedef Sparse StorageKind;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    Flags = _Options | NestByRefBit | LvalueBit,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = OuterRandomAccessPattern
+  };
+};
+}
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+ class  DynamicSparseMatrix
+  : public SparseMatrixBase<DynamicSparseMatrix<_Scalar, _Options, _StorageIndex> >
+{
+    typedef SparseMatrixBase<DynamicSparseMatrix> Base;
+    using Base::convert_index;
+  public:
+    EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix)
+    // FIXME: why are these operator already alvailable ???
+    // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, +=)
+    // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, -=)
+    typedef MappedSparseMatrix<Scalar,Flags> Map;
+    using Base::IsRowMajor;
+    using Base::operator=;
+    enum {
+      Options = _Options
+    };
+
+  protected:
+
+    typedef DynamicSparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0), StorageIndex> TransposedSparseMatrix;
+
+    Index m_innerSize;
+    std::vector<internal::CompressedStorage<Scalar,StorageIndex> > m_data;
+
+  public:
+
+    inline Index rows() const { return IsRowMajor ? outerSize() : m_innerSize; }
+    inline Index cols() const { return IsRowMajor ? m_innerSize : outerSize(); }
+    inline Index innerSize() const { return m_innerSize; }
+    inline Index outerSize() const { return convert_index(m_data.size()); }
+    inline Index innerNonZeros(Index j) const { return m_data[j].size(); }
+
+    std::vector<internal::CompressedStorage<Scalar,StorageIndex> >& _data() { return m_data; }
+    const std::vector<internal::CompressedStorage<Scalar,StorageIndex> >& _data() const { return m_data; }
+
+    /** \returns the coefficient value at given position \a row, \a col
+      * This operation involes a log(rho*outer_size) binary search.
+      */
+    inline Scalar coeff(Index row, Index col) const
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      return m_data[outer].at(inner);
+    }
+
+    /** \returns a reference to the coefficient value at given position \a row, \a col
+      * This operation involes a log(rho*outer_size) binary search. If the coefficient does not
+      * exist yet, then a sorted insertion into a sequential buffer is performed.
+      */
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      return m_data[outer].atWithInsertion(inner);
+    }
+
+    class InnerIterator;
+    class ReverseInnerIterator;
+
+    void setZero()
+    {
+      for (Index j=0; j<outerSize(); ++j)
+        m_data[j].clear();
+    }
+
+    /** \returns the number of non zero coefficients */
+    Index nonZeros() const
+    {
+      Index res = 0;
+      for (Index j=0; j<outerSize(); ++j)
+        res += m_data[j].size();
+      return res;
+    }
+
+
+
+    void reserve(Index reserveSize = 1000)
+    {
+      if (outerSize()>0)
+      {
+        Index reserveSizePerVector = (std::max)(reserveSize/outerSize(),Index(4));
+        for (Index j=0; j<outerSize(); ++j)
+        {
+          m_data[j].reserve(reserveSizePerVector);
+        }
+      }
+    }
+
+    /** Does nothing: provided for compatibility with SparseMatrix */
+    inline void startVec(Index /*outer*/) {}
+
+    /** \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
+      * - the nonzero does not already exist
+      * - the new coefficient is the last one of the given inner vector.
+      *
+      * \sa insert, insertBackByOuterInner */
+    inline Scalar& insertBack(Index row, Index col)
+    {
+      return insertBackByOuterInner(IsRowMajor?row:col, IsRowMajor?col:row);
+    }
+
+    /** \sa insertBack */
+    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
+    {
+      eigen_assert(outer<Index(m_data.size()) && inner<m_innerSize && "out of range");
+      eigen_assert(((m_data[outer].size()==0) || (m_data[outer].index(m_data[outer].size()-1)<inner))
+                && "wrong sorted insertion");
+      m_data[outer].append(0, inner);
+      return m_data[outer].value(m_data[outer].size()-1);
+    }
+
+    inline Scalar& insert(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index startId = 0;
+      Index id = static_cast<Index>(m_data[outer].size()) - 1;
+      m_data[outer].resize(id+2,1);
+
+      while ( (id >= startId) && (m_data[outer].index(id) > inner) )
+      {
+        m_data[outer].index(id+1) = m_data[outer].index(id);
+        m_data[outer].value(id+1) = m_data[outer].value(id);
+        --id;
+      }
+      m_data[outer].index(id+1) = inner;
+      m_data[outer].value(id+1) = 0;
+      return m_data[outer].value(id+1);
+    }
+
+    /** Does nothing: provided for compatibility with SparseMatrix */
+    inline void finalize() {}
+
+    /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
+    void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
+    {
+      for (Index j=0; j<outerSize(); ++j)
+        m_data[j].prune(reference,epsilon);
+    }
+
+    /** Resize the matrix without preserving the data (the matrix is set to zero)
+      */
+    void resize(Index rows, Index cols)
+    {
+      const Index outerSize = IsRowMajor ? rows : cols;
+      m_innerSize = convert_index(IsRowMajor ? cols : rows);
+      setZero();
+      if (Index(m_data.size()) != outerSize)
+      {
+        m_data.resize(outerSize);
+      }
+    }
+
+    void resizeAndKeepData(Index rows, Index cols)
+    {
+      const Index outerSize = IsRowMajor ? rows : cols;
+      const Index innerSize = IsRowMajor ? cols : rows;
+      if (m_innerSize>innerSize)
+      {
+        // remove all coefficients with innerCoord>=innerSize
+        // TODO
+        //std::cerr << "not implemented yet\n";
+        exit(2);
+      }
+      if (m_data.size() != outerSize)
+      {
+        m_data.resize(outerSize);
+      }
+    }
+
+    /** The class DynamicSparseMatrix is deprecated */
+    EIGEN_DEPRECATED inline DynamicSparseMatrix()
+      : m_innerSize(0), m_data(0)
+    {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
+      eigen_assert(innerSize()==0 && outerSize()==0);
+    }
+
+    /** The class DynamicSparseMatrix is deprecated */
+    EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
+      : m_innerSize(0)
+    {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
+      resize(rows, cols);
+    }
+
+    /** The class DynamicSparseMatrix is deprecated */
+    template<typename OtherDerived>
+    EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
+      : m_innerSize(0)
+    {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
+      Base::operator=(other.derived());
+    }
+
+    inline DynamicSparseMatrix(const DynamicSparseMatrix& other)
+      : Base(), m_innerSize(0)
+    {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
+      *this = other.derived();
+    }
+
+    inline void swap(DynamicSparseMatrix& other)
+    {
+      //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
+      std::swap(m_innerSize, other.m_innerSize);
+      //std::swap(m_outerSize, other.m_outerSize);
+      m_data.swap(other.m_data);
+    }
+
+    inline DynamicSparseMatrix& operator=(const DynamicSparseMatrix& other)
+    {
+      if (other.isRValue())
+      {
+        swap(other.const_cast_derived());
+      }
+      else
+      {
+        resize(other.rows(), other.cols());
+        m_data = other.m_data;
+      }
+      return *this;
+    }
+
+    /** Destructor */
+    inline ~DynamicSparseMatrix() {}
+
+  public:
+
+    /** \deprecated
+      * Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */
+    EIGEN_DEPRECATED void startFill(Index reserveSize = 1000)
+    {
+      setZero();
+      reserve(reserveSize);
+    }
+
+    /** \deprecated use insert()
+      * inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that:
+      *  1 - the coefficient does not exist yet
+      *  2 - this the coefficient with greater inner coordinate for the given outer coordinate.
+      * In other words, assuming \c *this is column-major, then there must not exists any nonzero coefficient of coordinates
+      * \c i \c x \a col such that \c i >= \a row. Otherwise the matrix is invalid.
+      *
+      * \see fillrand(), coeffRef()
+      */
+    EIGEN_DEPRECATED Scalar& fill(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+      return insertBack(outer,inner);
+    }
+
+    /** \deprecated use insert()
+      * Like fill() but with random inner coordinates.
+      * Compared to the generic coeffRef(), the unique limitation is that we assume
+      * the coefficient does not exist yet.
+      */
+    EIGEN_DEPRECATED Scalar& fillrand(Index row, Index col)
+    {
+      return insert(row,col);
+    }
+
+    /** \deprecated use finalize()
+      * Does nothing. Provided for compatibility with SparseMatrix. */
+    EIGEN_DEPRECATED void endFill() {}
+    
+#   ifdef EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
+#     include EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
+#   endif
+ };
+
+template<typename Scalar, int _Options, typename _StorageIndex>
+class DynamicSparseMatrix<Scalar,_Options,_StorageIndex>::InnerIterator : public SparseVector<Scalar,_Options,_StorageIndex>::InnerIterator
+{
+    typedef typename SparseVector<Scalar,_Options,_StorageIndex>::InnerIterator Base;
+  public:
+    InnerIterator(const DynamicSparseMatrix& mat, Index outer)
+      : Base(mat.m_data[outer]), m_outer(outer)
+    {}
+
+    inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
+    inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+    inline Index outer() const { return m_outer; }
+
+  protected:
+    const Index m_outer;
+};
+
+template<typename Scalar, int _Options, typename _StorageIndex>
+class DynamicSparseMatrix<Scalar,_Options,_StorageIndex>::ReverseInnerIterator : public SparseVector<Scalar,_Options,_StorageIndex>::ReverseInnerIterator
+{
+    typedef typename SparseVector<Scalar,_Options,_StorageIndex>::ReverseInnerIterator Base;
+  public:
+    ReverseInnerIterator(const DynamicSparseMatrix& mat, Index outer)
+      : Base(mat.m_data[outer]), m_outer(outer)
+    {}
+
+    inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
+    inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+    inline Index outer() const { return m_outer; }
+
+  protected:
+    const Index m_outer;
+};
+
+namespace internal {
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator_base<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> >
+{
+  typedef _Scalar Scalar;
+  typedef DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType;
+  typedef typename SparseMatrixType::InnerIterator InnerIterator;
+  typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<_Scalar>::ReadCost,
+    Flags = SparseMatrixType::Flags
+  };
+  
+  evaluator() : m_matrix(0) {}
+  evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {}
+  
+  operator SparseMatrixType&() { return m_matrix->const_cast_derived(); }
+  operator const SparseMatrixType&() const { return *m_matrix; }
+  
+  Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); }
+  
+  Index nonZerosEstimate() const { return m_matrix->nonZeros(); }
+
+  const SparseMatrixType *m_matrix;
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DYNAMIC_SPARSEMATRIX_H

diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
new file mode 100644
index 0000000..dd786d5
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h

@@ -0,0 +1,282 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MARKET_IO_H
+#define EIGEN_SPARSE_MARKET_IO_H
+
+#include <iostream>
+#include <vector>
+
+namespace Eigen { 
+
+namespace internal 
+{
+  template <typename Scalar, typename StorageIndex>
+  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value)
+  {
+    std::stringstream sline(line);
+    sline >> i >> j >> value;
+  }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value)
+  { std::sscanf(line, "%d %d %g", &i, &j, &value); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value)
+  { std::sscanf(line, "%d %d %lg", &i, &j, &value); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value)
+  { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value)
+  { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+  template <typename Scalar, typename StorageIndex>
+  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value)
+  {
+    std::stringstream sline(line);
+    Scalar valR, valI;
+    sline >> i >> j >> valR >> valI;
+    value = std::complex<Scalar>(valR,valI);
+  }
+
+  template <typename RealScalar>
+  inline void  GetVectorElt (const std::string& line, RealScalar& val)
+  {
+    std::istringstream newline(line);
+    newline >> val;  
+  }
+
+  template <typename RealScalar>
+  inline void GetVectorElt (const std::string& line, std::complex<RealScalar>& val)
+  {
+    RealScalar valR, valI; 
+    std::istringstream newline(line);
+    newline >> valR >> valI; 
+    val = std::complex<RealScalar>(valR, valI);
+  }
+  
+  template<typename Scalar>
+  inline void putMarketHeader(std::string& header,int sym)
+  {
+    header= "%%MatrixMarket matrix coordinate ";
+    if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+    {
+      header += " complex"; 
+      if(sym == Symmetric) header += " symmetric";
+      else if (sym == SelfAdjoint) header += " Hermitian";
+      else header += " general";
+    }
+    else
+    {
+      header += " real"; 
+      if(sym == Symmetric) header += " symmetric";
+      else header += " general";
+    }
+  }
+
+  template<typename Scalar, typename StorageIndex>
+  inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out)
+  {
+    out << row << " "<< col << " " << value << "\n";
+  }
+  template<typename Scalar, typename StorageIndex>
+  inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out)
+  {
+    out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
+  }
+
+
+  template<typename Scalar>
+  inline void putVectorElt(Scalar value, std::ofstream& out)
+  {
+    out << value << "\n"; 
+  }
+  template<typename Scalar>
+  inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
+  {
+    out << value.real() << " " << value.imag()<< "\n"; 
+  }
+
+} // end namespace internal
+
+inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
+{
+  sym = 0; 
+  iscomplex = false;
+  isvector = false;
+  std::ifstream in(filename.c_str(),std::ios::in);
+  if(!in)
+    return false;
+  
+  std::string line; 
+  // The matrix header is always the first line in the file 
+  std::getline(in, line); eigen_assert(in.good());
+  
+  std::stringstream fmtline(line); 
+  std::string substr[5];
+  fmtline>> substr[0] >> substr[1] >> substr[2] >> substr[3] >> substr[4];
+  if(substr[2].compare("array") == 0) isvector = true;
+  if(substr[3].compare("complex") == 0) iscomplex = true;
+  if(substr[4].compare("symmetric") == 0) sym = Symmetric;
+  else if (substr[4].compare("Hermitian") == 0) sym = SelfAdjoint;
+  
+  return true;
+}
+  
+template<typename SparseMatrixType>
+bool loadMarket(SparseMatrixType& mat, const std::string& filename)
+{
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  std::ifstream input(filename.c_str(),std::ios::in);
+  if(!input)
+    return false;
+
+  char rdbuffer[4096];
+  input.rdbuf()->pubsetbuf(rdbuffer, 4096);
+  
+  const int maxBuffersize = 2048;
+  char buffer[maxBuffersize];
+  
+  bool readsizes = false;
+
+  typedef Triplet<Scalar,StorageIndex> T;
+  std::vector<T> elements;
+  
+  Index M(-1), N(-1), NNZ(-1);
+  Index count = 0;
+  while(input.getline(buffer, maxBuffersize))
+  {
+    // skip comments   
+    //NOTE An appropriate test should be done on the header to get the  symmetry
+    if(buffer[0]=='%')
+      continue;
+
+    if(!readsizes)
+    {
+      std::stringstream line(buffer);
+      line >> M >> N >> NNZ;
+      if(M > 0 && N > 0)
+      {
+        readsizes = true;
+        mat.resize(M,N);
+        mat.reserve(NNZ);
+      }
+    }
+    else
+    { 
+      StorageIndex i(-1), j(-1);
+      Scalar value; 
+      internal::GetMarketLine(buffer, i, j, value);
+
+      i--;
+      j--;
+      if(i>=0 && j>=0 && i<M && j<N)
+      {
+        ++count;
+        elements.push_back(T(i,j,value));
+      }
+      else
+        std::cerr << "Invalid read: " << i << "," << j << "\n";        
+    }
+  }
+
+  mat.setFromTriplets(elements.begin(), elements.end());
+  if(count!=NNZ)
+    std::cerr << count << "!=" << NNZ << "\n";
+  
+  input.close();
+  return true;
+}
+
+template<typename VectorType>
+bool loadMarketVector(VectorType& vec, const std::string& filename)
+{
+   typedef typename VectorType::Scalar Scalar;
+  std::ifstream in(filename.c_str(), std::ios::in);
+  if(!in)
+    return false;
+  
+  std::string line; 
+  int n(0), col(0); 
+  do 
+  { // Skip comments
+    std::getline(in, line); eigen_assert(in.good());
+  } while (line[0] == '%');
+  std::istringstream newline(line);
+  newline  >> n >> col; 
+  eigen_assert(n>0 && col>0);
+  vec.resize(n);
+  int i = 0; 
+  Scalar value; 
+  while ( std::getline(in, line) && (i < n) ){
+    internal::GetVectorElt(line, value); 
+    vec(i++) = value; 
+  }
+  in.close();
+  if (i!=n){
+    std::cerr<< "Unable to read all elements from file " << filename << "\n";
+    return false;
+  }
+  return true;
+}
+
+template<typename SparseMatrixType>
+bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
+{
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
+  std::ofstream out(filename.c_str(),std::ios::out);
+  if(!out)
+    return false;
+  
+  out.flags(std::ios_base::scientific);
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+  std::string header; 
+  internal::putMarketHeader<Scalar>(header, sym); 
+  out << header << std::endl; 
+  out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n";
+  int count = 0;
+  for(int j=0; j<mat.outerSize(); ++j)
+    for(typename SparseMatrixType::InnerIterator it(mat,j); it; ++it)
+    {
+      ++ count;
+      internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
+    }
+  out.close();
+  return true;
+}
+
+template<typename VectorType>
+bool saveMarketVector (const VectorType& vec, const std::string& filename)
+{
+ typedef typename VectorType::Scalar Scalar;
+ typedef typename VectorType::RealScalar RealScalar;
+ std::ofstream out(filename.c_str(),std::ios::out);
+  if(!out)
+    return false;
+  
+  out.flags(std::ios_base::scientific);
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+  if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+      out << "%%MatrixMarket matrix array complex general\n"; 
+  else
+    out << "%%MatrixMarket matrix array real general\n"; 
+  out << vec.size() << " "<< 1 << "\n";
+  for (int i=0; i < vec.size(); i++){
+    internal::putVectorElt(vec(i), out); 
+  }
+  out.close();
+  return true; 
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_MARKET_IO_H

diff --git a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
new file mode 100644
index 0000000..02916ea
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h

@@ -0,0 +1,247 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BROWSE_MATRICES_H
+#define EIGEN_BROWSE_MATRICES_H
+
+namespace Eigen {
+
+enum {
+  SPD = 0x100,
+  NonSymmetric = 0x0
+}; 
+
+/** 
+ * @brief Iterator to browse matrices from a specified folder
+ * 
+ * This is used to load all the matrices from a folder. 
+ * The matrices should be in Matrix Market format
+ * It is assumed that the matrices are named as matname.mtx
+ * and matname_SPD.mtx if the matrix is Symmetric and positive definite (or Hermitian)
+ * The right hand side vectors are loaded as well, if they exist.
+ * They should be named as matname_b.mtx. 
+ * Note that the right hand side for a SPD matrix is named as matname_SPD_b.mtx
+ * 
+ * Sometimes a reference solution is available. In this case, it should be named as matname_x.mtx
+ * 
+ * Sample code
+ * \code
+ * 
+ * \endcode
+ * 
+ * \tparam Scalar The scalar type 
+ */
+template <typename Scalar>
+class MatrixMarketIterator 
+{
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+  public:
+    typedef Matrix<Scalar,Dynamic,1> VectorType; 
+    typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
+  
+  public:
+    MatrixMarketIterator(const std::string &folder)
+      : m_sym(0), m_isvalid(false), m_matIsLoaded(false), m_hasRhs(false), m_hasrefX(false), m_folder(folder)
+    {
+      m_folder_id = opendir(folder.c_str());
+      if(m_folder_id)
+        Getnextvalidmatrix();
+    }
+    
+    ~MatrixMarketIterator()
+    {
+      if (m_folder_id) closedir(m_folder_id); 
+    }
+    
+    inline MatrixMarketIterator& operator++()
+    {
+      m_matIsLoaded = false;
+      m_hasrefX = false;
+      m_hasRhs = false;
+      Getnextvalidmatrix();
+      return *this;
+    }
+    inline operator bool() const { return m_isvalid;}
+    
+    /** Return the sparse matrix corresponding to the current file */
+    inline MatrixType& matrix() 
+    { 
+      // Read the matrix
+      if (m_matIsLoaded) return m_mat;
+      
+      std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
+      if ( !loadMarket(m_mat, matrix_file)) 
+      {
+        std::cerr << "Warning loadMarket failed when loading \"" << matrix_file << "\"" << std::endl;
+        m_matIsLoaded = false;
+        return m_mat;
+      }
+      m_matIsLoaded = true; 
+
+      if (m_sym != NonSymmetric) 
+      {
+        // Check whether we need to restore a full matrix:
+        RealScalar diag_norm  = m_mat.diagonal().norm();
+        RealScalar lower_norm = m_mat.template triangularView<Lower>().norm();
+        RealScalar upper_norm = m_mat.template triangularView<Upper>().norm();
+        if(lower_norm>diag_norm && upper_norm==diag_norm)
+        {
+          // only the lower part is stored
+          MatrixType tmp(m_mat);
+          m_mat = tmp.template selfadjointView<Lower>();
+        }
+        else if(upper_norm>diag_norm && lower_norm==diag_norm)
+        {
+          // only the upper part is stored
+          MatrixType tmp(m_mat);
+          m_mat = tmp.template selfadjointView<Upper>();
+        }
+      }
+      return m_mat; 
+    }
+    
+    /** Return the right hand side corresponding to the current matrix. 
+     * If the rhs file is not provided, a random rhs is generated
+     */
+    inline VectorType& rhs() 
+    { 
+       // Get the right hand side
+      if (m_hasRhs) return m_rhs;
+      
+      std::string rhs_file;
+      rhs_file = m_folder + "/" + m_matname + "_b.mtx"; // The pattern is matname_b.mtx
+      m_hasRhs = Fileexists(rhs_file);
+      if (m_hasRhs)
+      {
+        m_rhs.resize(m_mat.cols());
+        m_hasRhs = loadMarketVector(m_rhs, rhs_file);
+      }
+      if (!m_hasRhs)
+      {
+        // Generate a random right hand side
+        if (!m_matIsLoaded) this->matrix(); 
+        m_refX.resize(m_mat.cols());
+        m_refX.setRandom();
+        m_rhs = m_mat * m_refX;
+        m_hasrefX = true;
+        m_hasRhs = true;
+      }
+      return m_rhs; 
+    }
+    
+    /** Return a reference solution
+     * If it is not provided and if the right hand side is not available
+     * then refX is randomly generated such that A*refX = b 
+     * where A and b are the matrix and the rhs. 
+     * Note that when a rhs is provided, refX is not available 
+     */
+    inline VectorType& refX() 
+    { 
+      // Check if a reference solution is provided
+      if (m_hasrefX) return m_refX;
+      
+      std::string lhs_file;
+      lhs_file = m_folder + "/" + m_matname + "_x.mtx"; 
+      m_hasrefX = Fileexists(lhs_file);
+      if (m_hasrefX)
+      {
+        m_refX.resize(m_mat.cols());
+        m_hasrefX = loadMarketVector(m_refX, lhs_file);
+      }
+      else
+        m_refX.resize(0);
+      return m_refX; 
+    }
+    
+    inline std::string& matname() { return m_matname; }
+    
+    inline int sym() { return m_sym; }
+    
+    bool hasRhs() {return m_hasRhs; }
+    bool hasrefX() {return m_hasrefX; }
+    bool isFolderValid() { return bool(m_folder_id); }
+    
+  protected:
+    
+    inline bool Fileexists(std::string file)
+    {
+      std::ifstream file_id(file.c_str());
+      if (!file_id.good() ) 
+      {
+        return false;
+      }
+      else 
+      {
+        file_id.close();
+        return true;
+      }
+    }
+    
+    void Getnextvalidmatrix( )
+    {
+      m_isvalid = false;
+      // Here, we return with the next valid matrix in the folder
+      while ( (m_curs_id = readdir(m_folder_id)) != NULL) {
+        m_isvalid = false;
+        std::string curfile;
+        curfile = m_folder + "/" + m_curs_id->d_name;
+        // Discard if it is a folder
+        if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
+//         struct stat st_buf; 
+//         stat (curfile.c_str(), &st_buf);
+//         if (S_ISDIR(st_buf.st_mode)) continue;
+        
+        // Determine from the header if it is a matrix or a right hand side 
+        bool isvector,iscomplex=false;
+        if(!getMarketHeader(curfile,m_sym,iscomplex,isvector)) continue;
+        if(isvector) continue;
+        if (!iscomplex)
+        {
+          if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+            continue; 
+        }
+        if (iscomplex)
+        {
+          if(internal::is_same<Scalar, float>::value || internal::is_same<Scalar, double>::value)
+            continue; 
+        }
+        
+        
+        // Get the matrix name
+        std::string filename = m_curs_id->d_name;
+        m_matname = filename.substr(0, filename.length()-4); 
+        
+        // Find if the matrix is SPD 
+        size_t found = m_matname.find("SPD");
+        if( (found!=std::string::npos) && (m_sym != NonSymmetric) )
+          m_sym = SPD;
+       
+        m_isvalid = true;
+        break; 
+      }
+    }
+    int m_sym; // Symmetry of the matrix
+    MatrixType m_mat; // Current matrix  
+    VectorType m_rhs;  // Current vector
+    VectorType m_refX; // The reference solution, if exists
+    std::string m_matname; // Matrix Name
+    bool m_isvalid; 
+    bool m_matIsLoaded; // Determine if the matrix has already been loaded from the file
+    bool m_hasRhs; // The right hand side exists
+    bool m_hasrefX; // A reference solution is provided
+    std::string m_folder;
+    DIR * m_folder_id;
+    struct dirent *m_curs_id; 
+    
+};
+
+} // end namespace Eigen
+
+#endif

diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
new file mode 100644
index 0000000..985702b
--- /dev/null
+++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h

@@ -0,0 +1,349 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOMSETTER_H
+#define EIGEN_RANDOMSETTER_H
+
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+// Ensure the ::google namespace exists, required for checking existence of 
+// ::google::dense_hash_map and ::google::sparse_hash_map.
+namespace google {}
+#endif
+
+namespace Eigen {
+
+/** Represents a std::map
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct StdMapTraits
+{
+  typedef int KeyType;
+  typedef std::map<KeyType,Scalar> Type;
+  enum {
+    IsSorted = 1
+  };
+
+  static void setInvalidKey(Type&, const KeyType&) {}
+};
+
+#ifdef EIGEN_UNORDERED_MAP_SUPPORT
+/** Represents a std::unordered_map
+  *
+  * To use it you need to both define EIGEN_UNORDERED_MAP_SUPPORT and include the unordered_map header file
+  * yourself making sure that unordered_map is defined in the std namespace.
+  *
+  * For instance, with current version of gcc you can either enable C++0x standard (-std=c++0x) or do:
+  * \code
+  * #include <tr1/unordered_map>
+  * #define EIGEN_UNORDERED_MAP_SUPPORT
+  * namespace std {
+  *   using std::tr1::unordered_map;
+  * }
+  * \endcode
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct StdUnorderedMapTraits
+{
+  typedef int KeyType;
+  typedef std::unordered_map<KeyType,Scalar> Type;
+  enum {
+    IsSorted = 0
+  };
+
+  static void setInvalidKey(Type&, const KeyType&) {}
+};
+#endif // EIGEN_UNORDERED_MAP_SUPPORT
+
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+
+namespace google {
+  
+// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
+// are in the global namespace, and other times they are under ::google.
+using namespace ::google;
+
+template<typename KeyType, typename Scalar>
+struct DenseHashMap {
+  typedef dense_hash_map<KeyType, Scalar> type;
+};
+
+template<typename KeyType, typename Scalar>
+struct SparseHashMap {
+  typedef sparse_hash_map<KeyType, Scalar> type;
+};
+
+} // namespace google
+
+/** Represents a google::dense_hash_map
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct GoogleDenseHashMapTraits
+{
+  typedef int KeyType;
+  typedef typename google::DenseHashMap<KeyType,Scalar>::type Type;
+  enum {
+    IsSorted = 0
+  };
+
+  static void setInvalidKey(Type& map, const KeyType& k)
+  { map.set_empty_key(k); }
+};
+
+/** Represents a google::sparse_hash_map
+  *
+  * \see RandomSetter
+  */
+template<typename Scalar> struct GoogleSparseHashMapTraits
+{
+  typedef int KeyType;
+  typedef typename google::SparseHashMap<KeyType,Scalar>::type Type;
+  enum {
+    IsSorted = 0
+  };
+
+  static void setInvalidKey(Type&, const KeyType&) {}
+};
+#endif
+
+/** \class RandomSetter
+  *
+  * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access
+  *
+  * \tparam SparseMatrixType the type of the sparse matrix we are updating
+  * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage.
+  *                  Its default value depends on the system.
+  * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object
+  *                        as a power of two exponent.
+  *
+  * This class temporarily represents a sparse matrix object using a generic map implementation allowing for
+  * efficient random access. The conversion from the compressed representation to a hash_map object is performed
+  * in the RandomSetter constructor, while the sparse matrix is updated back at destruction time. This strategy
+  * suggest the use of nested blocks as in this example:
+  *
+  * \code
+  * SparseMatrix<double> m(rows,cols);
+  * {
+  *   RandomSetter<SparseMatrix<double> > w(m);
+  *   // don't use m but w instead with read/write random access to the coefficients:
+  *   for(;;)
+  *     w(rand(),rand()) = rand;
+  * }
+  * // when w is deleted, the data are copied back to m
+  * // and m is ready to use.
+  * \endcode
+  *
+  * Since hash_map objects are not fully sorted, representing a full matrix as a single hash_map would
+  * involve a big and costly sort to update the compressed matrix back. To overcome this issue, a RandomSetter
+  * use multiple hash_map, each representing 2^OuterPacketBits columns or rows according to the storage order.
+  * To reach optimal performance, this value should be adjusted according to the average number of nonzeros
+  * per rows/columns.
+  *
+  * The possible values for the template parameter MapTraits are:
+  *  - \b StdMapTraits: corresponds to std::map. (does not perform very well)
+  *  - \b GnuHashMapTraits: corresponds to __gnu_cxx::hash_map (available only with GCC)
+  *  - \b GoogleDenseHashMapTraits: corresponds to google::dense_hash_map (best efficiency, reasonable memory consumption)
+  *  - \b GoogleSparseHashMapTraits: corresponds to google::sparse_hash_map (best memory consumption, relatively good performance)
+  *
+  * The default map implementation depends on the availability, and the preferred order is:
+  * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
+  *
+  * For performance and memory consumption reasons it is highly recommended to use one of
+  * Google's hash_map implementations. To enable the support for them, you must define
+  * EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
+  * <google/sparse_hash_map> for you.
+  *
+  * \see https://github.com/sparsehash/sparsehash
+  */
+template<typename SparseMatrixType,
+         template <typename T> class MapTraits =
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+          GoogleDenseHashMapTraits
+#elif defined(_HASH_MAP)
+          GnuHashMapTraits
+#else
+          StdMapTraits
+#endif
+         ,int OuterPacketBits = 6>
+class RandomSetter
+{
+    typedef typename SparseMatrixType::Scalar Scalar;
+    typedef typename SparseMatrixType::StorageIndex StorageIndex;
+
+    struct ScalarWrapper
+    {
+      ScalarWrapper() : value(0) {}
+      Scalar value;
+    };
+    typedef typename MapTraits<ScalarWrapper>::KeyType KeyType;
+    typedef typename MapTraits<ScalarWrapper>::Type HashMapType;
+    static const int OuterPacketMask = (1 << OuterPacketBits) - 1;
+    enum {
+      SwapStorage = 1 - MapTraits<ScalarWrapper>::IsSorted,
+      TargetRowMajor = (SparseMatrixType::Flags & RowMajorBit) ? 1 : 0,
+      SetterRowMajor = SwapStorage ? 1-TargetRowMajor : TargetRowMajor
+    };
+
+  public:
+
+    /** Constructs a random setter object from the sparse matrix \a target
+      *
+      * Note that the initial value of \a target are imported. If you want to re-set
+      * a sparse matrix from scratch, then you must set it to zero first using the
+      * setZero() function.
+      */
+    inline RandomSetter(SparseMatrixType& target)
+      : mp_target(&target)
+    {
+      const Index outerSize = SwapStorage ? target.innerSize() : target.outerSize();
+      const Index innerSize = SwapStorage ? target.outerSize() : target.innerSize();
+      m_outerPackets = outerSize >> OuterPacketBits;
+      if (outerSize&OuterPacketMask)
+        m_outerPackets += 1;
+      m_hashmaps = new HashMapType[m_outerPackets];
+      // compute number of bits needed to store inner indices
+      Index aux = innerSize - 1;
+      m_keyBitsOffset = 0;
+      while (aux)
+      {
+        ++m_keyBitsOffset;
+        aux = aux >> 1;
+      }
+      KeyType ik = (1<<(OuterPacketBits+m_keyBitsOffset));
+      for (Index k=0; k<m_outerPackets; ++k)
+        MapTraits<ScalarWrapper>::setInvalidKey(m_hashmaps[k],ik);
+
+      // insert current coeffs
+      for (Index j=0; j<mp_target->outerSize(); ++j)
+        for (typename SparseMatrixType::InnerIterator it(*mp_target,j); it; ++it)
+          (*this)(TargetRowMajor?j:it.index(), TargetRowMajor?it.index():j) = it.value();
+    }
+
+    /** Destructor updating back the sparse matrix target */
+    ~RandomSetter()
+    {
+      KeyType keyBitsMask = (1<<m_keyBitsOffset)-1;
+      if (!SwapStorage) // also means the map is sorted
+      {
+        mp_target->setZero();
+        mp_target->makeCompressed();
+        mp_target->reserve(nonZeros());
+        Index prevOuter = -1;
+        for (Index k=0; k<m_outerPackets; ++k)
+        {
+          const Index outerOffset = (1<<OuterPacketBits) * k;
+          typename HashMapType::iterator end = m_hashmaps[k].end();
+          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+          {
+            const Index outer = (it->first >> m_keyBitsOffset) + outerOffset;
+            const Index inner = it->first & keyBitsMask;
+            if (prevOuter!=outer)
+            {
+              for (Index j=prevOuter+1;j<=outer;++j)
+                mp_target->startVec(j);
+              prevOuter = outer;
+            }
+            mp_target->insertBackByOuterInner(outer, inner) = it->second.value;
+          }
+        }
+        mp_target->finalize();
+      }
+      else
+      {
+        VectorXi positions(mp_target->outerSize());
+        positions.setZero();
+        // pass 1
+        for (Index k=0; k<m_outerPackets; ++k)
+        {
+          typename HashMapType::iterator end = m_hashmaps[k].end();
+          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+          {
+            const Index outer = it->first & keyBitsMask;
+            ++positions[outer];
+          }
+        }
+        // prefix sum
+        StorageIndex count = 0;
+        for (Index j=0; j<mp_target->outerSize(); ++j)
+        {
+          StorageIndex tmp = positions[j];
+          mp_target->outerIndexPtr()[j] = count;
+          positions[j] = count;
+          count += tmp;
+        }
+        mp_target->makeCompressed();
+        mp_target->outerIndexPtr()[mp_target->outerSize()] = count;
+        mp_target->resizeNonZeros(count);
+        // pass 2
+        for (Index k=0; k<m_outerPackets; ++k)
+        {
+          const Index outerOffset = (1<<OuterPacketBits) * k;
+          typename HashMapType::iterator end = m_hashmaps[k].end();
+          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+          {
+            const Index inner = (it->first >> m_keyBitsOffset) + outerOffset;
+            const Index outer = it->first & keyBitsMask;
+            // sorted insertion
+            // Note that we have to deal with at most 2^OuterPacketBits unsorted coefficients,
+            // moreover those 2^OuterPacketBits coeffs are likely to be sparse, an so only a
+            // small fraction of them have to be sorted, whence the following simple procedure:
+            Index posStart = mp_target->outerIndexPtr()[outer];
+            Index i = (positions[outer]++) - 1;
+            while ( (i >= posStart) && (mp_target->innerIndexPtr()[i] > inner) )
+            {
+              mp_target->valuePtr()[i+1] = mp_target->valuePtr()[i];
+              mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
+              --i;
+            }
+            mp_target->innerIndexPtr()[i+1] = internal::convert_index<StorageIndex>(inner);
+            mp_target->valuePtr()[i+1] = it->second.value;
+          }
+        }
+      }
+      delete[] m_hashmaps;
+    }
+
+    /** \returns a reference to the coefficient at given coordinates \a row, \a col */
+    Scalar& operator() (Index row, Index col)
+    {
+      const Index outer = SetterRowMajor ? row : col;
+      const Index inner = SetterRowMajor ? col : row;
+      const Index outerMajor = outer >> OuterPacketBits; // index of the packet/map
+      const Index outerMinor = outer & OuterPacketMask;  // index of the inner vector in the packet
+      const KeyType key = internal::convert_index<KeyType>((outerMinor<<m_keyBitsOffset) | inner);
+      return m_hashmaps[outerMajor][key].value;
+    }
+
+    /** \returns the number of non zero coefficients
+      *
+      * \note According to the underlying map/hash_map implementation,
+      * this function might be quite expensive.
+      */
+    Index nonZeros() const
+    {
+      Index nz = 0;
+      for (Index k=0; k<m_outerPackets; ++k)
+        nz += static_cast<Index>(m_hashmaps[k].size());
+      return nz;
+    }
+
+
+  protected:
+
+    HashMapType* m_hashmaps;
+    SparseMatrixType* mp_target;
+    Index m_outerPackets;
+    unsigned char m_keyBitsOffset;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_RANDOMSETTER_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
new file mode 100644
index 0000000..41d2bf6
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h

@@ -0,0 +1,286 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \returns an expression of the coefficient-wise i0(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>
+bessel_i0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i0e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i0e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i0e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>
+bessel_i0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>
+bessel_i1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i1e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i1e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>
+bessel_i1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>
+bessel_k0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k0e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k0e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>
+bessel_k0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>
+bessel_k1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k1e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k1e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>
+bessel_k1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j0(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of j0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_j0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>
+bessel_j0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y0(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of y0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_y0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>
+bessel_y0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of j1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_j1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>
+bessel_j1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y1(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of y1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_y1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>
+bessel_y1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
new file mode 100644
index 0000000..6049cc2
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h

@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+#define EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_BFLOAT16_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
new file mode 100644
index 0000000..8606a9f
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h

@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order zero.
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0;
+    return bessel_i0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i0e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0e;
+    return bessel_i0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i0e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1;
+    return bessel_i1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i1e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1e;
+    return bessel_i1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i1e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_j0()
+ */
+template <typename Scalar>
+struct scalar_bessel_j0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j0;
+    return bessel_j0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_j0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_y0()
+ */
+template <typename Scalar>
+struct scalar_bessel_y0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y0;
+    return bessel_y0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_y0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j0 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the first kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1()
+ */
+template <typename Scalar>
+struct scalar_bessel_j1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j1;
+    return bessel_j1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_j1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_y1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y1;
+    return bessel_y1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_y1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j1 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the second
+ * kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0;
+    return bessel_k0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0e;
+    return bessel_k0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k0e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the
+ * second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1;
+    return bessel_k1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1e;
+    return bessel_k1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k1e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_FUNCTORS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
new file mode 100644
index 0000000..8930d1a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h

@@ -0,0 +1,66 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_HALF_H
+#define EIGEN_BESSELFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_HALF_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
new file mode 100644
index 0000000..24812be
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h

@@ -0,0 +1,1959 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSEL_FUNCTIONS_H
+#define EIGEN_BESSEL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+//  Parts of this code are based on the Cephes Math Library.
+//
+//  Cephes Math Library Release 2.8:  June, 2000
+//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+//  Permission has been kindly provided by the original author
+//  to incorporate the Cephes software into the Eigen codebase:
+//
+//    From: Stephen Moshier
+//    To: Eugene Brevdo
+//    Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+//    Hello Eugene,
+//
+//    Thank you for writing.
+//
+//    If your licensing is similar to BSD, the formal way that has been
+//    handled is simply to add a statement to the effect that you are incorporating
+//    the Cephes software by permission of the author.
+//
+//    Good luck with your project,
+//    Steve
+
+
+/****************************************************************************
+ * Implementation of Bessel function, based on Cephes                       *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct bessel_i0e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0ef.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i0ef();
+     *
+     * y = i0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        100000      3.7e-7      7.0e-8
+     * See i0f().
+     *
+     */
+
+    const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
+                       -2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
+                       -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
+                       -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,
+                       -5.76375574538582365885E-4f, 1.63947561694133579842E-3f,
+                       -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
+                       -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,
+                       -9.49010970480476444210E-2f, 1.71620901522208775349E-1f,
+                       -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
+
+    const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f,
+                       2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
+                       6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
+                       8.04490411014108831608E-1f};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 18>::run(
+        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 7>::run(
+            psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0e.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i0e();
+     *
+     * y = i0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       5.4e-16     1.2e-16
+     * See i0().
+     *
+     */
+
+    const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
+                        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+                        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+                        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+                        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+                        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+                        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+                        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+                        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+                        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+                        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+                        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+                        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+                        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+                        -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+    const double B[] = {
+        -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+        4.46562142029675999901E-17,  3.46122286769746109310E-17,
+        -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+        1.77256013305652638360E-15,  3.81168066935262242075E-15,
+        -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+        1.54008621752140982691E-14,  3.85277838274214270114E-13,
+        7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+        -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+        1.18891471078464383424E-11,  4.94060238822496958910E-10,
+        3.39623202570838634515E-9,   2.26666899049817806459E-8,
+        2.04891858946906374183E-7,   2.89137052083475648297E-6,
+        6.88975834691682398426E-5,   3.36911647825569408990E-3,
+        8.04490411014108831608E-1};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 30>::run(
+        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct bessel_i0e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i0e<T>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct bessel_i0_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(
+        pexp(pabs(x)),
+        generic_i0e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i0<T>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct bessel_i1e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type >
+struct generic_i1e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* i1ef.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i1ef();
+     *
+     * y = i1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.5e-6      1.5e-7
+     * See i1().
+     *
+     */
+    const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
+                       2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
+                       3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
+                       4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
+                       5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
+                       4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
+                       2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
+                       1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
+                       2.52587186443633654823E-1f};
+
+    const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
+                       -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
+                       -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
+                       7.78576235018280120474E-1f};
+
+
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
+        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 7>::run(
+            psub(pdiv(pset1<T>(32.0f), y),
+                 pset1<T>(2.0f)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i1e.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i1e();
+     *
+     * y = i1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       2.0e-15     2.0e-16
+     * See i1().
+     *
+     */
+    const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17,
+                        1.55363195773620046921E-16, -1.10559694773538630805E-15,
+                        7.60068429473540693410E-15, -5.04218550472791168711E-14,
+                        3.22379336594557470981E-13, -1.98397439776494371520E-12,
+                        1.17361862988909016308E-11, -6.66348972350202774223E-11,
+                        3.62559028155211703701E-10, -1.88724975172282928790E-9,
+                        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+                        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+                        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+                        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+                        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+                        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+                        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+                        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+                        2.52587186443633654823E-1};
+    const double B[] = {
+        7.51729631084210481353E-18,  4.41434832307170791151E-18,
+        -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+        2.96262899764595013876E-16,  3.30820231092092828324E-16,
+        -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+        1.04202769841288027642E-14,  4.27244001671195135429E-14,
+        -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+        -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+        1.41258074366137813316E-11,  3.25260358301548823856E-11,
+        -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+        -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+        -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+        -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+        7.78576235018280120474E-1};
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
+        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(32.0), y),
+                 pset1<T>(2.0)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct bessel_i1e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i1e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_i1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(
+        pexp(pabs(x)),
+        generic_i1e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0ef.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0ef();
+     *
+     * y = k0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       8.1e-7      7.8e-8
+     * See k0().
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, float>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    T x_gt_two = pmul(
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x));
+    return pselect(
+        pcmp_le(x, pset1<T>(0.0)),
+        MAXNUM,
+        pselect(pcmp_le(x, two), x_le_two, x_gt_two));
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0e.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0e();
+     *
+     * y = k0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+
+    const double A[] = {
+      1.37446543561352307156E-16,
+      4.25981614279661018399E-14,
+      1.03496952576338420167E-11,
+      1.90451637722020886025E-9,
+      2.53479107902614945675E-7,
+      2.28621210311945178607E-5,
+      1.26461541144692592338E-3,
+      3.59799365153615016266E-2,
+      3.44289899924628486886E-1,
+      -5.35327393233902768720E-1};
+    const double B[] = {
+       5.30043377268626276149E-18, -1.64758043015242134646E-17,
+       5.21039150503902756861E-17, -1.67823109680541210385E-16,
+       5.51205597852431940784E-16, -1.84859337734377901440E-15,
+       6.34007647740507060557E-15, -2.22751332699166985548E-14,
+       8.03289077536357521100E-14, -2.98009692317273043925E-13,
+       1.14034058820847496303E-12, -4.51459788337394416547E-12,
+       1.85594911495471785253E-11, -7.95748924447710747776E-11,
+       3.57739728140030116597E-10, -1.69753450938905987466E-9,
+       8.57403401741422608519E-9, -4.66048989768794782956E-8,
+       2.76681363944501510342E-7, -1.83175552271911948767E-6,
+       1.39498137188764993662E-5, -1.28495495816278026384E-4,
+       1.56988388573005337491E-3, -3.14481013119645005427E-2,
+       2.44030308206595545468E0
+    };
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, double>::run(x), pmul(
+            pset1<T>(-1.0), plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k0e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0f.c
+     *	Modified Bessel function, third kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0f();
+     *
+     * y = k0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns modified Bessel function of the third kind
+     * of order zero of the argument.
+     *
+     * The range is partitioned into the two intervals [0,8] and
+     * (8, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     * Tested at 2000 random points between 0 and 8.  Peak absolute
+     * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15.
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-7      8.5e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     *  K0 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, float>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pmul(
+            pexp(pnegate(x)),
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B)),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*
+     *
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0();
+     *
+     * y = k0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+    const double A[] = {
+      1.37446543561352307156E-16,
+      4.25981614279661018399E-14,
+      1.03496952576338420167E-11,
+      1.90451637722020886025E-9,
+      2.53479107902614945675E-7,
+      2.28621210311945178607E-5,
+      1.26461541144692592338E-3,
+      3.59799365153615016266E-2,
+      3.44289899924628486886E-1,
+      -5.35327393233902768720E-1};
+    const double B[] = {
+       5.30043377268626276149E-18, -1.64758043015242134646E-17,
+       5.21039150503902756861E-17, -1.67823109680541210385E-16,
+       5.51205597852431940784E-16, -1.84859337734377901440E-15,
+       6.34007647740507060557E-15, -2.22751332699166985548E-14,
+       8.03289077536357521100E-14, -2.98009692317273043925E-13,
+       1.14034058820847496303E-12, -4.51459788337394416547E-12,
+       1.85594911495471785253E-11, -7.95748924447710747776E-11,
+       3.57739728140030116597E-10, -1.69753450938905987466E-9,
+       8.57403401741422608519E-9, -4.66048989768794782956E-8,
+       2.76681363944501510342E-7, -1.83175552271911948767E-6,
+       1.39498137188764993662E-5, -1.28495495816278026384E-4,
+       1.56988388573005337491E-3, -3.14481013119645005427E-2,
+       2.44030308206595545468E0
+    };
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, double>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pmul(
+            pexp(-x),
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B)),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1ef.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1ef();
+     *
+     * y = k1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.9e-7      6.7e-8
+     * See k1().
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                        1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        internal::pchebevl<T, 10>::run(
+            psub(pdiv(pset1<T>(8.0), x), two), B),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1e.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k1e();
+     *
+     * y = k1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-16     1.2e-16
+     * See k1().
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+                        1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(8.0), x), two), B),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k1e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1f.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                        1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pexp(pnegate(x)),
+        pmul(
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+                        1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pexp(-x),
+        pmul(
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_j0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j0f();
+     *
+     * y = j0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval the following polynomial
+     * approximation is used:
+     *
+     *
+     *        2         2         2
+     * (w - r  ) (w - r  ) (w - r  ) P(w)
+     *       1         2         3
+     *
+     *            2
+     * where w = x  and the three r's are zeros of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 2        100000      1.3e-7      3.6e-8
+     *    IEEE      2, 32       100000      1.9e-7      5.4e-8
+     *
+     */
+
+    const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f,
+                        -3.969646342510940E-004f, 1.332913422519003E-002f,
+                        -1.729150680240724E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+                        -2.145007480346739E-001f, 1.197549369473540E-001f,
+                        -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+                        1.756221482109099E+001f, -4.974978466280903E+000f,
+                        1.001973420681837E+000f, -1.939906941791308E-001f,
+                        6.490598792654666E-002f, -1.249992184872738E-001f};
+    const T DR1 =  pset1<T>(5.78318596294678452118f);
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pselect(
+        pcmp_lt(y, pset1<T>(1.0e-3f)),
+        pmadd(z, pset1<T>(-0.25f), pset1<T>(1.0f)),
+        pmul(psub(z, DR1), internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH), NEG_PIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    return pselect(pcmp_le(y, pset1<T>(2.0)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j0();
+     *
+     * y = j0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval the following rational
+     * approximation is used:
+     *
+     *
+     *        2         2
+     * (w - r  ) (w - r  ) P (w) / Q (w)
+     *       1         2    3       8
+     *
+     *            2
+     * where w = x  and the two r's are zeros of the function.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30       10000       4.4e-17     6.3e-18
+     *    IEEE      0, 30       60000       4.2e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+                        1.23953371646414299388E0, 5.44725003058768775090E0,
+                        8.74716500199817011941E0, 5.30324038235394892183E0,
+                        9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+                         1.25352743901058953537E0, 5.47097740330417105182E0,
+                         8.76190883237069594232E0, 5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+                         -1.95539544257735972385E1, -9.32060152123768231369E1,
+                         -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1, -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+                         8.56430025976980587198E2, 3.88240183605401609683E3,
+                         7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12,
+                         -2.49248344360967716204E14, 9.70862251047306323952E15};
+    const double RQ[] = {1.00000000000000000000E0, 4.99563147152651017219E2,
+                         1.73785401676374683123E5, 4.84409658339962045305E7,
+                         1.11855537045356834862E10, 2.11277520115489217587E12,
+                         3.10518229857422583814E14, 3.18121955943204943306E16,
+                         1.71086294081043136091E18};
+    const T DR1 = pset1<T>(5.78318596294678452118E0);
+    const T DR2 = pset1<T>(3.04712623436620863991E1);
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* pi / 4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pselect(
+        pcmp_lt(y, pset1<T>(1.0e-5)),
+        pmadd(z, pset1<T>(-0.25), pset1<T>(1.0)),
+        pmul(pmul(psub(z, DR1), psub(z, DR2)),
+             pdiv(internal::ppolevl<T, 3>::run(z, RP),
+                  internal::ppolevl<T, 8>::run(z, RQ))));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_PIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_j0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_y0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     * 	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, y0f();
+     *
+     * y = y0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2         2         2
+     * y0(x)  =  (w - r  ) (w - r  ) (w - r  ) R(x)  +  2/pi ln(x) j0(x).
+     *                 1         2         3
+     *
+     * Thus a call to j0() is required.  The three zeros are removed
+     * from R(x) to improve its numerical stability.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,  2       100000      2.4e-7      3.4e-8
+     *    IEEE      2, 32       100000      1.8e-7      5.3e-8
+     *
+     */
+
+    const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f,
+                        5.344486707214273E-004f, -1.584289289821316E-002f,
+                        1.707584643733568E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+                        -2.145007480346739E-001f, 1.197549369473540E-001f,
+                        -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+                        1.756221482109099E+001f, -4.974978466280903E+000f,
+                        1.001973420681837E+000f, -1.939906941791308E-001f,
+                        6.490598792654666E-002f, -1.249992184872738E-001f};
+    const T YZ1 = pset1<T>(0.43221455686510834878f);
+    const T TWOOPI =  pset1<T>(0.636619772367581343075535f); /* 2 / pi */
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+    T z = pmul(x, x);
+    T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0<T, float>::run(x)));
+    x_le_two = pmadd(
+        psub(z, YZ1), internal::ppolevl<T, 4>::run(z, YP), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_two);
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    T u = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(u, PH), NEG_PIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y0();
+     *
+     * y = y0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *   y0(x)  = R(x)  +   2 * log(x) * j0(x) / PI.
+     * Thus a call to j0() is required.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30        9400       7.0e-17     7.9e-18
+     *    IEEE      0, 30       30000       1.3e-15     1.6e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+                        1.23953371646414299388E0, 5.44725003058768775090E0,
+                        8.74716500199817011941E0, 5.30324038235394892183E0,
+                        9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+                         1.25352743901058953537E0, 5.47097740330417105182E0,
+                         8.76190883237069594232E0, 5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+                         -1.95539544257735972385E1, -9.32060152123768231369E1,
+                         -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1, -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+                         8.56430025976980587198E2, 3.88240183605401609683E3,
+                         7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double YP[] = {1.55924367855235737965E4, -1.46639295903971606143E7,
+                         5.43526477051876500413E9, -9.82136065717911466409E11,
+                         8.75906394395366999549E13, -3.46628303384729719441E15,
+                         4.42733268572569800351E16, -1.84950800436986690637E16};
+    const double YQ[] = {1.00000000000000000000E0,  1.04128353664259848412E3,
+                         6.26107330137134956842E5, 2.68919633393814121987E8,
+                         8.64002487103935000337E10, 2.02979612750105546709E13,
+                         3.17157752842975028269E15, 2.50596256172653059228E17};
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T TWOOPI =  pset1<T>(0.636619772367581343075535); /* 2 / pi */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 7>::run(z, YP),
+                       internal::ppolevl<T, 7>::run(z, YQ));
+    x_le_five = pmadd(
+        pmul(TWOOPI, plog(x)), generic_j0<T, double>::run(x), x_le_five);
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_PIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_y0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_j1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j1f();
+     *
+     * y = j1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a polynomial approximation
+     *        2
+     * (w - r  ) x P(w)
+     *       1
+     *                     2
+     * is used, where w = x  and r is the first zero of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak       rms
+     *    IEEE      0,  2       100000       1.2e-7     2.5e-8
+     *    IEEE      2, 32       100000       2.0e-7     5.3e-8
+     *
+     *
+     */
+
+    const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f,
+                        -4.541343896997497E-005f, 1.937383947804541E-003f,
+                        -3.405537384615824E-002f};
+    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+                        3.138238455499697E-001f, -2.102302420403875E-001f,
+                        5.435364690523026E-003f, 1.493389585089498E-001f,
+                        4.976029650847191E-006f, 7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+                        -2.485774108720340E+001f, 7.222973196770240E+000f,
+                        -1.544842782180211E+000f, 3.503787691653334E-001f,
+                        -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T Z1 = pset1<T>(1.46819706421238932572E1f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pmul(
+        psub(z, Z1),
+        pmul(x, internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 2 range.
+    y_gt_two = pselect(
+        pcmp_lt(x, pset1<T>(0.0f)), pnegate(y_gt_two), y_gt_two);
+    return pselect(pcmp_le(y, pset1<T>(2.0f)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j1();
+     *
+     * y = j1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 24 term Chebyshev
+     * expansion is used. In the second, the asymptotic
+     * trigonometric representation is employed using two
+     * rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       4.0e-17     1.1e-17
+     *    IEEE      0, 30       30000       2.6e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+                         1.12719608129684925192E0, 5.11207951146807644818E0,
+                         8.42404590141772420927E0, 5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+                         1.10514232634061696926E0, 5.07386386128601488557E0,
+                         8.39985554327604159757E0, 5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+                         7.58238284132545283818E1, 3.66779609360150777800E2,
+                         7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2, 2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+                         1.05644886038262816351E3, 4.98641058337653607651E3,
+                         9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11,
+                         -7.27494245221818276015E13, 3.68295732863852883286E15};
+    const double RQ[] = {1.00000000000000000000E0, 6.20836478118054335476E2,
+                         2.56987256757748830383E5, 8.35146791431949253037E7,
+                         2.21511595479792499675E10, 4.74914122079991414898E12,
+                         7.84369607876235854894E14, 8.95222336184627338078E16,
+                         5.32278620332680085395E18};
+    const T Z1 = pset1<T>(1.46819706421238932572E1);
+    const T Z2 = pset1<T>(4.92184563216946036703E1);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pdiv(internal::ppolevl<T, 3>::run(z, RP),
+                       internal::ppolevl<T, 8>::run(z, RQ));
+    y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_THPIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 5 range.
+    y_gt_five = pselect(
+        pcmp_lt(x, pset1<T>(0.0)), pnegate(y_gt_five), y_gt_five);
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_j1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_y1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2
+     * y0(x)  =  (w - r  ) x R(x^2)  +  2/pi (ln(x) j1(x) - 1/x) .
+     *                 1
+     *
+     * Thus a call to j1() is required.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    IEEE      0,  2       100000       2.2e-7     4.6e-8
+     *    IEEE      2, 32       100000       1.9e-7     5.3e-8
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+
+    const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f,
+                        6.719543806674249E-005f, -2.641785726447862E-003f,
+                        4.202369946500099E-002f};
+    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+                        3.138238455499697E-001f, -2.102302420403875E-001f,
+                        5.435364690523026E-003f, 1.493389585089498E-001f,
+                        4.976029650847191E-006f, 7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+                        -2.485774108720340E+001f, 7.222973196770240E+000f,
+                        -1.544842782180211E+000f, 3.503787691653334E-001f,
+                        -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T YO1 = pset1<T>(4.66539330185668857532f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_two = pmul(psub(z, YO1), internal::ppolevl<T, 4>::run(z, YP));
+    x_le_two = pmadd(
+       x_le_two, x,
+       pmul(TWOOPI, pmadd(
+           generic_j1<T, float>::run(x), plog(x),
+           pdiv(pset1<T>(-1.0f), x))));
+    x_le_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), NEG_MAXNUM, x_le_two);
+
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 25 term Chebyshev
+     * expansion is used, and a call to j1() is required.
+     * In the second, the asymptotic trigonometric representation
+     * is employed using two rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       8.6e-17     1.3e-17
+     *    IEEE      0, 30       30000       1.0e-15     1.3e-16
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+                         1.12719608129684925192E0, 5.11207951146807644818E0,
+                         8.42404590141772420927E0, 5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+                         1.10514232634061696926E0, 5.07386386128601488557E0,
+                         8.39985554327604159757E0, 5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+                         7.58238284132545283818E1, 3.66779609360150777800E2,
+                         7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2, 2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+                         1.05644886038262816351E3, 4.98641058337653607651E3,
+                         9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double YP[] = {1.26320474790178026440E9, -6.47355876379160291031E11,
+                         1.14509511541823727583E14, -8.12770255501325109621E15,
+                         2.02439475713594898196E17, -7.78877196265950026825E17};
+    const double YQ[] = {1.00000000000000000000E0, 5.94301592346128195359E2,
+                         2.35564092943068577943E5, 7.34811944459721705660E7,
+                         1.87601316108706159478E10, 3.88231277496238566008E12,
+                         6.20557727146953693363E14, 6.87141087355300489866E16,
+                         3.97270608116560655612E18};
+    const T SQ2OPI = pset1<T>(.79788456080286535588);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535); /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 5>::run(z, YP),
+                   internal::ppolevl<T, 8>::run(z, YQ));
+    x_le_five = pmadd(
+        x_le_five, x, pmul(
+            TWOOPI, pmadd(generic_j1<T, double>::run(x), plog(x),
+                          pdiv(pset1<T>(-1.0), x))));
+
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_THPIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_y1<T>::run(x);
+  }
+};
+
+}  // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar)
+    bessel_i0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar)
+    bessel_i0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar)
+    bessel_i1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar)
+    bessel_i1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar)
+    bessel_k0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar)
+    bessel_k0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar)
+    bessel_k1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar)
+    bessel_k1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar)
+    bessel_j0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar)
+    bessel_y0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar)
+    bessel_j1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar)
+    bessel_y1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x);
+}
+
+}  // end namespace numext
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSEL_FUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
new file mode 100644
index 0000000..943d10f
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h

@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0(const Packet& x) {
+  return numext::bessel_i0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0e(const Packet& x) {
+  return numext::bessel_i0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1(const Packet& x) {
+  return numext::bessel_i1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1e(const Packet& x) {
+  return numext::bessel_i1e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j0(const Packet& x) {
+  return numext::bessel_j0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j1(const Packet& x) {
+  return numext::bessel_j1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y0(const Packet& x) {
+  return numext::bessel_y0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y1(const Packet& x) {
+  return numext::bessel_y1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0(const Packet& x) {
+  return numext::bessel_k0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0e(const Packet& x) {
+  return numext::bessel_k0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1(const Packet& x) {
+  return numext::bessel_k1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1e(const Packet& x) {
+  return numext::bessel_k1e(x);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+

diff --git a/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h b/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
new file mode 100644
index 0000000..d7b231a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h

@@ -0,0 +1,67 @@
+#ifndef HIP_VECTOR_COMPATIBILITY_H
+#define HIP_VECTOR_COMPATIBILITY_H
+
+namespace hip_impl {
+  template <typename, typename, unsigned int> struct Scalar_accessor;
+}   // end namespace hip_impl
+
+namespace Eigen {
+namespace internal {
+
+#define HIP_SCALAR_ACCESSOR_BUILDER(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>> : NAME <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_impl <hip_impl::Scalar_accessor<T, U, n>> : NAME##_impl <T> {}; \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_retval <hip_impl::Scalar_accessor<T, U, n>> : NAME##_retval <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME) \
+template <typename T, typename U, unsigned int n, IgammaComputationMode mode> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>, mode> : NAME <T, mode> {};
+
+#if EIGEN_HAS_C99_MATH
+HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper)
+HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl)
+#endif
+
+HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly)
+HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl)
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // HIP_VECTOR_COMPATIBILITY_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
index d8b7755..691ff4d 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h

@@ -7,138 +7,161 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+
 #ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
 #define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
 
 namespace Eigen {
 
-/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to
- * the given arrays.
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
   *
   * This function computes the coefficient-wise incomplete gamma function.
   *
-  * \note This function supports only float and double scalar types in c++11
- * mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations
- * of igammac(T,T) for any scalar
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
   * type T to be supported.
   *
   * \sa Eigen::igammac(), Eigen::lgamma()
   */
-template <typename Derived, typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<
-    Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived,
-    const ExponentDerived>
-igamma(const Eigen::ArrayBase<Derived>& a,
-       const Eigen::ArrayBase<ExponentDerived>& x) {
-  return Eigen::CwiseBinaryOp<
-      Eigen::internal::scalar_igamma_op<typename Derived::Scalar>,
-      const Derived, const ExponentDerived>(a.derived(), x.derived());
+template<typename Derived,typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived()
+  );
 }
 
-/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to
- * the given arrays.
+/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
   *
-  * This function computes the coefficient-wise complementary incomplete gamma
- * function.
+  * This function computes the coefficient-wise derivative of the incomplete
+  * gamma function with respect to the parameter a.
   *
   * \note This function supports only float and double scalar types in c++11
- * mode. To support other scalar types,
+  * mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations
- * of igammac(T,T) for any scalar
+  * of igamma_der_a(T,T) for any scalar
   * type T to be supported.
   *
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template <typename Derived, typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<
-    Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived,
-    const ExponentDerived>
-igammac(const Eigen::ArrayBase<Derived>& a,
-        const Eigen::ArrayBase<ExponentDerived>& x) {
-  return Eigen::CwiseBinaryOp<
-      Eigen::internal::scalar_igammac_op<typename Derived::Scalar>,
-      const Derived, const ExponentDerived>(a.derived(), x.derived());
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived());
 }
 
-/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x)
- * to the given arrays.
+/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays.
+  *
+  * This function computes the coefficient-wise derivative of the sample
+  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
+  *
+  * \note This function supports only float and double scalar types in c++11
+  * mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations
+  * of gamma_sample_der_alpha(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template <typename AlphaDerived, typename SampleDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
+gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
+      alpha.derived(),
+      sample.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise complementary incomplete gamma function.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template<typename Derived,typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
   *
   * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x.
   *
-  * \note This function supports only float and double scalar types in c++11
- * mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations
- * of polygamma(T,T) for any scalar
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
   * type T to be supported.
   *
   * \sa Eigen::digamma()
   */
-// * \warning Be careful with the order of the parameters: x.polygamma(n) is
-// equivalent to polygamma(n,x)
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
 // * \sa ArrayBase::polygamma()
-template <typename DerivedN, typename DerivedX>
-inline const Eigen::CwiseBinaryOp<
-    Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>,
-    const DerivedN, const DerivedX>
-polygamma(const Eigen::ArrayBase<DerivedN>& n,
-          const Eigen::ArrayBase<DerivedX>& x) {
-  return Eigen::CwiseBinaryOp<
-      Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>,
-      const DerivedN, const DerivedX>(n.derived(), x.derived());
+template<typename DerivedN,typename DerivedX>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
+    n.derived(),
+    x.derived()
+  );
 }
 
-/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a
- * b) to the given arrays.
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays.
   *
   * This function computes the regularized incomplete beta function (integral).
   *
-  * \note This function supports only float and double scalar types in c++11
- * mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations
- * of betainc(T,T,T) for any scalar
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
   * type T to be supported.
   *
   * \sa Eigen::betainc(), Eigen::lgamma()
   */
-template <typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
-inline const Eigen::CwiseTernaryOp<
-    Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>,
-    const ArgADerived, const ArgBDerived, const ArgXDerived>
-betainc(const Eigen::ArrayBase<ArgADerived>& a,
-        const Eigen::ArrayBase<ArgBDerived>& b,
-        const Eigen::ArrayBase<ArgXDerived>& x) {
-  return Eigen::CwiseTernaryOp<
-      Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>,
-      const ArgADerived, const ArgBDerived, const ArgXDerived>(
-      a.derived(), b.derived(), x.derived());
+template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
+{
+  return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
+    a.derived(),
+    b.derived(),
+    x.derived()
+  );
 }
 
-/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given
- * arrays.
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
   *
   * It returns the Riemann zeta function of two arguments \a x and \a q:
   *
-  * \param x is the exposent, it must be > 1
+  * \param x is the exponent, it must be > 1
   * \param q is the shift, it must be > 0
   *
-  * \note This function supports only float and double scalar types. To support
- * other scalar types, the user has
-  * to provide implementations of zeta(T,T) for any scalar type T to be
- * supported.
+  * \note This function supports only float and double scalar types. To support other scalar types, the user has
+  * to provide implementations of zeta(T,T) for any scalar type T to be supported.
   *
   * \sa ArrayBase::zeta()
   */
-template <typename DerivedX, typename DerivedQ>
-inline const Eigen::CwiseBinaryOp<
-    Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX,
-    const DerivedQ>
-zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q) {
-  return Eigen::CwiseBinaryOp<
-      Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>,
-      const DerivedX, const DerivedQ>(x.derived(), q.derived());
+template<typename DerivedX,typename DerivedQ>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
+    x.derived(),
+    q.derived()
+  );
 }
 
-}  // end namespace Eigen
 
-#endif  // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
new file mode 100644
index 0000000..2d94231
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h

@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+#define EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 lgamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 digamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 zeta(const Eigen::bfloat16& x, const Eigen::bfloat16& q) {
+  return Eigen::bfloat16(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 polygamma(const Eigen::bfloat16& n, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erf(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erfc(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 ndtri(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma_der_a(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 gamma_sample_der_alpha(const Eigen::bfloat16& alpha, const Eigen::bfloat16& sample) {
+  return Eigen::bfloat16(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igammac(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 betainc(const Eigen::bfloat16& a, const Eigen::bfloat16& b, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_BFLOAT16_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index 0e51c03..abefe99 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h

@@ -15,27 +15,24 @@
 
 namespace internal {
 
+
 /** \internal
-  * \brief Template functor to compute the incomplete gamma function igamma(a,
- * x)
+  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
   *
   * \sa class CwiseBinaryOp, Cwise::igamma
   */
-template <typename Scalar>
-struct scalar_igamma_op {
+template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
+{
   EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
-  operator()(const Scalar& a, const Scalar& x) const {
-    using numext::igamma;
-    return igamma(a, x);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igamma; return igamma(a, x);
   }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
-  packetOp(const Packet& a, const Packet& x) const {
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
     return internal::pigamma(a, x);
   }
 };
-template <typename Scalar>
+template<typename Scalar>
 struct functor_traits<scalar_igamma_op<Scalar> > {
   enum {
     // Guesstimate
@@ -45,26 +42,78 @@
 };
 
 /** \internal
-  * \brief Template functor to compute the complementary incomplete gamma
- * function igammac(a, x)
+  * \brief Template functor to compute the derivative of the incomplete gamma
+  * function igamma_der_a(a, x)
   *
-  * \sa class CwiseBinaryOp, Cwise::igammac
+  * \sa class CwiseBinaryOp, Cwise::igamma_der_a
   */
 template <typename Scalar>
-struct scalar_igammac_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
-  operator()(const Scalar& a, const Scalar& x) const {
-    using numext::igammac;
-    return igammac(a, x);
+struct scalar_igamma_der_a_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+    using numext::igamma_der_a;
+    return igamma_der_a(a, x);
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
-  packetOp(const Packet& a, const Packet& x) const {
-    return internal::pigammac(a, x);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma_der_a(a, x);
   }
 };
 template <typename Scalar>
+struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma
+    Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammaDerA
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the derivative of the sample
+  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
+  * gamma_sample_der_alpha(alpha, sample)
+  *
+  * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
+  */
+template <typename Scalar>
+struct scalar_gamma_sample_der_alpha_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
+    using numext::gamma_sample_der_alpha;
+    return gamma_sample_der_alpha(alpha, sample);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
+    return internal::pgamma_sample_der_alpha(alpha, sample);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
+    Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igammac
+  */
+template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igammac; return igammac(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+  {
+    return internal::pigammac(a, x);
+  }
+};
+template<typename Scalar>
 struct functor_traits<scalar_igammac_op<Scalar> > {
   enum {
     // Guesstimate
@@ -73,26 +122,23 @@
   };
 };
 
+
 /** \internal
-  * \brief Template functor to compute the incomplete beta integral betainc(a,
- * b, x)
+  * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
   *
   */
-template <typename Scalar>
-struct scalar_betainc_op {
+template<typename Scalar> struct scalar_betainc_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
-  operator()(const Scalar& x, const Scalar& a, const Scalar& b) const {
-    using numext::betainc;
-    return betainc(x, a, b);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
+    using numext::betainc; return betainc(x, a, b);
   }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
-  packetOp(const Packet& x, const Packet& a, const Packet& b) const {
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
+  {
     return internal::pbetainc(x, a, b);
   }
 };
-template <typename Scalar>
+template<typename Scalar>
 struct functor_traits<scalar_betainc_op<Scalar> > {
   enum {
     // Guesstimate
@@ -101,25 +147,23 @@
   };
 };
 
+
 /** \internal
  * \brief Template functor to compute the natural log of the absolute
  * value of Gamma of a scalar
  * \sa class CwiseUnaryOp, Cwise::lgamma()
  */
-template <typename Scalar>
-struct scalar_lgamma_op {
+template<typename Scalar> struct scalar_lgamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    using numext::lgamma;
-    return lgamma(a);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::lgamma; return lgamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
-    return internal::plgamma(a);
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_lgamma_op<Scalar> > {
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
   enum {
     // Guesstimate
     Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
@@ -131,20 +175,17 @@
  * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
  * \sa class CwiseUnaryOp, Cwise::digamma()
  */
-template <typename Scalar>
-struct scalar_digamma_op {
+template<typename Scalar> struct scalar_digamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    using numext::digamma;
-    return digamma(a);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::digamma; return digamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
-    return internal::pdigamma(a);
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_digamma_op<Scalar> > {
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
   enum {
     // Guesstimate
     Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
@@ -153,83 +194,84 @@
 };
 
 /** \internal
- * \brief Template functor to compute the Riemann Zeta function of two
- * arguments.
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
  * \sa class CwiseUnaryOp, Cwise::zeta()
  */
-template <typename Scalar>
-struct scalar_zeta_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x,
-                                                   const Scalar& q) const {
-    using numext::zeta;
-    return zeta(x, q);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x,
-                                           const Packet& q) const {
-    return internal::pzeta(x, q);
-  }
+template<typename Scalar> struct scalar_zeta_op {
+    EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const {
+        using numext::zeta; return zeta(x, q);
+    }
+    typedef typename packet_traits<Scalar>::type Packet;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_zeta_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasZeta
-  };
+template<typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> >
+{
+    enum {
+        // Guesstimate
+        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+        PacketAccess = packet_traits<Scalar>::HasZeta
+    };
 };
 
 /** \internal
  * \brief Template functor to compute the polygamma function.
  * \sa class CwiseUnaryOp, Cwise::polygamma()
  */
-template <typename Scalar>
-struct scalar_polygamma_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& n,
-                                                   const Scalar& x) const {
-    using numext::polygamma;
-    return polygamma(n, x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n,
-                                           const Packet& x) const {
-    return internal::ppolygamma(n, x);
-  }
+template<typename Scalar> struct scalar_polygamma_op {
+    EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const {
+        using numext::polygamma; return polygamma(n, x);
+    }
+    typedef typename packet_traits<Scalar>::type Packet;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_polygamma_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasPolygamma
-  };
+template<typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> >
+{
+    enum {
+        // Guesstimate
+        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+        PacketAccess = packet_traits<Scalar>::HasPolygamma
+    };
 };
 
 /** \internal
- * \brief Template functor to compute the Gauss error function of a
- * scalar
- * \sa class CwiseUnaryOp, Cwise::erf()
+ * \brief Template functor to compute the error function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::erf()
  */
-template <typename Scalar>
-struct scalar_erf_op {
+template<typename Scalar> struct scalar_erf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    using numext::erf;
-    return erf(a);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& a) const {
+    return numext::erf(a);
   }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
-    return internal::perf(a);
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return perf(x);
   }
 };
 template <typename Scalar>
 struct functor_traits<scalar_erf_op<Scalar> > {
   enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErf
+    PacketAccess = packet_traits<Scalar>::HasErf,
+    Cost =
+        (PacketAccess
+#ifdef EIGEN_VECTORIZE_FMA
+             // TODO(rmlarsen): Move the FMA cost model to a central location.
+             // Haswell can issue 2 add/mul/madd per cycle.
+             // 10 pmadd, 2 pmul, 1 div, 2 other
+             ? (2 * NumTraits<Scalar>::AddCost +
+                7 * NumTraits<Scalar>::MulCost +
+                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#else
+             ? (12 * NumTraits<Scalar>::AddCost +
+                12 * NumTraits<Scalar>::MulCost +
+                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#endif
+             // Assume for simplicity that this is as expensive as an exp().
+             : (functor_traits<scalar_exp_op<Scalar> >::Cost))
   };
 };
 
@@ -238,20 +280,17 @@
  * of a scalar
  * \sa class CwiseUnaryOp, Cwise::erfc()
  */
-template <typename Scalar>
-struct scalar_erfc_op {
+template<typename Scalar> struct scalar_erfc_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    using numext::erfc;
-    return erfc(a);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::erfc; return erfc(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
-    return internal::perfc(a);
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_erfc_op<Scalar> > {
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
   enum {
     // Guesstimate
     Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
@@ -259,8 +298,33 @@
   };
 };
 
-}  // end namespace internal
+/** \internal
+ * \brief Template functor to compute the Inverse of the normal distribution
+ * function of a scalar
+ * \sa class CwiseUnaryOp, Cwise::ndtri()
+ */
+template<typename Scalar> struct scalar_ndtri_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::ndtri; return ndtri(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ndtri_op<Scalar> >
+{
+  enum {
+    // On average, We are evaluating rational functions with degree N=9 in the
+    // numerator and denominator. This results in 2*N additions and 2*N
+    // multiplications.
+    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNdtri
+  };
+};
 
-}  // end namespace Eigen
+} // end namespace internal
 
-#endif  // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
index 78588ea..2a3a531 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h

@@ -11,53 +11,46 @@
 namespace Eigen {
 namespace numext {
 
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(
-    const Eigen::half& a) {
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x,
-                                                       const Eigen::half& q) {
-  return Eigen::half(
-      Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(
-    const Eigen::half& n, const Eigen::half& x) {
-  return Eigen::half(
-      Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a,
-                                                         const Eigen::half& x) {
-  return Eigen::half(
-      Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
 }
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(
-    const Eigen::half& a, const Eigen::half& x) {
-  return Eigen::half(
-      Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
 }
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(
-    const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::betainc(
-      static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) {
+  return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
 }
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
 
 }  // end namespace numext
 }  // end namespace Eigen

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 8591f16..f1c260e 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h

@@ -30,73 +30,12 @@
 //    Thank you for writing.
 //
 //    If your licensing is similar to BSD, the formal way that has been
-//    handled is simply to add a statement to the effect that you are
-//    incorporating
+//    handled is simply to add a statement to the effect that you are incorporating
 //    the Cephes software by permission of the author.
 //
 //    Good luck with your project,
 //    Steve
 
-namespace cephes {
-
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Scalar, int N>
-struct polevl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
-  }
-};
-
-template <typename Scalar>
-struct polevl<Scalar, 0> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
-    return coef[0];
-  }
-};
-
-}  // end namespace cephes
 
 /****************************************************************************
  * Implementation of lgamma, requires C++11/C99                             *
@@ -117,13 +56,28 @@
   typedef Scalar type;
 };
 
+#if EIGEN_HAS_C99_MATH
+// Since glibc 2.19
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 19) || __GLIBC__>2) \
+ && (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+// Glibc versions before 2.19
+#if defined(__GLIBC__) && ((__GLIBC__==2 && __GLIBC_MINOR__ < 19) || __GLIBC__<2) \
+ && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
 template <>
 struct lgamma_impl<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
-    int signgam;
-    return ::lgammaf_r(x, &signgam);
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined (EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgammaf_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
 #else
     return ::lgammaf(x);
 #endif
@@ -134,15 +88,20 @@
 struct lgamma_impl<double> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
-    int signgam;
-    return ::lgamma_r(x, &signgam);
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgamma_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
 #else
     return ::lgamma(x);
 #endif
   }
 };
 
+#undef EIGEN_HAS_LGAMMA_R
+#endif
+
 /****************************************************************************
  * Implementation of digamma (psi), based on Cephes                         *
  ****************************************************************************/
@@ -175,19 +134,23 @@
   }
 };
 
+
 template <>
 struct digamma_impl_maybe_poly<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(const float s) {
-    const float A[] = {-4.16666666666666666667E-3f, 3.96825396825396825397E-3f,
-                       -8.33333333333333333333E-3f, 8.33333333333333333333E-2f};
+    const float A[] = {
+      -4.16666666666666666667E-3f,
+      3.96825396825396825397E-3f,
+      -8.33333333333333333333E-3f,
+      8.33333333333333333333E-2f
+    };
 
     float z;
     if (s < 1.0e8f) {
       z = 1.0f / (s * s);
-      return z * cephes::polevl<float, 3>::run(z, A);
-    } else
-      return 0.0f;
+      return z * internal::ppolevl<float, 3>::run(z, A);
+    } else return 0.0f;
   }
 };
 
@@ -195,17 +158,22 @@
 struct digamma_impl_maybe_poly<double> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(const double s) {
-    const double A[] = {8.33333333333333333333E-2, -2.10927960927960927961E-2,
-                        7.57575757575757575758E-3, -4.16666666666666666667E-3,
-                        3.96825396825396825397E-3, -8.33333333333333333333E-3,
-                        8.33333333333333333333E-2};
+    const double A[] = {
+      8.33333333333333333333E-2,
+      -2.10927960927960927961E-2,
+      7.57575757575757575758E-3,
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2
+    };
 
     double z;
     if (s < 1.0e17) {
       z = 1.0 / (s * s);
-      return z * cephes::polevl<double, 6>::run(z, A);
-    } else
-      return 0.0;
+      return z * internal::ppolevl<double, 6>::run(z, A);
+    }
+    else return 0.0;
   }
 };
 
@@ -273,7 +241,7 @@
     Scalar p, q, nz, s, w, y;
     bool negative = false;
 
-    const Scalar maxnum = NumTraits<Scalar>::infinity();
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
     const Scalar m_pi = Scalar(EIGEN_PI);
 
     const Scalar zero = Scalar(0);
@@ -286,7 +254,7 @@
       q = x;
       p = numext::floor(q);
       if (p == q) {
-        return maxnum;
+        return nan;
       }
       /* Remove the zeros of tan(m_pi x)
        * by subtracting the nearest integer from x
@@ -298,7 +266,8 @@
           nz = q - p;
         }
         nz = m_pi / numext::tan(m_pi * nz);
-      } else {
+      }
+      else {
         nz = zero;
       }
       x = one - x;
@@ -324,13 +293,63 @@
  * Implementation of erf, requires C++11/C99                                *
  ****************************************************************************/
 
-template <typename Scalar>
+/** \internal \returns the error function of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 13/8-degree rational interpolant which
+    is accurate up to a couple of ulp in the range [-4, 4], outside of which
+    fl(erf(x)) = +/-1.
+
+    This implementation works on both scalars and Ts.
+*/
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
+  // Clamp the inputs to the range [-4, 4] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_4 = pset1<T>(4.f);
+  const T minus_4 = pset1<T>(-4.f);
+  const T x = pmax(pmin(a_x, plus_4), minus_4);
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
+  const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
+  const T alpha_5 = pset1<T>(-7.34990630326855e-04f);
+  const T alpha_7 = pset1<T>(-5.69250639462346e-05f);
+  const T alpha_9 = pset1<T>(-2.10102402082508e-06f);
+  const T alpha_11 = pset1<T>(2.77068142495902e-08f);
+  const T alpha_13 = pset1<T>(-2.72614225801306e-10f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(-1.42647390514189e-02f);
+  const T beta_2 = pset1<T>(-7.37332916720468e-03f);
+  const T beta_4 = pset1<T>(-1.68282697438203e-03f);
+  const T beta_6 = pset1<T>(-2.13374055278905e-04f);
+  const T beta_8 = pset1<T>(-1.45660718464996e-05f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial p.
+  T q = pmadd(x2, beta_8, beta_6);
+  q = pmadd(x2, q, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+template <typename T>
 struct erf_impl {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return Scalar(0);
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return generic_fast_erf_float(x);
   }
 };
 
@@ -339,17 +358,31 @@
   typedef Scalar type;
 };
 
+#if EIGEN_HAS_C99_MATH
 template <>
 struct erf_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
+  static EIGEN_STRONG_INLINE float run(float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf_float(x);
+#endif
+  }
 };
 
 template <>
 struct erf_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
+  static EIGEN_STRONG_INLINE double run(double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return ::erf(x);
+#endif
+  }
 };
+#endif  // EIGEN_HAS_C99_MATH
 
 /***************************************************************************
 * Implementation of erfc, requires C++11/C99                               *
@@ -370,22 +403,276 @@
   typedef Scalar type;
 };
 
-
+#if EIGEN_HAS_C99_MATH
 template <>
 struct erfc_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+  static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return ::erfcf(x);
+#endif
+  }
 };
 
 template <>
 struct erfc_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+  static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return ::erfc(x);
+#endif
+  }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+
+/***************************************************************************
+* Implementation of ndtri.                                                 *
+****************************************************************************/
+
+/* Inverse of Normal distribution function (modified for Eigen).
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, ndtri();
+ *
+ * x = ndtri( y );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from
+ * minus infinity to x) is equal to y.
+ *
+ *
+ * For small arguments 0 < y < exp(-2), the program computes
+ * z = sqrt( -2.0 * log(y) );  then the approximation is
+ * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
+ * There are two rational functions P/Q, one for 0 < y < exp(-32)
+ * and the other for y up to exp(-2).  For larger arguments,
+ * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
+ *
+ *
+ * ACCURACY:
+ *
+ *                      Relative error:
+ * arithmetic   domain        # trials      peak         rms
+ *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
+ *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
+ *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
+ *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ *   message         condition    value returned
+ * ndtri domain       x <= 0        -MAXNUM
+ * ndtri domain       x >= 1         MAXNUM
+ *
+ */
+ /*
+   Cephes Math Library Release 2.2: June, 1992
+   Copyright 1985, 1987, 1992 by Stephen L. Moshier
+   Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+// TODO: Add a cheaper approximation for float.
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
+    const T& should_flipsign, const T& x) {
+  typedef typename unpacket_traits<T>::type Scalar;
+  const T sign_mask = pset1<T>(Scalar(-0.0));
+  T sign_bit = pand<T>(should_flipsign, sign_mask);
+  return pxor<T>(sign_bit, x);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
+    const double& should_flipsign, const double& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
+    const float& should_flipsign, const float& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+// We split this computation in to two so that in the scalar path
+// only one branch is evaluated (due to our template specialization of pselect
+// being an if statement.)
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
+  const ScalarType p0[] = {
+    ScalarType(-5.99633501014107895267e1),
+    ScalarType(9.80010754185999661536e1),
+    ScalarType(-5.66762857469070293439e1),
+    ScalarType(1.39312609387279679503e1),
+    ScalarType(-1.23916583867381258016e0)
+  };
+  const ScalarType q0[] = {
+    ScalarType(1.0),
+    ScalarType(1.95448858338141759834e0),
+    ScalarType(4.67627912898881538453e0),
+    ScalarType(8.63602421390890590575e1),
+    ScalarType(-2.25462687854119370527e2),
+    ScalarType(2.00260212380060660359e2),
+    ScalarType(-8.20372256168333339912e1),
+    ScalarType(1.59056225126211695515e1),
+    ScalarType(-1.18331621121330003142e0)
+  };
+  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
+  const T half = pset1<T>(ScalarType(0.5));
+  T c, c2, ndtri_gt_exp_neg_two;
+
+  c = psub(b, half);
+  c2 = pmul(c, c);
+  ndtri_gt_exp_neg_two = pmadd(c, pmul(
+      c2, pdiv(
+          internal::ppolevl<T, 4>::run(c2, p0),
+          internal::ppolevl<T, 8>::run(c2, q0))), c);
+  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
+    const T& b, const T& should_flipsign) {
+  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
+   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
+   */
+  const ScalarType p1[] = {
+    ScalarType(4.05544892305962419923e0),
+    ScalarType(3.15251094599893866154e1),
+    ScalarType(5.71628192246421288162e1),
+    ScalarType(4.40805073893200834700e1),
+    ScalarType(1.46849561928858024014e1),
+    ScalarType(2.18663306850790267539e0),
+    ScalarType(-1.40256079171354495875e-1),
+    ScalarType(-3.50424626827848203418e-2),
+    ScalarType(-8.57456785154685413611e-4)
+  };
+  const ScalarType q1[] = {
+    ScalarType(1.0),
+    ScalarType(1.57799883256466749731e1),
+    ScalarType(4.53907635128879210584e1),
+    ScalarType(4.13172038254672030440e1),
+    ScalarType(1.50425385692907503408e1),
+    ScalarType(2.50464946208309415979e0),
+    ScalarType(-1.42182922854787788574e-1),
+    ScalarType(-3.80806407691578277194e-2),
+    ScalarType(-9.33259480895457427372e-4)
+  };
+  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
+   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+   */
+  const ScalarType p2[] = {
+    ScalarType(3.23774891776946035970e0),
+    ScalarType(6.91522889068984211695e0),
+    ScalarType(3.93881025292474443415e0),
+    ScalarType(1.33303460815807542389e0),
+    ScalarType(2.01485389549179081538e-1),
+    ScalarType(1.23716634817820021358e-2),
+    ScalarType(3.01581553508235416007e-4),
+    ScalarType(2.65806974686737550832e-6),
+    ScalarType(6.23974539184983293730e-9)
+  };
+  const ScalarType q2[] = {
+    ScalarType(1.0),
+    ScalarType(6.02427039364742014255e0),
+    ScalarType(3.67983563856160859403e0),
+    ScalarType(1.37702099489081330271e0),
+    ScalarType(2.16236993594496635890e-1),
+    ScalarType(1.34204006088543189037e-2),
+    ScalarType(3.28014464682127739104e-4),
+    ScalarType(2.89247864745380683936e-6),
+    ScalarType(6.79019408009981274425e-9)
+  };
+  const T eight = pset1<T>(ScalarType(8.0));
+  const T one = pset1<T>(ScalarType(1));
+  const T neg_two = pset1<T>(ScalarType(-2));
+  T x, x0, x1, z;
+
+  x = psqrt(pmul(neg_two, plog(b)));
+  x0 = psub(x, pdiv(plog(x), x));
+  z = pdiv(one, x);
+  x1 = pmul(
+      z, pselect(
+          pcmp_lt(x, eight),
+          pdiv(internal::ppolevl<T, 8>::run(z, p1),
+               internal::ppolevl<T, 8>::run(z, q1)),
+          pdiv(internal::ppolevl<T, 8>::run(z, p2),
+               internal::ppolevl<T, 8>::run(z, q2))));
+  return flipsign(should_flipsign, psub(x0, x1));
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T generic_ndtri(const T& a) {
+  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
+  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
+
+  const T zero = pset1<T>(ScalarType(0));
+  const T one = pset1<T>(ScalarType(1));
+  // exp(-2)
+  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
+  T b, ndtri, should_flipsign;
+
+  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
+  b = pselect(should_flipsign, a, psub(one, a));
+
+  ndtri = pselect(
+      pcmp_lt(exp_neg_two, b),
+      generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
+      generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
+
+  return pselect(
+      pcmp_le(a, zero), neg_maxnum,
+      pselect(pcmp_le(one, a), maxnum, ndtri));
+}
+
+template <typename Scalar>
+struct ndtri_retval {
+  typedef Scalar type;
 };
 
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+# else
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
+    return generic_ndtri<Scalar, Scalar>(x);
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+
 /**************************************************************************************************************
- * Implementation of igammac (complemented incomplete gamma integral), based on
- *Cephes but requires C++11/C99 *
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
  **************************************************************************************************************/
 
 template <typename Scalar>
@@ -397,20 +684,11 @@
 template <typename Scalar>
 struct cephes_helper {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar machep() {
-    assert(false && "machep not supported for this type");
-    return 0.0;
-  }
+  static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar big() {
-    assert(false && "big not supported for this type");
-    return 0.0;
-  }
+  static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar biginv() {
-    assert(false && "biginv not supported for this type");
-    return 0.0;
-  }
+  static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; }
 };
 
 template <>
@@ -448,8 +726,241 @@
   }
 };
 
+enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
+
 template <typename Scalar>
-struct igamma_impl;  // predeclare igamma_impl
+EIGEN_DEVICE_FUNC
+static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) {
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (logax < -numext::log(NumTraits<Scalar>::highest()) ||
+        // Assuming x and a aren't Nan.
+        (numext::isnan)(logax)) {
+      return Scalar(0);
+    }
+    return numext::exp(logax);
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+EIGEN_DEVICE_FUNC
+int igamma_num_iterations() {
+  /* Returns the maximum number of internal iterations for igamma computation.
+   */
+  if (mode == VALUE) {
+    return 2000;
+  }
+
+  if (internal::is_same<Scalar, float>::value) {
+    return 200;
+  } else if (internal::is_same<Scalar, double>::value) {
+    return 500;
+  } else {
+    return 2000;
+  }
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igammac_cf_impl {
+  /* Computes igamc(a, x) or derivative (depending on the mode)
+   * using the continued fraction expansion of the complementary
+   * incomplete Gamma function.
+   *
+   * Preconditions:
+   *   a > 0
+   *   x >= 1
+   *   x >= a
+   */
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    if ((numext::isinf)(x)) {
+      return zero;
+    }
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evalutes to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    // continued fraction
+    Scalar y = one - a;
+    Scalar z = x + y + one;
+    Scalar c = zero;
+    Scalar pkm2 = one;
+    Scalar qkm2 = x;
+    Scalar pkm1 = x + one;
+    Scalar qkm1 = z * x;
+    Scalar ans = pkm1 / qkm1;
+
+    Scalar dpkm2_da = zero;
+    Scalar dqkm2_da = zero;
+    Scalar dpkm1_da = zero;
+    Scalar dqkm1_da = -x;
+    Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      c += one;
+      y += one;
+      z += two;
+
+      Scalar yc = y * c;
+      Scalar pk = pkm1 * z - pkm2 * yc;
+      Scalar qk = qkm1 * z - qkm2 * yc;
+
+      Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
+      Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
+
+      if (qk != zero) {
+        Scalar ans_prev = ans;
+        ans = pk / qk;
+
+        Scalar dans_da_prev = dans_da;
+        dans_da = (dpk_da - ans * dqk_da) / qk;
+
+        if (mode == VALUE) {
+          if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
+            break;
+          }
+        } else {
+          if (numext::abs(dans_da - dans_da_prev) <= machep) {
+            break;
+          }
+        }
+      }
+
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      dpkm2_da = dpkm1_da;
+      dpkm1_da = dpk_da;
+      dqkm2_da = dqkm1_da;
+      dqkm1_da = dqk_da;
+
+      if (numext::abs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+
+        dpkm2_da *= biginv;
+        dpkm1_da *= biginv;
+        dqkm2_da *= biginv;
+        dqkm1_da *= biginv;
+      }
+    }
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default: // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x;
+    }
+  }
+};
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_series_impl {
+  /* Computes igam(a, x) or its derivative (depending on the mode)
+   * using the series expansion of the incomplete Gamma function.
+   *
+   * Preconditions:
+   *   x > 0
+   *   a > 0
+   *   !(x > 1 && x > a)
+   */
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evalutes to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    ax /= a;
+
+    /* power series */
+    Scalar r = a;
+    Scalar c = one;
+    Scalar ans = one;
+
+    Scalar dc_da = zero;
+    Scalar dans_da = zero;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      r += one;
+      Scalar term = x / r;
+      Scalar dterm_da = -x / (r * r);
+      dc_da = term * dc_da + dterm_da * c;
+      dans_da += dc_da;
+      c *= term;
+      ans += c;
+
+      if (mode == VALUE) {
+        if (c <= machep * ans) {
+          break;
+        }
+      } else {
+        if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
+          break;
+        }
+      }
+    }
+
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default: // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x / a;
+    }
+  }
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
 
 template <typename Scalar>
 struct igammac_impl {
@@ -518,168 +1029,49 @@
       return nan;
     }
 
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
+    }
+
     if ((x < one) || (x < a)) {
-      /* The checks above ensure that we meet the preconditions for
-       * igamma_impl::Impl(), so call it, rather than igamma_impl::Run().
-       * Calling Run() would also work, but in that case the compiler may not be
-       * able to prove that igammac_impl::Run and igamma_impl::Run are not
-       * mutually recursive.  This leads to worse code, particularly on
-       * platforms like nvptx, where recursion is allowed only begrudgingly.
-       */
-      return (one - igamma_impl<Scalar>::Impl(a, x));
+      return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
     }
 
-    return Impl(a, x);
-  }
-
- private:
-  /* igamma_impl calls igammac_impl::Impl. */
-  friend struct igamma_impl<Scalar>;
-
-  /* Actually computes igamc(a, x).
-   *
-   * Preconditions:
-   *   a > 0
-   *   x >= 1
-   *   x >= a
-   */
-  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar two = 2;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
-    const Scalar big = cephes_helper<Scalar>::big();
-    const Scalar biginv = cephes_helper<Scalar>::biginv();
-    const Scalar inf = NumTraits<Scalar>::infinity();
-
-    Scalar ans, ax, c, yc, r, t, y, z;
-    Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
-
-    if (x == inf) return zero;  // std::isinf crashes on CUDA
-
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (ax < -maxlog) {  // underflow
-      return zero;
-    }
-    ax = numext::exp(ax);
-
-    // continued fraction
-    y = one - a;
-    z = x + y + one;
-    c = zero;
-    pkm2 = one;
-    qkm2 = x;
-    pkm1 = x + one;
-    qkm1 = z * x;
-    ans = pkm1 / qkm1;
-
-    while (true) {
-      c += one;
-      y += one;
-      z += two;
-      yc = y * c;
-      pk = pkm1 * z - pkm2 * yc;
-      qk = qkm1 * z - qkm2 * yc;
-      if (qk != zero) {
-        r = pk / qk;
-        t = numext::abs((ans - r) / r);
-        ans = r;
-      } else {
-        t = one;
-      }
-      pkm2 = pkm1;
-      pkm1 = pk;
-      qkm2 = qkm1;
-      qkm1 = qk;
-      if (numext::abs(pk) > big) {
-        pkm2 *= biginv;
-        pkm1 *= biginv;
-        qkm2 *= biginv;
-        qkm1 *= biginv;
-      }
-      if (t <= machep) {
-        break;
-      }
-    }
-
-    return (ans * ax);
+    return igammac_cf_impl<Scalar, VALUE>::run(a, x);
   }
 };
 
+#endif  // EIGEN_HAS_C99_MATH
+
 /************************************************************************************************
- * Implementation of igamma (incomplete gamma integral), based on Cephes but
- *requires C++11/C99 *
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
  ************************************************************************************************/
 
-template <typename Scalar>
-struct igamma_retval {
-  typedef Scalar type;
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
 };
 
-template <typename Scalar>
-struct igamma_impl {
+#else
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
   EIGEN_DEVICE_FUNC
   static Scalar run(Scalar a, Scalar x) {
-    /*	igam()
-     *	Incomplete gamma integral
+    /* Depending on the mode, returns
+     * - VALUE: incomplete Gamma function igamma(a, x)
+     * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
+     * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
+     * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
      *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double a, x, y, igam();
-     *
-     * y = igam( a, x );
-     *
-     * DESCRIPTION:
-     *
-     * The function is defined by
-     *
-     *                           x
-     *                            -
-     *                   1       | |  -t  a-1
-     *  igam(a,x)  =   -----     |   e   t   dt.
-     *                  -      | |
-     *                 | (a)    -
-     *                           0
-     *
-     *
-     * In this implementation both arguments must be positive.
-     * The integral is evaluated by either a power series or
-     * continued fraction expansion, depending on the relative
-     * values of a and x.
-     *
-     * ACCURACY (double):
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30       200000       3.6e-14     2.9e-15
-     *    IEEE      0,100      300000       9.9e-14     1.5e-14
-     *
-     *
-     * ACCURACY (float):
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30        20000       7.8e-6      5.9e-7
-     *
-     */
-    /*
-      Cephes Math Library Release 2.2: June, 1992
-      Copyright 1985, 1987, 1992 by Stephen L. Moshier
-      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
-    */
-
-    /* left tail of incomplete gamma function:
-     *
-     *          inf.      k
-     *   a  -x   -       x
-     *  x  e     >   ----------
-     *           -     -
-     *          k=0   | (a+k+1)
-     *
+     * Derivatives are implemented by forward-mode differentiation.
      */
     const Scalar zero = 0;
     const Scalar one = 1;
@@ -691,72 +1083,174 @@
       return nan;
     }
 
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
+    }
+
     if ((x > one) && (x > a)) {
-      /* The checks above ensure that we meet the preconditions for
-       * igammac_impl::Impl(), so call it, rather than igammac_impl::Run().
-       * Calling Run() would also work, but in that case the compiler may not be
-       * able to prove that igammac_impl::Run and igamma_impl::Run are not
-       * mutually recursive.  This leads to worse code, particularly on
-       * platforms like nvptx, where recursion is allowed only begrudgingly.
-       */
-      return (one - igammac_impl<Scalar>::Impl(a, x));
-    }
-
-    return Impl(a, x);
-  }
-
- private:
-  /* igammac_impl calls igamma_impl::Impl. */
-  friend struct igammac_impl<Scalar>;
-
-  /* Actually computes igam(a, x).
-   *
-   * Preconditions:
-   *   x > 0
-   *   a > 0
-   *   !(x > 1 && x > a)
-   */
-  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
-
-    Scalar ans, ax, c, r;
-
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (ax < -maxlog) {
-      // underflow
-      return zero;
-    }
-    ax = numext::exp(ax);
-
-    /* power series */
-    r = a;
-    c = one;
-    ans = one;
-
-    while (true) {
-      r += one;
-      c *= x / r;
-      ans += c;
-      if (c / ans <= machep) {
-        break;
+      Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
+      if (mode == VALUE) {
+        return one - ret;
+      } else {
+        return -ret;
       }
     }
 
-    return (ans * ax / a);
+    return igamma_series_impl<Scalar, mode>::run(a, x);
   }
 };
 
+#endif  // EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
+
+template <typename Scalar>
+struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
+  /* igam()
+   * Incomplete gamma integral.
+   *
+   * The CDF of Gamma(a, 1) random variable at the point x.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 1.26713e-05
+   * double: 2.33606e-12
+   *
+   * Cephes documentation below.
+   *
+   * SYNOPSIS:
+   *
+   * double a, x, y, igam();
+   *
+   * y = igam( a, x );
+   *
+   * DESCRIPTION:
+   *
+   * The function is defined by
+   *
+   *                           x
+   *                            -
+   *                   1       | |  -t  a-1
+   *  igam(a,x)  =   -----     |   e   t   dt.
+   *                  -      | |
+   *                 | (a)    -
+   *                           0
+   *
+   *
+   * In this implementation both arguments must be positive.
+   * The integral is evaluated by either a power series or
+   * continued fraction expansion, depending on the relative
+   * values of a and x.
+   *
+   * ACCURACY (double):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30       200000       3.6e-14     2.9e-15
+   *    IEEE      0,100      300000       9.9e-14     1.5e-14
+   *
+   *
+   * ACCURACY (float):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30        20000       7.8e-6      5.9e-7
+   *
+   */
+  /*
+    Cephes Math Library Release 2.2: June, 1992
+    Copyright 1985, 1987, 1992 by Stephen L. Moshier
+    Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+  */
+
+  /* left tail of incomplete gamma function:
+   *
+   *          inf.      k
+   *   a  -x   -       x
+   *  x  e     >   ----------
+   *           -     -
+   *          k=0   | (a+k+1)
+   *
+   */
+};
+
+template <typename Scalar>
+struct igamma_der_a_retval : igamma_retval<Scalar> {};
+
+template <typename Scalar>
+struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
+  /* Derivative of the incomplete Gamma function with respect to a.
+   *
+   * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 6.17992e-07
+   * double: 4.60453e-12
+   *
+   * Reference:
+   * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
+   * integral". Journal of the Royal Statistical Society. 1982
+   */
+};
+
+template <typename Scalar>
+struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
+
+template <typename Scalar>
+struct gamma_sample_der_alpha_impl
+    : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
+  /* Derivative of a Gamma random variable sample with respect to alpha.
+   *
+   * Consider a sample of a Gamma random variable with the concentration
+   * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
+   * derivative that we want to compute is dsample / dalpha =
+   * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
+   * However, this formula is numerically unstable and expensive, so instead
+   * we use implicit differentiation:
+   *
+   * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
+   * Apply d / dalpha to both sides:
+   * d igamma(alpha, sample) / dalpha
+   *     + d igamma(alpha, sample) / dsample * dsample/dalpha  = 0
+   * d igamma(alpha, sample) / dalpha
+   *     + Gamma(sample | alpha, 1) dsample / dalpha = 0
+   * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
+   *                   / Gamma(sample | alpha, 1)
+   *
+   * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
+   * (note that the derivative of the CDF w.r.t. sample is the PDF).
+   * See the reference below for more details.
+   *
+   * The derivative of igamma(alpha, sample) is computed by forward
+   * differentiation of the igamma code. Division by the Gamma PDF is performed
+   * in the same code, increasing the accuracy and speed due to cancellation
+   * of some terms.
+   *
+   * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
+   * points. The ground truth is computed by mpmath. Mean absolute error:
+   * float: 2.1686e-06
+   * double: 1.4774e-12
+   *
+   * Reference:
+   * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
+   * 2018
+   */
+};
+
 /*****************************************************************************
  * Implementation of Riemann zeta function of two arguments, based on Cephes *
  *****************************************************************************/
 
 template <typename Scalar>
 struct zeta_retval {
-  typedef Scalar type;
+    typedef Scalar type;
 };
 
 template <typename Scalar>
@@ -772,18 +1266,19 @@
 template <>
 struct zeta_impl_series<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s,
-                                      const float x, const float machep) {
+  static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) {
     int i = 0;
-    while (i < 9) {
-      i += 1;
-      a += 1.0f;
-      b = numext::pow(a, -x);
-      s += b;
-      if (numext::abs(b / s) < machep) return true;
+    while(i < 9)
+    {
+        i += 1;
+        a += 1.0f;
+        b = numext::pow( a, -x );
+        s += b;
+        if( numext::abs(b/s) < machep )
+            return true;
     }
 
-    // Return whether we are done
+    //Return whether we are done
     return false;
   }
 };
@@ -791,157 +1286,170 @@
 template <>
 struct zeta_impl_series<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s,
-                                      const double x, const double machep) {
+  static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) {
     int i = 0;
-    while ((i < 9) || (a <= 9.0)) {
-      i += 1;
-      a += 1.0;
-      b = numext::pow(a, -x);
-      s += b;
-      if (numext::abs(b / s) < machep) return true;
+    while( (i < 9) || (a <= 9.0) )
+    {
+        i += 1;
+        a += 1.0;
+        b = numext::pow( a, -x );
+        s += b;
+        if( numext::abs(b/s) < machep )
+            return true;
     }
 
-    // Return whether we are done
+    //Return whether we are done
     return false;
   }
 };
 
 template <typename Scalar>
 struct zeta_impl {
-  EIGEN_DEVICE_FUNC
-  static Scalar run(Scalar x, Scalar q) {
-    /*							zeta.c
-     *
-     *	Riemann zeta function of two arguments
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, q, y, zeta();
-     *
-     * y = zeta( x, q );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     *
-     *
-     *                 inf.
-     *                  -        -x
-     *   zeta(x,q)  =   >   (k+q)
-     *                  -
-     *                 k=0
-     *
-     * where x > 1 and q is not a negative integer or zero.
-     * The Euler-Maclaurin summation formula is used to obtain
-     * the expansion
-     *
-     *                n
-     *                -       -x
-     * zeta(x,q)  =   >  (k+q)
-     *                -
-     *               k=1
-     *
-     *           1-x                 inf.  B   x(x+1)...(x+2j)
-     *      (n+q)           1         -     2j
-     *  +  ---------  -  -------  +   >    --------------------
-     *        x-1              x      -                   x+2j+1
-     *                   2(n+q)      j=1       (2j)! (n+q)
-     *
-     * where the B2j are Bernoulli numbers.  Note that (see zetac.c)
-     * zeta(x,1) = zetac(x) + 1.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     * Relative error for single precision:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,25        10000       6.9e-7      1.0e-7
-     *
-     * Large arguments may produce underflow in powf(), in which
-     * case the results are inaccurate.
-     *
-     * REFERENCE:
-     *
-     * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
-     * Series, and Products, p. 1073; Academic Press, 1980.
-     *
-     */
+    EIGEN_DEVICE_FUNC
+    static Scalar run(Scalar x, Scalar q) {
+        /*							zeta.c
+         *
+         *	Riemann zeta function of two arguments
+         *
+         *
+         *
+         * SYNOPSIS:
+         *
+         * double x, q, y, zeta();
+         *
+         * y = zeta( x, q );
+         *
+         *
+         *
+         * DESCRIPTION:
+         *
+         *
+         *
+         *                 inf.
+         *                  -        -x
+         *   zeta(x,q)  =   >   (k+q)
+         *                  -
+         *                 k=0
+         *
+         * where x > 1 and q is not a negative integer or zero.
+         * The Euler-Maclaurin summation formula is used to obtain
+         * the expansion
+         *
+         *                n
+         *                -       -x
+         * zeta(x,q)  =   >  (k+q)
+         *                -
+         *               k=1
+         *
+         *           1-x                 inf.  B   x(x+1)...(x+2j)
+         *      (n+q)           1         -     2j
+         *  +  ---------  -  -------  +   >    --------------------
+         *        x-1              x      -                   x+2j+1
+         *                   2(n+q)      j=1       (2j)! (n+q)
+         *
+         * where the B2j are Bernoulli numbers.  Note that (see zetac.c)
+         * zeta(x,1) = zetac(x) + 1.
+         *
+         *
+         *
+         * ACCURACY:
+         *
+         * Relative error for single precision:
+         * arithmetic   domain     # trials      peak         rms
+         *    IEEE      0,25        10000       6.9e-7      1.0e-7
+         *
+         * Large arguments may produce underflow in powf(), in which
+         * case the results are inaccurate.
+         *
+         * REFERENCE:
+         *
+         * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
+         * Series, and Products, p. 1073; Academic Press, 1980.
+         *
+         */
 
-    int i;
-    Scalar p, r, a, b, k, s, t, w;
+        int i;
+        Scalar p, r, a, b, k, s, t, w;
 
-    const Scalar A[] = {
-        Scalar(12.0), Scalar(-720.0), Scalar(30240.0), Scalar(-1209600.0),
-        Scalar(47900160.0),
-        Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
-        Scalar(7.47242496e10),
-        Scalar(-2.950130727918164224e12),  /*1.067062284288e16/3617*/
-        Scalar(1.1646782814350067249e14),  /*5.109094217170944e18/43867*/
-        Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
-        Scalar(1.8152105401943546773e17),  /*1.5511210043330985984e23/854513*/
-        Scalar(
-            -7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/
-    };
+        const Scalar A[] = {
+            Scalar(12.0),
+            Scalar(-720.0),
+            Scalar(30240.0),
+            Scalar(-1209600.0),
+            Scalar(47900160.0),
+            Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
+            Scalar(7.47242496e10),
+            Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/
+            Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/
+            Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
+            Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/
+            Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/
+            };
 
-    const Scalar maxnum = NumTraits<Scalar>::infinity();
-    const Scalar zero = 0.0, half = 0.5, one = 1.0;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+        const Scalar maxnum = NumTraits<Scalar>::infinity();
+        const Scalar zero = 0.0, half = 0.5, one = 1.0;
+        const Scalar machep = cephes_helper<Scalar>::machep();
+        const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
-    if (x == one) return maxnum;
+        if( x == one )
+            return maxnum;
 
-    if (x < one) {
-      return nan;
-    }
+        if( x < one )
+        {
+            return nan;
+        }
 
-    if (q <= zero) {
-      if (q == numext::floor(q)) {
-        return maxnum;
-      }
-      p = x;
-      r = numext::floor(p);
-      if (p != r) return nan;
-    }
+        if( q <= zero )
+        {
+            if(q == numext::floor(q))
+            {
+                if (x == numext::floor(x) && long(x) % 2 == 0) {
+                    return maxnum;
+                }
+                else {
+                    return nan;
+                }
+            }
+            p = x;
+            r = numext::floor(p);
+            if (p != r)
+                return nan;
+        }
 
-    /* Permit negative q but continue sum until n+q > +9 .
-     * This case should be handled by a reflection formula.
-     * If q<0 and x is an integer, there is a relation to
-     * the polygamma function.
-     */
-    s = numext::pow(q, -x);
-    a = q;
-    b = zero;
-    // Run the summation in a helper function that is specific to the floating
-    // precision
-    if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
-      return s;
-    }
+        /* Permit negative q but continue sum until n+q > +9 .
+         * This case should be handled by a reflection formula.
+         * If q<0 and x is an integer, there is a relation to
+         * the polygamma function.
+         */
+        s = numext::pow( q, -x );
+        a = q;
+        b = zero;
+        // Run the summation in a helper function that is specific to the floating precision
+        if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
+            return s;
+        }
 
-    w = a;
-    s += b * w / (x - one);
-    s -= half * b;
-    a = one;
-    k = zero;
-    for (i = 0; i < 12; i++) {
-      a *= x + k;
-      b /= w;
-      t = a * b / A[i];
-      s = s + t;
-      t = numext::abs(t / s);
-      if (t < machep) {
-        break;
-      }
-      k += one;
-      a *= x + k;
-      b /= w;
-      k += one;
-    }
-    return s;
+        w = a;
+        s += b*w/(x-one);
+        s -= half * b;
+        a = one;
+        k = zero;
+        for( i=0; i<12; i++ )
+        {
+            a *= x + k;
+            b /= w;
+            t = a*b/A[i];
+            s = s + t;
+            t = numext::abs(t/s);
+            if( t < machep ) {
+              break;
+            }
+            k += one;
+            a *= x + k;
+            b /= w;
+            k += one;
+        }
+        return s;
   }
 };
 
@@ -951,37 +1459,51 @@
 
 template <typename Scalar>
 struct polygamma_retval {
-  typedef Scalar type;
+    typedef Scalar type;
 };
 
+#if !EIGEN_HAS_C99_MATH
+
 template <typename Scalar>
 struct polygamma_impl {
-  EIGEN_DEVICE_FUNC
-  static Scalar run(Scalar n, Scalar x) {
-    Scalar zero = 0.0, one = 1.0;
-    Scalar nplus = n + one;
-    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+    EIGEN_DEVICE_FUNC
+    static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) {
+        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                            THIS_TYPE_IS_NOT_SUPPORTED);
+        return Scalar(0);
+    }
+};
 
-    // Check that n is an integer
-    if (numext::floor(n) != n) {
-      return nan;
-    }
-    // Just return the digamma function for n = 1
-    else if (n == zero) {
-      return digamma_impl<Scalar>::run(x);
-    }
-    // Use the same implementation as scipy
-    else {
-      Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
-      return numext::pow(-one, nplus) * factorial *
-             zeta_impl<Scalar>::run(nplus, x);
-    }
+#else
+
+template <typename Scalar>
+struct polygamma_impl {
+    EIGEN_DEVICE_FUNC
+    static Scalar run(Scalar n, Scalar x) {
+        Scalar zero = 0.0, one = 1.0;
+        Scalar nplus = n + one;
+        const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+        // Check that n is a non-negative integer
+        if (numext::floor(n) != n || n < zero) {
+            return nan;
+        }
+        // Just return the digamma function for n = 0
+        else if (n == zero) {
+            return digamma_impl<Scalar>::run(x);
+        }
+        // Use the same implementation as scipy
+        else {
+            Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
+            return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x);
+        }
   }
 };
 
+#endif  // EIGEN_HAS_C99_MATH
+
 /************************************************************************************************
- * Implementation of betainc (incomplete beta integral), based on Cephes but
- *requires C++11/C99 *
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
  ************************************************************************************************/
 
 template <typename Scalar>
@@ -989,6 +1511,20 @@
   typedef Scalar type;
 };
 
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
 template <typename Scalar>
 struct betainc_impl {
   EIGEN_DEVICE_FUNC
@@ -1068,16 +1604,13 @@
   }
 };
 
-/* Continued fraction expansion #1 for incomplete beta integral (small_branch =
- * True)
- * Continued fraction expansion #2 for incomplete beta integral (small_branch =
- * False)
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
  */
 template <typename Scalar>
 struct incbeta_cfe {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x,
-                                        bool small_branch) {
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value ||
                          internal::is_same<Scalar, double>::value),
                         THIS_TYPE_IS_NOT_SUPPORTED);
@@ -1146,7 +1679,6 @@
       qkm2 = qkm1;
       qkm1 = qk;
 
-      EIGEN_DISABLE_FLOAT_EQUALITY_WARNING
       if (qk != zero) {
         r = pk / qk;
         if (numext::abs(ans - r) < numext::abs(r) * thresh) {
@@ -1154,7 +1686,6 @@
         }
         ans = r;
       }
-      EIGEN_ENABLE_FLOAT_EQUALITY_WARNING
 
       k1 += one;
       k2 += k26update;
@@ -1325,7 +1856,7 @@
     if ((a + b) < maxgam && numext::abs(u) < maxlog) {
       t = gamma(a + b) / (gamma(a) * gamma(b));
       s = s * t * pow(x, a);
-    } else {
+    }
     */
     t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
         lgamma_impl<double>::run(b) + u + numext::log(s);
@@ -1430,6 +1961,8 @@
   }
 };
 
+#endif  // EIGEN_HAS_C99_MATH
+
 }  // end namespace internal
 
 namespace numext {
@@ -1448,14 +1981,14 @@
 
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar)
-    zeta(const Scalar& x, const Scalar& q) {
-  return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
+zeta(const Scalar& x, const Scalar& q) {
+    return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
 }
 
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar)
-    polygamma(const Scalar& n, const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
+polygamma(const Scalar& n, const Scalar& x) {
+    return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
 }
 
 template <typename Scalar>
@@ -1471,12 +2004,30 @@
 }
 
 template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
+    ndtri(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
+}
+
+template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
     igamma(const Scalar& a, const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
 }
 
 template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar)
+    igamma_der_a(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
+    gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
     igammac(const Scalar& a, const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
@@ -1489,7 +2040,6 @@
 }
 
 }  // end namespace numext
-
 }  // end namespace Eigen
 
 #endif  // EIGEN_SPECIAL_FUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
index dec8d37..2bb0179 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h

@@ -15,82 +15,65 @@
 namespace internal {
 
 /** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-plgamma(const Packet &a) {
-  using numext::lgamma;
-  return lgamma(a);
-}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
 
 /** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-pdigamma(const Packet &a) {
-  using numext::digamma;
-  return digamma(a);
-}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
 
 /** \internal \returns the zeta function of two arguments (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-pzeta(const Packet& x, const Packet& q) {
-  using numext::zeta;
-  return zeta(x, q);
-}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
 
 /** \internal \returns the polygamma function (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-ppolygamma(const Packet& n, const Packet& x) {
-  using numext::polygamma;
-  return polygamma(n, x);
-}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
 
 /** \internal \returns the erf(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-perf(const Packet& a) {
-  using numext::erf;
-  return erf(a);
-}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 
 /** \internal \returns the erfc(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-perfc(const Packet& a) {
-  using numext::erfc;
-  return erfc(a);
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the ndtri(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pndtri(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
 }
 
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the derivative of the incomplete gamma function
+ * igamma_der_a(\a a, \a x) */
 template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma(const Packet& a,
-                                                     const Packet& x) {
-  using numext::igamma;
-  return igamma(a, x);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
+  using numext::igamma_der_a; return igamma_der_a(a, x);
 }
 
-/** \internal \returns the complementary incomplete gamma function igammac(\a a,
- * \a x) */
+/** \internal \returns compute the derivative of the sample
+  * of Gamma(alpha, 1) random variable with respect to the parameter a
+  * gamma_sample_der_alpha(\a alpha, \a sample) */
 template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigammac(const Packet& a,
-                                                      const Packet& x) {
-  using numext::igammac;
-  return igammac(a, x);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
+  using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample);
 }
 
-/** \internal \returns the complementary incomplete gamma function betainc(\a a,
- * \a b, \a x) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pbetainc(const Packet& a,
-                                                      const Packet& b,
-                                                      const Packet& x) {
-  using numext::betainc;
-  return betainc(a, b, x);
-}
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
 
-}  // end namespace internal
+/** \internal \returns the complementary incomplete gamma function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
 
-}  // end namespace Eigen
+} // end namespace internal
 
-#endif  // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
new file mode 100644
index 0000000..2d76692
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h

@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX_BESSELFUNCTIONS_H
+#define EIGEN_AVX_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_BESSELFUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
new file mode 100644
index 0000000..35e62a8
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h

@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H
+#define EIGEN_AVX_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, perf)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_SPECIAL_FUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
new file mode 100644
index 0000000..7dd3c3e
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h

@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H
+#define EIGEN_AVX512_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_BESSELFUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
new file mode 100644
index 0000000..79878f2
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h

@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H
+#define EIGEN_AVX512_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, perf)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_SPECIAL_FUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
new file mode 100644
index 0000000..dd3bf4d
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h

@@ -0,0 +1,369 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
+#define EIGEN_GPU_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+    using numext::zeta;
+    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+    using numext::zeta;
+    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+    using numext::polygamma;
+    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+    using numext::polygamma;
+    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pndtri<float4>(const float4& a)
+{
+  using numext::ndtri;
+  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pndtri<double2>(const double2& a)
+{
+  using numext::ndtri;
+  return make_double2(ndtri(a.x), ndtri(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(
+    const float4& a, const float4& x) {
+  using numext::igamma_der_a;
+  return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y),
+                     igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pigamma_der_a<double2>(const double2& a, const double2& x) {
+  using numext::igamma_der_a;
+  return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(
+    const float4& alpha, const float4& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_float4(
+      gamma_sample_der_alpha(alpha.x, sample.x),
+      gamma_sample_der_alpha(alpha.y, sample.y),
+      gamma_sample_der_alpha(alpha.z, sample.z),
+      gamma_sample_der_alpha(alpha.w, sample.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pgamma_sample_der_alpha<double2>(const double2& alpha, const double2& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_double2(
+      gamma_sample_der_alpha(alpha.x, sample.x),
+      gamma_sample_der_alpha(alpha.y, sample.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+  using numext::betainc;
+  return make_float4(
+      betainc(a.x, b.x, x.x),
+      betainc(a.y, b.y, x.y),
+      betainc(a.z, b.z, x.z),
+      betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e<float4>(const float4& x) {
+  using numext::bessel_i0e;
+  return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0e<double2>(const double2& x) {
+  using numext::bessel_i0e;
+  return make_double2(bessel_i0e(x.x), bessel_i0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0<float4>(const float4& x) {
+  using numext::bessel_i0;
+  return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0<double2>(const double2& x) {
+  using numext::bessel_i0;
+  return make_double2(bessel_i0(x.x), bessel_i0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e<float4>(const float4& x) {
+  using numext::bessel_i1e;
+  return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1e<double2>(const double2& x) {
+  using numext::bessel_i1e;
+  return make_double2(bessel_i1e(x.x), bessel_i1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1<float4>(const float4& x) {
+  using numext::bessel_i1;
+  return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1<double2>(const double2& x) {
+  using numext::bessel_i1;
+  return make_double2(bessel_i1(x.x), bessel_i1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e<float4>(const float4& x) {
+  using numext::bessel_k0e;
+  return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0e<double2>(const double2& x) {
+  using numext::bessel_k0e;
+  return make_double2(bessel_k0e(x.x), bessel_k0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0<float4>(const float4& x) {
+  using numext::bessel_k0;
+  return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0<double2>(const double2& x) {
+  using numext::bessel_k0;
+  return make_double2(bessel_k0(x.x), bessel_k0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e<float4>(const float4& x) {
+  using numext::bessel_k1e;
+  return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1e<double2>(const double2& x) {
+  using numext::bessel_k1e;
+  return make_double2(bessel_k1e(x.x), bessel_k1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1<float4>(const float4& x) {
+  using numext::bessel_k1;
+  return make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1<double2>(const double2& x) {
+  using numext::bessel_k1;
+  return make_double2(bessel_k1(x.x), bessel_k1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0<float4>(const float4& x) {
+  using numext::bessel_j0;
+  return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j0<double2>(const double2& x) {
+  using numext::bessel_j0;
+  return make_double2(bessel_j0(x.x), bessel_j0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1<float4>(const float4& x) {
+  using numext::bessel_j1;
+  return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j1<double2>(const double2& x) {
+  using numext::bessel_j1;
+  return make_double2(bessel_j1(x.x), bessel_j1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0<float4>(const float4& x) {
+  using numext::bessel_y0;
+  return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y0<double2>(const double2& x) {
+  using numext::bessel_y0;
+  return make_double2(bessel_y0(x.x), bessel_y0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1<float4>(const float4& x) {
+  using numext::bessel_y1;
+  return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y1<double2>(const double2& x) {
+  using numext::bessel_y1;
+  return make_double2(bessel_y1(x.x), bessel_y1(x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GPU_SPECIALFUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
new file mode 100644
index 0000000..67433b0
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h

@@ -0,0 +1,54 @@
+#ifndef EIGEN_NEON_BESSELFUNCTIONS_H
+#define EIGEN_NEON_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                            \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) {                       \
+  const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));  \
+  const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));              \
+}                                                                       \
+                                                                        \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) {                       \
+  return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));               \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_BESSELFUNCTIONS_H

diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
new file mode 100644
index 0000000..ec92951
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h

@@ -0,0 +1,34 @@
+#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H
+#define EIGEN_NEON_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                            \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) {                       \
+  const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));  \
+  const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));              \
+}                                                                       \
+                                                                        \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) {                       \
+  return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));               \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(perf)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_SPECIALFUNCTIONS_H

diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
new file mode 100644
index 0000000..79edd52
--- /dev/null
+++ b/unsupported/Eigen/src/Splines/Spline.h

@@ -0,0 +1,507 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINE_H
+#define EIGEN_SPLINE_H
+
+#include "SplineFwd.h"
+
+namespace Eigen
+{
+    /**
+     * \ingroup Splines_Module
+     * \class Spline
+     * \brief A class representing multi-dimensional spline curves.
+     *
+     * The class represents B-splines with non-uniform knot vectors. Each control
+     * point of the B-spline is associated with a basis function
+     * \f{align*}
+     *   C(u) & = \sum_{i=0}^{n}N_{i,p}(u)P_i
+     * \f}
+     *
+     * \tparam _Scalar The underlying data type (typically float or double)
+     * \tparam _Dim The curve dimension (e.g. 2 or 3)
+     * \tparam _Degree Per default set to Dynamic; could be set to the actual desired
+     *                degree for optimization purposes (would result in stack allocation
+     *                of several temporary variables).
+     **/
+  template <typename _Scalar, int _Dim, int _Degree>
+  class Spline
+  {
+  public:
+    typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
+    enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
+    enum { Degree = _Degree /*!< The spline curve's degree. */ };
+
+    /** \brief The point type the spline is representing. */
+    typedef typename SplineTraits<Spline>::PointType PointType;
+    
+    /** \brief The data type used to store knot vectors. */
+    typedef typename SplineTraits<Spline>::KnotVectorType KnotVectorType;
+
+    /** \brief The data type used to store parameter vectors. */
+    typedef typename SplineTraits<Spline>::ParameterVectorType ParameterVectorType;
+    
+    /** \brief The data type used to store non-zero basis functions. */
+    typedef typename SplineTraits<Spline>::BasisVectorType BasisVectorType;
+
+    /** \brief The data type used to store the values of the basis function derivatives. */
+    typedef typename SplineTraits<Spline>::BasisDerivativeType BasisDerivativeType;
+    
+    /** \brief The data type representing the spline's control points. */
+    typedef typename SplineTraits<Spline>::ControlPointVectorType ControlPointVectorType;
+    
+    /**
+    * \brief Creates a (constant) zero spline.
+    * For Splines with dynamic degree, the resulting degree will be 0.
+    **/
+    Spline() 
+    : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2))
+    , m_ctrls(ControlPointVectorType::Zero(Dimension,(Degree==Dynamic ? 1 : Degree+1))) 
+    {
+      // in theory this code can go to the initializer list but it will get pretty
+      // much unreadable ...
+      enum { MinDegree = (Degree==Dynamic ? 0 : Degree) };
+      m_knots.template segment<MinDegree+1>(0) = Array<Scalar,1,MinDegree+1>::Zero();
+      m_knots.template segment<MinDegree+1>(MinDegree+1) = Array<Scalar,1,MinDegree+1>::Ones();
+    }
+
+    /**
+    * \brief Creates a spline from a knot vector and control points.
+    * \param knots The spline's knot vector.
+    * \param ctrls The spline's control point vector.
+    **/
+    template <typename OtherVectorType, typename OtherArrayType>
+    Spline(const OtherVectorType& knots, const OtherArrayType& ctrls) : m_knots(knots), m_ctrls(ctrls) {}
+
+    /**
+    * \brief Copy constructor for splines.
+    * \param spline The input spline.
+    **/
+    template <int OtherDegree>
+    Spline(const Spline<Scalar, Dimension, OtherDegree>& spline) : 
+    m_knots(spline.knots()), m_ctrls(spline.ctrls()) {}
+
+    /**
+     * \brief Returns the knots of the underlying spline.
+     **/
+    const KnotVectorType& knots() const { return m_knots; }
+    
+    /**
+     * \brief Returns the ctrls of the underlying spline.
+     **/    
+    const ControlPointVectorType& ctrls() const { return m_ctrls; }
+
+    /**
+     * \brief Returns the spline value at a given site \f$u\f$.
+     *
+     * The function returns
+     * \f{align*}
+     *   C(u) & = \sum_{i=0}^{n}N_{i,p}P_i
+     * \f}
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the spline is evaluated.
+     * \return The spline value at the given location \f$u\f$.
+     **/
+    PointType operator()(Scalar u) const;
+
+    /**
+     * \brief Evaluation of spline derivatives of up-to given order.
+     *
+     * The function returns
+     * \f{align*}
+     *   \frac{d^i}{du^i}C(u) & = \sum_{i=0}^{n} \frac{d^i}{du^i} N_{i,p}(u)P_i
+     * \f}
+     * for i ranging between 0 and order.
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the spline derivative is evaluated.
+     * \param order The order up to which the derivatives are computed.
+     **/
+    typename SplineTraits<Spline>::DerivativeType
+      derivatives(Scalar u, DenseIndex order) const;
+
+    /**
+     * \copydoc Spline::derivatives
+     * Using the template version of this function is more efficieent since
+     * temporary objects are allocated on the stack whenever this is possible.
+     **/    
+    template <int DerivativeOrder>
+    typename SplineTraits<Spline,DerivativeOrder>::DerivativeType
+      derivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
+
+    /**
+     * \brief Computes the non-zero basis functions at the given site.
+     *
+     * Splines have local support and a point from their image is defined
+     * by exactly \f$p+1\f$ control points \f$P_i\f$ where \f$p\f$ is the
+     * spline degree.
+     *
+     * This function computes the \f$p+1\f$ non-zero basis function values
+     * for a given parameter value \f$u\f$. It returns
+     * \f{align*}{
+     *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+     * \f}
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis functions 
+     *          are computed.
+     **/
+    typename SplineTraits<Spline>::BasisVectorType
+      basisFunctions(Scalar u) const;
+
+    /**
+     * \brief Computes the non-zero spline basis function derivatives up to given order.
+     *
+     * The function computes
+     * \f{align*}{
+     *   \frac{d^i}{du^i} N_{i,p}(u), \hdots, \frac{d^i}{du^i} N_{i+p+1,p}(u)
+     * \f}
+     * with i ranging from 0 up to the specified order.
+     *
+     * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis function
+     *          derivatives are computed.
+     * \param order The order up to which the basis function derivatives are computes.
+     **/
+    typename SplineTraits<Spline>::BasisDerivativeType
+      basisFunctionDerivatives(Scalar u, DenseIndex order) const;
+
+    /**
+     * \copydoc Spline::basisFunctionDerivatives
+     * Using the template version of this function is more efficieent since
+     * temporary objects are allocated on the stack whenever this is possible.
+     **/    
+    template <int DerivativeOrder>
+    typename SplineTraits<Spline,DerivativeOrder>::BasisDerivativeType
+      basisFunctionDerivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
+
+    /**
+     * \brief Returns the spline degree.
+     **/ 
+    DenseIndex degree() const;
+
+    /** 
+     * \brief Returns the span within the knot vector in which u is falling.
+     * \param u The site for which the span is determined.
+     **/
+    DenseIndex span(Scalar u) const;
+
+    /**
+     * \brief Computes the span within the provided knot vector in which u is falling.
+     **/
+    static DenseIndex Span(typename SplineTraits<Spline>::Scalar u, DenseIndex degree, const typename SplineTraits<Spline>::KnotVectorType& knots);
+    
+    /**
+     * \brief Returns the spline's non-zero basis functions.
+     *
+     * The function computes and returns
+     * \f{align*}{
+     *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+     * \f}
+     *
+     * \param u The site at which the basis functions are computed.
+     * \param degree The degree of the underlying spline.
+     * \param knots The underlying spline's knot vector.
+     **/
+    static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots);
+
+    /**
+     * \copydoc Spline::basisFunctionDerivatives
+     * \param degree The degree of the underlying spline
+     * \param knots The underlying spline's knot vector.
+     **/    
+    static BasisDerivativeType BasisFunctionDerivatives(
+      const Scalar u, const DenseIndex order, const DenseIndex degree, const KnotVectorType& knots);
+
+  private:
+    KnotVectorType m_knots; /*!< Knot vector. */
+    ControlPointVectorType  m_ctrls; /*!< Control points. */
+
+    template <typename DerivativeType>
+    static void BasisFunctionDerivativesImpl(
+      const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+      const DenseIndex order,
+      const DenseIndex p, 
+      const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U,
+      DerivativeType& N_);
+  };
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  DenseIndex Spline<_Scalar, _Dim, _Degree>::Span(
+    typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::Scalar u,
+    DenseIndex degree,
+    const typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::KnotVectorType& knots)
+  {
+    // Piegl & Tiller, "The NURBS Book", A2.1 (p. 68)
+    if (u <= knots(0)) return degree;
+    const Scalar* pos = std::upper_bound(knots.data()+degree-1, knots.data()+knots.size()-degree-1, u);
+    return static_cast<DenseIndex>( std::distance(knots.data(), pos) - 1 );
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType
+    Spline<_Scalar, _Dim, _Degree>::BasisFunctions(
+    typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+    DenseIndex degree,
+    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
+  {
+    const DenseIndex p = degree;
+    const DenseIndex i = Spline::Span(u, degree, knots);
+
+    const KnotVectorType& U = knots;
+
+    BasisVectorType left(p+1); left(0) = Scalar(0);
+    BasisVectorType right(p+1); right(0) = Scalar(0);
+
+    VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
+    VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
+
+    BasisVectorType N(1,p+1);
+    N(0) = Scalar(1);
+    for (DenseIndex j=1; j<=p; ++j)
+    {
+      Scalar saved = Scalar(0);
+      for (DenseIndex r=0; r<j; r++)
+      {
+        const Scalar tmp = N(r)/(right(r+1)+left(j-r));
+        N[r] = saved + right(r+1)*tmp;
+        saved = left(j-r)*tmp;
+      }
+      N(j) = saved;
+    }
+    return N;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  DenseIndex Spline<_Scalar, _Dim, _Degree>::degree() const
+  {
+    if (_Degree == Dynamic)
+      return m_knots.size() - m_ctrls.cols() - 1;
+    else
+      return _Degree;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  DenseIndex Spline<_Scalar, _Dim, _Degree>::span(Scalar u) const
+  {
+    return Spline::Span(u, degree(), knots());
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename Spline<_Scalar, _Dim, _Degree>::PointType Spline<_Scalar, _Dim, _Degree>::operator()(Scalar u) const
+  {
+    enum { Order = SplineTraits<Spline>::OrderAtCompileTime };
+
+    const DenseIndex span = this->span(u);
+    const DenseIndex p = degree();
+    const BasisVectorType basis_funcs = basisFunctions(u);
+
+    const Replicate<BasisVectorType,Dimension,1> ctrl_weights(basis_funcs);
+    const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(ctrls(),0,span-p,Dimension,p+1);
+    return (ctrl_weights * ctrl_pts).rowwise().sum();
+  }
+
+  /* --------------------------------------------------------------------------------------------- */
+
+  template <typename SplineType, typename DerivativeType>
+  void derivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& der)
+  {    
+    enum { Dimension = SplineTraits<SplineType>::Dimension };
+    enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+    enum { DerivativeOrder = DerivativeType::ColsAtCompileTime };
+
+    typedef typename SplineTraits<SplineType>::ControlPointVectorType ControlPointVectorType;
+    typedef typename SplineTraits<SplineType,DerivativeOrder>::BasisDerivativeType BasisDerivativeType;
+    typedef typename BasisDerivativeType::ConstRowXpr BasisDerivativeRowXpr;    
+
+    const DenseIndex p = spline.degree();
+    const DenseIndex span = spline.span(u);
+
+    const DenseIndex n = (std::min)(p, order);
+
+    der.resize(Dimension,n+1);
+
+    // Retrieve the basis function derivatives up to the desired order...    
+    const BasisDerivativeType basis_func_ders = spline.template basisFunctionDerivatives<DerivativeOrder>(u, n+1);
+
+    // ... and perform the linear combinations of the control points.
+    for (DenseIndex der_order=0; der_order<n+1; ++der_order)
+    {
+      const Replicate<BasisDerivativeRowXpr,Dimension,1> ctrl_weights( basis_func_ders.row(der_order) );
+      const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(spline.ctrls(),0,span-p,Dimension,p+1);
+      der.col(der_order) = (ctrl_weights * ctrl_pts).rowwise().sum();
+    }
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::DerivativeType
+    Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline >::DerivativeType res;
+    derivativesImpl(*this, u, order, res);
+    return res;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  template <int DerivativeOrder>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::DerivativeType
+    Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline, DerivativeOrder >::DerivativeType res;
+    derivativesImpl(*this, u, order, res);
+    return res;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisVectorType
+    Spline<_Scalar, _Dim, _Degree>::basisFunctions(Scalar u) const
+  {
+    return Spline::BasisFunctions(u, degree(), knots());
+  }
+
+  /* --------------------------------------------------------------------------------------------- */
+  
+  
+  template <typename _Scalar, int _Dim, int _Degree>
+  template <typename DerivativeType>
+  void Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivativesImpl(
+    const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+    const DenseIndex order,
+    const DenseIndex p, 
+    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U,
+    DerivativeType& N_)
+  {
+    typedef Spline<_Scalar, _Dim, _Degree> SplineType;
+    enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+
+    const DenseIndex span = SplineType::Span(u, p, U);
+
+    const DenseIndex n = (std::min)(p, order);
+
+    N_.resize(n+1, p+1);
+
+    BasisVectorType left = BasisVectorType::Zero(p+1);
+    BasisVectorType right = BasisVectorType::Zero(p+1);
+
+    Matrix<Scalar,Order,Order> ndu(p+1,p+1);
+
+    Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that?
+
+    ndu(0,0) = 1.0;
+
+    DenseIndex j;
+    for (j=1; j<=p; ++j)
+    {
+      left[j] = u-U[span+1-j];
+      right[j] = U[span+j]-u;
+      saved = 0.0;
+
+      for (DenseIndex r=0; r<j; ++r)
+      {
+        /* Lower triangle */
+        ndu(j,r) = right[r+1]+left[j-r];
+        temp = ndu(r,j-1)/ndu(j,r);
+        /* Upper triangle */
+        ndu(r,j) = static_cast<Scalar>(saved+right[r+1] * temp);
+        saved = left[j-r] * temp;
+      }
+
+      ndu(j,j) = static_cast<Scalar>(saved);
+    }
+
+    for (j = p; j>=0; --j) 
+      N_(0,j) = ndu(j,p);
+
+    // Compute the derivatives
+    DerivativeType a(n+1,p+1);
+    DenseIndex r=0;
+    for (; r<=p; ++r)
+    {
+      DenseIndex s1,s2;
+      s1 = 0; s2 = 1; // alternate rows in array a
+      a(0,0) = 1.0;
+
+      // Compute the k-th derivative
+      for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
+      {
+        Scalar d = 0.0;
+        DenseIndex rk,pk,j1,j2;
+        rk = r-k; pk = p-k;
+
+        if (r>=k)
+        {
+          a(s2,0) = a(s1,0)/ndu(pk+1,rk);
+          d = a(s2,0)*ndu(rk,pk);
+        }
+
+        if (rk>=-1) j1 = 1;
+        else        j1 = -rk;
+
+        if (r-1 <= pk) j2 = k-1;
+        else           j2 = p-r;
+
+        for (j=j1; j<=j2; ++j)
+        {
+          a(s2,j) = (a(s1,j)-a(s1,j-1))/ndu(pk+1,rk+j);
+          d += a(s2,j)*ndu(rk+j,pk);
+        }
+
+        if (r<=pk)
+        {
+          a(s2,k) = -a(s1,k-1)/ndu(pk+1,r);
+          d += a(s2,k)*ndu(r,pk);
+        }
+
+        N_(k,r) = static_cast<Scalar>(d);
+        j = s1; s1 = s2; s2 = j; // Switch rows
+      }
+    }
+
+    /* Multiply through by the correct factors */
+    /* (Eq. [2.9])                             */
+    r = p;
+    for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
+    {
+      for (j=p; j>=0; --j) N_(k,j) *= r;
+      r *= p-k;
+    }
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
+    Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits<Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType der;
+    BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+    return der;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  template <int DerivativeOrder>
+  typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType
+    Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
+  {
+    typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType der;
+    BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+    return der;
+  }
+
+  template <typename _Scalar, int _Dim, int _Degree>
+  typename SplineTraits<Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
+  Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivatives(
+    const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+    const DenseIndex order,
+    const DenseIndex degree,
+    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
+  {
+    typename SplineTraits<Spline>::BasisDerivativeType der;
+    BasisFunctionDerivativesImpl(u, order, degree, knots, der);
+    return der;
+  }
+}
+
+#endif // EIGEN_SPLINE_H

diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
new file mode 100644
index 0000000..9f6e8af
--- /dev/null
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h

@@ -0,0 +1,431 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINE_FITTING_H
+#define EIGEN_SPLINE_FITTING_H
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <vector>
+
+#include "SplineFwd.h"
+
+#include "../../../../Eigen/LU"
+#include "../../../../Eigen/QR"
+
+namespace Eigen
+{
+  /**
+   * \brief Computes knot averages.
+   * \ingroup Splines_Module
+   *
+   * The knots are computed as
+   * \f{align*}
+   *  u_0 & = \hdots = u_p = 0 \\
+   *  u_{m-p} & = \hdots = u_{m} = 1 \\
+   *  u_{j+p} & = \frac{1}{p}\sum_{i=j}^{j+p-1}\bar{u}_i \quad\quad j=1,\hdots,n-p
+   * \f}
+   * where \f$p\f$ is the degree and \f$m+1\f$ the number knots
+   * of the desired interpolating spline.
+   *
+   * \param[in] parameters The input parameters. During interpolation one for each data point.
+   * \param[in] degree The spline degree which is used during the interpolation.
+   * \param[out] knots The output knot vector.
+   *
+   * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+   **/
+  template <typename KnotVectorType>
+  void KnotAveraging(const KnotVectorType& parameters, DenseIndex degree, KnotVectorType& knots)
+  {
+    knots.resize(parameters.size()+degree+1);      
+
+    for (DenseIndex j=1; j<parameters.size()-degree; ++j)
+      knots(j+degree) = parameters.segment(j,degree).mean();
+
+    knots.segment(0,degree+1) = KnotVectorType::Zero(degree+1);
+    knots.segment(knots.size()-degree-1,degree+1) = KnotVectorType::Ones(degree+1);
+  }
+
+  /**
+   * \brief Computes knot averages when derivative constraints are present.
+   * Note that this is a technical interpretation of the referenced article
+   * since the algorithm contained therein is incorrect as written.
+   * \ingroup Splines_Module
+   *
+   * \param[in] parameters The parameters at which the interpolation B-Spline
+   *            will intersect the given interpolation points. The parameters
+   *            are assumed to be a non-decreasing sequence.
+   * \param[in] degree The degree of the interpolating B-Spline. This must be
+   *            greater than zero.
+   * \param[in] derivativeIndices The indices corresponding to parameters at
+   *            which there are derivative constraints. The indices are assumed
+   *            to be a non-decreasing sequence.
+   * \param[out] knots The calculated knot vector. These will be returned as a
+   *             non-decreasing sequence
+   *
+   * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+   * Curve interpolation with directional constraints for engineering design. 
+   * Engineering with Computers
+   **/
+  template <typename KnotVectorType, typename ParameterVectorType, typename IndexArray>
+  void KnotAveragingWithDerivatives(const ParameterVectorType& parameters,
+                                    const unsigned int degree,
+                                    const IndexArray& derivativeIndices,
+                                    KnotVectorType& knots)
+  {
+    typedef typename ParameterVectorType::Scalar Scalar;
+
+    DenseIndex numParameters = parameters.size();
+    DenseIndex numDerivatives = derivativeIndices.size();
+
+    if (numDerivatives < 1)
+    {
+      KnotAveraging(parameters, degree, knots);
+      return;
+    }
+
+    DenseIndex startIndex;
+    DenseIndex endIndex;
+  
+    DenseIndex numInternalDerivatives = numDerivatives;
+    
+    if (derivativeIndices[0] == 0)
+    {
+      startIndex = 0;
+      --numInternalDerivatives;
+    }
+    else
+    {
+      startIndex = 1;
+    }
+    if (derivativeIndices[numDerivatives - 1] == numParameters - 1)
+    {
+      endIndex = numParameters - degree;
+      --numInternalDerivatives;
+    }
+    else
+    {
+      endIndex = numParameters - degree - 1;
+    }
+
+    // There are (endIndex - startIndex + 1) knots obtained from the averaging
+    // and 2 for the first and last parameters.
+    DenseIndex numAverageKnots = endIndex - startIndex + 3;
+    KnotVectorType averageKnots(numAverageKnots);
+    averageKnots[0] = parameters[0];
+
+    int newKnotIndex = 0;
+    for (DenseIndex i = startIndex; i <= endIndex; ++i)
+      averageKnots[++newKnotIndex] = parameters.segment(i, degree).mean();
+    averageKnots[++newKnotIndex] = parameters[numParameters - 1];
+
+    newKnotIndex = -1;
+  
+    ParameterVectorType temporaryParameters(numParameters + 1);
+    KnotVectorType derivativeKnots(numInternalDerivatives);
+    for (DenseIndex i = 0; i < numAverageKnots - 1; ++i)
+    {
+      temporaryParameters[0] = averageKnots[i];
+      ParameterVectorType parameterIndices(numParameters);
+      int temporaryParameterIndex = 1;
+      for (DenseIndex j = 0; j < numParameters; ++j)
+      {
+        Scalar parameter = parameters[j];
+        if (parameter >= averageKnots[i] && parameter < averageKnots[i + 1])
+        {
+          parameterIndices[temporaryParameterIndex] = j;
+          temporaryParameters[temporaryParameterIndex++] = parameter;
+        }
+      }
+      temporaryParameters[temporaryParameterIndex] = averageKnots[i + 1];
+
+      for (int j = 0; j <= temporaryParameterIndex - 2; ++j)
+      {
+        for (DenseIndex k = 0; k < derivativeIndices.size(); ++k)
+        {
+          if (parameterIndices[j + 1] == derivativeIndices[k]
+              && parameterIndices[j + 1] != 0
+              && parameterIndices[j + 1] != numParameters - 1)
+          {
+            derivativeKnots[++newKnotIndex] = temporaryParameters.segment(j, 3).mean();
+            break;
+          }
+        }
+      }
+    }
+    
+    KnotVectorType temporaryKnots(averageKnots.size() + derivativeKnots.size());
+
+    std::merge(averageKnots.data(), averageKnots.data() + averageKnots.size(),
+               derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(),
+               temporaryKnots.data());
+
+    // Number of knots (one for each point and derivative) plus spline order.
+    DenseIndex numKnots = numParameters + numDerivatives + degree + 1;
+    knots.resize(numKnots);
+
+    knots.head(degree).fill(temporaryKnots[0]);
+    knots.tail(degree).fill(temporaryKnots.template tail<1>()[0]);
+    knots.segment(degree, temporaryKnots.size()) = temporaryKnots;
+  }
+
+  /**
+   * \brief Computes chord length parameters which are required for spline interpolation.
+   * \ingroup Splines_Module
+   *
+   * \param[in] pts The data points to which a spline should be fit.
+   * \param[out] chord_lengths The resulting chord length vector.
+   *
+   * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+   **/   
+  template <typename PointArrayType, typename KnotVectorType>
+  void ChordLengths(const PointArrayType& pts, KnotVectorType& chord_lengths)
+  {
+    typedef typename KnotVectorType::Scalar Scalar;
+
+    const DenseIndex n = pts.cols();
+
+    // 1. compute the column-wise norms
+    chord_lengths.resize(pts.cols());
+    chord_lengths[0] = 0;
+    chord_lengths.rightCols(n-1) = (pts.array().leftCols(n-1) - pts.array().rightCols(n-1)).matrix().colwise().norm();
+
+    // 2. compute the partial sums
+    std::partial_sum(chord_lengths.data(), chord_lengths.data()+n, chord_lengths.data());
+
+    // 3. normalize the data
+    chord_lengths /= chord_lengths(n-1);
+    chord_lengths(n-1) = Scalar(1);
+  }
+
+  /**
+   * \brief Spline fitting methods.
+   * \ingroup Splines_Module
+   **/     
+  template <typename SplineType>
+  struct SplineFitting
+  {
+    typedef typename SplineType::KnotVectorType KnotVectorType;
+    typedef typename SplineType::ParameterVectorType ParameterVectorType;
+
+    /**
+     * \brief Fits an interpolating Spline to the given data points.
+     *
+     * \param pts The points for which an interpolating spline will be computed.
+     * \param degree The degree of the interpolating spline.
+     *
+     * \returns A spline interpolating the initially provided points.
+     **/
+    template <typename PointArrayType>
+    static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree);
+
+    /**
+     * \brief Fits an interpolating Spline to the given data points.
+     *
+     * \param pts The points for which an interpolating spline will be computed.
+     * \param degree The degree of the interpolating spline.
+     * \param knot_parameters The knot parameters for the interpolation.
+     *
+     * \returns A spline interpolating the initially provided points.
+     **/
+    template <typename PointArrayType>
+    static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters);
+
+    /**
+     * \brief Fits an interpolating spline to the given data points and
+     * derivatives.
+     * 
+     * \param points The points for which an interpolating spline will be computed.
+     * \param derivatives The desired derivatives of the interpolating spline at interpolation
+     *                    points.
+     * \param derivativeIndices An array indicating which point each derivative belongs to. This
+     *                          must be the same size as @a derivatives.
+     * \param degree The degree of the interpolating spline.
+     *
+     * \returns A spline interpolating @a points with @a derivatives at those points.
+     *
+     * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+     * Curve interpolation with directional constraints for engineering design. 
+     * Engineering with Computers
+     **/
+    template <typename PointArrayType, typename IndexArray>
+    static SplineType InterpolateWithDerivatives(const PointArrayType& points,
+                                                 const PointArrayType& derivatives,
+                                                 const IndexArray& derivativeIndices,
+                                                 const unsigned int degree);
+
+    /**
+     * \brief Fits an interpolating spline to the given data points and derivatives.
+     * 
+     * \param points The points for which an interpolating spline will be computed.
+     * \param derivatives The desired derivatives of the interpolating spline at interpolation points.
+     * \param derivativeIndices An array indicating which point each derivative belongs to. This
+     *                          must be the same size as @a derivatives.
+     * \param degree The degree of the interpolating spline.
+     * \param parameters The parameters corresponding to the interpolation points.
+     *
+     * \returns A spline interpolating @a points with @a derivatives at those points.
+     *
+     * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+     * Curve interpolation with directional constraints for engineering design. 
+     * Engineering with Computers
+     */
+    template <typename PointArrayType, typename IndexArray>
+    static SplineType InterpolateWithDerivatives(const PointArrayType& points,
+                                                 const PointArrayType& derivatives,
+                                                 const IndexArray& derivativeIndices,
+                                                 const unsigned int degree,
+                                                 const ParameterVectorType& parameters);
+  };
+
+  template <typename SplineType>
+  template <typename PointArrayType>
+  SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters)
+  {
+    typedef typename SplineType::KnotVectorType::Scalar Scalar;      
+    typedef typename SplineType::ControlPointVectorType ControlPointVectorType;      
+
+    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+    KnotVectorType knots;
+    KnotAveraging(knot_parameters, degree, knots);
+
+    DenseIndex n = pts.cols();
+    MatrixType A = MatrixType::Zero(n,n);
+    for (DenseIndex i=1; i<n-1; ++i)
+    {
+      const DenseIndex span = SplineType::Span(knot_parameters[i], degree, knots);
+
+      // The segment call should somehow be told the spline order at compile time.
+      A.row(i).segment(span-degree, degree+1) = SplineType::BasisFunctions(knot_parameters[i], degree, knots);
+    }
+    A(0,0) = 1.0;
+    A(n-1,n-1) = 1.0;
+
+    HouseholderQR<MatrixType> qr(A);
+
+    // Here, we are creating a temporary due to an Eigen issue.
+    ControlPointVectorType ctrls = qr.solve(MatrixType(pts.transpose())).transpose();
+
+    return SplineType(knots, ctrls);
+  }
+
+  template <typename SplineType>
+  template <typename PointArrayType>
+  SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree)
+  {
+    KnotVectorType chord_lengths; // knot parameters
+    ChordLengths(pts, chord_lengths);
+    return Interpolate(pts, degree, chord_lengths);
+  }
+  
+  template <typename SplineType>
+  template <typename PointArrayType, typename IndexArray>
+  SplineType 
+  SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+                                                        const PointArrayType& derivatives,
+                                                        const IndexArray& derivativeIndices,
+                                                        const unsigned int degree,
+                                                        const ParameterVectorType& parameters)
+  {
+    typedef typename SplineType::KnotVectorType::Scalar Scalar;      
+    typedef typename SplineType::ControlPointVectorType ControlPointVectorType;
+
+    typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
+
+    const DenseIndex n = points.cols() + derivatives.cols();
+    
+    KnotVectorType knots;
+
+    KnotAveragingWithDerivatives(parameters, degree, derivativeIndices, knots);
+    
+    // fill matrix
+    MatrixType A = MatrixType::Zero(n, n);
+
+    // Use these dimensions for quicker populating, then transpose for solving.
+    MatrixType b(points.rows(), n);
+
+    DenseIndex startRow;
+    DenseIndex derivativeStart;
+
+    // End derivatives.
+    if (derivativeIndices[0] == 0)
+    {
+      A.template block<1, 2>(1, 0) << -1, 1;
+      
+      Scalar y = (knots(degree + 1) - knots(0)) / degree;
+      b.col(1) = y*derivatives.col(0);
+      
+      startRow = 2;
+      derivativeStart = 1;
+    }
+    else
+    {
+      startRow = 1;
+      derivativeStart = 0;
+    }
+    if (derivativeIndices[derivatives.cols() - 1] == points.cols() - 1)
+    {
+      A.template block<1, 2>(n - 2, n - 2) << -1, 1;
+
+      Scalar y = (knots(knots.size() - 1) - knots(knots.size() - (degree + 2))) / degree;
+      b.col(b.cols() - 2) = y*derivatives.col(derivatives.cols() - 1);
+    }
+    
+    DenseIndex row = startRow;
+    DenseIndex derivativeIndex = derivativeStart;
+    for (DenseIndex i = 1; i < parameters.size() - 1; ++i)
+    {
+      const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
+
+      if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i)
+      {
+        A.block(row, span - degree, 2, degree + 1)
+          = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
+
+        b.col(row++) = points.col(i);
+        b.col(row++) = derivatives.col(derivativeIndex++);
+      }
+      else
+      {
+        A.row(row).segment(span - degree, degree + 1)
+          = SplineType::BasisFunctions(parameters[i], degree, knots);
+        b.col(row++) = points.col(i);
+      }
+    }
+    b.col(0) = points.col(0);
+    b.col(b.cols() - 1) = points.col(points.cols() - 1);
+    A(0,0) = 1;
+    A(n - 1, n - 1) = 1;
+    
+    // Solve
+    FullPivLU<MatrixType> lu(A);
+    ControlPointVectorType controlPoints = lu.solve(MatrixType(b.transpose())).transpose();
+
+    SplineType spline(knots, controlPoints);
+    
+    return spline;
+  }
+  
+  template <typename SplineType>
+  template <typename PointArrayType, typename IndexArray>
+  SplineType
+  SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+                                                        const PointArrayType& derivatives,
+                                                        const IndexArray& derivativeIndices,
+                                                        const unsigned int degree)
+  {
+    ParameterVectorType parameters;
+    ChordLengths(points, parameters);
+    return InterpolateWithDerivatives(points, derivatives, derivativeIndices, degree, parameters);
+  }
+}
+
+#endif // EIGEN_SPLINE_FITTING_H

diff --git a/unsupported/Eigen/src/Splines/SplineFwd.h b/unsupported/Eigen/src/Splines/SplineFwd.h
new file mode 100644
index 0000000..00d6b49
--- /dev/null
+++ b/unsupported/Eigen/src/Splines/SplineFwd.h

@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINES_FWD_H
+#define EIGEN_SPLINES_FWD_H
+
+#include "../../../../Eigen/Core"
+
+namespace Eigen
+{
+    template <typename Scalar, int Dim, int Degree = Dynamic> class Spline;
+
+    template < typename SplineType, int DerivativeOrder = Dynamic > struct SplineTraits {};
+
+    /**
+     * \ingroup Splines_Module
+     * \brief Compile-time attributes of the Spline class for Dynamic degree.
+     **/
+    template <typename _Scalar, int _Dim, int _Degree>
+    struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, Dynamic >
+    {
+      typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
+      enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
+      enum { Degree = _Degree /*!< The spline curve's degree. */ };
+
+      enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
+      enum { NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */ };
+      
+      enum { DerivativeMemoryLayout = Dimension==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
+
+      /** \brief The data type used to store non-zero basis functions. */
+      typedef Array<Scalar,1,OrderAtCompileTime> BasisVectorType;
+
+      /** \brief The data type used to store the values of the basis function derivatives. */
+      typedef Array<Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
+      
+      /** \brief The data type used to store the spline's derivative values. */
+      typedef Array<Scalar,Dimension,Dynamic,DerivativeMemoryLayout,Dimension,NumOfDerivativesAtCompileTime> DerivativeType;
+
+      /** \brief The point type the spline is representing. */
+      typedef Array<Scalar,Dimension,1> PointType;
+      
+      /** \brief The data type used to store knot vectors. */
+      typedef Array<Scalar,1,Dynamic> KnotVectorType;
+
+      /** \brief The data type used to store parameter vectors. */
+      typedef Array<Scalar,1,Dynamic> ParameterVectorType;
+      
+      /** \brief The data type representing the spline's control points. */
+      typedef Array<Scalar,Dimension,Dynamic> ControlPointVectorType;
+    };
+
+    /**
+     * \ingroup Splines_Module
+     * \brief Compile-time attributes of the Spline class for fixed degree.
+     *
+     * The traits class inherits all attributes from the SplineTraits of Dynamic degree.
+     **/
+    template < typename _Scalar, int _Dim, int _Degree, int _DerivativeOrder >
+    struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, _DerivativeOrder > : public SplineTraits< Spline<_Scalar, _Dim, _Degree> >
+    {
+      enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
+      enum { NumOfDerivativesAtCompileTime = _DerivativeOrder==Dynamic ? Dynamic : _DerivativeOrder+1 /*!< The number of derivatives defined for the current spline. */ };
+      
+      enum { DerivativeMemoryLayout = _Dim==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
+
+      /** \brief The data type used to store the values of the basis function derivatives. */
+      typedef Array<_Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
+      
+      /** \brief The data type used to store the spline's derivative values. */      
+      typedef Array<_Scalar,_Dim,Dynamic,DerivativeMemoryLayout,_Dim,NumOfDerivativesAtCompileTime> DerivativeType;
+    };
+
+    /** \brief 2D float B-spline with dynamic degree. */
+    typedef Spline<float,2> Spline2f;
+    
+    /** \brief 3D float B-spline with dynamic degree. */
+    typedef Spline<float,3> Spline3f;
+
+    /** \brief 2D double B-spline with dynamic degree. */
+    typedef Spline<double,2> Spline2d;
+    
+    /** \brief 3D double B-spline with dynamic degree. */
+    typedef Spline<double,3> Spline3d;
+}
+
+#endif // EIGEN_SPLINES_FWD_H

diff --git a/unsupported/README.txt b/unsupported/README.txt
new file mode 100644
index 0000000..70793bf
--- /dev/null
+++ b/unsupported/README.txt

@@ -0,0 +1,50 @@
+This directory contains contributions from various users.
+They are provided "as is", without any support. Nevertheless,
+most of them are subject to be included in Eigen in the future.
+
+In order to use an unsupported module you have to do either:
+
+ - add the path_to_eigen/unsupported directory to your include path and do:
+   #include <Eigen/ModuleHeader>
+
+ - or directly do:
+   #include <unsupported/Eigen/ModuleHeader>
+
+
+If you are interested in contributing to one of them, or have other stuff
+you would like to share, feel free to contact us:
+http://eigen.tuxfamily.org/index.php?title=Main_Page#Mailing_list
+
+Any kind of contributions are much appreciated, even very preliminary ones.
+However, it:
+ - must rely on Eigen,
+ - must be highly related to math,
+ - should have some general purpose in the sense that it could
+   potentially become an official Eigen module (or be merged into another one).
+
+In doubt feel free to contact us. For instance, if your addons is very too specific
+but it shows an interesting way of using Eigen, then it could be a nice demo.
+
+
+This directory is organized as follow:
+
+unsupported/Eigen/ModuleHeader1
+unsupported/Eigen/ModuleHeader2
+unsupported/Eigen/...
+unsupported/Eigen/src/Module1/SourceFile1.h
+unsupported/Eigen/src/Module1/SourceFile2.h
+unsupported/Eigen/src/Module1/...
+unsupported/Eigen/src/Module2/SourceFile1.h
+unsupported/Eigen/src/Module2/SourceFile2.h
+unsupported/Eigen/src/Module2/...
+unsupported/Eigen/src/...
+unsupported/doc/snippets/.cpp   <- code snippets for the doc
+unsupported/doc/examples/.cpp   <- examples for the doc
+unsupported/doc/TutorialModule1.dox
+unsupported/doc/TutorialModule2.dox
+unsupported/doc/...
+unsupported/test/.cpp           <- unit test files
+
+The documentation is generated at the same time than the main Eigen documentation.
+The .html files are generated in: build_dir/doc/html/unsupported/
+

diff --git a/unsupported/bench/bench_svd.cpp b/unsupported/bench/bench_svd.cpp
new file mode 100644
index 0000000..e7028a2
--- /dev/null
+++ b/unsupported/bench/bench_svd.cpp

@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
+
+// Bench to compare the efficiency of SVD algorithms
+
+#include <iostream>
+#include <bench/BenchTimer.h>
+#include <unsupported/Eigen/SVD>
+
+
+using namespace Eigen;
+using namespace std;
+
+// number of computations of each algorithm before the print of the time
+#ifndef REPEAT
+#define REPEAT 10
+#endif
+
+// number of tests of the same type
+#ifndef NUMBER_SAMPLE
+#define NUMBER_SAMPLE 2
+#endif
+
+template<typename MatrixType>
+void bench_svd(const MatrixType& a = MatrixType())
+{
+  MatrixType m = MatrixType::Random(a.rows(), a.cols());
+  BenchTimer timerJacobi;
+  BenchTimer timerBDC;
+  timerJacobi.reset();
+  timerBDC.reset();
+
+  cout << " Only compute Singular Values" <<endl;
+  for (int k=1; k<=NUMBER_SAMPLE; ++k)
+  {
+    timerBDC.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      BDCSVD<MatrixType> bdc_matrix(m);
+    }
+    timerBDC.stop();
+    
+    timerJacobi.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      JacobiSVD<MatrixType> jacobi_matrix(m);
+    }
+    timerJacobi.stop();
+
+
+    cout << "Sample " << k << " : " << REPEAT << " computations :  Jacobi : " << fixed << timerJacobi.value() << "s ";
+    cout << " || " << " BDC : " << timerBDC.value() << "s " <<endl <<endl;
+      
+    if (timerBDC.value() >= timerJacobi.value())  
+      cout << "KO : BDC is " <<  timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi" <<endl;
+    else 
+      cout << "OK : BDC is " << timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi"  <<endl;
+      
+  }
+  cout << "       =================" <<endl;
+  std::cout<< std::endl;
+  timerJacobi.reset();
+  timerBDC.reset();
+  cout << " Computes rotation matrix" <<endl;
+  for (int k=1; k<=NUMBER_SAMPLE; ++k)
+  {
+    timerBDC.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      BDCSVD<MatrixType> bdc_matrix(m, ComputeFullU|ComputeFullV);
+    }
+    timerBDC.stop();
+    
+    timerJacobi.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      JacobiSVD<MatrixType> jacobi_matrix(m, ComputeFullU|ComputeFullV);
+    }
+    timerJacobi.stop();
+
+
+    cout << "Sample " << k << " : " << REPEAT << " computations :  Jacobi : " << fixed << timerJacobi.value() << "s ";
+    cout << " || " << " BDC : " << timerBDC.value() << "s " <<endl <<endl;
+      
+    if (timerBDC.value() >= timerJacobi.value())  
+      cout << "KO : BDC is " <<  timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi" <<endl;
+    else 
+      cout << "OK : BDC is " << timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi"  <<endl;
+      
+  }
+  std::cout<< std::endl;
+}
+
+
+
+int main(int argc, char* argv[])
+{
+  std::cout<< std::endl;
+
+  std::cout<<"On a (Dynamic, Dynamic) (6, 6) Matrix" <<std::endl;
+  bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(6, 6));
+  
+  std::cout<<"On a (Dynamic, Dynamic) (32, 32) Matrix" <<std::endl;
+  bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(32, 32));
+
+  //std::cout<<"On a (Dynamic, Dynamic) (128, 128) Matrix" <<std::endl;
+  //bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(128, 128));
+
+  std::cout<<"On a (Dynamic, Dynamic) (160, 160) Matrix" <<std::endl;
+  bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(160, 160));
+  
+  std::cout<< "--------------------------------------------------------------------"<< std::endl;
+           
+}

diff --git a/unsupported/doc/CMakeLists.txt b/unsupported/doc/CMakeLists.txt
new file mode 100644
index 0000000..9e9ab98
--- /dev/null
+++ b/unsupported/doc/CMakeLists.txt

@@ -0,0 +1,4 @@
+set_directory_properties(PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+add_subdirectory(examples)
+add_subdirectory(snippets)

diff --git a/unsupported/doc/Overview.dox b/unsupported/doc/Overview.dox
new file mode 100644
index 0000000..bae51dc
--- /dev/null
+++ b/unsupported/doc/Overview.dox

@@ -0,0 +1,31 @@
+/// \brief Namespace containing all symbols from the %Eigen library.
+namespace Eigen {
+
+/** \mainpage %Eigen's unsupported modules
+
+This is the API documentation for %Eigen's unsupported modules.
+
+These modules are contributions from various users. They are provided "as is", without any support.
+
+Click on the \e Modules tab at the top of this page to get a list of all unsupported modules.
+
+Don't miss the <a href="../index.html">official Eigen documentation</a>.
+
+ \subpage SYCL_EIGEN "SYCL backend for Eigen"
+
+*/
+
+/*
+
+\defgroup Unsupported_modules Unsupported modules
+
+The unsupported modules are contributions from various users. They are
+provided "as is", without any support. Nevertheless, some of them are
+subject to be included in %Eigen in the future.
+
+*/
+
+/// \internal \brief Namespace containing low-level routines from the %Eigen library.
+namespace internal {}
+}
+

diff --git a/unsupported/doc/SYCL.dox b/unsupported/doc/SYCL.dox
new file mode 100644
index 0000000..2295adf
--- /dev/null
+++ b/unsupported/doc/SYCL.dox

@@ -0,0 +1,9 @@
+/** \page SYCL_EIGEN Eigen SYCL Backend
+
+Useful information for Eigen SYCL Backend:
+
+- <a href="https://developer.codeplay.com/computecppce/latest/getting-started-with-eigen">Getting Started with Eigen</a> 
+
+- <a href="https://developer.codeplay.com/computecppce/latest/options-for-building-eigen-sycl">Options for Building Eigen SYCL</a>  
+
+*/

diff --git a/unsupported/doc/eigendoxy_layout.xml.in b/unsupported/doc/eigendoxy_layout.xml.in
new file mode 100644
index 0000000..c93621e
--- /dev/null
+++ b/unsupported/doc/eigendoxy_layout.xml.in

@@ -0,0 +1,177 @@
+<?xml version="1.0"?>
+<doxygenlayout version="1.0">
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="user" url="index.html" title="Overview" />
+    <tab type="modules" visible="yes" title="Unsupported Modules" intro=""/>
+<!--     <tab type="mainpage" visible="yes" title=""/> -->
+    <tab type="classlist" visible="yes" title="" intro=""/>
+<!--     <tab type="classmembers" visible="yes" title="" intro=""/> -->
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="no"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <detaileddescription title=""/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <allmemberslink visible="yes"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="no"/>
+    <detaileddescription title=""/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>

diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp
new file mode 100644
index 0000000..afb0c94
--- /dev/null
+++ b/unsupported/doc/examples/BVH_Example.cpp

@@ -0,0 +1,50 @@
+#include <Eigen/StdVector>
+#include <unsupported/Eigen/BVH>
+#include <iostream>
+
+using namespace Eigen;
+typedef AlignedBox<double, 2> Box2d;
+
+namespace Eigen {
+  Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
+}
+
+struct PointPointMinimizer //how to compute squared distances between points and rectangles
+{
+  PointPointMinimizer() : calls(0) {}
+  typedef double Scalar;
+
+  double minimumOnVolumeVolume(const Box2d &r1, const Box2d &r2) { ++calls; return r1.squaredExteriorDistance(r2); }
+  double minimumOnVolumeObject(const Box2d &r, const Vector2d &v) { ++calls; return r.squaredExteriorDistance(v); }
+  double minimumOnObjectVolume(const Vector2d &v, const Box2d &r) { ++calls; return r.squaredExteriorDistance(v); }
+  double minimumOnObjectObject(const Vector2d &v1, const Vector2d &v2) { ++calls; return (v1 - v2).squaredNorm(); }
+
+  int calls;
+};
+
+int main()
+{
+  typedef std::vector<Vector2d, aligned_allocator<Vector2d> > StdVectorOfVector2d;
+  StdVectorOfVector2d redPoints, bluePoints;
+  for(int i = 0; i < 100; ++i) { //initialize random set of red points and blue points
+    redPoints.push_back(Vector2d::Random());
+    bluePoints.push_back(Vector2d::Random());
+  }
+
+  PointPointMinimizer minimizer;
+  double minDistSq = std::numeric_limits<double>::max();
+
+  //brute force to find closest red-blue pair
+  for(int i = 0; i < (int)redPoints.size(); ++i)
+    for(int j = 0; j < (int)bluePoints.size(); ++j)
+      minDistSq = std::min(minDistSq, minimizer.minimumOnObjectObject(redPoints[i], bluePoints[j]));
+  std::cout << "Brute force distance = " << sqrt(minDistSq) << ", calls = " << minimizer.calls << std::endl;
+
+  //using BVH to find closest red-blue pair
+  minimizer.calls = 0;
+  KdBVH<double, 2, Vector2d> redTree(redPoints.begin(), redPoints.end()), blueTree(bluePoints.begin(), bluePoints.end()); //construct the trees
+  minDistSq = BVMinimize(redTree, blueTree, minimizer); //actual BVH minimization call
+  std::cout << "BVH distance         = " << sqrt(minDistSq) << ", calls = " << minimizer.calls << std::endl;
+
+  return 0;
+}

diff --git a/unsupported/doc/examples/CMakeLists.txt b/unsupported/doc/examples/CMakeLists.txt
new file mode 100644
index 0000000..7bb6773
--- /dev/null
+++ b/unsupported/doc/examples/CMakeLists.txt

@@ -0,0 +1,24 @@
+file(GLOB examples_SRCS "*.cpp")
+
+add_custom_target(unsupported_examples)
+
+include_directories(../../../unsupported ../../../unsupported/test)
+
+foreach(example_src ${examples_SRCS})
+  get_filename_component(example ${example_src} NAME_WE)
+  add_executable(example_${example} ${example_src})
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(example_${example} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  add_custom_command(
+    TARGET example_${example}
+    POST_BUILD
+    COMMAND example_${example}
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/${example}.out
+  )
+  add_dependencies(unsupported_examples example_${example})
+endforeach(example_src)
+
+if(EIGEN_TEST_SYCL)
+  add_subdirectory(SYCL)
+endif(EIGEN_TEST_SYCL)

diff --git a/unsupported/doc/examples/EulerAngles.cpp b/unsupported/doc/examples/EulerAngles.cpp
new file mode 100644
index 0000000..3f8ca8c
--- /dev/null
+++ b/unsupported/doc/examples/EulerAngles.cpp

@@ -0,0 +1,46 @@
+#include <unsupported/Eigen/EulerAngles>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // A common Euler system by many armies around the world,
+  //  where the first one is the azimuth(the angle from the north -
+  //   the same angle that is show in compass)
+  //  and the second one is elevation(the angle from the horizon)
+  //  and the third one is roll(the angle between the horizontal body
+  //   direction and the plane ground surface)
+  // Keep remembering we're using radian angles here!
+  typedef EulerSystem<-EULER_Z, EULER_Y, EULER_X> MyArmySystem;
+  typedef EulerAngles<double, MyArmySystem> MyArmyAngles;
+  
+  MyArmyAngles vehicleAngles(
+    3.14/*PI*/ / 2, /* heading to east, notice that this angle is counter-clockwise */
+    -0.3, /* going down from a mountain */
+    0.1); /* slightly rolled to the right */
+  
+  // Some Euler angles representation that our plane use.
+  EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
+  
+  MyArmyAngles planeAnglesInMyArmyAngles(planeAngles);
+  
+  std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
+  std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
+  std::cout << "plane angles(MyArmy):     " << planeAnglesInMyArmyAngles << std::endl;
+  
+  // Now lets rotate the plane a little bit
+  std::cout << "==========================================================\n";
+  std::cout << "rotating plane now!\n";
+  std::cout << "==========================================================\n";
+  
+  Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
+  
+  planeAngles = planeRotated;
+  planeAnglesInMyArmyAngles = planeRotated;
+  
+  std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
+  std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
+  
+  return 0;
+}

diff --git a/unsupported/doc/examples/FFT.cpp b/unsupported/doc/examples/FFT.cpp
new file mode 100644
index 0000000..85e8a02
--- /dev/null
+++ b/unsupported/doc/examples/FFT.cpp

@@ -0,0 +1,118 @@
+//  To use the simple FFT implementation
+//  g++ -o demofft -I.. -Wall -O3 FFT.cpp 
+
+//  To use the FFTW implementation
+//  g++ -o demofft -I.. -DUSE_FFTW -Wall -O3 FFT.cpp -lfftw3 -lfftw3f -lfftw3l
+
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
+#include <vector>
+#include <complex>
+#include <algorithm>
+#include <iterator>
+#include <iostream>
+#include <Eigen/Core>
+#include <unsupported/Eigen/FFT>
+
+using namespace std;
+using namespace Eigen;
+
+template <typename T>
+T mag2(T a)
+{
+    return a*a;
+}
+template <typename T>
+T mag2(std::complex<T> a)
+{
+    return norm(a);
+}
+
+template <typename T>
+T mag2(const std::vector<T> & vec)
+{
+    T out=0;
+    for (size_t k=0;k<vec.size();++k)
+        out += mag2(vec[k]);
+    return out;
+}
+
+template <typename T>
+T mag2(const std::vector<std::complex<T> > & vec)
+{
+    T out=0;
+    for (size_t k=0;k<vec.size();++k)
+        out += mag2(vec[k]);
+    return out;
+}
+
+template <typename T>
+vector<T> operator-(const vector<T> & a,const vector<T> & b )
+{
+    vector<T> c(a);
+    for (size_t k=0;k<b.size();++k) 
+        c[k] -= b[k];
+    return c;
+}
+
+template <typename T>
+void RandomFill(std::vector<T> & vec)
+{
+    for (size_t k=0;k<vec.size();++k)
+        vec[k] = T( rand() )/T(RAND_MAX) - T(.5);
+}
+
+template <typename T>
+void RandomFill(std::vector<std::complex<T> > & vec)
+{
+    for (size_t k=0;k<vec.size();++k)
+        vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - T(.5), T( rand() )/T(RAND_MAX) - T(.5));
+}
+
+template <typename T_time,typename T_freq>
+void fwd_inv(size_t nfft)
+{
+    typedef typename NumTraits<T_freq>::Real Scalar;
+    vector<T_time> timebuf(nfft);
+    RandomFill(timebuf);
+
+    vector<T_freq> freqbuf;
+    static FFT<Scalar> fft;
+    fft.fwd(freqbuf,timebuf);
+
+    vector<T_time> timebuf2;
+    fft.inv(timebuf2,freqbuf);
+
+    T_time rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
+    cout << "roundtrip rmse: " << rmse << endl;
+}
+
+template <typename T_scalar>
+void two_demos(int nfft)
+{
+    cout << "     scalar ";
+    fwd_inv<T_scalar,std::complex<T_scalar> >(nfft);
+    cout << "    complex ";
+    fwd_inv<std::complex<T_scalar>,std::complex<T_scalar> >(nfft);
+}
+
+void demo_all_types(int nfft)
+{
+    cout << "nfft=" << nfft << endl;
+    cout << "   float" << endl;
+    two_demos<float>(nfft);
+    cout << "   double" << endl;
+    two_demos<double>(nfft);
+    cout << "   long double" << endl;
+    two_demos<long double>(nfft);
+}
+
+int main()
+{
+    demo_all_types( 2*3*4*5*7 );
+    demo_all_types( 2*9*16*25 );
+    demo_all_types( 1024 );
+    return 0;
+}

diff --git a/unsupported/doc/examples/MatrixExponential.cpp b/unsupported/doc/examples/MatrixExponential.cpp
new file mode 100644
index 0000000..ebd3b96
--- /dev/null
+++ b/unsupported/doc/examples/MatrixExponential.cpp

@@ -0,0 +1,16 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double pi = std::acos(-1.0);
+
+  MatrixXd A(3,3);
+  A << 0,    -pi/4, 0,
+       pi/4, 0,     0,
+       0,    0,     0;
+  std::cout << "The matrix A is:\n" << A << "\n\n";
+  std::cout << "The matrix exponential of A is:\n" << A.exp() << "\n\n";
+}

diff --git a/unsupported/doc/examples/MatrixFunction.cpp b/unsupported/doc/examples/MatrixFunction.cpp
new file mode 100644
index 0000000..a4172e4
--- /dev/null
+++ b/unsupported/doc/examples/MatrixFunction.cpp

@@ -0,0 +1,23 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+std::complex<double> expfn(std::complex<double> x, int)
+{
+  return std::exp(x);
+}
+
+int main()
+{
+  const double pi = std::acos(-1.0);
+
+  MatrixXd A(3,3);
+  A << 0,    -pi/4, 0,
+       pi/4, 0,     0,
+       0,    0,     0;
+
+  std::cout << "The matrix A is:\n" << A << "\n\n";
+  std::cout << "The matrix exponential of A is:\n" 
+            << A.matrixFunction(expfn) << "\n\n";
+}

diff --git a/unsupported/doc/examples/MatrixLogarithm.cpp b/unsupported/doc/examples/MatrixLogarithm.cpp
new file mode 100644
index 0000000..8c5d970
--- /dev/null
+++ b/unsupported/doc/examples/MatrixLogarithm.cpp

@@ -0,0 +1,15 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  using std::sqrt;
+  MatrixXd A(3,3);
+  A << 0.5*sqrt(2), -0.5*sqrt(2), 0,
+       0.5*sqrt(2),  0.5*sqrt(2), 0,
+       0,            0,           1;
+  std::cout << "The matrix A is:\n" << A << "\n\n";
+  std::cout << "The matrix logarithm of A is:\n" << A.log() << "\n";
+}

diff --git a/unsupported/doc/examples/MatrixPower.cpp b/unsupported/doc/examples/MatrixPower.cpp
new file mode 100644
index 0000000..2224524
--- /dev/null
+++ b/unsupported/doc/examples/MatrixPower.cpp

@@ -0,0 +1,16 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double pi = std::acos(-1.0);
+  Matrix3d A;
+  A << cos(1), -sin(1), 0,
+       sin(1),  cos(1), 0,
+	   0 ,      0 , 1;
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+	       "The matrix power A^(pi/4) is:\n" << A.pow(pi/4) << std::endl;
+  return 0;
+}

diff --git a/unsupported/doc/examples/MatrixPower_optimal.cpp b/unsupported/doc/examples/MatrixPower_optimal.cpp
new file mode 100644
index 0000000..86470ba
--- /dev/null
+++ b/unsupported/doc/examples/MatrixPower_optimal.cpp

@@ -0,0 +1,17 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix4cd A = Matrix4cd::Random();
+  MatrixPower<Matrix4cd> Apow(A);
+
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+	       "A^3.1 is:\n" << Apow(3.1) << "\n\n"
+	       "A^3.3 is:\n" << Apow(3.3) << "\n\n"
+	       "A^3.7 is:\n" << Apow(3.7) << "\n\n"
+	       "A^3.9 is:\n" << Apow(3.9) << std::endl;
+  return 0;
+}

diff --git a/unsupported/doc/examples/MatrixSine.cpp b/unsupported/doc/examples/MatrixSine.cpp
new file mode 100644
index 0000000..9eea9a0
--- /dev/null
+++ b/unsupported/doc/examples/MatrixSine.cpp

@@ -0,0 +1,20 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  MatrixXd A = MatrixXd::Random(3,3);
+  std::cout << "A = \n" << A << "\n\n";
+
+  MatrixXd sinA = A.sin();
+  std::cout << "sin(A) = \n" << sinA << "\n\n";
+
+  MatrixXd cosA = A.cos();
+  std::cout << "cos(A) = \n" << cosA << "\n\n";
+  
+  // The matrix functions satisfy sin^2(A) + cos^2(A) = I, 
+  // like the scalar functions.
+  std::cout << "sin^2(A) + cos^2(A) = \n" << sinA*sinA + cosA*cosA << "\n\n";
+}

diff --git a/unsupported/doc/examples/MatrixSinh.cpp b/unsupported/doc/examples/MatrixSinh.cpp
new file mode 100644
index 0000000..f771867
--- /dev/null
+++ b/unsupported/doc/examples/MatrixSinh.cpp

@@ -0,0 +1,20 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  MatrixXf A = MatrixXf::Random(3,3);
+  std::cout << "A = \n" << A << "\n\n";
+
+  MatrixXf sinhA = A.sinh();
+  std::cout << "sinh(A) = \n" << sinhA << "\n\n";
+
+  MatrixXf coshA = A.cosh();
+  std::cout << "cosh(A) = \n" << coshA << "\n\n";
+  
+  // The matrix functions satisfy cosh^2(A) - sinh^2(A) = I, 
+  // like the scalar functions.
+  std::cout << "cosh^2(A) - sinh^2(A) = \n" << coshA*coshA - sinhA*sinhA << "\n\n";
+}

diff --git a/unsupported/doc/examples/MatrixSquareRoot.cpp b/unsupported/doc/examples/MatrixSquareRoot.cpp
new file mode 100644
index 0000000..88e7557
--- /dev/null
+++ b/unsupported/doc/examples/MatrixSquareRoot.cpp

@@ -0,0 +1,16 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double pi = std::acos(-1.0);
+
+  MatrixXd A(2,2);
+  A << cos(pi/3), -sin(pi/3), 
+       sin(pi/3),  cos(pi/3);
+  std::cout << "The matrix A is:\n" << A << "\n\n";
+  std::cout << "The matrix square root of A is:\n" << A.sqrt() << "\n\n";
+  std::cout << "The square of the last matrix is:\n" << A.sqrt() * A.sqrt() << "\n";
+}

diff --git a/unsupported/doc/examples/PolynomialSolver1.cpp b/unsupported/doc/examples/PolynomialSolver1.cpp
new file mode 100644
index 0000000..cd777a4
--- /dev/null
+++ b/unsupported/doc/examples/PolynomialSolver1.cpp

@@ -0,0 +1,53 @@
+#include <unsupported/Eigen/Polynomials>
+#include <vector>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  typedef Matrix<double,5,1> Vector5d;
+
+  Vector5d roots = Vector5d::Random();
+  cout << "Roots: " << roots.transpose() << endl;
+  Eigen::Matrix<double,6,1> polynomial;
+  roots_to_monicPolynomial( roots, polynomial );
+
+  PolynomialSolver<double,5> psolve( polynomial );
+  cout << "Complex roots: " << psolve.roots().transpose() << endl;
+
+  std::vector<double> realRoots;
+  psolve.realRoots( realRoots );
+  Map<Vector5d> mapRR( &realRoots[0] );
+  cout << "Real roots: " << mapRR.transpose() << endl;
+
+  cout << endl;
+  cout << "Illustration of the convergence problem with the QR algorithm: " << endl;
+  cout << "---------------------------------------------------------------" << endl;
+  Eigen::Matrix<float,7,1> hardCase_polynomial;
+  hardCase_polynomial <<
+  -0.957, 0.9219, 0.3516, 0.9453, -0.4023, -0.5508, -0.03125;
+  cout << "Hard case polynomial defined by floats: " << hardCase_polynomial.transpose() << endl;
+  PolynomialSolver<float,6> psolvef( hardCase_polynomial );
+  cout << "Complex roots: " << psolvef.roots().transpose() << endl;
+  Eigen::Matrix<float,6,1> evals;
+  for( int i=0; i<6; ++i ){ evals[i] = std::abs( poly_eval( hardCase_polynomial, psolvef.roots()[i] ) ); }
+  cout << "Norms of the evaluations of the polynomial at the roots: " << evals.transpose() << endl << endl;
+
+  cout << "Using double's almost always solves the problem for small degrees: " << endl;
+  cout << "-------------------------------------------------------------------" << endl;
+  PolynomialSolver<double,6> psolve6d( hardCase_polynomial.cast<double>() );
+  cout << "Complex roots: " << psolve6d.roots().transpose() << endl;
+  for( int i=0; i<6; ++i )
+  {
+    std::complex<float> castedRoot( psolve6d.roots()[i].real(), psolve6d.roots()[i].imag() );
+    evals[i] = std::abs( poly_eval( hardCase_polynomial, castedRoot ) );
+  }
+  cout << "Norms of the evaluations of the polynomial at the roots: " << evals.transpose() << endl << endl;
+
+  cout.precision(10);
+  cout << "The last root in float then in double: " << psolvef.roots()[5] << "\t" << psolve6d.roots()[5] << endl;
+  std::complex<float> castedRoot( psolve6d.roots()[5].real(), psolve6d.roots()[5].imag() );
+  cout << "Norm of the difference: " << std::abs( psolvef.roots()[5] - castedRoot ) << endl;
+}

diff --git a/unsupported/doc/examples/PolynomialUtils1.cpp b/unsupported/doc/examples/PolynomialUtils1.cpp
new file mode 100644
index 0000000..dbfe520
--- /dev/null
+++ b/unsupported/doc/examples/PolynomialUtils1.cpp

@@ -0,0 +1,20 @@
+#include <unsupported/Eigen/Polynomials>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  Vector4d roots = Vector4d::Random();
+  cout << "Roots: " << roots.transpose() << endl;
+  Eigen::Matrix<double,5,1> polynomial;
+  roots_to_monicPolynomial( roots, polynomial );
+  cout << "Polynomial: ";
+  for( int i=0; i<4; ++i ){ cout << polynomial[i] << ".x^" << i << "+ "; }
+  cout << polynomial[4] << ".x^4" << endl;
+  Vector4d evaluation;
+  for( int i=0; i<4; ++i ){
+    evaluation[i] = poly_eval( polynomial, roots[i] ); }
+  cout << "Evaluation of the polynomial at the roots: " << evaluation.transpose();
+}

diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt
new file mode 100644
index 0000000..1d0f721
--- /dev/null
+++ b/unsupported/doc/examples/SYCL/CMakeLists.txt

@@ -0,0 +1,37 @@
+FILE(GLOB examples_SRCS "*.cpp")
+
+set(EIGEN_SYCL ON)
+list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
+if(EIGEN_SYCL_TRISYCL)
+  set(CMAKE_CXX_STANDARD 17)
+else(EIGEN_SYCL_TRISYCL)
+  if(MSVC)
+    # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+    # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+    set(CMAKE_CXX_STANDARD 14)
+    list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+  else()
+    set(CMAKE_CXX_STANDARD 11)
+    list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+  endif()
+  # The following flags are not supported by Clang and can cause warnings
+  # if used with -Werror so they are removed here.
+  if(COMPUTECPP_USE_COMPILER_DRIVER)
+    set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+    string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  endif()
+  list(APPEND COMPUTECPP_USER_FLAGS
+      -DEIGEN_NO_ASSERTION_CHECKING=1
+      -no-serial-memop
+      -Xclang
+      -cl-mad-enable)
+endif(EIGEN_SYCL_TRISYCL)
+
+FOREACH(example_src ${examples_SRCS})
+  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
+  ei_add_test_internal(${example} example_${example})
+  ADD_DEPENDENCIES(unsupported_examples example_${example})
+ENDFOREACH(example_src)
+set(EIGEN_SYCL OFF)

diff --git a/unsupported/doc/examples/SYCL/CwiseMul.cpp b/unsupported/doc/examples/SYCL/CwiseMul.cpp
new file mode 100644
index 0000000..a7c3314
--- /dev/null
+++ b/unsupported/doc/examples/SYCL/CwiseMul.cpp

@@ -0,0 +1,63 @@
+#include <iostream>
+#define EIGEN_USE_SYCL
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+int main()
+{
+  using DataType = float;
+  using IndexType = int64_t;
+  constexpr auto DataLayout = Eigen::RowMajor;
+
+  auto devices = Eigen::get_sycl_supported_devices();
+  const auto device_selector = *devices.begin();
+  Eigen::QueueInterface queueInterface(device_selector);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  
+  // create the tensors to be used in the operation
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 3;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+
+  // initialize the tensors with the data we want manipulate to
+  Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
+
+  // set up some random data in the tensors to be multiplied
+  in1 = in1.random();
+  in2 = in2.random();
+
+  // allocate memory for the tensors
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  // 
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+
+  // copy the memory to the device and do the c=a*b calculation
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  // print out the results
+   for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        std::cout << "device_out" << "(" << i << ", " << j << ", " << k << ") : " << out(i,j,k) 
+                  << " vs host_out" << "(" << i << ", " << j << ", " << k << ") : " << in1(i,j,k) * in2(i,j,k) << "\n";
+      }
+    }
+  }
+  printf("c=a*b Done\n");
+}

diff --git a/unsupported/doc/snippets/CMakeLists.txt b/unsupported/doc/snippets/CMakeLists.txt
new file mode 100644
index 0000000..adf95a8
--- /dev/null
+++ b/unsupported/doc/snippets/CMakeLists.txt

@@ -0,0 +1,26 @@
+file(GLOB snippets_SRCS "*.cpp")
+
+add_custom_target(unsupported_snippets)
+
+foreach(snippet_src ${snippets_SRCS})
+  get_filename_component(snippet ${snippet_src} NAME_WE)
+  set(compile_snippet_target compile_${snippet})
+  set(compile_snippet_src ${compile_snippet_target}.cpp)
+  file(READ ${snippet_src} snippet_source_code)
+  configure_file(${PROJECT_SOURCE_DIR}/doc/snippets/compile_snippet.cpp.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+  add_executable(${compile_snippet_target}
+                 ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  add_custom_command(
+    TARGET ${compile_snippet_target}
+    POST_BUILD
+    COMMAND ${compile_snippet_target}
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
+  )
+  add_dependencies(unsupported_snippets ${compile_snippet_target})
+  set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
+                              PROPERTIES OBJECT_DEPENDS ${snippet_src})
+endforeach(snippet_src)

diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
new file mode 100644
index 0000000..d8c39d5
--- /dev/null
+++ b/unsupported/test/BVH.cpp

@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/StdVector>
+#include <Eigen/Geometry>
+#include <unsupported/Eigen/BVH>
+
+namespace Eigen {
+
+template<typename Scalar, int Dim> AlignedBox<Scalar, Dim> bounding_box(const Matrix<Scalar, Dim, 1> &v) { return AlignedBox<Scalar, Dim>(v); }
+
+}
+
+
+template<int Dim>
+struct Ball
+{
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(double, Dim)
+
+  typedef Matrix<double, Dim, 1> VectorType;
+
+  Ball() {}
+  Ball(const VectorType &c, double r) : center(c), radius(r) {}
+
+  VectorType center;
+  double radius;
+};
+template<int Dim> AlignedBox<double, Dim> bounding_box(const Ball<Dim> &b)
+{ return AlignedBox<double, Dim>(b.center.array() - b.radius, b.center.array() + b.radius); }
+
+inline double SQR(double x) { return x * x; }
+
+template<int Dim>
+struct BallPointStuff //this class provides functions to be both an intersector and a minimizer, both for a ball and a point and for two trees
+{
+  typedef double Scalar;
+  typedef Matrix<double, Dim, 1> VectorType;
+  typedef Ball<Dim> BallType;
+  typedef AlignedBox<double, Dim> BoxType;
+
+  BallPointStuff() : calls(0), count(0) {}
+  BallPointStuff(const VectorType &inP) : p(inP), calls(0), count(0) {}
+
+
+  bool intersectVolume(const BoxType &r) { ++calls; return r.contains(p); }
+  bool intersectObject(const BallType &b) {
+    ++calls;
+    if((b.center - p).squaredNorm() < SQR(b.radius))
+      ++count;
+    return false; //continue
+  }
+
+  bool intersectVolumeVolume(const BoxType &r1, const BoxType &r2) { ++calls; return !(r1.intersection(r2)).isNull(); }
+  bool intersectVolumeObject(const BoxType &r, const BallType &b) { ++calls; return r.squaredExteriorDistance(b.center) < SQR(b.radius); }
+  bool intersectObjectVolume(const BallType &b, const BoxType &r) { ++calls; return r.squaredExteriorDistance(b.center) < SQR(b.radius); }
+  bool intersectObjectObject(const BallType &b1, const BallType &b2){
+    ++calls;
+    if((b1.center - b2.center).norm() < b1.radius + b2.radius)
+      ++count;
+    return false;
+  }
+  bool intersectVolumeObject(const BoxType &r, const VectorType &v) { ++calls; return r.contains(v); }
+  bool intersectObjectObject(const BallType &b, const VectorType &v){
+    ++calls;
+    if((b.center - v).squaredNorm() < SQR(b.radius))
+      ++count;
+    return false;
+  }
+
+  double minimumOnVolume(const BoxType &r) { ++calls; return r.squaredExteriorDistance(p); }
+  double minimumOnObject(const BallType &b) { ++calls; return (std::max)(0., (b.center - p).squaredNorm() - SQR(b.radius)); }
+  double minimumOnVolumeVolume(const BoxType &r1, const BoxType &r2) { ++calls; return r1.squaredExteriorDistance(r2); }
+  double minimumOnVolumeObject(const BoxType &r, const BallType &b) { ++calls; return SQR((std::max)(0., r.exteriorDistance(b.center) - b.radius)); }
+  double minimumOnObjectVolume(const BallType &b, const BoxType &r) { ++calls; return SQR((std::max)(0., r.exteriorDistance(b.center) - b.radius)); }
+  double minimumOnObjectObject(const BallType &b1, const BallType &b2){ ++calls; return SQR((std::max)(0., (b1.center - b2.center).norm() - b1.radius - b2.radius)); }
+  double minimumOnVolumeObject(const BoxType &r, const VectorType &v) { ++calls; return r.squaredExteriorDistance(v); }
+  double minimumOnObjectObject(const BallType &b, const VectorType &v){ ++calls; return SQR((std::max)(0., (b.center - v).norm() - b.radius)); }
+
+  VectorType p;
+  int calls;
+  int count;
+};
+
+
+template<int Dim>
+struct TreeTest
+{
+  typedef Matrix<double, Dim, 1> VectorType;
+  typedef std::vector<VectorType, aligned_allocator<VectorType> > VectorTypeList;
+  typedef Ball<Dim> BallType;
+  typedef std::vector<BallType, aligned_allocator<BallType> > BallTypeList;
+  typedef AlignedBox<double, Dim> BoxType;
+
+  void testIntersect1()
+  {
+    BallTypeList b;
+    for(int i = 0; i < 500; ++i) {
+        b.push_back(BallType(VectorType::Random(), 0.5 * internal::random(0., 1.)));
+    }
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+
+    VectorType pt = VectorType::Random();
+    BallPointStuff<Dim> i1(pt), i2(pt);
+
+    for(int i = 0; i < (int)b.size(); ++i)
+      i1.intersectObject(b[i]);
+
+    BVIntersect(tree, i2);
+
+    VERIFY(i1.count == i2.count);
+  }
+
+  void testMinimize1()
+  {
+    BallTypeList b;
+    for(int i = 0; i < 500; ++i) {
+        b.push_back(BallType(VectorType::Random(), 0.01 * internal::random(0., 1.)));
+    }
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+
+    VectorType pt = VectorType::Random();
+    BallPointStuff<Dim> i1(pt), i2(pt);
+
+    double m1 = (std::numeric_limits<double>::max)(), m2 = m1;
+
+    for(int i = 0; i < (int)b.size(); ++i)
+      m1 = (std::min)(m1, i1.minimumOnObject(b[i]));
+
+    m2 = BVMinimize(tree, i2);
+
+    VERIFY_IS_APPROX(m1, m2);
+  }
+
+  void testIntersect2()
+  {
+    BallTypeList b;
+    VectorTypeList v;
+
+    for(int i = 0; i < 50; ++i) {
+        b.push_back(BallType(VectorType::Random(), 0.5 * internal::random(0., 1.)));
+        for(int j = 0; j < 3; ++j)
+            v.push_back(VectorType::Random());
+    }
+
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+    KdBVH<double, Dim, VectorType> vTree(v.begin(), v.end());
+
+    BallPointStuff<Dim> i1, i2;
+
+    for(int i = 0; i < (int)b.size(); ++i)
+        for(int j = 0; j < (int)v.size(); ++j)
+            i1.intersectObjectObject(b[i], v[j]);
+
+    BVIntersect(tree, vTree, i2);
+
+    VERIFY(i1.count == i2.count);
+  }
+
+  void testMinimize2()
+  {
+    BallTypeList b;
+    VectorTypeList v;
+
+    for(int i = 0; i < 50; ++i) {
+        b.push_back(BallType(VectorType::Random(), 1e-7 + 1e-6 * internal::random(0., 1.)));
+        for(int j = 0; j < 3; ++j)
+            v.push_back(VectorType::Random());
+    }
+
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+    KdBVH<double, Dim, VectorType> vTree(v.begin(), v.end());
+
+    BallPointStuff<Dim> i1, i2;
+
+    double m1 = (std::numeric_limits<double>::max)(), m2 = m1;
+
+    for(int i = 0; i < (int)b.size(); ++i)
+        for(int j = 0; j < (int)v.size(); ++j)
+            m1 = (std::min)(m1, i1.minimumOnObjectObject(b[i], v[j]));
+
+    m2 = BVMinimize(tree, vTree, i2);
+
+    VERIFY_IS_APPROX(m1, m2);
+  }
+};
+
+
+EIGEN_DECLARE_TEST(BVH)
+{
+  for(int i = 0; i < g_repeat; i++) {
+#ifdef EIGEN_TEST_PART_1
+    TreeTest<2> test2;
+    CALL_SUBTEST(test2.testIntersect1());
+    CALL_SUBTEST(test2.testMinimize1());
+    CALL_SUBTEST(test2.testIntersect2());
+    CALL_SUBTEST(test2.testMinimize2());
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+    TreeTest<3> test3;
+    CALL_SUBTEST(test3.testIntersect1());
+    CALL_SUBTEST(test3.testMinimize1());
+    CALL_SUBTEST(test3.testIntersect2());
+    CALL_SUBTEST(test3.testMinimize2());
+#endif
+
+#ifdef EIGEN_TEST_PART_3
+    TreeTest<4> test4;
+    CALL_SUBTEST(test4.testIntersect1());
+    CALL_SUBTEST(test4.testMinimize1());
+    CALL_SUBTEST(test4.testIntersect2());
+    CALL_SUBTEST(test4.testMinimize2());
+#endif
+  }
+}

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
new file mode 100644
index 0000000..d30fa62
--- /dev/null
+++ b/unsupported/test/CMakeLists.txt

@@ -0,0 +1,417 @@
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+endif()
+
+set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
+add_custom_target(BuildUnsupported)
+
+include_directories(../../test ../../unsupported ../../Eigen
+                    ${CMAKE_CURRENT_BINARY_DIR}/../../test)
+
+find_package (Threads)
+
+find_package(GoogleHash)
+if(GoogleHash_FOUND)
+  add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
+  include_directories(${GOOGLEHASH_INCLUDES})
+  ei_add_property(EIGEN_TESTED_BACKENDS  "GoogleHash, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS  "GoogleHash, ")
+endif()
+
+
+find_package(Adolc)
+if(Adolc_FOUND)
+  include_directories(${ADOLC_INCLUDES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
+  if(EIGEN_TEST_CXX11)
+    ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+  else()
+    message(STATUS "Adolc found, but tests require C++11 mode")
+  endif()
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
+endif()
+
+# this test seems to never have been successful on x87, so is considered to contain a FP-related bug.
+# see thread: "non-linear optimization test summary"
+ei_add_test(NonLinearOptimization)
+
+ei_add_test(NumericalDiff)
+ei_add_test(autodiff_scalar)
+ei_add_test(autodiff)
+
+ei_add_test(BVH)
+
+ei_add_test(matrix_exponential)
+ei_add_test(matrix_function)
+ei_add_test(matrix_power)
+ei_add_test(matrix_square_root)
+ei_add_test(alignedvector3)
+
+ei_add_test(FFT)
+
+ei_add_test(EulerAngles)
+
+find_package(MPREAL)
+if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
+  ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
+  include_directories(${MPREAL_INCLUDES})
+  ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" )
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
+endif()
+
+ei_add_test(sparse_extra   "" "")
+
+find_package(FFTW)
+if(FFTW_FOUND)
+  ei_add_property(EIGEN_TESTED_BACKENDS "fftw, ")
+  include_directories( ${FFTW_INCLUDES} )
+  if(FFTWL_LIB)
+    ei_add_test(FFTW  "-DEIGEN_FFTW_DEFAULT -DEIGEN_HAS_FFTWL" "${FFTW_LIBRARIES}" )
+  else()
+    ei_add_test(FFTW  "-DEIGEN_FFTW_DEFAULT" "${FFTW_LIBRARIES}" )
+  endif()
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
+endif()
+
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
+  find_package(OpenGL)
+  find_package(GLUT)
+  find_package(GLEW)
+  if(OPENGL_FOUND AND GLUT_FOUND AND GLEW_FOUND)
+    include_directories(${OPENGL_INCLUDE_DIR} ${GLUT_INCLUDE_DIR} ${GLEW_INCLUDE_DIRS})
+    ei_add_property(EIGEN_TESTED_BACKENDS "OpenGL, ")
+    set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES})
+    ei_add_test(openglsupport  "" "${EIGEN_GL_LIB}" )
+  else()
+    ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ")
+  endif()
+else()
+    ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ")
+endif()
+
+ei_add_test(polynomialsolver)
+ei_add_test(polynomialutils)
+ei_add_test(splines)
+ei_add_test(gmres)
+ei_add_test(dgmres)
+ei_add_test(minres)
+ei_add_test(idrs)
+ei_add_test(levenberg_marquardt)
+ei_add_test(kronecker_product)
+ei_add_test(bessel_functions)
+ei_add_test(special_functions)
+ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
+
+if(EIGEN_TEST_CXX11)
+  if(EIGEN_TEST_SYCL)
+    set(EIGEN_SYCL ON)
+    # Forward CMake options as preprocessor definitions
+    if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+      add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+    endif()
+    if(EIGEN_SYCL_NO_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+      add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
+    endif()
+    if(EIGEN_SYCL_REG_M)
+      add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
+    endif()
+    if(EIGEN_SYCL_REG_N)
+      add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
+    endif()
+    if(EIGEN_SYCL_USE_PROGRAM_CLASS)
+      add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
+    endif()
+    if(EIGEN_SYCL_ASYNC_EXECUTION)
+      add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SKINNY)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
+    endif()
+    if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
+    add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
+  endif()
+    if(EIGEN_SYCL_DISABLE_RANK1)
+      add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SCALAR)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
+    endif()
+    if(EIGEN_SYCL_DISABLE_GEMV)
+      add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
+    endif()
+    if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
+      add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
+    endif()
+
+    if(EIGEN_SYCL_TRISYCL)
+      # triSYCL now requires c++17.
+      set(CMAKE_CXX_STANDARD 17)
+    else()
+      if(MSVC)
+        # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+        # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+        set(CMAKE_CXX_STANDARD 14)
+        list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+      else()
+        set(CMAKE_CXX_STANDARD 11)
+        list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+      endif()
+      # The following flags are not supported by Clang and can cause warnings
+      # if used with -Werror so they are removed here.
+      if(COMPUTECPP_USE_COMPILER_DRIVER)
+        set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+        string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+      endif()
+      list(APPEND COMPUTECPP_USER_FLAGS
+          -DEIGEN_NO_ASSERTION_CHECKING=1
+          -no-serial-memop
+          -Xclang
+          -cl-mad-enable)
+    endif()
+
+    ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
+    set(EIGEN_SYCL OFF)
+  endif()
+
+  ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+
+  ei_add_test(cxx11_meta)
+  ei_add_test(cxx11_maxsizevector)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_assign)
+  ei_add_test(cxx11_tensor_block_access)
+  ei_add_test(cxx11_tensor_block_eval)
+  ei_add_test(cxx11_tensor_block_io)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_casts)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_comparisons)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_const)
+  ei_add_test(cxx11_tensor_contraction)
+  ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_dimension)
+  ei_add_test(cxx11_tensor_empty)
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fft)
+  ei_add_test(cxx11_tensor_fixed_size)
+  ei_add_test(cxx11_tensor_forced_eval)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_index_list)
+  ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_intdiv)
+  ei_add_test(cxx11_tensor_io)
+  ei_add_test(cxx11_tensor_layout_swap)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_map)
+  ei_add_test(cxx11_tensor_math)
+  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_move)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_strings)
+  ei_add_test(cxx11_tensor_padding)
+  ei_add_test(cxx11_tensor_patch)
+  ei_add_test(cxx11_tensor_random)
+  ei_add_test(cxx11_tensor_reduction)
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_roundings)
+  ei_add_test(cxx11_tensor_scan)
+  ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_simple)
+  ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_sugar)
+  ei_add_test(cxx11_tensor_thread_local "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_trace)
+  ei_add_test(cxx11_tensor_volume_patch)
+#  ei_add_test(cxx11_tensor_symmetry)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
+
+endif()
+
+# These tests needs nvcc
+find_package(CUDA 7.0)
+if(CUDA_FOUND AND EIGEN_TEST_CUDA)
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
+  # Also remove -ansi that is incompatible with std=c++11.
+  string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
+
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
+  endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+        string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}")
+    endforeach()
+  endif()
+
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+
+  set(NVCC_ARCH_FLAGS)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  endforeach()
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
+  cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
+  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+  ei_add_test(cxx11_tensor_complex_gpu)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+  ei_add_test(cxx11_tensor_reduction_gpu)
+  ei_add_test(cxx11_tensor_argmax_gpu)
+  ei_add_test(cxx11_tensor_cast_float16_gpu)
+  ei_add_test(cxx11_tensor_scan_gpu)
+
+  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
+      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
+    endif()
+  endforeach()
+
+  # Contractions require arch 3.0 or higher
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
+    ei_add_test(cxx11_tensor_device)
+    ei_add_test(cxx11_tensor_gpu)
+    ei_add_test(cxx11_tensor_contract_gpu)
+    ei_add_test(cxx11_tensor_of_float16_gpu)
+  endif()
+
+  # The random number generation code requires arch 3.5 or greater.
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_gpu)
+  endif()
+
+
+  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+	include_directories(${CMAKE_CURRENT_BINARY_DIR})
+	include_directories(${HIP_PATH}/include)
+
+	set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+	#
+	# complex datatype is not yet supported by HIP
+	# so leaving out those tests for now
+	#
+	# ei_add_test(cxx11_tensor_complex_gpu)
+	# ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+	#
+	ei_add_test(cxx11_tensor_reduction_gpu)
+	ei_add_test(cxx11_tensor_argmax_gpu)
+	ei_add_test(cxx11_tensor_cast_float16_gpu)
+	ei_add_test(cxx11_tensor_scan_gpu)
+	ei_add_test(cxx11_tensor_device)
+
+	ei_add_test(cxx11_tensor_gpu)
+	ei_add_test(cxx11_tensor_contract_gpu)
+	ei_add_test(cxx11_tensor_of_float16_gpu)
+	ei_add_test(cxx11_tensor_random_gpu)
+
+	unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+	message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+	message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif()
+
+    endif()
+
+  else ()
+
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+  endif()
+
+endif()
+

diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
new file mode 100644
index 0000000..0955795
--- /dev/null
+++ b/unsupported/test/EulerAngles.cpp

@@ -0,0 +1,296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+using namespace Eigen;
+
+// Unfortunately, we need to specialize it in order to work. (We could add it in main.h test framework)
+template <typename Scalar, class System>
+bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
+{
+  return verifyIsApprox(a.angles(), b.angles());
+}
+
+// Verify that x is in the approxed range [a, b]
+#define VERIFY_APPROXED_RANGE(a, x, b) \
+  do { \
+  VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
+  VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
+  } while(0)
+
+const char X = EULER_X;
+const char Y = EULER_Y;
+const char Z = EULER_Z;
+
+template<typename Scalar, class EulerSystem>
+void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
+{
+  typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+  typedef AngleAxis<Scalar> AngleAxisType;
+  
+  const Scalar ONE = Scalar(1);
+  const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
+  const Scalar PI = Scalar(EIGEN_PI);
+  
+  // It's very important calc the acceptable precision depending on the distance from the pole.
+  const Scalar longitudeRadius = std::abs(
+    EulerSystem::IsTaitBryan ?
+    std::cos(e.beta()) :
+    std::sin(e.beta())
+    );
+  Scalar precision = test_precision<Scalar>() / longitudeRadius;
+  
+  Scalar betaRangeStart, betaRangeEnd;
+  if (EulerSystem::IsTaitBryan)
+  {
+    betaRangeStart = -HALF_PI;
+    betaRangeEnd = HALF_PI;
+  }
+  else
+  {
+    if (!EulerSystem::IsBetaOpposite)
+    {
+      betaRangeStart = 0;
+      betaRangeEnd = PI;
+    }
+    else
+    {
+      betaRangeStart = -PI;
+      betaRangeEnd = 0;
+    }
+  }
+  
+  const Vector3 I_ = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J_ = EulerAnglesType::BetaAxisVector();
+  const Vector3 K_ = EulerAnglesType::GammaAxisVector();
+  
+  // Is approx checks
+  VERIFY(e.isApprox(e));
+  VERIFY_IS_APPROX(e, e);
+  VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
+
+  const Matrix3 m(e);
+  VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
+
+  EulerAnglesType ebis(m);
+  
+  // When no roll(acting like polar representation), we have the best precision.
+  // One of those cases is when the Euler angles are on the pole, and because it's singular case,
+  //  the computation returns no roll.
+  if (ebis.beta() == 0)
+    precision = test_precision<Scalar>();
+  
+  // Check that eabis in range
+  VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
+  VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
+
+  const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_));
+  VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
+  VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
+  /*std::cout << "===================\n" <<
+    "e: " << e << std::endl <<
+    "eabis: " << eabis.transpose() << std::endl <<
+    "m: " << m << std::endl <<
+    "mbis: " << mbis << std::endl <<
+    "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
+    "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
+  VERIFY(m.isApprox(mbis, precision));
+
+  // Test if ea and eabis are the same
+  // Need to check both singular and non-singular cases
+  // There are two singular cases.
+  // 1. When I==K and sin(ea(1)) == 0
+  // 2. When I!=K and cos(ea(1)) == 0
+
+  // TODO: Make this test work well, and use range saturation function.
+  /*// If I==K, and ea[1]==0, then there no unique solution.
+  // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2.
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY_IS_APPROX(ea, eabis);*/
+  
+  // Quaternions
+  const QuaternionType q(e);
+  ebis = q;
+  const QuaternionType qbis(ebis);
+  VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
+  //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
+  
+  // A suggestion for simple product test when will be supported.
+  /*EulerAnglesType e2(PI/2, PI/2, PI/2);
+  Matrix3 m2(e2);
+  VERIFY_IS_APPROX(e*e2, m*m2);*/
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_vec<+A,+B,+C>(ea);
+  verify_euler_vec<+A,+B,-C>(ea);
+  verify_euler_vec<+A,-B,+C>(ea);
+  verify_euler_vec<+A,-B,-C>(ea);
+  
+  verify_euler_vec<-A,+B,+C>(ea);
+  verify_euler_vec<-A,+B,-C>(ea);
+  verify_euler_vec<-A,-B,+C>(ea);
+  verify_euler_vec<-A,-B,-C>(ea);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_all_neg<X,Y,Z>(ea);
+  verify_euler_all_neg<X,Y,X>(ea);
+  verify_euler_all_neg<X,Z,Y>(ea);
+  verify_euler_all_neg<X,Z,X>(ea);
+  
+  verify_euler_all_neg<Y,Z,X>(ea);
+  verify_euler_all_neg<Y,Z,Y>(ea);
+  verify_euler_all_neg<Y,X,Z>(ea);
+  verify_euler_all_neg<Y,X,Y>(ea);
+  
+  verify_euler_all_neg<Z,X,Y>(ea);
+  verify_euler_all_neg<Z,X,Z>(ea);
+  verify_euler_all_neg<Z,Y,X>(ea);
+  verify_euler_all_neg<Z,Y,Z>(ea);
+}
+
+template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
+{
+  typedef Matrix<Scalar,3,1> Vector3;
+  const Scalar PI = Scalar(EIGEN_PI);
+  
+  for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
+  {
+    check_all_var(Vector3(PI/4, singularBeta, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
+    check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
+    check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
+    check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
+  }
+  
+  // This one for sanity, it had a problem with near pole cases in float scalar.
+  check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
+}
+
+template<typename Scalar> void eulerangles_manual()
+{
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,Dynamic,1> VectorX;
+  const Vector3 Zero = Vector3::Zero();
+  const Scalar PI = Scalar(EIGEN_PI);
+  
+  check_all_var(Zero);
+  
+  // singular cases
+  check_singular_cases(PI/2);
+  check_singular_cases(-PI/2);
+  
+  check_singular_cases(Scalar(0));
+  check_singular_cases(Scalar(-0));
+  
+  check_singular_cases(PI);
+  check_singular_cases(-PI);
+  
+  // non-singular cases
+  VectorX alpha = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  VectorX beta =  VectorX::LinSpaced(20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
+  VectorX gamma = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  for (int i = 0; i < alpha.size(); ++i) {
+    for (int j = 0; j < beta.size(); ++j) {
+      for (int k = 0; k < gamma.size(); ++k) {
+        check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
+      }
+    }
+  }
+}
+
+template<typename Scalar> void eulerangles_rand()
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Array<Scalar,3,1> Array3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Quaternionx q1;
+  q1 = AngleAxisType(a, Vector3::Random().normalized());
+  Matrix3 m;
+  m = q1;
+  
+  Vector3 ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with purely random Quaternion:
+  q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+  m = q1;
+  ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+  check_all_var(ea);
+  
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[1] = 0;
+  check_all_var(ea);
+  
+  ea.head(2).setZero();
+  check_all_var(ea);
+  
+  ea.setZero();
+  check_all_var(ea);
+}
+
+EIGEN_DECLARE_TEST(EulerAngles)
+{
+  // Simple cast test
+  EulerAnglesXYZd onesEd(1, 1, 1);
+  EulerAnglesXYZf onesEf = onesEd.cast<float>();
+  VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+  // Simple Construction from Vector3 test
+  VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+  
+  CALL_SUBTEST_1( eulerangles_manual<float>() );
+  CALL_SUBTEST_2( eulerangles_manual<double>() );
+  
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_3( eulerangles_rand<float>() );
+    CALL_SUBTEST_4( eulerangles_rand<double>() );
+  }
+  
+  // TODO: Add tests for auto diff
+  // TODO: Add tests for complex numbers
+}

diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
new file mode 100644
index 0000000..45c87f5
--- /dev/null
+++ b/unsupported/test/FFT.cpp

@@ -0,0 +1,2 @@
+#define test_FFTW test_FFT
+#include "FFTW.cpp"

diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
new file mode 100644
index 0000000..cfe559e
--- /dev/null
+++ b/unsupported/test/FFTW.cpp

@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/FFT>
+
+template <typename T> 
+std::complex<T> RandomCpx() { return std::complex<T>( (T)(rand()/(T)RAND_MAX - .5), (T)(rand()/(T)RAND_MAX - .5) ); }
+
+using namespace std;
+using namespace Eigen;
+
+
+template < typename T>
+complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>((long double)x); }
+complex<long double>  promote(double x) { return complex<long double>((long double)x); }
+complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
+    
+
+    template <typename VT1,typename VT2>
+    long double fft_rmse( const VT1 & fftbuf,const VT2 & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        long double pi = acos((long double)-1 );
+        for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
+            for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += numext::abs2(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += numext::abs2(dif);
+            //cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(numext::abs2(dif)) << endl;
+        }
+        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+        return sqrt(difpower/totalpower);
+    }
+
+    template <typename VT1,typename VT2>
+    long double dif_rmse( const VT1 buf1,const VT2 buf2)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        size_t n = (min)( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
+            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
+        }
+        return sqrt(difpower/totalpower);
+    }
+
+enum { StdVectorContainer, EigenVectorContainer };
+
+template<int Container, typename Scalar> struct VectorType;
+
+template<typename Scalar> struct VectorType<StdVectorContainer,Scalar>
+{
+  typedef vector<Scalar> type;
+};
+
+template<typename Scalar> struct VectorType<EigenVectorContainer,Scalar>
+{
+  typedef Matrix<Scalar,Dynamic,1> type;
+};
+
+template <int Container, typename T>
+void test_scalar_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename FFT<T>::Scalar Scalar;
+    typedef typename VectorType<Container,Scalar>::type ScalarVector;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
+
+    FFT<T> fft;
+    ScalarVector tbuf(nfft);
+    ComplexVector freqBuf;
+    for (int k=0;k<nfft;++k)
+        tbuf[k]= (T)( rand()/(double)RAND_MAX - .5);
+
+    // make sure it DOESN'T give the right full spectrum answer
+    // if we've asked for half-spectrum
+    fft.SetFlag(fft.HalfSpectrum );
+    fft.fwd( freqBuf,tbuf);
+    VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
+
+    fft.ClearFlag(fft.HalfSpectrum );
+    fft.fwd( freqBuf,tbuf);
+    VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
+
+    if (nfft&1)
+        return; // odd FFTs get the wrong size inverse FFT
+
+    ScalarVector tbuf2;
+    fft.inv( tbuf2 , freqBuf);
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
+
+
+    // verify that the Unscaled flag takes effect
+    ScalarVector tbuf3;
+    fft.SetFlag(fft.Unscaled);
+
+    fft.inv( tbuf3 , freqBuf);
+
+    for (int k=0;k<nfft;++k)
+        tbuf3[k] *= T(1./nfft);
+
+
+    //for (size_t i=0;i<(size_t) tbuf.size();++i)
+    //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
+
+    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
+
+    // verify that ClearFlag works
+    fft.ClearFlag(fft.Unscaled);
+    fft.inv( tbuf2 , freqBuf);
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
+}
+
+template <typename T>
+void test_scalar(int nfft)
+{
+  test_scalar_generic<StdVectorContainer,T>(nfft);
+  //test_scalar_generic<EigenVectorContainer,T>(nfft);
+}
+
+
+template <int Container, typename T>
+void test_complex_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
+
+    FFT<T> fft;
+
+    ComplexVector inbuf(nfft);
+    ComplexVector outbuf;
+    ComplexVector buf3;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
+
+    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
+
+    // verify that the Unscaled flag takes effect
+    ComplexVector buf4;
+    fft.SetFlag(fft.Unscaled);
+    fft.inv( buf4 , outbuf);
+    for (int k=0;k<nfft;++k)
+        buf4[k] *= T(1./nfft);
+    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
+
+    // verify that ClearFlag works
+    fft.ClearFlag(fft.Unscaled);
+    fft.inv( buf3 , outbuf);
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
+}
+
+template <typename T>
+void test_complex(int nfft)
+{
+  test_complex_generic<StdVectorContainer,T>(nfft);
+  test_complex_generic<EigenVectorContainer,T>(nfft);
+}
+/*
+template <typename T,int nrows,int ncols>
+void test_complex2d()
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    FFT<T> fft;
+    Eigen::Matrix<Complex,nrows,ncols> src,src2,dst,dst2;
+
+    src = Eigen::Matrix<Complex,nrows,ncols>::Random();
+    //src =  Eigen::Matrix<Complex,nrows,ncols>::Identity();
+
+    for (int k=0;k<ncols;k++) {
+        Eigen::Matrix<Complex,nrows,1> tmpOut;
+        fft.fwd( tmpOut,src.col(k) );
+        dst2.col(k) = tmpOut;
+    }
+
+    for (int k=0;k<nrows;k++) {
+        Eigen::Matrix<Complex,1,ncols> tmpOut;
+        fft.fwd( tmpOut,  dst2.row(k) );
+        dst2.row(k) = tmpOut;
+    }
+
+    fft.fwd2(dst.data(),src.data(),ncols,nrows);
+    fft.inv2(src2.data(),dst.data(),ncols,nrows);
+    VERIFY( (src-src2).norm() < test_precision<T>() );
+    VERIFY( (dst-dst2).norm() < test_precision<T>() );
+}
+*/
+
+
+void test_return_by_value(int len)
+{
+    VectorXf in;
+    VectorXf in1;
+    in.setRandom( len );
+    VectorXcf out1,out2;
+    FFT<float> fft;
+
+    fft.SetFlag(fft.HalfSpectrum );
+
+    fft.fwd(out1,in);
+    out2 = fft.fwd(in);
+    VERIFY( (out1-out2).norm() < test_precision<float>() );
+    in1 = fft.inv(out1);
+    VERIFY( (in1-in).norm() < test_precision<float>() );
+}
+
+EIGEN_DECLARE_TEST(FFTW)
+{
+  CALL_SUBTEST( test_return_by_value(32) );
+  //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
+  //CALL_SUBTEST( ( test_complex2d<long double,4,8> () ) );
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); 
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); 
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); 
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); 
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); 
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); 
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); 
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); 
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); 
+  
+  #ifdef EIGEN_HAS_FFTWL
+  CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<long double>(256) );
+  CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+  
+  CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<long double>(50) );
+  CALL_SUBTEST( test_scalar<long double>(256) );
+  CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+  #endif
+}

diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
new file mode 100644
index 0000000..c667b72
--- /dev/null
+++ b/unsupported/test/NonLinearOptimization.cpp

@@ -0,0 +1,1849 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+
+#include <stdio.h>
+
+#include "main.h"
+#include <unsupported/Eigen/NonLinearOptimization>
+
+// This disables some useless Warnings on MSVC.
+// It is intended to be done for this test only.
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+// tolerance for chekcing number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
+#define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \
+            ++g_test_level; \
+            VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \
+            VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \
+            --g_test_level; \
+            VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \
+            VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \
+        }
+
+int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
+{
+    /*      subroutine fcn for chkder example. */
+
+    int i;
+    assert(15 ==  fvec.size());
+    assert(3 ==  x.size());
+    double tmp1, tmp2, tmp3, tmp4;
+    static const double y[15]={1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+        3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+
+    if (iflag == 0)
+        return 0;
+
+    if (iflag != 2)
+        for (i=0; i<15; i++) {
+            tmp1 = i+1;
+            tmp2 = 16-i-1;
+            tmp3 = tmp1;
+            if (i >= 8) tmp3 = tmp2;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+    else {
+        for (i = 0; i < 15; i++) {
+            tmp1 = i+1;
+            tmp2 = 16-i-1;
+
+            /* error introduced into next statement for illustration. */
+            /* corrected statement should read    tmp3 = tmp1 . */
+
+            tmp3 = tmp2;
+            if (i >= 8) tmp3 = tmp2;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4=tmp4*tmp4;
+            fjac(i,0) = -1.;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+    }
+    return 0;
+}
+
+
+void testChkder()
+{
+  const int m=15, n=3;
+  VectorXd x(n), fvec(m), xp, fvecp(m), err;
+  MatrixXd fjac(m,n);
+  VectorXi ipvt;
+
+  /*      the following values should be suitable for */
+  /*      checking the jacobian matrix. */
+  x << 9.2e-1, 1.3e-1, 5.4e-1;
+
+  internal::chkder(x, fvec, fjac, xp, fvecp, 1, err);
+  fcn_chkder(x, fvec, fjac, 1);
+  fcn_chkder(x, fvec, fjac, 2);
+  fcn_chkder(xp, fvecp, fjac, 1);
+  internal::chkder(x, fvec, fjac, xp, fvecp, 2, err);
+
+  fvecp -= fvec;
+
+  // check those
+  VectorXd fvec_ref(m), fvecp_ref(m), err_ref(m);
+  fvec_ref <<
+      -1.181606, -1.429655, -1.606344,
+      -1.745269, -1.840654, -1.921586,
+      -1.984141, -2.022537, -2.468977,
+      -2.827562, -3.473582, -4.437612,
+      -6.047662, -9.267761, -18.91806;
+  fvecp_ref <<
+      -7.724666e-09, -3.432406e-09, -2.034843e-10,
+      2.313685e-09,  4.331078e-09,  5.984096e-09,
+      7.363281e-09,   8.53147e-09,  1.488591e-08,
+      2.33585e-08,  3.522012e-08,  5.301255e-08,
+      8.26666e-08,  1.419747e-07,   3.19899e-07;
+  err_ref <<
+      0.1141397,  0.09943516,  0.09674474,
+      0.09980447,  0.1073116, 0.1220445,
+      0.1526814, 1, 1,
+      1, 1, 1,
+      1, 1, 1;
+
+  VERIFY_IS_APPROX(fvec, fvec_ref);
+  VERIFY_IS_APPROX(fvecp, fvecp_ref);
+  VERIFY_IS_APPROX(err, err_ref);
+}
+
+// Generic functor
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct Functor
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+
+  const int m_inputs, m_values;
+
+  Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+  // you should define that in the subclass :
+//  void operator() (const InputType& x, ValueType* v, JacobianType* _j=0) const;
+};
+
+struct lmder_functor : Functor<double>
+{
+    lmder_functor(void): Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double tmp1, tmp2, tmp3;
+        static const double y[15] = {1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &x, MatrixXd &fjac) const
+    {
+        double tmp1, tmp2, tmp3, tmp4;
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4;
+            fjac(i,0) = -1;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+        return 0;
+    }
+};
+
+void testLmder1()
+{
+  int n=3, info;
+
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.lmder1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testLmder()
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  fnorm = lm.fvec.blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.fjac, lm.permutation.indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869941,  -0.002656662,
+      0.002869941,    0.09480935,   -0.09098995,
+      -0.002656662,   -0.09098995,    0.08778727;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.fjac.topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct hybrj_functor : Functor<double>
+{
+    hybrj_functor(void) : Functor<double>(9,9) {}
+
+    int operator()(const VectorXd &x, VectorXd &fvec)
+    {
+        double temp, temp1, temp2;
+        const VectorXd::Index n = x.size();
+        assert(fvec.size()==n);
+        for (VectorXd::Index k = 0; k < n; k++)
+        {
+            temp = (3. - 2.*x[k])*x[k];
+            temp1 = 0.;
+            if (k) temp1 = x[k-1];
+            temp2 = 0.;
+            if (k != n-1) temp2 = x[k+1];
+            fvec[k] = temp - temp1 - 2.*temp2 + 1.;
+        }
+        return 0;
+    }
+    int df(const VectorXd &x, MatrixXd &fjac)
+    {
+        const VectorXd::Index n = x.size();
+        assert(fjac.rows()==n);
+        assert(fjac.cols()==n);
+        for (VectorXd::Index k = 0; k < n; k++)
+        {
+            for (VectorXd::Index j = 0; j < n; j++)
+                fjac(k,j) = 0.;
+            fjac(k,k) = 3.- 4.*x[k];
+            if (k) fjac(k,k-1) = -1.;
+            if (k != n-1) fjac(k,k+1) = -2.;
+        }
+        return 0;
+    }
+};
+
+
+void testHybrj1()
+{
+  const int n=9;
+  int info;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, -1.);
+
+  // do the computation
+  hybrj_functor functor;
+  HybridNonLinearSolver<hybrj_functor> solver(functor);
+  info = solver.hybrj1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+
+// check x
+  VectorXd x_ref(n);
+  x_ref <<
+     -0.5706545,    -0.6816283,    -0.7017325,
+     -0.7042129,     -0.701369,    -0.6918656,
+     -0.665792,    -0.5960342,    -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testHybrj()
+{
+  const int n=9;
+  int info;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, -1.);
+
+
+  // do the computation
+  hybrj_functor functor;
+  HybridNonLinearSolver<hybrj_functor> solver(functor);
+  solver.diag.setConstant(n, 1.);
+  solver.useExternalScaling = true;
+  info = solver.solve(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+
+// check x
+  VectorXd x_ref(n);
+  x_ref <<
+     -0.5706545,    -0.6816283,    -0.7017325,
+     -0.7042129,     -0.701369,    -0.6918656,
+     -0.665792,    -0.5960342,    -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+struct hybrd_functor : Functor<double>
+{
+    hybrd_functor(void) : Functor<double>(9,9) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double temp, temp1, temp2;
+        const VectorXd::Index n = x.size();
+
+        assert(fvec.size()==n);
+        for (VectorXd::Index k=0; k < n; k++)
+        {
+            temp = (3. - 2.*x[k])*x[k];
+            temp1 = 0.;
+            if (k) temp1 = x[k-1];
+            temp2 = 0.;
+            if (k != n-1) temp2 = x[k+1];
+            fvec[k] = temp - temp1 - 2.*temp2 + 1.;
+        }
+        return 0;
+    }
+};
+
+void testHybrd1()
+{
+  int n=9, info;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough solution. */
+  x.setConstant(n, -1.);
+
+  // do the computation
+  hybrd_functor functor;
+  HybridNonLinearSolver<hybrd_functor> solver(functor);
+  info = solver.hybrd1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(solver.nfev, 20);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << -0.5706545, -0.6816283, -0.7017325, -0.7042129, -0.701369, -0.6918656, -0.665792, -0.5960342, -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testHybrd()
+{
+  const int n=9;
+  int info;
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, -1.);
+
+  // do the computation
+  hybrd_functor functor;
+  HybridNonLinearSolver<hybrd_functor> solver(functor);
+  solver.parameters.nb_of_subdiagonals = 1;
+  solver.parameters.nb_of_superdiagonals = 1;
+  solver.diag.setConstant(n, 1.);
+  solver.useExternalScaling = true;
+  info = solver.solveNumericalDiff(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(solver.nfev, 14);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref <<
+      -0.5706545,    -0.6816283,    -0.7017325,
+      -0.7042129,     -0.701369,    -0.6918656,
+      -0.665792,    -0.5960342,    -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+struct lmstr_functor : Functor<double>
+{
+    lmstr_functor(void) : Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec)
+    {
+        /*  subroutine fcn for lmstr1 example. */
+        double tmp1, tmp2, tmp3;
+        static const double y[15]={1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        assert(15==fvec.size());
+        assert(3==x.size());
+
+        for (int i=0; i<15; i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+    int df(const VectorXd &x, VectorXd &jac_row, VectorXd::Index rownb)
+    {
+        assert(x.size()==3);
+        assert(jac_row.size()==x.size());
+        double tmp1, tmp2, tmp3, tmp4;
+
+        VectorXd::Index i = rownb-2;
+        tmp1 = i+1;
+        tmp2 = 16 - i - 1;
+        tmp3 = (i>=8)? tmp2 : tmp1;
+        tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4;
+        jac_row[0] = -1;
+        jac_row[1] = tmp1*tmp2/tmp4;
+        jac_row[2] = tmp1*tmp3/tmp4;
+        return 0;
+    }
+};
+
+void testLmstr1()
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmstr_functor functor;
+  LevenbergMarquardt<lmstr_functor> lm(functor);
+  info = lm.lmstr1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695 ;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testLmstr()
+{
+  const int n=3;
+  int info;
+  double fnorm;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmstr_functor functor;
+  LevenbergMarquardt<lmstr_functor> lm(functor);
+  info = lm.minimizeOptimumStorage(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  fnorm = lm.fvec.blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+struct lmdif_functor : Functor<double>
+{
+    lmdif_functor(void) : Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        int i;
+        double tmp1,tmp2,tmp3;
+        static const double y[15]={1.4e-1,1.8e-1,2.2e-1,2.5e-1,2.9e-1,3.2e-1,3.5e-1,3.9e-1,
+            3.7e-1,5.8e-1,7.3e-1,9.6e-1,1.34e0,2.1e0,4.39e0};
+
+        assert(x.size()==3);
+        assert(fvec.size()==15);
+        for (i=0; i<15; i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 15 - i;
+            tmp3 = tmp1;
+
+            if (i >= 8) tmp3 = tmp2;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+};
+
+void testLmdif1()
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n), fvec(15);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning
+  info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(nfev, 26);
+
+  // check norm
+  functor(x, fvec);
+  VERIFY_IS_APPROX(fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.0824106, 1.1330366, 2.3436947;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+void testLmdif()
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  NumericalDiff<lmdif_functor> numDiff(functor);
+  LevenbergMarquardt<NumericalDiff<lmdif_functor> > lm(numDiff);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev, 26);
+
+  // check norm
+  fnorm = lm.fvec.blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.fjac, lm.permutation.indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869942,  -0.002656662,
+      0.002869942,    0.09480937,   -0.09098997,
+      -0.002656662,   -0.09098997,    0.08778729;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.fjac.topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct chwirut2_functor : Functor<double>
+{
+    chwirut2_functor(void) : Functor<double>(3,54) {}
+    static const double m_x[54];
+    static const double m_y[54];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        int i;
+
+        assert(b.size()==3);
+        assert(fvec.size()==54);
+        for(i=0; i<54; i++) {
+            double x = m_x[i];
+            fvec[i] = exp(-b[0]*x)/(b[1]+b[2]*x) - m_y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==54);
+        assert(fjac.cols()==3);
+        for(int i=0; i<54; i++) {
+            double x = m_x[i];
+            double factor = 1./(b[1]+b[2]*x);
+            double e = exp(-b[0]*x);
+            fjac(i,0) = -x*e*factor;
+            fjac(i,1) = -e*factor*factor;
+            fjac(i,2) = -x*e*factor*factor;
+        }
+        return 0;
+    }
+};
+const double chwirut2_functor::m_x[54] = { 0.500E0, 1.000E0, 1.750E0, 3.750E0, 5.750E0, 0.875E0, 2.250E0, 3.250E0, 5.250E0, 0.750E0, 1.750E0, 2.750E0, 4.750E0, 0.625E0, 1.250E0, 2.250E0, 4.250E0, .500E0, 3.000E0, .750E0, 3.000E0, 1.500E0, 6.000E0, 3.000E0, 6.000E0, 1.500E0, 3.000E0, .500E0, 2.000E0, 4.000E0, .750E0, 2.000E0, 5.000E0, .750E0, 2.250E0, 3.750E0, 5.750E0, 3.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .500E0, 6.000E0, 3.000E0, .500E0, 2.750E0, .500E0, 1.750E0};
+const double chwirut2_functor::m_y[54] = { 92.9000E0 ,57.1000E0 ,31.0500E0 ,11.5875E0 ,8.0250E0 ,63.6000E0 ,21.4000E0 ,14.2500E0 ,8.4750E0 ,63.8000E0 ,26.8000E0 ,16.4625E0 ,7.1250E0 ,67.3000E0 ,41.0000E0 ,21.1500E0 ,8.1750E0 ,81.5000E0 ,13.1200E0 ,59.9000E0 ,14.6200E0 ,32.9000E0 ,5.4400E0 ,12.5600E0 ,5.4400E0 ,32.0000E0 ,13.9500E0 ,75.8000E0 ,20.0000E0 ,10.4200E0 ,59.5000E0 ,21.6700E0 ,8.5500E0 ,62.0000E0 ,20.2000E0 ,7.7600E0 ,3.7500E0 ,11.8100E0 ,54.7000E0 ,23.7000E0 ,11.5500E0 ,61.3000E0 ,17.7000E0 ,8.7400E0 ,59.2000E0 ,16.3000E0 ,8.6200E0 ,81.0000E0 ,4.8700E0 ,14.6200E0 ,81.7000E0 ,17.1700E0 ,81.3000E0 ,28.9000E0  };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/chwirut2.shtml
+void testNistChwirut2(void)
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 0.1, 0.01, 0.02;
+  // do the computation
+  chwirut2_functor functor;
+  LevenbergMarquardt<chwirut2_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 10, 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+
+  /*
+   * Second try
+   */
+  x<< 0.15, 0.008, 0.010;
+  // do the computation
+  lm.resetParameters();
+  lm.parameters.ftol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E6*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 7, 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+}
+
+
+struct misra1a_functor : Functor<double>
+{
+    misra1a_functor(void) : Functor<double>(2,14) {}
+    static const double m_x[14];
+    static const double m_y[14];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*(1.-exp(-b[1]*m_x[i])) - m_y[i] ;
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            fjac(i,0) = (1.-exp(-b[1]*m_x[i]));
+            fjac(i,1) = (b[0]*m_x[i]*exp(-b[1]*m_x[i]));
+        }
+        return 0;
+    }
+};
+const double misra1a_functor::m_x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1a_functor::m_y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1a.shtml
+void testNistMisra1a(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1a_functor functor;
+  LevenbergMarquardt<misra1a_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 19, 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+
+  /*
+   * Second try
+   */
+  x<< 250., 0.0005;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 5, 4);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+}
+
+struct hahn1_functor : Functor<double>
+{
+    hahn1_functor(void) : Functor<double>(7,236) {}
+    static const double m_x[236];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        static const double m_y[236] = { .591E0 , 1.547E0 , 2.902E0 , 2.894E0 , 4.703E0 , 6.307E0 , 7.03E0  , 7.898E0 , 9.470E0 , 9.484E0 , 10.072E0 , 10.163E0 , 11.615E0 , 12.005E0 , 12.478E0 , 12.982E0 , 12.970E0 , 13.926E0 , 14.452E0 , 14.404E0 , 15.190E0 , 15.550E0 , 15.528E0 , 15.499E0 , 16.131E0 , 16.438E0 , 16.387E0 , 16.549E0 , 16.872E0 , 16.830E0 , 16.926E0 , 16.907E0 , 16.966E0 , 17.060E0 , 17.122E0 , 17.311E0 , 17.355E0 , 17.668E0 , 17.767E0 , 17.803E0 , 17.765E0 , 17.768E0 , 17.736E0 , 17.858E0 , 17.877E0 , 17.912E0 , 18.046E0 , 18.085E0 , 18.291E0 , 18.357E0 , 18.426E0 , 18.584E0 , 18.610E0 , 18.870E0 , 18.795E0 , 19.111E0 , .367E0 , .796E0 , 0.892E0 , 1.903E0 , 2.150E0 , 3.697E0 , 5.870E0 , 6.421E0 , 7.422E0 , 9.944E0 , 11.023E0 , 11.87E0  , 12.786E0 , 14.067E0 , 13.974E0 , 14.462E0 , 14.464E0 , 15.381E0 , 15.483E0 , 15.59E0  , 16.075E0 , 16.347E0 , 16.181E0 , 16.915E0 , 17.003E0 , 16.978E0 , 17.756E0 , 17.808E0 , 17.868E0 , 18.481E0 , 18.486E0 , 19.090E0 , 16.062E0 , 16.337E0 , 16.345E0 ,
+        16.388E0 , 17.159E0 , 17.116E0 , 17.164E0 , 17.123E0 , 17.979E0 , 17.974E0 , 18.007E0 , 17.993E0 , 18.523E0 , 18.669E0 , 18.617E0 , 19.371E0 , 19.330E0 , 0.080E0 , 0.248E0 , 1.089E0 , 1.418E0 , 2.278E0 , 3.624E0 , 4.574E0 , 5.556E0 , 7.267E0 , 7.695E0 , 9.136E0 , 9.959E0 , 9.957E0 , 11.600E0 , 13.138E0 , 13.564E0 , 13.871E0 , 13.994E0 , 14.947E0 , 15.473E0 , 15.379E0 , 15.455E0 , 15.908E0 , 16.114E0 , 17.071E0 , 17.135E0 , 17.282E0 , 17.368E0 , 17.483E0 , 17.764E0 , 18.185E0 , 18.271E0 , 18.236E0 , 18.237E0 , 18.523E0 , 18.627E0 , 18.665E0 , 19.086E0 , 0.214E0 , 0.943E0 , 1.429E0 , 2.241E0 , 2.951E0 , 3.782E0 , 4.757E0 , 5.602E0 , 7.169E0 , 8.920E0 , 10.055E0 , 12.035E0 , 12.861E0 , 13.436E0 , 14.167E0 , 14.755E0 , 15.168E0 , 15.651E0 , 15.746E0 , 16.216E0 , 16.445E0 , 16.965E0 , 17.121E0 , 17.206E0 , 17.250E0 , 17.339E0 , 17.793E0 , 18.123E0 , 18.49E0  , 18.566E0 , 18.645E0 , 18.706E0 , 18.924E0 , 19.1E0   , 0.375E0 , 0.471E0 , 1.504E0 , 2.204E0 , 2.813E0 , 4.765E0 , 9.835E0 , 10.040E0 , 11.946E0 , 12.596E0 , 
+13.303E0 , 13.922E0 , 14.440E0 , 14.951E0 , 15.627E0 , 15.639E0 , 15.814E0 , 16.315E0 , 16.334E0 , 16.430E0 , 16.423E0 , 17.024E0 , 17.009E0 , 17.165E0 , 17.134E0 , 17.349E0 , 17.576E0 , 17.848E0 , 18.090E0 , 18.276E0 , 18.404E0 , 18.519E0 , 19.133E0 , 19.074E0 , 19.239E0 , 19.280E0 , 19.101E0 , 19.398E0 , 19.252E0 , 19.89E0  , 20.007E0 , 19.929E0 , 19.268E0 , 19.324E0 , 20.049E0 , 20.107E0 , 20.062E0 , 20.065E0 , 19.286E0 , 19.972E0 , 20.088E0 , 20.743E0 , 20.83E0  , 20.935E0 , 21.035E0 , 20.93E0  , 21.074E0 , 21.085E0 , 20.935E0 };
+
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+
+        assert(b.size()==7);
+        assert(fvec.size()==236);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - m_y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==236);
+        assert(fjac.cols()==7);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx);
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact;
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double hahn1_functor::m_x[236] = { 24.41E0 , 34.82E0 , 44.09E0 , 45.07E0 , 54.98E0 , 65.51E0 , 70.53E0 , 75.70E0 , 89.57E0 , 91.14E0 , 96.40E0 , 97.19E0 , 114.26E0 , 120.25E0 , 127.08E0 , 133.55E0 , 133.61E0 , 158.67E0 , 172.74E0 , 171.31E0 , 202.14E0 , 220.55E0 , 221.05E0 , 221.39E0 , 250.99E0 , 268.99E0 , 271.80E0 , 271.97E0 , 321.31E0 , 321.69E0 , 330.14E0 , 333.03E0 , 333.47E0 , 340.77E0 , 345.65E0 , 373.11E0 , 373.79E0 , 411.82E0 , 419.51E0 , 421.59E0 , 422.02E0 , 422.47E0 , 422.61E0 , 441.75E0 , 447.41E0 , 448.7E0  , 472.89E0 , 476.69E0 , 522.47E0 , 522.62E0 , 524.43E0 , 546.75E0 , 549.53E0 , 575.29E0 , 576.00E0 , 625.55E0 , 20.15E0 , 28.78E0 , 29.57E0 , 37.41E0 , 39.12E0 , 50.24E0 , 61.38E0 , 66.25E0 , 73.42E0 , 95.52E0 , 107.32E0 , 122.04E0 , 134.03E0 , 163.19E0 , 163.48E0 , 175.70E0 , 179.86E0 , 211.27E0 , 217.78E0 , 219.14E0 , 262.52E0 , 268.01E0 , 268.62E0 , 336.25E0 , 337.23E0 , 339.33E0 , 427.38E0 , 428.58E0 , 432.68E0 , 528.99E0 , 531.08E0 , 628.34E0 , 253.24E0 , 273.13E0 , 273.66E0 ,
+282.10E0 , 346.62E0 , 347.19E0 , 348.78E0 , 351.18E0 , 450.10E0 , 450.35E0 , 451.92E0 , 455.56E0 , 552.22E0 , 553.56E0 , 555.74E0 , 652.59E0 , 656.20E0 , 14.13E0 , 20.41E0 , 31.30E0 , 33.84E0 , 39.70E0 , 48.83E0 , 54.50E0 , 60.41E0 , 72.77E0 , 75.25E0 , 86.84E0 , 94.88E0 , 96.40E0 , 117.37E0 , 139.08E0 , 147.73E0 , 158.63E0 , 161.84E0 , 192.11E0 , 206.76E0 , 209.07E0 , 213.32E0 , 226.44E0 , 237.12E0 , 330.90E0 , 358.72E0 , 370.77E0 , 372.72E0 , 396.24E0 , 416.59E0 , 484.02E0 , 495.47E0 , 514.78E0 , 515.65E0 , 519.47E0 , 544.47E0 , 560.11E0 , 620.77E0 , 18.97E0 , 28.93E0 , 33.91E0 , 40.03E0 , 44.66E0 , 49.87E0 , 55.16E0 , 60.90E0 , 72.08E0 , 85.15E0 , 97.06E0 , 119.63E0 , 133.27E0 , 143.84E0 , 161.91E0 , 180.67E0 , 198.44E0 , 226.86E0 , 229.65E0 , 258.27E0 , 273.77E0 , 339.15E0 , 350.13E0 , 362.75E0 , 371.03E0 , 393.32E0 , 448.53E0 , 473.78E0 , 511.12E0 , 524.70E0 , 548.75E0 , 551.64E0 , 574.02E0 , 623.86E0 , 21.46E0 , 24.33E0 , 33.43E0 , 39.22E0 , 44.18E0 , 55.02E0 , 94.33E0 , 96.44E0 , 118.82E0 , 128.48E0 ,
+141.94E0 , 156.92E0 , 171.65E0 , 190.00E0 , 223.26E0 , 223.88E0 , 231.50E0 , 265.05E0 , 269.44E0 , 271.78E0 , 273.46E0 , 334.61E0 , 339.79E0 , 349.52E0 , 358.18E0 , 377.98E0 , 394.77E0 , 429.66E0 , 468.22E0 , 487.27E0 , 519.54E0 , 523.03E0 , 612.99E0 , 638.59E0 , 641.36E0 , 622.05E0 , 631.50E0 , 663.97E0 , 646.9E0  , 748.29E0 , 749.21E0 , 750.14E0 , 647.04E0 , 646.89E0 , 746.9E0  , 748.43E0 , 747.35E0 , 749.27E0 , 647.61E0 , 747.78E0 , 750.51E0 , 851.37E0 , 845.97E0 , 847.54E0 , 849.93E0 , 851.61E0 , 849.75E0 , 850.98E0 , 848.23E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/hahn1.shtml
+void testNistHahn1(void)
+{
+  const int  n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 10., -1., .05, -.00001, -.05, .001, -.000001;
+  // do the computation
+  hahn1_functor functor;
+  LevenbergMarquardt<hahn1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 11, 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.0776351733E+00);
+  VERIFY_IS_APPROX(x[1],-1.2269296921E-01);
+  VERIFY_IS_APPROX(x[2], 4.0863750610E-03);
+  VERIFY_IS_APPROX(x[3],-1.426264e-06); // shoulde be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 2.4053735503E-04);
+  VERIFY_IS_APPROX(x[6],-1.2314450199E-07);
+
+  /*
+   * Second try
+   */
+  x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 11, 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.077640); // should be :  1.0776351733E+00
+  VERIFY_IS_APPROX(x[1], -0.1226933); // should be : -1.2269296921E-01
+  VERIFY_IS_APPROX(x[2], 0.004086383); // should be : 4.0863750610E-03
+  VERIFY_IS_APPROX(x[3], -1.426277e-06); // shoulde be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 0.00024053772); // should be : 2.4053735503E-04
+  VERIFY_IS_APPROX(x[6], -1.231450e-07); // should be : -1.2314450199E-07
+
+}
+
+struct misra1d_functor : Functor<double>
+{
+    misra1d_functor(void) : Functor<double>(2,14) {}
+    static const double x[14];
+    static const double y[14];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*b[1]*x[i]/(1.+b[1]*x[i]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            double den = 1.+b[1]*x[i];
+            fjac(i,0) = b[1]*x[i] / den;
+            fjac(i,1) = b[0]*x[i]*(den-b[1]*x[i])/den/den;
+        }
+        return 0;
+    }
+};
+const double misra1d_functor::x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1d_functor::y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1d.shtml
+void testNistMisra1d(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1d_functor functor;
+  LevenbergMarquardt<misra1d_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 3);
+  LM_CHECK_N_ITERS(lm, 9, 7);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+
+  /*
+   * Second try
+   */
+  x<< 450., 0.0003;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 4, 3);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+}
+
+
+struct lanczos1_functor : Functor<double>
+{
+    lanczos1_functor(void) : Functor<double>(6,24) {}
+    static const double x[24];
+    static const double y[24];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==6);
+        assert(fvec.size()==24);
+        for(int i=0; i<24; i++)
+            fvec[i] = b[0]*exp(-b[1]*x[i]) + b[2]*exp(-b[3]*x[i]) + b[4]*exp(-b[5]*x[i])  - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==6);
+        assert(fjac.rows()==24);
+        assert(fjac.cols()==6);
+        for(int i=0; i<24; i++) {
+            fjac(i,0) = exp(-b[1]*x[i]);
+            fjac(i,1) = -b[0]*x[i]*exp(-b[1]*x[i]);
+            fjac(i,2) = exp(-b[3]*x[i]);
+            fjac(i,3) = -b[2]*x[i]*exp(-b[3]*x[i]);
+            fjac(i,4) = exp(-b[5]*x[i]);
+            fjac(i,5) = -b[4]*x[i]*exp(-b[5]*x[i]);
+        }
+        return 0;
+    }
+};
+const double lanczos1_functor::x[24] = { 0.000000000000E+00, 5.000000000000E-02, 1.000000000000E-01, 1.500000000000E-01, 2.000000000000E-01, 2.500000000000E-01, 3.000000000000E-01, 3.500000000000E-01, 4.000000000000E-01, 4.500000000000E-01, 5.000000000000E-01, 5.500000000000E-01, 6.000000000000E-01, 6.500000000000E-01, 7.000000000000E-01, 7.500000000000E-01, 8.000000000000E-01, 8.500000000000E-01, 9.000000000000E-01, 9.500000000000E-01, 1.000000000000E+00, 1.050000000000E+00, 1.100000000000E+00, 1.150000000000E+00 };
+const double lanczos1_functor::y[24] = { 2.513400000000E+00 ,2.044333373291E+00 ,1.668404436564E+00 ,1.366418021208E+00 ,1.123232487372E+00 ,9.268897180037E-01 ,7.679338563728E-01 ,6.388775523106E-01 ,5.337835317402E-01 ,4.479363617347E-01 ,3.775847884350E-01 ,3.197393199326E-01 ,2.720130773746E-01 ,2.324965529032E-01 ,1.996589546065E-01 ,1.722704126914E-01 ,1.493405660168E-01 ,1.300700206922E-01 ,1.138119324644E-01 ,1.000415587559E-01 ,8.833209084540E-02 ,7.833544019350E-02 ,6.976693743449E-02 ,6.239312536719E-02 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/lanczos1.shtml
+void testNistLanczos1(void)
+{
+  const int n=6;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1.2, 0.3, 5.6, 5.5, 6.5, 7.6;
+  // do the computation
+  lanczos1_functor functor;
+  LevenbergMarquardt<lanczos1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 2);
+  LM_CHECK_N_ITERS(lm, 79, 72);
+  // check norm^2
+  std::cout.precision(30);
+  std::cout << lm.fvec.squaredNorm() << "\n";
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+  /*
+   * Second try
+   */
+  x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 2);
+  LM_CHECK_N_ITERS(lm, 9, 8);
+  // check norm^2
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+}
+
+struct rat42_functor : Functor<double>
+{
+    rat42_functor(void) : Functor<double>(3,9) {}
+    static const double x[9];
+    static const double y[9];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==9);
+        for(int i=0; i<9; i++) {
+            fvec[i] = b[0] / (1.+exp(b[1]-b[2]*x[i])) - y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==9);
+        assert(fjac.cols()==3);
+        for(int i=0; i<9; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            fjac(i,0) = 1./(1.+e);
+            fjac(i,1) = -b[0]*e/(1.+e)/(1.+e);
+            fjac(i,2) = +b[0]*e*x[i]/(1.+e)/(1.+e);
+        }
+        return 0;
+    }
+};
+const double rat42_functor::x[9] = { 9.000E0, 14.000E0, 21.000E0, 28.000E0, 42.000E0, 57.000E0, 63.000E0, 70.000E0, 79.000E0 };
+const double rat42_functor::y[9] = { 8.930E0 ,10.800E0 ,18.590E0 ,22.330E0 ,39.350E0 ,56.110E0 ,61.730E0 ,64.620E0 ,67.080E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky2.shtml
+void testNistRat42(void)
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 1., 0.1;
+  // do the computation
+  rat42_functor functor;
+  LevenbergMarquardt<rat42_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 10, 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+
+  /*
+   * Second try
+   */
+  x<< 75., 2.5, 0.07;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+}
+
+struct MGH10_functor : Functor<double>
+{
+    MGH10_functor(void) : Functor<double>(3,16) {}
+    static const double x[16];
+    static const double y[16];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==16);
+        for(int i=0; i<16; i++)
+            fvec[i] =  b[0] * exp(b[1]/(x[i]+b[2])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==16);
+        assert(fjac.cols()==3);
+        for(int i=0; i<16; i++) {
+            double factor = 1./(x[i]+b[2]);
+            double e = exp(b[1]*factor);
+            fjac(i,0) = e;
+            fjac(i,1) = b[0]*factor*e;
+            fjac(i,2) = -b[1]*b[0]*factor*factor*e;
+        }
+        return 0;
+    }
+};
+const double MGH10_functor::x[16] = { 5.000000E+01, 5.500000E+01, 6.000000E+01, 6.500000E+01, 7.000000E+01, 7.500000E+01, 8.000000E+01, 8.500000E+01, 9.000000E+01, 9.500000E+01, 1.000000E+02, 1.050000E+02, 1.100000E+02, 1.150000E+02, 1.200000E+02, 1.250000E+02 };
+const double MGH10_functor::y[16] = { 3.478000E+04, 2.861000E+04, 2.365000E+04, 1.963000E+04, 1.637000E+04, 1.372000E+04, 1.154000E+04, 9.744000E+03, 8.261000E+03, 7.030000E+03, 6.005000E+03, 5.147000E+03, 4.427000E+03, 3.820000E+03, 3.307000E+03, 2.872000E+03 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh10.shtml
+void testNistMGH10(void)
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 2., 400000., 25000.;
+  // do the computation
+  MGH10_functor functor;
+  LevenbergMarquardt<MGH10_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 2); 
+  LM_CHECK_N_ITERS(lm, 284, 249); 
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+
+  /*
+   * Second try
+   */
+  x<< 0.02, 4000., 250.;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 3);
+  LM_CHECK_N_ITERS(lm, 126, 116);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+}
+
+
+struct BoxBOD_functor : Functor<double>
+{
+    BoxBOD_functor(void) : Functor<double>(2,6) {}
+    static const double x[6];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        static const double y[6] = { 109., 149., 149., 191., 213., 224. };
+        assert(b.size()==2);
+        assert(fvec.size()==6);
+        for(int i=0; i<6; i++)
+            fvec[i] =  b[0]*(1.-exp(-b[1]*x[i])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==6);
+        assert(fjac.cols()==2);
+        for(int i=0; i<6; i++) {
+            double e = exp(-b[1]*x[i]);
+            fjac(i,0) = 1.-e;
+            fjac(i,1) = b[0]*x[i]*e;
+        }
+        return 0;
+    }
+};
+const double BoxBOD_functor::x[6] = { 1., 2., 3., 5., 7., 10. };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/boxbod.shtml
+void testNistBoxBOD(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 1.;
+  // do the computation
+  BoxBOD_functor functor;
+  LevenbergMarquardt<BoxBOD_functor> lm(functor);
+  lm.parameters.ftol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.factor = 10.;
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 31, 25);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+
+  /*
+   * Second try
+   */
+  x<< 100., 0.75;
+  // do the computation
+  lm.resetParameters();
+  lm.parameters.ftol = NumTraits<double>::epsilon();
+  lm.parameters.xtol = NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 15, 14);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+}
+
+struct MGH17_functor : Functor<double>
+{
+    MGH17_functor(void) : Functor<double>(5,33) {}
+    static const double x[33];
+    static const double y[33];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==5);
+        assert(fvec.size()==33);
+        for(int i=0; i<33; i++)
+            fvec[i] =  b[0] + b[1]*exp(-b[3]*x[i]) +  b[2]*exp(-b[4]*x[i]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==5);
+        assert(fjac.rows()==33);
+        assert(fjac.cols()==5);
+        for(int i=0; i<33; i++) {
+            fjac(i,0) = 1.;
+            fjac(i,1) = exp(-b[3]*x[i]);
+            fjac(i,2) = exp(-b[4]*x[i]);
+            fjac(i,3) = -x[i]*b[1]*exp(-b[3]*x[i]);
+            fjac(i,4) = -x[i]*b[2]*exp(-b[4]*x[i]);
+        }
+        return 0;
+    }
+};
+const double MGH17_functor::x[33] = { 0.000000E+00, 1.000000E+01, 2.000000E+01, 3.000000E+01, 4.000000E+01, 5.000000E+01, 6.000000E+01, 7.000000E+01, 8.000000E+01, 9.000000E+01, 1.000000E+02, 1.100000E+02, 1.200000E+02, 1.300000E+02, 1.400000E+02, 1.500000E+02, 1.600000E+02, 1.700000E+02, 1.800000E+02, 1.900000E+02, 2.000000E+02, 2.100000E+02, 2.200000E+02, 2.300000E+02, 2.400000E+02, 2.500000E+02, 2.600000E+02, 2.700000E+02, 2.800000E+02, 2.900000E+02, 3.000000E+02, 3.100000E+02, 3.200000E+02 };
+const double MGH17_functor::y[33] = { 8.440000E-01, 9.080000E-01, 9.320000E-01, 9.360000E-01, 9.250000E-01, 9.080000E-01, 8.810000E-01, 8.500000E-01, 8.180000E-01, 7.840000E-01, 7.510000E-01, 7.180000E-01, 6.850000E-01, 6.580000E-01, 6.280000E-01, 6.030000E-01, 5.800000E-01, 5.580000E-01, 5.380000E-01, 5.220000E-01, 5.060000E-01, 4.900000E-01, 4.780000E-01, 4.670000E-01, 4.570000E-01, 4.480000E-01, 4.380000E-01, 4.310000E-01, 4.240000E-01, 4.200000E-01, 4.140000E-01, 4.110000E-01, 4.060000E-01 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh17.shtml
+void testNistMGH17(void)
+{
+  const int n=5;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 50., 150., -100., 1., 2.;
+  // do the computation
+  MGH17_functor functor;
+  LevenbergMarquardt<MGH17_functor> lm(functor);
+  lm.parameters.ftol = NumTraits<double>::epsilon();
+  lm.parameters.xtol = NumTraits<double>::epsilon();
+  lm.parameters.maxfev = 1000;
+  info = lm.minimize(x);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+  
+  // check return value
+  VERIFY_IS_EQUAL(info, 2); 
+  LM_CHECK_N_ITERS(lm, 602, 545);
+
+  /*
+   * Second try
+   */
+  x<< 0.5  ,1.5  ,-1   ,0.01 ,0.02;
+  // do the computation
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 18, 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+}
+
+struct MGH09_functor : Functor<double>
+{
+    MGH09_functor(void) : Functor<double>(4,11) {}
+    static const double _x[11];
+    static const double y[11];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==11);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            fvec[i] = b[0]*(xx+x*b[1])/(xx+x*b[2]+b[3]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==11);
+        assert(fjac.cols()==4);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            double factor = 1./(xx+x*b[2]+b[3]);
+            fjac(i,0) = (xx+x*b[1]) * factor;
+            fjac(i,1) = b[0]*x* factor;
+            fjac(i,2) = - b[0]*(xx+x*b[1]) * x * factor * factor;
+            fjac(i,3) = - b[0]*(xx+x*b[1]) * factor * factor;
+        }
+        return 0;
+    }
+};
+const double MGH09_functor::_x[11] = { 4., 2., 1., 5.E-1 , 2.5E-01, 1.670000E-01, 1.250000E-01,  1.E-01, 8.330000E-02, 7.140000E-02, 6.250000E-02 };
+const double MGH09_functor::y[11] = { 1.957000E-01, 1.947000E-01, 1.735000E-01, 1.600000E-01, 8.440000E-02, 6.270000E-02, 4.560000E-02, 3.420000E-02, 3.230000E-02, 2.350000E-02, 2.460000E-02 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh09.shtml
+void testNistMGH09(void)
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 25., 39, 41.5, 39.;
+  // do the computation
+  MGH09_functor functor;
+  LevenbergMarquardt<MGH09_functor> lm(functor);
+  lm.parameters.maxfev = 1000;
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 490, 376);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.1928077089); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126423573); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01
+
+  /*
+   * Second try
+   */
+  x<< 0.25, 0.39, 0.415, 0.39;
+  // do the computation
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 18, 16);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.19280781); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126265); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305280); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605322); // should be 1.3606233068E-01
+}
+
+
+
+struct Bennett5_functor : Functor<double>
+{
+    Bennett5_functor(void) : Functor<double>(3,154) {}
+    static const double x[154];
+    static const double y[154];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==154);
+        for(int i=0; i<154; i++)
+            fvec[i] = b[0]* pow(b[1]+x[i],-1./b[2]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==154);
+        assert(fjac.cols()==3);
+        for(int i=0; i<154; i++) {
+            double e = pow(b[1]+x[i],-1./b[2]);
+            fjac(i,0) = e;
+            fjac(i,1) = - b[0]*e/b[2]/(b[1]+x[i]);
+            fjac(i,2) = b[0]*e*log(b[1]+x[i])/b[2]/b[2];
+        }
+        return 0;
+    }
+};
+const double Bennett5_functor::x[154] = { 7.447168E0, 8.102586E0, 8.452547E0, 8.711278E0, 8.916774E0, 9.087155E0, 9.232590E0, 9.359535E0, 9.472166E0, 9.573384E0, 9.665293E0, 9.749461E0, 9.827092E0, 9.899128E0, 9.966321E0, 10.029280E0, 10.088510E0, 10.144430E0, 10.197380E0, 10.247670E0, 10.295560E0, 10.341250E0, 10.384950E0, 10.426820E0, 10.467000E0, 10.505640E0, 10.542830E0, 10.578690E0, 10.613310E0, 10.646780E0, 10.679150E0, 10.710520E0, 10.740920E0, 10.770440E0, 10.799100E0, 10.826970E0, 10.854080E0, 10.880470E0, 10.906190E0, 10.931260E0, 10.955720E0, 10.979590E0, 11.002910E0, 11.025700E0, 11.047980E0, 11.069770E0, 11.091100E0, 11.111980E0, 11.132440E0, 11.152480E0, 11.172130E0, 11.191410E0, 11.210310E0, 11.228870E0, 11.247090E0, 11.264980E0, 11.282560E0, 11.299840E0, 11.316820E0, 11.333520E0, 11.349940E0, 11.366100E0, 11.382000E0, 11.397660E0, 11.413070E0, 11.428240E0, 11.443200E0, 11.457930E0, 11.472440E0, 11.486750E0, 11.500860E0, 11.514770E0, 11.528490E0, 11.542020E0, 11.555380E0, 11.568550E0,
+11.581560E0, 11.594420E0, 11.607121E0, 11.619640E0, 11.632000E0, 11.644210E0, 11.656280E0, 11.668200E0, 11.679980E0, 11.691620E0, 11.703130E0, 11.714510E0, 11.725760E0, 11.736880E0, 11.747890E0, 11.758780E0, 11.769550E0, 11.780200E0, 11.790730E0, 11.801160E0, 11.811480E0, 11.821700E0, 11.831810E0, 11.841820E0, 11.851730E0, 11.861550E0, 11.871270E0, 11.880890E0, 11.890420E0, 11.899870E0, 11.909220E0, 11.918490E0, 11.927680E0, 11.936780E0, 11.945790E0, 11.954730E0, 11.963590E0, 11.972370E0, 11.981070E0, 11.989700E0, 11.998260E0, 12.006740E0, 12.015150E0, 12.023490E0, 12.031760E0, 12.039970E0, 12.048100E0, 12.056170E0, 12.064180E0, 12.072120E0, 12.080010E0, 12.087820E0, 12.095580E0, 12.103280E0, 12.110920E0, 12.118500E0, 12.126030E0, 12.133500E0, 12.140910E0, 12.148270E0, 12.155570E0, 12.162830E0, 12.170030E0, 12.177170E0, 12.184270E0, 12.191320E0, 12.198320E0, 12.205270E0, 12.212170E0, 12.219030E0, 12.225840E0, 12.232600E0, 12.239320E0, 12.245990E0, 12.252620E0, 12.259200E0, 12.265750E0, 12.272240E0 };
+const double Bennett5_functor::y[154] = { -34.834702E0 ,-34.393200E0 ,-34.152901E0 ,-33.979099E0 ,-33.845901E0 ,-33.732899E0 ,-33.640301E0 ,-33.559200E0 ,-33.486801E0 ,-33.423100E0 ,-33.365101E0 ,-33.313000E0 ,-33.260899E0 ,-33.217400E0 ,-33.176899E0 ,-33.139198E0 ,-33.101601E0 ,-33.066799E0 ,-33.035000E0 ,-33.003101E0 ,-32.971298E0 ,-32.942299E0 ,-32.916302E0 ,-32.890202E0 ,-32.864101E0 ,-32.841000E0 ,-32.817799E0 ,-32.797501E0 ,-32.774300E0 ,-32.757000E0 ,-32.733799E0 ,-32.716400E0 ,-32.699100E0 ,-32.678799E0 ,-32.661400E0 ,-32.644001E0 ,-32.626701E0 ,-32.612202E0 ,-32.597698E0 ,-32.583199E0 ,-32.568699E0 ,-32.554298E0 ,-32.539799E0 ,-32.525299E0 ,-32.510799E0 ,-32.499199E0 ,-32.487598E0 ,-32.473202E0 ,-32.461601E0 ,-32.435501E0 ,-32.435501E0 ,-32.426800E0 ,-32.412300E0 ,-32.400799E0 ,-32.392101E0 ,-32.380501E0 ,-32.366001E0 ,-32.357300E0 ,-32.348598E0 ,-32.339901E0 ,-32.328400E0 ,-32.319698E0 ,-32.311001E0 ,-32.299400E0 ,-32.290699E0 ,-32.282001E0 ,-32.273300E0 ,-32.264599E0 ,-32.256001E0 ,-32.247299E0
+,-32.238602E0 ,-32.229900E0 ,-32.224098E0 ,-32.215401E0 ,-32.203800E0 ,-32.198002E0 ,-32.189400E0 ,-32.183601E0 ,-32.174900E0 ,-32.169102E0 ,-32.163300E0 ,-32.154598E0 ,-32.145901E0 ,-32.140099E0 ,-32.131401E0 ,-32.125599E0 ,-32.119801E0 ,-32.111198E0 ,-32.105400E0 ,-32.096699E0 ,-32.090900E0 ,-32.088001E0 ,-32.079300E0 ,-32.073502E0 ,-32.067699E0 ,-32.061901E0 ,-32.056099E0 ,-32.050301E0 ,-32.044498E0 ,-32.038799E0 ,-32.033001E0 ,-32.027199E0 ,-32.024300E0 ,-32.018501E0 ,-32.012699E0 ,-32.004002E0 ,-32.001099E0 ,-31.995300E0 ,-31.989500E0 ,-31.983700E0 ,-31.977900E0 ,-31.972099E0 ,-31.969299E0 ,-31.963501E0 ,-31.957701E0 ,-31.951900E0 ,-31.946100E0 ,-31.940300E0 ,-31.937401E0 ,-31.931601E0 ,-31.925800E0 ,-31.922899E0 ,-31.917101E0 ,-31.911301E0 ,-31.908400E0 ,-31.902599E0 ,-31.896900E0 ,-31.893999E0 ,-31.888201E0 ,-31.885300E0 ,-31.882401E0 ,-31.876600E0 ,-31.873699E0 ,-31.867901E0 ,-31.862101E0 ,-31.859200E0 ,-31.856300E0 ,-31.850500E0 ,-31.844700E0 ,-31.841801E0 ,-31.838900E0 ,-31.833099E0 ,-31.830200E0 ,
+-31.827299E0 ,-31.821600E0 ,-31.818701E0 ,-31.812901E0 ,-31.809999E0 ,-31.807100E0 ,-31.801300E0 ,-31.798401E0 ,-31.795500E0 ,-31.789700E0 ,-31.786800E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/bennett5.shtml
+void testNistBennett5(void)
+{
+  const int  n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< -2000., 50., 0.8;
+  // do the computation
+  Bennett5_functor functor;
+  LevenbergMarquardt<Bennett5_functor> lm(functor);
+  lm.parameters.maxfev = 1000;
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 758, 744);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2.5235058043E+03);
+  VERIFY_IS_APPROX(x[1], 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 9.3218483193E-01);
+  /*
+   * Second try
+   */
+  x<< -1500., 45., 0.85;
+  // do the computation
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 203, 192);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2523.3007865); // should be -2.5235058043E+03
+  VERIFY_IS_APPROX(x[1], 46.735705771); // should be 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 0.93219881891); // should be 9.3218483193E-01);
+}
+
+struct thurber_functor : Functor<double>
+{
+    thurber_functor(void) : Functor<double>(7,37) {}
+    static const double _x[37];
+    static const double _y[37];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+        assert(b.size()==7);
+        assert(fvec.size()==37);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - _y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==37);
+        assert(fjac.cols()==7);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx);
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact;
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double thurber_functor::_x[37] = { -3.067E0, -2.981E0, -2.921E0, -2.912E0, -2.840E0, -2.797E0, -2.702E0, -2.699E0, -2.633E0, -2.481E0, -2.363E0, -2.322E0, -1.501E0, -1.460E0, -1.274E0, -1.212E0, -1.100E0, -1.046E0, -0.915E0, -0.714E0, -0.566E0, -0.545E0, -0.400E0, -0.309E0, -0.109E0, -0.103E0, 0.010E0, 0.119E0, 0.377E0, 0.790E0, 0.963E0, 1.006E0, 1.115E0, 1.572E0, 1.841E0, 2.047E0, 2.200E0 };
+const double thurber_functor::_y[37] = { 80.574E0, 84.248E0, 87.264E0, 87.195E0, 89.076E0, 89.608E0, 89.868E0, 90.101E0, 92.405E0, 95.854E0, 100.696E0, 101.060E0, 401.672E0, 390.724E0, 567.534E0, 635.316E0, 733.054E0, 759.087E0, 894.206E0, 990.785E0, 1090.109E0, 1080.914E0, 1122.643E0, 1178.351E0, 1260.531E0, 1273.514E0, 1288.339E0, 1327.543E0, 1353.863E0, 1414.509E0, 1425.208E0, 1421.384E0, 1442.962E0, 1464.350E0, 1468.705E0, 1447.894E0, 1457.628E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/thurber.shtml
+void testNistThurber(void)
+{
+  const int n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1000 ,1000 ,400 ,40 ,0.7,0.3,0.0 ;
+  // do the computation
+  thurber_functor functor;
+  LevenbergMarquardt<thurber_functor> lm(functor);
+  lm.parameters.ftol = 1.E4*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E4*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 39,36);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+
+  /*
+   * Second try
+   */
+  x<< 1300 ,1500 ,500  ,75   ,1    ,0.4  ,0.05  ;
+  // do the computation
+  lm.resetParameters();
+  lm.parameters.ftol = 1.E4*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E4*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 29, 28);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+}
+
+struct rat43_functor : Functor<double>
+{
+    rat43_functor(void) : Functor<double>(4,15) {}
+    static const double x[15];
+    static const double y[15];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==15);
+        for(int i=0; i<15; i++)
+            fvec[i] = b[0] * pow(1.+exp(b[1]-b[2]*x[i]),-1./b[3]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==15);
+        assert(fjac.cols()==4);
+        for(int i=0; i<15; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            double power = -1./b[3];
+            fjac(i,0) = pow(1.+e, power);
+            fjac(i,1) = power*b[0]*e*pow(1.+e, power-1.);
+            fjac(i,2) = -power*b[0]*e*x[i]*pow(1.+e, power-1.);
+            fjac(i,3) = b[0]*power*power*log(1.+e)*pow(1.+e, power);
+        }
+        return 0;
+    }
+};
+const double rat43_functor::x[15] = { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15. };
+const double rat43_functor::y[15] = { 16.08, 33.83, 65.80, 97.20, 191.55, 326.20, 386.87, 520.53, 590.03, 651.92, 724.93, 699.56, 689.96, 637.56, 717.41 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky3.shtml
+void testNistRat43(void)
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 10., 1., 1.;
+  // do the computation
+  rat43_functor functor;
+  LevenbergMarquardt<rat43_functor> lm(functor);
+  lm.parameters.ftol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E6*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 27, 20);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+
+  /*
+   * Second try
+   */
+  x<< 700., 5., 0.75, 1.3;
+  // do the computation
+  lm.resetParameters();
+  lm.parameters.ftol = 1.E5*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E5*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 9, 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+}
+
+
+
+struct eckerle4_functor : Functor<double>
+{
+    eckerle4_functor(void) : Functor<double>(3,35) {}
+    static const double x[35];
+    static const double y[35];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==35);
+        for(int i=0; i<35; i++)
+            fvec[i] = b[0]/b[1] * exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/(b[1]*b[1])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==35);
+        assert(fjac.cols()==3);
+        for(int i=0; i<35; i++) {
+            double b12 = b[1]*b[1];
+            double e = exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/b12);
+            fjac(i,0) = e / b[1];
+            fjac(i,1) = ((x[i]-b[2])*(x[i]-b[2])/b12-1.) * b[0]*e/b12;
+            fjac(i,2) = (x[i]-b[2])*e*b[0]/b[1]/b12;
+        }
+        return 0;
+    }
+};
+const double eckerle4_functor::x[35] = { 400.0, 405.0, 410.0, 415.0, 420.0, 425.0, 430.0, 435.0, 436.5, 438.0, 439.5, 441.0, 442.5, 444.0, 445.5, 447.0, 448.5, 450.0, 451.5, 453.0, 454.5, 456.0, 457.5, 459.0, 460.5, 462.0, 463.5, 465.0, 470.0, 475.0, 480.0, 485.0, 490.0, 495.0, 500.0};
+const double eckerle4_functor::y[35] = { 0.0001575, 0.0001699, 0.0002350, 0.0003102, 0.0004917, 0.0008710, 0.0017418, 0.0046400, 0.0065895, 0.0097302, 0.0149002, 0.0237310, 0.0401683, 0.0712559, 0.1264458, 0.2073413, 0.2902366, 0.3445623, 0.3698049, 0.3668534, 0.3106727, 0.2078154, 0.1164354, 0.0616764, 0.0337200, 0.0194023, 0.0117831, 0.0074357, 0.0022732, 0.0008800, 0.0004579, 0.0002345, 0.0001586, 0.0001143, 0.0000710 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/eckerle4.shtml
+void testNistEckerle4(void)
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 10., 500.;
+  // do the computation
+  eckerle4_functor functor;
+  LevenbergMarquardt<eckerle4_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 18, 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+
+  /*
+   * Second try
+   */
+  x<< 1.5, 5., 450.;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 7, 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+}
+
+EIGEN_DECLARE_TEST(NonLinearOptimization)
+{
+    // Tests using the examples provided by (c)minpack
+    CALL_SUBTEST/*_1*/(testChkder());
+    CALL_SUBTEST/*_1*/(testLmder1());
+    CALL_SUBTEST/*_1*/(testLmder());
+    CALL_SUBTEST/*_2*/(testHybrj1());
+    CALL_SUBTEST/*_2*/(testHybrj());
+    CALL_SUBTEST/*_2*/(testHybrd1());
+    CALL_SUBTEST/*_2*/(testHybrd());
+    CALL_SUBTEST/*_3*/(testLmstr1());
+    CALL_SUBTEST/*_3*/(testLmstr());
+    CALL_SUBTEST/*_3*/(testLmdif1());
+    CALL_SUBTEST/*_3*/(testLmdif());
+
+    // NIST tests, level of difficulty = "Lower"
+    CALL_SUBTEST/*_4*/(testNistMisra1a());
+    CALL_SUBTEST/*_4*/(testNistChwirut2());
+
+    // NIST tests, level of difficulty = "Average"
+    CALL_SUBTEST/*_5*/(testNistHahn1());
+    CALL_SUBTEST/*_6*/(testNistMisra1d());
+    CALL_SUBTEST/*_7*/(testNistMGH17());
+    CALL_SUBTEST/*_8*/(testNistLanczos1());
+
+//     // NIST tests, level of difficulty = "Higher"
+    CALL_SUBTEST/*_9*/(testNistRat42());
+//     CALL_SUBTEST/*_10*/(testNistMGH10());
+    CALL_SUBTEST/*_11*/(testNistBoxBOD());
+//     CALL_SUBTEST/*_12*/(testNistMGH09());
+    CALL_SUBTEST/*_13*/(testNistBennett5());
+    CALL_SUBTEST/*_14*/(testNistThurber());
+    CALL_SUBTEST/*_15*/(testNistRat43());
+    CALL_SUBTEST/*_16*/(testNistEckerle4());
+}
+
+/*
+ * Can be useful for debugging...
+  printf("info, nfev : %d, %d\n", info, lm.nfev);
+  printf("info, nfev, njev : %d, %d, %d\n", info, solver.nfev, solver.njev);
+  printf("info, nfev : %d, %d\n", info, solver.nfev);
+  printf("x[0] : %.32g\n", x[0]);
+  printf("x[1] : %.32g\n", x[1]);
+  printf("x[2] : %.32g\n", x[2]);
+  printf("x[3] : %.32g\n", x[3]);
+  printf("fvec.blueNorm() : %.32g\n", solver.fvec.blueNorm());
+  printf("fvec.blueNorm() : %.32g\n", lm.fvec.blueNorm());
+
+  printf("info, nfev, njev : %d, %d, %d\n", info, lm.nfev, lm.njev);
+  printf("fvec.squaredNorm() : %.13g\n", lm.fvec.squaredNorm());
+  std::cout << x << std::endl;
+  std::cout.precision(9);
+  std::cout << x[0] << std::endl;
+  std::cout << x[1] << std::endl;
+  std::cout << x[2] << std::endl;
+  std::cout << x[3] << std::endl;
+*/
+

diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
new file mode 100644
index 0000000..6d83641
--- /dev/null
+++ b/unsupported/test/NumericalDiff.cpp

@@ -0,0 +1,114 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+
+#include <stdio.h>
+
+#include "main.h"
+#include <unsupported/Eigen/NumericalDiff>
+    
+// Generic functor
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct Functor
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+  
+  int m_inputs, m_values;
+  
+  Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
+  
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+};
+
+struct my_functor : Functor<double>
+{
+    my_functor(void): Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double tmp1, tmp2, tmp3;
+        double y[15] = {1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+
+    int actual_df(const VectorXd &x, MatrixXd &fjac) const
+    {
+        double tmp1, tmp2, tmp3, tmp4;
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4;
+            fjac(i,0) = -1;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+        return 0;
+    }
+};
+
+void test_forward()
+{
+    VectorXd x(3);
+    MatrixXd jac(15,3);
+    MatrixXd actual_jac(15,3);
+    my_functor functor;
+
+    x << 0.082, 1.13, 2.35;
+
+    // real one 
+    functor.actual_df(x, actual_jac);
+//    std::cout << actual_jac << std::endl << std::endl;
+
+    // using NumericalDiff
+    NumericalDiff<my_functor> numDiff(functor);
+    numDiff.df(x, jac);
+//    std::cout << jac << std::endl;
+
+    VERIFY_IS_APPROX(jac, actual_jac);
+}
+
+void test_central()
+{
+    VectorXd x(3);
+    MatrixXd jac(15,3);
+    MatrixXd actual_jac(15,3);
+    my_functor functor;
+
+    x << 0.082, 1.13, 2.35;
+
+    // real one 
+    functor.actual_df(x, actual_jac);
+
+    // using NumericalDiff
+    NumericalDiff<my_functor,Central> numDiff(functor);
+    numDiff.df(x, jac);
+
+    VERIFY_IS_APPROX(jac, actual_jac);
+}
+
+EIGEN_DECLARE_TEST(NumericalDiff)
+{
+    CALL_SUBTEST(test_forward());
+    CALL_SUBTEST(test_central());
+}

diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
new file mode 100644
index 0000000..f442e41
--- /dev/null
+++ b/unsupported/test/alignedvector3.cpp

@@ -0,0 +1,87 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AlignedVector3>
+
+namespace Eigen {
+
+template<typename T,typename Derived>
+T test_relative_error(const AlignedVector3<T> &a, const MatrixBase<Derived> &b)
+{
+  return test_relative_error(a.coeffs().template head<3>(), b);
+}
+
+}
+
+template<typename Scalar>
+void alignedvector3()
+{
+  Scalar s1 = internal::random<Scalar>();
+  Scalar s2 = internal::random<Scalar>();
+  typedef Matrix<Scalar,3,1> RefType;
+  typedef Matrix<Scalar,3,3> Mat33;
+  typedef AlignedVector3<Scalar> FastType;
+  RefType  r1(RefType::Random()), r2(RefType::Random()), r3(RefType::Random()),
+           r4(RefType::Random()), r5(RefType::Random());
+  FastType f1(r1), f2(r2), f3(r3), f4(r4), f5(r5);
+  Mat33 m1(Mat33::Random());
+  
+  VERIFY_IS_APPROX(f1,r1);
+  VERIFY_IS_APPROX(f4,r4);
+
+  VERIFY_IS_APPROX(f4+f1,r4+r1);
+  VERIFY_IS_APPROX(f4-f1,r4-r1);
+  VERIFY_IS_APPROX(f4+f1-f2,r4+r1-r2);
+  VERIFY_IS_APPROX(f4+=f3,r4+=r3);
+  VERIFY_IS_APPROX(f4-=f5,r4-=r5);
+  VERIFY_IS_APPROX(f4-=f5+f1,r4-=r5+r1);
+  VERIFY_IS_APPROX(f5+f1-s1*f2,r5+r1-s1*r2);
+  VERIFY_IS_APPROX(f5+f1/s2-s1*f2,r5+r1/s2-s1*r2);
+  
+  VERIFY_IS_APPROX(m1*f4,m1*r4);
+  VERIFY_IS_APPROX(f4.transpose()*m1,r4.transpose()*m1);
+  
+  VERIFY_IS_APPROX(f2.dot(f3),r2.dot(r3));
+  VERIFY_IS_APPROX(f2.cross(f3),r2.cross(r3));
+  VERIFY_IS_APPROX(f2.norm(),r2.norm());
+
+  VERIFY_IS_APPROX(f2.normalized(),r2.normalized());
+
+  VERIFY_IS_APPROX((f2+f1).normalized(),(r2+r1).normalized());
+  
+  f2.normalize();
+  r2.normalize();
+  VERIFY_IS_APPROX(f2,r2);
+  
+  {
+    FastType f6 = RefType::Zero();
+    FastType f7 = FastType::Zero();
+    VERIFY_IS_APPROX(f6,f7);
+    f6 = r4+r1;
+    VERIFY_IS_APPROX(f6,r4+r1);
+    f6 -= Scalar(2)*r4;
+    VERIFY_IS_APPROX(f6,r1-r4);
+  }
+  
+  FastType f8, f9(0,0,0);
+  VERIFY_IS_APPROX(f9-f1,-f1);
+
+  std::stringstream ss1, ss2;
+  ss1 << f1;
+  ss2 << r1;
+  VERIFY(ss1.str()==ss2.str());
+}
+
+EIGEN_DECLARE_TEST(alignedvector3)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST( alignedvector3<float>() );
+  }
+}

diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
new file mode 100644
index 0000000..2cea56b
--- /dev/null
+++ b/unsupported/test/autodiff.cpp

@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AutoDiff>
+
+template<typename Scalar>
+EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y)
+{
+  using namespace std;
+//   return x+std::sin(y);
+  EIGEN_ASM_COMMENT("mybegin");
+  // pow(float, int) promotes to pow(double, double)
+  return x*2 - 1 + static_cast<Scalar>(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0);
+  //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2;
+  EIGEN_ASM_COMMENT("myend");
+}
+
+template<typename Vector>
+EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p)
+{
+  typedef typename Vector::Scalar Scalar;
+  return (p-Vector(Scalar(-1),Scalar(1.))).norm() + (p.array() * p.array()).sum() + p.dot(p);
+}
+
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct TestFunc1
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+
+  int m_inputs, m_values;
+
+  TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+  template<typename T>
+  void operator() (const Matrix<T,InputsAtCompileTime,1>& x, Matrix<T,ValuesAtCompileTime,1>* _v) const
+  {
+    Matrix<T,ValuesAtCompileTime,1>& v = *_v;
+
+    v[0] = 2 * x[0] * x[0] + x[0] * x[1];
+    v[1] = 3 * x[1] * x[0] + 0.5 * x[1] * x[1];
+    if(inputs()>2)
+    {
+      v[0] += 0.5 * x[2];
+      v[1] += x[2];
+    }
+    if(values()>2)
+    {
+      v[2] = 3 * x[1] * x[0] * x[0];
+    }
+    if (inputs()>2 && values()>2)
+      v[2] *= x[2];
+  }
+
+  void operator() (const InputType& x, ValueType* v, JacobianType* _j) const
+  {
+    (*this)(x, v);
+
+    if(_j)
+    {
+      JacobianType& j = *_j;
+
+      j(0,0) = 4 * x[0] + x[1];
+      j(1,0) = 3 * x[1];
+
+      j(0,1) = x[0];
+      j(1,1) = 3 * x[0] + 2 * 0.5 * x[1];
+
+      if (inputs()>2)
+      {
+        j(0,2) = 0.5;
+        j(1,2) = 1;
+      }
+      if(values()>2)
+      {
+        j(2,0) = 3 * x[1] * 2 * x[0];
+        j(2,1) = 3 * x[0] * x[0];
+      }
+      if (inputs()>2 && values()>2)
+      {
+        j(2,0) *= x[2];
+        j(2,1) *= x[2];
+
+        j(2,2) = 3 * x[1] * x[0] * x[0];
+        j(2,2) = 3 * x[1] * x[0] * x[0];
+      }
+    }
+  }
+};
+
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+/* Test functor for the C++11 features. */
+template <typename Scalar>
+struct integratorFunctor
+{
+    typedef Matrix<Scalar, 2, 1> InputType;
+    typedef Matrix<Scalar, 2, 1> ValueType;
+
+    /*
+     * Implementation starts here.
+     */
+    integratorFunctor(const Scalar gain) : _gain(gain) {}
+    integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
+    const Scalar _gain;
+
+    template <typename T1, typename T2>
+    void operator() (const T1 &input, T2 *output, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+    }
+
+    /* Only needed for the test */
+    template <typename T1, typename T2, typename T3>
+    void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+
+        if (jacobian)
+        {
+            T3 &j = *jacobian;
+
+            j(0, 0) = 1;
+            j(0, 1) = dt * _gain;
+            j(1, 0) = 0;
+            j(1, 1) = _gain;
+        }
+    }
+
+};
+
+template<typename Func> void forward_jacobian_cpp11(const Func& f)
+{
+    typedef typename Func::ValueType::Scalar Scalar;
+    typedef typename Func::ValueType ValueType;
+    typedef typename Func::InputType InputType;
+    typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
+
+    InputType x = InputType::Random(InputType::RowsAtCompileTime);
+    ValueType y, yref;
+    JacobianType j, jref;
+
+    const Scalar dt = internal::random<double>();
+
+    jref.setZero();
+    yref.setZero();
+    f(x, &yref, &jref, dt);
+
+    //std::cerr << "y, yref, jref: " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << yref << "\n\n";
+    //std::cerr << jref << "\n\n";
+
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j, dt);
+
+    //std::cerr << "y j (via autodiff): " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << j << "\n\n";
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+#endif
+
+template<typename Func> void forward_jacobian(const Func& f)
+{
+    typename Func::InputType x = Func::InputType::Random(f.inputs());
+    typename Func::ValueType y(f.values()), yref(f.values());
+    typename Func::JacobianType j(f.values(),f.inputs()), jref(f.values(),f.inputs());
+
+    jref.setZero();
+    yref.setZero();
+    f(x,&yref,&jref);
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+
+    j.setZero();
+    y.setZero();
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j);
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+
+// TODO also check actual derivatives!
+template <int>
+void test_autodiff_scalar()
+{
+  Vector2f p = Vector2f::Random();
+  typedef AutoDiffScalar<Vector2f> AD;
+  AD ax(p.x(),Vector2f::UnitX());
+  AD ay(p.y(),Vector2f::UnitY());
+  AD res = foo<AD>(ax,ay);
+  VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
+}
+
+
+// TODO also check actual derivatives!
+template <int>
+void test_autodiff_vector()
+{
+  Vector2f p = Vector2f::Random();
+  typedef AutoDiffScalar<Vector2f> AD;
+  typedef Matrix<AD,2,1> VectorAD;
+  VectorAD ap = p.cast<AD>();
+  ap.x().derivatives() = Vector2f::UnitX();
+  ap.y().derivatives() = Vector2f::UnitY();
+
+  AD res = foo<VectorAD>(ap);
+  VERIFY_IS_APPROX(res.value(), foo(p));
+}
+
+template <int>
+void test_autodiff_jacobian()
+{
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,2,2>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,2,3>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
+#endif
+}
+
+
+template <int>
+void test_autodiff_hessian()
+{
+  typedef AutoDiffScalar<VectorXd> AD;
+  typedef Matrix<AD,Eigen::Dynamic,1> VectorAD;
+  typedef AutoDiffScalar<VectorAD> ADD;
+  typedef Matrix<ADD,Eigen::Dynamic,1> VectorADD;
+  VectorADD x(2);
+  double s1 = internal::random<double>(), s2 = internal::random<double>(), s3 = internal::random<double>(), s4 = internal::random<double>();
+  x(0).value()=s1;
+  x(1).value()=s2;
+
+  //set unit vectors for the derivative directions (partial derivatives of the input vector)
+  x(0).derivatives().resize(2);
+  x(0).derivatives().setZero();
+  x(0).derivatives()(0)= 1;
+  x(1).derivatives().resize(2);
+  x(1).derivatives().setZero();
+  x(1).derivatives()(1)=1;
+
+  //repeat partial derivatives for the inner AutoDiffScalar
+  x(0).value().derivatives() = VectorXd::Unit(2,0);
+  x(1).value().derivatives() = VectorXd::Unit(2,1);
+
+  //set the hessian matrix to zero
+  for(int idx=0; idx<2; idx++) {
+      x(0).derivatives()(idx).derivatives()  = VectorXd::Zero(2);
+      x(1).derivatives()(idx).derivatives()  = VectorXd::Zero(2);
+  }
+
+  ADD y = sin(AD(s3)*x(0) + AD(s4)*x(1));
+
+  VERIFY_IS_APPROX(y.value().derivatives()(0), y.derivatives()(0).value());
+  VERIFY_IS_APPROX(y.value().derivatives()(1), y.derivatives()(1).value());
+  VERIFY_IS_APPROX(y.value().derivatives()(0), s3*std::cos(s1*s3+s2*s4));
+  VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4));
+  VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3));
+  VERIFY_IS_APPROX(y.derivatives()(1).derivatives(),  -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4));
+
+  ADD z = x(0)*x(1);
+  VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1));
+  VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0));
+}
+
+double bug_1222() {
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  // this line did not work, because operator+ returns ADS<DerType&>, which then cannot be converted to ADS<DerType>
+  const AD denom = chi_3 + _cv1_3;
+  return denom.value();
+}
+
+#ifdef EIGEN_TEST_PART_5
+
+double bug_1223() {
+  using std::min;
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  const AD denom = 1.0;
+
+  // failed because implementation of min attempts to construct ADS<DerType&> via constructor AutoDiffScalar(const Real& value)
+  // without initializing m_derivatives (which is a reference in this case)
+  #define EIGEN_TEST_SPACE
+  const AD t = min EIGEN_TEST_SPACE (denom / chi_3, 1.0);
+
+  const AD t2 = min EIGEN_TEST_SPACE (denom / (chi_3 * _cv1_3), 1.0);
+
+  return t.value() + t2.value();
+}
+
+// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
+void bug_1260() {
+  Matrix4d A = Matrix4d::Ones();
+  Vector4d v = Vector4d::Ones();
+  A*v;
+}
+
+// check a compilation issue with numext::max
+double bug_1261() {
+  typedef AutoDiffScalar<Matrix2d> AD;
+  typedef Matrix<AD,2,1> VectorAD;
+
+  VectorAD v(0.,0.);
+  const AD maxVal = v.maxCoeff();
+  const AD minVal = v.minCoeff();
+  return maxVal.value() + minVal.value();
+}
+
+double bug_1264() {
+  typedef AutoDiffScalar<Vector2d> AD;
+  const AD s = 0.;
+  const Matrix<AD, 3, 1> v1(0.,0.,0.);
+  const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
+  return v2(0).value();
+}
+
+// check with expressions on constants
+double bug_1281() {
+  int n = 2;
+  typedef AutoDiffScalar<VectorXd> AD;
+  const AD c = 1.;
+  AD x0(2,n,0);
+  AD y1 = (AD(c)+AD(c))*x0;
+  y1 = x0 * (AD(c)+AD(c));
+  AD y2 = (-AD(c))+x0;
+  y2 = x0+(-AD(c));
+  AD y3 = (AD(c)*(-AD(c))+AD(c))*x0;
+  y3 = x0 * (AD(c)*(-AD(c))+AD(c));
+  return (y1+y2+y3).value();
+}
+
+#endif
+
+EIGEN_DECLARE_TEST(autodiff)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( test_autodiff_scalar<1>() );
+    CALL_SUBTEST_2( test_autodiff_vector<1>() );
+    CALL_SUBTEST_3( test_autodiff_jacobian<1>() );
+    CALL_SUBTEST_4( test_autodiff_hessian<1>() );
+  }
+
+  CALL_SUBTEST_5( bug_1222() );
+  CALL_SUBTEST_5( bug_1223() );
+  CALL_SUBTEST_5( bug_1260() );
+  CALL_SUBTEST_5( bug_1261() );
+  CALL_SUBTEST_5( bug_1281() );
+}
+

diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
new file mode 100644
index 0000000..e81a778
--- /dev/null
+++ b/unsupported/test/autodiff_scalar.cpp

@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christoph Hertzberg <chtz@informatik.uni-bremen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AutoDiff>
+
+/*
+ * In this file scalar derivations are tested for correctness.
+ * TODO add more tests!
+ */
+
+template<typename Scalar> void check_atan2()
+{
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  
+  AD x(internal::random<Scalar>(-3.0, 3.0), Deriv1::UnitX());
+  
+  using std::exp;
+  Scalar r = exp(internal::random<Scalar>(-10, 10));
+  
+  AD s = sin(x), c = cos(x);
+  AD res = atan2(r*s, r*c);
+  
+  VERIFY_IS_APPROX(res.value(), x.value());
+  VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
+
+  res = atan2(r*s+0, r*c+0);
+  VERIFY_IS_APPROX(res.value(), x.value());
+  VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
+}
+
+template<typename Scalar> void check_hyperbolic_functions()
+{
+  using std::sinh;
+  using std::cosh;
+  using std::tanh;
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  Deriv1 p = Deriv1::Random();
+  AD val(p.x(),Deriv1::UnitX());
+
+  Scalar cosh_px = std::cosh(p.x());
+  AD res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.value(), std::tanh(p.x()));
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px));
+
+  AD res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.value(), std::sinh(p.x()));
+  VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px);
+
+  AD res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.value(), cosh_px);
+  VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x()));
+
+  // Check constant values.
+  const Scalar sample_point = Scalar(1) / Scalar(3); 
+  val = AD(sample_point,Deriv1::UnitX());
+  res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914));
+
+  res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939));
+
+  res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
+}
+
+template <typename Scalar>
+void check_limits_specialization()
+{
+  typedef Eigen::Matrix<Scalar, 1, 1> Deriv;
+  typedef Eigen::AutoDiffScalar<Deriv> AD;
+
+  typedef std::numeric_limits<AD> A;
+  typedef std::numeric_limits<Scalar> B;
+
+  // workaround "unused typedef" warning:
+  VERIFY(!bool(internal::is_same<B, A>::value));
+
+#if EIGEN_HAS_CXX11
+  VERIFY(bool(std::is_base_of<B, A>::value));
+#endif
+}
+
+EIGEN_DECLARE_TEST(autodiff_scalar)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_atan2<float>() );
+    CALL_SUBTEST_2( check_atan2<double>() );
+    CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
+    CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
+    CALL_SUBTEST_5( check_limits_specialization<double>());
+  }
+}

diff --git a/unsupported/test/bessel_functions.cpp b/unsupported/test/bessel_functions.cpp
new file mode 100644
index 0000000..06765bf
--- /dev/null
+++ b/unsupported/test/bessel_functions.cpp

@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
+  for(Index i=0; i<x.size(); ++i)
+  {
+    if((numext::isfinite)(y(i))) {
+      VERIFY_IS_APPROX( x(i), y(i) );
+    }
+    else if((numext::isnan)(y(i)))
+      VERIFY((numext::isnan)(x(i)));
+    else
+      VERIFY_IS_EQUAL( x(i), y(i) );
+  }
+}
+
+template<typename ArrayType> void array_bessel_functions() 
+{
+  // Test Bessel function i0. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 4.35582826e+07, 6.21841242e+06, 8.93446228e+05, 1.29418563e+05,
+       1.89489253e+04, 2.81571663e+03, 4.27564116e+02, 6.72344070e+01,
+       1.13019220e+01, 2.27958530e+00, 1.00000000e+00, 2.27958530e+00,
+       1.13019220e+01, 6.72344070e+01, 4.27564116e+02, 2.81571663e+03,
+       1.89489253e+04, 1.29418563e+05, 8.93446228e+05, 6.21841242e+06,
+       4.35582826e+07;
+
+    CALL_SUBTEST(res = bessel_i0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 0.0897803118848, 0.0947062952128, 0.100544127361,
+        0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+        0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+        0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+        0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+        0.0897803118848;
+
+    CALL_SUBTEST(res = bessel_i0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -4.24549734e+07, -6.04313324e+06, -8.65059436e+05, -1.24707259e+05,
+       -1.81413488e+04, -2.67098830e+03, -3.99873137e+02, -6.13419368e+01,
+       -9.75946515e+00, -1.59063685e+00,  0.00000000e+00,  1.59063685e+00,
+        9.75946515e+00,  6.13419368e+01,  3.99873137e+02,  2.67098830e+03,
+        1.81413488e+04,  1.24707259e+05,  8.65059436e+05,  6.04313324e+06,
+        4.24549734e+07;
+
+    CALL_SUBTEST(res = bessel_i1(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -0.0875062221833, -0.092036796872, -0.0973496147565,
+        -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+        -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+        0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+        0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+        0.0875062221833;
+
+    CALL_SUBTEST(res = bessel_i1e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function j0. Reference results obtained with SciPy.
+  {
+    ArrayType x(77);
+    ArrayType expected(77);
+    ArrayType res(77);
+
+    x << -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+      -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+      -18., -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,
+       -7.,  -6.,  -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,
+        4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,
+       15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,
+       26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,
+       37.,  38.;
+
+    expected << 0.11433274,  0.01086237, -0.10556738,
+             -0.12684568, -0.03042119,  0.09727067,  0.13807901,  0.05120815,
+             -0.08636798, -0.14784876, -0.07315701,  0.07274192,  0.15599932,
+              0.09626678, -0.05623027, -0.16241278, -0.12065148,  0.03657907,
+              0.16702466,  0.14662944, -0.01335581, -0.16985425, -0.17489907,
+             -0.01422447,  0.17107348,  0.2069261 ,  0.04768931, -0.1711903 ,
+             -0.24593576, -0.09033361,  0.17165081,  0.30007927,  0.15064526,
+             -0.17759677, -0.39714981, -0.26005195,  0.22389078,  0.76519769,
+              1.        ,  0.76519769,  0.22389078, -0.26005195, -0.39714981,
+             -0.17759677,  0.15064526,  0.30007927,  0.17165081, -0.09033361,
+             -0.24593576, -0.1711903 ,  0.04768931,  0.2069261 ,  0.17107348,
+             -0.01422447, -0.17489907, -0.16985425, -0.01335581,  0.14662944,
+              0.16702466,  0.03657907, -0.12065148, -0.16241278, -0.05623027,
+              0.09626678,  0.15599932,  0.07274192, -0.07315701, -0.14784876,
+             -0.08636798,  0.05120815,  0.13807901,  0.09727067, -0.03042119,
+             -0.12684568, -0.10556738,  0.01086237,  0.11433274;
+
+    CALL_SUBTEST(res = bessel_j0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function j1. Reference results obtained with SciPy.
+  {
+    ArrayType x(81);
+    ArrayType expected(81);
+    ArrayType res(81);
+
+    x << -40., -39., -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+      -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+      -18., -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,
+       -7.,  -6.,  -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,
+        4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,
+       15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,
+       26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,
+       37.,  38.,  39.,  40.;
+
+    expected << -0.12603832, -0.0640561 ,  0.05916189,  0.13058004,  0.08232981,
+             -0.04399094, -0.13297118, -0.10061965,  0.02658903,  0.13302432,
+              0.11875106, -0.0069342 , -0.13055149, -0.13658472, -0.01504573,
+              0.12535025,  0.15403807,  0.03951932, -0.11717779, -0.17112027,
+             -0.06683312,  0.10570143,  0.18799489,  0.09766849, -0.09039718,
+             -0.20510404, -0.13337515,  0.07031805,  0.2234471 ,  0.1767853 ,
+             -0.04347275, -0.24531179, -0.23463635,  0.00468282,  0.27668386,
+              0.32757914,  0.06604333, -0.33905896, -0.57672481, -0.44005059,
+              0.        ,  0.44005059,  0.57672481,  0.33905896, -0.06604333,
+             -0.32757914, -0.27668386, -0.00468282,  0.23463635,  0.24531179,
+              0.04347275, -0.1767853 , -0.2234471 , -0.07031805,  0.13337515,
+              0.20510404,  0.09039718, -0.09766849, -0.18799489, -0.10570143,
+              0.06683312,  0.17112027,  0.11717779, -0.03951932, -0.15403807,
+             -0.12535025,  0.01504573,  0.13658472,  0.13055149,  0.0069342 ,
+             -0.11875106, -0.13302432, -0.02658903,  0.10061965,  0.13297118,
+              0.04399094, -0.08232981, -0.13058004, -0.05916189,  0.0640561 ,
+              0.12603832;
+
+    CALL_SUBTEST(res = bessel_j1(x);
+                 verify_component_wise(res, expected););
+  }
+  // Test Bessel function k0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+             0.6977616 , 0.60929767, 0.54780756, 0.50186313, 0.4658451 ,
+             0.43662302, 0.41229555, 0.39163193, 0.3737955 , 0.35819488,
+             0.34439865, 0.33208364, 0.32100235, 0.31096159, 0.30180802,
+             0.29341821, 0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+             0.25994703, 0.25452917, 0.2494366 , 0.24463801, 0.24010616,
+             0.23581722, 0.23175022, 0.22788667, 0.22421014, 0.22070602,
+             0.21736123, 0.21416406, 0.21110397, 0.20817141, 0.20535778,
+             0.20265524, 0.20005668, 0.19755558;
+
+    CALL_SUBTEST(res = bessel_k0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k0. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.54150675, 0.92441907, 4.21024438e-01, 1.13893873e-01,
+             3.47395044e-02, 1.11596761e-02, 3.69109833e-03, 1.24399433e-03,
+             4.24795742e-04, 1.46470705e-04, 5.08813130e-05, 1.77800623e-05,
+             6.24302055e-06, 2.20082540e-06, 7.78454386e-07, 2.76137082e-07,
+             9.81953648e-08, 3.49941166e-08, 1.24946640e-08, 4.46875334e-09,
+             1.60067129e-09, 5.74123782e-10, 2.06176797e-10, 7.41235161e-11,
+             2.66754511e-11, 9.60881878e-12, 3.46416156e-12, 1.24987740e-12,
+             4.51286453e-13, 1.63053459e-13, 5.89495073e-14, 2.13247750e-14,
+             7.71838266e-15, 2.79505752e-15, 1.01266123e-15, 3.67057597e-16,
+             1.33103515e-16, 4.82858338e-17, 1.75232770e-17, 6.36161716e-18,
+             2.31029936e-18, 8.39286110e-19;
+
+    CALL_SUBTEST(res = bessel_k0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+             0.6977616 , 0.60929767, 0.54780756, 0.50186313,
+             0.4658451 , 0.43662302, 0.41229555, 0.39163193,
+             0.3737955 , 0.35819488, 0.34439865, 0.33208364,
+             0.32100235, 0.31096159, 0.30180802, 0.29341821,
+             0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+             0.25994703, 0.25452917, 0.2494366 , 0.24463801,
+             0.24010616, 0.23581722, 0.23175022, 0.22788667,
+             0.22421014, 0.22070602, 0.21736123, 0.21416406,
+             0.21110397, 0.20817141, 0.20535778, 0.20265524,
+             0.20005668, 0.19755558;
+
+    CALL_SUBTEST(res = bessel_k0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k1. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 3.74702597, 1.65644112, 6.01907230e-01, 1.39865882e-01,
+             4.01564311e-02, 1.24834989e-02, 4.04461345e-03, 1.34391972e-03,
+             4.54182487e-04, 1.55369212e-04, 5.36370164e-05, 1.86487735e-05,
+             6.52086067e-06, 2.29075746e-06, 8.07858841e-07, 2.85834365e-07,
+             1.01417294e-07, 3.60715712e-08, 1.28570417e-08, 4.59124963e-09,
+             1.64226697e-09, 5.88305797e-10, 2.11029922e-10, 7.57898116e-11,
+             2.72493059e-11, 9.80699893e-12, 3.53277807e-12, 1.27369078e-12,
+             4.59568940e-13, 1.65940011e-13, 5.99574032e-14, 2.16773200e-14,
+             7.84189960e-15, 2.83839927e-15, 1.02789171e-15, 3.72416929e-16,
+             1.34991783e-16, 4.89519373e-17, 1.77585196e-17, 6.44478588e-18,
+             2.33973340e-18, 8.49713195e-19;
+
+    CALL_SUBTEST(res = bessel_k1(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 4.81127659, 2.73100971, 1.63615349, 1.03347685,
+             0.80656348, 0.68157595, 0.60027386, 0.54217591,
+             0.49807158, 0.46314909, 0.43462525, 0.41076657,
+             0.39043094, 0.37283175, 0.35740757, 0.34374563,
+             0.33153489, 0.32053597, 0.31056123, 0.30146131,
+             0.29311559, 0.2854255 , 0.27830958, 0.27169987,
+             0.26553913, 0.25977879, 0.25437733, 0.249299  ,
+             0.24451285, 0.23999191, 0.2357126 , 0.23165413,
+             0.22779816, 0.22412841, 0.22063036, 0.21729103,
+             0.21409878, 0.21104314, 0.20811462, 0.20530466,
+             0.20260547, 0.20000997;
+
+    CALL_SUBTEST(res = bessel_k1e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function y0. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << -0.93157302, -0.44451873, 0.08825696,  0.51037567,  0.37685001,
+             -0.01694074, -0.30851763, -0.28819468, -0.02594974,  0.22352149,
+             0.2499367 ,  0.05567117, -0.16884732, -0.22523731, -0.07820786,
+             0.12719257,  0.2054643 , 0.095811  , -0.0926372 , -0.18755216,
+             -0.10951969,  0.0626406 , 0.17020176,  0.1198876 , -0.03598179,
+             -0.15283403, -0.12724943, 0.01204463,  0.13521498,  0.13183647,
+             0.00948116, -0.11729573, -0.13383266, -0.02874248,  0.09913483,
+             0.13340405,  0.04579799, -0.08085609, -0.13071488, -0.06066076,
+             0.06262353,  0.12593642;
+
+    CALL_SUBTEST(res = bessel_y0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function y1. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << -2.70410523, -1.47147239, -0.78121282, -0.10703243,
+             0.32467442,  0.39792571,  0.14786314, -0.17501034, -0.30266724,
+             -0.15806046,  0.10431458,  0.24901542, 0.16370554, -0.05709922,
+             -0.21008141, -0.16664484,  0.02107363, 0.17797517,  0.16720504,
+             0.00815513, -0.14956011, -0.16551161, -0.03253926,  0.12340586,
+             0.1616692 ,  0.05305978, -0.09882996, -0.15579655, -0.07025124,
+             0.07552213,  0.14803412,  0.08442557, -0.05337283, -0.13854483,
+             -0.09578012,  0.03238588,  0.12751273, 0.10445477, -0.01262946,
+             -0.11514066, -0.11056411, -0.00579351;
+
+    CALL_SUBTEST(res = bessel_y1(x);
+                 verify_component_wise(res, expected););
+  }
+}
+
+EIGEN_DECLARE_TEST(bessel_functions)
+{
+  CALL_SUBTEST_1(array_bessel_functions<ArrayXf>());
+  CALL_SUBTEST_2(array_bessel_functions<ArrayXd>());
+}

diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
new file mode 100644
index 0000000..7bf4e96
--- /dev/null
+++ b/unsupported/test/cxx11_eventcount.cpp

@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
+
+static void test_basic_eventcount()
+{
+  MaxSizeVector<EventCount::Waiter> waiters(1);
+  waiters.resize(1);
+  EventCount ec(waiters);
+  EventCount::Waiter& w = waiters[0];
+  ec.Notify(false);
+  ec.Prewait();
+  ec.Notify(true);
+  ec.CommitWait(&w);
+  ec.Prewait();
+  ec.CancelWait();
+}
+
+// Fake bounded counter-based queue.
+struct TestQueue {
+  std::atomic<int> val_;
+  static const int kQueueSize = 10;
+
+  TestQueue() : val_() {}
+
+  ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); }
+
+  bool Push() {
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == kQueueSize) return false;
+      if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed))
+        return true;
+    }
+  }
+
+  bool Pop() {
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == 0) return false;
+      if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed))
+        return true;
+    }
+  }
+
+  bool Empty() { return val_.load(std::memory_order_relaxed) == 0; }
+};
+
+const int TestQueue::kQueueSize;
+
+// A number of producers send messages to a set of consumers using a set of
+// fake queues. Ensure that it does not crash, consumers don't deadlock and
+// number of blocked and unblocked threads match.
+static void test_stress_eventcount()
+{
+  const int kThreads = std::thread::hardware_concurrency();
+  static const int kEvents = 1 << 16;
+  static const int kQueues = 10;
+
+  MaxSizeVector<EventCount::Waiter> waiters(kThreads);
+  waiters.resize(kThreads);
+  EventCount ec(waiters);
+  TestQueue queues[kQueues];
+
+  std::vector<std::unique_ptr<std::thread>> producers;
+  for (int i = 0; i < kThreads; i++) {
+    producers.emplace_back(new std::thread([&ec, &queues]() {
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Push()) {
+          ec.Notify(false);
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;
+      }
+    }));
+  }
+
+  std::vector<std::unique_ptr<std::thread>> consumers;
+  for (int i = 0; i < kThreads; i++) {
+    consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
+      EventCount::Waiter& w = waiters[i];
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Pop()) continue;
+        j--;
+        ec.Prewait();
+        bool empty = true;
+        for (int q = 0; q < kQueues; q++) {
+          if (!queues[q].Empty()) {
+            empty = false;
+            break;
+          }
+        }
+        if (!empty) {
+          ec.CancelWait();
+          continue;
+        }
+        ec.CommitWait(&w);
+      }
+    }));
+  }
+
+  for (int i = 0; i < kThreads; i++) {
+    producers[i]->join();
+    consumers[i]->join();
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_eventcount)
+{
+  CALL_SUBTEST(test_basic_eventcount());
+  CALL_SUBTEST(test_stress_eventcount());
+}

diff --git a/unsupported/test/cxx11_maxsizevector.cpp b/unsupported/test/cxx11_maxsizevector.cpp
new file mode 100644
index 0000000..46b689a
--- /dev/null
+++ b/unsupported/test/cxx11_maxsizevector.cpp

@@ -0,0 +1,77 @@
+#include "main.h"
+
+#include <exception>  // std::exception
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+struct Foo
+{
+  static Index object_count;
+  static Index object_limit;
+  EIGEN_ALIGN_TO_BOUNDARY(128) int dummy;
+
+  Foo(int x=0) : dummy(x)
+  {
+#ifdef EIGEN_EXCEPTIONS
+    // TODO: Is this the correct way to handle this?
+    if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
+#endif
+    std::cout << '+';
+    ++Foo::object_count;
+    eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+  }
+  Foo(const Foo&)
+  {
+    std::cout << 'c';
+    ++Foo::object_count;
+    eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+  }
+
+  ~Foo()
+  {
+    std::cout << '~';
+    --Foo::object_count;
+  }
+
+  class Fail : public std::exception {};
+};
+
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
+
+
+
+EIGEN_DECLARE_TEST(cxx11_maxsizevector)
+{
+  typedef MaxSizeVector<Foo> VectorX;
+  Foo::object_count = 0;
+  for(int r = 0; r < g_repeat; r++) {
+    Index rows = internal::random<Index>(3,30);
+    Foo::object_limit = internal::random<Index>(0, rows - 2);
+    std::cout << "object_limit = " << Foo::object_limit << std::endl;
+    bool exception_raised = false;
+#ifdef EIGEN_EXCEPTIONS
+    try
+    {
+#endif
+      std::cout <<       "\nVectorX m(" << rows << ");\n";
+      VectorX vect(rows);
+      for(int i=0; i<rows; ++i)
+          vect.push_back(Foo());
+#ifdef EIGEN_EXCEPTIONS
+      VERIFY(false);  // not reached if exceptions are enabled
+    }
+    catch (const Foo::Fail&) { exception_raised = true; }
+    VERIFY(exception_raised);
+#endif
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+    {
+      Foo::object_limit = rows+1;
+      VectorX vect2(rows, Foo());
+      VERIFY_IS_EQUAL(Foo::object_count, rows);
+    }
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+    std::cout << '\n';
+  }
+}

diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
new file mode 100644
index 0000000..510e110
--- /dev/null
+++ b/unsupported/test/cxx11_meta.cpp

@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <array>
+#include <Eigen/CXX11/src/util/CXX11Meta.h>
+
+using Eigen::internal::is_same;
+using Eigen::internal::type_list;
+using Eigen::internal::numeric_list;
+using Eigen::internal::gen_numeric_list;
+using Eigen::internal::gen_numeric_list_reversed;
+using Eigen::internal::gen_numeric_list_swapped_pair;
+using Eigen::internal::gen_numeric_list_repeated;
+using Eigen::internal::concat;
+using Eigen::internal::mconcat;
+using Eigen::internal::take;
+using Eigen::internal::skip;
+using Eigen::internal::slice;
+using Eigen::internal::get;
+using Eigen::internal::id_numeric;
+using Eigen::internal::id_type;
+using Eigen::internal::is_same_gf;
+using Eigen::internal::apply_op_from_left;
+using Eigen::internal::apply_op_from_right;
+using Eigen::internal::contained_in_list;
+using Eigen::internal::contained_in_list_gf;
+using Eigen::internal::arg_prod;
+using Eigen::internal::arg_sum;
+using Eigen::internal::sum_op;
+using Eigen::internal::product_op;
+using Eigen::internal::array_reverse;
+using Eigen::internal::array_sum;
+using Eigen::internal::array_prod;
+using Eigen::internal::array_reduce;
+using Eigen::internal::array_zip;
+using Eigen::internal::array_zip_and_reduce;
+using Eigen::internal::array_apply;
+using Eigen::internal::array_apply_and_reduce;
+using Eigen::internal::repeat;
+using Eigen::internal::instantiate_by_c_array;
+
+struct dummy_a {};
+struct dummy_b {};
+struct dummy_c {};
+struct dummy_d {};
+struct dummy_e {};
+
+// dummy operation for testing apply
+template<typename A, typename B> struct dummy_op;
+template<> struct dummy_op<dummy_a, dummy_b> { typedef dummy_c type; };
+template<> struct dummy_op<dummy_b, dummy_a> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_b, dummy_c> { typedef dummy_a type; };
+template<> struct dummy_op<dummy_c, dummy_b> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_c, dummy_a> { typedef dummy_b type; };
+template<> struct dummy_op<dummy_a, dummy_c> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_a, dummy_a> { typedef dummy_e type; };
+template<> struct dummy_op<dummy_b, dummy_b> { typedef dummy_e type; };
+template<> struct dummy_op<dummy_c, dummy_c> { typedef dummy_e type; };
+
+template<typename A, typename B> struct dummy_test { constexpr static bool value = false; constexpr static int global_flags = 0; };
+template<> struct dummy_test<dummy_a, dummy_a>     { constexpr static bool value = true;  constexpr static int global_flags = 1; };
+template<> struct dummy_test<dummy_b, dummy_b>     { constexpr static bool value = true;  constexpr static int global_flags = 2; };
+template<> struct dummy_test<dummy_c, dummy_c>     { constexpr static bool value = true;  constexpr static int global_flags = 4; };
+
+struct times2_op { template<typename A> static A run(A v) { return v * 2; } };
+
+struct dummy_inst
+{
+  int c;
+
+  dummy_inst() : c(0) {}
+  explicit dummy_inst(int) : c(1) {}
+  dummy_inst(int, int) : c(2) {}
+  dummy_inst(int, int, int) : c(3) {}
+  dummy_inst(int, int, int, int) : c(4) {}
+  dummy_inst(int, int, int, int, int) : c(5) {}
+};
+
+static void test_gen_numeric_list()
+{
+  VERIFY((is_same<typename gen_numeric_list<int, 0>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 1>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 2>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 5>::type, numeric_list<int, 0, 1, 2, 3, 4>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 10>::type, numeric_list<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list<int, 0, 42>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 1, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 2, 42>::type, numeric_list<int, 42, 43>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 5, 42>::type, numeric_list<int, 42, 43, 44, 45, 46>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 10, 42>::type, numeric_list<int, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 0>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 1>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 2>::type, numeric_list<int, 1, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 5>::type, numeric_list<int, 4, 3, 2, 1, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 10>::type, numeric_list<int, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 0, 42>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 1, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 2, 42>::type, numeric_list<int, 43, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 5, 42>::type, numeric_list<int, 46, 45, 44, 43, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 10, 42>::type, numeric_list<int, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 0, 2, 3>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 1, 2, 3>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 2, 2, 3>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 5, 2, 3>::type, numeric_list<int, 0, 1, 3, 2, 4>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 10, 2, 3>::type, numeric_list<int, 0, 1, 3, 2, 4, 5, 6, 7, 8, 9>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 0, 44, 45, 42>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 1, 44, 45, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 2, 44, 45, 42>::type, numeric_list<int, 42, 43>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 5, 44, 45, 42>::type, numeric_list<int, 42, 43, 45, 44, 46>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 10, 44, 45, 42>::type, numeric_list<int, 42, 43, 45, 44, 46, 47, 48, 49, 50, 51>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 0, 0>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 1, 0>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 2, 0>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 5, 0>::type, numeric_list<int, 0, 0, 0, 0, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 10, 0>::type, numeric_list<int, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>>::value));
+}
+
+static void test_concat()
+{
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<>>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<>, type_list<dummy_a, dummy_a>>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<dummy_a, dummy_a>>::type, type_list<dummy_a, dummy_a, dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int>>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int>, numeric_list<int, 0, 0>>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int, 0, 0>>::type, numeric_list<int, 0, 0, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 0, 1, 2>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>>::type, type_list<dummy_a>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b>>::type, type_list<dummy_a, dummy_b>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b>, type_list<dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a, dummy_b>, type_list<dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1>>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1>, numeric_list<int, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0, 1>, numeric_list<int, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+}
+
+static void test_slice()
+{
+  typedef type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c> tl;
+  typedef numeric_list<int, 0, 1, 2, 3, 4, 5> il;
+
+  VERIFY((is_same<typename take<0, tl>::type, type_list<>>::value));
+  VERIFY((is_same<typename take<1, tl>::type, type_list<dummy_a>>::value));
+  VERIFY((is_same<typename take<2, tl>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename take<3, tl>::type, type_list<dummy_a, dummy_a, dummy_b>>::value));
+  VERIFY((is_same<typename take<4, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b>>::value));
+  VERIFY((is_same<typename take<5, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename take<6, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+
+  VERIFY((is_same<typename take<0, il>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename take<1, il>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename take<2, il>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename take<3, il>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename take<4, il>::type, numeric_list<int, 0, 1, 2, 3>>::value));
+  VERIFY((is_same<typename take<5, il>::type, numeric_list<int, 0, 1, 2, 3, 4>>::value));
+  VERIFY((is_same<typename take<6, il>::type, numeric_list<int, 0, 1, 2, 3, 4, 5>>::value));
+  
+  VERIFY((is_same<typename skip<0, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<1, tl>::type, type_list<dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<2, tl>::type, type_list<dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<3, tl>::type, type_list<dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<4, tl>::type, type_list<dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<5, tl>::type, type_list<dummy_c>>::value));
+  VERIFY((is_same<typename skip<6, tl>::type, type_list<>>::value));
+
+  VERIFY((is_same<typename skip<0, il>::type, numeric_list<int, 0, 1, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<1, il>::type, numeric_list<int, 1, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<2, il>::type, numeric_list<int, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<3, il>::type, numeric_list<int, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<4, il>::type, numeric_list<int, 4, 5>>::value));
+  VERIFY((is_same<typename skip<5, il>::type, numeric_list<int, 5>>::value));
+  VERIFY((is_same<typename skip<6, il>::type, numeric_list<int>>::value));
+
+  VERIFY((is_same<typename slice<0, 3, tl>::type, typename take<3, tl>::type>::value));
+  VERIFY((is_same<typename slice<0, 3, il>::type, typename take<3, il>::type>::value));
+  VERIFY((is_same<typename slice<1, 3, tl>::type, type_list<dummy_a, dummy_b, dummy_b>>::value));
+  VERIFY((is_same<typename slice<1, 3, il>::type, numeric_list<int, 1, 2, 3>>::value));
+}
+
+static void test_get()
+{
+  typedef type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c> tl;
+  typedef numeric_list<int, 4, 8, 15, 16, 23, 42> il;
+
+  VERIFY((is_same<typename get<0, tl>::type, dummy_a>::value));
+  VERIFY((is_same<typename get<1, tl>::type, dummy_a>::value));
+  VERIFY((is_same<typename get<2, tl>::type, dummy_b>::value));
+  VERIFY((is_same<typename get<3, tl>::type, dummy_b>::value));
+  VERIFY((is_same<typename get<4, tl>::type, dummy_c>::value));
+  VERIFY((is_same<typename get<5, tl>::type, dummy_c>::value));
+
+  VERIFY_IS_EQUAL(((int)get<0, il>::value), 4);
+  VERIFY_IS_EQUAL(((int)get<1, il>::value), 8);
+  VERIFY_IS_EQUAL(((int)get<2, il>::value), 15);
+  VERIFY_IS_EQUAL(((int)get<3, il>::value), 16);
+  VERIFY_IS_EQUAL(((int)get<4, il>::value), 23);
+  VERIFY_IS_EQUAL(((int)get<5, il>::value), 42);
+}
+
+static void test_id_helper(dummy_a a, dummy_a b, dummy_a c)
+{
+  (void)a;
+  (void)b;
+  (void)c;
+}
+
+template<int... ii>
+static void test_id_numeric()
+{
+  test_id_helper(typename id_numeric<int, ii, dummy_a>::type()...);
+}
+
+template<typename... tt>
+static void test_id_type()
+{
+  test_id_helper(typename id_type<tt, dummy_a>::type()...);
+}
+
+static void test_id()
+{
+  // don't call VERIFY here, just assume it works if it compiles
+  // (otherwise it will complain that it can't find the function)
+  test_id_numeric<1, 4, 6>();
+  test_id_type<dummy_a, dummy_b, dummy_c>();
+}
+
+static void test_is_same_gf()
+{
+  VERIFY((!is_same_gf<dummy_a, dummy_b>::value));
+  VERIFY((!!is_same_gf<dummy_a, dummy_a>::value));
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), false);
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), false);
+}
+
+static void test_apply_op()
+{
+  typedef type_list<dummy_a, dummy_b, dummy_c> tl;
+  VERIFY((!!is_same<typename apply_op_from_left<dummy_op, dummy_a, tl>::type, type_list<dummy_e, dummy_c, dummy_d>>::value));
+  VERIFY((!!is_same<typename apply_op_from_right<dummy_op, dummy_a, tl>::type, type_list<dummy_e, dummy_d, dummy_b>>::value));
+}
+
+static void test_contained_in_list()
+{
+  typedef type_list<dummy_a, dummy_b, dummy_c> tl;
+
+  VERIFY((!!contained_in_list<is_same, dummy_a, tl>::value));
+  VERIFY((!!contained_in_list<is_same, dummy_b, tl>::value));
+  VERIFY((!!contained_in_list<is_same, dummy_c, tl>::value));
+  VERIFY((!contained_in_list<is_same, dummy_d, tl>::value));
+  VERIFY((!contained_in_list<is_same, dummy_e, tl>::value));
+
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_a, tl>::value));
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_b, tl>::value));
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_c, tl>::value));
+  VERIFY((!contained_in_list_gf<dummy_test, dummy_d, tl>::value));
+  VERIFY((!contained_in_list_gf<dummy_test, dummy_e, tl>::value));
+
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_a, tl>::global_flags), 1);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_b, tl>::global_flags), 2);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_c, tl>::global_flags), 4);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_d, tl>::global_flags), 0);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_e, tl>::global_flags), 0);
+}
+
+static void test_arg_reductions()
+{
+  VERIFY_IS_EQUAL(arg_sum(1,2,3,4), 10);
+  VERIFY_IS_EQUAL(arg_prod(1,2,3,4), 24);
+  VERIFY_IS_APPROX(arg_sum(0.5, 2, 5), 7.5);
+  VERIFY_IS_APPROX(arg_prod(0.5, 2, 5), 5.0);
+}
+
+static void test_array_reverse_and_reduce()
+{
+  array<int, 6> a{{4, 8, 15, 16, 23, 42}};
+  array<int, 6> b{{42, 23, 16, 15, 8, 4}};
+
+  // there is no operator<< for std::array, so VERIFY_IS_EQUAL will
+  // not compile
+  VERIFY((array_reverse(a) == b));
+  VERIFY((array_reverse(b) == a));
+  VERIFY_IS_EQUAL((array_sum(a)), 108);
+  VERIFY_IS_EQUAL((array_sum(b)), 108);
+  VERIFY_IS_EQUAL((array_prod(a)), 7418880);
+  VERIFY_IS_EQUAL((array_prod(b)), 7418880);
+}
+
+static void test_array_zip_and_apply()
+{
+  array<int, 6> a{{4, 8, 15, 16, 23, 42}};
+  array<int, 6> b{{0, 1, 2, 3, 4, 5}};
+  array<int, 6> c{{4, 9, 17, 19, 27, 47}};
+  array<int, 6> d{{0, 8, 30, 48, 92, 210}};
+  array<int, 6> e{{0, 2, 4, 6, 8, 10}};
+
+  VERIFY((array_zip<sum_op>(a, b) == c));
+  VERIFY((array_zip<product_op>(a, b) == d));
+  VERIFY((array_apply<times2_op>(b) == e));
+  VERIFY_IS_EQUAL((array_apply_and_reduce<sum_op, times2_op>(a)), 216);
+  VERIFY_IS_EQUAL((array_apply_and_reduce<sum_op, times2_op>(b)), 30);
+  VERIFY_IS_EQUAL((array_zip_and_reduce<product_op, sum_op>(a, b)), 14755932);
+  VERIFY_IS_EQUAL((array_zip_and_reduce<sum_op, product_op>(a, b)), 388);
+}
+
+static void test_array_misc()
+{
+  array<int, 3> a3{{1, 1, 1}};
+  array<int, 6> a6{{2, 2, 2, 2, 2, 2}};
+  VERIFY((repeat<3, int>(1) == a3));
+  VERIFY((repeat<6, int>(2) == a6));
+
+  int data[5] = { 0, 1, 2, 3, 4 };
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 0>(data).c), 0);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 1>(data).c), 1);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 2>(data).c), 2);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 3>(data).c), 3);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 4>(data).c), 4);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
+}
+
+EIGEN_DECLARE_TEST(cxx11_meta)
+{
+  CALL_SUBTEST(test_gen_numeric_list());
+  CALL_SUBTEST(test_concat());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_get());
+  CALL_SUBTEST(test_id());
+  CALL_SUBTEST(test_is_same_gf());
+  CALL_SUBTEST(test_apply_op());
+  CALL_SUBTEST(test_contained_in_list());
+  CALL_SUBTEST(test_arg_reductions());
+  CALL_SUBTEST(test_array_reverse_and_reduce());
+  CALL_SUBTEST(test_array_zip_and_apply());
+  CALL_SUBTEST(test_array_misc());
+}

diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
new file mode 100644
index 0000000..993ee17
--- /dev/null
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp

@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include "Eigen/CXX11/ThreadPool"
+#include "Eigen/CXX11/Tensor"
+
+static void test_create_destroy_empty_pool()
+{
+  // Just create and destroy the pool. This will wind up and tear down worker
+  // threads. Ensure there are no issues in that logic.
+  for (int i = 0; i < 16; ++i) {
+    ThreadPool tp(i);
+  }
+}
+
+
+static void test_parallelism(bool allow_spinning)
+{
+  // Test we never-ever fail to match available tasks with idle threads.
+  const int kThreads = 16;  // code below expects that this is a multiple of 4
+  ThreadPool tp(kThreads, allow_spinning);
+  VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
+  VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
+  for (int iter = 0; iter < 100; ++iter) {
+    std::atomic<int> running(0);
+    std::atomic<int> done(0);
+    std::atomic<int> phase(0);
+    // Schedule kThreads tasks and ensure that they all are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&]() {
+        const int thread_id = tp.CurrentThreadId();
+        VERIFY_GE(thread_id, 0);
+        VERIFY_LE(thread_id, kThreads - 1);
+        running++;
+        while (phase < 1) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 1;
+    // Now, while the previous tasks exit, schedule another kThreads tasks and
+    // ensure that they are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&, i]() {
+        running++;
+        while (phase < 2) {
+        }
+        // When all tasks are running, half of tasks exit, quarter of tasks
+        // continue running and quarter of tasks schedule another 2 tasks each.
+        // Concurrently main thread schedules another quarter of tasks.
+        // This gives us another kThreads tasks and we ensure that they all
+        // are running.
+        if (i < kThreads / 2) {
+        } else if (i < 3 * kThreads / 4) {
+          running++;
+          while (phase < 3) {
+          }
+          done++;
+        } else {
+          for (int j = 0; j < 2; ++j) {
+            tp.Schedule([&]() {
+              running++;
+              while (phase < 3) {
+              }
+              done++;
+            });
+          }
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 2;
+    for (int i = 0; i < kThreads / 4; ++i) {
+      tp.Schedule([&]() {
+        running++;
+        while (phase < 3) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    phase = 3;
+    while (done != 3 * kThreads) {
+    }
+  }
+}
+
+
+static void test_cancel()
+{
+  ThreadPool tp(2);
+
+  // Schedule a large number of closure that each sleeps for one second. This
+  // will keep the thread pool busy for much longer than the default test timeout.
+  for (int i = 0; i < 1000; ++i) {
+    tp.Schedule([]() {
+      std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+    });
+  }
+
+  // Cancel the processing of all the closures that are still pending.
+  tp.Cancel();
+}
+
+static void test_pool_partitions() {
+  const int kThreads = 2;
+  ThreadPool tp(kThreads);
+
+  // Assign each thread to its own partition, so that stealing other work only
+  // occurs globally when a thread is idle.
+  std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
+  for (int i = 0; i < kThreads; ++i) {
+    steal_partitions[i] = std::make_pair(i, i + 1);
+  }
+  tp.SetStealPartitions(steal_partitions);
+
+  std::atomic<int> running(0);
+  std::atomic<int> done(0);
+  std::atomic<int> phase(0);
+
+  // Schedule kThreads tasks and ensure that they all are running.
+  for (int i = 0; i < kThreads; ++i) {
+    tp.Schedule([&]() {
+      const int thread_id = tp.CurrentThreadId();
+      VERIFY_GE(thread_id, 0);
+      VERIFY_LE(thread_id, kThreads - 1);
+      ++running;
+      while (phase < 1) {
+      }
+      ++done;
+    });
+  }
+  while (running != kThreads) {
+  }
+  // Schedule each closure to only run on thread 'i' and verify that it does.
+  for (int i = 0; i < kThreads; ++i) {
+    tp.ScheduleWithHint(
+        [&, i]() {
+          ++running;
+          const int thread_id = tp.CurrentThreadId();
+          VERIFY_IS_EQUAL(thread_id, i);
+          while (phase < 2) {
+          }
+          ++done;
+        },
+        i, i + 1);
+  }
+  running = 0;
+  phase = 1;
+  while (running != kThreads) {
+  }
+  running = 0;
+  phase = 2;
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool)
+{
+  CALL_SUBTEST(test_create_destroy_empty_pool());
+  CALL_SUBTEST(test_parallelism(true));
+  CALL_SUBTEST(test_parallelism(false));
+  CALL_SUBTEST(test_cancel());
+  CALL_SUBTEST(test_pool_partitions());
+}

diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
new file mode 100644
index 0000000..8fc5a30
--- /dev/null
+++ b/unsupported/test/cxx11_runqueue.cpp

@@ -0,0 +1,235 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include <cstdlib>
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
+
+void test_basic_runqueue()
+{
+  RunQueue<int, 4> q;
+  // Check empty state.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  std::vector<int> stolen;
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Push one front, pop one front.
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  // Push front to overflow.
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(4));
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(5));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(6, q.PushFront(6));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(5, q.PopFront());
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(4, q.PopFront());
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(3, q.PopFront());
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(2, q.PopFront());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  // Push one back, pop one back.
+  VERIFY_IS_EQUAL(0, q.PushBack(7));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(7, stolen[0]);
+  VERIFY_IS_EQUAL(0u, q.Size());
+  stolen.clear();
+  // Push back to overflow.
+  VERIFY_IS_EQUAL(0, q.PushBack(8));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(9));
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(10));
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(11));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(12, q.PushBack(12));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  // Pop back in halves.
+  VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(2u, stolen.size());
+  VERIFY_IS_EQUAL(10, stolen[0]);
+  VERIFY_IS_EQUAL(11, stolen[1]);
+  VERIFY_IS_EQUAL(2u, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(9, stolen[0]);
+  VERIFY_IS_EQUAL(1u, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(8, stolen[0]);
+  stolen.clear();
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Empty again.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(1, q.PopBack());
+  VERIFY_IS_EQUAL(2, q.PopBack());
+  VERIFY_IS_EQUAL(3, q.PopBack());
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+}
+
+// Empty tests that the queue is not claimed to be empty when is is in fact not.
+// Emptiness property is crucial part of thread pool blocking scheme,
+// so we go to great effort to ensure this property. We create a queue with
+// 1 element and then push 1 element (either front or back at random) and pop
+// 1 element (either front or back at random). So queue always contains at least
+// 1 element, but otherwise changes chaotically. Another thread constantly tests
+// that the queue is not claimed to be empty.
+void test_empty_runqueue()
+{
+  RunQueue<int, 4> q;
+  q.PushFront(1);
+  std::atomic<bool> done(false);
+  std::thread mutator([&q, &done]() {
+    unsigned rnd = 0;
+    std::vector<int> stolen;
+    for (int i = 0; i < 1 << 18; i++) {
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(0, q.PushFront(1));
+      else
+        VERIFY_IS_EQUAL(0, q.PushBack(1));
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(1, q.PopFront());
+      else {
+        for (;;) {
+          if (q.PopBackHalf(&stolen) == 1) {
+            stolen.clear();
+            break;
+          }
+          VERIFY_IS_EQUAL(0u, stolen.size());
+        }
+      }
+    }
+    done = true;
+  });
+  while (!done) {
+    VERIFY(!q.Empty());
+    int size = q.Size();
+    VERIFY_GE(size, 1);
+    VERIFY_LE(size, 2);
+  }
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  mutator.join();
+}
+
+// Stress is a chaotic random test.
+// One thread (owner) calls PushFront/PopFront, other threads call PushBack/
+// PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass.
+void test_stress_runqueue()
+{
+  static const int kEvents = 1 << 18;
+  RunQueue<int, 8> q;
+  std::atomic<int> total(0);
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.emplace_back(new std::thread([&q, &total]() {
+    int sum = 0;
+    int pushed = 1;
+    int popped = 1;
+    while (pushed < kEvents || popped < kEvents) {
+      if (pushed < kEvents) {
+        if (q.PushFront(pushed) == 0) {
+          sum += pushed;
+          pushed++;
+        }
+      }
+      if (popped < kEvents) {
+        int v = q.PopFront();
+        if (v != 0) {
+          sum -= v;
+          popped++;
+        }
+      }
+    }
+    total += sum;
+  }));
+  for (int i = 0; i < 2; i++) {
+    threads.emplace_back(new std::thread([&q, &total]() {
+      int sum = 0;
+      for (int j = 1; j < kEvents; j++) {
+        if (q.PushBack(j) == 0) {
+          sum += j;
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;
+      }
+      total += sum;
+    }));
+    threads.emplace_back(new std::thread([&q, &total]() {
+      int sum = 0;
+      std::vector<int> stolen;
+      for (int j = 1; j < kEvents;) {
+        if (q.PopBackHalf(&stolen) == 0) {
+          EIGEN_THREAD_YIELD();
+          continue;
+        }
+        while (stolen.size() && j < kEvents) {
+          int v = stolen.back();
+          stolen.pop_back();
+          VERIFY_IS_NOT_EQUAL(v, 0);
+          sum += v;
+          j++;
+        }
+      }
+      while (stolen.size()) {
+        int v = stolen.back();
+        stolen.pop_back();
+        VERIFY_IS_NOT_EQUAL(v, 0);
+        while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD();
+      }
+      total -= sum;
+    }));
+  }
+  for (size_t i = 0; i < threads.size(); i++) threads[i]->join();
+  VERIFY(q.Empty());
+  VERIFY(total.load() == 0);
+}
+
+EIGEN_DECLARE_TEST(cxx11_runqueue)
+{
+  CALL_SUBTEST_1(test_basic_runqueue());
+  CALL_SUBTEST_2(test_empty_runqueue());
+  CALL_SUBTEST_3(test_stress_runqueue());
+}

diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
new file mode 100644
index 0000000..4a0c896
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax.cpp

@@ -0,0 +1,294 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@google.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+using Eigen::Tuple;
+
+template <int DataLayout>
+static void test_simple_index_tuples()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  for (DenseIndex n = 0; n < 2*3*5*7; ++n) {
+    const Tuple<DenseIndex, float>& v = index_tuples.coeff(n);
+    VERIFY_IS_EQUAL(v.first, n);
+    VERIFY_IS_EQUAL(v.second, tensor.coeff(n));
+  }
+}
+
+template <int DataLayout>
+static void test_index_tuples_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+
+  index_tuples = tensor.index_tuples();
+
+  for (Eigen::DenseIndex n = 0; n < tensor.size(); ++n) {
+    const Tuple<DenseIndex, float>& v = index_tuples(n); //(i, j, k, l);
+    VERIFY_IS_EQUAL(v.first, n);
+    VERIFY_IS_EQUAL(v.second, tensor(n));
+  }
+}
+
+template <int DataLayout>
+static void test_argmax_tuple_reducer()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;
+  DimensionList<DenseIndex, 4> dims;
+  reduced = index_tuples.reduce(
+      dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 0, DataLayout> maxi = tensor.maximum();
+
+  VERIFY_IS_EQUAL(maxi(), reduced(0).second);
+
+  array<DenseIndex, 3> reduce_dims;
+  for (int d = 0; d < 3; ++d) reduce_dims[d] = d;
+  Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
+  reduced_by_dims = index_tuples.reduce(
+      reduce_dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 1, DataLayout> max_by_dims = tensor.maximum(reduce_dims);
+
+  for (int l = 0; l < 7; ++l) {
+    VERIFY_IS_EQUAL(max_by_dims(l), reduced_by_dims(l).second);
+  }
+}
+
+template <int DataLayout>
+static void test_argmin_tuple_reducer()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;
+  DimensionList<DenseIndex, 4> dims;
+  reduced = index_tuples.reduce(
+      dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 0, DataLayout> mini = tensor.minimum();
+
+  VERIFY_IS_EQUAL(mini(), reduced(0).second);
+
+  array<DenseIndex, 3> reduce_dims;
+  for (int d = 0; d < 3; ++d) reduce_dims[d] = d;
+  Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
+  reduced_by_dims = index_tuples.reduce(
+      reduce_dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 1, DataLayout> min_by_dims = tensor.minimum(reduce_dims);
+
+  for (int l = 0; l < 7; ++l) {
+    VERIFY_IS_EQUAL(min_by_dims(l), reduced_by_dims(l).second);
+  }
+}
+
+template <int DataLayout>
+static void test_simple_argmax()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+  tensor(0,0,0,0) = 10.0;
+
+  Tensor<DenseIndex, 0, DataLayout> tensor_argmax;
+
+  tensor_argmax = tensor.argmax();
+
+  VERIFY_IS_EQUAL(tensor_argmax(0), 0);
+
+  tensor(1,2,4,6) = 20.0;
+
+  tensor_argmax = tensor.argmax();
+
+  VERIFY_IS_EQUAL(tensor_argmax(0), 2*3*5*7 - 1);
+}
+
+template <int DataLayout>
+static void test_simple_argmin()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+  tensor(0,0,0,0) = -10.0;
+
+  Tensor<DenseIndex, 0, DataLayout> tensor_argmin;
+
+  tensor_argmin = tensor.argmin();
+
+  VERIFY_IS_EQUAL(tensor_argmin(0), 0);
+
+  tensor(1,2,4,6) = -20.0;
+
+  tensor_argmin = tensor.argmin();
+
+  VERIFY_IS_EQUAL(tensor_argmin(0), 2*3*5*7 - 1);
+}
+
+template <int DataLayout>
+static void test_argmax_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims {2, 3, 5, 7};
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_argmax;
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = 10.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmax = tensor.argmax(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmax.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmax.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmax.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = 20.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmax = tensor.argmax(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmax.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmax.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmax.data()[n], tensor.dimension(dim) - 1);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_argmin_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims {2, 3, 5, 7};
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_argmin;
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = -10.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmin = tensor.argmin(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmin.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmin.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmin.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = -20.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmin = tensor.argmin(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmin.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmin.size(); ++n) {
+      // Expect min to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmin.data()[n], tensor.dimension(dim) - 1);
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax)
+{
+  CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
+  CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
+  CALL_SUBTEST(test_index_tuples_dim<RowMajor>());
+  CALL_SUBTEST(test_index_tuples_dim<ColMajor>());
+  CALL_SUBTEST(test_argmax_tuple_reducer<RowMajor>());
+  CALL_SUBTEST(test_argmax_tuple_reducer<ColMajor>());
+  CALL_SUBTEST(test_argmin_tuple_reducer<RowMajor>());
+  CALL_SUBTEST(test_argmin_tuple_reducer<ColMajor>());
+  CALL_SUBTEST(test_simple_argmax<RowMajor>());
+  CALL_SUBTEST(test_simple_argmax<ColMajor>());
+  CALL_SUBTEST(test_simple_argmin<RowMajor>());
+  CALL_SUBTEST(test_simple_argmin<ColMajor>());
+  CALL_SUBTEST(test_argmax_dim<RowMajor>());
+  CALL_SUBTEST(test_argmax_dim<ColMajor>());
+  CALL_SUBTEST(test_argmin_dim<RowMajor>());
+  CALL_SUBTEST(test_argmin_dim<ColMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
new file mode 100644
index 0000000..79f4066
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu

@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+using Eigen::Tensor;
+
+template <int Layout>
+void test_gpu_simple_argmax()
+{
+  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
+  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
+  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;
+  in(71, 52, 96) = 1000.0;
+
+  std::size_t in_bytes = in.size() * sizeof(double);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  double* d_in;
+  DenseIndex* d_out_max;
+  DenseIndex* d_out_min;
+  gpuMalloc((void**)(&d_in), in_bytes);
+  gpuMalloc((void**)(&d_out_max), out_bytes);
+  gpuMalloc((void**)(&d_out_min), out_bytes);
+
+  gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+
+  gpu_out_max.device(gpu_device) = gpu_in.argmax();
+  gpu_out_min.device(gpu_device) = gpu_in.argmin();
+
+  assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
+  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+  gpuFree(d_in);
+  gpuFree(d_out_max);
+  gpuFree(d_out_min);
+}
+
+template <int DataLayout>
+void test_gpu_argmax_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims;
+  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    array<DenseIndex, 3> out_shape;
+    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = 10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(float);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    float* d_in;
+    DenseIndex* d_out;
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
+
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
+
+    Eigen::GpuStreamDevice stream;
+    Eigen::GpuDevice gpu_device(&stream);
+
+    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+    VERIFY_IS_EQUAL(tensor_arg.size(),
+                    size_t(2*3*5*7 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = 20.0;
+          }
+        }
+      }
+    }
+
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
+
+    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+
+    gpuFree(d_in);
+    gpuFree(d_out);
+  }
+}
+
+template <int DataLayout>
+void test_gpu_argmin_dim()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims;
+  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    array<DenseIndex, 3> out_shape;
+    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = -10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(float);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    float* d_in;
+    DenseIndex* d_out;
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
+
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
+
+    Eigen::GpuStreamDevice stream;
+    Eigen::GpuDevice gpu_device(&stream);
+
+    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+    VERIFY_IS_EQUAL(tensor_arg.size(),
+                    2*3*5*7 / tensor.dimension(dim));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = -20.0;
+          }
+        }
+      }
+    }
+
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
+
+    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+
+    gpuFree(d_in);
+    gpuFree(d_out);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
new file mode 100644
index 0000000..7ac7128
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp

@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int Layout, typename DenseIndex>
+static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
+  Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;
+  in(1, 1, 1) = 1000.0;
+
+  std::size_t in_bytes = in.size() * sizeof(DataType);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+  DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+  DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in,
+                                                                           Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
+  sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes);
+
+  gpu_out_max.device(sycl_device) = gpu_in.argmax();
+  gpu_out_min.device(sycl_device) = gpu_in.argmin();
+
+  sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
+  sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
+
+  VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1);
+  VERIFY_IS_EQUAL(out_min(), 0);
+
+  sycl_device.deallocate(d_in);
+  sycl_device.deallocate(d_out_max);
+  sycl_device.deallocate(d_out_min);
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
+  DenseIndex sizeDim0 = 9;
+  DenseIndex sizeDim1 = 3;
+  DenseIndex sizeDim2 = 5;
+  DenseIndex sizeDim3 = 7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0);
+  dims.push_back(sizeDim1);
+  dims.push_back(sizeDim2);
+  dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
+            // = 10.0
+            tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
+  DenseIndex sizeDim0 = 9;
+  DenseIndex sizeDim1 = 3;
+  DenseIndex sizeDim2 = 5;
+  DenseIndex sizeDim3 = 7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0);
+  dims.push_back(sizeDim1);
+  dims.push_back(sizeDim2);
+  dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+template <typename DataType, typename Device_Selector>
+void sycl_argmax_test_per_device(const Device_Selector& d) {
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
new file mode 100644
index 0000000..ce9d243
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_assign.cpp

@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_1d()
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+  vec1(0) = 4;  vec2(0) = 0;
+  vec1(1) = 8;  vec2(1) = 1;
+  vec1(2) = 15; vec2(2) = 2;
+  vec1(3) = 16; vec2(3) = 3;
+  vec1(4) = 23; vec2(4) = 4;
+  vec1(5) = 42; vec2(5) = 5;
+
+  int col_major[6];
+  int row_major[6];
+  memset(col_major, 0, 6*sizeof(int));
+  memset(row_major, 0, 6*sizeof(int));
+  TensorMap<Tensor<int, 1> > vec3(col_major, 6);
+  TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);
+
+  vec3 = vec1;
+  vec4 = vec2;
+
+  VERIFY_IS_EQUAL(vec3(0), 4);
+  VERIFY_IS_EQUAL(vec3(1), 8);
+  VERIFY_IS_EQUAL(vec3(2), 15);
+  VERIFY_IS_EQUAL(vec3(3), 16);
+  VERIFY_IS_EQUAL(vec3(4), 23);
+  VERIFY_IS_EQUAL(vec3(5), 42);
+
+  VERIFY_IS_EQUAL(vec4(0), 0);
+  VERIFY_IS_EQUAL(vec4(1), 1);
+  VERIFY_IS_EQUAL(vec4(2), 2);
+  VERIFY_IS_EQUAL(vec4(3), 3);
+  VERIFY_IS_EQUAL(vec4(4), 4);
+  VERIFY_IS_EQUAL(vec4(5), 5);
+
+  vec1.setZero();
+  vec2.setZero();
+  vec1 = vec3;
+  vec2 = vec4;
+
+  VERIFY_IS_EQUAL(vec1(0), 4);
+  VERIFY_IS_EQUAL(vec1(1), 8);
+  VERIFY_IS_EQUAL(vec1(2), 15);
+  VERIFY_IS_EQUAL(vec1(3), 16);
+  VERIFY_IS_EQUAL(vec1(4), 23);
+  VERIFY_IS_EQUAL(vec1(5), 42);
+
+  VERIFY_IS_EQUAL(vec2(0), 0);
+  VERIFY_IS_EQUAL(vec2(1), 1);
+  VERIFY_IS_EQUAL(vec2(2), 2);
+  VERIFY_IS_EQUAL(vec2(3), 3);
+  VERIFY_IS_EQUAL(vec2(4), 4);
+  VERIFY_IS_EQUAL(vec2(5), 5);
+}
+
+static void test_2d()
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  int col_major[6];
+  int row_major[6];
+  memset(col_major, 0, 6*sizeof(int));
+  memset(row_major, 0, 6*sizeof(int));
+  TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
+  TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);
+
+  mat3 = mat1;
+  mat4 = mat2;
+
+  VERIFY_IS_EQUAL(mat3(0,0), 0);
+  VERIFY_IS_EQUAL(mat3(0,1), 1);
+  VERIFY_IS_EQUAL(mat3(0,2), 2);
+  VERIFY_IS_EQUAL(mat3(1,0), 3);
+  VERIFY_IS_EQUAL(mat3(1,1), 4);
+  VERIFY_IS_EQUAL(mat3(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat4(0,0), 0);
+  VERIFY_IS_EQUAL(mat4(0,1), 1);
+  VERIFY_IS_EQUAL(mat4(0,2), 2);
+  VERIFY_IS_EQUAL(mat4(1,0), 3);
+  VERIFY_IS_EQUAL(mat4(1,1), 4);
+  VERIFY_IS_EQUAL(mat4(1,2), 5);
+
+  mat1.setZero();
+  mat2.setZero();
+  mat1 = mat3;
+  mat2 = mat4;
+
+  VERIFY_IS_EQUAL(mat1(0,0), 0);
+  VERIFY_IS_EQUAL(mat1(0,1), 1);
+  VERIFY_IS_EQUAL(mat1(0,2), 2);
+  VERIFY_IS_EQUAL(mat1(1,0), 3);
+  VERIFY_IS_EQUAL(mat1(1,1), 4);
+  VERIFY_IS_EQUAL(mat1(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat2(0,0), 0);
+  VERIFY_IS_EQUAL(mat2(0,1), 1);
+  VERIFY_IS_EQUAL(mat2(0,2), 2);
+  VERIFY_IS_EQUAL(mat2(1,0), 3);
+  VERIFY_IS_EQUAL(mat2(1,1), 4);
+  VERIFY_IS_EQUAL(mat2(1,2), 5);
+}
+
+static void test_3d()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  int col_major[2*3*7];
+  int row_major[2*3*7];
+  memset(col_major, 0, 2*3*7*sizeof(int));
+  memset(row_major, 0, 2*3*7*sizeof(int));
+  TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
+  TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);
+
+  mat3 = mat1;
+  mat4 = mat2;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  mat1.setZero();
+  mat2.setZero();
+  mat1 = mat3;
+  mat2 = mat4;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat1(i,j,k), val);
+        VERIFY_IS_EQUAL(mat2(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+static void test_same_type()
+{
+  Tensor<int, 1> orig_tensor(5);
+  Tensor<int, 1> dest_tensor(5);
+  orig_tensor.setRandom();
+  dest_tensor.setRandom();
+  int* orig_data = orig_tensor.data();
+  int* dest_data = dest_tensor.data();
+  dest_tensor = orig_tensor;
+  VERIFY_IS_EQUAL(orig_tensor.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_tensor.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i));
+  }
+
+  TensorFixedSize<int, Sizes<5> > orig_array;
+  TensorFixedSize<int, Sizes<5> > dest_array;
+  orig_array.setRandom();
+  dest_array.setRandom();
+  orig_data = orig_array.data();
+  dest_data = dest_array.data();
+  dest_array = orig_array;
+  VERIFY_IS_EQUAL(orig_array.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_array.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest_array(i), orig_array(i));
+  }
+
+  int orig[5] = {1, 2, 3, 4, 5};
+  int dest[5] = {6, 7, 8, 9, 10};
+  TensorMap<Tensor<int, 1> > orig_map(orig, 5);
+  TensorMap<Tensor<int, 1> > dest_map(dest, 5);
+  orig_data = orig_map.data();
+  dest_data = dest_map.data();
+  dest_map = orig_map;
+  VERIFY_IS_EQUAL(orig_map.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_map.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest[i], i+1);
+  }
+}
+
+static void test_auto_resize()
+{
+  Tensor<int, 1> tensor1;
+  Tensor<int, 1> tensor2(3);
+  Tensor<int, 1> tensor3(5);
+  Tensor<int, 1> tensor4(7);
+
+  Tensor<int, 1> new_tensor(5);
+  new_tensor.setRandom();
+
+  tensor1 = tensor2 = tensor3 = tensor4 = new_tensor;
+
+  VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0));
+  for (int i = 0; i < new_tensor.dimension(0); ++i) {
+    VERIFY_IS_EQUAL(tensor1(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor2(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor3(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor4(i), new_tensor(i));
+  }
+}
+
+
+static void test_compound_assign()
+{
+  Tensor<int, 1> start_tensor(10);
+  Tensor<int, 1> offset_tensor(10);
+  start_tensor.setRandom();
+  offset_tensor.setRandom();
+
+  Tensor<int, 1> tensor = start_tensor;
+  tensor += offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor -= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor *= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor /= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
+  }
+}
+
+static void test_std_initializers_tensor() {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  Tensor<int, 1> a(3);
+  a.setValues({0, 1, 2});
+  VERIFY_IS_EQUAL(a(0), 0);
+  VERIFY_IS_EQUAL(a(1), 1);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // It fills the top-left slice.
+  a.setValues({10, 20});
+  VERIFY_IS_EQUAL(a(0), 10);
+  VERIFY_IS_EQUAL(a(1), 20);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // Chaining.
+  Tensor<int, 1> a2(3);
+  a2 = a.setValues({100, 200, 300});
+  VERIFY_IS_EQUAL(a(0), 100);
+  VERIFY_IS_EQUAL(a(1), 200);
+  VERIFY_IS_EQUAL(a(2), 300);
+  VERIFY_IS_EQUAL(a2(0), 100);
+  VERIFY_IS_EQUAL(a2(1), 200);
+  VERIFY_IS_EQUAL(a2(2), 300);
+
+  Tensor<int, 2> b(2, 3);
+  b.setValues({{0, 1, 2}, {3, 4, 5}});
+  VERIFY_IS_EQUAL(b(0, 0), 0);
+  VERIFY_IS_EQUAL(b(0, 1), 1);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 3);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  // It fills the top-left slice.
+  b.setValues({{10, 20}, {30}});
+  VERIFY_IS_EQUAL(b(0, 0), 10);
+  VERIFY_IS_EQUAL(b(0, 1), 20);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 30);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  Eigen::Tensor<int, 3> c(3, 2, 4);
+  c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}},
+               {{10, 11, 12, 13}, {14, 15, 16, 17}},
+               {{20, 21, 22, 23}, {24, 25, 26, 27}}});
+  VERIFY_IS_EQUAL(c(0, 0, 0), 0);
+  VERIFY_IS_EQUAL(c(0, 0, 1), 1);
+  VERIFY_IS_EQUAL(c(0, 0, 2), 2);
+  VERIFY_IS_EQUAL(c(0, 0, 3), 3);
+  VERIFY_IS_EQUAL(c(0, 1, 0), 4);
+  VERIFY_IS_EQUAL(c(0, 1, 1), 5);
+  VERIFY_IS_EQUAL(c(0, 1, 2), 6);
+  VERIFY_IS_EQUAL(c(0, 1, 3), 7);
+  VERIFY_IS_EQUAL(c(1, 0, 0), 10);
+  VERIFY_IS_EQUAL(c(1, 0, 1), 11);
+  VERIFY_IS_EQUAL(c(1, 0, 2), 12);
+  VERIFY_IS_EQUAL(c(1, 0, 3), 13);
+  VERIFY_IS_EQUAL(c(1, 1, 0), 14);
+  VERIFY_IS_EQUAL(c(1, 1, 1), 15);
+  VERIFY_IS_EQUAL(c(1, 1, 2), 16);
+  VERIFY_IS_EQUAL(c(1, 1, 3), 17);
+  VERIFY_IS_EQUAL(c(2, 0, 0), 20);
+  VERIFY_IS_EQUAL(c(2, 0, 1), 21);
+  VERIFY_IS_EQUAL(c(2, 0, 2), 22);
+  VERIFY_IS_EQUAL(c(2, 0, 3), 23);
+  VERIFY_IS_EQUAL(c(2, 1, 0), 24);
+  VERIFY_IS_EQUAL(c(2, 1, 1), 25);
+  VERIFY_IS_EQUAL(c(2, 1, 2), 26);
+  VERIFY_IS_EQUAL(c(2, 1, 3), 27);
+#endif  // EIGEN_HAS_VARIADIC_TEMPLATES
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_assign)
+{
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_same_type());
+  CALL_SUBTEST(test_auto_resize());
+  CALL_SUBTEST(test_compound_assign());
+  CALL_SUBTEST(test_std_initializers_tensor());
+}

diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
new file mode 100644
index 0000000..5fb12e0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_access.cpp

@@ -0,0 +1,576 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Andy Davis <andydavis@google.com>
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <algorithm>
+#include <set>
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TensorBlockShapeType;
+
+static TensorOpCost zeroCost() { return {0, 0, 0}; }
+
+template<typename T>
+static const T& choose(int layout, const T& col, const T& row) {
+  return layout == ColMajor ? col : row;
+}
+
+static TensorBlockShapeType RandomShape() {
+  return internal::random<bool>()
+         ? TensorBlockShapeType::kUniformAllDims
+         : TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(1, 20);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+template <typename T>
+static T* GenerateRandomData(const Index& size) {
+  T* data = new T[size];
+  for (int i = 0; i < size; ++i) {
+    data[i] = internal::random<T>();
+  }
+  return data;
+}
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+  for (int i = 0; i < NumDims; ++i) {
+    std::cout << dims[i] << "; ";
+  }
+  std::cout << std::endl;
+}
+
+template <int Layout>
+static void test_block_mapper_sanity()
+{
+  typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+  DSizes<Index, 2> tensor_dims(100, 100);
+
+  // Test uniform blocks.
+  TensorBlockMapper uniform_block_mapper(
+      tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()});
+
+  VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100);
+  VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100);
+
+  // 10x10 blocks
+  auto uniform_b0 = uniform_block_mapper.blockDescriptor(0);
+  VERIFY_IS_EQUAL(uniform_b0.dimensions().at(0), 10);
+  VERIFY_IS_EQUAL(uniform_b0.dimensions().at(1), 10);
+
+  // Test skewed to inner dims blocks.
+  TensorBlockMapper skewed_block_mapper(
+      tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()});
+
+  VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100);
+  VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100);
+
+  // 1x100 (100x1) rows/cols depending on a tensor layout.
+  auto skewed_b0 = skewed_block_mapper.blockDescriptor(0);
+  VERIFY_IS_EQUAL(skewed_b0.dimensions().at(0), choose(Layout, 100, 1));
+  VERIFY_IS_EQUAL(skewed_b0.dimensions().at(1), choose(Layout, 1, 100));
+}
+
+// Given a TensorBlock "visit" every element accessible though it, and a keep an
+// index in the visited set. Verify that every coeff accessed only once.
+template<int NumDims, int Layout>
+static void UpdateCoeffSet(
+    const DSizes<Index, NumDims>& tensor_strides,
+    const internal::TensorBlockDescriptor<NumDims>& block,
+    Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
+  const DSizes<Index, NumDims>& block_sizes = block.dimensions();
+
+  for (int i = 0; i < block_sizes[dim_index]; ++i) {
+    if (tensor_strides[dim_index] == 1) {
+      typedef std::pair<std::set<Index>::iterator, bool> ReturnType;
+      ReturnType inserted = visited_coeffs->insert(first_coeff_index + i);
+      VERIFY_IS_EQUAL(inserted.second, true);
+    } else {
+      int next_dim_index = dim_index + choose(Layout, -1, 1);
+      UpdateCoeffSet<NumDims, Layout>(tensor_strides, block, first_coeff_index,
+                                         next_dim_index, visited_coeffs);
+      first_coeff_index += tensor_strides[dim_index];
+    }
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_mapper_maps_every_element() {
+  typedef internal::TensorBlockMapper<NumDims, Layout> TensorBlockMapper;
+
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>();
+  DSizes<Index, NumDims> strides = internal::strides<Layout>(dims);
+
+  // Keep track of elements indices available via block access.
+  std::set<Index> coeff_set;
+
+  // Try different combinations of block types and sizes.
+  TensorBlockMapper block_mapper(
+      dims, {RandomShape(), RandomTargetSize(dims), zeroCost()});
+
+  for (int i = 0; i < block_mapper.blockCount(); ++i) {
+    auto block = block_mapper.blockDescriptor(i);
+    UpdateCoeffSet<NumDims, Layout>(strides, block, block.offset(),
+                                    choose(Layout, NumDims - 1, 0),
+                                    &coeff_set);
+  }
+
+  // Verify that every coefficient in the original Tensor is accessible through
+  // TensorBlock only once.
+  Index total_coeffs = dims.TotalSize();
+  VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
+  VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
+  VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+                         const array<Index, NumDims>& output_to_input_dim_map,
+                         const array<Index, NumDims>& input_strides,
+                         const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <int Layout, int NumDims>
+static array<Index, NumDims> ComputeStrides(
+    const array<Index, NumDims>& sizes) {
+  array<Index, NumDims> strides;
+  if (Layout == ColMajor) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * sizes[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * sizes[i + 1];
+    }
+  }
+  return strides;
+}
+
+template<typename Scalar, typename StorageIndex, int Dim>
+class EqualityChecker
+{
+    const Scalar* input_data;
+    const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides;
+    void check_recursive(const Scalar* input, const Scalar* output, int depth=0) const
+    {
+        if(depth==Dim)
+        {
+            VERIFY_IS_EQUAL(*input, *output);
+            return;
+        }
+
+        for(int i=0; i<output_dims[depth]; ++i)
+        {
+            check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1);
+        }
+    }
+public:
+    EqualityChecker(const Scalar* input_data_,
+            const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_,
+            const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_)
+        : input_data(input_data_)
+        , input_dims(input_dims_), input_strides(input_strides_)
+        , output_dims(output_dims_), output_strides(output_strides_)
+        {}
+
+    void operator()(const Scalar* output_data) const
+    {
+        check_recursive(input_data, output_data);
+    }
+};
+
+template <int Layout>
+static void test_uniform_block_shape()
+{
+  typedef internal::TensorBlockDescriptor<5> TensorBlock;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+  {
+    // Test shape 'UniformAllDims' with uniform 'max_coeff count'.
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    for (int i = 0; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // partially into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 6;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // fully into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // fully into first few inner-most dimensions.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 7 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 6 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(9, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_skewed_inner_dim_block_shape()
+{
+  typedef internal::TensorBlockDescriptor<5> TensorBlock;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+  // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(10, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to second inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(3, block.dimensions()[1]);
+    for (int i = 2; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(15, block.dimensions()[3]);
+    for (int i = 2; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to third inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    for (int i = 3; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    for (int i = 1; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
+{
+  // Test blocking of tensors with zero dimensions:
+  //  - we must not crash on asserts and divisions by zero
+  //  - we must not return block with zero dimensions
+  //    (recipe for overflows/underflows, divisions by zero and NaNs later)
+  //  - total block count must be zero
+  {
+    typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper;
+
+    DSizes<Index, 1> dims(0);
+    for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+      TensorBlockMapper block_mapper(
+          dims, {block_shape, max_coeff_count, zeroCost()});
+      VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+      VERIFY(block_mapper.blockTotalSize() >= 1);
+    }
+  }
+
+  {
+    typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+    for (int dim1 = 0; dim1 < 3; ++dim1) {
+      for (int dim2 = 0; dim2 < 3; ++dim2) {
+        DSizes<Index, 2> dims(dim1, dim2);
+        for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+          TensorBlockMapper block_mapper(
+              dims, {block_shape, max_coeff_count, zeroCost()});
+          if (dim1 * dim2 == 0) {
+            VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+          }
+          VERIFY(block_mapper.blockTotalSize() >= 1);
+        }
+      }
+    }
+  }
+}
+
+#define TEST_LAYOUTS(NAME) \
+  CALL_SUBTEST(NAME<ColMajor>()); \
+  CALL_SUBTEST(NAME<RowMajor>())
+
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME)    \
+  CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+  CALL_SUBTEST(NAME<ColMajor>(ARG)); \
+  CALL_SUBTEST(NAME<RowMajor>(ARG))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
+  TEST_LAYOUTS(test_block_mapper_sanity);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
+  TEST_LAYOUTS(test_uniform_block_shape);
+  TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+}
+
+#undef TEST_LAYOUTS
+#undef TEST_LAYOUTS_WITH_ARG

diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
new file mode 100644
index 0000000..b2e26eb
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp

@@ -0,0 +1,858 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+using Eigen::internal::TensorBlockDescriptor;
+using Eigen::internal::TensorExecutor;
+
+// -------------------------------------------------------------------------- //
+// Utility functions to generate random tensors, blocks, and evaluate them.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+  DSizes<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<Index>(min, max);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+// Block offsets and extents allows to construct a TensorSlicingOp corresponding
+// to a TensorBlockDescriptor.
+template <int NumDims>
+struct TensorBlockParams {
+  DSizes<Index, NumDims> offsets;
+  DSizes<Index, NumDims> sizes;
+  TensorBlockDescriptor<NumDims, Index> desc;
+};
+
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,
+                                              Index min, Index max) {
+  // Choose random offsets and sizes along all tensor dimensions.
+  DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max));
+  DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max));
+
+  // Make sure that offset + size do not overflow dims.
+  for (int i = 0; i < NumDims; ++i) {
+    offsets[i] = numext::mini(dims[i] - 1, offsets[i]);
+    sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]);
+  }
+
+  Index offset = 0;
+  DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims);
+  for (int i = 0; i < NumDims; ++i) {
+    offset += strides[i] * offsets[i];
+  }
+
+  return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)};
+}
+
+// Generate block with block sizes skewed towards inner dimensions. This type of
+// block is required for evaluating broadcast expressions.
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> SkewedInnerBlock(
+    DSizes<Index, NumDims> dims) {
+  using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
+  BlockMapper block_mapper(dims,
+                           {internal::TensorBlockShapeType::kSkewedInnerDims,
+                            internal::random<size_t>(1, dims.TotalSize()),
+                            {0, 0, 0}});
+
+  Index total_blocks = block_mapper.blockCount();
+  Index block_index = internal::random<Index>(0, total_blocks - 1);
+  auto block = block_mapper.blockDescriptor(block_index);
+  DSizes<Index, NumDims> sizes = block.dimensions();
+
+  auto strides = internal::strides<Layout>(dims);
+  DSizes<Index, NumDims> offsets;
+
+  // Compute offsets for the first block coefficient.
+  Index index = block.offset();
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / strides[i];
+      index -= idx * strides[i];
+      offsets[i] = idx;
+    }
+    if (NumDims > 0) offsets[0] = index;
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / strides[i];
+      index -= idx * strides[i];
+      offsets[i] = idx;
+    }
+    if (NumDims > 0) offsets[NumDims - 1] = index;
+  }
+
+  return {offsets, sizes, block};
+}
+
+template <int NumDims>
+static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) {
+  DSizes<Index, NumDims> offsets;
+  for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+
+  return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)};
+}
+
+inline Eigen::IndexList<Index, Eigen::type2index<1>> NByOne(Index n) {
+  Eigen::IndexList<Index, Eigen::type2index<1>> ret;
+  ret.set(0, n);
+  return ret;
+}
+inline Eigen::IndexList<Eigen::type2index<1>, Index> OneByM(Index m) {
+  Eigen::IndexList<Eigen::type2index<1>, Index> ret;
+  ret.set(1, m);
+  return ret;
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that block expression evaluation produces the same result as a
+// TensorSliceOp (reading a tensor block is same to taking a tensor slice).
+
+template <typename T, int NumDims, int Layout, typename Expression,
+          typename GenBlockParams>
+static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // Scratch memory allocator for block evaluation.
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  TensorBlockScratch scratch(d);
+
+  // TensorEvaluator is needed to produce tensor blocks of the expression.
+  auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d);
+  eval.evalSubExprsIfNeeded(nullptr);
+
+  // Choose a random offsets, sizes and TensorBlockDescriptor.
+  TensorBlockParams<NumDims> block_params = gen_block();
+
+  // Evaluate TensorBlock expression into a tensor.
+  Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
+
+  // Dimensions for the potential destination buffer.
+  DSizes<Index, NumDims> dst_dims;
+  if (internal::random<bool>()) {
+    dst_dims = block_params.desc.dimensions();
+  } else {
+    for (int i = 0; i < NumDims; ++i) {
+      Index extent = internal::random<Index>(0, 5);
+      dst_dims[i] = block_params.desc.dimension(i) + extent;
+    }
+  }
+
+  // Maybe use this tensor as a block desc destination.
+  Tensor<T, NumDims, Layout> dst(dst_dims);
+  dst.setZero();
+  if (internal::random<bool>()) {
+    block_params.desc.template AddDestinationBuffer<Layout>(
+        dst.data(), internal::strides<Layout>(dst.dimensions()));
+  }
+
+  const bool root_of_expr = internal::random<bool>();
+  auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr);
+
+  if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
+    // Copy data from destination buffer.
+    if (dimensions_match(dst.dimensions(), block.dimensions())) {
+      block = dst;
+    } else {
+      DSizes<Index, NumDims> offsets;
+      for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+      block = dst.slice(offsets, block.dimensions());
+    }
+
+  } else {
+    // Assign to block from expression.
+    auto b_expr = tensor_block.expr();
+
+    // We explicitly disable vectorization and tiling, to run a simple coefficient
+    // wise assignment loop, because it's very simple and should be correct.
+    using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
+    using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
+                                         internal::TiledEvaluation::Off>;
+    BlockExecutor::run(BlockAssign(block, b_expr), d);
+  }
+
+  // Cleanup temporary buffers owned by a tensor block.
+  tensor_block.cleanup();
+
+  // Compute a Tensor slice corresponding to a Tensor block.
+  Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions());
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate slice expression.
+  using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(slice, s_expr), d);
+
+  // Tensor block and tensor slice must be the same.
+  for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
+  }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Identity tensor expression transformation.
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_unary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_with_unary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      (lhs.square() + rhs.square()).sqrt(),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_broadcast() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5);
+
+  DSizes<Index, NumDims> bcasted_dims;
+  for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i];
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return RandomBlock<Layout>(bcasted_dims, 5, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); });
+
+  // Check that desc.destination() memory is not shared between two broadcast
+  // materializations.
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast) * input.abs().broadcast(bcast),
+      [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reshape() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+
+  DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reshape(shuffled),
+      [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reshape(shuffled),
+      [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_cast() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.template cast<int>().template cast<T>(),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_select() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims);
+  Tensor<T, NumDims, Layout> rhs(dims);
+  Tensor<bool, NumDims, Layout> cond(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+  cond.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 20);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_padding() {
+  const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4);
+  DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4);
+  array<std::pair<Index, Index>, NumDims> paddings;
+  for (int i = 0; i < NumDims; ++i) {
+    paddings[i] = std::make_pair(pad_before[i], pad_after[i]);
+  }
+
+  // Test squeezing reads from inner dim.
+  if (internal::random<bool>()) {
+    pad_before[inner_dim] = 0;
+    pad_after[inner_dim] = 0;
+    paddings[inner_dim] = std::make_pair(0, 0);
+  }
+
+  DSizes<Index, NumDims> padded_dims;
+  for (int i = 0; i < NumDims; ++i) {
+    padded_dims[i] = dims[i] + pad_before[i] + pad_after[i];
+  }
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return FixedSizeBlock(padded_dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  // Block buffer forwarding.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  // Block expression assignment.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.abs().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.abs().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+}
+
+
+template<typename T, int NumDims>
+struct SimpleTensorGenerator {
+  T operator()(const array<Index, NumDims>& coords) const {
+    T result = static_cast<T>(0);
+    for (int i = 0; i < NumDims; ++i) {
+      result += static_cast<T>((i + 1) * coords[i]);
+    }
+    return result;
+  }
+};
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<int NumDims>
+struct SimpleTensorGenerator<bool, NumDims> {
+  bool operator()(const array<Index, NumDims>& coords) const {
+    bool result = false;
+    for (int i = 0; i < NumDims; ++i) {
+      result ^= coords[i];
+    }
+    return result;
+  }
+};
+
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_generator() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  auto generator = SimpleTensorGenerator<T, NumDims>();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.generate(generator), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.generate(generator),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reverse() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Randomly reverse dimensions.
+  Eigen::DSizes<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 10);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Pick a random slice of an input tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+    break;
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_reshape_with_bcast() {
+  Index dim = internal::random<Index>(1, 100);
+
+  Tensor<T, 2, Layout> lhs(1, dim);
+  Tensor<T, 2, Layout> rhs(dim, 1);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  auto reshapeLhs = NByOne(dim);
+  auto reshapeRhs = OneByM(dim);
+
+  auto bcastLhs = OneByM(dim);
+  auto bcastRhs = NByOne(dim);
+
+  DSizes<Index, 2> dims(dim, dim);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      lhs.reshape(reshapeLhs).broadcast(bcastLhs) *
+          rhs.reshape(reshapeRhs).broadcast(bcastRhs),
+      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_forced_eval() {
+  Index dim = internal::random<Index>(1, 100);
+
+  Tensor<T, 2, Layout> lhs(dim, 1);
+  Tensor<T, 2, Layout> rhs(1, dim);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  auto bcastLhs = OneByM(dim);
+  auto bcastRhs = NByOne(dim);
+
+  DSizes<Index, 2> dims(dim, dim);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_chipping_of_bcast() {
+  if (Layout != static_cast<int>(RowMajor)) return;
+
+  Index dim0 = internal::random<Index>(1, 10);
+  Index dim1 = internal::random<Index>(1, 10);
+  Index dim2 = internal::random<Index>(1, 10);
+
+  Tensor<T, 3, Layout> input(1, dim1, dim2);
+  input.setRandom();
+
+  Eigen::array<Index, 3> bcast = {{dim0, 1, 1}};
+  DSizes<Index, 2> chipped_dims(dim0, dim2);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return SkewedInnerBlock<Layout, 2>(chipped_dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return RandomBlock<Layout, 2>(chipped_dims, 1, 5); });
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that assigning block to a Tensor expression produces the same result
+// as an assignment to TensorSliceOp (writing a block is is identical to
+// assigning one tensor to a slice of another tensor).
+
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+          typename Expression, typename GenBlockParams>
+static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
+                                  Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // We use tensor evaluator as a target for block and slice assignments.
+  auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
+
+  // Generate a random block, or choose a block that fits in full expression.
+  TensorBlockParams<NumExprDims> block_params = gen_block();
+
+  // Generate random data of the selected block size.
+  Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
+  block.setRandom();
+
+  // ************************************************************************ //
+  // (1) Assignment from a block.
+
+  // Construct a materialize block from a random generated block tensor.
+  internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
+      internal::TensorBlockKind::kView, block.data(), block.dimensions());
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Use evaluator to write block into a tensor.
+  eval.writeBlock(block_params.desc, blk);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> block_assigned = tensor;
+
+  // ************************************************************************ //
+  // (2) Assignment to a slice
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Assign block to a slice of original expression
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate slice expression.
+  using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(s_expr, block), d);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> slice_assigned = tensor;
+
+  for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i));
+  }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map, [&dims]() { return FixedSizeBlock(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_reshape() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return FixedSizeBlock(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); });
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  // Pick a random slice of tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME)           \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME)     \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME)       \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>()));  \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
+  // clang-format off
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+  // clang-format on
+}

diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
new file mode 100644
index 0000000..52f7dde
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_io.cpp

@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+// -------------------------------------------------------------------------- //
+// A set of tests for TensorBlockIO: copying data between tensor blocks.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+  DSizes<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<Index>(min, max);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+static internal::TensorBlockShapeType RandomBlockShape() {
+  return internal::random<bool>()
+         ? internal::TensorBlockShapeType::kUniformAllDims
+         : internal::TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+                           const array<Index, NumDims>& output_to_input_dim_map,
+                           const array<Index, NumDims>& input_strides,
+                           const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_data_from_source_to_target() {
+  using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Generate a random input Tensor.
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(dims);
+
+  // Construct a tensor block mapper.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(
+      dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (int i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, desc.offset());
+
+      TensorBlockIO::Copy(dst, src);
+    }
+
+    {
+      // Write from block buffer to output.
+      IODst dst(blk_dims, output_strides, output_data, desc.offset());
+      IOSrc src(blk_strides, block_data, 0);
+
+      TensorBlockIO::Copy(dst, src);
+    }
+  }
+
+  for (int i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_using_reordered_dimensions() {
+  // Generate a random input Tensor.
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Create a random dimension re-ordering/shuffle.
+  std::vector<int> shuffle;
+
+  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));
+
+  DSizes<Index, NumDims> output_tensor_dims;
+  DSizes<Index, NumDims> input_to_output_dim_map;
+  DSizes<Index, NumDims> output_to_input_dim_map;
+  for (Index i = 0; i < NumDims; ++i) {
+    output_tensor_dims[shuffle[i]] = dims[i];
+    input_to_output_dim_map[i] = shuffle[i];
+    output_to_input_dim_map[shuffle[i]] = i;
+  }
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(output_tensor_dims);
+
+  // Construct a tensor block mapper.
+  // NOTE: Tensor block mapper works with shuffled dimensions.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(output_tensor_dims,
+                                 {RandomBlockShape(),
+                                  RandomTargetBlockSize(output_tensor_dims),
+                                  {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(output_tensor_dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+        desc.offset(), output_to_input_dim_map, input_strides,
+        output_strides);
+
+    // NOTE: Block dimensions are in the same order as output dimensions.
+
+    using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+    using IODst = typename TensorBlockIO::Dst;
+    using IOSrc = typename TensorBlockIO::Src;
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, first_coeff_index);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+
+    {
+      // We need to convert block dimensions from output to input order.
+      auto dst_dims = blk_dims;
+      for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
+        dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
+      }
+
+      // Write from block buffer to output.
+      IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
+      IOSrc src(blk_strides, block_data, 0);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+  }
+
+  for (Index i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+// This is the special case for reading data with reordering, when dimensions
+// before/after reordering are the same. Squeezing reads along inner dimensions
+// in this case is illegal, because we reorder innermost dimension.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
+  DSizes<Index, 3> tensor_dims(7, 9, 7);
+  DSizes<Index, 3> block_dims = tensor_dims;
+
+  DSizes<int, 3> block_to_tensor_dim;
+  block_to_tensor_dim[0] = 2;
+  block_to_tensor_dim[1] = 1;
+  block_to_tensor_dim[2] = 0;
+
+  auto tensor_strides = internal::strides<Layout>(tensor_dims);
+  auto block_strides = internal::strides<Layout>(block_dims);
+
+  Tensor<float, 3, Layout> block(block_dims);
+  Tensor<float, 3, Layout> tensor(tensor_dims);
+  tensor.setRandom();
+
+  float* tensor_data = tensor.data();
+  float* block_data = block.data();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 3, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Read from a tensor into a block.
+  IODst dst(block_dims, block_strides, block_data, 0);
+  IOSrc src(tensor_strides, tensor_data, 0);
+
+  TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+  TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
+  TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+        float block_value = block_tensor(d2, d1, d0);
+        float tensor_value = tensor_tensor(d0, d1, d2);
+        VERIFY_IS_EQUAL(block_value, tensor_value);
+      }
+    }
+  }
+}
+
+// This is the special case for reading data with reordering, when dimensions
+// before/after reordering are the same. Squeezing reads in this case is allowed
+// because we reorder outer dimensions.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_squeeze() {
+  DSizes<Index, 4> tensor_dims(7, 5, 9, 9);
+  DSizes<Index, 4> block_dims = tensor_dims;
+
+  DSizes<int, 4> block_to_tensor_dim;
+  block_to_tensor_dim[0] = 0;
+  block_to_tensor_dim[1] = 1;
+  block_to_tensor_dim[2] = 3;
+  block_to_tensor_dim[3] = 2;
+
+  auto tensor_strides = internal::strides<Layout>(tensor_dims);
+  auto block_strides = internal::strides<Layout>(block_dims);
+
+  Tensor<float, 4, Layout> block(block_dims);
+  Tensor<float, 4, Layout> tensor(tensor_dims);
+  tensor.setRandom();
+
+  float* tensor_data = tensor.data();
+  float* block_data = block.data();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 4, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Read from a tensor into a block.
+  IODst dst(block_dims, block_strides, block_data, 0);
+  IOSrc src(tensor_strides, tensor_data, 0);
+
+  TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+  TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
+  TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+        for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
+          float block_value = block_tensor(d0, d1, d3, d2);
+          float tensor_value = tensor_tensor(d0, d1, d2, d3);
+          VERIFY_IS_EQUAL(block_value, tensor_value);
+        }
+      }
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_io_zero_stride() {
+  DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30);
+
+  DSizes<Index, 5> input_tensor_dims = rnd_dims;
+  input_tensor_dims[0] = 1;
+  input_tensor_dims[2] = 1;
+  input_tensor_dims[4] = 1;
+
+  Tensor<float, 5, Layout> input(input_tensor_dims);
+  input.setRandom();
+
+  DSizes<Index, 5> output_tensor_dims = rnd_dims;
+
+  auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims);
+  auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims);
+
+  auto input_tensor_strides_with_zeros = input_tensor_strides;
+  input_tensor_strides_with_zeros[0] = 0;
+  input_tensor_strides_with_zeros[2] = 0;
+  input_tensor_strides_with_zeros[4] = 0;
+
+  Tensor<float, 5, Layout> output(output_tensor_dims);
+  output.setRandom();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Write data from input to output with broadcasting in dims [0, 2, 4].
+  IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0);
+  IOSrc src(input_tensor_strides_with_zeros, input.data(), 0);
+  TensorBlockIO::Copy(dst, src);
+
+  for (int i = 0; i < output_tensor_dims[0]; ++i) {
+    for (int j = 0; j < output_tensor_dims[1]; ++j) {
+      for (int k = 0; k < output_tensor_dims[2]; ++k) {
+        for (int l = 0; l < output_tensor_dims[3]; ++l) {
+          for (int m = 0; m < output_tensor_dims[4]; ++m) {
+            float input_value = input(0, j, 0, l, 0);
+            float output_value = output(i, j, k, l, m);
+            VERIFY_IS_EQUAL(input_value, output_value);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_io_squeeze_ones() {
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Total size > 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
+    auto strides = internal::strides<Layout>(block_sizes);
+
+    // Create a random input tensor.
+    Tensor<float, 5> input(block_sizes);
+    input.setRandom();
+
+    Tensor<float, 5> output(block_sizes);
+
+    IODst dst(block_sizes, strides, output.data(), 0);
+    IOSrc src(strides, input.data());
+    TensorBlockIO::Copy(dst, src);
+
+    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+    }
+  }
+
+  // Total size == 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
+    auto strides = internal::strides<Layout>(block_sizes);
+
+    // Create a random input tensor.
+    Tensor<float, 5> input(block_sizes);
+    input.setRandom();
+
+    Tensor<float, 5> output(block_sizes);
+
+    IODst dst(block_sizes, strides, output.data(), 0);
+    IOSrc src(strides, input.data());
+    TensorBlockIO::Copy(dst, src);
+
+    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+    }
+  }
+}
+
+#define CALL_SUBTESTS(NAME)                   \
+  CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_io) {
+  // clang-format off
+  CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target);
+  CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions);
+
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>());
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>());
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_zero_stride<RowMajor>());
+  CALL_SUBTEST(test_block_io_zero_stride<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>());
+  CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>());
+  // clang-format on
+}

diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
new file mode 100644
index 0000000..20f84b8
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp

@@ -0,0 +1,144 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){
+
+  // BROADCAST test:
+  IndexType inDim1=2;
+  IndexType inDim2=3;
+  IndexType inDim3=5;
+  IndexType inDim4=7;
+  IndexType bDim1=2;
+  IndexType bDim2=3;
+  IndexType bDim3=1;
+  IndexType bDim4=4;
+  array<IndexType, 4> in_range   = {{inDim1, inDim2, inDim3, inDim4}};
+  array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+  array<IndexType, 4> out_range;  // = in_range * broadcasts
+  for (size_t i = 0; i < out_range.size(); ++i)
+    out_range[i] = in_range[i] * broadcasts[i];
+
+  Tensor<DataType, 4, DataLayout, IndexType>  input(in_range);
+  Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
+
+  for (size_t i = 0; i < in_range.size(); ++i)
+    VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+  for (IndexType i = 0; i < input.size(); ++i)
+    input(i) = static_cast<DataType>(i);
+
+  DataType * gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data  = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+    for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+      for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+        for (IndexType l = 0; l < inDim4*bDim4; ++l) {
+          VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
+        }
+      }
+    }
+  }
+  printf("Broadcast Test with fixed size Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+
+  // BROADCAST test:
+  IndexType inDim1=2;
+  IndexType inDim2=3;
+  IndexType inDim3=5;
+  IndexType inDim4=7;
+  IndexType bDim1=2;
+  IndexType bDim2=3;
+  IndexType bDim3=1;
+  IndexType bDim4=4;
+  array<IndexType, 4> in_range   = {{inDim1, inDim2, inDim3, inDim4}};
+  array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+  array<IndexType, 4> out_range;  // = in_range * broadcasts
+  for (size_t i = 0; i < out_range.size(); ++i)
+    out_range[i] = in_range[i] * broadcasts[i];
+
+  Tensor<DataType, 4, DataLayout, IndexType>  input(in_range);
+  Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
+
+  for (size_t i = 0; i < in_range.size(); ++i)
+    VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+  for (IndexType i = 0; i < input.size(); ++i)
+    input(i) = static_cast<DataType>(i);
+
+  DataType * gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data  = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>>  gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+    for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+      for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+        for (IndexType l = 0; l < inDim4*bDim4; ++l) {
+          VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l));
+        }
+      }
+    }
+  }
+  printf("Broadcast Test Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
new file mode 100644
index 0000000..cbd92c3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp

@@ -0,0 +1,349 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_simple_broadcasting()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_broadcast;
+  no_broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 1;
+  broadcasts[3] = 4;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 4);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 28);
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 28; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_vectorized_broadcasting()
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 4;
+
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  tensor.resize(11,3,5);
+#else
+  array<Index, 3> new_dims;
+  new_dims[0] = 11;
+  new_dims[1] = 3;
+  new_dims[2] = 5;
+  tensor.resize(new_dims);
+#endif
+
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_static_broadcasting()
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+
+#if defined(EIGEN_HAS_INDEX_LIST)
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
+#else
+  Eigen::array<int, 3> broadcasts;
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 4;
+#endif
+
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  tensor.resize(11,3,5);
+#else
+  array<Index, 3> new_dims;
+  new_dims[0] = 11;
+  new_dims[1] = 3;
+  new_dims[2] = 5;
+  tensor.resize(new_dims);
+#endif
+
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_fixed_size_broadcasting()
+{
+  // Need to add a [] operator to the Size class for this to work
+#if 0
+  Tensor<float, 1, DataLayout> t1(10);
+  t1.setRandom();
+  TensorFixedSize<float, Sizes<1>, DataLayout> t2;
+  t2 = t2.constant(20.0f);
+
+  Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
+  }
+
+  TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}});
+  Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t5(i), t1(i) + t2(0));
+  }
+#endif
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n()
+{
+  Tensor<float, 4, DataLayout> tensor(1,13,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 9;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 9; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_n_by_one()
+{
+  Tensor<float, 4, DataLayout> tensor(7,3,5,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_size_one_broadcasting()
+{
+  Tensor<float, 1, DataLayout> tensor(1);
+  tensor.setRandom();
+  array<ptrdiff_t, 1> broadcasts = {64};
+  Tensor<float, 1, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), broadcasts[0]);
+
+  for (int i = 0; i < broadcasts[0]; ++i) {
+    VERIFY_IS_EQUAL(tensor(0), broadcast(i));
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_1d()
+{
+  Tensor<float, 3, DataLayout> tensor(1,7,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 13;
+  Tensor<float, 3, DataLayout> broadcasted;
+  broadcasted = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_2d()
+{
+  Tensor<float, 4, DataLayout> tensor(1,7,13,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
+{
+  CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_static_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_static_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
+  CALL_SUBTEST(test_size_one_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_size_one_broadcasting<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
new file mode 100644
index 0000000..72cb62f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp

@@ -0,0 +1,354 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
+template <typename T> T square(T x) { return x * x; }
+template <typename T> T cube(T x) { return x * x * x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
+}
+}
+
+struct EqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
+
+template <typename DataType, int DataLayout,
+          typename Assignement, typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+                                    const array<int64_t, 3>& tensor_range) {
+  Operator op;
+  Assignement asgn;
+  {
+    /* Assignement(out, Operator(in)) */
+    Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    in = in.random() + DataType(0.01);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data = static_cast<DataType *>(
+        sycl_device.allocate(in.size() * sizeof(DataType)));
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                   (in.size()) * sizeof(DataType));
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(in(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data);
+    sycl_device.deallocate(gpu_data_out);
+  }
+  {
+    /* Assignement(out, Operator(out)) */
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu_out));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(reference(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data_out);
+  }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC)                                 \
+  struct op_##FUNC {                                               \
+    template <typename T>                                          \
+    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {   \
+      return cl::sycl::FUNC(x);                                    \
+    }                                                              \
+    template <typename T>                                          \
+    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+      return x.FUNC();                                             \
+    }                                                              \
+  };
+
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
+
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+                                         const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+  test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+                                 op_##FUNC>(sycl_device, tensor_range)
+  RUN_UNARY_TEST(abs);
+  RUN_UNARY_TEST(sqrt);
+  RUN_UNARY_TEST(rsqrt);
+  RUN_UNARY_TEST(square);
+  RUN_UNARY_TEST(cube);
+  RUN_UNARY_TEST(inverse);
+  RUN_UNARY_TEST(tanh);
+  RUN_UNARY_TEST(exp);
+  RUN_UNARY_TEST(expm1);
+  RUN_UNARY_TEST(log);
+  RUN_UNARY_TEST(ceil);
+  RUN_UNARY_TEST(floor);
+  RUN_UNARY_TEST(round);
+  RUN_UNARY_TEST(log1p);
+  RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+  Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+  in = in.random() + DataType(0.01);
+  DataType *gpu_data = static_cast<DataType *>(
+      sycl_device.allocate(in.size() * sizeof(DataType)));
+  bool *gpu_data_out =
+      static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+  TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                 (in.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(bool));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_EQUAL(out(i), op(in(i)));
+  }
+  sycl_device.deallocate(gpu_data);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+                         const array<int64_t, 3>& tensor_range) {
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      PlusEqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      EqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isnan>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isfinite>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isinf>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+
+  test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+                               const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, in_2) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random() + DataType(0.01);
+  in_2 = in_2.random() + DataType(0.01);
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_2 = static_cast<DataType *>(
+      sycl_device.allocate(in_2.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+                                 (in_2.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_2);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, 2) */
+  Operator op;
+  const DataType arg2(2);
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random();
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, arg2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+#define DECLARE_BINARY_STRUCT(FUNC)                                                          \
+  struct op_##FUNC {                                                                         \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) {            \
+      return cl::sycl::FUNC(x, y);                                                           \
+    }                                                                                        \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+      return x.FUNC(y);                                                                      \
+    }                                                                                        \
+  };
+
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
+
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR)                          \
+  struct op_##NAME {                                                      \
+    template <typename T1, typename T2>                                   \
+    auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+      return x OPERATOR y;                                                \
+    }                                                                     \
+  };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+                          const array<int64_t, 3>& tensor_range) {
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_cwiseMax>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_cwiseMin>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_plus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_minus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_times>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins_fixed_arg2<DataType, RowMajor,
+                                  op_modulo>(sycl_device, tensor_range);
+  test_binary_builtins_fixed_arg2<DataType, ColMajor,
+                                  op_modulo>(sycl_device, tensor_range);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(device);
+    Eigen::SyclDevice sycl_device(&queueInterface);
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
new file mode 100644
index 0000000..97923d1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu

@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  Tensor<float, 1> floats(num_elem);
+  floats.setRandom();
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
+
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+
+void test_fallback_conversion() {
+  int num_elem = 101;
+  Tensor<float, 1> floats(num_elem);
+  floats.setRandom();
+
+  Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
+  Eigen::Tensor<float, 1> conv = halfs.cast<float>();
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(floats(i), conv(i));
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
+{
+  CALL_SUBTEST(test_gpu_conversion());
+  CALL_SUBTEST(test_fallback_conversion());
+}

diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
new file mode 100644
index 0000000..45456f3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_casts.cpp

@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "random_without_cast_overflow.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+static void test_simple_cast()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+  Tensor<char, 2> chartensor(20,30);
+  chartensor.setRandom();
+  Tensor<std::complex<float>, 2> cplextensor(20,30);
+  cplextensor.setRandom();
+
+  chartensor = ftensor.cast<char>();
+  cplextensor = ftensor.cast<std::complex<float> >();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_vectorized_cast()
+{
+  Tensor<int, 2> itensor(20,30);
+  itensor = itensor.random() / 1000;
+  Tensor<float, 2> ftensor(20,30);
+  ftensor.setRandom();
+  Tensor<double, 2> dtensor(20,30);
+  dtensor.setRandom();
+
+  ftensor = itensor.cast<float>();
+  dtensor = itensor.cast<double>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(itensor(i,j), static_cast<int>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_float_to_int_cast()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 1000.0f;
+  Tensor<double, 2> dtensor(20,30);
+  dtensor = dtensor.random() * 1000.0;
+
+  Tensor<int, 2> i1tensor = ftensor.cast<int>();
+  Tensor<int, 2> i2tensor = dtensor.cast<int>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(i1tensor(i,j), static_cast<int>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(i2tensor(i,j), static_cast<int>(dtensor(i,j)));
+    }
+  }
+}
+
+
+static void test_big_to_small_type_cast()
+{
+  Tensor<double, 2> dtensor(20, 30);
+  dtensor.setRandom();
+  Tensor<float, 2> ftensor(20, 30);
+  ftensor = dtensor.cast<float>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_small_to_big_type_cast()
+{
+  Tensor<float, 2> ftensor(20, 30);
+  ftensor.setRandom();
+  Tensor<double, 2> dtensor(20, 30);
+  dtensor = ftensor.cast<double>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+template <typename FromType, typename ToType>
+static void test_type_cast() {
+  Tensor<FromType, 2> ftensor(100, 200);
+  // Generate random values for a valid cast.
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 200; ++j) {
+      ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value();
+    }
+  }
+
+  Tensor<ToType, 2> ttensor(100, 200);
+  ttensor = ftensor.template cast<ToType>();
+
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 200; ++j) {
+      const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j));
+      VERIFY_IS_APPROX(ttensor(i, j), ref);
+    }
+  }
+}
+
+template<typename Scalar, typename EnableIf = void>
+struct test_cast_runner {
+  static void run() {
+    test_type_cast<Scalar, bool>();
+    test_type_cast<Scalar, int8_t>();
+    test_type_cast<Scalar, int16_t>();
+    test_type_cast<Scalar, int32_t>();
+    test_type_cast<Scalar, int64_t>();
+    test_type_cast<Scalar, uint8_t>();
+    test_type_cast<Scalar, uint16_t>();
+    test_type_cast<Scalar, uint32_t>();
+    test_type_cast<Scalar, uint64_t>();
+    test_type_cast<Scalar, half>();
+    test_type_cast<Scalar, bfloat16>();
+    test_type_cast<Scalar, float>();
+    test_type_cast<Scalar, double>();
+    test_type_cast<Scalar, std::complex<float>>();
+    test_type_cast<Scalar, std::complex<double>>();
+  }
+};
+
+// Only certain types allow cast from std::complex<>.
+template<typename Scalar>
+struct test_cast_runner<Scalar, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> {
+  static void run() {
+    test_type_cast<Scalar, half>();
+    test_type_cast<Scalar, bfloat16>();
+    test_type_cast<Scalar, std::complex<float>>();
+    test_type_cast<Scalar, std::complex<double>>();
+  }
+};
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_casts)
+{
+  CALL_SUBTEST(test_simple_cast());
+  CALL_SUBTEST(test_vectorized_cast());
+  CALL_SUBTEST(test_float_to_int_cast());
+  CALL_SUBTEST(test_big_to_small_type_cast());
+  CALL_SUBTEST(test_small_to_big_type_cast());
+
+  CALL_SUBTEST(test_cast_runner<bool>::run());
+  CALL_SUBTEST(test_cast_runner<int8_t>::run());
+  CALL_SUBTEST(test_cast_runner<int16_t>::run());
+  CALL_SUBTEST(test_cast_runner<int32_t>::run());
+  CALL_SUBTEST(test_cast_runner<int64_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint8_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint16_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint32_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint64_t>::run());
+  CALL_SUBTEST(test_cast_runner<half>::run());
+  CALL_SUBTEST(test_cast_runner<bfloat16>::run());
+  CALL_SUBTEST(test_cast_runner<float>::run());
+  CALL_SUBTEST(test_cast_runner<double>::run());
+  CALL_SUBTEST(test_cast_runner<std::complex<float>>::run());
+  CALL_SUBTEST(test_cast_runner<std::complex<double>>::run());
+
+}

diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
new file mode 100644
index 0000000..9222744
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_chipping.cpp

@@ -0,0 +1,425 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_chip()
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.template chip<0>(1);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_dynamic_chip()
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.chip(1, 0);
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_chip_in_expr() {
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
+  input1.setRandom();
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
+  input2.setRandom();
+
+  Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          float expected = input1(0,i,j,k,l) + input2(i,j,k,l);
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected);
+        }
+      }
+    }
+  }
+
+  Tensor<float, 3, DataLayout> input3(3,7,11);
+  input3.setRandom();
+  Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        float expected = input1(0,i,2,j,k) + input3(i,j,k);
+        VERIFY_IS_EQUAL(result2(i,j,k), expected);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_chip_as_lvalue()
+{
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
+  input1.setRandom();
+
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
+  input2.setRandom();
+  Tensor<float, 5, DataLayout> tensor = input1;
+  tensor.template chip<0>(1) = input2;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (i != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input3(2,5,7,11);
+  input3.setRandom();
+  tensor = input1;
+  tensor.template chip<1>(1) = input3;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (j != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input4(2,3,7,11);
+  input4.setRandom();
+  tensor = input1;
+  tensor.template chip<2>(3) = input4;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (k != 3) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input5(2,3,5,11);
+  input5.setRandom();
+  tensor = input1;
+  tensor.template chip<3>(4) = input5;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (l != 4) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> input6(2,3,5,7);
+  input6.setRandom();
+  tensor = input1;
+  tensor.template chip<4>(5) = input6;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (m != 5) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Tensor<float, 5, DataLayout> input7(2,3,5,7,11);
+  input7.setRandom();
+  tensor = input1;
+  tensor.chip(0, 0) = input7.chip(0, 0);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          for (int m = 0; m < 11; ++m) {
+            if (i != 0) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void test_chip_raw_data_col_major()
+{
+  Tensor<float, 5, ColMajor> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4;
+  auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice());
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
+        }
+      }
+    }
+  }
+
+  typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0;
+  auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
+}
+
+static void test_chip_raw_data_row_major()
+{
+  Tensor<float, 5, RowMajor> tensor(11,7,5,3,2);
+  tensor.setRandom();
+
+  typedef TensorEvaluator<decltype(tensor.chip<0>(3)), DefaultDevice> Evaluator0;
+  auto chip = Evaluator0(tensor.chip<0>(3), DefaultDevice());
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          int chip_index = l + 2 * (k + 3 * (j + 5 * i));
+          VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(3,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<4>(0)), DefaultDevice> Evaluator4;
+  auto chip4 = Evaluator4(tensor.chip<4>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping)
+{
+  CALL_SUBTEST(test_simple_chip<ColMajor>());
+  CALL_SUBTEST(test_simple_chip<RowMajor>());
+  CALL_SUBTEST(test_dynamic_chip<ColMajor>());
+  CALL_SUBTEST(test_dynamic_chip<RowMajor>());
+  CALL_SUBTEST(test_chip_in_expr<ColMajor>());
+  CALL_SUBTEST(test_chip_in_expr<RowMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
+  CALL_SUBTEST(test_chip_raw_data_col_major());
+  CALL_SUBTEST(test_chip_raw_data_row_major());
+}

diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
new file mode 100644
index 0000000..1e70931
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp

@@ -0,0 +1,623 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+  tensor.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l);
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+  VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim2; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+  const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l);
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+  const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+  DataType* gpu_data_chip3  = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+  gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l);
+  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+  const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+  DataType* gpu_data_chip4  = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+  gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l);
+  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+        }
+      }
+    }
+  }
+
+
+  array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+  const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+  DataType* gpu_data_chip5  = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+  gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l);
+  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_chip2);
+  sycl_device.deallocate(gpu_data_chip3);
+  sycl_device.deallocate(gpu_data_chip4);
+  sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+  tensor.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l);
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+  VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim2; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+  const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l);
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+  const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+  DataType* gpu_data_chip3  = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+  gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l);
+  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+  const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+  DataType* gpu_data_chip4  = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+  gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l);
+  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+        }
+      }
+    }
+  }
+
+
+  array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+  const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+  DataType* gpu_data_chip5  = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+  gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l);
+  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_chip2);
+  sycl_device.deallocate(gpu_data_chip3);
+  sycl_device.deallocate(gpu_data_chip4);
+  sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) {
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange);
+  tensor.setRandom();
+  tensor1.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+  DataType* gpu_data_tensor1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  for (int i = 0; i < sizeDim2; ++i) {
+    for (int j = 0; j < sizeDim3; ++j) {
+      for (int k = 0; k < sizeDim4; ++k) {
+        for (int l = 0; l < sizeDim5; ++l) {
+          float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l);
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), expected);
+        }
+      }
+    }
+  }
+
+  array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange);
+  Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange);
+  tensor2.setRandom();
+  const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType);
+  DataType* gpu_data_tensor2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
+  gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  for (int i = 0; i < sizeDim2; ++i) {
+    for (int j = 0; j < sizeDim4; ++j) {
+      for (int k = 0; k < sizeDim5; ++k) {
+        float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k);
+        VERIFY_IS_EQUAL(chip2(i,j,k), expected);
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_tensor1);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_tensor2);
+  sycl_device.deallocate(gpu_data_chip2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange);
+  input1.setRandom();
+  input2.setRandom();
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
+  std::cout << tensorBuffSize << " , "<<  input2TensorBuffSize << std::endl;
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_input1  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_input2  = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
+  gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (i != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange);
+  input3.setRandom();
+
+  const size_t input3TensorBuffSize =input3.size()*sizeof(DataType);
+  DataType* gpu_data_input3  = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
+  gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (j != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange);
+  input4.setRandom();
+
+  const size_t input4TensorBuffSize =input4.size()*sizeof(DataType);
+  DataType* gpu_data_input4  = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
+  gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (k != 3) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange);
+  input5.setRandom();
+
+  const size_t input5TensorBuffSize =input5.size()*sizeof(DataType);
+  DataType* gpu_data_input5  = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
+  gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (l != 4) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
+            }
+          }
+        }
+      }
+    }
+  }
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange);
+  input6.setRandom();
+
+  const size_t input6TensorBuffSize =input6.size()*sizeof(DataType);
+  DataType* gpu_data_input6  = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
+  gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (m != 5) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange);
+  input7.setRandom();
+
+  DataType* gpu_data_input7  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
+  gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l);
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (i != 0) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_input1);
+  sycl_device.deallocate(gpu_data_input2);
+  sycl_device.deallocate(gpu_data_input3);
+  sycl_device.deallocate(gpu_data_input4);
+  sycl_device.deallocate(gpu_data_input5);
+  sycl_device.deallocate(gpu_data_input6);
+  sycl_device.deallocate(gpu_data_input7);
+
+}
+
+template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/
+  test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
new file mode 100644
index 0000000..1a18e07
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp

@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_orderings()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<bool, 3> lt(2,3,7);
+  Tensor<bool, 3> le(2,3,7);
+  Tensor<bool, 3> gt(2,3,7);
+  Tensor<bool, 3> ge(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  lt = mat1 < mat2;
+  le = mat1 <= mat2;
+  gt = mat1 > mat2;
+  ge = mat1 >= mat2;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k));
+        VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k));
+        VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k));
+        VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_equality()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        if (internal::random<bool>()) {
+          mat2(i,j,k) = mat1(i,j,k);
+        }
+      }
+    }
+  }
+
+  Tensor<bool, 3> eq(2,3,7);
+  Tensor<bool, 3> ne(2,3,7);
+  eq = (mat1 == mat2);
+  ne = (mat1 != mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k));
+        VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_comparisons)
+{
+  CALL_SUBTEST(test_orderings());
+  CALL_SUBTEST(test_equality());
+}

diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
new file mode 100644
index 0000000..99447b2
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu

@@ -0,0 +1,102 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename T>
+void test_cuda_complex_cwise_ops() {
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
+      d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add = 0,
+    Sub,
+    Mul,
+    Div,
+    Neg,
+    NbOps
+  };
+
+  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
+  for (int op = Add; op < NbOps; op++) {
+    std::complex<T> expected;
+    switch (static_cast<CwiseOp>(op)) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+      case Neg:
+        gpu_out.device(gpu_device) = -gpu_in1;
+        expected = -a;
+        break;
+      case NbOps:
+        break;
+    }
+    assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (int i = 0; i < kNumItems; ++i) {
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
+{
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
+}

diff --git a/unsupported/test/cxx11_tensor_complex_gpu.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu
new file mode 100644
index 0000000..f8b8ae7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_gpu.cu

@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {
+  Tensor<std::complex<float>, 1, 0, int> in1(2);
+  Tensor<std::complex<float>, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t float_bytes = in1.size() * sizeof(float);
+  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
+
+  std::complex<float>* d_in1;
+  std::complex<float>* d_in2;
+  float* d_out2;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out2), float_bytes);
+  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
+      d_out2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
+  gpu_out2.device(gpu_device) = gpu_in2.abs();
+
+  Tensor<std::complex<float>, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
+    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out2);
+}
+
+
+static void test_cuda_sum_reductions() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.sum();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+static void test_cuda_mean_reductions() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.mean();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.mean();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+static void test_cuda_product_reductions() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.prod();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.prod();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
+{
+  CALL_SUBTEST(test_cuda_nullary());
+  CALL_SUBTEST(test_cuda_sum_reductions());
+  CALL_SUBTEST(test_cuda_mean_reductions());
+  CALL_SUBTEST(test_cuda_product_reductions());
+}

diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
new file mode 100644
index 0000000..bb9418d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp

@@ -0,0 +1,143 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_dimension_failures()
+{
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(3, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  // Okay; other dimensions are equal.
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+
+  // Dimension mismatches.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2));
+
+  // Axis > NumDims or < 0.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1));
+}
+
+template<int DataLayout>
+static void test_static_dimension_failure()
+{
+  Tensor<int, 2, DataLayout> left(2, 3);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
+
+#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
+  // Technically compatible, but we static assert that the inputs have same
+  // NumDims.
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+#endif
+
+  // This can be worked around in this case.
+  Tensor<int, 3, DataLayout> concatenation = left
+      .reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
+      .concatenate(right, 0);
+  Tensor<int, 2, DataLayout> alternative = left
+   // Clang compiler break with {{{}}} with an ambiguous error on copy constructor
+  // the variadic DSize constructor added for #ifndef EIGEN_EMULATE_CXX11_META_H.
+  // Solution:
+  // either the code should change to 
+  //  Tensor<int, 2>::Dimensions{{2, 3}}
+  // or Tensor<int, 2>::Dimensions{Tensor<int, 2>::Dimensions{{2, 3}}}
+      .concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0);
+}
+
+template<int DataLayout>
+static void test_simple_concatenation()
+{
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 4);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
+  for (int j = 0; j < 3; ++j) {
+    for (int i = 0; i < 2; ++i) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+    }
+    for (int i = 2; i < 4; ++i) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0));
+    }
+  }
+
+  concatenation = left.concatenate(right, 1);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 6);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+    }
+    for (int j = 3; j < 6; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0));
+    }
+  }
+
+  concatenation = left.concatenate(right, 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+      VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0));
+    }
+  }
+}
+
+
+// TODO(phli): Add test once we have a real vectorized implementation.
+// static void test_vectorized_concatenation() {}
+
+static void test_concatenation_as_lvalue()
+{
+  Tensor<int, 2> t1(2, 3);
+  Tensor<int, 2> t2(2, 3);
+  t1.setRandom();
+  t2.setRandom();
+
+  Tensor<int, 2> result(4, 3);
+  result.setRandom();
+  t1.concatenate(t2, 0) = result;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(t1(i, j), result(i, j));
+      VERIFY_IS_EQUAL(t2(i, j), result(i+2, j));
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation)
+{
+   CALL_SUBTEST(test_dimension_failures<ColMajor>());
+   CALL_SUBTEST(test_dimension_failures<RowMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<ColMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<RowMajor>());
+   CALL_SUBTEST(test_simple_concatenation<ColMajor>());
+   CALL_SUBTEST(test_simple_concatenation<RowMajor>());
+   // CALL_SUBTEST(test_vectorized_concatenation());
+   CALL_SUBTEST(test_concatenation_as_lvalue());
+
+}

diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
new file mode 100644
index 0000000..765991b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp

@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType leftDim1 = 2;
+  IndexType leftDim2 = 3;
+  IndexType leftDim3 = 1;
+  Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
+  IndexType rightDim1 = 2;
+  IndexType rightDim2 = 3;
+  IndexType rightDim3 = 1;
+  Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
+
+  //IndexType concatDim1 = 3;
+//	IndexType concatDim2 = 3;
+//	IndexType concatDim3 = 1;
+  //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
+  Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
+  left.setRandom();
+  right.setRandom();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+  ///
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
+  DataType * gpu_out_data1 =  static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
+
+  //concatenation = left.concatenate(right, 0);
+  gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
+  sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
+  VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
+  for (IndexType j = 0; j < 3; ++j) {
+    for (IndexType i = 0; i < 2; ++i) {
+      VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
+    }
+    for (IndexType i = 2; i < 4; ++i) {
+      VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
+    }
+  }
+
+  sycl_device.deallocate(gpu_out_data1);
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
+  DataType * gpu_out_data2 =  static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
+  gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
+  sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
+
+  //concatenation = left.concatenate(right, 1);
+  VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
+  VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
+    }
+    for (IndexType j = 3; j < 6; ++j) {
+      VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
+    }
+  }
+  sycl_device.deallocate(gpu_out_data2);
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
+  DataType * gpu_out_data3 =  static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
+  gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
+  sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
+
+  //concatenation = left.concatenate(right, 2);
+  VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
+      VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
+    }
+  }
+  sycl_device.deallocate(gpu_out_data3);
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+}
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType leftDim1 = 2;
+  IndexType leftDim2 = 3;
+  Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
+
+  IndexType rightDim1 = 2;
+  IndexType rightDim2 = 3;
+  Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
+
+  IndexType concatDim1 = 4;
+  IndexType concatDim2 = 3;
+  Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
+  Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
+
+  left.setRandom();
+  right.setRandom();
+  result.setRandom();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
+
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
+
+//  t1.concatenate(t2, 0) = result;
+ gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out;
+ sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(left(i, j), result(i, j));
+      VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+
+template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
+  test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
new file mode 100644
index 0000000..9d806ee
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_const.cpp

@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+using Eigen::Tensor;
+
+
+static void test_simple_assign()
+{
+  Tensor<int, 3> random(2,3,7);
+  random.setRandom();
+
+  TensorMap<Tensor<const int, 3> > constant(random.data(), 2, 3, 7);
+  Tensor<int, 3> result(2,3,7);
+  result = constant;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_assign_of_const_tensor()
+{
+  Tensor<int, 3> random(2,3,7);
+  random.setRandom();
+
+  TensorMap<Tensor<const int, 3> > constant1(random.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3> > constant2(random.data(), 2, 3, 7);
+  const TensorMap<Tensor<int, 3> > constant3(random.data(), 2, 3, 7);
+
+  Tensor<int, 2> result1 = constant1.chip(0, 2);
+  Tensor<int, 2> result2 = constant2.chip(0, 2);
+  Tensor<int, 2> result3 = constant3.chip(0, 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL((result1(i,j)), random(i,j,0));
+      VERIFY_IS_EQUAL((result2(i,j)), random(i,j,0));
+      VERIFY_IS_EQUAL((result3(i,j)), random(i,j,0));
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_const)
+{
+  CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_assign_of_const_tensor());
+}

diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
new file mode 100644
index 0000000..575bdc1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu

@@ -0,0 +1,218 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_gpu_contraction(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
+
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 0, DataLayout> t_result;
+  Tensor<float, 0, DataLayout> t_result_gpu;
+  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
+
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, m_size, k_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, k_size, n_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+      gpu_t_result(d_t_result);
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
+  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+    std::cout << "mismatch detected: " << t_result()
+              << " vs " <<  t_result_gpu() << std::endl;
+    assert(false);
+  }
+
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_gpu_contraction_m() {
+  for (int k = 32; k < 256; k++) {
+    test_gpu_contraction<ColMajor>(k, 128, 128);
+    test_gpu_contraction<RowMajor>(k, 128, 128);
+  }
+}
+
+template<int DataLayout>
+void test_gpu_contraction_k() {
+  for (int k = 32; k < 256; k++) {
+    test_gpu_contraction<ColMajor>(128, k, 128);
+    test_gpu_contraction<RowMajor>(128, k, 128);
+  }
+}
+
+template<int DataLayout>
+void test_gpu_contraction_n() {
+  for (int k = 32; k < 256; k++) {
+    test_gpu_contraction<ColMajor>(128, 128, k);
+    test_gpu_contraction<RowMajor>(128, 128, k);
+  }
+}
+
+
+template<int DataLayout>
+void test_gpu_contraction_sizes() {
+  int m_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257 , 511,
+                   512, 513, 1023, 1024, 1025};
+
+  int n_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257,  511,
+                   512, 513, 1023, 1024, 1025};
+
+  int k_sizes[] = {  31,   39,  63,  64,   65,
+                     95,   96, 127, 129,  255,
+                    257,  511, 512, 513, 1023,
+                   1024, 1025};
+
+  for (int i = 0; i < 15; i++) {
+    for (int j = 0; j < 15; j++) {
+      for (int k = 0; k < 17; k++) {
+        test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
+
+  CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
+
+  CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
+
+  CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
+
+  CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
+
+#if !defined(EIGEN_USE_HIP)
+// disable these subtests for HIP
+  CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif	
+}

diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp
new file mode 100644
index 0000000..fbcc293
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp

@@ -0,0 +1,1026 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+#include <iostream>
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
+                                  IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_m(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
+                                                           128);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_k(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
+                                                           128);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_n(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
+                                                           128, k);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_sizes(const Device &sycl_device) {
+  IndexType m_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
+                         257, 511, 512, 513, 1023, 1024, 1025};
+
+  IndexType n_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
+                         257, 511, 512, 513, 1023, 1024, 1025};
+
+  IndexType k_sizes[] = {31,  39,  63,  64,  65,  95,   96,   127, 129,
+                         255, 257, 511, 512, 513, 1023, 1024, 1025};
+
+  for (IndexType i = 0; i < 15; i++) {
+    for (IndexType j = 0; j < 15; j++) {
+      for (IndexType k = 0; k < 17; k++) {
+        test_sycl_contraction<DataLayout, DataType, IndexType>(
+            sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
+      }
+    }
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
+                                  IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Allocate buffers twice as big to check for invalid read and write
+  auto padded_left_size = 2 * t_left.size();
+  auto padded_right_size = 2 * t_right.size();
+  auto padded_result_size = 2 * t_result.size();
+
+  std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
+  std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
+  std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  // TensorMaps are still of the same size than the Tensors
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  // Write nan after the actual buffer to propagate nans everywhere in case of
+  // invalid reads
+  DataType nan = std::numeric_limits<DataType>::quiet_NaN();
+  auto host_left_data = new DataType[padded_left_size];
+  std::copy_n(t_left.data(), t_left.size(), host_left_data);
+  std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
+  auto host_right_data = new DataType[padded_right_size];
+  std::copy_n(t_right.data(), t_right.size(), host_right_data);
+  std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
+  auto host_result_data = new DataType[padded_result_size];
+  std::fill_n(host_result_data, padded_result_size, nan);
+
+  sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
+  sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - host_result_data[i]))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
+                                  error_threshold)) {
+      continue;
+    }
+    if (std::isnan(host_result_data[i])) {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", invalid read detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    } else {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    }
+    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+  }
+  // Make sure that the rest of the result is still nans
+  for (IndexType i = t_result.size(); i < padded_result_size; i++) {
+    if (std::isnan(host_result_data[i])) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", invalid write detected at IndexType " << i << ": "
+              << host_result_data[i] << std::endl;
+    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+
+  delete[] host_left_data;
+  delete[] host_right_data;
+  delete[] host_result_data;
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
+                 IndexType n_size) {
+  // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
+  // ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 0, DataLayout, IndexType> t_result;
+  Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
+      gpu_t_result(d_t_result);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+          t_result() - t_result_gpu()))) > error_threshold &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
+    std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
+              << " : mismatch detected: " << t_result() << " vs "
+              << t_result_gpu() << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(), t_result());
+  }
+
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_batch(const Device &sycl_device, IndexType m_size,
+                       IndexType k_size, IndexType n_size, IndexType m_batch,
+                       IndexType start, IndexType limit) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  typedef Eigen::array<IndexType, 3> TensorDim;
+  typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
+  TensorDim left_dims = {{m_batch, k_size, m_size}};
+  TensorDim right_dims = {{m_batch, n_size, k_size}};
+  TensorDim res_dims = {{m_batch, m_size, n_size}};
+  Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
+
+  TensorType t_left(left_dims);
+  TensorType t_right(right_dims);
+  TensorType t_result_gpu(res_dims);
+  TensorType t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+  for (int i = start; i < limit; ++i) {
+    auto x = gpu_t_left.template chip<0>(i);
+    auto y = gpu_t_right.template chip<0>(i);
+    auto z = gpu_t_result.template chip<0>(i);
+    z.device(sycl_device) = x.contract(y, contract_pairs);
+  }
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  for (int i = start; i < limit; ++i) {
+    auto x = t_left.template chip<0>(i);
+    auto y = t_right.template chip<0>(i);
+    auto z = t_result.template chip<0>(i);
+    z = x.contract(y, contract_pairs);
+  }
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
+                                IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType j = 0; j < m_size; j++) {
+    for (IndexType i = 0; i < n_size; i++) {
+      if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+              t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
+        continue;
+      }
+      if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
+                                    error_threshold)) {
+        continue;
+      }
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType m: " << j << " n: " << i
+                << " CPU : " << t_result(j, i)
+                << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
+      VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
+    }
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
+                                IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
+                                 IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <typename Dev>
+void inline tensorOutofBound(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Test out of bound for Tensor-Tensor
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+                                                       1024);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       4096);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
+                                                       2048);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+                                                       1024);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
+                                                       784);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+                                                       10);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
+                                                       513);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
+                                                       783);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+                                                       784);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
+                                                       11);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor out of bound tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
+                                                       128);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
+                                                       128);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_m(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_n(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_k(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_sizes(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+template <typename Dev>
+void inline vectorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // VECTOR-VECTOR
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "contracted tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline vectorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Vector-Tensor
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
+                                                       32);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Matrix-Vector
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+// If the GEMV disabled it will creates one kernel to calculate the contraction.
+// Therefore the acumuation of float number will overflow the precision
+// threshold for float and cause the test to fail. While it the GMV multiple
+// kernel will be created and each one run the overflow of accumutation breaks
+// among the kernels.
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
+                                                       1);
+#endif
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorScalar(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // SCALAR Contraction
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_row(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
+                                                       257);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
+                                                       256);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
+                                                       16);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
+                                                       17);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_col(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
+                                                       257);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
+                                                       256);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
+                                                       16);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
+                                                       17);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+                                                   0, 4);
+  contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+                                                   0, 4);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_lhs_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
+                                                            8);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                            32);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+                                                            64);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
+                                                            2048, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+                                                            10, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+                                                            1024, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+                                                            4096, 1024);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_rhs_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
+                                                            16);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+                                                            17);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                            32);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+                                                            64);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
+                                                            1024, 1024);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+                                                            1024, 4096);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+                                                            1024, 2048);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+                                                            1024, 784);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_both_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+                                                             17);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                             32);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
+                                                             16, 64);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
+  for (const auto &device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(tensorOutofBound(sycl_device));
+    CALL_SUBTEST_2(tensorTensor(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_m(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_n(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_k(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
+    CALL_SUBTEST_3(vectorVector(sycl_device));
+    CALL_SUBTEST_4(vectorTensor(sycl_device));
+    CALL_SUBTEST_5(tensorVector(sycl_device));
+    CALL_SUBTEST_6(tensorScalar(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
+    CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
+    CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
new file mode 100644
index 0000000..3b5c6a1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contraction.cpp

@@ -0,0 +1,601 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::DefaultDevice;
+using Eigen::Tensor;
+
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+static void test_evals()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(2, 3);
+  Tensor<float, 2, DataLayout> mat3(3, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3.setRandom();
+
+  Tensor<float, 2, DataLayout> mat4(3,3);
+  mat4.setZero();
+  Eigen::array<DimPair, 1> dims3 = {{DimPair(0, 0)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator;
+  Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice());
+  eval.evalTo(mat4.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 3);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+
+  VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
+
+  Tensor<float, 2, DataLayout> mat5(2,2);
+  mat5.setZero();
+  Eigen::array<DimPair, 1> dims4 = {{DimPair(1, 1)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2;
+  Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice());
+  eval2.evalTo(mat5.data());
+  EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval2.dimensions()[1], 2);
+
+  VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2));
+  VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
+
+  Tensor<float, 2, DataLayout> mat6(2,2);
+  mat6.setZero();
+  Eigen::array<DimPair, 1> dims6 = {{DimPair(1, 0)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3;
+  Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice());
+  eval3.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval3.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval3.dimensions()[1], 2);
+
+  VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1));
+  VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
+}
+
+template<int DataLayout>
+static void test_scalar()
+{
+  Tensor<float, 1, DataLayout> vec1({6});
+  Tensor<float, 1, DataLayout> vec2({6});
+
+  vec1.setRandom();
+  vec2.setRandom();
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
+  Tensor<float, 0, DataLayout> scalar = vec1.contract(vec2, dims);
+
+  float expected = 0.0f;
+  for (int i = 0; i < 6; ++i) {
+    expected += vec1(i) * vec2(i);
+  }
+  VERIFY_IS_APPROX(scalar(), expected);
+}
+
+template<int DataLayout>
+static void test_multidims()
+{
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 3, DataLayout> mat3(2, 2, 2);
+  mat3.setZero();
+  Eigen::array<DimPair, 2> dims = {{DimPair(1, 2), DimPair(2, 3)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator;
+  Evaluator eval(mat1.contract(mat2, dims), DefaultDevice());
+  eval.evalTo(mat3.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[2], 2);
+
+  VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) +
+                                mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) +
+                                mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) +
+                                mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) +
+                                mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) +
+                                mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) +
+                                mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) +
+                                mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) +
+                                mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
+
+  Tensor<float, 2, DataLayout> mat4(2, 2);
+  Tensor<float, 3, DataLayout> mat5(2, 2, 2);
+
+  mat4.setRandom();
+  mat5.setRandom();
+
+  Tensor<float, 1, DataLayout> mat6(2);
+  mat6.setZero();
+  Eigen::array<DimPair, 2> dims2({{DimPair(0, 1), DimPair(1, 0)}});
+  typedef TensorEvaluator<decltype(mat4.contract(mat5, dims2)), DefaultDevice> Evaluator2;
+  Evaluator2 eval2(mat4.contract(mat5, dims2), DefaultDevice());
+  eval2.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(Evaluator2::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+
+  VERIFY_IS_APPROX(mat6(0), mat4(0,0)*mat5(0,0,0) + mat4(1,0)*mat5(0,1,0) +
+                   mat4(0,1)*mat5(1,0,0) + mat4(1,1)*mat5(1,1,0));
+  VERIFY_IS_APPROX(mat6(1), mat4(0,0)*mat5(0,0,1) + mat4(1,0)*mat5(0,1,1) +
+                   mat4(0,1)*mat5(1,0,1) + mat4(1,1)*mat5(1,1,1));
+}
+
+template<int DataLayout>
+static void test_holes() {
+  Tensor<float, 4, DataLayout> t1(2, 5, 7, 3);
+  Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3);
+  t1.setRandom();
+  t2.setRandom();
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(3, 4)}};
+  Tensor<float, 5, DataLayout> result = t1.contract(t2, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  VERIFY_IS_EQUAL(result.dimension(2), 7);
+  VERIFY_IS_EQUAL(result.dimension(3), 11);
+  VERIFY_IS_EQUAL(result.dimension(4), 13);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          for (int m = 0; m < 5; ++m) {
+            VERIFY_IS_APPROX(result(i, j, k, l, m),
+                             t1(0, i, j, 0) * t2(0, k, l, m, 0) +
+                             t1(1, i, j, 0) * t2(1, k, l, m, 0) +
+                             t1(0, i, j, 1) * t2(0, k, l, m, 1) +
+                             t1(1, i, j, 1) * t2(1, k, l, m, 1) +
+                             t1(0, i, j, 2) * t2(0, k, l, m, 2) +
+                             t1(1, i, j, 2) * t2(1, k, l, m, 2));
+          }
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_full_redux()
+{
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 3, DataLayout> t2(2, 2, 2);
+  t1.setRandom();
+  t2.setRandom();
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+  Tensor<float, 1, DataLayout> result = t1.contract(t2, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(1, 0, 0)
+                            + t1(0, 1) * t2(0, 1, 0) +  t1(1, 1) * t2(1, 1, 0));
+  VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) +  t1(1, 0) * t2(1, 0, 1)
+                            + t1(0, 1) * t2(0, 1, 1) +  t1(1, 1) * t2(1, 1, 1));
+
+  dims[0] = DimPair(1, 0);
+  dims[1] = DimPair(2, 1);
+  result = t2.contract(t1, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(0, 1, 0)
+                            + t1(0, 1) * t2(0, 0, 1) +  t1(1, 1) * t2(0, 1, 1));
+  VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) +  t1(1, 0) * t2(1, 1, 0)
+                            + t1(0, 1) * t2(1, 0, 1) +  t1(1, 1) * t2(1, 1, 1));
+}
+
+template<int DataLayout>
+static void test_contraction_of_contraction()
+{
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 2, DataLayout> t2(2, 2);
+  Tensor<float, 2, DataLayout> t3(2, 2);
+  Tensor<float, 2, DataLayout> t4(2, 2);
+  t1.setRandom();
+  t2.setRandom();
+  t3.setRandom();
+  t4.setRandom();
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  auto contract1 = t1.contract(t2, dims);
+  auto diff = t3 - contract1;
+  auto contract2 = t1.contract(t4, dims);
+  Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 2);
+
+  Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>>
+      m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2),
+      m4(t4.data(), 2, 2);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>
+      expected = (m1 * m4) * (m3 - m1 * m2);
+
+  VERIFY_IS_APPROX(result(0, 0), expected(0, 0));
+  VERIFY_IS_APPROX(result(0, 1), expected(0, 1));
+  VERIFY_IS_APPROX(result(1, 0), expected(1, 0));
+  VERIFY_IS_APPROX(result(1, 1), expected(1, 1));
+}
+
+template<int DataLayout>
+static void test_expr()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(3, 2);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+template<int DataLayout>
+static void test_out_of_order_contraction()
+{
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat2(2, 2, 2);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 2, DataLayout> mat3(2, 2);
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(0, 2)}};
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0, 0),
+                   mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
+                   mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(1, 0),
+                   mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
+                   mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(0, 1),
+                   mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
+                   mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
+  VERIFY_IS_APPROX(mat3(1, 1),
+                   mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
+                   mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
+
+  Eigen::array<DimPair, 2> dims2 = {{DimPair(0, 2), DimPair(2, 0)}};
+  mat3 = mat1.contract(mat2, dims2);
+
+  VERIFY_IS_APPROX(mat3(0, 0),
+                   mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
+                   mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(1, 0),
+                   mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
+                   mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(0, 1),
+                   mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
+                   mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
+  VERIFY_IS_APPROX(mat3(1, 1),
+                   mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
+                   mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
+
+}
+
+template<int DataLayout>
+static void test_consistency()
+{
+  // this does something like testing (A*B)^T = (B^T * A^T)
+
+  Tensor<float, 3, DataLayout> mat1(4, 3, 5);
+  Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5);
+  Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5);
+
+  // contract on dimensions of size 4 and 3
+  Eigen::array<DimPair, 2> dims1 = {{DimPair(0, 4), DimPair(1, 0)}};
+  Eigen::array<DimPair, 2> dims2 = {{DimPair(4, 0), DimPair(0, 1)}};
+
+  mat3 = mat1.contract(mat2, dims1);
+  mat4 = mat2.contract(mat1, dims2);
+
+  // check that these are equal except for ordering of dimensions
+  if (DataLayout == ColMajor) {
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+      }
+    }
+  } else {
+    // Row major
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_large_contraction()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}};
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+template<int DataLayout>
+static void test_matrix_vector()
+{
+  Tensor<float, 2, DataLayout> t_left(30, 50);
+  Tensor<float, 1, DataLayout> t_right(50);
+  Tensor<float, 1, DataLayout> t_result(30);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 30, 50);
+  MapXf m_right(t_right.data(), 50, 1);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
+  }
+}
+
+
+template<int DataLayout>
+static void test_tensor_vector()
+{
+  Tensor<float, 3, DataLayout> t_left(7, 13, 17);
+  Tensor<float, 2, DataLayout> t_right(1, 7);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair;
+  Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}};
+  Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 7, 13*17);
+  MapXf m_right(t_right.data(), 1, 7);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
+  }
+}
+
+
+template<int DataLayout>
+static void test_small_blocking_factors()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1);
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  // Force the cache sizes, which results in smaller blocking factors.
+  Eigen::setCpuCacheSizes(896, 1920, 2944);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}};
+  Tensor<float, 5, DataLayout> t_result;
+  t_result = t_left.contract(t_right, dims);
+
+  // compute result using a simple eigen matrix product
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93);
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right;
+
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+template<int DataLayout>
+static void test_tensor_product()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(4, 1);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Eigen::array<DimPair, 0> dims;
+  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, dims);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 3);
+  VERIFY_IS_EQUAL(result.dimension(2), 4);
+  VERIFY_IS_EQUAL(result.dimension(3), 1);
+  for (int i = 0; i < result.dimension(0); ++i) {
+    for (int j = 0; j < result.dimension(1); ++j) {
+      for (int k = 0; k < result.dimension(2); ++k) {
+        for (int l = 0; l < result.dimension(3); ++l) {
+			VERIFY_IS_APPROX(result(i, j, k, l), mat1(i, j) * mat2(k, l) );
+        }
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_const_inputs()
+{
+  Tensor<float, 2, DataLayout> in1(2, 3);
+  Tensor<float, 2, DataLayout> in2(3, 2);
+  in1.setRandom();
+  in2.setRandom();
+
+  TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3);
+  TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2);
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams&, Index, Index, Index num_rows,
+      Index num_cols) const {
+    for (int i = 0; i < num_rows; ++i) {
+      for (int j = 0; j < num_cols; ++j) {
+        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+      }
+    }
+  }
+};
+
+template <int DataLayout>
+static void test_large_contraction_with_output_kernel() {
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in mat4 to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+  m_result = m_left * m_right;
+
+  for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
+{
+  CALL_SUBTEST_1(test_evals<ColMajor>());
+  CALL_SUBTEST_1(test_evals<RowMajor>());
+  CALL_SUBTEST_1(test_scalar<ColMajor>());
+  CALL_SUBTEST_1(test_scalar<RowMajor>());
+  CALL_SUBTEST_2(test_multidims<ColMajor>());
+  CALL_SUBTEST_2(test_multidims<RowMajor>());
+  CALL_SUBTEST_2(test_holes<ColMajor>());
+  CALL_SUBTEST_2(test_holes<RowMajor>());
+  CALL_SUBTEST_3(test_full_redux<ColMajor>());
+  CALL_SUBTEST_3(test_full_redux<RowMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST_4(test_expr<ColMajor>());
+  CALL_SUBTEST_4(test_expr<RowMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_consistency<ColMajor>());
+  CALL_SUBTEST_5(test_consistency<RowMajor>());
+  CALL_SUBTEST_5(test_large_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_large_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>());
+  CALL_SUBTEST_7(test_tensor_product<ColMajor>());
+  CALL_SUBTEST_7(test_tensor_product<RowMajor>());
+  CALL_SUBTEST_8(test_const_inputs<ColMajor>());
+  CALL_SUBTEST_8(test_const_inputs<RowMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>());
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+}

diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
new file mode 100644
index 0000000..c3688f6
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_convolution.cpp

@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::DefaultDevice;
+
+template <int DataLayout>
+static void test_evals()
+{
+  Tensor<float, 2, DataLayout> input(3, 3);
+  Tensor<float, 1, DataLayout> kernel(2);
+
+  input.setRandom();
+  kernel.setRandom();
+
+  Tensor<float, 2, DataLayout> result(2,3);
+  result.setZero();
+  Eigen::array<Tensor<float, 2>::Index, 1> dims3;
+  dims3[0] = 0;
+
+  typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
+  Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
+  eval.evalTo(result.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
+  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
+  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
+}
+
+template <int DataLayout>
+static void test_expr()
+{
+  Tensor<float, 2, DataLayout> input(3, 3);
+  Tensor<float, 2, DataLayout> kernel(2, 2);
+  input.setRandom();
+  kernel.setRandom();
+
+  Tensor<float, 2, DataLayout> result(2,2);
+  Eigen::array<ptrdiff_t, 2> dims;
+  dims[0] = 0;
+  dims[1] = 1;
+  result = input.convolve(kernel, dims);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+}
+
+template <int DataLayout>
+static void test_modes() {
+  Tensor<float, 1, DataLayout> input(3);
+  Tensor<float, 1, DataLayout> kernel(3);
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  Eigen::array<ptrdiff_t, 1> dims;
+  dims[0] = 0;
+  Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<float, 1, DataLayout> valid(1);
+  valid = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<float, 1, DataLayout> same(3);
+  same = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+  Tensor<float, 1, DataLayout> full(5);
+  full = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+}
+
+template <int DataLayout>
+static void test_strides() {
+  Tensor<float, 1, DataLayout> input(13);
+  Tensor<float, 1, DataLayout> kernel(3);
+  input.setRandom();
+  kernel.setRandom();
+
+  Eigen::array<ptrdiff_t, 1> dims;
+  dims[0] = 0;
+  Eigen::array<ptrdiff_t, 1> stride_of_3;
+  stride_of_3[0] = 3;
+  Eigen::array<ptrdiff_t, 1> stride_of_2;
+  stride_of_2[0] = 2;
+
+  Tensor<float, 1, DataLayout> result;
+  result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution)
+{
+  CALL_SUBTEST(test_evals<ColMajor>());
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_modes<ColMajor>());
+  CALL_SUBTEST(test_modes<RowMajor>());
+  CALL_SUBTEST(test_strides<ColMajor>());
+  CALL_SUBTEST(test_strides<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
new file mode 100644
index 0000000..3954c8a
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp

@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <iomanip>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+static const float error_threshold =1e-4f;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=55;
+  IndexType outdim2=51;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 1> kernel_dims = {{4}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 1> dims3{{0}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=51;
+  IndexType outdim2=51;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 2> dims3{{0,1}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=51;
+  IndexType outdim2=49;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 3> dims3{{0,1,2}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_evals(const Eigen::SyclDevice& sycl_device)
+{
+  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+  Eigen::array<IndexType, 1> kernel_dims = {{2}};
+  Eigen::array<IndexType, 2> result_dims = {{2, 3}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
+
+  Eigen::array<IndexType, 1> dims3{{0}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
+  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
+  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr(const Eigen::SyclDevice& sycl_device)
+{
+  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+  Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
+  Eigen::array<IndexType, 2> result_dims = {{2, 2}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 2> dims;
+  dims[0] = 0;
+  dims[1] = 1;
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_modes(const Eigen::SyclDevice& sycl_device){
+
+Eigen::array<IndexType, 1> input_dims = {{3}};
+Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+
+input.setRandom();
+kernel.setRandom();
+Eigen::array<IndexType, 1> dims;
+dims[0] = 0;
+
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<DataType, 1, DataLayout, IndexType> valid(1);
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t valid_bytes = valid.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_valid =  static_cast<DataType*>(sycl_device.allocate(valid_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
+
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<DataType, 1, DataLayout, IndexType> same(3);
+  std::size_t same_bytes = same.size() * sizeof(DataType);
+  DataType * d_same =  static_cast<DataType*>(sycl_device.allocate(same_bytes));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
+  gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
+
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+
+  Tensor<DataType, 1, DataLayout, IndexType> full(5);
+  std::size_t full_bytes = full.size() * sizeof(DataType);
+  DataType * d_full =  static_cast<DataType*>(sycl_device.allocate(full_bytes));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
+  gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
+
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_valid);
+  sycl_device.deallocate(d_same);
+  sycl_device.deallocate(d_full);
+
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strides(const Eigen::SyclDevice& sycl_device){
+
+  Eigen::array<IndexType, 1> input_dims = {{13}};
+  Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+  Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> result(2);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 1> dims;
+  dims[0] = 0;
+
+  Eigen::array<IndexType, 1> stride_of_3;
+  stride_of_3[0] = 3;
+  Eigen::array<IndexType, 1> stride_of_2;
+  stride_of_2[0] = 2;
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device=Eigen::SyclDevice(&queueInterface);
+  test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
+  test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
+  test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
+  test_evals<float, ColMajor, int64_t>(sycl_device);
+  test_evals<float, RowMajor, int64_t>(sycl_device);
+  test_expr<float, ColMajor, int64_t>(sycl_device);
+  test_expr<float, RowMajor, int64_t>(sycl_device);
+  test_modes<float, ColMajor, int64_t>(sycl_device);
+  test_modes<float, RowMajor, int64_t>(sycl_device);
+  test_strides<float, ColMajor, int64_t>(sycl_device);
+  test_strides<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConvolutionPerDevice(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp
new file mode 100644
index 0000000..b5dbc97
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_index.cpp

@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <map>
+
+#include <Eigen/Dense>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+template <int DataLayout>
+static void test_map_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  using NormalIndex = DSizes<ptrdiff_t, 4>;
+  using CustomIndex = std::map<ptrdiff_t, ptrdiff_t>;
+  CustomIndex coeffC;
+  coeffC[0] = 1;
+  coeffC[1] = 2;
+  coeffC[2] = 4;
+  coeffC[3] = 1;
+  NormalIndex coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_matrix_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  using NormalIndex = DSizes<ptrdiff_t, 4>;
+  using CustomIndex = Matrix<unsigned int, 4, 1>;
+  CustomIndex coeffC(1,2,4,1);
+  NormalIndex coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_varlist_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  DSizes<ptrdiff_t, 4> coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff({1,2,4,1}), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef({1,2,4,1}), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_sizes_as_index()
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  DSizes<ptrdiff_t, 4> coeff(1,2,4,1);
+  Sizes<1,2,4,1> coeffC;
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) {
+  test_map_as_index<ColMajor>();
+  test_map_as_index<RowMajor>();
+  test_matrix_as_index<ColMajor>();
+  test_matrix_as_index<RowMajor>();
+  test_varlist_as_index<ColMajor>();
+  test_varlist_as_index<RowMajor>();
+  test_sizes_as_index<ColMajor>();
+  test_sizes_as_index<RowMajor>();
+}

diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
new file mode 100644
index 0000000..875ea57
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp

@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+struct InsertZeros {
+  DSizes<DenseIndex, 2> dimensions(const Tensor<float, 2>& input) const {
+    DSizes<DenseIndex, 2> result;
+    result[0] = input.dimension(0) * 2;
+    result[1] = input.dimension(1) * 2;
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const
+  {
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
+    output.stride(strides).device(device) = input;
+
+    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+  }
+};
+
+static void test_custom_unary_op()
+{
+  Tensor<float, 2> tensor(3,5);
+  tensor.setRandom();
+
+  Tensor<float, 2> result = tensor.customOp(InsertZeros());
+  VERIFY_IS_EQUAL(result.dimension(0), 6);
+  VERIFY_IS_EQUAL(result.dimension(1), 10);
+
+  for (int i = 0; i < 6; i+=2) {
+    for (int j = 0; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(result(i, j), tensor(i/2, j/2));
+    }
+  }
+  for (int i = 1; i < 6; i+=2) {
+    for (int j = 1; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(result(i, j), 0);
+    }
+  }
+}
+
+
+struct BatchMatMul {
+  DSizes<DenseIndex, 3> dimensions(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2) const {
+    DSizes<DenseIndex, 3> result;
+    result[0] = input1.dimension(0);
+    result[1] = input2.dimension(1);
+    result[2] = input2.dimension(2);
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2,
+            Output& output, const Device& device) const
+  {
+    typedef Tensor<float, 3>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    for (int i = 0; i < output.dimension(2); ++i) {
+      output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims);
+    }
+  }
+};
+
+
+static void test_custom_binary_op()
+{
+  Tensor<float, 3> tensor1(2,3,5);
+  tensor1.setRandom();
+  Tensor<float, 3> tensor2(3,7,5);
+  tensor2.setRandom();
+
+  Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul());
+  for (int i = 0; i < 5; ++i) {
+    typedef Tensor<float, 3>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims);
+    TensorRef<Tensor<float, 2> > val = result.chip<2>(i);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(val(j, k), reference(j, k));
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op)
+{
+  CALL_SUBTEST(test_custom_unary_op());
+  CALL_SUBTEST(test_custom_binary_op());
+}

diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
new file mode 100644
index 0000000..d947ead
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp

@@ -0,0 +1,170 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+template<typename TensorType>
+struct InsertZeros {
+  DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
+    DSizes<DenseIndex, 2> result;
+    result[0] = input.dimension(0) * 2;
+    result[1] = input.dimension(1) * 2;
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const TensorType& input, Output& output, const Device& device) const
+  {
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
+    output.stride(strides).device(device) = input;
+
+    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+  }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 5;
+  Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};
+
+  Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
+  Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
+  TensorType gpu_in1(gpu_in1_data, tensorRange);
+  TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+  in1.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(out.dimension(0), 6);
+  VERIFY_IS_EQUAL(out.dimension(1), 10);
+
+  for (int i = 0; i < 6; i+=2) {
+    for (int j = 0; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
+    }
+  }
+  for (int i = 1; i < 6; i+=2) {
+    for (int j = 1; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(out(i, j), 0);
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+sycl_device.deallocate(gpu_out_data);
+}
+
+template<typename TensorType>
+struct BatchMatMul {
+  DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
+    DSizes<DenseIndex, 3> result;
+    result[0] = input1.dimension(0);
+    result[1] = input2.dimension(1);
+    result[2] = input2.dimension(2);
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const TensorType& input1, const TensorType& input2,
+            Output& output, const Device& device) const
+  {
+    typedef typename TensorType::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    for (int64_t i = 0; i < output.dimension(2); ++i) {
+      output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
+    }
+  }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
+  Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
+  Eigen::array<IndexType, 3> tensorResultRange  = {{2, 7, 5}};
+
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
+  TensorType gpu_in1(gpu_in1_data, tensorRange1);
+  TensorType gpu_in2(gpu_in2_data, tensorRange2);
+  TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
+
+  gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 5; ++i) {
+    typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
+    TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(val(j, k), reference(j, k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(custom_op_perDevice<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
new file mode 100644
index 0000000..c9f78d2
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device.cu

@@ -0,0 +1,396 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Context for evaluation on cpu
+struct CPUContext {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
+    kernel_1d_(0) = 3.14f;
+    kernel_1d_(1) = 2.7f;
+
+    kernel_2d_(0,0) = 3.14f;
+    kernel_2d_(1,0) = 2.7f;
+    kernel_2d_(0,1) = 0.2f;
+    kernel_2d_(1,1) = 7.0f;
+
+    kernel_3d_(0,0,0) = 3.14f;
+    kernel_3d_(0,1,0) = 2.7f;
+    kernel_3d_(0,0,1) = 0.2f;
+    kernel_3d_(0,1,1) = 7.0f;
+    kernel_3d_(1,0,0) = -1.0f;
+    kernel_3d_(1,1,0) = -0.3f;
+    kernel_3d_(1,0,1) = -0.7f;
+    kernel_3d_(1,1,1) = -0.5f;
+  }
+
+  const Eigen::DefaultDevice& device() const { return cpu_device_; }
+
+  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+  Eigen::Tensor<float, 3>& out() { return out_; }
+  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
+  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
+  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
+
+ private:
+  const Eigen::Tensor<float, 3>& in1_;
+  const Eigen::Tensor<float, 3>& in2_;
+  Eigen::Tensor<float, 3>& out_;
+
+  Eigen::Tensor<float, 1> kernel_1d_;
+  Eigen::Tensor<float, 2> kernel_2d_;
+  Eigen::Tensor<float, 3> kernel_3d_;
+
+  Eigen::DefaultDevice cpu_device_;
+};
+
+
+// Context for evaluation on GPU
+struct GPUContext {
+  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
+    assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
+    float kernel_1d_val[] = {3.14f, 2.7f};
+    assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
+
+    assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
+    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
+    assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
+
+    assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
+    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
+    assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
+  }
+  ~GPUContext() {
+    assert(gpuFree(kernel_1d_) == gpuSuccess);
+    assert(gpuFree(kernel_2d_) == gpuSuccess);
+    assert(gpuFree(kernel_3d_) == gpuSuccess);
+  }
+
+  const Eigen::GpuDevice& device() const { return gpu_device_; }
+
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
+
+ private:
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+
+  float* kernel_1d_;
+  float* kernel_2d_;
+  float* kernel_3d_;
+
+  Eigen::GpuStreamDevice stream_;
+  Eigen::GpuDevice gpu_device_;
+};
+
+
+// The actual expression to evaluate
+template <typename Context>
+void test_contextual_eval(Context* context)
+{
+  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_forced_contextual_eval(Context* context)
+{
+  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_compound_assignment(Context* context)
+{
+  context->out().device(context->device()) = context->in1().constant(2.718f);
+  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
+}
+
+
+template <typename Context>
+void test_contraction(Context* context)
+{
+  Eigen::array<std::pair<int, int>, 2> dims;
+  dims[0] = std::make_pair(1, 1);
+  dims[1] = std::make_pair(2, 2);
+
+  Eigen::array<int, 2> shape(40, 50*70);
+
+  Eigen::DSizes<int, 2> indices(0,0);
+  Eigen::DSizes<int, 2> sizes(40,40);
+
+  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
+}
+
+
+template <typename Context>
+void test_1d_convolution(Context* context)
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,70);
+
+  Eigen::array<int, 1> dims(1);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
+}
+
+template <typename Context>
+void test_2d_convolution(Context* context)
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,69);
+
+  Eigen::array<int, 2> dims(1,2);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
+}
+
+template <typename Context>
+void test_3d_convolution(Context* context)
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(39,49,69);
+
+  Eigen::array<int, 3> dims(0,1,2);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
+}
+
+
+void test_cpu() {
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  CPUContext context(in1, in2, out);
+  test_contextual_eval(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_forced_contextual_eval(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+
+  test_1d_convolution(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+}
+
+void test_gpu() {
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
+
+  GPUContext context(gpu_in1, gpu_in2, gpu_out);
+  test_contextual_eval(&context);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_forced_contextual_eval(&context);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+
+  test_1d_convolution(&context);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+#if !defined(EIGEN_USE_HIP)
+// disable this test on the HIP platform
+// 3D tensor convolutions seem to hang on the HIP platform
+
+  test_3d_convolution(&context);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+       const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+#endif
+ 
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
+{
+  CALL_SUBTEST_1(test_cpu());
+  CALL_SUBTEST_2(test_gpu());
+}

diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
new file mode 100644
index 0000000..5095cb0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp

@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <stdint.h>
+#include <iostream>
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_memory(const Eigen::SyclDevice &sycl_device) {
+  std::cout << "Running on : "
+            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
+            <<std::endl;
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
+  DataType* gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+  sycl_device.deallocate(gpu_in_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
+  VERIFY(sycl_device.ok());
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorDims = {{sizeDim1}};
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
+  sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
+  out.device(sycl_device) = in / in.constant(0);
+
+  sycl_device.synchronize();
+  VERIFY(!sycl_device.ok());
+  sycl_device.deallocate(gpu_data);
+}
+
+template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+  test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
+  /// this test throw an exception. enable it if you want to see the exception
+  //test_device_exceptions<DataType, RowMajor>(sycl_device);
+  /// this test throw an exception. enable it if you want to see the exception
+  //test_device_exceptions<DataType, ColMajor>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
new file mode 100644
index 0000000..ee416e1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_dimension.cpp

@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+static void test_dynamic_size()
+{
+  Eigen::DSizes<int, 3> dimensions(2,3,7);
+
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions[0], 2);
+  VERIFY_IS_EQUAL((int)dimensions[1], 3);
+  VERIFY_IS_EQUAL((int)dimensions[2], 7);
+}
+
+static void test_fixed_size()
+{
+  Eigen::Sizes<2,3,7> dimensions;
+
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
+}
+
+static void test_match()
+{
+  Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7);
+  Eigen::Sizes<2,3,7> stat;
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
+
+  Eigen::DSizes<int, 3> dyn1(2,3,7);
+  Eigen::DSizes<int, 2> dyn2(2,3);
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn1, dyn2), false);
+}
+
+static void test_rank_zero()
+{
+  Eigen::Sizes<> scalar;
+  VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)scalar.rank(), 0);
+  VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1);
+
+  Eigen::DSizes<ptrdiff_t, 0> dscalar;
+  VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
+}
+
+static void test_index_type_promotion() {
+  Eigen::DSizes<int, 3> src0(1, 2, 3);
+  Eigen::array<int, 3> src1;
+  src1[0] = 4;
+  src1[1] = 5;
+  src1[2] = 6;
+
+  Eigen::DSizes<long, 3> dst0(src0);
+  Eigen::DSizes<long, 3> dst1(src1);
+
+  VERIFY_IS_EQUAL(dst0[0], 1L);
+  VERIFY_IS_EQUAL(dst0[1], 2L);
+  VERIFY_IS_EQUAL(dst0[2], 3L);
+  VERIFY_IS_EQUAL(dst1[0], 4L);
+  VERIFY_IS_EQUAL(dst1[1], 5L);
+  VERIFY_IS_EQUAL(dst1[2], 6L);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
+{
+  CALL_SUBTEST(test_dynamic_size());
+  CALL_SUBTEST(test_fixed_size());
+  CALL_SUBTEST(test_match());
+  CALL_SUBTEST(test_rank_zero());
+  CALL_SUBTEST(test_index_type_promotion());
+}

diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
new file mode 100644
index 0000000..fd889c4
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_empty.cpp

@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_empty_tensor()
+{
+  Tensor<float, 2> source;
+  Tensor<float, 2> tgt1 = source;
+  Tensor<float, 2> tgt2(source);
+  Tensor<float, 2> tgt3;
+  tgt3 = tgt1;
+  tgt3 = tgt2;
+}
+
+static void test_empty_fixed_size_tensor()
+{
+  TensorFixedSize<float, Sizes<0> > source;
+  TensorFixedSize<float, Sizes<0> > tgt1 = source;
+  TensorFixedSize<float, Sizes<0> > tgt2(source);
+  TensorFixedSize<float, Sizes<0> > tgt3;
+  tgt3 = tgt1;
+  tgt3 = tgt2;
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
+{
+   CALL_SUBTEST(test_empty_tensor());
+   CALL_SUBTEST(test_empty_fixed_size_tensor());
+}

diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 0000000..66b06e8
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp

@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yields the
+// same results for all the ops, supporting tiled evaluation.
+
+// Default assignment that does no use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor =
+      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+                                      /*Vectorizable=*/false,
+                                      /*Tiling=*/TiledEvaluation::Off>;
+
+  Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
+          typename Dst, typename Expr>
+static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
+                                                   Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+}
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(min_dim, max_dim);
+  }
+  return dims;
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> src(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  src.setRandom();
+  const auto expr = src.square();
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T square = src.coeff(i) * src.coeff(i);
+    VERIFY_IS_EQUAL(square, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> lhs(dims);
+  Tensor<T, NumDims, Options, Index> rhs(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T sum = lhs.coeff(i) + rhs.coeff(i);
+    VERIFY_IS_EQUAL(sum, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.broadcast(broadcasts);
+
+  // We assume that broadcasting on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_rvalue(Device d)
+{
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Layout, Index> src(dims);
+  src.setRandom();
+
+#define TEST_CHIPPING(CHIP_DIM)                                           \
+  if (NumDims > (CHIP_DIM)) {                                             \
+    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+    const auto expr = src.template chip<(CHIP_DIM)>(offset);              \
+                                                                          \
+    Tensor<T, NumDims - 1, Layout, Index> golden;                         \
+    golden = expr;                                                        \
+                                                                          \
+    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \
+                                                                          \
+    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \
+    using Executor = internal::TensorExecutor<const Assign, Device,       \
+                                              Vectorizable, Tiling>;      \
+                                                                          \
+    Executor::run(Assign(dst, expr), d);                                  \
+                                                                          \
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \
+    }                                                                     \
+  }
+
+  TEST_CHIPPING(0)
+  TEST_CHIPPING(1)
+  TEST_CHIPPING(2)
+  TEST_CHIPPING(3)
+  TEST_CHIPPING(4)
+  TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_lvalue(Device d)
+{
+  auto dims = RandomDims<NumDims>(1, 10);
+
+#define TEST_CHIPPING(CHIP_DIM)                                             \
+  if (NumDims > (CHIP_DIM)) {                                               \
+    /* Generate random data that we'll assign to the chipped tensor dim. */ \
+    array<Index, NumDims - 1> src_dims;                                     \
+    for (int i = 0; i < NumDims - 1; ++i) {                                 \
+      int dim = i < (CHIP_DIM) ? i : i + 1;                                 \
+      src_dims[i] = dims[dim];                                              \
+    }                                                                       \
+                                                                            \
+    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                    \
+    src.setRandom();                                                        \
+                                                                            \
+    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);   \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> random(dims);                         \
+    random.setZero();                                                       \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> golden(dims);                         \
+    golden = random;                                                        \
+    golden.template chip<(CHIP_DIM)>(offset) = src;                         \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> dst(dims);                            \
+    dst = random;                                                           \
+    auto expr = dst.template chip<(CHIP_DIM)>(offset);                      \
+                                                                            \
+    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;     \
+    using Executor = internal::TensorExecutor<const Assign, Device,         \
+                                              Vectorizable, Tiling>;        \
+                                                                            \
+    Executor::run(Assign(expr, src), d);                                    \
+                                                                            \
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {              \
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                       \
+    }                                                                       \
+  }
+
+  TEST_CHIPPING(0)
+  TEST_CHIPPING(1)
+  TEST_CHIPPING(2)
+  TEST_CHIPPING(3)
+  TEST_CHIPPING(4)
+  TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) {
+      shuffled_dims[i] = dims[shuffle[i]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+
+  static constexpr int ReshapedDims = NumDims - 1;
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Multiple 0th dimension and then shuffle.
+  std::vector<Index> shuffle;
+  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+  DSizes<Index, ReshapedDims> reshaped_dims;
+  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.reshape(reshaped_dims);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_rvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  Tensor<T, NumDims, Options, Index> golden =
+      src.slice(slice_start, slice_size);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.slice(slice_start, slice_size);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_lvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  Tensor<T, NumDims, Options, Index> slice(slice_size);
+  slice.setRandom();
+
+  // Assign a slice using default executor.
+  Tensor<T, NumDims, Options, Index> golden = src;
+  golden.slice(slice_start, slice_size) = slice;
+
+  // And using configured execution strategy.
+  Tensor<T, NumDims, Options, Index> dst = src;
+  auto expr = dst.slice(slice_start, slice_size);
+
+  using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(expr, slice), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting_of_forced_eval(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.square().eval().broadcast(broadcasts);
+
+  // We assume that broadcasting on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template<typename T, int NumDims>
+struct DummyGenerator {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  T operator()(const array <Index, NumDims>& dims) const {
+    T result = static_cast<T>(0);
+    for (int i = 0; i < NumDims; ++i) {
+      result += static_cast<T>((i + 1) * dims[i]);
+    }
+    return result;
+  }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_generator_op(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(20, 30);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto expr = src.generate(DummyGenerator<T, NumDims>());
+
+  // We assume that generator on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+    internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  Tensor <T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Reverse half of the dimensions.
+  Eigen::array<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+  const auto expr = src.reverse(reverse);
+
+  // We assume that reversing on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor <T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the reversing using configured tensor executor.
+  Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+    internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> src(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  src.setRandom();
+  const auto expr = src.square();
+
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+                                                 Vectorizable, Tiling>;
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
+  done.Wait();
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T square = src.coeff(i) * src.coeff(i);
+    VERIFY_IS_EQUAL(square, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> lhs(dims);
+  Tensor<T, NumDims, Options, Index> rhs(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+                                                 Vectorizable, Tiling>;
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
+  done.Wait();
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T sum = lhs.coeff(i) + rhs.coeff(i);
+    VERIFY_IS_EQUAL(sum, dst.coeff(i));
+  }
+}
+
+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
+#else
+#define VECTORIZABLE(VAL) VAL
+#endif
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                                 \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+
+// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+  Eigen::DefaultDevice default_device;
+  // Default device is unused in ASYNC tests.
+  EIGEN_UNUSED_VARIABLE(default_device);
+
+  const auto num_threads = internal::random<int>(20, 24);
+  Eigen::ThreadPool tp(num_threads);
+  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
+
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
+}

diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
new file mode 100644
index 0000000..27c2845
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_expr.cpp

@@ -0,0 +1,464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <numeric>
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_1d()
+{
+  Tensor<float, 1> vec1(6);
+  Tensor<float, 1, RowMajor> vec2(6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<Tensor<float, 1>> vec3(data3, 6);
+  vec3 = vec1.sqrt();
+  float data4[6];
+  TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6);
+  vec4 = vec2.square();
+  float data5[6];
+  TensorMap<Tensor<float, 1, RowMajor>> vec5(data5, 6);
+  vec5 = vec2.cube();
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+
+  VERIFY_IS_APPROX(vec5(0), 0.0f);
+  VERIFY_IS_APPROX(vec5(1), 1.0f);
+  VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f);
+
+  vec3 = vec1 + vec2;
+  VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
+  VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
+}
+
+static void test_2d()
+{
+  float data1[6];
+  TensorMap<Tensor<float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2, RowMajor>> mat2(data2, 2, 3);
+
+  mat1(0,0) = 0.0;
+  mat1(0,1) = 1.0;
+  mat1(0,2) = 2.0;
+  mat1(1,0) = 3.0;
+  mat1(1,1) = 4.0;
+  mat1(1,2) = 5.0;
+
+  mat2(0,0) = -0.0;
+  mat2(0,1) = -1.0;
+  mat2(0,2) = -2.0;
+  mat2(1,0) = -3.0;
+  mat2(1,1) = -4.0;
+  mat2(1,2) = -5.0;
+
+  Tensor<float, 2> mat3(2,3);
+  Tensor<float, 2, RowMajor> mat4(2,3);
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
+
+  VERIFY_IS_APPROX(mat3(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat3(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat3(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat3(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat3(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat3(1,2), 5.0f);
+
+  VERIFY_IS_APPROX(mat4(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat4(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat4(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat4(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat4(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat4(1,2), 5.0f);
+}
+
+static void test_3d()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3, RowMajor> mat2(2,3,7);
+
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  Tensor<float, 3> mat3(2,3,7);
+  mat3 = mat1 + mat1;
+  Tensor<float, 3, RowMajor> mat4(2,3,7);
+  mat4 = mat2 * 3.14f;
+  Tensor<float, 3> mat5(2,3,7);
+  mat5 = mat1.inverse().log();
+  Tensor<float, 3, RowMajor> mat6(2,3,7);
+  mat6 = mat2.pow(0.5f) * 3.14f;
+  Tensor<float, 3> mat7(2,3,7);
+  mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
+  Tensor<float, 3, RowMajor> mat8(2,3,7);
+  mat8 = (-mat2).exp() * 3.14f;
+  Tensor<float, 3, RowMajor> mat9(2,3,7);
+  mat9 = mat2 + 3.14f;
+  Tensor<float, 3, RowMajor> mat10(2,3,7);
+  mat10 = mat2 - 3.14f;
+  Tensor<float, 3, RowMajor> mat11(2,3,7);
+  mat11 = mat2 / 3.14f;
+
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), val + val);
+        VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f);
+        VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val));
+        VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
+        VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
+        VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
+        VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
+        VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
+        VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_constants()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  mat2 = mat1.constant(3.14f);
+  mat3 = mat1.cwiseMax(7.3f).exp();
+
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
+        VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_boolean()
+{
+  const int kSize = 31;
+  Tensor<int, 1> vec(kSize);
+  std::iota(vec.data(), vec.data() + kSize, 0);
+
+  // Test ||.
+  Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = i < 1 || i > 4;
+    VERIFY_IS_EQUAL(bool1[i], expected);
+  }
+
+  // Test &&, including cast of operand vec.
+  Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = bool(i) && i < 4;
+    VERIFY_IS_EQUAL(bool2[i], expected);
+  }
+
+  // Compilation tests:
+  // Test Tensor<bool> against results of cast or comparison; verifies that
+  // CoeffReturnType is set to match Op return type of bool for Unary and Binary
+  // Ops.
+  Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2;
+  bool3 = vec < vec.constant(4) && bool2;
+}
+
+static void test_functors()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  mat2 = mat1.inverse().unaryExpr(&asinf);
+  mat3 = mat1.unaryExpr(&tanhf);
+
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k)));
+        VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k)));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_type_casting()
+{
+  Tensor<bool, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<double, 3> mat3(2,3,7);
+  mat1.setRandom();
+  mat2.setRandom();
+
+  mat3 = mat1.cast<double>();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0);
+      }
+    }
+  }
+
+  mat3 = mat2.cast<double>();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k)));
+      }
+    }
+  }
+}
+
+static void test_select()
+{
+  Tensor<float, 3> selector(2,3,7);
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> result(2,3,7);
+
+  selector.setRandom();
+  mat1.setRandom();
+  mat2.setRandom();
+  result = (selector > selector.constant(0.5f)).select(mat1, mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k));
+      }
+    }
+  }
+}
+
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+  for (int size = 1; size < 17; ++size) {
+    const Scalar kNaN = std::numeric_limits<Scalar>::quiet_NaN();
+    const Scalar kInf = std::numeric_limits<Scalar>::infinity();
+    const Scalar kZero(0);
+    Tensor<Scalar, 1> vec_full_nan(size);
+    Tensor<Scalar, 1> vec_one_nan(size);
+    Tensor<Scalar, 1> vec_zero(size);
+    vec_full_nan.setConstant(kNaN);
+    vec_zero.setZero();
+    vec_one_nan.setZero();
+    vec_one_nan(size/2) = kNaN;
+
+    auto verify_all_nan = [&](const Tensor<Scalar, 1>& v) {
+      for (int i = 0; i < size; ++i) {
+        VERIFY((numext::isnan)(v(i)));
+      }
+    };
+
+    auto verify_all_zero = [&](const Tensor<Scalar, 1>& v) {
+      for (int i = 0; i < size; ++i) {
+        VERIFY_IS_EQUAL(v(i), Scalar(0));
+      }
+    };
+
+    // Test NaN propagating max.
+    // max(nan, nan) = nan
+    // max(nan, 0) = nan
+    // max(0, nan) = nan
+    // max(0, 0) = 0
+    verify_all_nan(vec_full_nan.template cwiseMax<PropagateNaN>(kNaN));
+    verify_all_nan(vec_full_nan.template cwiseMax<PropagateNaN>(vec_full_nan));
+    verify_all_nan(vec_full_nan.template cwiseMax<PropagateNaN>(kZero));
+    verify_all_nan(vec_full_nan.template cwiseMax<PropagateNaN>(vec_zero));
+    verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(kNaN));
+    verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(vec_full_nan));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(kZero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(vec_zero));
+
+    // Test number propagating max.
+    // max(nan, nan) = nan
+    // max(nan, 0) = 0
+    // max(0, nan) = 0
+    // max(0, 0) = 0
+    verify_all_nan(vec_full_nan.template cwiseMax<PropagateNumbers>(kNaN));
+    verify_all_nan(vec_full_nan.template cwiseMax<PropagateNumbers>(vec_full_nan));
+    verify_all_zero(vec_full_nan.template cwiseMax<PropagateNumbers>(kZero));
+    verify_all_zero(vec_full_nan.template cwiseMax<PropagateNumbers>(vec_zero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kNaN));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_full_nan));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kZero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_zero));
+
+    // Test NaN propagating min.
+    // min(nan, nan) = nan
+    // min(nan, 0) = nan
+    // min(0, nan) = nan
+    // min(0, 0) = 0
+    verify_all_nan(vec_full_nan.template cwiseMin<PropagateNaN>(kNaN));
+    verify_all_nan(vec_full_nan.template cwiseMin<PropagateNaN>(vec_full_nan));
+    verify_all_nan(vec_full_nan.template cwiseMin<PropagateNaN>(kZero));
+    verify_all_nan(vec_full_nan.template cwiseMin<PropagateNaN>(vec_zero));
+    verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(kNaN));
+    verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(vec_full_nan));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(kZero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(vec_zero));
+
+    // Test number propagating min.
+    // min(nan, nan) = nan
+    // min(nan, 0) = 0
+    // min(0, nan) = 0
+    // min(0, 0) = 0
+    verify_all_nan(vec_full_nan.template cwiseMin<PropagateNumbers>(kNaN));
+    verify_all_nan(vec_full_nan.template cwiseMin<PropagateNumbers>(vec_full_nan));
+    verify_all_zero(vec_full_nan.template cwiseMin<PropagateNumbers>(kZero));
+    verify_all_zero(vec_full_nan.template cwiseMin<PropagateNumbers>(vec_zero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kNaN));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_full_nan));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kZero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_zero));
+
+    // Test min and max reduction
+    Tensor<Scalar, 0> val;
+    val = vec_zero.minimum();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template minimum<PropagateNaN>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.maximum();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template maximum<PropagateNaN>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kZero);
+
+    // Test NaN propagation for tensor of all NaNs.
+    val = vec_full_nan.template minimum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_full_nan.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kInf);
+    val = vec_full_nan.template maximum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_full_nan.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), -kInf);
+
+    // Test NaN propagation for tensor with a single NaN.
+    val = vec_one_nan.template minimum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_one_nan.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), (size == 1 ? kInf : kZero));
+    val = vec_one_nan.template maximum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_one_nan.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), (size == 1 ? -kInf : kZero));
+  }
+}
+
+static void test_clip()
+{
+  Tensor<float, 1> vec(6);
+  vec(0) = 4.0;
+  vec(1) = 8.0;
+  vec(2) = 15.0;
+  vec(3) = 16.0;
+  vec(4) = 23.0;
+  vec(5) = 42.0;
+
+  float kMin = 20;
+  float kMax = 30;
+
+  Tensor<float, 1> vec_clipped(6);
+  vec_clipped = vec.clip(kMin, kMax);
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
+{
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_constants());
+  CALL_SUBTEST(test_boolean());
+  CALL_SUBTEST(test_functors());
+  CALL_SUBTEST(test_type_casting());
+  CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_clip());
+
+// Nan propagation does currently not work like one would expect from std::max/std::min,
+// so we disable it for now
+#if !EIGEN_ARCH_ARM_OR_ARM64
+  CALL_SUBTEST(test_minmax_nan_propagation());
+#endif
+}

diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
new file mode 100644
index 0000000..2e1008e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_fft.cpp

@@ -0,0 +1,304 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_fft_2D_golden() {
+  Tensor<float, 2, DataLayout> input(2, 3);
+  input(0, 0) = 1;
+  input(0, 1) = 2;
+  input(0, 2) = 3;
+  input(1, 0) = 4;
+  input(1, 1) = 5;
+  input(1, 2) = 6;
+
+  array<ptrdiff_t, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<float>, 2, DataLayout> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+
+  std::complex<float> output_golden[6]; // in ColMajor order
+  output_golden[0] = std::complex<float>(21, 0);
+  output_golden[1] = std::complex<float>(-9, 0);
+  output_golden[2] = std::complex<float>(-3, 1.73205);
+  output_golden[3] = std::complex<float>( 0, 0);
+  output_golden[4] = std::complex<float>(-3, -1.73205);
+  output_golden[5] = std::complex<float>(0 ,0);
+
+  std::complex<float> c_offset = std::complex<float>(1.0, 1.0);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_APPROX(output(0) + c_offset, output_golden[0] + c_offset);
+    VERIFY_IS_APPROX(output(1) + c_offset, output_golden[1] + c_offset);
+    VERIFY_IS_APPROX(output(2) + c_offset, output_golden[2] + c_offset);
+    VERIFY_IS_APPROX(output(3) + c_offset, output_golden[3] + c_offset);
+    VERIFY_IS_APPROX(output(4) + c_offset, output_golden[4] + c_offset);
+    VERIFY_IS_APPROX(output(5) + c_offset, output_golden[5] + c_offset);
+  }
+  else {
+    VERIFY_IS_APPROX(output(0)+ c_offset, output_golden[0]+ c_offset);
+    VERIFY_IS_APPROX(output(1)+ c_offset, output_golden[2]+ c_offset);
+    VERIFY_IS_APPROX(output(2)+ c_offset, output_golden[4]+ c_offset);
+    VERIFY_IS_APPROX(output(3)+ c_offset, output_golden[1]+ c_offset);
+    VERIFY_IS_APPROX(output(4)+ c_offset, output_golden[3]+ c_offset);
+    VERIFY_IS_APPROX(output(5)+ c_offset, output_golden[5]+ c_offset);
+  }
+}
+
+static void test_fft_complex_input_golden() {
+  Tensor<std::complex<float>, 1, ColMajor> input(5);
+  input(0) = std::complex<float>(1, 1);
+  input(1) = std::complex<float>(2, 2);
+  input(2) = std::complex<float>(3, 3);
+  input(3) = std::complex<float>(4, 4);
+  input(4) = std::complex<float>(5, 5);
+
+  array<ptrdiff_t, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+  forward_golden_result[0] = std::complex<float>(15.000000000000000,+15.000000000000000);
+  forward_golden_result[1] = std::complex<float>(-5.940954801177935, +0.940954801177934);
+  forward_golden_result[2] = std::complex<float>(-3.312299240582266, -1.687700759417735);
+  forward_golden_result[3] = std::complex<float>(-1.687700759417735, -3.312299240582266);
+  forward_golden_result[4] = std::complex<float>( 0.940954801177934, -5.940954801177935);
+
+  reverse_golden_result[0] = std::complex<float>( 3.000000000000000, + 3.000000000000000);
+  reverse_golden_result[1] = std::complex<float>( 0.188190960235587, - 1.188190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.337540151883547, - 0.662459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.662459848116453, - 0.337540151883547);
+  reverse_golden_result[4] = std::complex<float>(-1.188190960235587, + 0.188190960235587);
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i), forward_golden_result[i]);
+    VERIFY_IS_APPROX(forward_output_real_part(i), forward_golden_result[i].real());
+    VERIFY_IS_APPROX(forward_output_imag_part(i), forward_golden_result[i].imag());
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i), reverse_golden_result[i]);
+    VERIFY_IS_APPROX(reverse_output_real_part(i), reverse_golden_result[i].real());
+    VERIFY_IS_APPROX(reverse_output_imag_part(i), reverse_golden_result[i].imag());
+  }
+}
+
+static void test_fft_real_input_golden() {
+  Tensor<float, 1, ColMajor> input(5);
+  input(0) = 1.0;
+  input(1) = 2.0;
+  input(2) = 3.0;
+  input(3) = 4.0;
+  input(4) = 5.0;
+
+  array<ptrdiff_t, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+
+  forward_golden_result[0] = std::complex<float>(  15, 0);
+  forward_golden_result[1] = std::complex<float>(-2.5, +3.44095480117793);
+  forward_golden_result[2] = std::complex<float>(-2.5, +0.81229924058227);
+  forward_golden_result[3] = std::complex<float>(-2.5, -0.81229924058227);
+  forward_golden_result[4] = std::complex<float>(-2.5, -3.44095480117793);
+
+  reverse_golden_result[0] = std::complex<float>( 3.0, 0);
+  reverse_golden_result[1] = std::complex<float>(-0.5, -0.688190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.5, -0.162459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.5, +0.162459848116453);
+  reverse_golden_result[4] = std::complex<float>(-0.5, +0.688190960235587);
+
+  std::complex<float> c_offset(1.0, 1.0);
+  float r_offset = 1.0;
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i) + c_offset, forward_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(forward_output_real_part(i)  + r_offset, forward_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(forward_output_imag_part(i)  + r_offset, forward_golden_result[i].imag() + r_offset);
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i) + c_offset, reverse_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(reverse_output_real_part(i)  + r_offset, reverse_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(reverse_output_imag_part(i)  + r_offset, reverse_golden_result[i].imag() + r_offset);
+  }
+}
+
+
+template <int DataLayout, typename RealScalar, bool isComplexInput, int FFTResultType, int FFTDirection, int TensorRank>
+static void test_fft_real_input_energy() {
+
+  Eigen::DSizes<ptrdiff_t, TensorRank> dimensions;
+  ptrdiff_t total_size = 1;
+  for (int i = 0; i < TensorRank; ++i) {
+    dimensions[i] = rand() % 20 + 1;
+    total_size *= dimensions[i];
+  }
+  const DSizes<ptrdiff_t, TensorRank> arr = dimensions;
+
+  typedef typename internal::conditional<isComplexInput == true, std::complex<RealScalar>, RealScalar>::type InputScalar;
+
+  Tensor<InputScalar, TensorRank, DataLayout> input;
+  input.resize(arr);
+  input.setRandom();
+
+  array<ptrdiff_t, TensorRank> fft;
+  for (int i = 0; i < TensorRank; ++i) {
+    fft[i] = i;
+  }
+
+  typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar;
+  Tensor<OutputScalar, TensorRank, DataLayout> output;
+  output = input.template fft<FFTResultType, FFTDirection>(fft);
+
+  for (int i = 0; i < TensorRank; ++i) {
+    VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i));
+  }
+
+  RealScalar energy_original = 0.0;
+  RealScalar energy_after_fft = 0.0;
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_original += numext::abs2(input(i));
+  }
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_after_fft += numext::abs2(output(i));
+  }
+
+  if(FFTDirection == FFT_FORWARD) {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft / total_size);
+  }
+  else {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft * total_size);
+  }
+}
+
+template <typename RealScalar>
+static void test_fft_non_power_of_2_round_trip(int exponent) {
+  int n = (1 << exponent) + 1;
+
+  Eigen::DSizes<ptrdiff_t, 1> dimensions;
+  dimensions[0] = n;
+  const DSizes<ptrdiff_t, 1> arr = dimensions;
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> input;
+
+  input.resize(arr);
+  input.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
+      input.template fft<BothParts, FFT_FORWARD>(fft);
+
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> output =
+      forward.template fft<RealPart, FFT_REVERSE>(fft);
+
+  for (int i = 0; i < n; ++i) {
+    RealScalar tol = test_precision<RealScalar>() *
+                     (std::abs(input[i]) + std::abs(output[i]) + 1);
+    VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
+    test_fft_complex_input_golden();
+    test_fft_real_input_golden();
+
+    test_fft_2D_golden<ColMajor>();
+    test_fft_2D_golden<RowMajor>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+
+    test_fft_non_power_of_2_round_trip<float>(7);
+    test_fft_non_power_of_2_round_trip<double>(7);
+}

diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
new file mode 100644
index 0000000..456ce6b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp

@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+
+static void test_0d()
+{
+  TensorFixedSize<float, Sizes<> > scalar1;
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);
+
+  scalar1() = 7.0;
+  scalar2() = 13.0;
+
+  // Test against shallow copy.
+  TensorFixedSize<float, Sizes<> > copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+  copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+
+  TensorFixedSize<float, Sizes<> > scalar3 = scalar1.sqrt();
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar4 = scalar2.sqrt();
+  VERIFY_IS_EQUAL(scalar3.rank(), 0);
+  VERIFY_IS_APPROX(scalar3(), sqrtf(7.0));
+  VERIFY_IS_APPROX(scalar4(), sqrtf(13.0));
+
+  scalar3 = scalar1 + scalar2;
+  VERIFY_IS_APPROX(scalar3(), 7.0f + 13.0f);
+}
+
+static void test_1d()
+{
+  TensorFixedSize<float, Sizes<6> > vec1;
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
+
+  VERIFY_IS_EQUAL((vec1.size()), 6);
+  //  VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
+  //  VERIFY_IS_EQUAL((vec1.dimension(0)), 6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  // Test against shallow copy.
+  TensorFixedSize<float, Sizes<6> > copy = vec1;
+  VERIFY_IS_NOT_EQUAL(vec1.data(), copy.data());
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec1(i), copy(i));
+  }
+  copy = vec1;
+  VERIFY_IS_NOT_EQUAL(vec1.data(), copy.data());
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec1(i), copy(i));
+  }
+
+  TensorFixedSize<float, Sizes<6> > vec3 = vec1.sqrt();
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec4 = vec2.sqrt();
+
+  VERIFY_IS_EQUAL((vec3.size()), 6);
+  VERIFY_IS_EQUAL(vec3.rank(), 1);
+  //  VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6);
+  //  VERIFY_IS_EQUAL((vec3.dimension(0)), 6);
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), sqrtf(0.0));
+  VERIFY_IS_APPROX(vec4(1), sqrtf(1.0));
+  VERIFY_IS_APPROX(vec4(2), sqrtf(2.0));
+  VERIFY_IS_APPROX(vec4(3), sqrtf(3.0));
+  VERIFY_IS_APPROX(vec4(4), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec4(5), sqrtf(5.0));
+
+  vec3 = vec1 + vec2;
+  VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
+  VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
+}
+
+static void test_tensor_map()
+{
+  TensorFixedSize<float, Sizes<6> > vec1;
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6);
+  vec3 = vec1.sqrt() + vec2;
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0) + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0) + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0) + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0) + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0) + 5.0f);
+}
+
+static void test_2d()
+{
+  float data1[6];
+  TensorMap<TensorFixedSize<float, Sizes<2, 3> > > mat1(data1,2,3);
+  float data2[6];
+  TensorMap<TensorFixedSize<float, Sizes<2, 3>, RowMajor> > mat2(data2,2,3);
+
+  VERIFY_IS_EQUAL((mat1.size()), 2*3);
+  VERIFY_IS_EQUAL(mat1.rank(), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
+
+  mat1(0,0) = 0.0;
+  mat1(0,1) = 1.0;
+  mat1(0,2) = 2.0;
+  mat1(1,0) = 3.0;
+  mat1(1,1) = 4.0;
+  mat1(1,2) = 5.0;
+
+  mat2(0,0) = -0.0;
+  mat2(0,1) = -1.0;
+  mat2(0,2) = -2.0;
+  mat2(1,0) = -3.0;
+  mat2(1,1) = -4.0;
+  mat2(1,2) = -5.0;
+
+  TensorFixedSize<float, Sizes<2, 3> > mat3;
+  TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4;
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
+
+  VERIFY_IS_EQUAL((mat3.size()), 2*3);
+    //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
+    //  VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
+
+  VERIFY_IS_APPROX(mat3(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat3(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat3(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat3(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat3(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat3(1,2), 5.0f);
+
+  VERIFY_IS_APPROX(mat4(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat4(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat4(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat4(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat4(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat4(1,2), 5.0f);
+}
+
+static void test_3d()
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat2;
+
+  VERIFY_IS_EQUAL((mat1.size()), 2*3*7);
+  VERIFY_IS_EQUAL(mat1.rank(), 3);
+  //  VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
+  //  VERIFY_IS_EQUAL((mat1.dimension(2)), 7);
+
+  float val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.sqrt();
+  TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4;
+  mat4 = mat2.sqrt();
+
+  VERIFY_IS_EQUAL((mat3.size()), 2*3*7);
+  //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
+  //  VERIFY_IS_EQUAL((mat3.dimension(2)), 7);
+
+
+  val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val));
+        VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+
+static void test_array()
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  float val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.pow(3.5f);
+
+  val = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_tensor_map());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_array());
+}

diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
new file mode 100644
index 0000000..a21a02b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp

@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::MatrixXf;
+using Eigen::Tensor;
+
+static void test_simple()
+{
+  MatrixXf m1(3,3);
+  MatrixXf m2(3,3);
+  m1.setRandom();
+  m2.setRandom();
+
+  TensorMap<Tensor<float, 2> > mat1(m1.data(), 3,3);
+  TensorMap<Tensor<float, 2> > mat2(m2.data(), 3,3);
+
+  Tensor<float, 2> mat3(3,3);
+  mat3 = mat1;
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+
+  mat3 = mat3.contract(mat2, dims).eval();
+
+  VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0));
+  VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1));
+  VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2));
+  VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0));
+  VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1));
+  VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2));
+  VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0));
+  VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1));
+  VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2));
+}
+
+
+static void test_const()
+{
+  MatrixXf input(3,3);
+  input.setRandom();
+  MatrixXf output = input;
+  output.rowwise() -= input.colwise().maxCoeff();
+
+  Eigen::array<int, 1> depth_dim;
+  depth_dim[0] = 0;
+  Tensor<float, 2>::Dimensions dims2d;
+  dims2d[0] = 1;
+  dims2d[1] = 3;
+  Eigen::array<int, 2> bcast;
+  bcast[0] = 3;
+  bcast[1] = 1;
+  const TensorMap<const Tensor<float, 2> > input_tensor(input.data(), 3, 3);
+  Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(output(i, j), output_tensor(i, j));
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval)
+{
+  CALL_SUBTEST(test_simple());
+  CALL_SUBTEST(test_const());
+}

diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
new file mode 100644
index 0000000..a55a5ad
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp

@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+template <typename DataType, int DataLayout, typename IndexType>
+void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
+
+  IndexType sizeDim1 = 100;
+  IndexType sizeDim2 = 20;
+  IndexType sizeDim3 = 20;
+  Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f));
+  in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f));
+
+  // creating TensorMap from tensor
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
+  /// c=(a+b)*b
+  gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k),
+                         (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
+      }
+    }
+  }
+  printf("(a+b)*b Test Passed\n");
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+
+}
+
+template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
+    CALL_SUBTEST(tensorForced_evalperDevice<half>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
new file mode 100644
index 0000000..6dcf676
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator.cpp

@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <int DataLayout>
+static void test_1D()
+{
+  Tensor<float, 1> vec(6);
+  Tensor<float, 1> result = vec.generate(Generator1D());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <int DataLayout>
+static void test_2D()
+{
+  Tensor<float, 2> matrix(512, 512);
+  Tensor<float, 2> result = matrix.generate(Generator2D());
+
+  for (int i = 0; i < 512; ++i) {
+    for (int j = 0; j < 512; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_gaussian()
+{
+  int rows = 32;
+  int cols = 48;
+  array<float, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<float, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<float, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  Tensor<float, 2> matrix(rows, cols);
+  Tensor<float, 2> result = matrix.generate(gaussian_gen);
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      float g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      float g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      float gaussian = expf(-g_rows - g_cols);
+      VERIFY_IS_EQUAL(result(i, j), gaussian);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_generator)
+{
+  CALL_SUBTEST(test_1D<ColMajor>());
+  CALL_SUBTEST(test_1D<RowMajor>());
+  CALL_SUBTEST(test_2D<ColMajor>());
+  CALL_SUBTEST(test_2D<RowMajor>());
+  CALL_SUBTEST(test_gaussian<ColMajor>());
+  CALL_SUBTEST(test_gaussian<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
new file mode 100644
index 0000000..fb6e3d9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp

@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold =1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 6;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> vec(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =vec.size()*sizeof(DataType);
+  DataType* gpu_data_vec  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 7;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 5; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType rows = 32;
+  IndexType cols = 48;
+  array<DataType, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<DataType, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  array<IndexType, 2> tensorRange = {{rows, cols}};
+  Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      Eigen::internal::isApprox(result(i, j), gaussian, error_threshold);
+    }
+  }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
new file mode 100644
index 0000000..137d0d5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_gpu.cu

@@ -0,0 +1,1643 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#define EIGEN_GPU_TEST_C99_MATH  EIGEN_HAS_CXX11
+
+using Eigen::Tensor;
+
+void test_gpu_nullary() {
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  gpuMalloc((void**)(&d_in1), tensor_bytes);
+  gpuMalloc((void**)(&d_in2), tensor_bytes);
+  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+  }
+
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+}
+
+void test_gpu_elementwise_small() {
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(
+        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
+  }
+
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+  gpuFree(d_out);
+}
+
+void test_gpu_elementwise()
+{
+  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  in1.setRandom();
+  in2.setRandom();
+  in3.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t in3_bytes = in3.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_in3;
+  float* d_out;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_in3), in3_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      for (int k = 0; k < 97; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
+      }
+    }
+  }
+
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+  gpuFree(d_in3);
+  gpuFree(d_out);
+}
+
+void test_gpu_props() {
+  Tensor<float, 1> in1(200);
+  Tensor<bool, 1> out(200);
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(bool);
+
+  float* d_in1;
+  bool* d_out;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, 200);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
+      d_out, 200);
+
+  gpu_out.device(gpu_device) = (gpu_in1.isnan)();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 200; ++i) {
+    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
+  }
+
+  gpuFree(d_in1);
+  gpuFree(d_out);
+}
+
+void test_gpu_reduction()
+{
+  Tensor<float, 4> in1(72,53,97,113);
+  Tensor<float, 2> out(72,97);
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_out;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  array<Eigen::DenseIndex, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = 0;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected =
+              std::max<float>(expected, in1(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i,j), expected);
+    }
+  }
+
+  gpuFree(d_in1);
+  gpuFree(d_out);
+}
+
+template<int DataLayout>
+void test_gpu_contraction()
+{
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
+
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
+
+  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+  MapXf m_left(t_left.data(), 300, 93);
+  MapXf m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+
+  m_result = m_left * m_right;
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+  gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
+
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  gpuFree(d_t_left);
+  gpuFree(d_t_right);
+  gpuFree(d_t_result);
+}
+
+template<int DataLayout>
+void test_gpu_convolution_1d()
+{
+  Tensor<float, 4, DataLayout> input(74,37,11,137);
+  Tensor<float, 1, DataLayout> kernel(4);
+  Tensor<float, 4, DataLayout> out(74,34,11,137);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(1);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 34; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
+                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
+}
+
+void test_gpu_convolution_inner_dim_col_major_1d()
+{
+  Tensor<float, 4, ColMajor> input(74,9,11,7);
+  Tensor<float, 1, ColMajor> kernel(4);
+  Tensor<float, 4, ColMajor> out(71,9,11,7);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(0);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 71; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
+                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
+}
+
+void test_gpu_convolution_inner_dim_row_major_1d()
+{
+  Tensor<float, 4, RowMajor> input(7,9,11,74);
+  Tensor<float, 1, RowMajor> kernel(4);
+  Tensor<float, 4, RowMajor> out(7,9,11,71);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 71; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
+                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
+}
+
+template<int DataLayout>
+void test_gpu_convolution_2d()
+{
+  Tensor<float, 4, DataLayout> input(74,37,11,137);
+  Tensor<float, 2, DataLayout> kernel(3,4);
+  Tensor<float, 4, DataLayout> out(74,35,8,137);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
+
+  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+                                 input(i,j+1,k+0,l) * kernel(1,0) +
+                                 input(i,j+2,k+0,l) * kernel(2,0) +
+                                 input(i,j+0,k+1,l) * kernel(0,1) +
+                                 input(i,j+1,k+1,l) * kernel(1,1) +
+                                 input(i,j+2,k+1,l) * kernel(2,1) +
+                                 input(i,j+0,k+2,l) * kernel(0,2) +
+                                 input(i,j+1,k+2,l) * kernel(1,2) +
+                                 input(i,j+2,k+2,l) * kernel(2,2) +
+                                 input(i,j+0,k+3,l) * kernel(0,3) +
+                                 input(i,j+1,k+3,l) * kernel(1,3) +
+                                 input(i,j+2,k+3,l) * kernel(2,3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
+}
+
+template<int DataLayout>
+void test_gpu_convolution_3d()
+{
+  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
+  Tensor<float, 3, DataLayout> kernel(3,4,2);
+  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;    
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
+
+  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 136; ++l) {
+          for (int m = 0; m < 17; ++m) {
+            const float result = out(i,j,k,l,m);
+            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
+                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
+                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
+                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
+                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
+                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
+                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
+                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
+                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
+                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
+                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
+                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
+                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
+                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
+                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
+                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
+                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
+                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
+                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
+                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
+                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
+                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
+                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
+                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+  }
+
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
+}
+
+
+#if EIGEN_GPU_TEST_C99_MATH
+template <typename Scalar>
+void test_gpu_lgamma(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.lgamma();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
+    }
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+#endif
+
+template <typename Scalar>
+void test_gpu_digamma()
+{
+  Tensor<Scalar, 1> in(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in(0) = Scalar(1);
+  in(1) = Scalar(1.5);
+  in(2) = Scalar(4);
+  in(3) = Scalar(-10.5);
+  in(4) = Scalar(10000.5);
+  in(5) = Scalar(0);
+  in(6) = Scalar(-1);
+
+  expected_out(0) = Scalar(-0.5772156649015329);
+  expected_out(1) = Scalar(0.03648997397857645);
+  expected_out(2) = Scalar(1.2561176684318);
+  expected_out(3) = Scalar(2.398239129535781);
+  expected_out(4) = Scalar(9.210340372392849);
+  expected_out(5) = std::numeric_limits<Scalar>::infinity();
+  expected_out(6) = std::numeric_limits<Scalar>::infinity();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in.digamma();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+  for (int i = 5; i < 7; ++i) {
+    VERIFY_IS_EQUAL(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_zeta()
+{
+  Tensor<Scalar, 1> in_x(6);
+  Tensor<Scalar, 1> in_q(6);
+  Tensor<Scalar, 1> out(6);
+  Tensor<Scalar, 1> expected_out(6);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(1.5);
+  in_x(2) = Scalar(4);
+  in_x(3) = Scalar(-10.5);
+  in_x(4) = Scalar(10000.5);
+  in_x(5) = Scalar(3);
+  
+  in_q(0) = Scalar(1.2345);
+  in_q(1) = Scalar(2);
+  in_q(2) = Scalar(1.5);
+  in_q(3) = Scalar(3);
+  in_q(4) = Scalar(1.0001);
+  in_q(5) = Scalar(-2.5);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = Scalar(1.61237534869);
+  expected_out(2) = Scalar(0.234848505667);
+  expected_out(3) = Scalar(1.03086757337e-5);
+  expected_out(4) = Scalar(0.367879440865);
+  expected_out(5) = Scalar(0.054102025820864097);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_q;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_q), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
+  
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_in_q);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_polygamma()
+{
+  Tensor<Scalar, 1> in_x(7);
+  Tensor<Scalar, 1> in_n(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in_n(0) = Scalar(1);
+  in_n(1) = Scalar(1);
+  in_n(2) = Scalar(1);
+  in_n(3) = Scalar(17);
+  in_n(4) = Scalar(31);
+  in_n(5) = Scalar(28);
+  in_n(6) = Scalar(8);
+  
+  in_x(0) = Scalar(2);
+  in_x(1) = Scalar(3);
+  in_x(2) = Scalar(25.5);
+  in_x(3) = Scalar(4.7);
+  in_x(4) = Scalar(11.8);
+  in_x(5) = Scalar(17.7);
+  in_x(6) = Scalar(30.2);
+
+  expected_out(0) = Scalar(0.644934066848);
+  expected_out(1) = Scalar(0.394934066848);
+  expected_out(2) = Scalar(0.0399946696496);
+  expected_out(3) = Scalar(293.334565435);
+  expected_out(4) = Scalar(0.445487887616);
+  expected_out(5) = Scalar(-2.47810300902e-07);
+  expected_out(6) = Scalar(-8.29668781082e-09);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_n;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_n), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
+  
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 7; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_in_n);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_igamma()
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[j];
+    }
+  }
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                          {0.0, 0.6321205588285578, 0.7768698398515702,
+                           0.9816843611112658, 9.999500016666262e-05, 1.0},
+                          {0.0, 0.4275932955291202, 0.608374823728911,
+                           0.9539882943107686, 7.522076445089201e-07, 1.0},
+                          {0.0, 0.01898815687615381, 0.06564245437845008,
+                           0.5665298796332909, 4.166333347221828e-18, 1.0},
+                          {0.0, 0.9999780593618628, 0.9999899967080838,
+                           0.9999996219837988, 0.9991370418689945, 1.0},
+                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+
+
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
+
+  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igamma_s[i][j])) {
+        VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+      }
+    }
+  }
+
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_igammac()
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[j];
+    }
+  }
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                           {1.0, 0.36787944117144233, 0.22313016014842982,
+                            0.018315638888734182, 0.9999000049998333, 0.0},
+                           {1.0, 0.5724067044708798, 0.3916251762710878,
+                            0.04601170568923136, 0.9999992477923555, 0.0},
+                           {1.0, 0.9810118431238462, 0.9343575456215499,
+                            0.4334701203667089, 1.0, 0.0},
+                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                            3.7801620118431334e-07, 0.0008629581310054535,
+                            0.0},
+                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_a), bytes);
+  gpuMalloc((void**)(&d_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igammac_s[i][j])) {
+        VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+      }
+    }
+  }
+
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
+}
+
+#if EIGEN_GPU_TEST_C99_MATH
+template <typename Scalar>
+void test_gpu_erf(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
+
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.erf();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
+    }
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_erfc(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.erfc();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
+    }
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+#endif
+template <typename Scalar>
+void test_gpu_ndtri()
+{
+  Tensor<Scalar, 1> in_x(8);
+  Tensor<Scalar, 1> out(8);
+  Tensor<Scalar, 1> expected_out(8);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(0.);
+  in_x(2) = Scalar(0.5);
+  in_x(3) = Scalar(0.2);
+  in_x(4) = Scalar(0.8);
+  in_x(5) = Scalar(0.9);
+  in_x(6) = Scalar(0.1);
+  in_x(7) = Scalar(0.99);
+  in_x(8) = Scalar(0.01);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+  expected_out(2) = Scalar(0.0);
+  expected_out(3) = Scalar(-0.8416212335729142);
+  expected_out(4) = Scalar(0.8416212335729142);
+  expected_out(5) = Scalar(1.2815515655446004);
+  expected_out(6) = Scalar(-1.2815515655446004);
+  expected_out(7) = Scalar(2.3263478740408408);
+  expected_out(8) = Scalar(-2.3263478740408408);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_betainc()
+{
+  Tensor<Scalar, 1> in_x(125);
+  Tensor<Scalar, 1> in_a(125);
+  Tensor<Scalar, 1> in_b(125);
+  Tensor<Scalar, 1> out(125);
+  Tensor<Scalar, 1> expected_out(125);
+  out.setZero();
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Array<Scalar, 1, Dynamic> x(125);
+  Array<Scalar, 1, Dynamic> a(125);
+  Array<Scalar, 1, Dynamic> b(125);
+  Array<Scalar, 1, Dynamic> v(125);
+
+  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999;
+
+  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+      0.5000000000004947, 0.9999999999999999, nan;
+
+  for (int i = 0; i < 125; ++i) {
+    in_x(i) = x(i);
+    in_a(i) = a(i);
+    in_b(i) = b(i);
+    expected_out(i) = v(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_a;
+  Scalar* d_in_b;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_a), bytes);
+  gpuMalloc((void**)(&d_in_b), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 1; i < 125; ++i) {
+    if ((std::isnan)(expected_out(i))) {
+      VERIFY((std::isnan)(out(i)));
+    } else {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_in_a);
+  gpuFree(d_in_b);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i0e()
+{
+  Tensor<Scalar, 1> in_x(21);
+  Tensor<Scalar, 1> out(21);
+  Tensor<Scalar, 1> expected_out(21);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_x_array(21);
+  Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+  expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
+      0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+      0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+      0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+      0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+      0.0897803118848;
+
+  for (int i = 0; i < 21; ++i) {
+    in_x(i) = in_x_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+  gpu_out.device(gpu_device) = gpu_in.bessel_i0e();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 21; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i1e()
+{
+  Tensor<Scalar, 1> in_x(21);
+  Tensor<Scalar, 1> out(21);
+  Tensor<Scalar, 1> expected_out(21);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_x_array(21);
+  Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+  expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
+      -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+      -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+      0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+      0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+      0.0875062221833;
+
+  for (int i = 0; i < 21; ++i) {
+    in_x(i) = in_x_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+  gpu_out.device(gpu_device) = gpu_in.bessel_i1e();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 21; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_igamma_der_a()
+{
+  Tensor<Scalar, 1> in_x(30);
+  Tensor<Scalar, 1> in_a(30);
+  Tensor<Scalar, 1> out(30);
+  Tensor<Scalar, 1> expected_out(30);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_a_array(30);
+  Array<Scalar, 1, Dynamic> in_x_array(30);
+  Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+  // See special_functions.cpp for the Python code that generates the test data.
+
+  in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+      1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+      100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+  in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+  expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
+      -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
+      -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
+      -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
+      -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
+      -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
+      -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
+      -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+  for (int i = 0; i < 30; ++i) {
+    in_x(i) = in_x_array(i);
+    in_a(i) = in_a_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_a), bytes);
+  gpuMalloc((void**)(&d_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 30; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_gamma_sample_der_alpha()
+{
+  Tensor<Scalar, 1> in_alpha(30);
+  Tensor<Scalar, 1> in_sample(30);
+  Tensor<Scalar, 1> out(30);
+  Tensor<Scalar, 1> expected_out(30);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_alpha_array(30);
+  Array<Scalar, 1, Dynamic> in_sample_array(30);
+  Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+  // See special_functions.cpp for the Python code that generates the test data.
+
+  in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
+      1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
+      100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+  in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+  expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+      1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+      0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+      1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+      0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+      1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+      0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+      1.00106492525, 0.97734200649, 1.02198794179;
+
+  for (int i = 0; i < 30; ++i) {
+    in_alpha(i) = in_alpha_array(i);
+    in_sample(i) = in_sample_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_alpha.size() * sizeof(Scalar);
+
+  Scalar* d_alpha;
+  Scalar* d_sample;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_alpha), bytes);
+  gpuMalloc((void**)(&d_sample), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+  gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 30; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_alpha);
+  gpuFree(d_sample);
+  gpuFree(d_out);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_elementwise_small());
+  CALL_SUBTEST_1(test_gpu_elementwise());
+  CALL_SUBTEST_1(test_gpu_props());
+  CALL_SUBTEST_1(test_gpu_reduction());
+  CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
+  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
+  CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+// they hang..need to investigate and fix
+  CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
+#endif
+
+#if EIGEN_GPU_TEST_C99_MATH
+  // std::erf, std::erfc, and so on where only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
+  CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
+
+  CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
+  // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+
+  CALL_SUBTEST_5(test_gpu_ndtri<float>());
+  CALL_SUBTEST_5(test_gpu_ndtri<double>());
+
+  CALL_SUBTEST_5(test_gpu_digamma<float>());
+  CALL_SUBTEST_5(test_gpu_digamma<double>());
+
+  CALL_SUBTEST_5(test_gpu_polygamma<float>());
+  CALL_SUBTEST_5(test_gpu_polygamma<double>());
+
+  CALL_SUBTEST_5(test_gpu_zeta<float>());
+  CALL_SUBTEST_5(test_gpu_zeta<double>());
+#endif
+
+  CALL_SUBTEST_5(test_gpu_igamma<float>());
+  CALL_SUBTEST_5(test_gpu_igammac<float>());
+
+  CALL_SUBTEST_5(test_gpu_igamma<double>());
+  CALL_SUBTEST_5(test_gpu_igammac<double>());
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+  CALL_SUBTEST_6(test_gpu_betainc<float>());
+  CALL_SUBTEST_6(test_gpu_betainc<double>());
+
+  CALL_SUBTEST_6(test_gpu_i0e<float>());
+  CALL_SUBTEST_6(test_gpu_i0e<double>());
+
+  CALL_SUBTEST_6(test_gpu_i1e<float>());
+  CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+  CALL_SUBTEST_6(test_gpu_i1e<float>());
+  CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+  CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
+  CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
+
+  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
+  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
+#endif
+
+#endif
+}

diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
new file mode 100644
index 0000000..c20edd9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_ifft.cpp

@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <complex>
+#include <cmath>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_1D_fft_ifft_invariant(int sequence_length) {
+  Tensor<double, 1, DataLayout> tensor(sequence_length);
+  tensor.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), sequence_length);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), sequence_length);
+
+  for (int i = 0; i < sequence_length; ++i) {
+    VERIFY_IS_APPROX(static_cast<float>(tensor(i)), static_cast<float>(std::real(tensor_after_fft_ifft(i))));
+  }
+}
+
+template <int DataLayout>
+static void test_2D_fft_ifft_invariant(int dim0, int dim1) {
+  Tensor<double, 2, DataLayout> tensor(dim0, dim1);
+  tensor.setRandom();
+
+  array<int, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      //std::cout << "[" << i << "][" << j << "]" <<  "  Original data: " << tensor(i,j) << " Transformed data:" << tensor_after_fft_ifft(i,j) << std::endl;
+      VERIFY_IS_APPROX(static_cast<float>(tensor(i,j)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j))));
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_3D_fft_ifft_invariant(int dim0, int dim1, int dim2) {
+  Tensor<double, 3, DataLayout> tensor(dim0, dim1, dim2);
+  tensor.setRandom();
+
+  array<int, 3> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+  fft[2] = 2;
+
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j,k))));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3) {
+  Tensor<double, 4, DataLayout> tensor(dim0, dim1, dim2, dim3);
+  tensor.setRandom();
+
+  array<int, 2> fft;
+  fft[0] = 2;
+  fft[1] = 0;
+
+  Tensor<std::complex<double>, 4, DataLayout> tensor_after_fft;
+  Tensor<double, 4, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::RealPart, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(3), dim3);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(3), dim3);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        for (int l = 0; l < dim3; ++l) {
+          VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k,l)), static_cast<float>(tensor_after_fft_ifft(i,j,k,l)));
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_ifft) {
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(1024*1024));
+
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(4,4));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(8,16));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(16,32));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(1024,1024));
+
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(4,4,4));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(8,16,32));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(16,4,8));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(256,256,256));
+
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(4,4,4,4));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(8,16,32,64));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(16,4,8,12));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(64,64,64,64));
+}

diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
new file mode 100644
index 0000000..db1c020
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp

@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 245;
+  IndexType sizeDim2 = 343;
+  IndexType sizeDim3 = 577;
+
+  array<IndexType, 3> input_range ={{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}};
+
+  Tensor<DataType, 3,DataLayout, IndexType> tensor1(input_range);
+  Tensor<DataType, 3,DataLayout, IndexType> tensor2(input_range);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range);
+
+
+
+  typedef Eigen::DSizes<IndexType, 3> Index3;
+  Index3 strides1(1L,1L, 1L);
+  Index3 indicesStart1(1L, 0L, 0L);
+  Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3);
+
+  Index3 strides2(1L,1L, 1L);
+  Index3 indicesStart2(0L, 0L, 0L);
+  Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3);
+  Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1,sizeDim2,sizeDim3);
+
+  tensor1.setRandom();
+  tensor2.setRandom();
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType));
+  gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes);
+  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+  tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2);
+
+
+  for (IndexType i = 0; i <slice_range[0] ; ++i) {
+    for (IndexType j = 0; j < slice_range[1]; ++j) {
+      for (IndexType k = 0; k < slice_range[2]; ++k) {
+        VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) { 
+   CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+   CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
+#endif
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
new file mode 100644
index 0000000..862f1f7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp

@@ -0,0 +1,809 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_simple_patch()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  Tensor<float, 5> single_pixel_patch;
+  single_pixel_patch = tensor.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7);
+
+  // Single pixel patch: RowMajor
+  Tensor<float, 5, RowMajor> single_pixel_patch_row_major;
+  single_pixel_patch_row_major = tensor_row_major.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(4), 2);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    // ColMajor
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs " << single_pixel_patch.data()[i]
+           << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_pixel_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs "
+           << single_pixel_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i],
+                    single_pixel_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  Tensor<float, 5> entire_image_patch;
+  entire_image_patch = tensor.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7);
+
+  // Entire image patch: RowMajor
+  Tensor<float, 5, RowMajor> entire_image_patch_row_major;
+  entire_image_patch_row_major = tensor_row_major.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 5; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            for (int b = 0; b < 7; ++b) {
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+                expected = tensor(d, r-1+i, c-2+j, b);
+                expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+              }
+              // ColMajor
+              if (entire_image_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+                  expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+                              expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  Tensor<float, 5> twod_patch;
+  twod_patch = tensor.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  Tensor<float, 5, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            for (int b = 0; b < 7; ++b) {
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (twod_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with incrementing values.
+void test_patch_padding_valid()
+{
+  int input_depth = 3;
+  int input_rows = 3;
+  int input_cols = 3;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  // ColMajor
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) < input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) < input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with the same value.
+void test_patch_padding_valid_same_value()
+{
+  int input_depth = 1;
+  int input_rows = 5;
+  int input_cols = 5;
+  int input_batches = 2;
+  int ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  tensor = tensor.constant(11.0f);
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding.
+void test_patch_padding_same()
+{
+  int input_depth = 3;
+  int input_rows = 4;
+  int input_cols = 2;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be
+  // 0.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies that SAME padding, when computed as negative values, will be clipped
+// to zero.
+void test_patch_padding_same_negative_padding_clip_to_zero() {
+  int input_depth = 1;
+  int input_rows = 15;
+  int input_cols = 1;
+  int input_batches = 1;
+  int ksize = 1;  // Corresponds to the Rows and Cols for
+                  // tensor.extract_image_patches<>.
+  int row_stride = 5;
+  int col_stride = 1;
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(
+      ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME);
+  // row padding will be computed as -2 originally and then be clipped to 0.
+  VERIFY_IS_EQUAL(result.coeff(0), 1.0f);
+  VERIFY_IS_EQUAL(result.coeff(1), 6.0f);
+  VERIFY_IS_EQUAL(result.coeff(2), 11.0f);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);    // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);          // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);          // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 3);              // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major =
+      tensor_row_major.extract_image_patches(ksize, ksize, row_stride,
+                                             col_stride, 1, 1, PADDING_SAME);
+  VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f);
+  VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f);
+  VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f);
+
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+}
+
+void test_patch_no_extra_dim()
+{
+  Tensor<float, 3> tensor(2,3,5);
+  tensor.setRandom();
+  Tensor<float, 3, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  Tensor<float, 4> single_pixel_patch;
+  single_pixel_patch = tensor.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
+
+  // Single pixel patch: RowMajor
+  Tensor<float, 4, RowMajor> single_pixel_patch_row_major;
+  single_pixel_patch_row_major = tensor_row_major.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(3), 2);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    // ColMajor
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_pixel_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs "
+           << single_pixel_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i],
+                    single_pixel_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  Tensor<float, 4> entire_image_patch;
+  entire_image_patch = tensor.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
+
+  // Entire image patch: RowMajor
+  Tensor<float, 4, RowMajor> entire_image_patch_row_major;
+  entire_image_patch_row_major = tensor_row_major.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 5; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            float expected = 0.0f;
+            float expected_row_major = 0.0f;
+            if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+              expected = tensor(d, r-1+i, c-2+j);
+              expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+            }
+            // ColMajor
+            if (entire_image_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected);
+            // RowMajor
+            if (entire_image_patch_row_major(patchId, c, r, d) !=
+                expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+                            expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  Tensor<float, 4> twod_patch;
+  twod_patch = tensor.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
+
+  // 2D patch: RowMajor
+  Tensor<float, 4, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            float expected = 0.0f;
+            float expected_row_major = 0.0f;
+            int row_offset = r*stride + i - row_padding;
+            int col_offset = c*stride + j - col_padding;
+            // ColMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+              expected = tensor(d, row_offset, col_offset);
+            }
+            if (twod_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected);
+            // RowMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+              expected_row_major = tensor_row_major(col_offset, row_offset, d);
+            }
+            if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_imagenet_patches()
+{
+  // Test the code on typical configurations used by the 'imagenet' benchmarks at
+  // https://github.com/soumith/convnet-benchmarks
+  // ColMajor
+  Tensor<float, 4> l_in(3, 128, 128, 16);
+  l_in.setRandom();
+  Tensor<float, 5> l_out = l_in.extract_image_patches(11, 11);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 11);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 128*128);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 16);
+
+  // RowMajor
+  Tensor<float, 5, RowMajor> l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 3);
+
+  for (int b = 0; b < 16; ++b) {
+    for (int i = 0; i < 128; ++i) {
+      for (int j = 0; j < 128; ++j) {
+        int patchId = i+128*j;
+        for (int c = 0; c < 11; ++c) {
+          for (int r = 0; r < 11; ++r) {
+            for (int d = 0; d < 3; ++d) {
+              float expected = 0.0f;
+              if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+                expected = l_in(d, r-5+i, c-5+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) !=
+                  expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+                              expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(16, 64, 64, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(9, 9);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 9);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 64*64);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 64; ++i) {
+      for (int j = 0; j < 64; ++j) {
+        int patchId = i+64*j;
+        for (int c = 0; c < 9; ++c) {
+          for (int r = 0; r < 9; ++r) {
+            for (int d = 0; d < 16; ++d) {
+              float expected = 0.0f;
+              if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+                expected = l_in(d, r-4+i, c-4+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(32, 16, 16, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(7, 7);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 7);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 16*16);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 16; ++i) {
+      for (int j = 0; j < 16; ++j) {
+        int patchId = i+16*j;
+        for (int c = 0; c < 7; ++c) {
+          for (int r = 0; r < 7; ++r) {
+            for (int d = 0; d < 32; ++d) {
+              float expected = 0.0f;
+              if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+                expected = l_in(d, r-3+i, c-3+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(64, 13, 13, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(3, 3);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 64);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 13*13);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 13; ++i) {
+      for (int j = 0; j < 13; ++j) {
+        int patchId = i+13*j;
+        for (int c = 0; c < 3; ++c) {
+          for (int r = 0; r < 3; ++r) {
+            for (int d = 0; d < 64; ++d) {
+              float expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+                expected = l_in(d, r-1+i, c-1+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch)
+{
+  CALL_SUBTEST_1(test_simple_patch());
+  CALL_SUBTEST_2(test_patch_no_extra_dim());
+  CALL_SUBTEST_3(test_patch_padding_valid());
+  CALL_SUBTEST_4(test_patch_padding_valid_same_value());
+  CALL_SUBTEST_5(test_patch_padding_same());
+  CALL_SUBTEST_6(test_imagenet_patches());
+  CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero());
+}

diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
new file mode 100644
index 0000000..c1828a0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp

@@ -0,0 +1,1092 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+  gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7);
+
+  // Single pixel patch: RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2);
+
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    // ColMajor
+    if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+      std::cout << "Mismatch detected at index colmajor " << i << " : "
+           << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i]
+           << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index row major" << i << " : "
+           << tensor_row_major.data()[i] << " vs "
+           << single_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+                    single_patch_row_major.data()[i]);
+  }
+
+
+  // Entire image patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7);
+
+  // Entire image patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 3; ++r) {
+        for (IndexType c = 0; c < 5; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+                expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b);
+                expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+              }
+              // ColMajor
+              if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+                  expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+                              expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+              }
+              if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+
+}
+
+
+// Verifies VALID padding (no padding) with incrementing values.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 3;
+  IndexType input_cols = 3;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Initializes tensor with incrementing numbers.
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    tensor_col_major.data()[i] = i + 1;
+  }
+  // ColMajor
+  array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}};
+  Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+  gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r + i - row_padding;
+              IndexType col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_result_col_major);
+  sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+// Verifies VALID padding (no padding) with the same value.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 1;
+  IndexType input_rows = 5;
+  IndexType input_cols = 5;
+  IndexType input_batches = 2;
+  IndexType ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+  // ColMajor
+
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+  gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f);
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}};
+  Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+  gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r + i - row_padding;
+              IndexType col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 4;
+  IndexType input_cols = 2;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+
+  // ColMajor
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Initializes tensor with incrementing numbers.
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    tensor_col_major.data()[i] = i + 1;
+  }
+
+array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}};
+Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+template <typename DataType, typename IndexType>
+static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+
+  // ColMajor
+  array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  tensor_col_major.setRandom();
+  Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0));
+
+
+  // Single pixel patch: ColMajor
+  array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+  gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3);
+
+  // Single pixel patch: RowMajor
+  array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+  Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1);
+
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    // ColMajor
+    if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor_col_major.data()[i] << " vs "
+           << single_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+                    single_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+
+  // Entire image patch: RowMajor
+patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+DataType* gpu_data_entire_image_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 3; ++r) {
+        for (IndexType c = 0; c < 5; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            DataType expected_col_major = 0.0f;
+            DataType expected_row_major = 0.0f;
+            if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+              expected_col_major = tensor_col_major(d, r-1+i, c-2+j);
+              expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+            }
+            // ColMajor
+            if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major);
+            // RowMajor
+            if (entire_image_patch_row_major(patchId, c, r, d) !=
+                expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+                            expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            DataType expected_col_major = 0.0f;
+            DataType expected_row_major = 0.0f;
+            IndexType row_offset = r*stride + i - row_padding;
+            IndexType col_offset = c*stride + j - col_padding;
+            // ColMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+              expected_col_major = tensor_col_major(d, row_offset, col_offset);
+            }
+            if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major);
+            // RowMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+              expected_row_major = tensor_row_major(col_offset, row_offset, d);
+            }
+            if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  // Test the code on typical configurations used by the 'imagenet' benchmarks at
+  // https://github.com/soumith/convnet-benchmarks
+  // ColMajor
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 128;
+  IndexType sizeDim3 = 128;
+  IndexType sizeDim4 = 16;
+  array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange);
+  l_in_col_major.setRandom();
+
+  DataType* gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+
+  array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange);
+  size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange);
+  gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4);
+
+  // RowMajor
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1);
+
+  for (IndexType b = 0; b < 16; ++b) {
+    for (IndexType i = 0; i < 128; ++i) {
+      for (IndexType j = 0; j < 128; ++j) {
+        IndexType patchId = i+128*j;
+        for (IndexType c = 0; c < 11; ++c) {
+          for (IndexType r = 0; r < 11; ++r) {
+            for (IndexType d = 0; d < 3; ++d) {
+              DataType expected = 0.0f;
+              if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+                expected = l_in_col_major(d, r-5+i, c-5+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) !=
+                  expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+                              expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 16;
+  sizeDim2 = 64;
+  sizeDim3 = 64;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+// RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 64; ++i) {
+      for (IndexType j = 0; j < 64; ++j) {
+        IndexType patchId = i+64*j;
+        for (IndexType c = 0; c < 9; ++c) {
+          for (IndexType r = 0; r < 9; ++r) {
+            for (IndexType d = 0; d < 16; ++d) {
+              DataType expected = 0.0f;
+              if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+                expected = l_in_col_major(d, r-4+i, c-4+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 32;
+  sizeDim2 = 16;
+  sizeDim3 = 16;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+  // RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 16; ++i) {
+      for (IndexType j = 0; j < 16; ++j) {
+        IndexType patchId = i+16*j;
+        for (IndexType c = 0; c < 7; ++c) {
+          for (IndexType r = 0; r < 7; ++r) {
+            for (IndexType d = 0; d < 32; ++d) {
+              DataType expected = 0.0f;
+              if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+                expected = l_in_col_major(d, r-3+i, c-3+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 64;
+  sizeDim2 = 13;
+  sizeDim3 = 13;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+  // RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 13; ++i) {
+      for (IndexType j = 0; j < 13; ++j) {
+        IndexType patchId = i+13*j;
+        for (IndexType c = 0; c < 3; ++c) {
+          for (IndexType r = 0; r < 3; ++r) {
+            for (IndexType d = 0; d < 64; ++d) {
+              DataType expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+                expected = l_in_col_major(d, r-1+i, c-1+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+test_simple_image_patch_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_same_sycl<DataType, int64_t>(sycl_device);
+test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device);
+test_imagenet_patches_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl)
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device));
+}
+}

diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
new file mode 100644
index 0000000..2166532
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_index_list.cpp

@@ -0,0 +1,385 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+#ifdef EIGEN_HAS_INDEX_LIST
+
+static void test_static_index_list()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  constexpr auto reduction_axis = make_index_list(0, 1, 2);
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
+
+  EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  for (int i = 0; i < result.size(); ++i) {
+    float expected = 0.0f;
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          expected += tensor(j,k,l,i);
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+
+static void test_type2index_list()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);
+
+  typedef Eigen::IndexList<Eigen::type2index<0>> Dims0;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> Dims1;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>> Dims2;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4;
+
+#if 0
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  const Dims0 reduction_axis0;
+  Tensor<float, 4> result0 = tensor.sum(reduction_axis0);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          float expected = 0.0f;
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+          VERIFY_IS_APPROX(result0(j,k,l,m), expected);
+        }
+      }
+    }
+  }
+
+  const Dims1 reduction_axis1;
+  Tensor<float, 3> result1 = tensor.sum(reduction_axis1);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        float expected = 0.0f;
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+        VERIFY_IS_APPROX(result1(k,l,m), expected);
+      }
+    }
+  }
+
+  const Dims2 reduction_axis2;
+  Tensor<float, 2> result2 = tensor.sum(reduction_axis2);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      float expected = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+      VERIFY_IS_APPROX(result2(l,m), expected);
+    }
+  }
+
+  const Dims3 reduction_axis3;
+  Tensor<float, 1> result3 = tensor.sum(reduction_axis3);
+  for (int m = 0; m < 11; ++m) {
+    float expected = 0.0f;
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result3(m), expected);
+  }
+
+  const Dims4 reduction_axis4;
+  Tensor<float, 0> result4 = tensor.sum(reduction_axis4);
+  float expected = 0.0f;
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+    }
+  }
+  VERIFY_IS_APPROX(result4(), expected);
+}
+
+
+static void test_type2indexpair_list()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);
+
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<Index>, Eigen::type2indexpair<2,12>> Dims2_b;
+  typedef Eigen::IndexPairList<Eigen::IndexPair<Index>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<Index>> Dims2_c;
+
+  Dims2_a d2_a;
+
+  Dims2_b d2_b;
+  d2_b.set(1, Eigen::IndexPair<Index>(1,11));
+
+  Dims2_c d2_c;
+  d2_c.set(0, Eigen::IndexPair<Index>(Eigen::IndexPair<Index>(0,10)));
+  d2_c.set(1, Eigen::IndexPair<Index>(1,11));  // setting type2indexpair to correct value.
+  d2_c.set(2, Eigen::IndexPair<Index>(2,12));
+
+  VERIFY_IS_EQUAL(d2_a[0].first, 0);
+  VERIFY_IS_EQUAL(d2_a[0].second, 10);
+  VERIFY_IS_EQUAL(d2_a[1].first, 1);
+  VERIFY_IS_EQUAL(d2_a[1].second, 11);
+  VERIFY_IS_EQUAL(d2_a[2].first, 2);
+  VERIFY_IS_EQUAL(d2_a[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_b[0].first, 0);
+  VERIFY_IS_EQUAL(d2_b[0].second, 10);
+  VERIFY_IS_EQUAL(d2_b[1].first, 1);
+  VERIFY_IS_EQUAL(d2_b[1].second, 11);
+  VERIFY_IS_EQUAL(d2_b[2].first, 2);
+  VERIFY_IS_EQUAL(d2_b[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_c[0].first, 0);
+  VERIFY_IS_EQUAL(d2_c[0].second, 10);
+  VERIFY_IS_EQUAL(d2_c[1].first, 1);
+  VERIFY_IS_EQUAL(d2_c[1].second, 11);
+  VERIFY_IS_EQUAL(d2_c[2].first, 2);
+  VERIFY_IS_EQUAL(d2_c[2].second, 12);
+
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 10) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+}
+
+
+static void test_dynamic_index_list()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  int dim1 = 2;
+  int dim2 = 1;
+  int dim3 = 0;
+
+  auto reduction_axis = make_index_list(dim1, dim2, dim3);
+
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 0);
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  for (int i = 0; i < result.size(); ++i) {
+    float expected = 0.0f;
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          expected += tensor(j,k,l,i);
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+static void test_mixed_index_list()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  int dim2 = 1;
+  int dim4 = 3;
+
+  auto reduction_axis = make_index_list(0, dim2, 2, dim4);
+
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[3]), 3);
+
+  typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;
+  ReductionIndices reduction_indices;
+  reduction_indices.set(1, 1);
+  reduction_indices.set(3, 3);
+  EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#if 0
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList;
+  ReductionList reduction_list;
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#if 0
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  Tensor<float, 0> result1 = tensor.sum(reduction_axis);
+  Tensor<float, 0> result2 = tensor.sum(reduction_indices);
+  Tensor<float, 0> result3 = tensor.sum(reduction_list);
+
+  float expected = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          expected += tensor(i,j,k,l);
+        }
+      }
+    }
+  }
+  VERIFY_IS_APPROX(result1(), expected);
+  VERIFY_IS_APPROX(result2(), expected);
+  VERIFY_IS_APPROX(result3(), expected);
+}
+
+
+static void test_dim_check()
+{
+  Eigen::IndexList<Eigen::type2index<1>, int> dim1;
+  dim1.set(1, 2);
+  Eigen::IndexList<Eigen::type2index<1>, int> dim2;
+  dim2.set(1, 2);
+  VERIFY(dimensions_match(dim1, dim2));
+}
+
+
+#endif
+
+EIGEN_DECLARE_TEST(cxx11_tensor_index_list)
+{
+#ifdef EIGEN_HAS_INDEX_LIST
+  CALL_SUBTEST(test_static_index_list());
+  CALL_SUBTEST(test_type2index_list());
+  CALL_SUBTEST(test_type2indexpair_list());
+  CALL_SUBTEST(test_dynamic_index_list());
+  CALL_SUBTEST(test_mixed_index_list());
+  CALL_SUBTEST(test_dim_check());
+#endif
+}

diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp
new file mode 100644
index 0000000..75089e8
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_inflation.cpp

@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_inflation()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_stride;
+  no_stride = tensor.inflate(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  Tensor<float, 4, DataLayout> inflated;
+  inflated = tensor.inflate(strides);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), 3);
+  VERIFY_IS_EQUAL(inflated.dimension(1), 9);
+  VERIFY_IS_EQUAL(inflated.dimension(2), 9);
+  VERIFY_IS_EQUAL(inflated.dimension(3), 19);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 9; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          if (i % 2 == 0 &&
+              j % 4 == 0 &&
+              k % 2 == 0 &&
+              l % 3 == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/2, j/4, k/2, l/3));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation)
+{
+  CALL_SUBTEST(test_simple_inflation<ColMajor>());
+  CALL_SUBTEST(test_simple_inflation<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
new file mode 100644
index 0000000..521ae0c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp

@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation Definition for each dimension the inflated val would be
+//((dim-1)*strid[dim] +1)
+
+// for 1 dimension vector of size 3 with value (4,4,4) with the inflated stride value of 3 would be changed to
+// tensor of size (2*3) +1 = 7 with the value of
+// (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated  = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+          if (i % strides[0] == 0 &&
+              j % strides[1] == 0 &&
+              k % strides[2] == 0 &&
+              l % strides[3] == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_no_stride);
+  sycl_device.deallocate(gpu_data_inflated);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
new file mode 100644
index 0000000..d18a05e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp

@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+void test_signed_32bit()
+{
+  // Divide by one
+  const Eigen::internal::TensorIntDivisor<int32_t, false> div_by_one(1);
+
+  for (int32_t j = 0; j < 25000; ++j) {
+    const int32_t fast_div = j / div_by_one;
+    const int32_t slow_div = j / 1;
+    VERIFY_IS_EQUAL(fast_div, slow_div);
+  }
+
+  // Standard divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, false> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+
+  // Optimized divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, true> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_unsigned_32bit()
+{
+  for (uint32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint32_t> div(i);
+
+    for (uint32_t j = 0; j < 25000; ++j) {
+      const uint32_t fast_div = j / div;
+      const uint32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_signed_64bit()
+{
+  for (int64_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int64_t> div(i);
+
+    for (int64_t j = 0; j < 25000; ++j) {
+      const int64_t fast_div = j / div;
+      const int64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_unsigned_64bit()
+{
+  for (uint64_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
+
+    for (uint64_t j = 0; j < 25000; ++j) {
+      const uint64_t fast_div = j / div;
+      const uint64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+void test_powers_32bit() {
+  for (int expon = 1; expon < 31; expon++) {
+    int32_t div = (1 << expon);
+    for (int num_expon = 0; num_expon < 32; num_expon++) {
+      int32_t start_num = (1 << num_expon) - 100;
+      int32_t end_num = (1 << num_expon) + 100;
+      if (start_num < 0)
+        start_num = 0;
+      for (int32_t num = start_num; num < end_num; num++) {
+        Eigen::internal::TensorIntDivisor<int32_t> divider =
+          Eigen::internal::TensorIntDivisor<int32_t>(div);
+        int32_t result = num/div;
+        int32_t result_op = divider.divide(num);
+        VERIFY_IS_EQUAL(result_op, result);
+      }
+    }
+  }
+}
+
+void test_powers_64bit() {
+  for (int expon = 0; expon < 63; expon++) {
+    int64_t div = (1ull << expon);
+    for (int num_expon = 0; num_expon < 63; num_expon++) {
+      int64_t start_num = (1ull << num_expon) - 10;
+      int64_t end_num = (1ull << num_expon) + 10;
+      if (start_num < 0)
+        start_num = 0;
+      for (int64_t num = start_num; num < end_num; num++) {
+        Eigen::internal::TensorIntDivisor<int64_t> divider(div);
+        int64_t result = num/div;
+        int64_t result_op = divider.divide(num);
+        VERIFY_IS_EQUAL(result_op, result);
+      }
+    }
+  }
+}
+
+void test_specific() {
+  // A particular combination that was previously failing
+  int64_t div = 209715200;
+  int64_t num = 3238002688ll;
+  Eigen::internal::TensorIntDivisor<int64_t> divider(div);
+  int64_t result = num/div;
+  int64_t result_op = divider.divide(num);
+  VERIFY_IS_EQUAL(result, result_op);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_intdiv)
+{
+  CALL_SUBTEST_1(test_signed_32bit());
+  CALL_SUBTEST_2(test_unsigned_32bit());
+  CALL_SUBTEST_3(test_signed_64bit());
+  CALL_SUBTEST_4(test_unsigned_64bit());
+  CALL_SUBTEST_5(test_powers_32bit());
+  CALL_SUBTEST_6(test_powers_64bit());
+  CALL_SUBTEST_7(test_specific());
+}

diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
new file mode 100644
index 0000000..2c638f9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_io.cpp

@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <sstream>
+#include <string>
+#include <Eigen/CXX11/Tensor>
+
+
+template<int DataLayout>
+static void test_output_0d()
+{
+  Tensor<int, 0, DataLayout> tensor;
+  tensor() = 123;
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("123");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_1d()
+{
+  Tensor<int, 1, DataLayout> tensor(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor(i) = i;
+  }
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("0\n1\n2\n3\n4");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+
+  Eigen::Tensor<double,1,DataLayout> empty_tensor(0);
+  std::stringstream empty_os;
+  empty_os << empty_tensor;
+  std::string empty_string;
+  VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string);
+}
+
+
+template<int DataLayout>
+static void test_output_2d()
+{
+  Tensor<int, 2, DataLayout> tensor(5, 3);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      tensor(i, j) = i*j;
+    }
+  }
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("0  0  0\n0  1  2\n0  2  4\n0  3  6\n0  4  8");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_expr()
+{
+  Tensor<int, 1, DataLayout> tensor1(5);
+  Tensor<int, 1, DataLayout> tensor2(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor1(i) = i;
+    tensor2(i) = 7;
+  }
+
+  std::stringstream os;
+  os << tensor1 + tensor2;
+
+  std::string expected(" 7\n 8\n 9\n10\n11");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_string()
+{
+  Tensor<std::string, 2, DataLayout> tensor(5, 3);
+  tensor.setConstant(std::string("foo"));
+
+  std::cout << tensor << std::endl;
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("foo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
+static void test_output_const()
+{
+  Tensor<int, 1, DataLayout> tensor(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor(i) = i;
+  }
+
+  TensorMap<Tensor<const int, 1, DataLayout> > tensor_map(tensor.data(), 5);
+
+  std::stringstream os;
+  os << tensor_map;
+
+  std::string expected("0\n1\n2\n3\n4");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_io)
+{
+  CALL_SUBTEST(test_output_0d<ColMajor>());
+  CALL_SUBTEST(test_output_0d<RowMajor>());
+  CALL_SUBTEST(test_output_1d<ColMajor>());
+  CALL_SUBTEST(test_output_1d<RowMajor>());
+  CALL_SUBTEST(test_output_2d<ColMajor>());
+  CALL_SUBTEST(test_output_2d<RowMajor>());
+  CALL_SUBTEST(test_output_expr<ColMajor>());
+  CALL_SUBTEST(test_output_expr<RowMajor>());
+  CALL_SUBTEST(test_output_string<ColMajor>());
+  CALL_SUBTEST(test_output_string<RowMajor>());
+  CALL_SUBTEST(test_output_const<ColMajor>());
+  CALL_SUBTEST(test_output_const<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp
new file mode 100644
index 0000000..efb3333
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_layout_swap.cpp

@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_simple_swap()
+{
+  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+}
+
+
+static void test_swap_as_lvalue()
+{
+  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 3, RowMajor> tensor2(7,3,2);
+  tensor2.swap_layout() = tensor;
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap)
+{
+  CALL_SUBTEST(test_simple_swap());
+  CALL_SUBTEST(test_swap_as_lvalue());
+}

diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
new file mode 100644
index 0000000..9546b91
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp

@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 7;
+  array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+
+  Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+  Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+//  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  //tensor.setRandom();
+
+//  Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, typename IndexType>
+static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 7;
+  array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+  Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+  Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  gpu2.swap_layout().device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+//  Tensor<float, 3, ColMajor> tensor(2,3,7);
+//  tensor.setRandom();
+
+  //Tensor<float, 3, RowMajor> tensor2(7,3,2);
+//  tensor2.swap_layout() = tensor;
+  VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_swap_sycl<DataType, int64_t>(sycl_device);
+  test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp
new file mode 100644
index 0000000..6ba9a21
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_lvalue.cpp

@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+
+static void test_compound_assignment()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3 = mat1;
+  mat3 += mat2;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_lvalue)
+{
+  CALL_SUBTEST(test_compound_assignment());
+}

diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
new file mode 100644
index 0000000..4d4f689
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_map.cpp

@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_0d()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<const Tensor<int, 0> > scalar3(scalar1.data());
+  TensorMap<const Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_1d()
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+
+  TensorMap<const Tensor<int, 1> > vec3(vec1.data(), 6);
+  TensorMap<const Tensor<int, 1, RowMajor> > vec4(vec2.data(), 6);
+
+  vec1(0) = 4;  vec2(0) = 0;
+  vec1(1) = 8;  vec2(1) = 1;
+  vec1(2) = 15; vec2(2) = 2;
+  vec1(3) = 16; vec2(3) = 3;
+  vec1(4) = 23; vec2(4) = 4;
+  vec1(5) = 42; vec2(5) = 5;
+
+  VERIFY_IS_EQUAL(vec1.rank(), 1);
+  VERIFY_IS_EQUAL(vec1.size(), 6);
+  VERIFY_IS_EQUAL(vec1.dimension(0), 6);
+
+  VERIFY_IS_EQUAL(vec3(0), 4);
+  VERIFY_IS_EQUAL(vec3(1), 8);
+  VERIFY_IS_EQUAL(vec3(2), 15);
+  VERIFY_IS_EQUAL(vec3(3), 16);
+  VERIFY_IS_EQUAL(vec3(4), 23);
+  VERIFY_IS_EQUAL(vec3(5), 42);
+
+  VERIFY_IS_EQUAL(vec4(0), 0);
+  VERIFY_IS_EQUAL(vec4(1), 1);
+  VERIFY_IS_EQUAL(vec4(2), 2);
+  VERIFY_IS_EQUAL(vec4(3), 3);
+  VERIFY_IS_EQUAL(vec4(4), 4);
+  VERIFY_IS_EQUAL(vec4(5), 5);
+}
+
+static void test_2d()
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  TensorMap<const Tensor<int, 2> > mat3(mat1.data(), 2, 3);
+  TensorMap<const Tensor<int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 2);
+  VERIFY_IS_EQUAL(mat3.size(), 6);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 2);
+  VERIFY_IS_EQUAL(mat4.size(), 6);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+
+  VERIFY_IS_EQUAL(mat3(0,0), 0);
+  VERIFY_IS_EQUAL(mat3(0,1), 1);
+  VERIFY_IS_EQUAL(mat3(0,2), 2);
+  VERIFY_IS_EQUAL(mat3(1,0), 3);
+  VERIFY_IS_EQUAL(mat3(1,1), 4);
+  VERIFY_IS_EQUAL(mat3(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat4(0,0), 0);
+  VERIFY_IS_EQUAL(mat4(0,1), 1);
+  VERIFY_IS_EQUAL(mat4(0,2), 2);
+  VERIFY_IS_EQUAL(mat4(1,0), 3);
+  VERIFY_IS_EQUAL(mat4(1,1), 4);
+  VERIFY_IS_EQUAL(mat4(1,2), 5);
+}
+
+static void test_3d()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<const Tensor<int, 3> > mat3(mat1.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+
+static void test_from_tensor()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3> > mat3(mat1);
+  TensorMap<Tensor<int, 3, RowMajor> > mat4(mat2);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  TensorFixedSize<int, Sizes<2,3,7> > mat5;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        array<ptrdiff_t, 3> coords;
+        coords[0] = i;
+        coords[1] = j;
+        coords[2] = k;
+        mat5(coords) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<TensorFixedSize<int, Sizes<2,3,7> > > mat6(mat5);
+
+  VERIFY_IS_EQUAL(mat6.rank(), 3);
+  VERIFY_IS_EQUAL(mat6.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat6.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat6.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat6.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat6(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+
+static int f(const TensorMap<Tensor<int, 3> >& tensor) {
+  //  Size<0> empty;
+  EIGEN_STATIC_ASSERT((internal::array_size<Sizes<> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_size<DSizes<int, 0> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  Tensor<int, 0> result = tensor.sum();
+  return result();
+}
+
+static void test_casting()
+{
+  Tensor<int, 3> tensor(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        tensor(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3> > map(tensor);
+  int sum1 = f(map);
+  int sum2 = f(tensor);
+
+  VERIFY_IS_EQUAL(sum1, sum2);
+  VERIFY_IS_EQUAL(sum1, 861);
+}
+
+template<typename T>
+static const T& add_const(T& value) {
+  return value;
+}
+
+static void test_0d_const_tensor()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<const Tensor<int, 0> > scalar3(add_const(scalar1).data());
+  TensorMap<const Tensor<int, 0, RowMajor> > scalar4(add_const(scalar2).data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_0d_const_tensor_map()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  const TensorMap<Tensor<int, 0> > scalar3(scalar1.data());
+  const TensorMap<Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
+
+  // Although TensorMap is constant, we still can write to the underlying
+  // storage, because we map over non-constant Tensor.
+  scalar3() = 7;
+  scalar4() = 13;
+
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+
+  // Pointer to the underlying storage is also non-const.
+  scalar3.data()[0] = 8;
+  scalar4.data()[0] = 14;
+
+  VERIFY_IS_EQUAL(scalar1(), 8);
+  VERIFY_IS_EQUAL(scalar2(), 14);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_map)
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+
+  CALL_SUBTEST(test_from_tensor());
+  CALL_SUBTEST(test_casting());
+
+  CALL_SUBTEST(test_0d_const_tensor());
+  CALL_SUBTEST(test_0d_const_tensor_map());
+}

diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
new file mode 100644
index 0000000..82a1a26
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_math.cpp

@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_tanh()
+{
+  Tensor<float, 1> vec1(6);
+  vec1.setRandom();
+
+  Tensor<float, 1> vec2 = vec1.tanh();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec2(i), tanhf(vec1(i)));
+  }
+}
+
+static void test_sigmoid()
+{
+  Tensor<float, 1> vec1(6);
+  vec1.setRandom();
+
+  Tensor<float, 1> vec2 = vec1.sigmoid();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec2(i), 1.0f / (1.0f + std::exp(-vec1(i))));
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_math)
+{
+  CALL_SUBTEST(test_tanh());
+  CALL_SUBTEST(test_sigmoid());
+}

diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
new file mode 100644
index 0000000..029653e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp

@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  IndexType sizeDim1 = 4;
+  IndexType sizeDim2 = 4;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+  in = in.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+  gpu2.device(sycl_device) = gpu1.tanh();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+  out_cpu=in.tanh();
+
+  for (int i = 0; i < in.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), out_cpu(i));
+  }
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  IndexType sizeDim1 = 4;
+  IndexType sizeDim2 = 4;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+  in = in.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+  gpu2.device(sycl_device) = gpu1.sigmoid();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+  out_cpu=in.sigmoid();
+
+  for (int i = 0; i < in.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), out_cpu(i));
+  }
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
new file mode 100644
index 0000000..ee2616f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp

@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_simple()
+{
+  Tensor<float, 1, ColMajor> vec1(6);
+  Tensor<float, 1, ColMajor, int> vec2(6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<Tensor<float, 1, ColMajor>> vec3(data3, 6);
+  vec3 = vec1.sqrt();
+  float data4[6];
+  TensorMap<Tensor<float, 1, ColMajor, int>> vec4(data4, 6);
+  vec4 = vec2.square();
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices)
+{
+  CALL_SUBTEST(test_simple());
+}

diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
new file mode 100644
index 0000000..ed5d5ad
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_morphing.cpp

@@ -0,0 +1,565 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename>
+static void test_simple_reshape()
+{
+  Tensor<float, 5> tensor1(2,3,1,7,1);
+  tensor1.setRandom();
+
+  Tensor<float, 3> tensor2(2,3,7);
+  Tensor<float, 2> tensor3(6,7);
+  Tensor<float, 2> tensor4(2,21);
+
+  Tensor<float, 3>::Dimensions dim1(2,3,7);
+  tensor2 = tensor1.reshape(dim1);
+  Tensor<float, 2>::Dimensions dim2(6,7);
+  tensor3 = tensor1.reshape(dim2);
+  Tensor<float, 2>::Dimensions dim3(2,21);
+  tensor4 = tensor1.reshape(dim1).reshape(dim3);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k));
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k));
+      }
+    }
+  }
+}
+
+template <typename>
+static void test_static_reshape() {
+#if defined(EIGEN_HAS_INDEX_LIST)
+  using Eigen::type2index;
+
+  Tensor<float, 5> tensor(2, 3, 1, 7, 1);
+  tensor.setRandom();
+
+  // New dimensions: [2, 3, 7]
+  Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim;
+  Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<ptrdiff_t,3>>(dim));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i, j, 0, k, 0), reshaped(i, j, k));
+      }
+    }
+  }
+#endif
+}
+
+template <typename>
+static void test_reshape_in_expr() {
+  MatrixXf m1(2,3*5*7*11);
+  MatrixXf m2(3*5*7*11,13);
+  m1.setRandom();
+  m2.setRandom();
+  MatrixXf m3 = m1 * m2;
+
+  TensorMap<Tensor<float, 5>> tensor1(m1.data(), 2,3,5,7,11);
+  TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13);
+  Tensor<float, 2>::Dimensions newDims1(2,3*5*7*11);
+  Tensor<float, 2>::Dimensions newDims2(3*5*7*11,13);
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  array<DimPair, 1> contract_along{{DimPair(1, 0)}};
+  Tensor<float, 2> tensor3(2,13);
+  tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along);
+
+  Map<MatrixXf> res(tensor3.data(), 2, 13);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      VERIFY_IS_APPROX(res(i,j), m3(i,j));
+    }
+  }
+}
+
+template<typename>
+static void test_reshape_as_lvalue()
+{
+  Tensor<float, 3> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 2> tensor2d(6,7);
+  Tensor<float, 3>::Dimensions dim(2,3,7);
+  tensor2d.reshape(dim) = tensor;
+
+  float scratch[2*3*1*7*1];
+  TensorMap<Tensor<float, 5>> tensor5d(scratch, 2,3,1,7,1);
+  tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k));
+        VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+      }
+    }
+  }
+}
+
+template<typename T, int DataLayout>
+static void test_simple_slice()
+{
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<T, 5, DataLayout> slice1(1,1,1,1,1);
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+  slice1 = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  Tensor<T, 5, DataLayout> slice2(1,1,2,2,3);
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice2 = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+}
+
+template<typename T>
+static void test_const_slice()
+{
+  const T b[1] = {42};
+  TensorMap<Tensor<const T, 1> > m(b, 1);
+  DSizes<DenseIndex, 1> offsets;
+  offsets[0] = 0;
+  TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions()));
+  VERIFY_IS_EQUAL(slice_ref(0), 42);
+}
+
+template<typename T, int DataLayout>
+static void test_slice_in_expr() {
+  typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx;
+  Mtx m1(7,7);
+  Mtx m2(3,3);
+  m1.setRandom();
+  m2.setRandom();
+
+  Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);
+
+  TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+  TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+  Tensor<T, 2, DataLayout> tensor3(3,1);
+  typedef typename Tensor<T, 1>::DimensionPair DimPair;
+  array<DimPair, 1> contract_along{{DimPair(1, 0)}};
+
+  Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
+  Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3);
+  Eigen::DSizes<ptrdiff_t, 2> indices2(0,2);
+  Eigen::DSizes<ptrdiff_t, 2> sizes2(3,1);
+  tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along);
+
+  Map<Mtx> res(tensor3.data(), 3, 1);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      VERIFY_IS_APPROX(res(i,j), m3(i,j));
+    }
+  }
+
+  // Take an arbitrary slice of an arbitrarily sized tensor.
+  TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+  Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  for (int i = 0; i < 35; ++i) {
+    VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
+  }
+}
+
+template<typename T, int DataLayout>
+static void test_slice_as_lvalue()
+{
+  Tensor<T, 3, DataLayout> tensor1(2,2,7);
+  tensor1.setRandom();
+  Tensor<T, 3, DataLayout> tensor2(2,2,7);
+  tensor2.setRandom();
+  Tensor<T, 3, DataLayout> tensor3(4,3,5);
+  tensor3.setRandom();
+  Tensor<T, 3, DataLayout> tensor4(4,3,2);
+  tensor4.setRandom();
+  Tensor<T, 3, DataLayout> tensor5(10,13,12);
+  tensor5.setRandom();
+
+  Tensor<T, 3, DataLayout> result(4,5,7);
+  Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
+  Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
+  result.slice(first_slice, sizes12) = tensor1;
+  Eigen::DSizes<ptrdiff_t, 3> second_slice(2,0,0);
+  result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2;
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes3(4,3,5);
+  Eigen::DSizes<ptrdiff_t, 3> third_slice(0,2,0);
+  result.slice(third_slice, sizes3) = tensor3;
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes4(4,3,2);
+  Eigen::DSizes<ptrdiff_t, 3> fourth_slice(0,2,5);
+  result.slice(fourth_slice, sizes4) = tensor4;
+
+  for (int j = 0; j < 2; ++j) {
+    for (int k = 0; k < 7; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k));
+        VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k));
+      }
+    }
+  }
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 2; j < 5; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k));
+      }
+      for (int k = 5; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5));
+      }
+    }
+  }
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes5(4,5,7);
+  Eigen::DSizes<ptrdiff_t, 3> fifth_slice(0,0,0);
+  result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5);
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 2; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k));
+      }
+    }
+  }
+}
+
+template<typename T, int DataLayout>
+static void test_slice_raw_data()
+{
+  Tensor<T, 4, DataLayout> tensor(3,5,7,11);
+  tensor.setRandom();
+
+  Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
+  Eigen::DSizes<ptrdiff_t, 4> extents(1,1,1,1);
+  typedef TensorEvaluator<decltype(tensor.slice(offsets, extents)), DefaultDevice> SliceEvaluator;
+  auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1);
+  VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4));
+
+  if (DataLayout == ColMajor) {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1);
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4));
+  } else {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2);
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5));
+  }
+
+  extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
+  auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
+  VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0));
+
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1);
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4));
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11);
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 2; ++k) {
+        VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l));
+      }
+    }
+  }
+
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2);
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 5; ++j) {
+        for (int k = 0; k < 7; ++k) {
+          for (int l = 0; l < 2; ++l) {
+            int slice_index = i + 3 * (j + 5 * (k + 7 * l));
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4));
+          }
+        }
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11);
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 7; ++k) {
+        for (int j = 0; j < 5; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            int slice_index = l + 11 * (k + 7 * (j + 5 * i));
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l));
+          }
+        }
+      }
+    }
+
+  }
+
+  offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0);
+  extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,11);
+  auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3*5*7*11);
+  VERIFY_IS_EQUAL(slice6.data(), tensor.data());
+}
+
+
+template<typename T, int DataLayout>
+static void test_strided_slice()
+{
+  typedef Tensor<T, 5, DataLayout> Tensor5f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 2, DataLayout> tensor2(7,11);
+  tensor.setRandom();
+  tensor2.setRandom();
+
+  if (true) {
+    Tensor2f slice(2,3);
+    Index2 strides(-2,-1);
+    Index2 indicesStart(5,7);
+    Index2 indicesStop(0,4);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(5-2*j,7-k));
+      }
+    }
+  }
+
+  if(true) {
+    Tensor2f slice(0,1);
+    Index2 strides(1,1);
+    Index2 indicesStart(5,4);
+    Index2 indicesStop(5,5);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+  }
+
+  if(true) { // test clamped degenerate interavls
+    Tensor2f slice(7,11);
+    Index2 strides(1,-1);
+    Index2 indicesStart(-3,20); // should become 0,10
+    Index2 indicesStop(20,-11); // should become 11, -1
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(j,10-k));
+      }
+    }
+  }
+
+  if(true) {
+    Tensor5f slice1(1,1,1,1,1);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStart(1, 2, 3, 4, 5);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStop(2, 3, 4, 5, 6);
+    Eigen::DSizes<Eigen::DenseIndex, 5> strides(1, 1, 1, 1, 1);
+    slice1 = tensor.stridedSlice(indicesStart, indicesStop, strides);
+    VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  }
+
+  if(true) {
+    Tensor5f slice(1,1,2,2,3);
+    Index5 start(1, 1, 3, 4, 5);
+    Index5 stop(2, 2, 5, 6, 8);
+    Index5 strides(1, 1, 1, 1, 1);
+    slice = tensor.stridedSlice(start, stop, strides);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        }
+      }
+    }
+  }
+
+  if(true) {
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, -2, 1, -1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,4-2*i,4+j,7-k));
+        }
+      }
+    }
+  }
+
+  if(false) { // tests degenerate interval
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, 2, 1, 1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+  }
+}
+
+template<typename T, int DataLayout>
+static void test_strided_slice_write()
+{
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+
+  Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  tensor.setRandom();
+  tensor2=tensor;
+  Tensor2f slice(2,3);
+
+  slice.setRandom();
+
+  Index2 strides(1,1);
+  Index2 indicesStart(3,4);
+  Index2 indicesStop(5,7);
+  Index2 lengths(2,3);
+
+  tensor.slice(indicesStart,lengths)=slice;
+  tensor2.stridedSlice(indicesStart,indicesStop,strides)=slice;
+
+  for(int i=0;i<7;i++) for(int j=0;j<11;j++){
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+}
+
+template<typename T, int DataLayout>
+static void test_composition()
+{
+  Eigen::Tensor<T, 2, DataLayout> matrix(7, 11);
+  matrix.setRandom();
+
+  const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
+  Eigen::Tensor<T, 3, DataLayout> tensor =
+      matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
+
+  VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
+  VERIFY_IS_EQUAL(tensor.dimension(0), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(1), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(2), 11);
+  for (int i = 0; i < 11; ++i) {
+    VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i));
+  }
+}
+
+template<typename T, int DataLayout>
+static void test_empty_slice()
+{
+  Tensor<T, 3, DataLayout> tensor(2,3,5);
+  tensor.setRandom();
+  Tensor<T, 3, DataLayout> copy = tensor;
+
+  // empty size in first dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices1(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes1(0,1,2);
+  Tensor<T, 3, DataLayout> slice1(0,1,2);
+  slice1.setRandom();
+  tensor.slice(indices1, sizes1) = slice1;
+
+  // empty size in second dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices2(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes2(1,0,2);
+  Tensor<T, 3, DataLayout> slice2(1,0,2);
+  slice2.setRandom();
+  tensor.slice(indices2, sizes2) = slice2;
+
+  // empty size in third dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices3(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes3(1,1,0);
+  Tensor<T, 3, DataLayout> slice3(1,1,0);
+  slice3.setRandom();
+  tensor.slice(indices3, sizes3) = slice3;
+
+  // empty size in first and second dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices4(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes4(0,0,2);
+  Tensor<T, 3, DataLayout> slice4(0,0,2);
+  slice4.setRandom();
+  tensor.slice(indices4, sizes4) = slice4;
+
+  // empty size in second and third dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices5(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes5(1,0,0);
+  Tensor<T, 3, DataLayout> slice5(1,0,0);
+  slice5.setRandom();
+  tensor.slice(indices5, sizes5) = slice5;
+
+  // empty size in all dimensions
+  Eigen::DSizes<ptrdiff_t, 3> indices6(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes6(0,0,0);
+  Tensor<T, 3, DataLayout> slice6(0,0,0);
+  slice6.setRandom();
+  tensor.slice(indices6, sizes6) = slice6;
+
+  // none of these operations should change the tensor's components
+  // because all of the rvalue slices have at least one zero dimension
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+          VERIFY_IS_EQUAL(tensor(i,j,k), copy(i,j,k));
+      }
+    }
+  }
+}
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME)       \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
+{
+  CALL_SUBTEST_1(test_simple_reshape<void>());
+  CALL_SUBTEST_1(test_static_reshape<void>());
+  CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_const_slice<float>());
+
+  CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue);
+  CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data);
+  CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write);
+  CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition);
+}

diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
new file mode 100644
index 0000000..bf001b4
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp

@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
+{
+  typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
+  typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
+  typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
+  typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
+
+  Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
+
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+  DataType* gpu_data4  = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
+  TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
+  TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
+  TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+
+  gpu2.device(sycl_device)=gpu1.reshape(dim2);
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType));
+
+  gpu3.device(sycl_device)=gpu1.reshape(dim3);
+  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+  gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
+  sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
+  for (IndexType i = 0; i < 2; ++i){
+    for (IndexType j = 0; j < 3; ++j){
+      for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));      ///ColMajor
+        if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k));    ///ColMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k));    ///ColMajor
+        }
+        else{
+          //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));      /// RowMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k));   /// RowMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k));   /// RowMajor
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+  sycl_device.deallocate(gpu_data4);
+}
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+  typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
+  typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
+  typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
+  Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
+
+  tensor.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
+
+  TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
+  TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
+  TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+
+  gpu2.reshape(dim1).device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType));
+
+  gpu3.reshape(dim1).device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
+
+
+  for (IndexType i = 0; i < 2; ++i){
+    for (IndexType j = 0; j < 3; ++j){
+      for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+        if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+          VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k));    ///ColMajor
+        }
+        else{
+          VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k));   /// RowMajor
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+  array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+  Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+
+  array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+  Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+  typedef Eigen::DSizes<IndexType, 5> Index5;
+  Index5 strides(1L,1L,1L,1L,1L);
+  Index5 indicesStart(1L,2L,3L,4L,5L);
+  Index5 indicesStop(2L,3L,4L,5L,6L);
+  Index5 lengths(1L,1L,1L,1L,1L);
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+  Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+  DataType* gpu_data_stride2  = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range);
+
+  Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+
+  gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides);
+  sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+  Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range);
+
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+  DataType* gpu_data_stride3  = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range);
+  Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+  Index5 strides2(1L,1L,1L,1L,1L);
+  Index5 indicesStart2(1L,1L,3L,4L,5L);
+  Index5 indicesStop2(2L,2L,5L,6L,8L);
+
+  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+
+  gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
+  sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f;
+  typedef Eigen::DSizes<IndexType, 2> Index2;
+  IndexType sizeDim1 = 7L;
+  IndexType sizeDim2 = 11L;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange);
+  IndexType sliceDim1 = 2;
+  IndexType sliceDim2 = 3;
+  array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}};
+  Tensor2f slice(sliceRange);
+  Index2 strides(1L,1L);
+  Index2 indicesStart(3L,4L);
+  Index2 indicesStop(5L,7L);
+  Index2 lengths(2L,3L);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange);
+
+
+  tensor.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1;
+
+  slice.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType));
+
+
+  gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
+  gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+  for(IndexType i=0;i<sizeDim1;i++)
+    for(IndexType j=0;j<sizeDim2;j++){
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+template <typename OutIndex, typename DSizes>
+Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) {
+  Eigen::array<OutIndex, DSizes::count> out;
+  for (int i = 0; i < DSizes::count; ++i) {
+    out[i] = in[i];
+  }
+  return out;
+}
+
+template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
+int run_eigen(const SyclDevice& sycl_device) {
+  using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>;
+  using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>;
+  using TensorMI64 = TensorMap<TensorI64>;
+  using TensorMI32 = TensorMap<TensorI32>;
+  Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}};
+  Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}};
+
+  TensorI64 out_tensor_gpu(tensor_range);
+  TensorI64 out_tensor_cpu(tensor_range);
+  out_tensor_cpu.setRandom();
+
+  TensorI64 sub_tensor(slice_range);
+  sub_tensor.setRandom();
+
+  DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
+  DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
+  TensorMI64 out_gpu(out_gpu_data, tensor_range);
+  TensorMI64 sub_gpu(sub_gpu_data, slice_range);
+
+  sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
+
+  Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}};
+  Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}};
+  TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
+  TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
+  TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
+  TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
+
+  out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;
+
+  out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
+
+  sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
+  int has_err = 0;
+  for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
+    auto exp = out_tensor_cpu(i);
+    auto val = out_tensor_gpu(i);
+    if (val != exp) {
+      std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
+      has_err = 1;
+    }
+  }
+  sycl_device.deallocate(out_gpu_data);
+  sycl_device.deallocate(sub_gpu_data);
+  return has_err;
+}
+
+template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
+  test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
+  test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
+  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  run_eigen<float, RowMajor, long, int>(sycl_device); 
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp
new file mode 100644
index 0000000..a298231
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_move.cpp

@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <utility>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void calc_indices(int i, int& x, int& y, int& z)
+{
+  x = i / 4;
+  y = (i % 4) / 2;
+  z = i % 2;
+}
+
+static void test_move()
+{
+  int x;
+  int y;
+  int z;
+
+  Tensor<int,3> tensor1(2, 2, 2);
+  Tensor<int,3,RowMajor> tensor2(2, 2, 2);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    tensor1(x,y,z) = i;
+    tensor2(x,y,z) = 2 * i;
+  }
+
+  // Invokes the move constructor.
+  Tensor<int,3> moved_tensor1 = std::move(tensor1);
+  Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2);
+
+  VERIFY_IS_EQUAL(tensor1.size(), 0);
+  VERIFY_IS_EQUAL(tensor2.size(), 0);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i);
+    VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i);
+  }
+
+  Tensor<int,3> moved_tensor3(2,2,2);
+  Tensor<int,3,RowMajor> moved_tensor4(2,2,2);
+
+  moved_tensor3.setZero();
+  moved_tensor4.setZero();
+
+  // Invokes the move assignment operator.
+  moved_tensor3 = std::move(moved_tensor1);
+  moved_tensor4 = std::move(moved_tensor2);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i);
+    VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_move)
+{
+  CALL_SUBTEST(test_move());
+}

diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
new file mode 100644
index 0000000..8e81653
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_notification.cpp

@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <atomic>
+
+#include <stdlib.h>
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+static void test_notification_single()
+{
+  ThreadPool thread_pool(1);
+
+  std::atomic<int> counter(0);
+  Eigen::Notification n;
+  auto func = [&n, &counter](){ n.Wait(); ++counter;};
+  thread_pool.Schedule(func);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter, 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter, 1);
+}
+
+// Like test_notification_single() but enqueues multiple threads to
+// validate that all threads get notified by Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  std::atomic<int> counter(0);
+  Eigen::Notification n;
+  auto func = [&n, &counter](){ n.Wait(); ++counter;};
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+  VERIFY_IS_EQUAL(counter, 0);
+  n.Notify();
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+  VERIFY_IS_EQUAL(counter, 4);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_notification)
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}

diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
new file mode 100644
index 0000000..99e1807
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp

@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+
+static void test_additions()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<float>, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = std::complex<float>(i, -i);
+    data2(i) = std::complex<float>(i, 7 * i);
+  }
+
+  Tensor<std::complex<float>, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_EQUAL(sum(i),  std::complex<float>(2*i, 6*i));
+  }
+}
+
+
+static void test_abs()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  data1.setRandom();
+  data2.setRandom();
+
+  Tensor<float, 1> abs1 = data1.abs();
+  Tensor<double, 1> abs2 = data2.abs();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(abs1(i), std::abs(data1(i)));
+    VERIFY_IS_APPROX(abs2(i), std::abs(data2(i)));
+  }
+}
+
+
+static void test_conjugate()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  Tensor<int, 1> data3(3);
+  data1.setRandom();
+  data2.setRandom();
+  data3.setRandom();
+
+  Tensor<std::complex<float>, 1> conj1 = data1.conjugate();
+  Tensor<std::complex<double>, 1> conj2 = data2.conjugate();
+  Tensor<int, 1> conj3 = data3.conjugate();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(conj1(i), std::conj(data1(i)));
+    VERIFY_IS_APPROX(conj2(i), std::conj(data2(i)));
+    VERIFY_IS_APPROX(conj3(i), data3(i));
+  }
+}
+
+static void test_contractions()
+{
+  Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31);
+  Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10);
+  Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf;
+  MapXcf m_left(t_left.data(), 1500, 248);
+  MapXcf m_right(t_right.data(), 248, 1400);
+  Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400);
+
+  // This contraction should be equivalent to a regular matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_complex)
+{
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_abs());
+  CALL_SUBTEST(test_conjugate());
+  CALL_SUBTEST(test_contractions());
+}

diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
new file mode 100644
index 0000000..344d678
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp

@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_assign()
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  const TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+
+  Tensor<float, 2> rslt1;
+  rslt1 = mat1;
+  Tensor<float, 2> rslt2;
+  rslt2 = mat2;
+
+  Tensor<float, 2> rslt3 = mat1;
+  Tensor<float, 2> rslt4 = mat2;
+
+  Tensor<float, 2> rslt5(mat1);
+  Tensor<float, 2> rslt6(mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(rslt1(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt2(i,j), static_cast<float>(-i - 2*j));
+      VERIFY_IS_APPROX(rslt3(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt4(i,j), static_cast<float>(-i - 2*j));
+      VERIFY_IS_APPROX(rslt5(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt6(i,j), static_cast<float>(-i - 2*j));
+    }
+  }
+}
+
+
+static void test_plus()
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+
+  Tensor<float, 2> sum1;
+  sum1 = mat1 + mat2;
+  Tensor<float, 2> sum2;
+  sum2 = mat2 + mat1;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(sum1(i,j), 0.0f);
+      VERIFY_IS_APPROX(sum2(i,j), 0.0f);
+    }
+  }
+}
+
+
+static void test_plus_equal()
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+  mat2 += mat1;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(mat2(i,j), 0.0f);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values)
+{
+  CALL_SUBTEST(test_assign());
+  CALL_SUBTEST(test_plus());
+  CALL_SUBTEST(test_plus_equal());
+}

diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
new file mode 100644
index 0000000..30bcc1d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu

@@ -0,0 +1,488 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+using Eigen::Tensor;
+
+template<typename>
+void test_gpu_numext() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template<typename>
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random();
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+template<typename>
+void test_gpu_unary() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_gpu_elementwise() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+      d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+      d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_gpu_trancendental() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(gpu_device) = gpu_float3.random();
+  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+  gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
+
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
+
+  Tensor<float, 1> input1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
+  Tensor<float, 1> input2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
+  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
+      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float3);
+  gpu_device.deallocate(d_res1_half);
+  gpu_device.deallocate(d_res1_float);
+  gpu_device.deallocate(d_res2_half);
+  gpu_device.deallocate(d_res2_float);
+  gpu_device.deallocate(d_res3_float);
+  gpu_device.deallocate(d_res3_half);
+}
+
+template<typename>
+void test_gpu_contractions() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows*cols;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
+    }
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_gpu_reductions(int size1, int size2, int redux) {
+
+   std::cout << "Reducing " << size1 << " by " << size2
+             << " tensor along dim " << redux << std::endl;
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = size1*size2;
+  int result_size = (redux == 1 ? size1 : size2);
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+
+  gpu_float.device(gpu_device) = gpu_float.random() * 2.0f;
+
+  Eigen::array<int, 1> redux_dim = {redux};
+  gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
+
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_gpu_reductions() {
+  test_gpu_reductions<void>(13, 13, 0);
+  test_gpu_reductions<void>(13, 13, 1);
+
+  test_gpu_reductions<void>(35, 36, 0);
+  test_gpu_reductions<void>(35, 36, 1);
+
+  test_gpu_reductions<void>(36, 35, 0);
+  test_gpu_reductions<void>(36, 35, 1);
+}
+
+template<typename>
+void test_gpu_full_reductions() {
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int size = 13;
+  int num_elem = size*size;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float.device(gpu_device) = gpu_float.random();
+
+  gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum();
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_gpu_forced_evals() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half1);
+  gpu_device.deallocate(d_res_half2);
+  gpu_device.deallocate(d_res_float);
+}
+#endif
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_numext<void>());
+
+#ifdef EIGEN_HAS_GPU_FP16
+  CALL_SUBTEST_1(test_gpu_conversion<void>());
+  CALL_SUBTEST_1(test_gpu_unary<void>());
+  CALL_SUBTEST_1(test_gpu_elementwise<void>());
+  CALL_SUBTEST_1(test_gpu_trancendental<void>());
+  CALL_SUBTEST_2(test_gpu_contractions<void>());
+  CALL_SUBTEST_3(test_gpu_reductions<void>());
+  CALL_SUBTEST_4(test_gpu_full_reductions<void>());
+  CALL_SUBTEST_5(test_gpu_forced_evals<void>());
+#else
+  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
+#endif
+}

diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
new file mode 100644
index 0000000..1596562
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp

@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+static void test_assign()
+{
+  std::string data1[6];
+  TensorMap<Tensor<std::string, 2>> mat1(data1, 2, 3);
+  std::string data2[6];
+  const TensorMap<Tensor<const std::string, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    std::ostringstream s1;
+    s1 << "abc" << i*3;
+    data1[i] = s1.str();
+    std::ostringstream s2;
+    s2 << "def" << i*5;
+    data2[i] = s2.str();
+  }
+
+  Tensor<std::string, 2> rslt1;
+  rslt1 = mat1;
+  Tensor<std::string, 2> rslt2;
+  rslt2 = mat2;
+
+  Tensor<std::string, 2> rslt3 = mat1;
+  Tensor<std::string, 2> rslt4 = mat2;
+
+  Tensor<std::string, 2> rslt5(mat1);
+  Tensor<std::string, 2> rslt6(mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]);
+      VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]);
+      VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]);
+    }
+  }
+}
+
+
+static void test_concat()
+{
+  Tensor<std::string, 2> t1(2, 3);
+  Tensor<std::string, 2> t2(2, 3);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      std::ostringstream s1;
+      s1 << "abc" << i + j*2;
+      t1(i, j) = s1.str();
+      std::ostringstream s2;
+      s2 << "def" << i*5 + j*32;
+      t2(i, j) = s2.str();
+    }
+  }
+
+  Tensor<std::string, 2> result = t1.concatenate(t2, 1);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 6);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(result(i, j),   t1(i, j));
+      VERIFY_IS_EQUAL(result(i, j+3), t2(i, j));
+    }
+  }
+}
+
+
+static void test_slices()
+{
+  Tensor<std::string, 2> data(2, 6);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      std::ostringstream s1;
+      s1 << "abc" << i + j*2;
+      data(i, j) = s1.str();
+    }
+  }
+
+  const Eigen::DSizes<ptrdiff_t, 2> half_size(2, 3);
+  const Eigen::DSizes<ptrdiff_t, 2> first_half(0, 0);
+  const Eigen::DSizes<ptrdiff_t, 2> second_half(0, 3);
+
+  Tensor<std::string, 2> t1 = data.slice(first_half, half_size);
+  Tensor<std::string, 2> t2 = data.slice(second_half, half_size);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(data(i, j),   t1(i, j));
+      VERIFY_IS_EQUAL(data(i, j+3), t2(i, j));
+    }
+  }
+}
+
+
+static void test_additions()
+{
+  Tensor<std::string, 1> data1(3);
+  Tensor<std::string, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = "abc";
+    std::ostringstream s1;
+    s1 << i;
+    data2(i) = s1.str();
+  }
+
+  Tensor<std::string, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    std::ostringstream concat;
+    concat << "abc" << i;
+    std::string expected = concat.str();
+    VERIFY_IS_EQUAL(sum(i), expected);
+  }
+}
+
+
+static void test_initialization()
+{
+  Tensor<std::string, 2> a(2, 3);
+  a.setConstant(std::string("foo"));
+  for (int i = 0; i < 2*3; ++i) {
+    VERIFY_IS_EQUAL(a(i), std::string("foo"));
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_strings)
+{
+  // Beware: none of this is likely to ever work on a GPU.
+  CALL_SUBTEST(test_assign());
+  CALL_SUBTEST(test_concat());
+  CALL_SUBTEST(test_slices());
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_initialization());
+}

diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
new file mode 100644
index 0000000..b8a329d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_padding.cpp

@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_padding()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Tensor<float, 4, DataLayout> padded;
+  padded = tensor.pad(paddings);
+
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_padded_expr()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Eigen::DSizes<ptrdiff_t, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+  Tensor<float, 2, DataLayout> result;
+  result = tensor.pad(paddings).reshape(reshape_dims);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_padding)
+{
+  CALL_SUBTEST(test_simple_padding<ColMajor>());
+  CALL_SUBTEST(test_simple_padding<RowMajor>());
+  CALL_SUBTEST(test_padded_expr<ColMajor>());
+  CALL_SUBTEST(test_padded_expr<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
new file mode 100644
index 0000000..727a9ff
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp

@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  IndexType padedSizeDim1 = 2;
+  IndexType padedSizeDim2 = 6;
+  IndexType padedSizeDim3 = 12;
+  IndexType padedSizeDim4 = 7;
+  array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange);
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange);
+
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings);
+  sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
+  for (IndexType i = 0; i < padedSizeDim1; ++i) {
+    for (IndexType j = 0; j < padedSizeDim2; ++j) {
+      for (IndexType k = 0; k < padedSizeDim3; ++k) {
+        for (IndexType l = 0; l < padedSizeDim4; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Eigen::DSizes<IndexType, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+
+  Tensor<DataType, 2, DataLayout, IndexType>  result(reshape_dims);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 6; ++j) {
+      for (IndexType k = 0; k < 12; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
new file mode 100644
index 0000000..498ab8c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_patch.cpp

@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_patch()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> patch_dims;
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 1;
+  patch_dims[2] = 1;
+  patch_dims[3] = 1;
+
+  Tensor<float, 5, DataLayout> no_patch;
+  no_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+  } else {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+  }
+
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+  Tensor<float, 5, DataLayout> single_patch;
+  single_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+  } else {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 2;
+  patch_dims[3] = 1;
+  Tensor<float, 5, DataLayout> twod_patch;
+  twod_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+  } else {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+          } else {
+            patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 2; ++y) {
+              if (DataLayout == ColMajor) {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+              } else {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 3;
+  patch_dims[3] = 5;
+  Tensor<float, 5, DataLayout> threed_patch;
+  threed_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+  } else {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+          } else {
+            patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 3; ++y) {
+              for (int z = 0; z < 5; ++z) {
+                if (DataLayout == ColMajor) {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+                } else {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_patch)
+{
+   CALL_SUBTEST(test_simple_patch<ColMajor>());
+   CALL_SUBTEST(test_simple_patch<RowMajor>());
+   //   CALL_SUBTEST(test_expr_shuffling());
+}

diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
new file mode 100644
index 0000000..7f92bec
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp

@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> patchTensorRange;
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}};
+  }else{
+     patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}};
+  }
+
+  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange);
+
+  tensor.setRandom();
+
+  array<ptrdiff_t, 4> patch_dims;
+  patch_dims[0] = 1;
+  patch_dims[1] = 1;
+  patch_dims[2] = 1;
+  patch_dims[3] = 1;
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+  } else {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+  }
+
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}};
+  }else{
+     patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange);
+  patchTensorBuffSize =single_patch.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange);
+
+  gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+  } else {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 2;
+  patch_dims[3] = 1;
+  
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1,2,2,1,2*2*4*7}};
+  }else{
+     patchTensorRange = {{2*2*4*7, 1, 2,2,1}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange);
+  patchTensorBuffSize =twod_patch.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange);
+
+  gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+  } else {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+          } else {
+            patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 2; ++y) {
+              if (DataLayout == ColMajor) {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+              } else {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 3;
+  patch_dims[3] = 5;
+
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1,2,3,5,2*2*3*3}};
+  }else{
+     patchTensorRange = {{2*2*3*3, 1, 2,3,5}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange);
+  patchTensorBuffSize =threed_patch.size()*sizeof(DataType);
+  DataType* gpu_data_threed_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange);
+
+  gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+  } else {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+          } else {
+            patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 3; ++y) {
+              for (int z = 0; z < 5; ++z) {
+                if (DataLayout == ColMajor) {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+                } else {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_no_patch);
+  sycl_device.deallocate(gpu_data_single_patch);
+  sycl_device.deallocate(gpu_data_twod_patch);
+  sycl_device.deallocate(gpu_data_threed_patch);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
new file mode 100644
index 0000000..b9d4c55
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random.cpp

@@ -0,0 +1,86 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+template<typename Scalar>
+static void test_default()
+{
+  Tensor<Scalar, 1> vec(6);
+  vec.setRandom();
+
+  // Fixme: we should check that the generated numbers follow a uniform
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+template<typename Scalar>
+static void test_normal()
+{
+  Tensor<Scalar, 1> vec(6);
+  vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
+
+  // Fixme: we should check that the generated numbers follow a gaussian
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+struct MyGenerator {
+  MyGenerator() { }
+  MyGenerator(const MyGenerator&) { }
+
+  // Return a random value to be used.  "element_location" is the
+  // location of the entry to set in the tensor, it can typically
+  // be ignored.
+  int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    return static_cast<int>(3 * element_location);
+  }
+
+  // Same as above but generates several numbers at a time.
+  internal::packet_traits<int>::type packetOp(
+      Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    const int packetSize = internal::packet_traits<int>::size;
+    EIGEN_ALIGN_MAX int values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = static_cast<int>(3 * (packet_location + i));
+    }
+    return internal::pload<typename internal::packet_traits<int>::type>(values);
+  }
+};
+
+
+static void test_custom()
+{
+  Tensor<int, 1> vec(6);
+  vec.setRandom<MyGenerator>();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec(i), 3*i);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_random)
+{
+  CALL_SUBTEST((test_default<float>()));
+  CALL_SUBTEST((test_normal<float>()));
+  CALL_SUBTEST((test_default<double>()));
+  CALL_SUBTEST((test_normal<double>()));
+  CALL_SUBTEST((test_default<Eigen::half>()));
+  CALL_SUBTEST((test_normal<Eigen::half>()));
+  CALL_SUBTEST((test_default<Eigen::bfloat16>()));
+  CALL_SUBTEST((test_normal<Eigen::bfloat16>()));
+  CALL_SUBTEST(test_custom());
+}

diff --git a/unsupported/test/cxx11_tensor_random_gpu.cu b/unsupported/test/cxx11_tensor_random_gpu.cu
new file mode 100644
index 0000000..090986e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu

@@ -0,0 +1,86 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+void test_gpu_random_uniform()
+{
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out;
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  gpu_out.device(gpu_device) = gpu_out.random();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  // For now we just check this code doesn't crash.
+  // TODO: come up with a valid test of randomness
+}
+
+
+void test_gpu_random_normal()
+{
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out;
+  gpuMalloc((void**)(&d_out), out_bytes);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  Eigen::internal::NormalRandomGenerator<float> gen(true);
+  gpu_out.device(gpu_device) = gpu_out.random(gen);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+}
+
+static void test_complex()
+{
+  Tensor<std::complex<float>, 1> vec(6);
+  vec.setRandom();
+
+  // Fixme: we should check that the generated numbers follow a uniform
+  // distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
+{
+  CALL_SUBTEST(test_gpu_random_uniform());
+  CALL_SUBTEST(test_gpu_random_normal());
+  CALL_SUBTEST(test_complex());
+}

diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
new file mode 100644
index 0000000..6c83894
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp

@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+  Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(DataType);
+
+  IndexType sizeDim0 = 72;
+  IndexType sizeDim1 = 97;
+
+  array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+  DataType* d_out  = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+  gpu_out.device(sycl_device)=gpu_out.random();
+  sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+  for(IndexType i=1; i<sizeDim0; i++)
+    for(IndexType j=1; j<sizeDim1; j++)
+    {
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));    }
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+  sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+  Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+  out.setZero();
+  std::size_t out_bytes = out.size() * sizeof(DataType);
+
+  IndexType sizeDim0 = 72;
+  IndexType sizeDim1 = 97;
+
+  array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+  DataType* d_out  = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+  Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+  gpu_out.device(sycl_device)=gpu_out.random(gen);
+  sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+  for(IndexType i=1; i<sizeDim0; i++)
+    for(IndexType j=1; j<sizeDim1; j++)
+    {
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+    }
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+  sycl_device.deallocate(d_out);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST(sycl_random_test_per_device<double>(device));
+#endif
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
new file mode 100644
index 0000000..c46c4c9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction.cpp

@@ -0,0 +1,532 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_trivial_reductions() {
+  {
+    Tensor<float, 0, DataLayout> tensor;
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result(), tensor());
+  }
+
+  {
+    Tensor<float, 1, DataLayout> tensor(7);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 7);
+    for (int i = 0; i < 7; ++i) {
+      VERIFY_IS_EQUAL(result(i), tensor(i));
+    }
+  }
+
+  {
+    Tensor<float, 2, DataLayout> tensor(2, 3);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 2);
+    VERIFY_IS_EQUAL(result.dimension(1), 3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        VERIFY_IS_EQUAL(result(i, j), tensor(i, j));
+      }
+    }
+  }
+}
+
+template <typename Scalar,int DataLayout>
+static void test_simple_reductions() {
+  Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+  // Add a little offset so that the product reductions won't be close to zero.
+  tensor += tensor.constant(Scalar(0.5f));
+  array<ptrdiff_t, 2> reduction_axis2;
+  reduction_axis2[0] = 1;
+  reduction_axis2[1] = 3;
+
+  Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      Scalar sum = Scalar(0.0f);
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum);
+    }
+  }
+
+  {
+    Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum();
+    VERIFY_IS_EQUAL(sum1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+    VERIFY_IS_EQUAL(sum2.rank(), 0);
+
+    VERIFY_IS_APPROX(sum1(), sum2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 2;
+  result = tensor.prod(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      Scalar prod = Scalar(1.0f);
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          prod *= tensor(k, i, l, j);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), prod);
+    }
+  }
+
+  {
+    Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod();
+    VERIFY_IS_EQUAL(prod1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+    VERIFY_IS_EQUAL(prod2.rank(), 0);
+
+    VERIFY_IS_APPROX(prod1(), prod2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 2;
+  result = tensor.maximum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      Scalar max_val = std::numeric_limits<Scalar>::lowest();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          max_val = (std::max)(max_val, tensor(k, i, l, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), max_val);
+    }
+  }
+
+  {
+    Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum();
+    VERIFY_IS_EQUAL(max1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+    VERIFY_IS_EQUAL(max2.rank(), 0);
+
+    VERIFY_IS_APPROX(max1(), max2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 1;
+  result = tensor.minimum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      Scalar min_val = (std::numeric_limits<Scalar>::max)();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          min_val = (std::min)(min_val, tensor(k, l, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), min_val);
+    }
+  }
+
+  {
+    Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum();
+    VERIFY_IS_EQUAL(min1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+    VERIFY_IS_EQUAL(min2.rank(), 0);
+
+    VERIFY_IS_APPROX(min1(), min2());
+  }
+
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 1;
+  result = tensor.mean(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      Scalar sum = Scalar(0.0f);
+      int count = 0;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          sum += tensor(k, l, i, j);
+          ++count;
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum / Scalar(count));
+    }
+  }
+
+  {
+    Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean();
+    VERIFY_IS_EQUAL(mean1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+    VERIFY_IS_EQUAL(mean2.rank(), 0);
+
+    VERIFY_IS_APPROX(mean1(), mean2());
+  }
+
+  {
+    Tensor<int, 1> ints(10);
+    std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
+
+    TensorFixedSize<bool, Sizes<> > all_;
+    all_ = ints.all();
+    VERIFY(!all_());
+    all_ = (ints >= ints.constant(0)).all();
+    VERIFY(all_());
+
+    TensorFixedSize<bool, Sizes<> > any;
+    any = (ints > ints.constant(10)).any();
+    VERIFY(!any());
+    any = (ints < ints.constant(1)).any();
+    VERIFY(any());
+  }
+}
+
+
+template <int DataLayout>
+static void test_reductions_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis2;
+  reduction_axis2[0] = 1;
+  reduction_axis2[1] = 3;
+
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.sum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_full_reductions() {
+  Tensor<float, 2, DataLayout> tensor(2, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+
+  Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+  VERIFY_IS_EQUAL(result.rank(), 0);
+
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(0), sum);
+
+  result = tensor.square().sum(reduction_axis).sqrt();
+  VERIFY_IS_EQUAL(result.rank(), 0);
+
+  sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j) * tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(), sqrtf(sum));
+}
+
+struct UserReducer {
+  static const bool PacketAccess = false;
+  UserReducer(float offset) : offset_(offset) {}
+  void reduce(const float val, float* accum) { *accum += val * val; }
+  float initialize() const { return 0; }
+  float finalize(const float accum) const { return 1.0f / (accum + offset_); }
+
+ private:
+  const float offset_;
+};
+
+template <int DataLayout>
+static void test_user_defined_reductions() {
+  Tensor<float, 2, DataLayout> tensor(5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 1> reduction_axis;
+  reduction_axis[0] = 1;
+
+  UserReducer reducer(10.0f);
+  Tensor<float, 1, DataLayout> result = tensor.reduce(reduction_axis, reducer);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  for (int i = 0; i < 5; ++i) {
+    float expected = 10.0f;
+    for (int j = 0; j < 7; ++j) {
+      expected += tensor(i, j) * tensor(i, j);
+    }
+    expected = 1.0f / expected;
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+template <int DataLayout>
+static void test_tensor_maps() {
+  int inputs[2 * 3 * 5 * 7];
+  TensorMap<Tensor<int, 4, DataLayout> > tensor_map(inputs, 2, 3, 5, 7);
+  TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const(inputs, 2, 3, 5,
+                                                                7);
+  const TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const_const(
+      inputs, 2, 3, 5, 7);
+
+  tensor_map.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  Tensor<int, 2, DataLayout> result = tensor_map.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result2 = tensor_map_const.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result3 =
+      tensor_map_const_const.sum(reduction_axis);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int sum = 0;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor_map(i, k, j, l);
+        }
+      }
+      VERIFY_IS_EQUAL(result(i, j), sum);
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_static_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 97);
+  in.setRandom();
+
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+#else
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<3> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, k, j, l));
+        }
+      }
+      VERIFY_IS_EQUAL(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_last_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(97, 113);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+#else
+  // This triggers the use of packets for ColMajor.
+  Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 97; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 72; ++l) {
+          expected = (std::max)(expected, in(l, k, i, j));
+        }
+      }
+      VERIFY_IS_EQUAL(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_first_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 2;
+  reduction_axis[1] = 3;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 97; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, j, k, l));
+        }
+      }
+      VERIFY_IS_EQUAL(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_reduce_middle_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 2;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 97; ++l) {
+          expected = (std::max)(expected, in(i, k, l, j));
+        }
+      }
+      VERIFY_IS_EQUAL(out(i, j), expected);
+    }
+  }
+}
+
+static void test_sum_accuracy() {
+  Tensor<float, 3> tensor(101, 101, 101);
+  for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
+    tensor.setRandom();
+    tensor += tensor.constant(prescribed_mean);
+
+    Tensor<float, 0> sum = tensor.sum();
+    double expected_sum = 0.0;
+    for (int i = 0; i < 101; ++i) {
+      for (int j = 0; j < 101; ++j) {
+        for (int k = 0; k < 101; ++k) {
+          expected_sum += static_cast<double>(tensor(i, j, k));
+        }
+      }
+    }
+    VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
+  CALL_SUBTEST(test_trivial_reductions<ColMajor>());
+  CALL_SUBTEST(test_trivial_reductions<RowMajor>());
+  CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<Eigen::bfloat16,ColMajor>() ));
+  CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
+  CALL_SUBTEST(test_full_reductions<ColMajor>());
+  CALL_SUBTEST(test_full_reductions<RowMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<ColMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+  CALL_SUBTEST(test_static_dims<ColMajor>());
+  CALL_SUBTEST(test_static_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
+  CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
+  CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
+  CALL_SUBTEST(test_sum_accuracy());
+}

diff --git a/unsupported/test/cxx11_tensor_reduction_gpu.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu
new file mode 100644
index 0000000..122ac94
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu

@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+template<typename Type, int DataLayout>
+static void test_full_reductions() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<Type, 0, DataLayout> full_redux;
+  full_redux = in.sum();
+
+  std::size_t in_bytes = in.size() * sizeof(Type);
+  std::size_t out_bytes = full_redux.size() * sizeof(Type);
+  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<Type, 0, DataLayout> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {
+  int dim_x = 33;
+  int dim_y = 1;
+  int dim_z = 128;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+  // Create device
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Create data(T)
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+  
+  // Perform operation
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {
+  int dim_x = 128;
+  int dim_y = 1;
+  int dim_z = 33;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+  // Create device
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Create data
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+  
+  // Perform operation
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
+  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
+  
+  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
+  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.  					      
+//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()))
+
+  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.  					      
+//  CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
+}

diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
new file mode 100644
index 0000000..a297716
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp

@@ -0,0 +1,1014 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 753;
+  const IndexType num_cols = 537;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  array<IndexType, 2> outRange = {{1, 1}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
+
+  in.setRandom();
+  auto dim = DSizes<IndexType, 2>(1, 1);
+  full_redux = in.sum().reshape(dim);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
+      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
+                                                                outRange);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
+  sycl_device.memcpyDeviceToHost(
+      full_redux_gpu.data(), gpu_out_data,
+      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
+            << ", CPU FULL: " << full_redux(0, 0) << "\n";
+  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.sum();
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.maximum();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the max.
+  // As we don't include this in the reduction the result should not be 2.
+  in(0) = static_cast<DataType>(2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.maximum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+  array<IndexType, 1> argRange = {{num_cols}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  //  red_axis[1]=1;
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+  in_arg1.setRandom();
+  in_arg2.setRandom();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
+      gpu_in_arg1_data, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
+      gpu_in_arg2_data, tensorRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
+      gpu_out_arg_data, argRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
+      gpu_out_arg__gpu_helper_data, argRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  // CPU VERSION
+  out_arg_cpu =
+      (in_arg1.argmax(1) == in_arg2.argmax(1))
+          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
+  full_redux = (out_arg_cpu.template cast<float>())
+                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+
+  // GPU VERSION
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg1_data, in_arg1.data(),
+      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg2_data, in_arg2.data(),
+      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
+  out_Argout_gpu_helper.device(sycl_device) =
+      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
+  out_Argout_gpu.device(sycl_device) =
+      (out_Argout_gpu_helper)
+          .select(out_Argout_gpu.constant(true),
+                  out_Argout_gpu.constant(false));
+  out_gpu.device(sycl_device) =
+      (out_Argout_gpu.template cast<float>())
+          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
+            << '\n';
+  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_in_arg1_data);
+  sycl_device.deallocate(gpu_in_arg2_data);
+  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
+  sycl_device.deallocate(gpu_out_arg_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.mean();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_odd_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  // This is a particular case which illustrates a possible problem when the
+  // number of local threads in a workgroup is even, but is not a power of two.
+  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
+  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
+  const IndexType n_elems = 8707;
+  array<IndexType, 1> tensor_range = {{n_elems}};
+
+  data_tensor in(tensor_range);
+  DataType full_redux;
+  DataType full_redux_gpu;
+  TensorMap<scalar_tensor> red_cpu(&full_redux);
+  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
+
+  const DataType const_val = static_cast<DataType>(0.6391);
+  in = in.constant(const_val);
+
+  Eigen::IndexList<Eigen::type2index<0>> red_axis;
+  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  VERIFY_IS_APPROX(const_val, red_cpu());
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) =
+      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 876;
+  const IndexType num_cols = 953;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.minimum();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the min.
+  // As we don't include this in the reduction the result should not be -2.
+  in(0) = static_cast<DataType>(-2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.minimum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.maximum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> reduced_range = {{num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = num_cols;
+
+  data_tensor in(tensor_range);
+  reduced_tensor redux;
+  reduced_tensor redux_gpu(reduced_range);
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  redux = in_offset.maximum(red_axis);
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate(n_reduced * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
+                                 n_reduced * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> full_reduced_range = {{num_rows}};
+  array<IndexType, 1> reduced_range = {{num_rows - 1}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = reduced_range[0];
+
+  data_tensor in(tensor_range);
+  reduced_tensor redux(full_reduced_range);
+  reduced_tensor redux_gpu(reduced_range);
+
+  in.setRandom();
+  redux.setZero();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 1;
+
+  const IndexType offset = 64;
+  // Introduce an offset in both the input and the output.
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
+  red_offset = in_offset.maximum(red_axis);
+
+  // Check that the first value hasn't been changed and that the reduced values
+  // are not equal to the previously set maximum in the input outside the range.
+  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
+                                 n_reduced * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
+  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+  redux = in.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < redux.size(); i++) {
+    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
+  }
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 2;
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 2;
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  // auto red_axis =  Sizes<0,1>(0,1);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+
+  redux_fix = in_fix.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+  redux_fix = in_fix.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  sycl_device.synchronize();
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data This is an example on how to get if a
+// Tensor contains nan and/or inf in one reduction
+template <typename InT, typename OutT>
+struct CustomReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  static constexpr OutT InfBit = 1;
+  static constexpr OutT NanBit = 2;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+                                                    OutT* accum) const {
+    if (Eigen::numext::isinf(x))
+      *accum |= InfBit;
+    else if (Eigen::numext::isnan(x))
+      *accum |= NanBit;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+                                                    OutT* accum) const {
+    *accum |= x;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+    return OutT(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+    return accum;
+  }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+          typename IndexType>
+static void test_full_reductions_custom_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  constexpr IndexType InSize = 64;
+  auto tensorRange = Sizes<InSize>(InSize);
+  Eigen::IndexList<Eigen::type2index<0>> dims;
+  auto reduced_tensorRange = Sizes<>();
+  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+  CustomReducer<DataType, AccumType> reducer;
+
+  in_fix.setRandom();
+
+  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+  AccumType* gpu_out_data =
+      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+                                 sizeof(AccumType));
+  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
+  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  129, 8);
+  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
+  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  sycl_device.synchronize();
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
new file mode 100644
index 0000000..7dbd047
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_ref.cpp

@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_simple_lvalue_ref()
+{
+  Tensor<int, 1> input(6);
+  input.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input);
+  TensorRef<Tensor<int, 1>> ref4 = input;
+
+  VERIFY_IS_EQUAL(ref3.data(), input.data());
+  VERIFY_IS_EQUAL(ref4.data(), input.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input(i));
+    VERIFY_IS_EQUAL(ref4(i), input(i));
+  }
+
+  for (int i = 0; i < 6; ++i) {
+    ref3.coeffRef(i) = i;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), i);
+  }
+  for (int i = 0; i < 6; ++i) {
+    ref4.coeffRef(i) = -i * 2;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), -i*2);
+  }
+}
+
+
+static void test_simple_rvalue_ref()
+{
+  Tensor<int, 1> input1(6);
+  input1.setRandom();
+  Tensor<int, 1> input2(6);
+  input2.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input1 + input2);
+  TensorRef<Tensor<int, 1>> ref4 = input1 + input2;
+
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i));
+    VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i));
+  }
+}
+
+
+static void test_multiple_dims()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  VERIFY_IS_EQUAL(ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref.dimension(2), 7);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_slice()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+  TensorRef<Tensor<float, 5>> slice = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+
+  Eigen::DSizes<ptrdiff_t, 5> indices3(0,0,0,0,0);
+  Eigen::DSizes<ptrdiff_t, 5> sizes3(2,3,1,1,1);
+  slice = tensor.slice(indices3, sizes3);
+  VERIFY_IS_EQUAL(slice.data(), tensor.data());
+}
+
+
+static void test_ref_of_ref()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  TensorRef<Tensor<float, 3>> ref_of_ref(ref);
+  TensorRef<Tensor<float, 3>> ref_of_ref2;
+  ref_of_ref2 = ref;
+
+  VERIFY_IS_EQUAL(ref_of_ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k));
+        VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k));
+     }
+    }
+  }
+}
+
+
+static void test_ref_in_expr()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+  TensorRef<Tensor<float, 3>> input_ref(input);
+
+  Tensor<float, 3> result(3,5,7);
+  result.setRandom();
+  TensorRef<Tensor<float, 3>> result_ref(result);
+
+  Tensor<float, 3> bias(3,5,7);
+  bias.setRandom();
+
+  result_ref = input_ref + bias;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k));
+        VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+
+  result = result_ref;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_coeff_ref()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  Tensor<float, 5> original = tensor;
+
+  TensorRef<Tensor<float, 4>> slice = tensor.chip(7, 4);
+  slice.coeffRef(0, 0, 0, 0) = 1.0f;
+  slice.coeffRef(1, 0, 0, 0) += 2.0f;
+
+  VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f);
+  VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f);
+}
+
+
+static void test_nested_ops_with_ref()
+{
+  Tensor<float, 4> t(2, 3, 5, 7);
+  t.setRandom();
+  TensorMap<Tensor<const float, 4> > m(t.data(), 2, 3, 5, 7);
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+  DSizes<Eigen::DenseIndex, 4> shuffle_dims(0, 1, 2, 3);
+  TensorRef<Tensor<const float, 4> > ref(m.pad(paddings));
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> trivial;
+  trivial[0] = std::make_pair(0, 0);
+  trivial[1] = std::make_pair(0, 0);
+  trivial[2] = std::make_pair(0, 0);
+  trivial[3] = std::make_pair(0, 0);
+  Tensor<float, 4> padded = ref.shuffle(shuffle_dims).pad(trivial);
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_ref)
+{
+  CALL_SUBTEST(test_simple_lvalue_ref());
+  CALL_SUBTEST(test_simple_rvalue_ref());
+  CALL_SUBTEST(test_multiple_dims());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_ref_of_ref());
+  CALL_SUBTEST(test_ref_in_expr());
+  CALL_SUBTEST(test_coeff_ref());
+  CALL_SUBTEST(test_nested_ops_with_ref());
+}

diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
new file mode 100644
index 0000000..5e44ec0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reverse.cpp

@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com and
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_simple_reverse()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = true;
+  dim_rev[3] = false;
+
+  Tensor<float, 4, DataLayout> reversed_tensor;
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = false;
+
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_expr_reverse(bool LValue)
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  Tensor<float, 4, DataLayout> expected(2, 3, 5, 7);
+  if (LValue) {
+    expected.reverse(dim_rev) = tensor;
+  } else {
+    expected = tensor.reverse(dim_rev);
+  }
+
+  Tensor<float, 4, DataLayout> result(2,3,5,7);
+
+  array<ptrdiff_t, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<ptrdiff_t, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<ptrdiff_t, 4> dst_slice_dim = src_slice_dim;
+  array<ptrdiff_t, 4> dst_slice_start = src_slice_start;
+
+  for (int i = 0; i < 5; ++i) {
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(src_slice_start, src_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 3);
+  VERIFY_IS_EQUAL(result.dimension(2), 5);
+  VERIFY_IS_EQUAL(result.dimension(3), 7);
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  for (int i = 0; i < 5; ++i) {
+     if (LValue) {
+       result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+           tensor.slice(dst_slice_start, dst_slice_dim);
+     } else {
+       result.slice(dst_slice_start, dst_slice_dim) =
+           tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+     }
+    dst_slice_start[2] += 1;
+  }
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse)
+{
+  CALL_SUBTEST(test_simple_reverse<ColMajor>());
+  CALL_SUBTEST(test_simple_reverse<RowMajor>());
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
+}

diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
new file mode 100644
index 0000000..dd30c23
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp

@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
+  IndexType dim1 = 2;
+  IndexType dim2 = 3;
+  IndexType dim3 = 5;
+  IndexType dim4 = 7;
+
+  array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = true;
+  dim_rev[3] = false;
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
+                                                                 tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(i, 2 - j, 4 - k, l));
+        }
+      }
+    }
+  }
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = false;
+
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(1 - i, j, k, 6 - l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
+                              bool LValue) {
+  IndexType dim1 = 2;
+  IndexType dim2 = 3;
+  IndexType dim3 = 5;
+  IndexType dim4 = 7;
+
+  array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
+      expected.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_result = static_cast<DataType*>(
+      sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
+      gpu_out_data_expected, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
+      gpu_out_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
+
+  if (LValue) {
+    out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
+  } else {
+    out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
+  }
+  sycl_device.memcpyDeviceToHost(
+      expected.data(), gpu_out_data_expected,
+      expected.dimensions().TotalSize() * sizeof(DataType));
+
+  array<IndexType, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<IndexType, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<IndexType, 4> dst_slice_dim = src_slice_dim;
+  array<IndexType, 4> dst_slice_start = src_slice_start;
+
+  for (IndexType i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < expected.dimension(0); ++i) {
+    for (IndexType j = 0; j < expected.dimension(1); ++j) {
+      for (IndexType k = 0; k < expected.dimension(2); ++k) {
+        for (IndexType l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  sycl_device.memcpyHostToDevice(
+      gpu_out_data_result, result.data(),
+      (result.dimensions().TotalSize()) * sizeof(DataType));
+  for (IndexType i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < expected.dimension(0); ++i) {
+    for (IndexType j = 0; j < expected.dimension(1); ++j) {
+      for (IndexType k = 0; k < expected.dimension(2); ++k) {
+        for (IndexType l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+        }
+      }
+    }
+  }
+}
+
+template <typename DataType>
+void sycl_reverse_test_per_device(const cl::sycl::device& d) {
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
+  test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
+  test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
+  test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
+  test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.get_info<cl::sycl::info::device::name>() << std::endl;
+    CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
+    CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
+    CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
+#endif
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp
new file mode 100644
index 0000000..83b5923
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_roundings.cpp

@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_float_rounding()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+
+  Tensor<float, 2> result = ftensor.round();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::round(ftensor(i,j)));
+    }
+  }
+}
+
+static void test_float_flooring()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+
+  Tensor<float, 2> result = ftensor.floor();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::floor(ftensor(i,j)));
+    }
+  }
+}
+
+static void test_float_ceiling()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+
+  Tensor<float, 2> result = ftensor.ceil();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::ceil(ftensor(i,j)));
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_roundings)
+{
+   CALL_SUBTEST(test_float_rounding());
+   CALL_SUBTEST(test_float_ceiling());
+   CALL_SUBTEST(test_float_flooring());
+}

diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
new file mode 100644
index 0000000..dccee9e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan.cpp

@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout, typename Type=float, bool Exclusive = false>
+static void test_1d_scan()
+{
+  int size = 50;
+  Tensor<Type, 1, DataLayout> tensor(size);
+  tensor.setRandom();
+  Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive);
+
+  VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0));
+
+  float accum = 0;
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum += tensor(i);
+    } else {
+      accum += tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+
+  accum = 1;
+  result = tensor.cumprod(0, Exclusive);
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum *= tensor(i);
+    } else {
+      accum *= tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+}
+
+template <int DataLayout, typename Type=float>
+static void test_4d_scan()
+{
+  int size = 5;
+  Tensor<Type, 4, DataLayout> tensor(size, size, size, size);
+  tensor.setRandom();
+
+  Tensor<Type, 4, DataLayout> result(size, size, size, size);
+
+  result = tensor.cumsum(0);
+  float accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(i, 1, 2, 3);
+    VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum);
+  }
+  result = tensor.cumsum(1);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, i, 2, 3);
+    VERIFY_IS_EQUAL(result(1, i, 2, 3), accum);
+  }
+  result = tensor.cumsum(2);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, i, 3);
+    VERIFY_IS_EQUAL(result(1, 2, i, 3), accum);
+  }
+  result = tensor.cumsum(3);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, 3, i);
+    VERIFY_IS_EQUAL(result(1, 2, 3, i), accum);
+  }
+}
+
+template <int DataLayout>
+static void test_tensor_maps() {
+  int inputs[20];
+  TensorMap<Tensor<int, 1, DataLayout> > tensor_map(inputs, 20);
+  tensor_map.setRandom();
+
+  Tensor<int, 1, DataLayout> result = tensor_map.cumsum(0);
+
+  int accum = 0;
+  for (int i = 0; i < 20; ++i) {
+    accum += tensor_map(i);
+    VERIFY_IS_EQUAL(result(i), accum);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_scan) {
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, false>()));
+  CALL_SUBTEST(test_4d_scan<ColMajor>());
+  CALL_SUBTEST(test_4d_scan<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_scan_gpu.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu
new file mode 100644
index 0000000..770a144
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu

@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_gpu_cumsum(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
+  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
+
+  t_input.setRandom();
+
+  std::size_t t_input_bytes = t_input.size()  * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_input;
+  float* d_t_result;
+
+  gpuMalloc((void**)(&d_t_input), t_input_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
+
+  gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
+  t_result = t_input.cumsum(1);
+
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+
+  gpuFree((void*)d_t_input);
+  gpuFree((void*)d_t_result);
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
+}

diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp
new file mode 100644
index 0000000..09c45fc
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp

@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size,
+                      IndexType k_size, IndexType n_size, int consume_dim,
+                      bool exclusive) {
+  static const DataType error_threshold = 1e-4f;
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size
+            << " consume_dim : " << consume_dim << ")" << std::endl;
+  Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size,
+                                                          n_size);
+
+  t_input.setRandom();
+  std::size_t t_input_bytes = t_input.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType* gpu_data_in =
+      static_cast<DataType*>(sycl_device.allocate(t_input_bytes));
+  DataType* gpu_data_out =
+      static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
+
+  array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}};
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input(
+      gpu_data_in, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result(
+      gpu_data_out, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes);
+  sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive);
+
+  t_result = t_input.cumsum(consume_dim, exclusive);
+
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out,
+                                 t_result_bytes);
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i)
+              << " vs SYCL : " << t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+  sycl_device.deallocate(gpu_data_in);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(
+        sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(
+        sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(
+        sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_5(
+        sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_6(
+        sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
new file mode 100644
index 0000000..89a64c0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp

@@ -0,0 +1,283 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_simple_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+
+  Tensor<float, 4, DataLayout> no_shuffle;
+  no_shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_expr_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> expected;
+  expected = tensor.shuffle(shuffles);
+
+  Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
+
+  array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}};
+  array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}};
+  array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}};
+  array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}};
+
+  for (int i = 0; i < 5; ++i) {
+    result.slice(dst_slice_start, dst_slice_dim) =
+        tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles);
+    src_slice_start[2] += 1;
+    dst_slice_start[0] += 1;
+  }
+
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  VERIFY_IS_EQUAL(result.dimension(2), 3);
+  VERIFY_IS_EQUAL(result.dimension(3), 2);
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[0] = 0;
+  result.setRandom();
+  for (int i = 0; i < 5; ++i) {
+    result.slice(dst_slice_start, dst_slice_dim) =
+        tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim);
+    dst_slice_start[0] += 1;
+  }
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_shuffling_as_value()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[2] = 0;
+  shuffles[3] = 1;
+  shuffles[1] = 2;
+  shuffles[0] = 3;
+  Tensor<float, 4, DataLayout> shuffle(5,7,3,2);
+  shuffle.shuffle(shuffles) = tensor;
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+
+  array<ptrdiff_t, 4> no_shuffle;
+  no_shuffle[0] = 0;
+  no_shuffle[1] = 1;
+  no_shuffle[2] = 2;
+  no_shuffle[3] = 3;
+  Tensor<float, 4, DataLayout> shuffle2(5,7,3,2);
+  shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_shuffle_unshuffle()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  // Choose a random permutation.
+  array<ptrdiff_t, 4> shuffles;
+  for (int i = 0; i < 4; ++i) {
+    shuffles[i] = i;
+  }
+  array<ptrdiff_t, 4> shuffles_inverse;
+  for (int i = 0; i < 4; ++i) {
+    const ptrdiff_t index = internal::random<ptrdiff_t>(i, 3);
+    shuffles_inverse[shuffles[index]] = i;
+    std::swap(shuffles[i], shuffles[index]);
+  }
+
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles).shuffle(shuffles_inverse);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_empty_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,0,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+
+  Tensor<float, 4, DataLayout> no_shuffle;
+  no_shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 0; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 0);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 0; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling)
+{
+  CALL_SUBTEST(test_simple_shuffling<ColMajor>());
+  CALL_SUBTEST(test_simple_shuffling<RowMajor>());
+  CALL_SUBTEST(test_expr_shuffling<ColMajor>());
+  CALL_SUBTEST(test_expr_shuffling<RowMajor>());
+  CALL_SUBTEST(test_shuffling_as_value<ColMajor>());
+  CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
+  CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>());
+  CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>());
+  CALL_SUBTEST(test_empty_shuffling<ColMajor>());
+  CALL_SUBTEST(test_empty_shuffling<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
new file mode 100644
index 0000000..ca4e8b5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp

@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) {
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange);
+  tensor.setRandom();
+
+  const size_t buffSize = tensor.size() * sizeof(DataType);
+  array<IndexType, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1,
+                                                             tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2,
+                                                             tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
+
+  gpu2.device(sycl_device) = gpu1.shuffle(shuffles);
+  sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
+  sycl_device.synchronize();
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  array<IndexType, 4> tensorrangeShuffle = {
+      {sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle);
+  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3(
+      gpu_data3, tensorrangeShuffle);
+
+  gpu3.device(sycl_device) = gpu1.shuffle(shuffles);
+  sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
+  sycl_device.synchronize();
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i));
+        }
+      }
+    }
+  }
+}
+
+template <typename DataType, typename dev_Selector>
+void sycl_shuffling_test_per_device(dev_Selector s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
new file mode 100644
index 0000000..6d70f54
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_simple.cpp

@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_0d()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+  Tensor<int, 0> scalar3;
+  Tensor<int, 0, RowMajor> scalar4;
+
+  scalar3.resize();
+  scalar4.resize();
+
+  scalar1() = 7;
+  scalar2() = 13;
+  scalar3.setValues(17);
+  scalar4.setZero();
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+  VERIFY_IS_EQUAL(scalar3(), 17);
+  VERIFY_IS_EQUAL(scalar4(), 0);
+
+  Tensor<int, 0> scalar5(scalar1);
+
+  VERIFY_IS_EQUAL(scalar5(), 7);
+  VERIFY_IS_EQUAL(scalar5.data()[0], 7);
+}
+
+static void test_1d()
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+  Tensor<int, 1> vec3;
+  Tensor<int, 1, RowMajor> vec4;
+
+  vec3.resize(6);
+  vec4.resize(6);
+
+  vec1(0) = 4;  vec2(0) = 0; vec3(0) = 5;
+  vec1(1) = 8;  vec2(1) = 1; vec3(1) = 4;
+  vec1(2) = 15; vec2(2) = 2; vec3(2) = 3;
+  vec1(3) = 16; vec2(3) = 3; vec3(3) = 2;
+  vec1(4) = 23; vec2(4) = 4; vec3(4) = 1;
+  vec1(5) = 42; vec2(5) = 5; vec3(5) = 0;
+  vec4.setZero();
+
+  VERIFY_IS_EQUAL((vec1.rank()), 1);
+  VERIFY_IS_EQUAL((vec1.size()), 6);
+  VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
+
+  VERIFY_IS_EQUAL((vec1[0]), 4);
+  VERIFY_IS_EQUAL((vec1[1]), 8);
+  VERIFY_IS_EQUAL((vec1[2]), 15);
+  VERIFY_IS_EQUAL((vec1[3]), 16);
+  VERIFY_IS_EQUAL((vec1[4]), 23);
+  VERIFY_IS_EQUAL((vec1[5]), 42);
+
+  VERIFY_IS_EQUAL((vec2[0]), 0);
+  VERIFY_IS_EQUAL((vec2[1]), 1);
+  VERIFY_IS_EQUAL((vec2[2]), 2);
+  VERIFY_IS_EQUAL((vec2[3]), 3);
+  VERIFY_IS_EQUAL((vec2[4]), 4);
+  VERIFY_IS_EQUAL((vec2[5]), 5);
+
+  VERIFY_IS_EQUAL((vec3[0]), 5);
+  VERIFY_IS_EQUAL((vec3[1]), 4);
+  VERIFY_IS_EQUAL((vec3[2]), 3);
+  VERIFY_IS_EQUAL((vec3[3]), 2);
+  VERIFY_IS_EQUAL((vec3[4]), 1);
+  VERIFY_IS_EQUAL((vec3[5]), 0);
+
+  VERIFY_IS_EQUAL((vec4[0]), 0);
+  VERIFY_IS_EQUAL((vec4[1]), 0);
+  VERIFY_IS_EQUAL((vec4[2]), 0);
+  VERIFY_IS_EQUAL((vec4[3]), 0);
+  VERIFY_IS_EQUAL((vec4[4]), 0);
+  VERIFY_IS_EQUAL((vec4[5]), 0);
+
+  Tensor<int, 1> vec5(vec1);
+
+  VERIFY_IS_EQUAL((vec5(0)), 4);
+  VERIFY_IS_EQUAL((vec5(1)), 8);
+  VERIFY_IS_EQUAL((vec5(2)), 15);
+  VERIFY_IS_EQUAL((vec5(3)), 16);
+  VERIFY_IS_EQUAL((vec5(4)), 23);
+  VERIFY_IS_EQUAL((vec5(5)), 42);
+
+  VERIFY_IS_EQUAL((vec5.data()[0]), 4);
+  VERIFY_IS_EQUAL((vec5.data()[1]), 8);
+  VERIFY_IS_EQUAL((vec5.data()[2]), 15);
+  VERIFY_IS_EQUAL((vec5.data()[3]), 16);
+  VERIFY_IS_EQUAL((vec5.data()[4]), 23);
+  VERIFY_IS_EQUAL((vec5.data()[5]), 42);
+}
+
+static void test_2d()
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  VERIFY_IS_EQUAL((mat1.rank()), 2);
+  VERIFY_IS_EQUAL((mat1.size()), 6);
+  VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3);
+
+  VERIFY_IS_EQUAL((mat2.rank()), 2);
+  VERIFY_IS_EQUAL((mat2.size()), 6);
+  VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3);
+
+  VERIFY_IS_EQUAL((mat1.data()[0]), 0);
+  VERIFY_IS_EQUAL((mat1.data()[1]), 3);
+  VERIFY_IS_EQUAL((mat1.data()[2]), 1);
+  VERIFY_IS_EQUAL((mat1.data()[3]), 4);
+  VERIFY_IS_EQUAL((mat1.data()[4]), 2);
+  VERIFY_IS_EQUAL((mat1.data()[5]), 5);
+
+  VERIFY_IS_EQUAL((mat2.data()[0]), 0);
+  VERIFY_IS_EQUAL((mat2.data()[1]), 1);
+  VERIFY_IS_EQUAL((mat2.data()[2]), 2);
+  VERIFY_IS_EQUAL((mat2.data()[3]), 3);
+  VERIFY_IS_EQUAL((mat2.data()[4]), 4);
+  VERIFY_IS_EQUAL((mat2.data()[5]), 5);
+}
+
+static void test_3d()
+{
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1;
+  epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1;
+
+  VERIFY_IS_EQUAL((epsilon.size()), 27);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[0]), 3);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[2]), 3);
+
+  VERIFY_IS_EQUAL((epsilon(0,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,0,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,0,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,1,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,2,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,2,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,0,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,2,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,2,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,0,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,1,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,2)), 0);
+
+  VERIFY_IS_EQUAL((epsilon(0,1,2)), 1);
+  VERIFY_IS_EQUAL((epsilon(2,0,1)), 1);
+  VERIFY_IS_EQUAL((epsilon(1,2,0)), 1);
+  VERIFY_IS_EQUAL((epsilon(2,1,0)), -1);
+  VERIFY_IS_EQUAL((epsilon(0,2,1)), -1);
+  VERIFY_IS_EQUAL((epsilon(1,0,2)), -1);
+
+  array<Eigen::DenseIndex, 3> dims;
+  dims[0] = 2;
+  dims[1] = 3;
+  dims[2] = 4;
+  Tensor<int, 3> t1(dims);
+  Tensor<int, 3, RowMajor> t2(dims);
+
+  VERIFY_IS_EQUAL((t1.size()), 24);
+  VERIFY_IS_EQUAL((t1.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((t1.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((t1.dimensions()[2]), 4);
+
+  VERIFY_IS_EQUAL((t2.size()), 24);
+  VERIFY_IS_EQUAL((t2.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((t2.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((t2.dimensions()[2]), 4);
+
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 4; k++) {
+        t1(i, j, k) = 100 * i + 10 * j + k;
+        t2(i, j, k) = 100 * i + 10 * j + k;
+      }
+    }
+  }
+
+  VERIFY_IS_EQUAL((t1.data()[0]),    0);
+  VERIFY_IS_EQUAL((t1.data()[1]),  100);
+  VERIFY_IS_EQUAL((t1.data()[2]),   10);
+  VERIFY_IS_EQUAL((t1.data()[3]),  110);
+  VERIFY_IS_EQUAL((t1.data()[4]),   20);
+  VERIFY_IS_EQUAL((t1.data()[5]),  120);
+  VERIFY_IS_EQUAL((t1.data()[6]),    1);
+  VERIFY_IS_EQUAL((t1.data()[7]),  101);
+  VERIFY_IS_EQUAL((t1.data()[8]),   11);
+  VERIFY_IS_EQUAL((t1.data()[9]),  111);
+  VERIFY_IS_EQUAL((t1.data()[10]),  21);
+  VERIFY_IS_EQUAL((t1.data()[11]), 121);
+  VERIFY_IS_EQUAL((t1.data()[12]),   2);
+  VERIFY_IS_EQUAL((t1.data()[13]), 102);
+  VERIFY_IS_EQUAL((t1.data()[14]),  12);
+  VERIFY_IS_EQUAL((t1.data()[15]), 112);
+  VERIFY_IS_EQUAL((t1.data()[16]),  22);
+  VERIFY_IS_EQUAL((t1.data()[17]), 122);
+  VERIFY_IS_EQUAL((t1.data()[18]),   3);
+  VERIFY_IS_EQUAL((t1.data()[19]), 103);
+  VERIFY_IS_EQUAL((t1.data()[20]),  13);
+  VERIFY_IS_EQUAL((t1.data()[21]), 113);
+  VERIFY_IS_EQUAL((t1.data()[22]),  23);
+  VERIFY_IS_EQUAL((t1.data()[23]), 123);
+
+  VERIFY_IS_EQUAL((t2.data()[0]),    0);
+  VERIFY_IS_EQUAL((t2.data()[1]),    1);
+  VERIFY_IS_EQUAL((t2.data()[2]),    2);
+  VERIFY_IS_EQUAL((t2.data()[3]),    3);
+  VERIFY_IS_EQUAL((t2.data()[4]),   10);
+  VERIFY_IS_EQUAL((t2.data()[5]),   11);
+  VERIFY_IS_EQUAL((t2.data()[6]),   12);
+  VERIFY_IS_EQUAL((t2.data()[7]),   13);
+  VERIFY_IS_EQUAL((t2.data()[8]),   20);
+  VERIFY_IS_EQUAL((t2.data()[9]),   21);
+  VERIFY_IS_EQUAL((t2.data()[10]),  22);
+  VERIFY_IS_EQUAL((t2.data()[11]),  23);
+  VERIFY_IS_EQUAL((t2.data()[12]), 100);
+  VERIFY_IS_EQUAL((t2.data()[13]), 101);
+  VERIFY_IS_EQUAL((t2.data()[14]), 102);
+  VERIFY_IS_EQUAL((t2.data()[15]), 103);
+  VERIFY_IS_EQUAL((t2.data()[16]), 110);
+  VERIFY_IS_EQUAL((t2.data()[17]), 111);
+  VERIFY_IS_EQUAL((t2.data()[18]), 112);
+  VERIFY_IS_EQUAL((t2.data()[19]), 113);
+  VERIFY_IS_EQUAL((t2.data()[20]), 120);
+  VERIFY_IS_EQUAL((t2.data()[21]), 121);
+  VERIFY_IS_EQUAL((t2.data()[22]), 122);
+  VERIFY_IS_EQUAL((t2.data()[23]), 123);
+}
+
+static void test_simple_assign()
+{
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1;
+  epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1;
+
+  Tensor<int, 3> e2(3,3,3);
+  e2.setZero();
+  VERIFY_IS_EQUAL((e2(1,2,0)), 0);
+
+  e2 = epsilon;
+  VERIFY_IS_EQUAL((e2(1,2,0)), 1);
+  VERIFY_IS_EQUAL((e2(0,1,2)), 1);
+  VERIFY_IS_EQUAL((e2(2,0,1)), 1);
+  VERIFY_IS_EQUAL((e2(2,1,0)), -1);
+  VERIFY_IS_EQUAL((e2(0,2,1)), -1);
+  VERIFY_IS_EQUAL((e2(1,0,2)), -1);
+}
+
+static void test_resize()
+{
+  Tensor<int, 3> epsilon;
+  epsilon.resize(2,3,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 2);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 2*3*7);
+
+  const int* old_data = epsilon.data();
+  epsilon.resize(3,2,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 2);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 2*3*7);
+  VERIFY_IS_EQUAL(epsilon.data(), old_data);
+
+  epsilon.resize(3,5,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 5);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_simple)
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_resize());
+}

diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
new file mode 100644
index 0000000..aefdfa9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding.cpp

@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_striding()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_stride;
+  no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  Tensor<float, 4, DataLayout> stride;
+  stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_striding_as_lvalue()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+  result.stride(strides) = tensor;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  array<ptrdiff_t, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+  Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+  result2.stride(strides) = tensor.stride(no_strides);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_striding)
+{
+  CALL_SUBTEST(test_simple_striding<ColMajor>());
+  CALL_SUBTEST(test_simple_striding<RowMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
new file mode 100644
index 0000000..d3b1fa7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp

@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};
+
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+  std::size_t tensor_bytes = tensor.size()  * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  //no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+//Tensor<float, 4, DataLayout> stride;
+//  stride = tensor.stride(strides);
+
+  gpu_stride.device(sycl_device)=gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+  for (IndexType i = 0; i < 1; ++i) {
+    for (IndexType j = 0; j < 1; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        for (IndexType l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};
+
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+  std::size_t tensor_bytes = tensor.size()  * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+  //Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+//  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+//  result.stride(strides) = tensor;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+//  Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+//  result2.stride(strides) = tensor.stride(no_strides);
+
+  gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+
+template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device=Eigen::SyclDevice(&queueInterface);
+  test_simple_striding<float, ColMajor, int64_t>(sycl_device);
+  test_simple_striding<float, RowMajor, int64_t>(sycl_device);
+  test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
+  test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorStridingPerDevice(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
new file mode 100644
index 0000000..2ca5c47
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_sugar.cpp

@@ -0,0 +1,81 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_comparison_sugar() {
+  // we already trust comparisons between tensors, we're simply checking that
+  // the sugared versions are doing the same thing
+  Tensor<int, 3> t(6, 7, 5);
+
+  t.setRandom();
+  // make sure we have at least one value == 0
+  t(0,0,0) = 0;
+
+  Tensor<bool,0> b;
+
+#define TEST_TENSOR_EQUAL(e1, e2) \
+  b = ((e1) == (e2)).all();       \
+  VERIFY(b())
+
+#define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0))
+
+  TEST_OP(==);
+  TEST_OP(!=);
+  TEST_OP(<=);
+  TEST_OP(>=);
+  TEST_OP(<);
+  TEST_OP(>);
+#undef TEST_OP
+#undef TEST_TENSOR_EQUAL
+}
+
+
+static void test_scalar_sugar_add_mul() {
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+
+  Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta);
+  Tensor<float, 3> S = A * alpha + B * beta + gamma;
+  Tensor<float, 3> T = gamma + alpha * A + beta * B;
+
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+    VERIFY_IS_APPROX(R(i), T(i));
+  }
+}
+
+static void test_scalar_sugar_sub_div() {
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+  const float delta = 0.32f;
+
+  Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha)
+      - B.constant(beta) / B - A.constant(delta);
+  Tensor<float, 3> S = gamma - A / alpha - beta / B - delta;
+
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_sugar)
+{
+  CALL_SUBTEST(test_comparison_sugar());
+  CALL_SUBTEST(test_scalar_sugar_add_mul());
+  CALL_SUBTEST(test_scalar_sugar_sub_div());
+}

diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
new file mode 100644
index 0000000..e6c5e23
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_sycl.cpp

@@ -0,0 +1,361 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 5;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
+
+  in1 = in1.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
+  gpu1.device(sycl_device) = gpu1 * 3.14f;
+  gpu2.device(sycl_device) = gpu2 * 2.7f;
+  sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < in1.size(); ++i) {
+  //  std::cout << "SYCL DATA : " << out1(i) << "  vs  CPU DATA : " << in1(i) * 3.14f << "\n";
+    VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
+    VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
+    VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
+  }
+
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
+  IndexType size = 20;
+  array<IndexType, 1> tensorRange = {{size}};
+  Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
+
+  in1 = in1.random();
+  in2 = in1;
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  in1.setZero();
+
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < in1.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), in2(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type in1(tensorRange);
+  tensor_type out(tensorRange);
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  in1 = in1.random();
+  // Copy all data to device, then permute on copy back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  // Permute copies to device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  DataType* gpu_data_out  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
+  // Copy all to device, permute copies on device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  sycl_device.deallocate(gpu_data_out);
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type cpu_out(tensorRange);
+  tensor_type out(tensorRange);
+
+  cpu_out.setZero();
+
+  std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
+  std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
+  sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < full_size; ++i) {
+    VERIFY_IS_APPROX(out(i), cpu_out(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
+
+  IndexType sizeDim1 = 100;
+  IndexType sizeDim2 = 10;
+  IndexType sizeDim3 = 20;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
+
+  in2 = in2.random();
+  in3 = in3.random();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+  DataType * gpu_in3_data  = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+
+  /// a=1.2f
+  gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
+  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
+      }
+    }
+  }
+  printf("a=1.2f Test passed\n");
+
+  /// a=b*1.2f
+  gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) * 1.2f);
+      }
+    }
+  }
+  printf("a=b*1.2f Test Passed\n");
+
+  /// c=a*b
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) *
+                             in2(i,j,k));
+      }
+    }
+  }
+  printf("c=a*b Test Passed\n");
+
+  /// c=a+b
+  gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) +
+                             in2(i,j,k));
+      }
+    }
+  }
+  printf("c=a+b Test Passed\n");
+
+  /// c=a*a
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) *
+                             in1(i,j,k));
+      }
+    }
+  }
+  printf("c= a*a Test Passed\n");
+
+  //a*3.14f + b*2.7f
+  gpu_out.device(sycl_device) =  gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
+  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) * 3.14f
+                       + in2(i,j,k) * 2.7f);
+      }
+    }
+  }
+  printf("a*3.14f + b*2.7f Test Passed\n");
+
+  ///d= (a>0.5? b:c)
+  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType));
+  gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
+                                                ? in2(i, j, k)
+                                                : in3(i, j, k));
+      }
+    }
+  }
+  printf("d= (a>0.5? b:c) Test Passed\n");
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_in3_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+template<typename Scalar1, typename Scalar2,  int DataLayout, typename IndexType>
+static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
+    IndexType size = 20;
+    array<IndexType, 1> tensorRange = {{size}};
+    Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
+    Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
+    Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
+
+    in = in.random();
+
+    Scalar1* gpu_in_data  = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
+    Scalar2 * gpu_out_data =  static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
+
+    TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
+    TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+    sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
+    gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>();
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
+    out_host = in. template cast<Scalar2>();
+    for(IndexType i=0; i< size; i++)
+    {
+      VERIFY_IS_APPROX(out(i), out_host(i));
+    }
+    printf("cast Test Passed\n");
+    sycl_device.deallocate(gpu_in_data);
+    sycl_device.deallocate(gpu_out_data);
+}
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
+  test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  }
+}

diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp
new file mode 100644
index 0000000..fed269a
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_symmetry.cpp

@@ -0,0 +1,818 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/TensorSymmetry>
+
+#include <map>
+#include <set>
+
+using Eigen::Tensor;
+using Eigen::SGroup;
+using Eigen::DynamicSGroup;
+using Eigen::StaticSGroup;
+using Eigen::Symmetry;
+using Eigen::AntiSymmetry;
+using Eigen::Hermiticity;
+using Eigen::AntiHermiticity;
+
+using Eigen::NegationFlag;
+using Eigen::ConjugationFlag;
+using Eigen::GlobalZeroFlag;
+using Eigen::GlobalRealFlag;
+using Eigen::GlobalImagFlag;
+
+// helper function to determine if the compiler intantiated a static
+// or dynamic symmetry group
+template<typename... Sym>
+bool isDynGroup(StaticSGroup<Sym...> const& dummy)
+{
+  (void)dummy;
+  return false;
+}
+
+bool isDynGroup(DynamicSGroup const& dummy)
+{
+  (void)dummy;
+  return true;
+}
+
+// helper class for checking that the symmetry groups are correct
+struct checkIdx {
+  template<typename ArrType>
+  static inline int doCheck_(ArrType e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    // use decimal representation of value
+    uint64_t value = e[0];
+    for (std::size_t i = 1; i < e.size(); i++)
+      value = value * 10 + e[i];
+
+    // we want to make sure that we find each element
+    auto it = expected.find(value);
+    VERIFY((it != expected.end()));
+    VERIFY_IS_EQUAL(it->second, flags);
+
+    // we want to make sure we only have each element once;
+    // set::insert returns true for the second part of the pair
+    // if the element was really inserted and not already there
+    auto p = found.insert(value);
+    VERIFY((p.second));
+
+    return dummy;
+  }
+
+  static inline int run(std::vector<int> e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    return doCheck_(e, flags, dummy, found, expected);
+  }
+
+  template<std::size_t N>
+  static inline int run(std::array<int, N> e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    return doCheck_(e, flags, dummy, found, expected);
+  }
+};
+
+static void test_symgroups_static()
+{
+  std::array<int, 7> identity{{0,1,2,3,4,5,6}};
+
+  // Simple static symmetry group
+  StaticSGroup<
+    AntiSymmetry<0,1>,
+    Hermiticity<0,2>
+  > group;
+
+  std::set<uint64_t> found;
+  std::map<uint64_t, int> expected;
+  expected[ 123456] = 0;
+  expected[1023456] = NegationFlag;
+  expected[2103456] = ConjugationFlag;
+  expected[1203456] = ConjugationFlag | NegationFlag;
+  expected[2013456] = ConjugationFlag | NegationFlag;
+  expected[ 213456] = ConjugationFlag;
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  group.apply<checkIdx, int>(identity, 0, found, expected);
+  VERIFY_IS_EQUAL(found.size(), 6u);
+}
+
+static void test_symgroups_dynamic()
+{
+  std::vector<int> identity;
+  for (int i = 0; i <= 6; i++)
+    identity.push_back(i);
+
+  // Simple dynamic symmetry group
+  DynamicSGroup group;
+  group.add(0,1,NegationFlag);
+  group.add(0,2,ConjugationFlag);
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+
+  std::set<uint64_t> found;
+  std::map<uint64_t, int> expected;
+  expected[ 123456] = 0;
+  expected[1023456] = NegationFlag;
+  expected[2103456] = ConjugationFlag;
+  expected[1203456] = ConjugationFlag | NegationFlag;
+  expected[2013456] = ConjugationFlag | NegationFlag;
+  expected[ 213456] = ConjugationFlag;
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  group.apply<checkIdx, int>(identity, 0, found, expected);
+  VERIFY_IS_EQUAL(found.size(), 6u);
+}
+
+static void test_symgroups_selection()
+{
+  std::array<int, 7> identity7{{0,1,2,3,4,5,6}};
+  std::array<int, 10> identity10{{0,1,2,3,4,5,6,7,8,9}};
+
+  {
+    // Do the same test as in test_symgroups_static but
+    // require selection via SGroup
+    SGroup<
+      AntiSymmetry<0,1>,
+      Hermiticity<0,2>
+    > group;
+
+    std::set<uint64_t> found;
+    std::map<uint64_t, int> expected;
+    expected[ 123456] = 0;
+    expected[1023456] = NegationFlag;
+    expected[2103456] = ConjugationFlag;
+    expected[1203456] = ConjugationFlag | NegationFlag;
+    expected[2013456] = ConjugationFlag | NegationFlag;
+    expected[ 213456] = ConjugationFlag;
+
+    VERIFY(!isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 6u);
+    VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+    group.apply<checkIdx, int>(identity7, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 6u);
+  }
+
+  {
+    // simple factorizing group: 5 generators, 2^5 = 32 elements
+    // selection should make this dynamic, although static group
+    // can still be reasonably generated
+    SGroup<
+      Symmetry<0,1>,
+      Symmetry<2,3>,
+      Symmetry<4,5>,
+      Symmetry<6,7>,
+      Symmetry<8,9>
+    > group;
+
+    std::set<uint64_t> found;
+    std::map<uint64_t, int> expected;
+    expected[ 123456789] = 0; expected[ 123456798] = 0; expected[ 123457689] = 0; expected[ 123457698] = 0;
+    expected[ 123546789] = 0; expected[ 123546798] = 0; expected[ 123547689] = 0; expected[ 123547698] = 0;
+    expected[ 132456789] = 0; expected[ 132456798] = 0; expected[ 132457689] = 0; expected[ 132457698] = 0;
+    expected[ 132546789] = 0; expected[ 132546798] = 0; expected[ 132547689] = 0; expected[ 132547698] = 0;
+    expected[1023456789] = 0; expected[1023456798] = 0; expected[1023457689] = 0; expected[1023457698] = 0;
+    expected[1023546789] = 0; expected[1023546798] = 0; expected[1023547689] = 0; expected[1023547698] = 0;
+    expected[1032456789] = 0; expected[1032456798] = 0; expected[1032457689] = 0; expected[1032457698] = 0;
+    expected[1032546789] = 0; expected[1032546798] = 0; expected[1032547689] = 0; expected[1032547698] = 0;
+
+    VERIFY(isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 32u);
+    VERIFY_IS_EQUAL(group.globalFlags(), 0);
+    group.apply<checkIdx, int>(identity10, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 32u);
+
+    // no verify that we could also generate a static group
+    // with these generators
+    found.clear();
+    StaticSGroup<
+      Symmetry<0,1>,
+      Symmetry<2,3>,
+      Symmetry<4,5>,
+      Symmetry<6,7>,
+      Symmetry<8,9>
+    > group_static;
+    VERIFY_IS_EQUAL(group_static.size(), 32u);
+    VERIFY_IS_EQUAL(group_static.globalFlags(), 0);
+    group_static.apply<checkIdx, int>(identity10, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 32u);
+  }
+
+  {
+    // try to create a HUGE group
+    SGroup<
+      Symmetry<0,1>,
+      Symmetry<1,2>,
+      Symmetry<2,3>,
+      Symmetry<3,4>,
+      Symmetry<4,5>,
+      Symmetry<5,6>
+    > group;
+
+    std::set<uint64_t> found;
+    uint64_t pre_expected[5040] = {
+       123456, 1023456,  213456, 2013456, 1203456, 2103456,  132456, 1032456,  312456, 3012456, 1302456, 3102456,
+       231456, 2031456,  321456, 3021456, 2301456, 3201456, 1230456, 2130456, 1320456, 3120456, 2310456, 3210456,
+       124356, 1024356,  214356, 2014356, 1204356, 2104356,  142356, 1042356,  412356, 4012356, 1402356, 4102356,
+       241356, 2041356,  421356, 4021356, 2401356, 4201356, 1240356, 2140356, 1420356, 4120356, 2410356, 4210356,
+       134256, 1034256,  314256, 3014256, 1304256, 3104256,  143256, 1043256,  413256, 4013256, 1403256, 4103256,
+       341256, 3041256,  431256, 4031256, 3401256, 4301256, 1340256, 3140256, 1430256, 4130256, 3410256, 4310256,
+       234156, 2034156,  324156, 3024156, 2304156, 3204156,  243156, 2043156,  423156, 4023156, 2403156, 4203156,
+       342156, 3042156,  432156, 4032156, 3402156, 4302156, 2340156, 3240156, 2430156, 4230156, 3420156, 4320156,
+      1234056, 2134056, 1324056, 3124056, 2314056, 3214056, 1243056, 2143056, 1423056, 4123056, 2413056, 4213056,
+      1342056, 3142056, 1432056, 4132056, 3412056, 4312056, 2341056, 3241056, 2431056, 4231056, 3421056, 4321056,
+       123546, 1023546,  213546, 2013546, 1203546, 2103546,  132546, 1032546,  312546, 3012546, 1302546, 3102546,
+       231546, 2031546,  321546, 3021546, 2301546, 3201546, 1230546, 2130546, 1320546, 3120546, 2310546, 3210546,
+       125346, 1025346,  215346, 2015346, 1205346, 2105346,  152346, 1052346,  512346, 5012346, 1502346, 5102346,
+       251346, 2051346,  521346, 5021346, 2501346, 5201346, 1250346, 2150346, 1520346, 5120346, 2510346, 5210346,
+       135246, 1035246,  315246, 3015246, 1305246, 3105246,  153246, 1053246,  513246, 5013246, 1503246, 5103246,
+       351246, 3051246,  531246, 5031246, 3501246, 5301246, 1350246, 3150246, 1530246, 5130246, 3510246, 5310246,
+       235146, 2035146,  325146, 3025146, 2305146, 3205146,  253146, 2053146,  523146, 5023146, 2503146, 5203146,
+       352146, 3052146,  532146, 5032146, 3502146, 5302146, 2350146, 3250146, 2530146, 5230146, 3520146, 5320146,
+      1235046, 2135046, 1325046, 3125046, 2315046, 3215046, 1253046, 2153046, 1523046, 5123046, 2513046, 5213046,
+      1352046, 3152046, 1532046, 5132046, 3512046, 5312046, 2351046, 3251046, 2531046, 5231046, 3521046, 5321046,
+       124536, 1024536,  214536, 2014536, 1204536, 2104536,  142536, 1042536,  412536, 4012536, 1402536, 4102536,
+       241536, 2041536,  421536, 4021536, 2401536, 4201536, 1240536, 2140536, 1420536, 4120536, 2410536, 4210536,
+       125436, 1025436,  215436, 2015436, 1205436, 2105436,  152436, 1052436,  512436, 5012436, 1502436, 5102436,
+       251436, 2051436,  521436, 5021436, 2501436, 5201436, 1250436, 2150436, 1520436, 5120436, 2510436, 5210436,
+       145236, 1045236,  415236, 4015236, 1405236, 4105236,  154236, 1054236,  514236, 5014236, 1504236, 5104236,
+       451236, 4051236,  541236, 5041236, 4501236, 5401236, 1450236, 4150236, 1540236, 5140236, 4510236, 5410236,
+       245136, 2045136,  425136, 4025136, 2405136, 4205136,  254136, 2054136,  524136, 5024136, 2504136, 5204136,
+       452136, 4052136,  542136, 5042136, 4502136, 5402136, 2450136, 4250136, 2540136, 5240136, 4520136, 5420136,
+      1245036, 2145036, 1425036, 4125036, 2415036, 4215036, 1254036, 2154036, 1524036, 5124036, 2514036, 5214036,
+      1452036, 4152036, 1542036, 5142036, 4512036, 5412036, 2451036, 4251036, 2541036, 5241036, 4521036, 5421036,
+       134526, 1034526,  314526, 3014526, 1304526, 3104526,  143526, 1043526,  413526, 4013526, 1403526, 4103526,
+       341526, 3041526,  431526, 4031526, 3401526, 4301526, 1340526, 3140526, 1430526, 4130526, 3410526, 4310526,
+       135426, 1035426,  315426, 3015426, 1305426, 3105426,  153426, 1053426,  513426, 5013426, 1503426, 5103426,
+       351426, 3051426,  531426, 5031426, 3501426, 5301426, 1350426, 3150426, 1530426, 5130426, 3510426, 5310426,
+       145326, 1045326,  415326, 4015326, 1405326, 4105326,  154326, 1054326,  514326, 5014326, 1504326, 5104326,
+       451326, 4051326,  541326, 5041326, 4501326, 5401326, 1450326, 4150326, 1540326, 5140326, 4510326, 5410326,
+       345126, 3045126,  435126, 4035126, 3405126, 4305126,  354126, 3054126,  534126, 5034126, 3504126, 5304126,
+       453126, 4053126,  543126, 5043126, 4503126, 5403126, 3450126, 4350126, 3540126, 5340126, 4530126, 5430126,
+      1345026, 3145026, 1435026, 4135026, 3415026, 4315026, 1354026, 3154026, 1534026, 5134026, 3514026, 5314026,
+      1453026, 4153026, 1543026, 5143026, 4513026, 5413026, 3451026, 4351026, 3541026, 5341026, 4531026, 5431026,
+       234516, 2034516,  324516, 3024516, 2304516, 3204516,  243516, 2043516,  423516, 4023516, 2403516, 4203516,
+       342516, 3042516,  432516, 4032516, 3402516, 4302516, 2340516, 3240516, 2430516, 4230516, 3420516, 4320516,
+       235416, 2035416,  325416, 3025416, 2305416, 3205416,  253416, 2053416,  523416, 5023416, 2503416, 5203416,
+       352416, 3052416,  532416, 5032416, 3502416, 5302416, 2350416, 3250416, 2530416, 5230416, 3520416, 5320416,
+       245316, 2045316,  425316, 4025316, 2405316, 4205316,  254316, 2054316,  524316, 5024316, 2504316, 5204316,
+       452316, 4052316,  542316, 5042316, 4502316, 5402316, 2450316, 4250316, 2540316, 5240316, 4520316, 5420316,
+       345216, 3045216,  435216, 4035216, 3405216, 4305216,  354216, 3054216,  534216, 5034216, 3504216, 5304216,
+       453216, 4053216,  543216, 5043216, 4503216, 5403216, 3450216, 4350216, 3540216, 5340216, 4530216, 5430216,
+      2345016, 3245016, 2435016, 4235016, 3425016, 4325016, 2354016, 3254016, 2534016, 5234016, 3524016, 5324016,
+      2453016, 4253016, 2543016, 5243016, 4523016, 5423016, 3452016, 4352016, 3542016, 5342016, 4532016, 5432016,
+      1234506, 2134506, 1324506, 3124506, 2314506, 3214506, 1243506, 2143506, 1423506, 4123506, 2413506, 4213506,
+      1342506, 3142506, 1432506, 4132506, 3412506, 4312506, 2341506, 3241506, 2431506, 4231506, 3421506, 4321506,
+      1235406, 2135406, 1325406, 3125406, 2315406, 3215406, 1253406, 2153406, 1523406, 5123406, 2513406, 5213406,
+      1352406, 3152406, 1532406, 5132406, 3512406, 5312406, 2351406, 3251406, 2531406, 5231406, 3521406, 5321406,
+      1245306, 2145306, 1425306, 4125306, 2415306, 4215306, 1254306, 2154306, 1524306, 5124306, 2514306, 5214306,
+      1452306, 4152306, 1542306, 5142306, 4512306, 5412306, 2451306, 4251306, 2541306, 5241306, 4521306, 5421306,
+      1345206, 3145206, 1435206, 4135206, 3415206, 4315206, 1354206, 3154206, 1534206, 5134206, 3514206, 5314206,
+      1453206, 4153206, 1543206, 5143206, 4513206, 5413206, 3451206, 4351206, 3541206, 5341206, 4531206, 5431206,
+      2345106, 3245106, 2435106, 4235106, 3425106, 4325106, 2354106, 3254106, 2534106, 5234106, 3524106, 5324106,
+      2453106, 4253106, 2543106, 5243106, 4523106, 5423106, 3452106, 4352106, 3542106, 5342106, 4532106, 5432106,
+       123465, 1023465,  213465, 2013465, 1203465, 2103465,  132465, 1032465,  312465, 3012465, 1302465, 3102465,
+       231465, 2031465,  321465, 3021465, 2301465, 3201465, 1230465, 2130465, 1320465, 3120465, 2310465, 3210465,
+       124365, 1024365,  214365, 2014365, 1204365, 2104365,  142365, 1042365,  412365, 4012365, 1402365, 4102365,
+       241365, 2041365,  421365, 4021365, 2401365, 4201365, 1240365, 2140365, 1420365, 4120365, 2410365, 4210365,
+       134265, 1034265,  314265, 3014265, 1304265, 3104265,  143265, 1043265,  413265, 4013265, 1403265, 4103265,
+       341265, 3041265,  431265, 4031265, 3401265, 4301265, 1340265, 3140265, 1430265, 4130265, 3410265, 4310265,
+       234165, 2034165,  324165, 3024165, 2304165, 3204165,  243165, 2043165,  423165, 4023165, 2403165, 4203165,
+       342165, 3042165,  432165, 4032165, 3402165, 4302165, 2340165, 3240165, 2430165, 4230165, 3420165, 4320165,
+      1234065, 2134065, 1324065, 3124065, 2314065, 3214065, 1243065, 2143065, 1423065, 4123065, 2413065, 4213065,
+      1342065, 3142065, 1432065, 4132065, 3412065, 4312065, 2341065, 3241065, 2431065, 4231065, 3421065, 4321065,
+       123645, 1023645,  213645, 2013645, 1203645, 2103645,  132645, 1032645,  312645, 3012645, 1302645, 3102645,
+       231645, 2031645,  321645, 3021645, 2301645, 3201645, 1230645, 2130645, 1320645, 3120645, 2310645, 3210645,
+       126345, 1026345,  216345, 2016345, 1206345, 2106345,  162345, 1062345,  612345, 6012345, 1602345, 6102345,
+       261345, 2061345,  621345, 6021345, 2601345, 6201345, 1260345, 2160345, 1620345, 6120345, 2610345, 6210345,
+       136245, 1036245,  316245, 3016245, 1306245, 3106245,  163245, 1063245,  613245, 6013245, 1603245, 6103245,
+       361245, 3061245,  631245, 6031245, 3601245, 6301245, 1360245, 3160245, 1630245, 6130245, 3610245, 6310245,
+       236145, 2036145,  326145, 3026145, 2306145, 3206145,  263145, 2063145,  623145, 6023145, 2603145, 6203145,
+       362145, 3062145,  632145, 6032145, 3602145, 6302145, 2360145, 3260145, 2630145, 6230145, 3620145, 6320145,
+      1236045, 2136045, 1326045, 3126045, 2316045, 3216045, 1263045, 2163045, 1623045, 6123045, 2613045, 6213045,
+      1362045, 3162045, 1632045, 6132045, 3612045, 6312045, 2361045, 3261045, 2631045, 6231045, 3621045, 6321045,
+       124635, 1024635,  214635, 2014635, 1204635, 2104635,  142635, 1042635,  412635, 4012635, 1402635, 4102635,
+       241635, 2041635,  421635, 4021635, 2401635, 4201635, 1240635, 2140635, 1420635, 4120635, 2410635, 4210635,
+       126435, 1026435,  216435, 2016435, 1206435, 2106435,  162435, 1062435,  612435, 6012435, 1602435, 6102435,
+       261435, 2061435,  621435, 6021435, 2601435, 6201435, 1260435, 2160435, 1620435, 6120435, 2610435, 6210435,
+       146235, 1046235,  416235, 4016235, 1406235, 4106235,  164235, 1064235,  614235, 6014235, 1604235, 6104235,
+       461235, 4061235,  641235, 6041235, 4601235, 6401235, 1460235, 4160235, 1640235, 6140235, 4610235, 6410235,
+       246135, 2046135,  426135, 4026135, 2406135, 4206135,  264135, 2064135,  624135, 6024135, 2604135, 6204135,
+       462135, 4062135,  642135, 6042135, 4602135, 6402135, 2460135, 4260135, 2640135, 6240135, 4620135, 6420135,
+      1246035, 2146035, 1426035, 4126035, 2416035, 4216035, 1264035, 2164035, 1624035, 6124035, 2614035, 6214035,
+      1462035, 4162035, 1642035, 6142035, 4612035, 6412035, 2461035, 4261035, 2641035, 6241035, 4621035, 6421035,
+       134625, 1034625,  314625, 3014625, 1304625, 3104625,  143625, 1043625,  413625, 4013625, 1403625, 4103625,
+       341625, 3041625,  431625, 4031625, 3401625, 4301625, 1340625, 3140625, 1430625, 4130625, 3410625, 4310625,
+       136425, 1036425,  316425, 3016425, 1306425, 3106425,  163425, 1063425,  613425, 6013425, 1603425, 6103425,
+       361425, 3061425,  631425, 6031425, 3601425, 6301425, 1360425, 3160425, 1630425, 6130425, 3610425, 6310425,
+       146325, 1046325,  416325, 4016325, 1406325, 4106325,  164325, 1064325,  614325, 6014325, 1604325, 6104325,
+       461325, 4061325,  641325, 6041325, 4601325, 6401325, 1460325, 4160325, 1640325, 6140325, 4610325, 6410325,
+       346125, 3046125,  436125, 4036125, 3406125, 4306125,  364125, 3064125,  634125, 6034125, 3604125, 6304125,
+       463125, 4063125,  643125, 6043125, 4603125, 6403125, 3460125, 4360125, 3640125, 6340125, 4630125, 6430125,
+      1346025, 3146025, 1436025, 4136025, 3416025, 4316025, 1364025, 3164025, 1634025, 6134025, 3614025, 6314025,
+      1463025, 4163025, 1643025, 6143025, 4613025, 6413025, 3461025, 4361025, 3641025, 6341025, 4631025, 6431025,
+       234615, 2034615,  324615, 3024615, 2304615, 3204615,  243615, 2043615,  423615, 4023615, 2403615, 4203615,
+       342615, 3042615,  432615, 4032615, 3402615, 4302615, 2340615, 3240615, 2430615, 4230615, 3420615, 4320615,
+       236415, 2036415,  326415, 3026415, 2306415, 3206415,  263415, 2063415,  623415, 6023415, 2603415, 6203415,
+       362415, 3062415,  632415, 6032415, 3602415, 6302415, 2360415, 3260415, 2630415, 6230415, 3620415, 6320415,
+       246315, 2046315,  426315, 4026315, 2406315, 4206315,  264315, 2064315,  624315, 6024315, 2604315, 6204315,
+       462315, 4062315,  642315, 6042315, 4602315, 6402315, 2460315, 4260315, 2640315, 6240315, 4620315, 6420315,
+       346215, 3046215,  436215, 4036215, 3406215, 4306215,  364215, 3064215,  634215, 6034215, 3604215, 6304215,
+       463215, 4063215,  643215, 6043215, 4603215, 6403215, 3460215, 4360215, 3640215, 6340215, 4630215, 6430215,
+      2346015, 3246015, 2436015, 4236015, 3426015, 4326015, 2364015, 3264015, 2634015, 6234015, 3624015, 6324015,
+      2463015, 4263015, 2643015, 6243015, 4623015, 6423015, 3462015, 4362015, 3642015, 6342015, 4632015, 6432015,
+      1234605, 2134605, 1324605, 3124605, 2314605, 3214605, 1243605, 2143605, 1423605, 4123605, 2413605, 4213605,
+      1342605, 3142605, 1432605, 4132605, 3412605, 4312605, 2341605, 3241605, 2431605, 4231605, 3421605, 4321605,
+      1236405, 2136405, 1326405, 3126405, 2316405, 3216405, 1263405, 2163405, 1623405, 6123405, 2613405, 6213405,
+      1362405, 3162405, 1632405, 6132405, 3612405, 6312405, 2361405, 3261405, 2631405, 6231405, 3621405, 6321405,
+      1246305, 2146305, 1426305, 4126305, 2416305, 4216305, 1264305, 2164305, 1624305, 6124305, 2614305, 6214305,
+      1462305, 4162305, 1642305, 6142305, 4612305, 6412305, 2461305, 4261305, 2641305, 6241305, 4621305, 6421305,
+      1346205, 3146205, 1436205, 4136205, 3416205, 4316205, 1364205, 3164205, 1634205, 6134205, 3614205, 6314205,
+      1463205, 4163205, 1643205, 6143205, 4613205, 6413205, 3461205, 4361205, 3641205, 6341205, 4631205, 6431205,
+      2346105, 3246105, 2436105, 4236105, 3426105, 4326105, 2364105, 3264105, 2634105, 6234105, 3624105, 6324105,
+      2463105, 4263105, 2643105, 6243105, 4623105, 6423105, 3462105, 4362105, 3642105, 6342105, 4632105, 6432105,
+       123564, 1023564,  213564, 2013564, 1203564, 2103564,  132564, 1032564,  312564, 3012564, 1302564, 3102564,
+       231564, 2031564,  321564, 3021564, 2301564, 3201564, 1230564, 2130564, 1320564, 3120564, 2310564, 3210564,
+       125364, 1025364,  215364, 2015364, 1205364, 2105364,  152364, 1052364,  512364, 5012364, 1502364, 5102364,
+       251364, 2051364,  521364, 5021364, 2501364, 5201364, 1250364, 2150364, 1520364, 5120364, 2510364, 5210364,
+       135264, 1035264,  315264, 3015264, 1305264, 3105264,  153264, 1053264,  513264, 5013264, 1503264, 5103264,
+       351264, 3051264,  531264, 5031264, 3501264, 5301264, 1350264, 3150264, 1530264, 5130264, 3510264, 5310264,
+       235164, 2035164,  325164, 3025164, 2305164, 3205164,  253164, 2053164,  523164, 5023164, 2503164, 5203164,
+       352164, 3052164,  532164, 5032164, 3502164, 5302164, 2350164, 3250164, 2530164, 5230164, 3520164, 5320164,
+      1235064, 2135064, 1325064, 3125064, 2315064, 3215064, 1253064, 2153064, 1523064, 5123064, 2513064, 5213064,
+      1352064, 3152064, 1532064, 5132064, 3512064, 5312064, 2351064, 3251064, 2531064, 5231064, 3521064, 5321064,
+       123654, 1023654,  213654, 2013654, 1203654, 2103654,  132654, 1032654,  312654, 3012654, 1302654, 3102654,
+       231654, 2031654,  321654, 3021654, 2301654, 3201654, 1230654, 2130654, 1320654, 3120654, 2310654, 3210654,
+       126354, 1026354,  216354, 2016354, 1206354, 2106354,  162354, 1062354,  612354, 6012354, 1602354, 6102354,
+       261354, 2061354,  621354, 6021354, 2601354, 6201354, 1260354, 2160354, 1620354, 6120354, 2610354, 6210354,
+       136254, 1036254,  316254, 3016254, 1306254, 3106254,  163254, 1063254,  613254, 6013254, 1603254, 6103254,
+       361254, 3061254,  631254, 6031254, 3601254, 6301254, 1360254, 3160254, 1630254, 6130254, 3610254, 6310254,
+       236154, 2036154,  326154, 3026154, 2306154, 3206154,  263154, 2063154,  623154, 6023154, 2603154, 6203154,
+       362154, 3062154,  632154, 6032154, 3602154, 6302154, 2360154, 3260154, 2630154, 6230154, 3620154, 6320154,
+      1236054, 2136054, 1326054, 3126054, 2316054, 3216054, 1263054, 2163054, 1623054, 6123054, 2613054, 6213054,
+      1362054, 3162054, 1632054, 6132054, 3612054, 6312054, 2361054, 3261054, 2631054, 6231054, 3621054, 6321054,
+       125634, 1025634,  215634, 2015634, 1205634, 2105634,  152634, 1052634,  512634, 5012634, 1502634, 5102634,
+       251634, 2051634,  521634, 5021634, 2501634, 5201634, 1250634, 2150634, 1520634, 5120634, 2510634, 5210634,
+       126534, 1026534,  216534, 2016534, 1206534, 2106534,  162534, 1062534,  612534, 6012534, 1602534, 6102534,
+       261534, 2061534,  621534, 6021534, 2601534, 6201534, 1260534, 2160534, 1620534, 6120534, 2610534, 6210534,
+       156234, 1056234,  516234, 5016234, 1506234, 5106234,  165234, 1065234,  615234, 6015234, 1605234, 6105234,
+       561234, 5061234,  651234, 6051234, 5601234, 6501234, 1560234, 5160234, 1650234, 6150234, 5610234, 6510234,
+       256134, 2056134,  526134, 5026134, 2506134, 5206134,  265134, 2065134,  625134, 6025134, 2605134, 6205134,
+       562134, 5062134,  652134, 6052134, 5602134, 6502134, 2560134, 5260134, 2650134, 6250134, 5620134, 6520134,
+      1256034, 2156034, 1526034, 5126034, 2516034, 5216034, 1265034, 2165034, 1625034, 6125034, 2615034, 6215034,
+      1562034, 5162034, 1652034, 6152034, 5612034, 6512034, 2561034, 5261034, 2651034, 6251034, 5621034, 6521034,
+       135624, 1035624,  315624, 3015624, 1305624, 3105624,  153624, 1053624,  513624, 5013624, 1503624, 5103624,
+       351624, 3051624,  531624, 5031624, 3501624, 5301624, 1350624, 3150624, 1530624, 5130624, 3510624, 5310624,
+       136524, 1036524,  316524, 3016524, 1306524, 3106524,  163524, 1063524,  613524, 6013524, 1603524, 6103524,
+       361524, 3061524,  631524, 6031524, 3601524, 6301524, 1360524, 3160524, 1630524, 6130524, 3610524, 6310524,
+       156324, 1056324,  516324, 5016324, 1506324, 5106324,  165324, 1065324,  615324, 6015324, 1605324, 6105324,
+       561324, 5061324,  651324, 6051324, 5601324, 6501324, 1560324, 5160324, 1650324, 6150324, 5610324, 6510324,
+       356124, 3056124,  536124, 5036124, 3506124, 5306124,  365124, 3065124,  635124, 6035124, 3605124, 6305124,
+       563124, 5063124,  653124, 6053124, 5603124, 6503124, 3560124, 5360124, 3650124, 6350124, 5630124, 6530124,
+      1356024, 3156024, 1536024, 5136024, 3516024, 5316024, 1365024, 3165024, 1635024, 6135024, 3615024, 6315024,
+      1563024, 5163024, 1653024, 6153024, 5613024, 6513024, 3561024, 5361024, 3651024, 6351024, 5631024, 6531024,
+       235614, 2035614,  325614, 3025614, 2305614, 3205614,  253614, 2053614,  523614, 5023614, 2503614, 5203614,
+       352614, 3052614,  532614, 5032614, 3502614, 5302614, 2350614, 3250614, 2530614, 5230614, 3520614, 5320614,
+       236514, 2036514,  326514, 3026514, 2306514, 3206514,  263514, 2063514,  623514, 6023514, 2603514, 6203514,
+       362514, 3062514,  632514, 6032514, 3602514, 6302514, 2360514, 3260514, 2630514, 6230514, 3620514, 6320514,
+       256314, 2056314,  526314, 5026314, 2506314, 5206314,  265314, 2065314,  625314, 6025314, 2605314, 6205314,
+       562314, 5062314,  652314, 6052314, 5602314, 6502314, 2560314, 5260314, 2650314, 6250314, 5620314, 6520314,
+       356214, 3056214,  536214, 5036214, 3506214, 5306214,  365214, 3065214,  635214, 6035214, 3605214, 6305214,
+       563214, 5063214,  653214, 6053214, 5603214, 6503214, 3560214, 5360214, 3650214, 6350214, 5630214, 6530214,
+      2356014, 3256014, 2536014, 5236014, 3526014, 5326014, 2365014, 3265014, 2635014, 6235014, 3625014, 6325014,
+      2563014, 5263014, 2653014, 6253014, 5623014, 6523014, 3562014, 5362014, 3652014, 6352014, 5632014, 6532014,
+      1235604, 2135604, 1325604, 3125604, 2315604, 3215604, 1253604, 2153604, 1523604, 5123604, 2513604, 5213604,
+      1352604, 3152604, 1532604, 5132604, 3512604, 5312604, 2351604, 3251604, 2531604, 5231604, 3521604, 5321604,
+      1236504, 2136504, 1326504, 3126504, 2316504, 3216504, 1263504, 2163504, 1623504, 6123504, 2613504, 6213504,
+      1362504, 3162504, 1632504, 6132504, 3612504, 6312504, 2361504, 3261504, 2631504, 6231504, 3621504, 6321504,
+      1256304, 2156304, 1526304, 5126304, 2516304, 5216304, 1265304, 2165304, 1625304, 6125304, 2615304, 6215304,
+      1562304, 5162304, 1652304, 6152304, 5612304, 6512304, 2561304, 5261304, 2651304, 6251304, 5621304, 6521304,
+      1356204, 3156204, 1536204, 5136204, 3516204, 5316204, 1365204, 3165204, 1635204, 6135204, 3615204, 6315204,
+      1563204, 5163204, 1653204, 6153204, 5613204, 6513204, 3561204, 5361204, 3651204, 6351204, 5631204, 6531204,
+      2356104, 3256104, 2536104, 5236104, 3526104, 5326104, 2365104, 3265104, 2635104, 6235104, 3625104, 6325104,
+      2563104, 5263104, 2653104, 6253104, 5623104, 6523104, 3562104, 5362104, 3652104, 6352104, 5632104, 6532104,
+       124563, 1024563,  214563, 2014563, 1204563, 2104563,  142563, 1042563,  412563, 4012563, 1402563, 4102563,
+       241563, 2041563,  421563, 4021563, 2401563, 4201563, 1240563, 2140563, 1420563, 4120563, 2410563, 4210563,
+       125463, 1025463,  215463, 2015463, 1205463, 2105463,  152463, 1052463,  512463, 5012463, 1502463, 5102463,
+       251463, 2051463,  521463, 5021463, 2501463, 5201463, 1250463, 2150463, 1520463, 5120463, 2510463, 5210463,
+       145263, 1045263,  415263, 4015263, 1405263, 4105263,  154263, 1054263,  514263, 5014263, 1504263, 5104263,
+       451263, 4051263,  541263, 5041263, 4501263, 5401263, 1450263, 4150263, 1540263, 5140263, 4510263, 5410263,
+       245163, 2045163,  425163, 4025163, 2405163, 4205163,  254163, 2054163,  524163, 5024163, 2504163, 5204163,
+       452163, 4052163,  542163, 5042163, 4502163, 5402163, 2450163, 4250163, 2540163, 5240163, 4520163, 5420163,
+      1245063, 2145063, 1425063, 4125063, 2415063, 4215063, 1254063, 2154063, 1524063, 5124063, 2514063, 5214063,
+      1452063, 4152063, 1542063, 5142063, 4512063, 5412063, 2451063, 4251063, 2541063, 5241063, 4521063, 5421063,
+       124653, 1024653,  214653, 2014653, 1204653, 2104653,  142653, 1042653,  412653, 4012653, 1402653, 4102653,
+       241653, 2041653,  421653, 4021653, 2401653, 4201653, 1240653, 2140653, 1420653, 4120653, 2410653, 4210653,
+       126453, 1026453,  216453, 2016453, 1206453, 2106453,  162453, 1062453,  612453, 6012453, 1602453, 6102453,
+       261453, 2061453,  621453, 6021453, 2601453, 6201453, 1260453, 2160453, 1620453, 6120453, 2610453, 6210453,
+       146253, 1046253,  416253, 4016253, 1406253, 4106253,  164253, 1064253,  614253, 6014253, 1604253, 6104253,
+       461253, 4061253,  641253, 6041253, 4601253, 6401253, 1460253, 4160253, 1640253, 6140253, 4610253, 6410253,
+       246153, 2046153,  426153, 4026153, 2406153, 4206153,  264153, 2064153,  624153, 6024153, 2604153, 6204153,
+       462153, 4062153,  642153, 6042153, 4602153, 6402153, 2460153, 4260153, 2640153, 6240153, 4620153, 6420153,
+      1246053, 2146053, 1426053, 4126053, 2416053, 4216053, 1264053, 2164053, 1624053, 6124053, 2614053, 6214053,
+      1462053, 4162053, 1642053, 6142053, 4612053, 6412053, 2461053, 4261053, 2641053, 6241053, 4621053, 6421053,
+       125643, 1025643,  215643, 2015643, 1205643, 2105643,  152643, 1052643,  512643, 5012643, 1502643, 5102643,
+       251643, 2051643,  521643, 5021643, 2501643, 5201643, 1250643, 2150643, 1520643, 5120643, 2510643, 5210643,
+       126543, 1026543,  216543, 2016543, 1206543, 2106543,  162543, 1062543,  612543, 6012543, 1602543, 6102543,
+       261543, 2061543,  621543, 6021543, 2601543, 6201543, 1260543, 2160543, 1620543, 6120543, 2610543, 6210543,
+       156243, 1056243,  516243, 5016243, 1506243, 5106243,  165243, 1065243,  615243, 6015243, 1605243, 6105243,
+       561243, 5061243,  651243, 6051243, 5601243, 6501243, 1560243, 5160243, 1650243, 6150243, 5610243, 6510243,
+       256143, 2056143,  526143, 5026143, 2506143, 5206143,  265143, 2065143,  625143, 6025143, 2605143, 6205143,
+       562143, 5062143,  652143, 6052143, 5602143, 6502143, 2560143, 5260143, 2650143, 6250143, 5620143, 6520143,
+      1256043, 2156043, 1526043, 5126043, 2516043, 5216043, 1265043, 2165043, 1625043, 6125043, 2615043, 6215043,
+      1562043, 5162043, 1652043, 6152043, 5612043, 6512043, 2561043, 5261043, 2651043, 6251043, 5621043, 6521043,
+       145623, 1045623,  415623, 4015623, 1405623, 4105623,  154623, 1054623,  514623, 5014623, 1504623, 5104623,
+       451623, 4051623,  541623, 5041623, 4501623, 5401623, 1450623, 4150623, 1540623, 5140623, 4510623, 5410623,
+       146523, 1046523,  416523, 4016523, 1406523, 4106523,  164523, 1064523,  614523, 6014523, 1604523, 6104523,
+       461523, 4061523,  641523, 6041523, 4601523, 6401523, 1460523, 4160523, 1640523, 6140523, 4610523, 6410523,
+       156423, 1056423,  516423, 5016423, 1506423, 5106423,  165423, 1065423,  615423, 6015423, 1605423, 6105423,
+       561423, 5061423,  651423, 6051423, 5601423, 6501423, 1560423, 5160423, 1650423, 6150423, 5610423, 6510423,
+       456123, 4056123,  546123, 5046123, 4506123, 5406123,  465123, 4065123,  645123, 6045123, 4605123, 6405123,
+       564123, 5064123,  654123, 6054123, 5604123, 6504123, 4560123, 5460123, 4650123, 6450123, 5640123, 6540123,
+      1456023, 4156023, 1546023, 5146023, 4516023, 5416023, 1465023, 4165023, 1645023, 6145023, 4615023, 6415023,
+      1564023, 5164023, 1654023, 6154023, 5614023, 6514023, 4561023, 5461023, 4651023, 6451023, 5641023, 6541023,
+       245613, 2045613,  425613, 4025613, 2405613, 4205613,  254613, 2054613,  524613, 5024613, 2504613, 5204613,
+       452613, 4052613,  542613, 5042613, 4502613, 5402613, 2450613, 4250613, 2540613, 5240613, 4520613, 5420613,
+       246513, 2046513,  426513, 4026513, 2406513, 4206513,  264513, 2064513,  624513, 6024513, 2604513, 6204513,
+       462513, 4062513,  642513, 6042513, 4602513, 6402513, 2460513, 4260513, 2640513, 6240513, 4620513, 6420513,
+       256413, 2056413,  526413, 5026413, 2506413, 5206413,  265413, 2065413,  625413, 6025413, 2605413, 6205413,
+       562413, 5062413,  652413, 6052413, 5602413, 6502413, 2560413, 5260413, 2650413, 6250413, 5620413, 6520413,
+       456213, 4056213,  546213, 5046213, 4506213, 5406213,  465213, 4065213,  645213, 6045213, 4605213, 6405213,
+       564213, 5064213,  654213, 6054213, 5604213, 6504213, 4560213, 5460213, 4650213, 6450213, 5640213, 6540213,
+      2456013, 4256013, 2546013, 5246013, 4526013, 5426013, 2465013, 4265013, 2645013, 6245013, 4625013, 6425013,
+      2564013, 5264013, 2654013, 6254013, 5624013, 6524013, 4562013, 5462013, 4652013, 6452013, 5642013, 6542013,
+      1245603, 2145603, 1425603, 4125603, 2415603, 4215603, 1254603, 2154603, 1524603, 5124603, 2514603, 5214603,
+      1452603, 4152603, 1542603, 5142603, 4512603, 5412603, 2451603, 4251603, 2541603, 5241603, 4521603, 5421603,
+      1246503, 2146503, 1426503, 4126503, 2416503, 4216503, 1264503, 2164503, 1624503, 6124503, 2614503, 6214503,
+      1462503, 4162503, 1642503, 6142503, 4612503, 6412503, 2461503, 4261503, 2641503, 6241503, 4621503, 6421503,
+      1256403, 2156403, 1526403, 5126403, 2516403, 5216403, 1265403, 2165403, 1625403, 6125403, 2615403, 6215403,
+      1562403, 5162403, 1652403, 6152403, 5612403, 6512403, 2561403, 5261403, 2651403, 6251403, 5621403, 6521403,
+      1456203, 4156203, 1546203, 5146203, 4516203, 5416203, 1465203, 4165203, 1645203, 6145203, 4615203, 6415203,
+      1564203, 5164203, 1654203, 6154203, 5614203, 6514203, 4561203, 5461203, 4651203, 6451203, 5641203, 6541203,
+      2456103, 4256103, 2546103, 5246103, 4526103, 5426103, 2465103, 4265103, 2645103, 6245103, 4625103, 6425103,
+      2564103, 5264103, 2654103, 6254103, 5624103, 6524103, 4562103, 5462103, 4652103, 6452103, 5642103, 6542103,
+       134562, 1034562,  314562, 3014562, 1304562, 3104562,  143562, 1043562,  413562, 4013562, 1403562, 4103562,
+       341562, 3041562,  431562, 4031562, 3401562, 4301562, 1340562, 3140562, 1430562, 4130562, 3410562, 4310562,
+       135462, 1035462,  315462, 3015462, 1305462, 3105462,  153462, 1053462,  513462, 5013462, 1503462, 5103462,
+       351462, 3051462,  531462, 5031462, 3501462, 5301462, 1350462, 3150462, 1530462, 5130462, 3510462, 5310462,
+       145362, 1045362,  415362, 4015362, 1405362, 4105362,  154362, 1054362,  514362, 5014362, 1504362, 5104362,
+       451362, 4051362,  541362, 5041362, 4501362, 5401362, 1450362, 4150362, 1540362, 5140362, 4510362, 5410362,
+       345162, 3045162,  435162, 4035162, 3405162, 4305162,  354162, 3054162,  534162, 5034162, 3504162, 5304162,
+       453162, 4053162,  543162, 5043162, 4503162, 5403162, 3450162, 4350162, 3540162, 5340162, 4530162, 5430162,
+      1345062, 3145062, 1435062, 4135062, 3415062, 4315062, 1354062, 3154062, 1534062, 5134062, 3514062, 5314062,
+      1453062, 4153062, 1543062, 5143062, 4513062, 5413062, 3451062, 4351062, 3541062, 5341062, 4531062, 5431062,
+       134652, 1034652,  314652, 3014652, 1304652, 3104652,  143652, 1043652,  413652, 4013652, 1403652, 4103652,
+       341652, 3041652,  431652, 4031652, 3401652, 4301652, 1340652, 3140652, 1430652, 4130652, 3410652, 4310652,
+       136452, 1036452,  316452, 3016452, 1306452, 3106452,  163452, 1063452,  613452, 6013452, 1603452, 6103452,
+       361452, 3061452,  631452, 6031452, 3601452, 6301452, 1360452, 3160452, 1630452, 6130452, 3610452, 6310452,
+       146352, 1046352,  416352, 4016352, 1406352, 4106352,  164352, 1064352,  614352, 6014352, 1604352, 6104352,
+       461352, 4061352,  641352, 6041352, 4601352, 6401352, 1460352, 4160352, 1640352, 6140352, 4610352, 6410352,
+       346152, 3046152,  436152, 4036152, 3406152, 4306152,  364152, 3064152,  634152, 6034152, 3604152, 6304152,
+       463152, 4063152,  643152, 6043152, 4603152, 6403152, 3460152, 4360152, 3640152, 6340152, 4630152, 6430152,
+      1346052, 3146052, 1436052, 4136052, 3416052, 4316052, 1364052, 3164052, 1634052, 6134052, 3614052, 6314052,
+      1463052, 4163052, 1643052, 6143052, 4613052, 6413052, 3461052, 4361052, 3641052, 6341052, 4631052, 6431052,
+       135642, 1035642,  315642, 3015642, 1305642, 3105642,  153642, 1053642,  513642, 5013642, 1503642, 5103642,
+       351642, 3051642,  531642, 5031642, 3501642, 5301642, 1350642, 3150642, 1530642, 5130642, 3510642, 5310642,
+       136542, 1036542,  316542, 3016542, 1306542, 3106542,  163542, 1063542,  613542, 6013542, 1603542, 6103542,
+       361542, 3061542,  631542, 6031542, 3601542, 6301542, 1360542, 3160542, 1630542, 6130542, 3610542, 6310542,
+       156342, 1056342,  516342, 5016342, 1506342, 5106342,  165342, 1065342,  615342, 6015342, 1605342, 6105342,
+       561342, 5061342,  651342, 6051342, 5601342, 6501342, 1560342, 5160342, 1650342, 6150342, 5610342, 6510342,
+       356142, 3056142,  536142, 5036142, 3506142, 5306142,  365142, 3065142,  635142, 6035142, 3605142, 6305142,
+       563142, 5063142,  653142, 6053142, 5603142, 6503142, 3560142, 5360142, 3650142, 6350142, 5630142, 6530142,
+      1356042, 3156042, 1536042, 5136042, 3516042, 5316042, 1365042, 3165042, 1635042, 6135042, 3615042, 6315042,
+      1563042, 5163042, 1653042, 6153042, 5613042, 6513042, 3561042, 5361042, 3651042, 6351042, 5631042, 6531042,
+       145632, 1045632,  415632, 4015632, 1405632, 4105632,  154632, 1054632,  514632, 5014632, 1504632, 5104632,
+       451632, 4051632,  541632, 5041632, 4501632, 5401632, 1450632, 4150632, 1540632, 5140632, 4510632, 5410632,
+       146532, 1046532,  416532, 4016532, 1406532, 4106532,  164532, 1064532,  614532, 6014532, 1604532, 6104532,
+       461532, 4061532,  641532, 6041532, 4601532, 6401532, 1460532, 4160532, 1640532, 6140532, 4610532, 6410532,
+       156432, 1056432,  516432, 5016432, 1506432, 5106432,  165432, 1065432,  615432, 6015432, 1605432, 6105432,
+       561432, 5061432,  651432, 6051432, 5601432, 6501432, 1560432, 5160432, 1650432, 6150432, 5610432, 6510432,
+       456132, 4056132,  546132, 5046132, 4506132, 5406132,  465132, 4065132,  645132, 6045132, 4605132, 6405132,
+       564132, 5064132,  654132, 6054132, 5604132, 6504132, 4560132, 5460132, 4650132, 6450132, 5640132, 6540132,
+      1456032, 4156032, 1546032, 5146032, 4516032, 5416032, 1465032, 4165032, 1645032, 6145032, 4615032, 6415032,
+      1564032, 5164032, 1654032, 6154032, 5614032, 6514032, 4561032, 5461032, 4651032, 6451032, 5641032, 6541032,
+       345612, 3045612,  435612, 4035612, 3405612, 4305612,  354612, 3054612,  534612, 5034612, 3504612, 5304612,
+       453612, 4053612,  543612, 5043612, 4503612, 5403612, 3450612, 4350612, 3540612, 5340612, 4530612, 5430612,
+       346512, 3046512,  436512, 4036512, 3406512, 4306512,  364512, 3064512,  634512, 6034512, 3604512, 6304512,
+       463512, 4063512,  643512, 6043512, 4603512, 6403512, 3460512, 4360512, 3640512, 6340512, 4630512, 6430512,
+       356412, 3056412,  536412, 5036412, 3506412, 5306412,  365412, 3065412,  635412, 6035412, 3605412, 6305412,
+       563412, 5063412,  653412, 6053412, 5603412, 6503412, 3560412, 5360412, 3650412, 6350412, 5630412, 6530412,
+       456312, 4056312,  546312, 5046312, 4506312, 5406312,  465312, 4065312,  645312, 6045312, 4605312, 6405312,
+       564312, 5064312,  654312, 6054312, 5604312, 6504312, 4560312, 5460312, 4650312, 6450312, 5640312, 6540312,
+      3456012, 4356012, 3546012, 5346012, 4536012, 5436012, 3465012, 4365012, 3645012, 6345012, 4635012, 6435012,
+      3564012, 5364012, 3654012, 6354012, 5634012, 6534012, 4563012, 5463012, 4653012, 6453012, 5643012, 6543012,
+      1345602, 3145602, 1435602, 4135602, 3415602, 4315602, 1354602, 3154602, 1534602, 5134602, 3514602, 5314602,
+      1453602, 4153602, 1543602, 5143602, 4513602, 5413602, 3451602, 4351602, 3541602, 5341602, 4531602, 5431602,
+      1346502, 3146502, 1436502, 4136502, 3416502, 4316502, 1364502, 3164502, 1634502, 6134502, 3614502, 6314502,
+      1463502, 4163502, 1643502, 6143502, 4613502, 6413502, 3461502, 4361502, 3641502, 6341502, 4631502, 6431502,
+      1356402, 3156402, 1536402, 5136402, 3516402, 5316402, 1365402, 3165402, 1635402, 6135402, 3615402, 6315402,
+      1563402, 5163402, 1653402, 6153402, 5613402, 6513402, 3561402, 5361402, 3651402, 6351402, 5631402, 6531402,
+      1456302, 4156302, 1546302, 5146302, 4516302, 5416302, 1465302, 4165302, 1645302, 6145302, 4615302, 6415302,
+      1564302, 5164302, 1654302, 6154302, 5614302, 6514302, 4561302, 5461302, 4651302, 6451302, 5641302, 6541302,
+      3456102, 4356102, 3546102, 5346102, 4536102, 5436102, 3465102, 4365102, 3645102, 6345102, 4635102, 6435102,
+      3564102, 5364102, 3654102, 6354102, 5634102, 6534102, 4563102, 5463102, 4653102, 6453102, 5643102, 6543102,
+       234561, 2034561,  324561, 3024561, 2304561, 3204561,  243561, 2043561,  423561, 4023561, 2403561, 4203561,
+       342561, 3042561,  432561, 4032561, 3402561, 4302561, 2340561, 3240561, 2430561, 4230561, 3420561, 4320561,
+       235461, 2035461,  325461, 3025461, 2305461, 3205461,  253461, 2053461,  523461, 5023461, 2503461, 5203461,
+       352461, 3052461,  532461, 5032461, 3502461, 5302461, 2350461, 3250461, 2530461, 5230461, 3520461, 5320461,
+       245361, 2045361,  425361, 4025361, 2405361, 4205361,  254361, 2054361,  524361, 5024361, 2504361, 5204361,
+       452361, 4052361,  542361, 5042361, 4502361, 5402361, 2450361, 4250361, 2540361, 5240361, 4520361, 5420361,
+       345261, 3045261,  435261, 4035261, 3405261, 4305261,  354261, 3054261,  534261, 5034261, 3504261, 5304261,
+       453261, 4053261,  543261, 5043261, 4503261, 5403261, 3450261, 4350261, 3540261, 5340261, 4530261, 5430261,
+      2345061, 3245061, 2435061, 4235061, 3425061, 4325061, 2354061, 3254061, 2534061, 5234061, 3524061, 5324061,
+      2453061, 4253061, 2543061, 5243061, 4523061, 5423061, 3452061, 4352061, 3542061, 5342061, 4532061, 5432061,
+       234651, 2034651,  324651, 3024651, 2304651, 3204651,  243651, 2043651,  423651, 4023651, 2403651, 4203651,
+       342651, 3042651,  432651, 4032651, 3402651, 4302651, 2340651, 3240651, 2430651, 4230651, 3420651, 4320651,
+       236451, 2036451,  326451, 3026451, 2306451, 3206451,  263451, 2063451,  623451, 6023451, 2603451, 6203451,
+       362451, 3062451,  632451, 6032451, 3602451, 6302451, 2360451, 3260451, 2630451, 6230451, 3620451, 6320451,
+       246351, 2046351,  426351, 4026351, 2406351, 4206351,  264351, 2064351,  624351, 6024351, 2604351, 6204351,
+       462351, 4062351,  642351, 6042351, 4602351, 6402351, 2460351, 4260351, 2640351, 6240351, 4620351, 6420351,
+       346251, 3046251,  436251, 4036251, 3406251, 4306251,  364251, 3064251,  634251, 6034251, 3604251, 6304251,
+       463251, 4063251,  643251, 6043251, 4603251, 6403251, 3460251, 4360251, 3640251, 6340251, 4630251, 6430251,
+      2346051, 3246051, 2436051, 4236051, 3426051, 4326051, 2364051, 3264051, 2634051, 6234051, 3624051, 6324051,
+      2463051, 4263051, 2643051, 6243051, 4623051, 6423051, 3462051, 4362051, 3642051, 6342051, 4632051, 6432051,
+       235641, 2035641,  325641, 3025641, 2305641, 3205641,  253641, 2053641,  523641, 5023641, 2503641, 5203641,
+       352641, 3052641,  532641, 5032641, 3502641, 5302641, 2350641, 3250641, 2530641, 5230641, 3520641, 5320641,
+       236541, 2036541,  326541, 3026541, 2306541, 3206541,  263541, 2063541,  623541, 6023541, 2603541, 6203541,
+       362541, 3062541,  632541, 6032541, 3602541, 6302541, 2360541, 3260541, 2630541, 6230541, 3620541, 6320541,
+       256341, 2056341,  526341, 5026341, 2506341, 5206341,  265341, 2065341,  625341, 6025341, 2605341, 6205341,
+       562341, 5062341,  652341, 6052341, 5602341, 6502341, 2560341, 5260341, 2650341, 6250341, 5620341, 6520341,
+       356241, 3056241,  536241, 5036241, 3506241, 5306241,  365241, 3065241,  635241, 6035241, 3605241, 6305241,
+       563241, 5063241,  653241, 6053241, 5603241, 6503241, 3560241, 5360241, 3650241, 6350241, 5630241, 6530241,
+      2356041, 3256041, 2536041, 5236041, 3526041, 5326041, 2365041, 3265041, 2635041, 6235041, 3625041, 6325041,
+      2563041, 5263041, 2653041, 6253041, 5623041, 6523041, 3562041, 5362041, 3652041, 6352041, 5632041, 6532041,
+       245631, 2045631,  425631, 4025631, 2405631, 4205631,  254631, 2054631,  524631, 5024631, 2504631, 5204631,
+       452631, 4052631,  542631, 5042631, 4502631, 5402631, 2450631, 4250631, 2540631, 5240631, 4520631, 5420631,
+       246531, 2046531,  426531, 4026531, 2406531, 4206531,  264531, 2064531,  624531, 6024531, 2604531, 6204531,
+       462531, 4062531,  642531, 6042531, 4602531, 6402531, 2460531, 4260531, 2640531, 6240531, 4620531, 6420531,
+       256431, 2056431,  526431, 5026431, 2506431, 5206431,  265431, 2065431,  625431, 6025431, 2605431, 6205431,
+       562431, 5062431,  652431, 6052431, 5602431, 6502431, 2560431, 5260431, 2650431, 6250431, 5620431, 6520431,
+       456231, 4056231,  546231, 5046231, 4506231, 5406231,  465231, 4065231,  645231, 6045231, 4605231, 6405231,
+       564231, 5064231,  654231, 6054231, 5604231, 6504231, 4560231, 5460231, 4650231, 6450231, 5640231, 6540231,
+      2456031, 4256031, 2546031, 5246031, 4526031, 5426031, 2465031, 4265031, 2645031, 6245031, 4625031, 6425031,
+      2564031, 5264031, 2654031, 6254031, 5624031, 6524031, 4562031, 5462031, 4652031, 6452031, 5642031, 6542031,
+       345621, 3045621,  435621, 4035621, 3405621, 4305621,  354621, 3054621,  534621, 5034621, 3504621, 5304621,
+       453621, 4053621,  543621, 5043621, 4503621, 5403621, 3450621, 4350621, 3540621, 5340621, 4530621, 5430621,
+       346521, 3046521,  436521, 4036521, 3406521, 4306521,  364521, 3064521,  634521, 6034521, 3604521, 6304521,
+       463521, 4063521,  643521, 6043521, 4603521, 6403521, 3460521, 4360521, 3640521, 6340521, 4630521, 6430521,
+       356421, 3056421,  536421, 5036421, 3506421, 5306421,  365421, 3065421,  635421, 6035421, 3605421, 6305421,
+       563421, 5063421,  653421, 6053421, 5603421, 6503421, 3560421, 5360421, 3650421, 6350421, 5630421, 6530421,
+       456321, 4056321,  546321, 5046321, 4506321, 5406321,  465321, 4065321,  645321, 6045321, 4605321, 6405321,
+       564321, 5064321,  654321, 6054321, 5604321, 6504321, 4560321, 5460321, 4650321, 6450321, 5640321, 6540321,
+      3456021, 4356021, 3546021, 5346021, 4536021, 5436021, 3465021, 4365021, 3645021, 6345021, 4635021, 6435021,
+      3564021, 5364021, 3654021, 6354021, 5634021, 6534021, 4563021, 5463021, 4653021, 6453021, 5643021, 6543021,
+      2345601, 3245601, 2435601, 4235601, 3425601, 4325601, 2354601, 3254601, 2534601, 5234601, 3524601, 5324601,
+      2453601, 4253601, 2543601, 5243601, 4523601, 5423601, 3452601, 4352601, 3542601, 5342601, 4532601, 5432601,
+      2346501, 3246501, 2436501, 4236501, 3426501, 4326501, 2364501, 3264501, 2634501, 6234501, 3624501, 6324501,
+      2463501, 4263501, 2643501, 6243501, 4623501, 6423501, 3462501, 4362501, 3642501, 6342501, 4632501, 6432501,
+      2356401, 3256401, 2536401, 5236401, 3526401, 5326401, 2365401, 3265401, 2635401, 6235401, 3625401, 6325401,
+      2563401, 5263401, 2653401, 6253401, 5623401, 6523401, 3562401, 5362401, 3652401, 6352401, 5632401, 6532401,
+      2456301, 4256301, 2546301, 5246301, 4526301, 5426301, 2465301, 4265301, 2645301, 6245301, 4625301, 6425301,
+      2564301, 5264301, 2654301, 6254301, 5624301, 6524301, 4562301, 5462301, 4652301, 6452301, 5642301, 6542301,
+      3456201, 4356201, 3546201, 5346201, 4536201, 5436201, 3465201, 4365201, 3645201, 6345201, 4635201, 6435201,
+      3564201, 5364201, 3654201, 6354201, 5634201, 6534201, 4563201, 5463201, 4653201, 6453201, 5643201, 6543201,
+      1234560, 2134560, 1324560, 3124560, 2314560, 3214560, 1243560, 2143560, 1423560, 4123560, 2413560, 4213560,
+      1342560, 3142560, 1432560, 4132560, 3412560, 4312560, 2341560, 3241560, 2431560, 4231560, 3421560, 4321560,
+      1235460, 2135460, 1325460, 3125460, 2315460, 3215460, 1253460, 2153460, 1523460, 5123460, 2513460, 5213460,
+      1352460, 3152460, 1532460, 5132460, 3512460, 5312460, 2351460, 3251460, 2531460, 5231460, 3521460, 5321460,
+      1245360, 2145360, 1425360, 4125360, 2415360, 4215360, 1254360, 2154360, 1524360, 5124360, 2514360, 5214360,
+      1452360, 4152360, 1542360, 5142360, 4512360, 5412360, 2451360, 4251360, 2541360, 5241360, 4521360, 5421360,
+      1345260, 3145260, 1435260, 4135260, 3415260, 4315260, 1354260, 3154260, 1534260, 5134260, 3514260, 5314260,
+      1453260, 4153260, 1543260, 5143260, 4513260, 5413260, 3451260, 4351260, 3541260, 5341260, 4531260, 5431260,
+      2345160, 3245160, 2435160, 4235160, 3425160, 4325160, 2354160, 3254160, 2534160, 5234160, 3524160, 5324160,
+      2453160, 4253160, 2543160, 5243160, 4523160, 5423160, 3452160, 4352160, 3542160, 5342160, 4532160, 5432160,
+      1234650, 2134650, 1324650, 3124650, 2314650, 3214650, 1243650, 2143650, 1423650, 4123650, 2413650, 4213650,
+      1342650, 3142650, 1432650, 4132650, 3412650, 4312650, 2341650, 3241650, 2431650, 4231650, 3421650, 4321650,
+      1236450, 2136450, 1326450, 3126450, 2316450, 3216450, 1263450, 2163450, 1623450, 6123450, 2613450, 6213450,
+      1362450, 3162450, 1632450, 6132450, 3612450, 6312450, 2361450, 3261450, 2631450, 6231450, 3621450, 6321450,
+      1246350, 2146350, 1426350, 4126350, 2416350, 4216350, 1264350, 2164350, 1624350, 6124350, 2614350, 6214350,
+      1462350, 4162350, 1642350, 6142350, 4612350, 6412350, 2461350, 4261350, 2641350, 6241350, 4621350, 6421350,
+      1346250, 3146250, 1436250, 4136250, 3416250, 4316250, 1364250, 3164250, 1634250, 6134250, 3614250, 6314250,
+      1463250, 4163250, 1643250, 6143250, 4613250, 6413250, 3461250, 4361250, 3641250, 6341250, 4631250, 6431250,
+      2346150, 3246150, 2436150, 4236150, 3426150, 4326150, 2364150, 3264150, 2634150, 6234150, 3624150, 6324150,
+      2463150, 4263150, 2643150, 6243150, 4623150, 6423150, 3462150, 4362150, 3642150, 6342150, 4632150, 6432150,
+      1235640, 2135640, 1325640, 3125640, 2315640, 3215640, 1253640, 2153640, 1523640, 5123640, 2513640, 5213640,
+      1352640, 3152640, 1532640, 5132640, 3512640, 5312640, 2351640, 3251640, 2531640, 5231640, 3521640, 5321640,
+      1236540, 2136540, 1326540, 3126540, 2316540, 3216540, 1263540, 2163540, 1623540, 6123540, 2613540, 6213540,
+      1362540, 3162540, 1632540, 6132540, 3612540, 6312540, 2361540, 3261540, 2631540, 6231540, 3621540, 6321540,
+      1256340, 2156340, 1526340, 5126340, 2516340, 5216340, 1265340, 2165340, 1625340, 6125340, 2615340, 6215340,
+      1562340, 5162340, 1652340, 6152340, 5612340, 6512340, 2561340, 5261340, 2651340, 6251340, 5621340, 6521340,
+      1356240, 3156240, 1536240, 5136240, 3516240, 5316240, 1365240, 3165240, 1635240, 6135240, 3615240, 6315240,
+      1563240, 5163240, 1653240, 6153240, 5613240, 6513240, 3561240, 5361240, 3651240, 6351240, 5631240, 6531240,
+      2356140, 3256140, 2536140, 5236140, 3526140, 5326140, 2365140, 3265140, 2635140, 6235140, 3625140, 6325140,
+      2563140, 5263140, 2653140, 6253140, 5623140, 6523140, 3562140, 5362140, 3652140, 6352140, 5632140, 6532140,
+      1245630, 2145630, 1425630, 4125630, 2415630, 4215630, 1254630, 2154630, 1524630, 5124630, 2514630, 5214630,
+      1452630, 4152630, 1542630, 5142630, 4512630, 5412630, 2451630, 4251630, 2541630, 5241630, 4521630, 5421630,
+      1246530, 2146530, 1426530, 4126530, 2416530, 4216530, 1264530, 2164530, 1624530, 6124530, 2614530, 6214530,
+      1462530, 4162530, 1642530, 6142530, 4612530, 6412530, 2461530, 4261530, 2641530, 6241530, 4621530, 6421530,
+      1256430, 2156430, 1526430, 5126430, 2516430, 5216430, 1265430, 2165430, 1625430, 6125430, 2615430, 6215430,
+      1562430, 5162430, 1652430, 6152430, 5612430, 6512430, 2561430, 5261430, 2651430, 6251430, 5621430, 6521430,
+      1456230, 4156230, 1546230, 5146230, 4516230, 5416230, 1465230, 4165230, 1645230, 6145230, 4615230, 6415230,
+      1564230, 5164230, 1654230, 6154230, 5614230, 6514230, 4561230, 5461230, 4651230, 6451230, 5641230, 6541230,
+      2456130, 4256130, 2546130, 5246130, 4526130, 5426130, 2465130, 4265130, 2645130, 6245130, 4625130, 6425130,
+      2564130, 5264130, 2654130, 6254130, 5624130, 6524130, 4562130, 5462130, 4652130, 6452130, 5642130, 6542130,
+      1345620, 3145620, 1435620, 4135620, 3415620, 4315620, 1354620, 3154620, 1534620, 5134620, 3514620, 5314620,
+      1453620, 4153620, 1543620, 5143620, 4513620, 5413620, 3451620, 4351620, 3541620, 5341620, 4531620, 5431620,
+      1346520, 3146520, 1436520, 4136520, 3416520, 4316520, 1364520, 3164520, 1634520, 6134520, 3614520, 6314520,
+      1463520, 4163520, 1643520, 6143520, 4613520, 6413520, 3461520, 4361520, 3641520, 6341520, 4631520, 6431520,
+      1356420, 3156420, 1536420, 5136420, 3516420, 5316420, 1365420, 3165420, 1635420, 6135420, 3615420, 6315420,
+      1563420, 5163420, 1653420, 6153420, 5613420, 6513420, 3561420, 5361420, 3651420, 6351420, 5631420, 6531420,
+      1456320, 4156320, 1546320, 5146320, 4516320, 5416320, 1465320, 4165320, 1645320, 6145320, 4615320, 6415320,
+      1564320, 5164320, 1654320, 6154320, 5614320, 6514320, 4561320, 5461320, 4651320, 6451320, 5641320, 6541320,
+      3456120, 4356120, 3546120, 5346120, 4536120, 5436120, 3465120, 4365120, 3645120, 6345120, 4635120, 6435120,
+      3564120, 5364120, 3654120, 6354120, 5634120, 6534120, 4563120, 5463120, 4653120, 6453120, 5643120, 6543120,
+      2345610, 3245610, 2435610, 4235610, 3425610, 4325610, 2354610, 3254610, 2534610, 5234610, 3524610, 5324610,
+      2453610, 4253610, 2543610, 5243610, 4523610, 5423610, 3452610, 4352610, 3542610, 5342610, 4532610, 5432610,
+      2346510, 3246510, 2436510, 4236510, 3426510, 4326510, 2364510, 3264510, 2634510, 6234510, 3624510, 6324510,
+      2463510, 4263510, 2643510, 6243510, 4623510, 6423510, 3462510, 4362510, 3642510, 6342510, 4632510, 6432510,
+      2356410, 3256410, 2536410, 5236410, 3526410, 5326410, 2365410, 3265410, 2635410, 6235410, 3625410, 6325410,
+      2563410, 5263410, 2653410, 6253410, 5623410, 6523410, 3562410, 5362410, 3652410, 6352410, 5632410, 6532410,
+      2456310, 4256310, 2546310, 5246310, 4526310, 5426310, 2465310, 4265310, 2645310, 6245310, 4625310, 6425310,
+      2564310, 5264310, 2654310, 6254310, 5624310, 6524310, 4562310, 5462310, 4652310, 6452310, 5642310, 6542310,
+      3456210, 4356210, 3546210, 5346210, 4536210, 5436210, 3465210, 4365210, 3645210, 6345210, 4635210, 6435210,
+      3564210, 5364210, 3654210, 6354210, 5634210, 6534210, 4563210, 5463210, 4653210, 6453210, 5643210, 6543210
+    };
+    std::map<uint64_t, int> expected;
+    for (std::size_t i = 0; i < 5040; i++)
+      expected[pre_expected[i]] = 0; // flags are 0, everything is symmetric here
+
+    VERIFY(isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 5040u);
+    VERIFY_IS_EQUAL(group.globalFlags(), 0);
+    group.apply<checkIdx, int>(identity7, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 5040u);
+  }
+}
+
+static void test_tensor_epsilon()
+{
+  SGroup<AntiSymmetry<0,1>, AntiSymmetry<1,2>> sym;
+  Tensor<int, 3> epsilon(3,3,3);
+
+  epsilon.setZero();
+  sym(epsilon, 0, 1, 2) = 1;
+
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        VERIFY_IS_EQUAL((epsilon(i,j,k)), (- (j - i) * (k - j) * (i - k) / 2) );
+      }
+    }
+  }
+}
+
+static void test_tensor_sym()
+{
+  SGroup<Symmetry<0,1>, Symmetry<2,3>> sym;
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = l; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = j; i < 10; i++) {
+          sym(t, i, j, k, l) = (i + j) * (k + l);
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+
+}
+
+static void test_tensor_asym()
+{
+  SGroup<AntiSymmetry<0,1>, AntiSymmetry<2,3>> sym;
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = l + 1; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = j + 1; i < 10; i++) {
+          sym(t, i, j, k, l) = ((i * j) + (k * l));
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          if (i < j && k < l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (((i * j) + (k * l))));
+          else if (i > j && k > l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (((i * j) + (k * l))));
+          else if (i < j && k > l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (- ((i * j) + (k * l))));
+          else if (i > j && k < l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (- ((i * j) + (k * l))));
+          else
+            VERIFY_IS_EQUAL((t(i, j, k, l)), 0);
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_dynsym()
+{
+  DynamicSGroup sym;
+  sym.addSymmetry(0,1);
+  sym.addSymmetry(2,3);
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = l; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = j; i < 10; i++) {
+          sym(t, i, j, k, l) = (i + j) * (k + l);
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_randacc()
+{
+  SGroup<Symmetry<0,1>, Symmetry<2,3>> sym;
+  Tensor<int, 4> t(10,10,10,10);
+
+  t.setZero();
+
+  // set elements 1 million times, that way we access the
+  // entire matrix
+  for (int n = 0; n < 1000000; n++) {
+    int i = rand() % 10;
+    int j = rand() % 10;
+    int k = rand() % 10;
+    int l = rand() % 10;
+    // only access those indices in a given order
+    if (i < j)
+      std::swap(i, j);
+    if (k < l)
+      std::swap(k, l);
+    sym(t, i, j, k, l) = (i + j) * (k + l);
+  }
+
+  for (int l = 0; l < 10; l++) {
+    for (int k = 0; k < 10; k++) {
+      for (int j = 0; j < 10; j++) {
+        for (int i = 0; i < 10; i++) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_symmetry)
+{
+  CALL_SUBTEST(test_symgroups_static());
+  CALL_SUBTEST(test_symgroups_dynamic());
+  CALL_SUBTEST(test_symgroups_selection());
+  CALL_SUBTEST(test_tensor_epsilon());
+  CALL_SUBTEST(test_tensor_sym());
+  CALL_SUBTEST(test_tensor_asym());
+  CALL_SUBTEST(test_tensor_dynsym());
+  CALL_SUBTEST(test_tensor_randacc());
+}
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */

diff --git a/unsupported/test/cxx11_tensor_thread_local.cpp b/unsupported/test/cxx11_tensor_thread_local.cpp
new file mode 100644
index 0000000..7e866f6
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_local.cpp

@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <iostream>
+#include <unordered_set>
+
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+struct Counter {
+  Counter() = default;
+
+  void inc() {
+    // Check that mutation happens only in a thread that created this counter.
+    VERIFY_IS_EQUAL(std::this_thread::get_id(), created_by);
+    counter_value++;
+  }
+  int value() { return counter_value; }
+
+  std::thread::id created_by;
+  int counter_value = 0;
+};
+
+struct InitCounter {
+  void operator()(Counter& counter) {
+    counter.created_by = std::this_thread::get_id();
+  }
+};
+
+void test_simple_thread_local() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+  int num_tasks = 3 * num_threads;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  counter.ForEach(
+      [](std::thread::id, Counter& cnt) { VERIFY_IS_EQUAL(cnt.value(), 3); });
+}
+
+void test_zero_sized_thread_local() {
+  Eigen::ThreadLocal<Counter, InitCounter> counter(0, InitCounter());
+
+  Counter& local = counter.local();
+  local.inc();
+
+  int total = 0;
+  counter.ForEach([&total](std::thread::id, Counter& cnt) {
+    total += cnt.value();
+    VERIFY_IS_EQUAL(cnt.value(), 1);
+  });
+
+  VERIFY_IS_EQUAL(total, 1);
+}
+
+// All thread local values fits into the lock-free storage.
+void test_large_number_of_tasks_no_spill() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+  int num_tasks = 10000;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  int total = 0;
+  std::unordered_set<std::thread::id> unique_threads;
+
+  counter.ForEach([&](std::thread::id id, Counter& cnt) {
+    total += cnt.value();
+    unique_threads.insert(id);
+  });
+
+  VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in a pool might be woken up to execute submitted tasks.
+  // Also thread_pool.Schedule() might use current thread if queue is full.
+  VERIFY_IS_EQUAL(
+      unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true);
+}
+
+// Lock free thread local storage is too small to fit all the unique threads,
+// and it spills to a map guarded by a mutex.
+void test_large_number_of_tasks_with_spill() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(1, InitCounter());
+
+  int num_tasks = 10000;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  int total = 0;
+  std::unordered_set<std::thread::id> unique_threads;
+
+  counter.ForEach([&](std::thread::id id, Counter& cnt) {
+    total += cnt.value();
+    unique_threads.insert(id);
+  });
+
+  VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in a pool might be woken up to execute submitted tasks.
+  // Also thread_pool.Schedule() might use current thread if queue is full.
+  VERIFY_IS_EQUAL(
+      unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_local) {
+  CALL_SUBTEST(test_simple_thread_local());
+  CALL_SUBTEST(test_zero_sized_thread_local());
+  CALL_SUBTEST(test_large_number_of_tasks_no_spill());
+  CALL_SUBTEST(test_large_number_of_tasks_with_spill());
+}

diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
new file mode 100644
index 0000000..b772a1d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp

@@ -0,0 +1,721 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+
+#include "main.h"
+#include <iostream>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+class TestAllocator : public Allocator {
+ public:
+  ~TestAllocator() EIGEN_OVERRIDE {}
+  EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
+    const_cast<TestAllocator*>(this)->alloc_count_++;
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
+    const_cast<TestAllocator*>(this)->dealloc_count_++;
+    internal::aligned_free(buffer);
+  }
+
+  int alloc_count() const { return alloc_count_; }
+  int dealloc_count() const { return dealloc_count_; }
+
+ private:
+  int alloc_count_ = 0;
+  int dealloc_count_ = 0;
+};
+
+void test_multithread_elementwise()
+{
+  Tensor<float, 3> in1(200, 30, 70);
+  Tensor<float, 3> in2(200, 30, 70);
+  Tensor<double, 3> out(200, 30, 70);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+  out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
+
+  for (int i = 0; i < 200; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
+      }
+    }
+  }
+}
+
+void test_async_multithread_elementwise()
+{
+  Tensor<float, 3> in1(200, 30, 70);
+  Tensor<float, 3> in2(200, 30, 70);
+  Tensor<double, 3> out(200, 30, 70);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+
+  Eigen::Barrier b(1);
+  out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
+  b.Wait();
+
+  for (int i = 0; i < 200; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
+      }
+    }
+  }
+}
+
+void test_multithread_compound_assignment()
+{
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+  out.device(thread_pool_device) = in1;
+  out.device(thread_pool_device) += in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+void test_multithread_contraction()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
+  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 1147);
+  MapXf m_right(t_right.data(), 1147, 1400);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  Eigen::ThreadPool tp(4);
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+ for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  m_result(i) << std::endl;
+    assert(false);
+  }
+}
+
+template<int DataLayout>
+void test_contraction_corner_cases()
+{
+  Tensor<float, 2, DataLayout> t_left(32, 500);
+  Tensor<float, 2, DataLayout> t_right(32, 28*28);
+  Tensor<float, 2, DataLayout> t_result(500, 28*28);
+
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result = t_result.constant(NAN);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
+
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 32, 500);
+  MapXf m_right(t_right.data(), 32, 28*28);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
+
+  Eigen::ThreadPool tp(12);
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left.transpose() * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 1);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_result.resize (1, 28*28);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 500);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (500, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 500);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 1);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (1, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+}
+
+template<int DataLayout>
+void test_multithread_contraction_agrees_with_singlethread() {
+  int contract_size = internal::random<int>(1, 5000);
+
+  Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
+                                    contract_size,
+                                    internal::random<int>(1, 100));
+
+  Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
+                                     internal::random<int>(1, 37),
+                                     contract_size,
+                                     internal::random<int>(1, 51));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 5, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test will fail
+    // due to numerical precision issues when values are small)
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams&, Index, Index, Index num_rows,
+      Index num_cols) const {
+    for (int i = 0; i < num_rows; ++i) {
+      for (int j = 0; j < num_cols; ++j) {
+        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+      }
+    }
+  }
+};
+
+template <int DataLayout>
+static void test_multithread_contraction_with_output_kernel() {
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in mat4 to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+template<int DataLayout>
+void test_async_multithread_contraction_agrees_with_singlethread()
+{
+  int contract_size = internal::random<int>(100, 500);
+
+  Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
+                                    contract_size,
+                                    internal::random<int>(10, 40));
+
+  Tensor<float, 4, DataLayout> right(
+      internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
+      internal::random<int>(1, 20));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
+
+  Tensor<float, 5, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+
+  Eigen::Barrier barrier(1);
+  tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
+      left.contract(right, dims);
+  barrier.Wait();
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test
+    // will fail due to numerical precision issues when values are small)
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims);
+  barrier.Wait();
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims, SqrtOutputKernel());
+  barrier.Wait();
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+template<int DataLayout>
+void test_full_contraction() {
+  int contract_size1 = internal::random<int>(1, 500);
+  int contract_size2 = internal::random<int>(1, 500);
+
+  Tensor<float, 2, DataLayout> left(contract_size1,
+                                    contract_size2);
+  Tensor<float, 2, DataLayout> right(contract_size1,
+                                    contract_size2);
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 0, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 0, DataLayout> tp_result;
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  // if both of the values are very small, then do nothing (because the test will fail
+  // due to numerical precision issues when values are small)
+  if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
+    VERIFY_IS_APPROX(st_result(), tp_result());
+  }
+}
+
+template<int DataLayout>
+void test_multithreaded_reductions() {
+  const int num_threads = internal::random<int>(3, 11);
+  ThreadPool thread_pool(num_threads);
+  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
+
+  const int num_rows = internal::random<int>(13, 732);
+  const int num_cols = internal::random<int>(13, 732);
+  Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
+  t1.setRandom();
+
+  Tensor<float, 0, DataLayout> full_redux;
+  full_redux = t1.sum();
+
+  Tensor<float, 0, DataLayout> full_redux_tp;
+  full_redux_tp.device(thread_pool_device) = t1.sum();
+
+  // Check that the single threaded and the multi threaded reductions return
+  // the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_tp());
+}
+
+
+void test_memcpy() {
+
+  for (int i = 0; i < 5; ++i) {
+    const int num_threads = internal::random<int>(3, 11);
+    Eigen::ThreadPool tp(num_threads);
+    Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
+
+    const int size = internal::random<int>(13, 7632);
+    Tensor<float, 1> t1(size);
+    t1.setRandom();
+    std::vector<float> result(size);
+    thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
+    for (int j = 0; j < size; j++) {
+      VERIFY_IS_EQUAL(t1(j), result[j]);
+    }
+  }
+}
+
+
+void test_multithread_random()
+{
+  Eigen::ThreadPool tp(2);
+  Eigen::ThreadPoolDevice device(&tp, 2);
+  Tensor<float, 1> t(1 << 20);
+  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
+}
+
+template<int DataLayout>
+void test_multithread_shuffle(Allocator* allocator)
+{
+  Tensor<float, 4, DataLayout> tensor(17,5,7,11);
+  tensor.setRandom();
+
+  const int num_threads = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+  Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
+  array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
+  shuffle.device(device) = tensor.shuffle(shuffles);
+
+  for (int i = 0; i < 17; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
+        }
+      }
+    }
+  }
+}
+
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+  const int num_threads = internal::random<int>(2, 11);
+  const int num_allocs = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+  for (int a = 0; a < num_allocs; ++a) {
+    void* ptr = device.allocate(512);
+    device.deallocate(ptr);
+  }
+  VERIFY(allocator != NULL);
+  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
+{
+  CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_async_multithread_elementwise());
+  CALL_SUBTEST_1(test_multithread_compound_assignment());
+
+  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
+
+  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Test EvalShardedByInnerDimContext parallelization strategy.
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  // Exercise various cases that have been problematic in the past.
+  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
+
+  CALL_SUBTEST_10(test_memcpy());
+  CALL_SUBTEST_10(test_multithread_random());
+
+  TestAllocator test_allocator;
+  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
+}

diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 0000000..0097228
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_trace.cpp

@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_0D_trace() {
+  Tensor<float, 0, DataLayout> tensor;
+  tensor.setRandom();
+  array<ptrdiff_t, 0> dims;
+  Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+  VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+  Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+  tensor1.setRandom();
+  Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+  VERIFY_IS_EQUAL(result1.rank(), 0);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum += tensor1(i, i, i);
+  }
+  VERIFY_IS_EQUAL(result1(), sum);
+
+  Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 5> dims = { { 2, 1, 0, 3, 4 } };
+  Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+  VERIFY_IS_EQUAL(result2.rank(), 0);
+  sum = 0.0f;
+  for (int i = 0; i < 7; ++i) {
+    sum += tensor2(i, i, i, i, i);
+  }
+  VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+  Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+  tensor1.setRandom();
+  array<ptrdiff_t, 2> dims1 = { { 0, 2 } };
+  Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+  VERIFY_IS_EQUAL(result1.rank(), 1);
+  VERIFY_IS_EQUAL(result1.dimension(0), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor1(j, i, j);
+    }
+    VERIFY_IS_EQUAL(result1(i), sum);
+  }
+
+  Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 2> dims2 = { { 2, 3 } };
+  Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+  VERIFY_IS_EQUAL(result2.rank(), 2);
+  VERIFY_IS_EQUAL(result2.dimension(0), 5);
+  VERIFY_IS_EQUAL(result2.dimension(1), 5);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 7; ++k) {
+        sum += tensor2(i, j, k, k);
+      }
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+    }
+  }
+
+  array<ptrdiff_t, 2> dims3 = { { 1, 0 } };
+  Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+  VERIFY_IS_EQUAL(result3.rank(), 2);
+  VERIFY_IS_EQUAL(result3.dimension(0), 7);
+  VERIFY_IS_EQUAL(result3.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        sum += tensor2(k, k, i, j);
+      }
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+  tensor3.setRandom();
+  array<ptrdiff_t, 3> dims4 = { { 0, 2, 4 } };
+  Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+  VERIFY_IS_EQUAL(result4.rank(), 2);
+  VERIFY_IS_EQUAL(result4.dimension(0), 7);
+  VERIFY_IS_EQUAL(result4.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor3(k, i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result4(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+  tensor4.setRandom();
+  array<ptrdiff_t, 2> dims5 = { { 1, 3 } };
+  Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+  VERIFY_IS_EQUAL(result5.rank(), 3);
+  VERIFY_IS_EQUAL(result5.dimension(0), 3);
+  VERIFY_IS_EQUAL(result5.dimension(1), 4);
+  VERIFY_IS_EQUAL(result5.dimension(2), 5);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        sum = 0.0f;
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor4(i, l, j, l, k);
+        }
+        VERIFY_IS_EQUAL(result5(i, j, k), sum);
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> dims = { { 1, 3 } };
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.trace(dims);
+  VERIFY_IS_EQUAL(result.rank(), 2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor(i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
+  CALL_SUBTEST(test_0D_trace<ColMajor>());
+  CALL_SUBTEST(test_0D_trace<RowMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+  CALL_SUBTEST(test_simple_trace<ColMajor>());
+  CALL_SUBTEST(test_simple_trace<RowMajor>());
+  CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+  CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}

diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
new file mode 100644
index 0000000..46fceaa
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_uint128.cpp

@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+#if EIGEN_COMP_MSVC || !defined(__SIZEOF_INT128__)
+#define EIGEN_NO_INT128
+#else
+typedef __uint128_t uint128_t;
+#endif
+
+// Only run the test on compilers that support 128bit integers natively
+#ifndef EIGEN_NO_INT128
+
+using Eigen::internal::TensorUInt128;
+using Eigen::internal::static_val;
+
+void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, uint128_t expected) {
+  bool matchl = actual.lower() == static_cast<uint64_t>(expected);
+  bool matchh = actual.upper() == static_cast<uint64_t>(expected >> 64);
+  if (!matchl || !matchh) {
+    const char* testname = g_test_stack.back().c_str();
+    std::cerr << "Test " << testname << " failed in " << __FILE__
+              << " (" << __LINE__ << ")"
+              << std::endl;
+    abort();
+  }
+}
+
+
+void test_add() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i + j;
+          uint128_t expected = a + b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_sub() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i - j;
+          uint128_t expected = a - b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_mul() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i * j;
+          uint128_t expected = a * b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_div() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i / j;
+          uint128_t expected = a / b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_misc1() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+    TensorUInt128<static_val<0>, uint64_t> i(0, i2);
+    uint128_t a = static_cast<uint128_t>(i2);
+    for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+      TensorUInt128<static_val<0>, uint64_t> j(0, j2);
+      uint128_t b = static_cast<uint128_t>(j2);
+      uint64_t actual = (i * j).upper();
+      uint64_t expected = (a * b) >> 64;
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+
+void test_misc2() {
+  int64_t incr = internal::random<int64_t>(1, 100);
+  for (int64_t log_div = 0; log_div < 63; ++log_div) {
+    for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) {
+      uint64_t expected = (static_cast<uint128_t>(1) << (64+log_div)) / static_cast<uint128_t>(divider) - (static_cast<uint128_t>(1) << 64) + 1;
+      uint64_t shift = 1ULL << log_div;
+
+      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      uint64_t actual = static_cast<uint64_t>(result);
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+#endif
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
+{
+#ifdef EIGEN_NO_INT128
+  // Skip the test on compilers that don't support 128bit integers natively
+  return;
+#else
+  CALL_SUBTEST_1(test_add());
+  CALL_SUBTEST_2(test_sub());
+  CALL_SUBTEST_3(test_mul());
+  CALL_SUBTEST_4(test_div());
+  CALL_SUBTEST_5(test_misc1());
+  CALL_SUBTEST_6(test_misc2());
+#endif
+}

diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
new file mode 100644
index 0000000..862212e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp

@@ -0,0 +1,112 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_single_voxel_patch()
+{
+  Tensor<float, 5> tensor(4,2,3,5,7);
+  tensor.setRandom();
+  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  Tensor<float, 6> single_voxel_patch;
+  single_voxel_patch = tensor.extract_volume_patches(1, 1, 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(5), 7);
+
+  Tensor<float, 6, RowMajor> single_voxel_patch_row_major;
+  single_voxel_patch_row_major = tensor_row_major.extract_volume_patches(1, 1, 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_voxel_patch.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+  }
+}
+
+
+static void test_entire_volume_patch()
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+  tensor.setRandom();
+  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  Tensor<float, 6> entire_volume_patch;
+  entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(5), batch);
+
+  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
+{
+  CALL_SUBTEST(test_single_voxel_patch());
+  CALL_SUBTEST(test_entire_volume_patch());
+}

diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 0000000..8d99a48
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp

@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+IndexType sizeDim0 = 4;
+IndexType sizeDim1 = 2;
+IndexType sizeDim2 = 3;
+IndexType sizeDim3 = 5;
+IndexType sizeDim4 = 7;
+array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+tensor_col_major.setRandom();
+
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+
+
+  // single volume patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 6, DataLayout,IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+  gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+  array<IndexType, 6> patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+  Tensor<DataType, 6, RowMajor,IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+       VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+  }
+
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+  array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+
+    DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+    DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+    TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+    TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+    sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+    gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+    sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+
+    // single volume patch: ColMajor
+    array<IndexType, 6> patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+    Tensor<DataType, 6, DataLayout,IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+    size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType);
+    DataType* gpu_data_entire_volume_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+    TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+    gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+    sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+
+//  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+//  tensor.setRandom();
+//  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  //Tensor<float, 6> entire_volume_patch;
+  //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+//  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+
+  array<IndexType, 6> patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 6, RowMajor,IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_volume_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+}
+}

diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp
new file mode 100644
index 0000000..5f63161
--- /dev/null
+++ b/unsupported/test/dgmres.cpp

@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 desire Nuentsa <desire.nuentsa_wakam@inria.fr
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <unsupported/Eigen/IterativeSolvers>
+
+template<typename T> void test_dgmres_T()
+{
+  DGMRES<SparseMatrix<T>, DiagonalPreconditioner<T> > dgmres_colmajor_diag;
+  DGMRES<SparseMatrix<T>, IdentityPreconditioner    > dgmres_colmajor_I;
+  DGMRES<SparseMatrix<T>, IncompleteLUT<T> >           dgmres_colmajor_ilut;
+  //GMRES<SparseMatrix<T>, SSORPreconditioner<T> >     dgmres_colmajor_ssor;
+
+  CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_diag)  );
+//   CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_I)     );
+  CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ilut)     );
+  //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor)     );
+}
+
+EIGEN_DECLARE_TEST(dgmres)
+{
+  CALL_SUBTEST_1(test_dgmres_T<double>());
+  CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
+}

diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
new file mode 100644
index 0000000..14a909d
--- /dev/null
+++ b/unsupported/test/forward_adolc.cpp

@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Dense>
+
+#define NUMBER_DIRECTIONS 16
+#include <unsupported/Eigen/AdolcForward>
+
+template<typename Vector>
+EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p)
+{
+  typedef typename Vector::Scalar Scalar;
+  return (p-Vector(Scalar(-1),Scalar(1.))).norm() + (p.array().sqrt().abs() * p.array().sin()).sum() + p.dot(p);
+}
+
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct TestFunc1
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+
+  int m_inputs, m_values;
+
+  TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+  template<typename T>
+  void operator() (const Matrix<T,InputsAtCompileTime,1>& x, Matrix<T,ValuesAtCompileTime,1>* _v) const
+  {
+    Matrix<T,ValuesAtCompileTime,1>& v = *_v;
+
+    v[0] = 2 * x[0] * x[0] + x[0] * x[1];
+    v[1] = 3 * x[1] * x[0] + 0.5 * x[1] * x[1];
+    if(inputs()>2)
+    {
+      v[0] += 0.5 * x[2];
+      v[1] += x[2];
+    }
+    if(values()>2)
+    {
+      v[2] = 3 * x[1] * x[0] * x[0];
+    }
+    if (inputs()>2 && values()>2)
+      v[2] *= x[2];
+  }
+
+  void operator() (const InputType& x, ValueType* v, JacobianType* _j) const
+  {
+    (*this)(x, v);
+
+    if(_j)
+    {
+      JacobianType& j = *_j;
+
+      j(0,0) = 4 * x[0] + x[1];
+      j(1,0) = 3 * x[1];
+
+      j(0,1) = x[0];
+      j(1,1) = 3 * x[0] + 2 * 0.5 * x[1];
+
+      if (inputs()>2)
+      {
+        j(0,2) = 0.5;
+        j(1,2) = 1;
+      }
+      if(values()>2)
+      {
+        j(2,0) = 3 * x[1] * 2 * x[0];
+        j(2,1) = 3 * x[0] * x[0];
+      }
+      if (inputs()>2 && values()>2)
+      {
+        j(2,0) *= x[2];
+        j(2,1) *= x[2];
+
+        j(2,2) = 3 * x[1] * x[0] * x[0];
+        j(2,2) = 3 * x[1] * x[0] * x[0];
+      }
+    }
+  }
+};
+
+template<typename Func> void adolc_forward_jacobian(const Func& f)
+{
+    typename Func::InputType x = Func::InputType::Random(f.inputs());
+    typename Func::ValueType y(f.values()), yref(f.values());
+    typename Func::JacobianType j(f.values(),f.inputs()), jref(f.values(),f.inputs());
+
+    jref.setZero();
+    yref.setZero();
+    f(x,&yref,&jref);
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+
+    j.setZero();
+    y.setZero();
+    AdolcForwardJacobian<Func> autoj(f);
+    autoj(x, &y, &j);
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+
+EIGEN_DECLARE_TEST(forward_adolc)
+{
+  adtl::setNumDir(NUMBER_DIRECTIONS);
+
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,2,2>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,2,3>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,3,2>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,3,3>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double>(3,3)) ));
+  }
+
+  {
+    // simple instantiation tests
+    Matrix<adtl::adouble,2,1> x;
+    foo(x);
+    Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;
+    A.selfadjointView<Lower>().eigenvalues();
+  }
+}

diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp
new file mode 100644
index 0000000..8d2254b
--- /dev/null
+++ b/unsupported/test/gmres.cpp

@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_gmres_T()
+{
+  GMRES<SparseMatrix<T>, DiagonalPreconditioner<T> > gmres_colmajor_diag;
+  GMRES<SparseMatrix<T>, IdentityPreconditioner    > gmres_colmajor_I;
+  GMRES<SparseMatrix<T>, IncompleteLUT<T> >           gmres_colmajor_ilut;
+  //GMRES<SparseMatrix<T>, SSORPreconditioner<T> >     gmres_colmajor_ssor;
+
+  CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_diag)  );
+//   CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_I)     );
+  CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ilut)     );
+  //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor)     );
+}
+
+EIGEN_DECLARE_TEST(gmres)
+{
+  CALL_SUBTEST_1(test_gmres_T<double>());
+  CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
+}

diff --git a/unsupported/test/idrs.cpp b/unsupported/test/idrs.cpp
new file mode 100644
index 0000000..f88c016
--- /dev/null
+++ b/unsupported/test/idrs.cpp

@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_idrs_T()
+{
+  IDRS<SparseMatrix<T>, DiagonalPreconditioner<T> > idrs_colmajor_diag;
+  IDRS<SparseMatrix<T>, IncompleteLUT<T> >           idrs_colmajor_ilut;
+
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_ilut)     );
+}
+
+EIGEN_DECLARE_TEST(idrs)
+{
+  CALL_SUBTEST_1(test_idrs_T<double>());
+  CALL_SUBTEST_2(test_idrs_T<std::complex<double> >());
+}

diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
new file mode 100644
index 0000000..b5b764c
--- /dev/null
+++ b/unsupported/test/kronecker_product.cpp

@@ -0,0 +1,252 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Kolja Brix <brix@igpm.rwth-aachen.de>
+// Copyright (C) 2011 Andreas Platen <andiplaten@gmx.de>
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifdef EIGEN_TEST_PART_1
+
+#include "sparse.h"
+#include <Eigen/SparseExtra>
+#include <Eigen/KroneckerProduct>
+
+template<typename MatrixType>
+void check_dimension(const MatrixType& ab, const int rows,  const int cols)
+{
+  VERIFY_IS_EQUAL(ab.rows(), rows);
+  VERIFY_IS_EQUAL(ab.cols(), cols);
+}
+
+
+template<typename MatrixType>
+void check_kronecker_product(const MatrixType& ab)
+{
+  VERIFY_IS_EQUAL(ab.rows(), 6);
+  VERIFY_IS_EQUAL(ab.cols(), 6);
+  VERIFY_IS_EQUAL(ab.nonZeros(),  36);
+  VERIFY_IS_APPROX(ab.coeff(0,0), -0.4017367630386106);
+  VERIFY_IS_APPROX(ab.coeff(0,1),  0.1056863433932735);
+  VERIFY_IS_APPROX(ab.coeff(0,2), -0.7255206194554212);
+  VERIFY_IS_APPROX(ab.coeff(0,3),  0.1908653336744706);
+  VERIFY_IS_APPROX(ab.coeff(0,4),  0.350864567234111);
+  VERIFY_IS_APPROX(ab.coeff(0,5), -0.0923032108308013);
+  VERIFY_IS_APPROX(ab.coeff(1,0),  0.415417514804677);
+  VERIFY_IS_APPROX(ab.coeff(1,1), -0.2369227701722048);
+  VERIFY_IS_APPROX(ab.coeff(1,2),  0.7502275131458511);
+  VERIFY_IS_APPROX(ab.coeff(1,3), -0.4278731019742696);
+  VERIFY_IS_APPROX(ab.coeff(1,4), -0.3628129162264507);
+  VERIFY_IS_APPROX(ab.coeff(1,5),  0.2069210808481275);
+  VERIFY_IS_APPROX(ab.coeff(2,0),  0.05465890160863986);
+  VERIFY_IS_APPROX(ab.coeff(2,1), -0.2634092511419858);
+  VERIFY_IS_APPROX(ab.coeff(2,2),  0.09871180285793758);
+  VERIFY_IS_APPROX(ab.coeff(2,3), -0.4757066334017702);
+  VERIFY_IS_APPROX(ab.coeff(2,4), -0.04773740823058334);
+  VERIFY_IS_APPROX(ab.coeff(2,5),  0.2300535609645254);
+  VERIFY_IS_APPROX(ab.coeff(3,0), -0.8172945853260133);
+  VERIFY_IS_APPROX(ab.coeff(3,1),  0.2150086428359221);
+  VERIFY_IS_APPROX(ab.coeff(3,2),  0.5825113847292743);
+  VERIFY_IS_APPROX(ab.coeff(3,3), -0.1532433770097174);
+  VERIFY_IS_APPROX(ab.coeff(3,4), -0.329383387282399);
+  VERIFY_IS_APPROX(ab.coeff(3,5),  0.08665207912033064);
+  VERIFY_IS_APPROX(ab.coeff(4,0),  0.8451267514863225);
+  VERIFY_IS_APPROX(ab.coeff(4,1), -0.481996458918977);
+  VERIFY_IS_APPROX(ab.coeff(4,2), -0.6023482390791535);
+  VERIFY_IS_APPROX(ab.coeff(4,3),  0.3435339347164565);
+  VERIFY_IS_APPROX(ab.coeff(4,4),  0.3406002157428891);
+  VERIFY_IS_APPROX(ab.coeff(4,5), -0.1942526344200915);
+  VERIFY_IS_APPROX(ab.coeff(5,0),  0.1111982482925399);
+  VERIFY_IS_APPROX(ab.coeff(5,1), -0.5358806424754169);
+  VERIFY_IS_APPROX(ab.coeff(5,2), -0.07925446559335647);
+  VERIFY_IS_APPROX(ab.coeff(5,3),  0.3819388757769038);
+  VERIFY_IS_APPROX(ab.coeff(5,4),  0.04481475387219876);
+  VERIFY_IS_APPROX(ab.coeff(5,5), -0.2159688616158057);
+}
+
+
+template<typename MatrixType>
+void check_sparse_kronecker_product(const MatrixType& ab)
+{
+  VERIFY_IS_EQUAL(ab.rows(), 12);
+  VERIFY_IS_EQUAL(ab.cols(), 10);
+  VERIFY_IS_EQUAL(ab.nonZeros(), 3*2);
+  VERIFY_IS_APPROX(ab.coeff(3,0), -0.04);
+  VERIFY_IS_APPROX(ab.coeff(5,1),  0.05);
+  VERIFY_IS_APPROX(ab.coeff(0,6), -0.08);
+  VERIFY_IS_APPROX(ab.coeff(2,7),  0.10);
+  VERIFY_IS_APPROX(ab.coeff(6,8),  0.12);
+  VERIFY_IS_APPROX(ab.coeff(8,9), -0.15);
+}
+
+
+EIGEN_DECLARE_TEST(kronecker_product)
+{
+  // DM = dense matrix; SM = sparse matrix
+
+  Matrix<double, 2, 3> DM_a;
+  SparseMatrix<double> SM_a(2,3);
+  SM_a.insert(0,0) = DM_a.coeffRef(0,0) = -0.4461540300782201;
+  SM_a.insert(0,1) = DM_a.coeffRef(0,1) = -0.8057364375283049;
+  SM_a.insert(0,2) = DM_a.coeffRef(0,2) =  0.3896572459516341;
+  SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
+  SM_a.insert(1,1) = DM_a.coeffRef(1,1) =  0.6469156566545853;
+  SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
+
+  MatrixXd             DM_b(3,2);
+  SparseMatrix<double> SM_b(3,2);
+  SM_b.insert(0,0) = DM_b.coeffRef(0,0) =  0.9004440976767099;
+  SM_b.insert(0,1) = DM_b.coeffRef(0,1) = -0.2368830858139832;
+  SM_b.insert(1,0) = DM_b.coeffRef(1,0) = -0.9311078389941825;
+  SM_b.insert(1,1) = DM_b.coeffRef(1,1) =  0.5310335762980047;
+  SM_b.insert(2,0) = DM_b.coeffRef(2,0) = -0.1225112806872035;
+  SM_b.insert(2,1) = DM_b.coeffRef(2,1) =  0.5903998022741264;
+
+  SparseMatrix<double,RowMajor> SM_row_a(SM_a), SM_row_b(SM_b);
+
+  // test DM_fixedSize = kroneckerProduct(DM_block,DM)
+  Matrix<double, 6, 6> DM_fix_ab = kroneckerProduct(DM_a.topLeftCorner<2,3>(),DM_b);
+
+  CALL_SUBTEST(check_kronecker_product(DM_fix_ab));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a.topLeftCorner<2,3>(),DM_b)));
+
+  for(int i=0;i<DM_fix_ab.rows();++i)
+    for(int j=0;j<DM_fix_ab.cols();++j)
+       VERIFY_IS_APPROX(kroneckerProduct(DM_a,DM_b).coeff(i,j), DM_fix_ab(i,j));
+
+  // test DM_block = kroneckerProduct(DM,DM)
+  MatrixXd DM_block_ab(10,15);
+  DM_block_ab.block<6,6>(2,5) = kroneckerProduct(DM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(DM_block_ab.block<6,6>(2,5)));
+
+  // test DM = kroneckerProduct(DM,DM)
+  MatrixXd DM_ab = kroneckerProduct(DM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(DM_ab));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a,DM_b)));
+
+  // test SM = kroneckerProduct(SM,DM)
+  SparseMatrix<double> SM_ab = kroneckerProduct(SM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab));
+  SparseMatrix<double,RowMajor> SM_ab2 = kroneckerProduct(SM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(SM_a,DM_b)));
+
+  // test SM = kroneckerProduct(DM,SM)
+  SM_ab.setZero();
+  SM_ab.insert(0,0)=37.0;
+  SM_ab = kroneckerProduct(DM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab));
+  SM_ab2.setZero();
+  SM_ab2.insert(0,0)=37.0;
+  SM_ab2 = kroneckerProduct(DM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a,SM_b)));
+
+  // test SM = kroneckerProduct(SM,SM)
+  SM_ab.resize(2,33);
+  SM_ab.insert(0,0)=37.0;
+  SM_ab = kroneckerProduct(SM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab));
+  SM_ab2.resize(5,11);
+  SM_ab2.insert(0,0)=37.0;
+  SM_ab2 = kroneckerProduct(SM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(SM_a,SM_b)));
+
+  // test SM = kroneckerProduct(SM,SM) with sparse pattern
+  SM_a.resize(4,5);
+  SM_b.resize(3,2);
+  SM_a.resizeNonZeros(0);
+  SM_b.resizeNonZeros(0);
+  SM_a.insert(1,0) = -0.1;
+  SM_a.insert(0,3) = -0.2;
+  SM_a.insert(2,4) =  0.3;
+  SM_a.finalize();
+
+  SM_b.insert(0,0) =  0.4;
+  SM_b.insert(2,1) = -0.5;
+  SM_b.finalize();
+  SM_ab.resize(1,1);
+  SM_ab.insert(0,0)=37.0;
+  SM_ab = kroneckerProduct(SM_a,SM_b);
+  CALL_SUBTEST(check_sparse_kronecker_product(SM_ab));
+
+  // test dimension of result of DM = kroneckerProduct(DM,DM)
+  MatrixXd DM_a2(2,1);
+  MatrixXd DM_b2(5,4);
+  MatrixXd DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
+  CALL_SUBTEST(check_dimension(DM_ab2,2*5,1*4));
+  DM_a2.resize(10,9);
+  DM_b2.resize(4,8);
+  DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
+  CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
+
+  for(int i = 0; i < g_repeat; i++)
+  {
+    double density = Eigen::internal::random<double>(0.01,0.5);
+    int ra = Eigen::internal::random<int>(1,50);
+    int ca = Eigen::internal::random<int>(1,50);
+    int rb = Eigen::internal::random<int>(1,50);
+    int cb = Eigen::internal::random<int>(1,50);
+    SparseMatrix<float,ColMajor> sA(ra,ca), sB(rb,cb), sC;
+    SparseMatrix<float,RowMajor> sC2;
+    MatrixXf dA(ra,ca), dB(rb,cb), dC;
+    initSparse(density, dA, sA);
+    initSparse(density, dB, sB);
+
+    sC = kroneckerProduct(sA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+
+    sC = kroneckerProduct(sA.transpose(),sB);
+    dC = kroneckerProduct(dA.transpose(),dB);
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+
+    sC = kroneckerProduct(sA.transpose(),sB.transpose());
+    dC = kroneckerProduct(dA.transpose(),dB.transpose());
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+
+    sC = kroneckerProduct(sA,sB.transpose());
+    dC = kroneckerProduct(dA,dB.transpose());
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+
+    sC2 = kroneckerProduct(sA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+
+    sC2 = kroneckerProduct(dA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+
+    sC2 = kroneckerProduct(sA,dB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+
+    sC2 = kroneckerProduct(2*sA,sB);
+    dC = kroneckerProduct(2*dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+  }
+}
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// simply check that for a dense kronecker product, sparse module is not needed
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+
+EIGEN_DECLARE_TEST(kronecker_product)
+{
+  MatrixXd a(2,2), b(3,3), c;
+  a.setRandom();
+  b.setRandom();
+  c = kroneckerProduct(a,b);
+  VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);
+}
+
+#endif

diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
new file mode 100644
index 0000000..7f9a81c
--- /dev/null
+++ b/unsupported/test/levenberg_marquardt.cpp

@@ -0,0 +1,1477 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2012 desire Nuentsa <desire.nuentsa_wakam@inria.fr
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+// FIXME: These tests all check for hard-coded values. Ideally, parameters and start estimates should be randomized.
+
+
+#include <stdio.h>
+
+#include "main.h"
+#include <unsupported/Eigen/LevenbergMarquardt>
+
+// This disables some useless Warnings on MSVC.
+// It is intended to be done for this test only.
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+using std::sqrt;
+
+// tolerance for chekcing number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
+struct lmder_functor : DenseFunctor<double>
+{
+    lmder_functor(void): DenseFunctor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double tmp1, tmp2, tmp3;
+        static const double y[15] = {1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &x, MatrixXd &fjac) const
+    {
+        double tmp1, tmp2, tmp3, tmp4;
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4;
+            fjac(i,0) = -1;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+        return 0;
+    }
+};
+
+void testLmder1()
+{
+  int n=3, info;
+
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.lmder1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 6);
+  VERIFY_IS_EQUAL(lm.njev(), 5);
+
+  // check norm
+  VERIFY_IS_APPROX(lm.fvec().blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testLmder()
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 6);
+  VERIFY_IS_EQUAL(lm.njev(), 5);
+
+  // check norm
+  fnorm = lm.fvec().blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.matrixR(), lm.permutation().indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869941,  -0.002656662,
+      0.002869941,    0.09480935,   -0.09098995,
+      -0.002656662,   -0.09098995,    0.08778727;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.matrixR().topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct lmdif_functor : DenseFunctor<double>
+{
+    lmdif_functor(void) : DenseFunctor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        int i;
+        double tmp1,tmp2,tmp3;
+        static const double y[15]={1.4e-1,1.8e-1,2.2e-1,2.5e-1,2.9e-1,3.2e-1,3.5e-1,3.9e-1,
+            3.7e-1,5.8e-1,7.3e-1,9.6e-1,1.34e0,2.1e0,4.39e0};
+
+        assert(x.size()==3);
+        assert(fvec.size()==15);
+        for (i=0; i<15; i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 15 - i;
+            tmp3 = tmp1;
+
+            if (i >= 8) tmp3 = tmp2;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+};
+
+void testLmdif1()
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n), fvec(15);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  DenseIndex nfev;
+  info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(nfev, 26);
+
+  // check norm
+  functor(x, fvec);
+  VERIFY_IS_APPROX(fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.0824106, 1.1330366, 2.3436947;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+void testLmdif()
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  NumericalDiff<lmdif_functor> numDiff(functor);
+  LevenbergMarquardt<NumericalDiff<lmdif_functor> > lm(numDiff);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 26);
+
+  // check norm
+  fnorm = lm.fvec().blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.matrixR(), lm.permutation().indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869942,  -0.002656662,
+      0.002869942,    0.09480937,   -0.09098997,
+      -0.002656662,   -0.09098997,    0.08778729;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.matrixR().topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct chwirut2_functor : DenseFunctor<double>
+{
+    chwirut2_functor(void) : DenseFunctor<double>(3,54) {}
+    static const double m_x[54];
+    static const double m_y[54];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        int i;
+
+        assert(b.size()==3);
+        assert(fvec.size()==54);
+        for(i=0; i<54; i++) {
+            double x = m_x[i];
+            fvec[i] = exp(-b[0]*x)/(b[1]+b[2]*x) - m_y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==54);
+        assert(fjac.cols()==3);
+        for(int i=0; i<54; i++) {
+            double x = m_x[i];
+            double factor = 1./(b[1]+b[2]*x);
+            double e = exp(-b[0]*x);
+            fjac(i,0) = -x*e*factor;
+            fjac(i,1) = -e*factor*factor;
+            fjac(i,2) = -x*e*factor*factor;
+        }
+        return 0;
+    }
+};
+const double chwirut2_functor::m_x[54] = { 0.500E0, 1.000E0, 1.750E0, 3.750E0, 5.750E0, 0.875E0, 2.250E0, 3.250E0, 5.250E0, 0.750E0, 1.750E0, 2.750E0, 4.750E0, 0.625E0, 1.250E0, 2.250E0, 4.250E0, .500E0, 3.000E0, .750E0, 3.000E0, 1.500E0, 6.000E0, 3.000E0, 6.000E0, 1.500E0, 3.000E0, .500E0, 2.000E0, 4.000E0, .750E0, 2.000E0, 5.000E0, .750E0, 2.250E0, 3.750E0, 5.750E0, 3.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .500E0, 6.000E0, 3.000E0, .500E0, 2.750E0, .500E0, 1.750E0};
+const double chwirut2_functor::m_y[54] = { 92.9000E0 ,57.1000E0 ,31.0500E0 ,11.5875E0 ,8.0250E0 ,63.6000E0 ,21.4000E0 ,14.2500E0 ,8.4750E0 ,63.8000E0 ,26.8000E0 ,16.4625E0 ,7.1250E0 ,67.3000E0 ,41.0000E0 ,21.1500E0 ,8.1750E0 ,81.5000E0 ,13.1200E0 ,59.9000E0 ,14.6200E0 ,32.9000E0 ,5.4400E0 ,12.5600E0 ,5.4400E0 ,32.0000E0 ,13.9500E0 ,75.8000E0 ,20.0000E0 ,10.4200E0 ,59.5000E0 ,21.6700E0 ,8.5500E0 ,62.0000E0 ,20.2000E0 ,7.7600E0 ,3.7500E0 ,11.8100E0 ,54.7000E0 ,23.7000E0 ,11.5500E0 ,61.3000E0 ,17.7000E0 ,8.7400E0 ,59.2000E0 ,16.3000E0 ,8.6200E0 ,81.0000E0 ,4.8700E0 ,14.6200E0 ,81.7000E0 ,17.1700E0 ,81.3000E0 ,28.9000E0  };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/chwirut2.shtml
+void testNistChwirut2(void)
+{
+  const int n=3;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 0.1, 0.01, 0.02;
+  // do the computation
+  chwirut2_functor functor;
+  LevenbergMarquardt<chwirut2_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 10);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+
+  /*
+   * Second try
+   */
+  x<< 0.15, 0.008, 0.010;
+  // do the computation
+  lm.resetParameters();
+  lm.setFtol(1.E6*NumTraits<double>::epsilon());
+  lm.setXtol(1.E6*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 7);
+  VERIFY_IS_EQUAL(lm.njev(), 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+}
+
+
+struct misra1a_functor : DenseFunctor<double>
+{
+    misra1a_functor(void) : DenseFunctor<double>(2,14) {}
+    static const double m_x[14];
+    static const double m_y[14];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*(1.-exp(-b[1]*m_x[i])) - m_y[i] ;
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            fjac(i,0) = (1.-exp(-b[1]*m_x[i]));
+            fjac(i,1) = (b[0]*m_x[i]*exp(-b[1]*m_x[i]));
+        }
+        return 0;
+    }
+};
+const double misra1a_functor::m_x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1a_functor::m_y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1a.shtml
+void testNistMisra1a(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1a_functor functor;
+  LevenbergMarquardt<misra1a_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 19);
+  VERIFY_IS_EQUAL(lm.njev(), 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+
+  /*
+   * Second try
+   */
+  x<< 250., 0.0005;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 5);
+  VERIFY_IS_EQUAL(lm.njev(), 4);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+}
+
+struct hahn1_functor : DenseFunctor<double>
+{
+    hahn1_functor(void) : DenseFunctor<double>(7,236) {}
+    static const double m_x[236];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        static const double m_y[236] = { .591E0 , 1.547E0 , 2.902E0 , 2.894E0 , 4.703E0 , 6.307E0 , 7.03E0  , 7.898E0 , 9.470E0 , 9.484E0 , 10.072E0 , 10.163E0 , 11.615E0 , 12.005E0 , 12.478E0 , 12.982E0 , 12.970E0 , 13.926E0 , 14.452E0 , 14.404E0 , 15.190E0 , 15.550E0 , 15.528E0 , 15.499E0 , 16.131E0 , 16.438E0 , 16.387E0 , 16.549E0 , 16.872E0 , 16.830E0 , 16.926E0 , 16.907E0 , 16.966E0 , 17.060E0 , 17.122E0 , 17.311E0 , 17.355E0 , 17.668E0 , 17.767E0 , 17.803E0 , 17.765E0 , 17.768E0 , 17.736E0 , 17.858E0 , 17.877E0 , 17.912E0 , 18.046E0 , 18.085E0 , 18.291E0 , 18.357E0 , 18.426E0 , 18.584E0 , 18.610E0 , 18.870E0 , 18.795E0 , 19.111E0 , .367E0 , .796E0 , 0.892E0 , 1.903E0 , 2.150E0 , 3.697E0 , 5.870E0 , 6.421E0 , 7.422E0 , 9.944E0 , 11.023E0 , 11.87E0  , 12.786E0 , 14.067E0 , 13.974E0 , 14.462E0 , 14.464E0 , 15.381E0 , 15.483E0 , 15.59E0  , 16.075E0 , 16.347E0 , 16.181E0 , 16.915E0 , 17.003E0 , 16.978E0 , 17.756E0 , 17.808E0 , 17.868E0 , 18.481E0 , 18.486E0 , 19.090E0 , 16.062E0 , 16.337E0 , 16.345E0 ,
+        16.388E0 , 17.159E0 , 17.116E0 , 17.164E0 , 17.123E0 , 17.979E0 , 17.974E0 , 18.007E0 , 17.993E0 , 18.523E0 , 18.669E0 , 18.617E0 , 19.371E0 , 19.330E0 , 0.080E0 , 0.248E0 , 1.089E0 , 1.418E0 , 2.278E0 , 3.624E0 , 4.574E0 , 5.556E0 , 7.267E0 , 7.695E0 , 9.136E0 , 9.959E0 , 9.957E0 , 11.600E0 , 13.138E0 , 13.564E0 , 13.871E0 , 13.994E0 , 14.947E0 , 15.473E0 , 15.379E0 , 15.455E0 , 15.908E0 , 16.114E0 , 17.071E0 , 17.135E0 , 17.282E0 , 17.368E0 , 17.483E0 , 17.764E0 , 18.185E0 , 18.271E0 , 18.236E0 , 18.237E0 , 18.523E0 , 18.627E0 , 18.665E0 , 19.086E0 , 0.214E0 , 0.943E0 , 1.429E0 , 2.241E0 , 2.951E0 , 3.782E0 , 4.757E0 , 5.602E0 , 7.169E0 , 8.920E0 , 10.055E0 , 12.035E0 , 12.861E0 , 13.436E0 , 14.167E0 , 14.755E0 , 15.168E0 , 15.651E0 , 15.746E0 , 16.216E0 , 16.445E0 , 16.965E0 , 17.121E0 , 17.206E0 , 17.250E0 , 17.339E0 , 17.793E0 , 18.123E0 , 18.49E0  , 18.566E0 , 18.645E0 , 18.706E0 , 18.924E0 , 19.1E0   , 0.375E0 , 0.471E0 , 1.504E0 , 2.204E0 , 2.813E0 , 4.765E0 , 9.835E0 , 10.040E0 , 11.946E0 , 
+12.596E0 , 
+13.303E0 , 13.922E0 , 14.440E0 , 14.951E0 , 15.627E0 , 15.639E0 , 15.814E0 , 16.315E0 , 16.334E0 , 16.430E0 , 16.423E0 , 17.024E0 , 17.009E0 , 17.165E0 , 17.134E0 , 17.349E0 , 17.576E0 , 17.848E0 , 18.090E0 , 18.276E0 , 18.404E0 , 18.519E0 , 19.133E0 , 19.074E0 , 19.239E0 , 19.280E0 , 19.101E0 , 19.398E0 , 19.252E0 , 19.89E0  , 20.007E0 , 19.929E0 , 19.268E0 , 19.324E0 , 20.049E0 , 20.107E0 , 20.062E0 , 20.065E0 , 19.286E0 , 19.972E0 , 20.088E0 , 20.743E0 , 20.83E0  , 20.935E0 , 21.035E0 , 20.93E0  , 21.074E0 , 21.085E0 , 20.935E0 };
+
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+
+        assert(b.size()==7);
+        assert(fvec.size()==236);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - m_y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==236);
+        assert(fjac.cols()==7);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx);
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact;
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double hahn1_functor::m_x[236] = { 24.41E0 , 34.82E0 , 44.09E0 , 45.07E0 , 54.98E0 , 65.51E0 , 70.53E0 , 75.70E0 , 89.57E0 , 91.14E0 , 96.40E0 , 97.19E0 , 114.26E0 , 120.25E0 , 127.08E0 , 133.55E0 , 133.61E0 , 158.67E0 , 172.74E0 , 171.31E0 , 202.14E0 , 220.55E0 , 221.05E0 , 221.39E0 , 250.99E0 , 268.99E0 , 271.80E0 , 271.97E0 , 321.31E0 , 321.69E0 , 330.14E0 , 333.03E0 , 333.47E0 , 340.77E0 , 345.65E0 , 373.11E0 , 373.79E0 , 411.82E0 , 419.51E0 , 421.59E0 , 422.02E0 , 422.47E0 , 422.61E0 , 441.75E0 , 447.41E0 , 448.7E0  , 472.89E0 , 476.69E0 , 522.47E0 , 522.62E0 , 524.43E0 , 546.75E0 , 549.53E0 , 575.29E0 , 576.00E0 , 625.55E0 , 20.15E0 , 28.78E0 , 29.57E0 , 37.41E0 , 39.12E0 , 50.24E0 , 61.38E0 , 66.25E0 , 73.42E0 , 95.52E0 , 107.32E0 , 122.04E0 , 134.03E0 , 163.19E0 , 163.48E0 , 175.70E0 , 179.86E0 , 211.27E0 , 217.78E0 , 219.14E0 , 262.52E0 , 268.01E0 , 268.62E0 , 336.25E0 , 337.23E0 , 339.33E0 , 427.38E0 , 428.58E0 , 432.68E0 , 528.99E0 , 531.08E0 , 628.34E0 , 253.24E0 , 273.13E0 , 273.66E0 ,
+282.10E0 , 346.62E0 , 347.19E0 , 348.78E0 , 351.18E0 , 450.10E0 , 450.35E0 , 451.92E0 , 455.56E0 , 552.22E0 , 553.56E0 , 555.74E0 , 652.59E0 , 656.20E0 , 14.13E0 , 20.41E0 , 31.30E0 , 33.84E0 , 39.70E0 , 48.83E0 , 54.50E0 , 60.41E0 , 72.77E0 , 75.25E0 , 86.84E0 , 94.88E0 , 96.40E0 , 117.37E0 , 139.08E0 , 147.73E0 , 158.63E0 , 161.84E0 , 192.11E0 , 206.76E0 , 209.07E0 , 213.32E0 , 226.44E0 , 237.12E0 , 330.90E0 , 358.72E0 , 370.77E0 , 372.72E0 , 396.24E0 , 416.59E0 , 484.02E0 , 495.47E0 , 514.78E0 , 515.65E0 , 519.47E0 , 544.47E0 , 560.11E0 , 620.77E0 , 18.97E0 , 28.93E0 , 33.91E0 , 40.03E0 , 44.66E0 , 49.87E0 , 55.16E0 , 60.90E0 , 72.08E0 , 85.15E0 , 97.06E0 , 119.63E0 , 133.27E0 , 143.84E0 , 161.91E0 , 180.67E0 , 198.44E0 , 226.86E0 , 229.65E0 , 258.27E0 , 273.77E0 , 339.15E0 , 350.13E0 , 362.75E0 , 371.03E0 , 393.32E0 , 448.53E0 , 473.78E0 , 511.12E0 , 524.70E0 , 548.75E0 , 551.64E0 , 574.02E0 , 623.86E0 , 21.46E0 , 24.33E0 , 33.43E0 , 39.22E0 , 44.18E0 , 55.02E0 , 94.33E0 , 96.44E0 , 118.82E0 , 128.48E0 ,
+141.94E0 , 156.92E0 , 171.65E0 , 190.00E0 , 223.26E0 , 223.88E0 , 231.50E0 , 265.05E0 , 269.44E0 , 271.78E0 , 273.46E0 , 334.61E0 , 339.79E0 , 349.52E0 , 358.18E0 , 377.98E0 , 394.77E0 , 429.66E0 , 468.22E0 , 487.27E0 , 519.54E0 , 523.03E0 , 612.99E0 , 638.59E0 , 641.36E0 , 622.05E0 , 631.50E0 , 663.97E0 , 646.9E0  , 748.29E0 , 749.21E0 , 750.14E0 , 647.04E0 , 646.89E0 , 746.9E0  , 748.43E0 , 747.35E0 , 749.27E0 , 647.61E0 , 747.78E0 , 750.51E0 , 851.37E0 , 845.97E0 , 847.54E0 , 849.93E0 , 851.61E0 , 849.75E0 , 850.98E0 , 848.23E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/hahn1.shtml
+void testNistHahn1(void)
+{
+  const int  n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 10., -1., .05, -.00001, -.05, .001, -.000001;
+  // do the computation
+  hahn1_functor functor;
+  LevenbergMarquardt<hahn1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 11);
+  VERIFY_IS_EQUAL(lm.njev(), 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.0776351733E+00);
+  VERIFY_IS_APPROX(x[1],-1.2269296921E-01);
+  VERIFY_IS_APPROX(x[2], 4.0863750610E-03);
+  VERIFY_IS_APPROX(x[3],-1.426264e-06); // shoulde be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 2.4053735503E-04);
+  VERIFY_IS_APPROX(x[6],-1.2314450199E-07);
+
+  /*
+   * Second try
+   */
+  x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 11);
+  VERIFY_IS_EQUAL(lm.njev(), 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.077640); // should be :  1.0776351733E+00
+  VERIFY_IS_APPROX(x[1], -0.1226933); // should be : -1.2269296921E-01
+  VERIFY_IS_APPROX(x[2], 0.004086383); // should be : 4.0863750610E-03
+  VERIFY_IS_APPROX(x[3], -1.426277e-06); // shoulde be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 0.00024053772); // should be : 2.4053735503E-04
+  VERIFY_IS_APPROX(x[6], -1.231450e-07); // should be : -1.2314450199E-07
+
+}
+
+struct misra1d_functor : DenseFunctor<double>
+{
+    misra1d_functor(void) : DenseFunctor<double>(2,14) {}
+    static const double x[14];
+    static const double y[14];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*b[1]*x[i]/(1.+b[1]*x[i]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            double den = 1.+b[1]*x[i];
+            fjac(i,0) = b[1]*x[i] / den;
+            fjac(i,1) = b[0]*x[i]*(den-b[1]*x[i])/den/den;
+        }
+        return 0;
+    }
+};
+const double misra1d_functor::x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1d_functor::y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1d.shtml
+void testNistMisra1d(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1d_functor functor;
+  LevenbergMarquardt<misra1d_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 9);
+  VERIFY_IS_EQUAL(lm.njev(), 7);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+
+  /*
+   * Second try
+   */
+  x<< 450., 0.0003;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 4);
+  VERIFY_IS_EQUAL(lm.njev(), 3);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+}
+
+
+struct lanczos1_functor : DenseFunctor<double>
+{
+    lanczos1_functor(void) : DenseFunctor<double>(6,24) {}
+    static const double x[24];
+    static const double y[24];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==6);
+        assert(fvec.size()==24);
+        for(int i=0; i<24; i++)
+            fvec[i] = b[0]*exp(-b[1]*x[i]) + b[2]*exp(-b[3]*x[i]) + b[4]*exp(-b[5]*x[i])  - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==6);
+        assert(fjac.rows()==24);
+        assert(fjac.cols()==6);
+        for(int i=0; i<24; i++) {
+            fjac(i,0) = exp(-b[1]*x[i]);
+            fjac(i,1) = -b[0]*x[i]*exp(-b[1]*x[i]);
+            fjac(i,2) = exp(-b[3]*x[i]);
+            fjac(i,3) = -b[2]*x[i]*exp(-b[3]*x[i]);
+            fjac(i,4) = exp(-b[5]*x[i]);
+            fjac(i,5) = -b[4]*x[i]*exp(-b[5]*x[i]);
+        }
+        return 0;
+    }
+};
+const double lanczos1_functor::x[24] = { 0.000000000000E+00, 5.000000000000E-02, 1.000000000000E-01, 1.500000000000E-01, 2.000000000000E-01, 2.500000000000E-01, 3.000000000000E-01, 3.500000000000E-01, 4.000000000000E-01, 4.500000000000E-01, 5.000000000000E-01, 5.500000000000E-01, 6.000000000000E-01, 6.500000000000E-01, 7.000000000000E-01, 7.500000000000E-01, 8.000000000000E-01, 8.500000000000E-01, 9.000000000000E-01, 9.500000000000E-01, 1.000000000000E+00, 1.050000000000E+00, 1.100000000000E+00, 1.150000000000E+00 };
+const double lanczos1_functor::y[24] = { 2.513400000000E+00 ,2.044333373291E+00 ,1.668404436564E+00 ,1.366418021208E+00 ,1.123232487372E+00 ,9.268897180037E-01 ,7.679338563728E-01 ,6.388775523106E-01 ,5.337835317402E-01 ,4.479363617347E-01 ,3.775847884350E-01 ,3.197393199326E-01 ,2.720130773746E-01 ,2.324965529032E-01 ,1.996589546065E-01 ,1.722704126914E-01 ,1.493405660168E-01 ,1.300700206922E-01 ,1.138119324644E-01 ,1.000415587559E-01 ,8.833209084540E-02 ,7.833544019350E-02 ,6.976693743449E-02 ,6.239312536719E-02 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/lanczos1.shtml
+void testNistLanczos1(void)
+{
+  const int n=6;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1.2, 0.3, 5.6, 5.5, 6.5, 7.6;
+  // do the computation
+  lanczos1_functor functor;
+  LevenbergMarquardt<lanczos1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 79);
+  VERIFY_IS_EQUAL(lm.njev(), 72);
+  // check norm^2
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+  /*
+   * Second try
+   */
+  x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 9);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+}
+
+struct rat42_functor : DenseFunctor<double>
+{
+    rat42_functor(void) : DenseFunctor<double>(3,9) {}
+    static const double x[9];
+    static const double y[9];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==9);
+        for(int i=0; i<9; i++) {
+            fvec[i] = b[0] / (1.+exp(b[1]-b[2]*x[i])) - y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==9);
+        assert(fjac.cols()==3);
+        for(int i=0; i<9; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            fjac(i,0) = 1./(1.+e);
+            fjac(i,1) = -b[0]*e/(1.+e)/(1.+e);
+            fjac(i,2) = +b[0]*e*x[i]/(1.+e)/(1.+e);
+        }
+        return 0;
+    }
+};
+const double rat42_functor::x[9] = { 9.000E0, 14.000E0, 21.000E0, 28.000E0, 42.000E0, 57.000E0, 63.000E0, 70.000E0, 79.000E0 };
+const double rat42_functor::y[9] = { 8.930E0 ,10.800E0 ,18.590E0 ,22.330E0 ,39.350E0 ,56.110E0 ,61.730E0 ,64.620E0 ,67.080E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky2.shtml
+void testNistRat42(void)
+{
+  const int n=3;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 1., 0.1;
+  // do the computation
+  rat42_functor functor;
+  LevenbergMarquardt<rat42_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 10);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+
+  /*
+   * Second try
+   */
+  x<< 75., 2.5, 0.07;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 6);
+  VERIFY_IS_EQUAL(lm.njev(), 5);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+}
+
+struct MGH10_functor : DenseFunctor<double>
+{
+    MGH10_functor(void) : DenseFunctor<double>(3,16) {}
+    static const double x[16];
+    static const double y[16];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==16);
+        for(int i=0; i<16; i++)
+            fvec[i] =  b[0] * exp(b[1]/(x[i]+b[2])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==16);
+        assert(fjac.cols()==3);
+        for(int i=0; i<16; i++) {
+            double factor = 1./(x[i]+b[2]);
+            double e = exp(b[1]*factor);
+            fjac(i,0) = e;
+            fjac(i,1) = b[0]*factor*e;
+            fjac(i,2) = -b[1]*b[0]*factor*factor*e;
+        }
+        return 0;
+    }
+};
+const double MGH10_functor::x[16] = { 5.000000E+01, 5.500000E+01, 6.000000E+01, 6.500000E+01, 7.000000E+01, 7.500000E+01, 8.000000E+01, 8.500000E+01, 9.000000E+01, 9.500000E+01, 1.000000E+02, 1.050000E+02, 1.100000E+02, 1.150000E+02, 1.200000E+02, 1.250000E+02 };
+const double MGH10_functor::y[16] = { 3.478000E+04, 2.861000E+04, 2.365000E+04, 1.963000E+04, 1.637000E+04, 1.372000E+04, 1.154000E+04, 9.744000E+03, 8.261000E+03, 7.030000E+03, 6.005000E+03, 5.147000E+03, 4.427000E+03, 3.820000E+03, 3.307000E+03, 2.872000E+03 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh10.shtml
+void testNistMGH10(void)
+{
+  const int n=3;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 2., 400000., 25000.;
+  // do the computation
+  MGH10_functor functor;
+  LevenbergMarquardt<MGH10_functor> lm(functor);
+  info = lm.minimize(x);
+  ++g_test_level;
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  --g_test_level;
+  // was: VERIFY_IS_EQUAL(info, 1);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+  
+  // check return value
+
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 284 );
+  VERIFY_IS_EQUAL(lm.njev(), 249 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL);
+
+  /*
+   * Second try
+   */
+  x<< 0.02, 4000., 250.;
+  // do the computation
+  info = lm.minimize(x);
+  ++g_test_level;
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
+  --g_test_level;
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+  
+  // check return value
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 126);
+  VERIFY_IS_EQUAL(lm.njev(), 116);
+  --g_test_level;
+  VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL);
+}
+
+
+struct BoxBOD_functor : DenseFunctor<double>
+{
+    BoxBOD_functor(void) : DenseFunctor<double>(2,6) {}
+    static const double x[6];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        static const double y[6] = { 109., 149., 149., 191., 213., 224. };
+        assert(b.size()==2);
+        assert(fvec.size()==6);
+        for(int i=0; i<6; i++)
+            fvec[i] =  b[0]*(1.-exp(-b[1]*x[i])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==6);
+        assert(fjac.cols()==2);
+        for(int i=0; i<6; i++) {
+            double e = exp(-b[1]*x[i]);
+            fjac(i,0) = 1.-e;
+            fjac(i,1) = b[0]*x[i]*e;
+        }
+        return 0;
+    }
+};
+const double BoxBOD_functor::x[6] = { 1., 2., 3., 5., 7., 10. };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/boxbod.shtml
+void testNistBoxBOD(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 1.;
+  // do the computation
+  BoxBOD_functor functor;
+  LevenbergMarquardt<BoxBOD_functor> lm(functor);
+  lm.setFtol(1.E6*NumTraits<double>::epsilon());
+  lm.setXtol(1.E6*NumTraits<double>::epsilon());
+  lm.setFactor(10);
+  info = lm.minimize(x);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+  
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY(lm.nfev() < 31); // 31
+  VERIFY(lm.njev() < 25); // 25
+
+  /*
+   * Second try
+   */
+  x<< 100., 0.75;
+  // do the computation
+  lm.resetParameters();
+  lm.setFtol(NumTraits<double>::epsilon());
+  lm.setXtol( NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); 
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 16 );
+  VERIFY_IS_EQUAL(lm.njev(), 15 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+}
+
+struct MGH17_functor : DenseFunctor<double>
+{
+    MGH17_functor(void) : DenseFunctor<double>(5,33) {}
+    static const double x[33];
+    static const double y[33];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==5);
+        assert(fvec.size()==33);
+        for(int i=0; i<33; i++)
+            fvec[i] =  b[0] + b[1]*exp(-b[3]*x[i]) +  b[2]*exp(-b[4]*x[i]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==5);
+        assert(fjac.rows()==33);
+        assert(fjac.cols()==5);
+        for(int i=0; i<33; i++) {
+            fjac(i,0) = 1.;
+            fjac(i,1) = exp(-b[3]*x[i]);
+            fjac(i,2) = exp(-b[4]*x[i]);
+            fjac(i,3) = -x[i]*b[1]*exp(-b[3]*x[i]);
+            fjac(i,4) = -x[i]*b[2]*exp(-b[4]*x[i]);
+        }
+        return 0;
+    }
+};
+const double MGH17_functor::x[33] = { 0.000000E+00, 1.000000E+01, 2.000000E+01, 3.000000E+01, 4.000000E+01, 5.000000E+01, 6.000000E+01, 7.000000E+01, 8.000000E+01, 9.000000E+01, 1.000000E+02, 1.100000E+02, 1.200000E+02, 1.300000E+02, 1.400000E+02, 1.500000E+02, 1.600000E+02, 1.700000E+02, 1.800000E+02, 1.900000E+02, 2.000000E+02, 2.100000E+02, 2.200000E+02, 2.300000E+02, 2.400000E+02, 2.500000E+02, 2.600000E+02, 2.700000E+02, 2.800000E+02, 2.900000E+02, 3.000000E+02, 3.100000E+02, 3.200000E+02 };
+const double MGH17_functor::y[33] = { 8.440000E-01, 9.080000E-01, 9.320000E-01, 9.360000E-01, 9.250000E-01, 9.080000E-01, 8.810000E-01, 8.500000E-01, 8.180000E-01, 7.840000E-01, 7.510000E-01, 7.180000E-01, 6.850000E-01, 6.580000E-01, 6.280000E-01, 6.030000E-01, 5.800000E-01, 5.580000E-01, 5.380000E-01, 5.220000E-01, 5.060000E-01, 4.900000E-01, 4.780000E-01, 4.670000E-01, 4.570000E-01, 4.480000E-01, 4.380000E-01, 4.310000E-01, 4.240000E-01, 4.200000E-01, 4.140000E-01, 4.110000E-01, 4.060000E-01 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh17.shtml
+void testNistMGH17(void)
+{
+  const int n=5;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 50., 150., -100., 1., 2.;
+  // do the computation
+  MGH17_functor functor;
+  LevenbergMarquardt<MGH17_functor> lm(functor);
+  lm.setFtol(NumTraits<double>::epsilon());
+  lm.setXtol(NumTraits<double>::epsilon());
+  lm.setMaxfev(1000);
+  info = lm.minimize(x);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+  
+    // check return value
+//   VERIFY_IS_EQUAL(info, 2);  //FIXME Use (lm.info() == Success)
+  VERIFY(lm.nfev() < 700 ); // 602
+  VERIFY(lm.njev() < 600 ); // 545
+
+  /*
+   * Second try
+   */
+  x<< 0.5  ,1.5  ,-1   ,0.01 ,0.02;
+  // do the computation
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 18);
+  VERIFY_IS_EQUAL(lm.njev(), 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+}
+
+struct MGH09_functor : DenseFunctor<double>
+{
+    MGH09_functor(void) : DenseFunctor<double>(4,11) {}
+    static const double _x[11];
+    static const double y[11];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==11);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            fvec[i] = b[0]*(xx+x*b[1])/(xx+x*b[2]+b[3]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==11);
+        assert(fjac.cols()==4);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            double factor = 1./(xx+x*b[2]+b[3]);
+            fjac(i,0) = (xx+x*b[1]) * factor;
+            fjac(i,1) = b[0]*x* factor;
+            fjac(i,2) = - b[0]*(xx+x*b[1]) * x * factor * factor;
+            fjac(i,3) = - b[0]*(xx+x*b[1]) * factor * factor;
+        }
+        return 0;
+    }
+};
+const double MGH09_functor::_x[11] = { 4., 2., 1., 5.E-1 , 2.5E-01, 1.670000E-01, 1.250000E-01,  1.E-01, 8.330000E-02, 7.140000E-02, 6.250000E-02 };
+const double MGH09_functor::y[11] = { 1.957000E-01, 1.947000E-01, 1.735000E-01, 1.600000E-01, 8.440000E-02, 6.270000E-02, 4.560000E-02, 3.420000E-02, 3.230000E-02, 2.350000E-02, 2.460000E-02 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh09.shtml
+void testNistMGH09(void)
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 25., 39, 41.5, 39.;
+  // do the computation
+  MGH09_functor functor;
+  LevenbergMarquardt<MGH09_functor> lm(functor);
+  lm.setMaxfev(1000);
+  info = lm.minimize(x);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.1928077089); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126423573); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); 
+  VERIFY(lm.nfev() < 510 ); // 490
+  VERIFY(lm.njev() < 400 ); // 376
+
+  /*
+   * Second try
+   */
+  x<< 0.25, 0.39, 0.415, 0.39;
+  // do the computation
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 18);
+  VERIFY_IS_EQUAL(lm.njev(), 16);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.19280781); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126265); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305280); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605322); // should be 1.3606233068E-01
+}
+
+
+
+struct Bennett5_functor : DenseFunctor<double>
+{
+    Bennett5_functor(void) : DenseFunctor<double>(3,154) {}
+    static const double x[154];
+    static const double y[154];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==154);
+        for(int i=0; i<154; i++)
+            fvec[i] = b[0]* pow(b[1]+x[i],-1./b[2]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==154);
+        assert(fjac.cols()==3);
+        for(int i=0; i<154; i++) {
+            double e = pow(b[1]+x[i],-1./b[2]);
+            fjac(i,0) = e;
+            fjac(i,1) = - b[0]*e/b[2]/(b[1]+x[i]);
+            fjac(i,2) = b[0]*e*log(b[1]+x[i])/b[2]/b[2];
+        }
+        return 0;
+    }
+};
+const double Bennett5_functor::x[154] = { 7.447168E0, 8.102586E0, 8.452547E0, 8.711278E0, 8.916774E0, 9.087155E0, 9.232590E0, 9.359535E0, 9.472166E0, 9.573384E0, 9.665293E0, 9.749461E0, 9.827092E0, 9.899128E0, 9.966321E0, 10.029280E0, 10.088510E0, 10.144430E0, 10.197380E0, 10.247670E0, 10.295560E0, 10.341250E0, 10.384950E0, 10.426820E0, 10.467000E0, 10.505640E0, 10.542830E0, 10.578690E0, 10.613310E0, 10.646780E0, 10.679150E0, 10.710520E0, 10.740920E0, 10.770440E0, 10.799100E0, 10.826970E0, 10.854080E0, 10.880470E0, 10.906190E0, 10.931260E0, 10.955720E0, 10.979590E0, 11.002910E0, 11.025700E0, 11.047980E0, 11.069770E0, 11.091100E0, 11.111980E0, 11.132440E0, 11.152480E0, 11.172130E0, 11.191410E0, 11.210310E0, 11.228870E0, 11.247090E0, 11.264980E0, 11.282560E0, 11.299840E0, 11.316820E0, 11.333520E0, 11.349940E0, 11.366100E0, 11.382000E0, 11.397660E0, 11.413070E0, 11.428240E0, 11.443200E0, 11.457930E0, 11.472440E0, 11.486750E0, 11.500860E0, 11.514770E0, 11.528490E0, 11.542020E0, 11.555380E0, 11.568550E0,
+11.581560E0, 11.594420E0, 11.607121E0, 11.619640E0, 11.632000E0, 11.644210E0, 11.656280E0, 11.668200E0, 11.679980E0, 11.691620E0, 11.703130E0, 11.714510E0, 11.725760E0, 11.736880E0, 11.747890E0, 11.758780E0, 11.769550E0, 11.780200E0, 11.790730E0, 11.801160E0, 11.811480E0, 11.821700E0, 11.831810E0, 11.841820E0, 11.851730E0, 11.861550E0, 11.871270E0, 11.880890E0, 11.890420E0, 11.899870E0, 11.909220E0, 11.918490E0, 11.927680E0, 11.936780E0, 11.945790E0, 11.954730E0, 11.963590E0, 11.972370E0, 11.981070E0, 11.989700E0, 11.998260E0, 12.006740E0, 12.015150E0, 12.023490E0, 12.031760E0, 12.039970E0, 12.048100E0, 12.056170E0, 12.064180E0, 12.072120E0, 12.080010E0, 12.087820E0, 12.095580E0, 12.103280E0, 12.110920E0, 12.118500E0, 12.126030E0, 12.133500E0, 12.140910E0, 12.148270E0, 12.155570E0, 12.162830E0, 12.170030E0, 12.177170E0, 12.184270E0, 12.191320E0, 12.198320E0, 12.205270E0, 12.212170E0, 12.219030E0, 12.225840E0, 12.232600E0, 12.239320E0, 12.245990E0, 12.252620E0, 12.259200E0, 12.265750E0, 12.272240E0 };
+const double Bennett5_functor::y[154] = { -34.834702E0 ,-34.393200E0 ,-34.152901E0 ,-33.979099E0 ,-33.845901E0 ,-33.732899E0 ,-33.640301E0 ,-33.559200E0 ,-33.486801E0 ,-33.423100E0 ,-33.365101E0 ,-33.313000E0 ,-33.260899E0 ,-33.217400E0 ,-33.176899E0 ,-33.139198E0 ,-33.101601E0 ,-33.066799E0 ,-33.035000E0 ,-33.003101E0 ,-32.971298E0 ,-32.942299E0 ,-32.916302E0 ,-32.890202E0 ,-32.864101E0 ,-32.841000E0 ,-32.817799E0 ,-32.797501E0 ,-32.774300E0 ,-32.757000E0 ,-32.733799E0 ,-32.716400E0 ,-32.699100E0 ,-32.678799E0 ,-32.661400E0 ,-32.644001E0 ,-32.626701E0 ,-32.612202E0 ,-32.597698E0 ,-32.583199E0 ,-32.568699E0 ,-32.554298E0 ,-32.539799E0 ,-32.525299E0 ,-32.510799E0 ,-32.499199E0 ,-32.487598E0 ,-32.473202E0 ,-32.461601E0 ,-32.435501E0 ,-32.435501E0 ,-32.426800E0 ,-32.412300E0 ,-32.400799E0 ,-32.392101E0 ,-32.380501E0 ,-32.366001E0 ,-32.357300E0 ,-32.348598E0 ,-32.339901E0 ,-32.328400E0 ,-32.319698E0 ,-32.311001E0 ,-32.299400E0 ,-32.290699E0 ,-32.282001E0 ,-32.273300E0 ,-32.264599E0 ,-32.256001E0 ,-32.247299E0
+,-32.238602E0 ,-32.229900E0 ,-32.224098E0 ,-32.215401E0 ,-32.203800E0 ,-32.198002E0 ,-32.189400E0 ,-32.183601E0 ,-32.174900E0 ,-32.169102E0 ,-32.163300E0 ,-32.154598E0 ,-32.145901E0 ,-32.140099E0 ,-32.131401E0 ,-32.125599E0 ,-32.119801E0 ,-32.111198E0 ,-32.105400E0 ,-32.096699E0 ,-32.090900E0 ,-32.088001E0 ,-32.079300E0 ,-32.073502E0 ,-32.067699E0 ,-32.061901E0 ,-32.056099E0 ,-32.050301E0 ,-32.044498E0 ,-32.038799E0 ,-32.033001E0 ,-32.027199E0 ,-32.024300E0 ,-32.018501E0 ,-32.012699E0 ,-32.004002E0 ,-32.001099E0 ,-31.995300E0 ,-31.989500E0 ,-31.983700E0 ,-31.977900E0 ,-31.972099E0 ,-31.969299E0 ,-31.963501E0 ,-31.957701E0 ,-31.951900E0 ,-31.946100E0 ,-31.940300E0 ,-31.937401E0 ,-31.931601E0 ,-31.925800E0 ,-31.922899E0 ,-31.917101E0 ,-31.911301E0 ,-31.908400E0 ,-31.902599E0 ,-31.896900E0 ,-31.893999E0 ,-31.888201E0 ,-31.885300E0 ,-31.882401E0 ,-31.876600E0 ,-31.873699E0 ,-31.867901E0 ,-31.862101E0 ,-31.859200E0 ,-31.856300E0 ,-31.850500E0 ,-31.844700E0 ,-31.841801E0 ,-31.838900E0 ,-31.833099E0 ,-31.830200E0 ,
+-31.827299E0 ,-31.821600E0 ,-31.818701E0 ,-31.812901E0 ,-31.809999E0 ,-31.807100E0 ,-31.801300E0 ,-31.798401E0 ,-31.795500E0 ,-31.789700E0 ,-31.786800E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/bennett5.shtml
+void testNistBennett5(void)
+{
+  const int  n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< -2000., 50., 0.8;
+  // do the computation
+  Bennett5_functor functor;
+  LevenbergMarquardt<Bennett5_functor> lm(functor);
+  lm.setMaxfev(1000);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 758);
+  VERIFY_IS_EQUAL(lm.njev(), 744);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2.5235058043E+03);
+  VERIFY_IS_APPROX(x[1], 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 9.3218483193E-01);
+  /*
+   * Second try
+   */
+  x<< -1500., 45., 0.85;
+  // do the computation
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 203);
+  VERIFY_IS_EQUAL(lm.njev(), 192);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2523.3007865); // should be -2.5235058043E+03
+  VERIFY_IS_APPROX(x[1], 46.735705771); // should be 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 0.93219881891); // should be 9.3218483193E-01);
+}
+
+struct thurber_functor : DenseFunctor<double>
+{
+    thurber_functor(void) : DenseFunctor<double>(7,37) {}
+    static const double _x[37];
+    static const double _y[37];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+        assert(b.size()==7);
+        assert(fvec.size()==37);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - _y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==37);
+        assert(fjac.cols()==7);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx);
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact;
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double thurber_functor::_x[37] = { -3.067E0, -2.981E0, -2.921E0, -2.912E0, -2.840E0, -2.797E0, -2.702E0, -2.699E0, -2.633E0, -2.481E0, -2.363E0, -2.322E0, -1.501E0, -1.460E0, -1.274E0, -1.212E0, -1.100E0, -1.046E0, -0.915E0, -0.714E0, -0.566E0, -0.545E0, -0.400E0, -0.309E0, -0.109E0, -0.103E0, 0.010E0, 0.119E0, 0.377E0, 0.790E0, 0.963E0, 1.006E0, 1.115E0, 1.572E0, 1.841E0, 2.047E0, 2.200E0 };
+const double thurber_functor::_y[37] = { 80.574E0, 84.248E0, 87.264E0, 87.195E0, 89.076E0, 89.608E0, 89.868E0, 90.101E0, 92.405E0, 95.854E0, 100.696E0, 101.060E0, 401.672E0, 390.724E0, 567.534E0, 635.316E0, 733.054E0, 759.087E0, 894.206E0, 990.785E0, 1090.109E0, 1080.914E0, 1122.643E0, 1178.351E0, 1260.531E0, 1273.514E0, 1288.339E0, 1327.543E0, 1353.863E0, 1414.509E0, 1425.208E0, 1421.384E0, 1442.962E0, 1464.350E0, 1468.705E0, 1447.894E0, 1457.628E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/thurber.shtml
+void testNistThurber(void)
+{
+  const int n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1000 ,1000 ,400 ,40 ,0.7,0.3,0.0 ;
+  // do the computation
+  thurber_functor functor;
+  LevenbergMarquardt<thurber_functor> lm(functor);
+  lm.setFtol(1.E4*NumTraits<double>::epsilon());
+  lm.setXtol(1.E4*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 39);
+  VERIFY_IS_EQUAL(lm.njev(), 36);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+
+  /*
+   * Second try
+   */
+  x<< 1300 ,1500 ,500  ,75   ,1    ,0.4  ,0.05  ;
+  // do the computation
+  lm.resetParameters();
+  lm.setFtol(1.E4*NumTraits<double>::epsilon());
+  lm.setXtol(1.E4*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 29);
+  VERIFY_IS_EQUAL(lm.njev(), 28);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+}
+
+struct rat43_functor : DenseFunctor<double>
+{
+    rat43_functor(void) : DenseFunctor<double>(4,15) {}
+    static const double x[15];
+    static const double y[15];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==15);
+        for(int i=0; i<15; i++)
+            fvec[i] = b[0] * pow(1.+exp(b[1]-b[2]*x[i]),-1./b[3]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==15);
+        assert(fjac.cols()==4);
+        for(int i=0; i<15; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            double power = -1./b[3];
+            fjac(i,0) = pow(1.+e, power);
+            fjac(i,1) = power*b[0]*e*pow(1.+e, power-1.);
+            fjac(i,2) = -power*b[0]*e*x[i]*pow(1.+e, power-1.);
+            fjac(i,3) = b[0]*power*power*log(1.+e)*pow(1.+e, power);
+        }
+        return 0;
+    }
+};
+const double rat43_functor::x[15] = { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15. };
+const double rat43_functor::y[15] = { 16.08, 33.83, 65.80, 97.20, 191.55, 326.20, 386.87, 520.53, 590.03, 651.92, 724.93, 699.56, 689.96, 637.56, 717.41 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky3.shtml
+void testNistRat43(void)
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 10., 1., 1.;
+  // do the computation
+  rat43_functor functor;
+  LevenbergMarquardt<rat43_functor> lm(functor);
+  lm.setFtol(1.E6*NumTraits<double>::epsilon());
+  lm.setXtol(1.E6*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 27);
+  VERIFY_IS_EQUAL(lm.njev(), 20);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+
+  /*
+   * Second try
+   */
+  x<< 700., 5., 0.75, 1.3;
+  // do the computation
+  lm.resetParameters();
+  lm.setFtol(1.E5*NumTraits<double>::epsilon());
+  lm.setXtol(1.E5*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 9);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+}
+
+
+
+struct eckerle4_functor : DenseFunctor<double>
+{
+    eckerle4_functor(void) : DenseFunctor<double>(3,35) {}
+    static const double x[35];
+    static const double y[35];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==35);
+        for(int i=0; i<35; i++)
+            fvec[i] = b[0]/b[1] * exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/(b[1]*b[1])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==35);
+        assert(fjac.cols()==3);
+        for(int i=0; i<35; i++) {
+            double b12 = b[1]*b[1];
+            double e = exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/b12);
+            fjac(i,0) = e / b[1];
+            fjac(i,1) = ((x[i]-b[2])*(x[i]-b[2])/b12-1.) * b[0]*e/b12;
+            fjac(i,2) = (x[i]-b[2])*e*b[0]/b[1]/b12;
+        }
+        return 0;
+    }
+};
+const double eckerle4_functor::x[35] = { 400.0, 405.0, 410.0, 415.0, 420.0, 425.0, 430.0, 435.0, 436.5, 438.0, 439.5, 441.0, 442.5, 444.0, 445.5, 447.0, 448.5, 450.0, 451.5, 453.0, 454.5, 456.0, 457.5, 459.0, 460.5, 462.0, 463.5, 465.0, 470.0, 475.0, 480.0, 485.0, 490.0, 495.0, 500.0};
+const double eckerle4_functor::y[35] = { 0.0001575, 0.0001699, 0.0002350, 0.0003102, 0.0004917, 0.0008710, 0.0017418, 0.0046400, 0.0065895, 0.0097302, 0.0149002, 0.0237310, 0.0401683, 0.0712559, 0.1264458, 0.2073413, 0.2902366, 0.3445623, 0.3698049, 0.3668534, 0.3106727, 0.2078154, 0.1164354, 0.0616764, 0.0337200, 0.0194023, 0.0117831, 0.0074357, 0.0022732, 0.0008800, 0.0004579, 0.0002345, 0.0001586, 0.0001143, 0.0000710 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/eckerle4.shtml
+void testNistEckerle4(void)
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 10., 500.;
+  // do the computation
+  eckerle4_functor functor;
+  LevenbergMarquardt<eckerle4_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 18);
+  VERIFY_IS_EQUAL(lm.njev(), 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+
+  /*
+   * Second try
+   */
+  x<< 1.5, 5., 450.;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 7);
+  VERIFY_IS_EQUAL(lm.njev(), 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+}
+
+EIGEN_DECLARE_TEST(levenberg_marquardt)
+{
+    // Tests using the examples provided by (c)minpack
+    CALL_SUBTEST(testLmder1());
+    CALL_SUBTEST(testLmder());
+    CALL_SUBTEST(testLmdif1());
+//     CALL_SUBTEST(testLmstr1());
+//     CALL_SUBTEST(testLmstr());
+    CALL_SUBTEST(testLmdif());
+
+    // NIST tests, level of difficulty = "Lower"
+    CALL_SUBTEST(testNistMisra1a());
+    CALL_SUBTEST(testNistChwirut2());
+
+    // NIST tests, level of difficulty = "Average"
+    CALL_SUBTEST(testNistHahn1());
+    CALL_SUBTEST(testNistMisra1d());
+    CALL_SUBTEST(testNistMGH17());
+    CALL_SUBTEST(testNistLanczos1());
+
+//     // NIST tests, level of difficulty = "Higher"
+    CALL_SUBTEST(testNistRat42());
+    CALL_SUBTEST(testNistMGH10());
+    CALL_SUBTEST(testNistBoxBOD());
+//     CALL_SUBTEST(testNistMGH09());
+    CALL_SUBTEST(testNistBennett5());
+    CALL_SUBTEST(testNistThurber());
+    CALL_SUBTEST(testNistRat43());
+    CALL_SUBTEST(testNistEckerle4());
+}

diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp
new file mode 100644
index 0000000..b032cbf
--- /dev/null
+++ b/unsupported/test/matrix_exponential.cpp

@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "matrix_functions.h"
+
+double binom(int n, int k)
+{
+  double res = 1;
+  for (int i=0; i<k; i++)
+    res = res * (n-k+i+1) / (i+1);
+  return res;
+}
+
+template <typename T>
+T expfn(T x, int)
+{
+  return std::exp(x);
+}
+
+template <typename T>
+void test2dRotation(double tol)
+{
+  Matrix<T,2,2> A, B, C;
+  T angle;
+
+  A << 0, 1, -1, 0;
+  for (int i=0; i<=20; i++)
+  {
+    angle = static_cast<T>(pow(10, i / 5. - 2));
+    B << std::cos(angle), std::sin(angle), -std::sin(angle), std::cos(angle);
+
+    C = (angle*A).matrixFunction(expfn);
+    std::cout << "test2dRotation: i = " << i << "   error funm = " << relerr(C, B);
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+
+    C = (angle*A).exp();
+    std::cout << "   error expm = " << relerr(C, B) << "\n";
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+  }
+}
+
+template <typename T>
+void test2dHyperbolicRotation(double tol)
+{
+  Matrix<std::complex<T>,2,2> A, B, C;
+  std::complex<T> imagUnit(0,1);
+  T angle, ch, sh;
+
+  for (int i=0; i<=20; i++)
+  {
+    angle = static_cast<T>((i-10) / 2.0);
+    ch = std::cosh(angle);
+    sh = std::sinh(angle);
+    A << 0, angle*imagUnit, -angle*imagUnit, 0;
+    B << ch, sh*imagUnit, -sh*imagUnit, ch;
+
+    C = A.matrixFunction(expfn);
+    std::cout << "test2dHyperbolicRotation: i = " << i << "   error funm = " << relerr(C, B);
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+
+    C = A.exp();
+    std::cout << "   error expm = " << relerr(C, B) << "\n";
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+  }
+}
+
+template <typename T>
+void testPascal(double tol)
+{
+  for (int size=1; size<20; size++)
+  {
+    Matrix<T,Dynamic,Dynamic> A(size,size), B(size,size), C(size,size);
+    A.setZero();
+    for (int i=0; i<size-1; i++)
+      A(i+1,i) = static_cast<T>(i+1);
+    B.setZero();
+    for (int i=0; i<size; i++)
+      for (int j=0; j<=i; j++)
+    B(i,j) = static_cast<T>(binom(i,j));
+
+    C = A.matrixFunction(expfn);
+    std::cout << "testPascal: size = " << size << "   error funm = " << relerr(C, B);
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+
+    C = A.exp();
+    std::cout << "   error expm = " << relerr(C, B) << "\n";
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+  }
+}
+
+template<typename MatrixType>
+void randomTest(const MatrixType& m, double tol)
+{
+  /* this test covers the following files:
+     Inverse.h
+  */
+  typename MatrixType::Index rows = m.rows();
+  typename MatrixType::Index cols = m.cols();
+  MatrixType m1(rows, cols), m2(rows, cols), identity = MatrixType::Identity(rows, cols);
+
+  typedef typename NumTraits<typename internal::traits<MatrixType>::Scalar>::Real RealScalar;
+
+  for(int i = 0; i < g_repeat; i++) {
+    m1 = MatrixType::Random(rows, cols);
+
+    m2 = m1.matrixFunction(expfn) * (-m1).matrixFunction(expfn);
+    std::cout << "randomTest: error funm = " << relerr(identity, m2);
+    VERIFY(identity.isApprox(m2, static_cast<RealScalar>(tol)));
+
+    m2 = m1.exp() * (-m1).exp();
+    std::cout << "   error expm = " << relerr(identity, m2) << "\n";
+    VERIFY(identity.isApprox(m2, static_cast<RealScalar>(tol)));
+  }
+}
+
+EIGEN_DECLARE_TEST(matrix_exponential)
+{
+  CALL_SUBTEST_2(test2dRotation<double>(1e-13));
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_8(test2dRotation<long double>(1e-13)); 
+  CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_8(test2dHyperbolicRotation<long double>(1e-14));
+  CALL_SUBTEST_6(testPascal<float>(1e-6));
+  CALL_SUBTEST_5(testPascal<double>(1e-15));
+  CALL_SUBTEST_2(randomTest(Matrix2d(), 1e-13));
+  CALL_SUBTEST_7(randomTest(Matrix<double,3,3,RowMajor>(), 1e-13));
+  CALL_SUBTEST_3(randomTest(Matrix4cd(), 1e-13));
+  CALL_SUBTEST_4(randomTest(MatrixXd(8,8), 1e-13));
+  CALL_SUBTEST_1(randomTest(Matrix2f(), 1e-4));
+  CALL_SUBTEST_5(randomTest(Matrix3cf(), 1e-4));
+  CALL_SUBTEST_1(randomTest(Matrix4f(), 1e-4));
+  CALL_SUBTEST_6(randomTest(MatrixXf(8,8), 1e-4));
+  CALL_SUBTEST_9(randomTest(Matrix<long double,Dynamic,Dynamic>(7,7), 1e-13));
+}

diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
new file mode 100644
index 0000000..6d75373
--- /dev/null
+++ b/unsupported/test/matrix_function.cpp

@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/MatrixFunctions>
+
+// Variant of VERIFY_IS_APPROX which uses absolute error instead of
+// relative error.
+#define VERIFY_IS_APPROX_ABS(a, b) VERIFY(test_isApprox_abs(a, b))
+
+template<typename Type1, typename Type2>
+inline bool test_isApprox_abs(const Type1& a, const Type2& b)
+{
+  return ((a-b).array().abs() < test_precision<typename Type1::RealScalar>()).all();
+}
+
+
+// Returns a matrix with eigenvalues clustered around 0, 1 and 2.
+template<typename MatrixType>
+MatrixType randomMatrixWithRealEivals(const Index size)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  MatrixType diag = MatrixType::Zero(size, size);
+  for (Index i = 0; i < size; ++i) {
+    diag(i, i) = Scalar(RealScalar(internal::random<int>(0,2)))
+      + internal::random<Scalar>() * Scalar(RealScalar(0.01));
+  }
+  MatrixType A = MatrixType::Random(size, size);
+  HouseholderQR<MatrixType> QRofA(A);
+  return QRofA.householderQ().inverse() * diag * QRofA.householderQ();
+}
+
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct randomMatrixWithImagEivals
+{
+  // Returns a matrix with eigenvalues clustered around 0 and +/- i.
+  static MatrixType run(const Index size);
+};
+
+// Partial specialization for real matrices
+template<typename MatrixType>
+struct randomMatrixWithImagEivals<MatrixType, 0>
+{
+  static MatrixType run(const Index size)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    MatrixType diag = MatrixType::Zero(size, size);
+    Index i = 0;
+    while (i < size) {
+      Index randomInt = internal::random<Index>(-1, 1);
+      if (randomInt == 0 || i == size-1) {
+        diag(i, i) = internal::random<Scalar>() * Scalar(0.01);
+        ++i;
+      } else {
+        Scalar alpha = Scalar(randomInt) + internal::random<Scalar>() * Scalar(0.01);
+        diag(i, i+1) = alpha;
+        diag(i+1, i) = -alpha;
+        i += 2;
+      }
+    }
+    MatrixType A = MatrixType::Random(size, size);
+    HouseholderQR<MatrixType> QRofA(A);
+    return QRofA.householderQ().inverse() * diag * QRofA.householderQ();
+  }
+};
+
+// Partial specialization for complex matrices
+template<typename MatrixType>
+struct randomMatrixWithImagEivals<MatrixType, 1>
+{
+  static MatrixType run(const Index size)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    const Scalar imagUnit(0, 1);
+    MatrixType diag = MatrixType::Zero(size, size);
+    for (Index i = 0; i < size; ++i) {
+      diag(i, i) = Scalar(RealScalar(internal::random<Index>(-1, 1))) * imagUnit
+        + internal::random<Scalar>() * Scalar(RealScalar(0.01));
+    }
+    MatrixType A = MatrixType::Random(size, size);
+    HouseholderQR<MatrixType> QRofA(A);
+    return QRofA.householderQ().inverse() * diag * QRofA.householderQ();
+  }
+};
+
+
+template<typename MatrixType>
+void testMatrixExponential(const MatrixType& A)
+{
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef std::complex<RealScalar> ComplexScalar;
+
+  VERIFY_IS_APPROX(A.exp(), A.matrixFunction(internal::stem_function_exp<ComplexScalar>));
+}
+
+template<typename MatrixType>
+void testMatrixLogarithm(const MatrixType& A)
+{
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  MatrixType scaledA;
+  RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
+  if (maxImagPartOfSpectrum >= RealScalar(0.9L * EIGEN_PI))
+    scaledA = A * RealScalar(0.9L * EIGEN_PI) / maxImagPartOfSpectrum;
+  else
+    scaledA = A;
+
+  // identity X.exp().log() = X only holds if Im(lambda) < pi for all eigenvalues of X
+  MatrixType expA = scaledA.exp();
+  MatrixType logExpA = expA.log();
+  VERIFY_IS_APPROX(logExpA, scaledA);
+}
+
+template<typename MatrixType>
+void testHyperbolicFunctions(const MatrixType& A)
+{
+  // Need to use absolute error because of possible cancellation when
+  // adding/subtracting expA and expmA.
+  VERIFY_IS_APPROX_ABS(A.sinh(), (A.exp() - (-A).exp()) / 2);
+  VERIFY_IS_APPROX_ABS(A.cosh(), (A.exp() + (-A).exp()) / 2);
+}
+
+template<typename MatrixType>
+void testGonioFunctions(const MatrixType& A)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef std::complex<RealScalar> ComplexScalar;
+  typedef Matrix<ComplexScalar, MatrixType::RowsAtCompileTime, 
+                 MatrixType::ColsAtCompileTime, MatrixType::Options> ComplexMatrix;
+
+  ComplexScalar imagUnit(0,1);
+  ComplexScalar two(2,0);
+
+  ComplexMatrix Ac = A.template cast<ComplexScalar>();
+  
+  ComplexMatrix exp_iA = (imagUnit * Ac).exp();
+  ComplexMatrix exp_miA = (-imagUnit * Ac).exp();
+  
+  ComplexMatrix sinAc = A.sin().template cast<ComplexScalar>();
+  VERIFY_IS_APPROX_ABS(sinAc, (exp_iA - exp_miA) / (two*imagUnit));
+  
+  ComplexMatrix cosAc = A.cos().template cast<ComplexScalar>();
+  VERIFY_IS_APPROX_ABS(cosAc, (exp_iA + exp_miA) / 2);
+}
+
+template<typename MatrixType>
+void testMatrix(const MatrixType& A)
+{
+  testMatrixExponential(A);
+  testMatrixLogarithm(A);
+  testHyperbolicFunctions(A);
+  testGonioFunctions(A);
+}
+
+template<typename MatrixType>
+void testMatrixType(const MatrixType& m)
+{
+  // Matrices with clustered eigenvalue lead to different code paths
+  // in MatrixFunction.h and are thus useful for testing.
+
+  const Index size = m.rows();
+  for (int i = 0; i < g_repeat; i++) {
+    testMatrix(MatrixType::Random(size, size).eval());
+    testMatrix(randomMatrixWithRealEivals<MatrixType>(size));
+    testMatrix(randomMatrixWithImagEivals<MatrixType>::run(size));
+  }
+}
+
+template<typename MatrixType>
+void testMapRef(const MatrixType& A)
+{
+  // Test if passing Ref and Map objects is possible
+  // (Regression test for Bug #1796)
+  Index size = A.rows();
+  MatrixType X; X.setRandom(size, size);
+  MatrixType Y(size,size);
+  Ref<      MatrixType> R(Y);
+  Ref<const MatrixType> Rc(X);
+  Map<      MatrixType> M(Y.data(), size, size);
+  Map<const MatrixType> Mc(X.data(), size, size);
+
+  X = X*X; // make sure sqrt is possible
+  Y = X.sqrt();
+  R = Rc.sqrt();
+  M = Mc.sqrt();
+  Y = X.exp();
+  R = Rc.exp();
+  M = Mc.exp();
+  X = Y; // make sure log is possible
+  Y = X.log();
+  R = Rc.log();
+  M = Mc.log();
+
+  Y = X.cos() + Rc.cos() + Mc.cos();
+  Y = X.sin() + Rc.sin() + Mc.sin();
+
+  Y = X.cosh() + Rc.cosh() + Mc.cosh();
+  Y = X.sinh() + Rc.sinh() + Mc.sinh();
+}
+
+
+EIGEN_DECLARE_TEST(matrix_function)
+{
+  CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
+  CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
+  CALL_SUBTEST_3(testMatrixType(MatrixXf(8,8)));
+  CALL_SUBTEST_4(testMatrixType(Matrix2d()));
+  CALL_SUBTEST_5(testMatrixType(Matrix<double,5,5,RowMajor>()));
+  CALL_SUBTEST_6(testMatrixType(Matrix4cd()));
+  CALL_SUBTEST_7(testMatrixType(MatrixXd(13,13)));
+
+  CALL_SUBTEST_1(testMapRef(Matrix<float,1,1>()));
+  CALL_SUBTEST_2(testMapRef(Matrix3cf()));
+  CALL_SUBTEST_3(testMapRef(MatrixXf(8,8)));
+  CALL_SUBTEST_7(testMapRef(MatrixXd(13,13)));
+}

diff --git a/unsupported/test/matrix_functions.h b/unsupported/test/matrix_functions.h
new file mode 100644
index 0000000..4e26364
--- /dev/null
+++ b/unsupported/test/matrix_functions.h

@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/MatrixFunctions>
+
+// For complex matrices, any matrix is fine.
+template<typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct processTriangularMatrix
+{
+  static void run(MatrixType&, MatrixType&, const MatrixType&)
+  { }
+};
+
+// For real matrices, make sure none of the eigenvalues are negative.
+template<typename MatrixType>
+struct processTriangularMatrix<MatrixType,0>
+{
+  static void run(MatrixType& m, MatrixType& T, const MatrixType& U)
+  {
+    const Index size = m.cols();
+
+    for (Index i=0; i < size; ++i) {
+      if (i == size - 1 || T.coeff(i+1,i) == 0)
+        T.coeffRef(i,i) = std::abs(T.coeff(i,i));
+      else
+        ++i;
+    }
+    m = U * T * U.transpose();
+  }
+};
+
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct generateTestMatrix;
+
+template <typename MatrixType>
+struct generateTestMatrix<MatrixType,0>
+{
+  static void run(MatrixType& result, typename MatrixType::Index size)
+  {
+    result = MatrixType::Random(size, size);
+    RealSchur<MatrixType> schur(result);
+    MatrixType T = schur.matrixT();
+    processTriangularMatrix<MatrixType>::run(result, T, schur.matrixU());
+  }
+};
+
+template <typename MatrixType>
+struct generateTestMatrix<MatrixType,1>
+{
+  static void run(MatrixType& result, typename MatrixType::Index size)
+  {
+    result = MatrixType::Random(size, size);
+  }
+};
+
+template <typename Derived, typename OtherDerived>
+typename Derived::RealScalar relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B)
+{
+  return std::sqrt((A - B).cwiseAbs2().sum() / (std::min)(A.cwiseAbs2().sum(), B.cwiseAbs2().sum()));
+}

diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
new file mode 100644
index 0000000..dbaf9db
--- /dev/null
+++ b/unsupported/test/matrix_power.cpp

@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "matrix_functions.h"
+
+template<typename T>
+void test2dRotation(const T& tol)
+{
+  Matrix<T,2,2> A, B, C;
+  T angle, c, s;
+
+  A << 0, 1, -1, 0;
+  MatrixPower<Matrix<T,2,2> > Apow(A);
+
+  for (int i=0; i<=20; ++i) {
+    angle = std::pow(T(10), T(i-10) / T(5.));
+    c = std::cos(angle);
+    s = std::sin(angle);
+    B << c, s, -s, c;
+
+    C = Apow(std::ldexp(angle,1) / T(EIGEN_PI));
+    std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
+    VERIFY(C.isApprox(B, tol));
+  }
+}
+
+template<typename T>
+void test2dHyperbolicRotation(const T& tol)
+{
+  Matrix<std::complex<T>,2,2> A, B, C;
+  T angle, ch = std::cosh((T)1);
+  std::complex<T> ish(0, std::sinh((T)1));
+
+  A << ch, ish, -ish, ch;
+  MatrixPower<Matrix<std::complex<T>,2,2> > Apow(A);
+
+  for (int i=0; i<=20; ++i) {
+    angle = std::ldexp(static_cast<T>(i-10), -1);
+    ch = std::cosh(angle);
+    ish = std::complex<T>(0, std::sinh(angle));
+    B << ch, ish, -ish, ch;
+
+    C = Apow(angle);
+    std::cout << "test2dHyperbolicRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
+    VERIFY(C.isApprox(B, tol));
+  }
+}
+
+template<typename T>
+void test3dRotation(const T& tol)
+{
+  Matrix<T,3,1> v;
+  T angle;
+
+  for (int i=0; i<=20; ++i) {
+    v = Matrix<T,3,1>::Random();
+    v.normalize();
+    angle = std::pow(T(10), T(i-10) / T(5.));
+    VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
+  }
+}
+
+template<typename MatrixType>
+void testGeneral(const MatrixType& m, const typename MatrixType::RealScalar& tol)
+{
+  typedef typename MatrixType::RealScalar RealScalar;
+  MatrixType m1, m2, m3, m4, m5;
+  RealScalar x, y;
+
+  for (int i=0; i < g_repeat; ++i) {
+    generateTestMatrix<MatrixType>::run(m1, m.rows());
+    MatrixPower<MatrixType> mpow(m1);
+
+    x = internal::random<RealScalar>();
+    y = internal::random<RealScalar>();
+    m2 = mpow(x);
+    m3 = mpow(y);
+
+    m4 = mpow(x+y);
+    m5.noalias() = m2 * m3;
+    VERIFY(m4.isApprox(m5, tol));
+
+    m4 = mpow(x*y);
+    m5 = m2.pow(y);
+    VERIFY(m4.isApprox(m5, tol));
+
+    m4 = (std::abs(x) * m1).pow(y);
+    m5 = std::pow(std::abs(x), y) * m3;
+    VERIFY(m4.isApprox(m5, tol));
+  }
+}
+
+template<typename MatrixType>
+void testSingular(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)
+{
+  // we need to pass by reference in order to prevent errors with
+  // MSVC for aligned data types ...
+  MatrixType& m = const_cast<MatrixType&>(m_const);
+
+  const int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex;
+  typedef typename internal::conditional<IsComplex, TriangularView<MatrixType,Upper>, const MatrixType&>::type TriangularType;
+  typename internal::conditional< IsComplex, ComplexSchur<MatrixType>, RealSchur<MatrixType> >::type schur;
+  MatrixType T;
+
+  for (int i=0; i < g_repeat; ++i) {
+    m.setRandom();
+    m.col(0).fill(0);
+
+    schur.compute(m);
+    T = schur.matrixT();
+    const MatrixType& U = schur.matrixU();
+    processTriangularMatrix<MatrixType>::run(m, T, U);
+    MatrixPower<MatrixType> mpow(m);
+
+    T = T.sqrt();
+    VERIFY(mpow(0.5L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+
+    T = T.sqrt();
+    VERIFY(mpow(0.25L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+
+    T = T.sqrt();
+    VERIFY(mpow(0.125L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+  }
+}
+
+template<typename MatrixType>
+void testLogThenExp(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)
+{
+  // we need to pass by reference in order to prevent errors with
+  // MSVC for aligned data types ...
+  MatrixType& m = const_cast<MatrixType&>(m_const);
+
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar x;
+
+  for (int i=0; i < g_repeat; ++i) {
+    generateTestMatrix<MatrixType>::run(m, m.rows());
+    x = internal::random<Scalar>();
+    VERIFY(m.pow(x).isApprox((x * m.log()).exp(), tol));
+  }
+}
+
+typedef Matrix<double,3,3,RowMajor>         Matrix3dRowMajor;
+typedef Matrix<long double,3,3>             Matrix3e;
+typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
+ 
+EIGEN_DECLARE_TEST(matrix_power)
+{
+  CALL_SUBTEST_2(test2dRotation<double>(1e-13));
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5f));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
+  CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5f));
+  CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
+
+  CALL_SUBTEST_10(test3dRotation<double>(1e-13));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5f));
+  CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
+
+  CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
+  CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3f)); // see bug 614
+  CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4f));
+  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
+
+  CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
+  CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3f));
+  CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4f));
+  CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
+
+  CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
+  CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3f));
+  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4f));
+  CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
+}

diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp
new file mode 100644
index 0000000..034f292
--- /dev/null
+++ b/unsupported/test/matrix_square_root.cpp

@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "matrix_functions.h"
+
+template<typename MatrixType>
+void testMatrixSqrt(const MatrixType& m)
+{
+  MatrixType A;
+  generateTestMatrix<MatrixType>::run(A, m.rows());
+  MatrixType sqrtA = A.sqrt();
+  VERIFY_IS_APPROX(sqrtA * sqrtA, A);
+}
+
+EIGEN_DECLARE_TEST(matrix_square_root)
+{
+  for (int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
+    CALL_SUBTEST_2(testMatrixSqrt(MatrixXcd(12,12)));
+    CALL_SUBTEST_3(testMatrixSqrt(Matrix4f()));
+    CALL_SUBTEST_4(testMatrixSqrt(Matrix<double,Dynamic,Dynamic,RowMajor>(9, 9)));
+    CALL_SUBTEST_5(testMatrixSqrt(Matrix<float,1,1>()));
+    CALL_SUBTEST_5(testMatrixSqrt(Matrix<std::complex<float>,1,1>()));
+  }
+}

diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
new file mode 100644
index 0000000..2eb40fe
--- /dev/null
+++ b/unsupported/test/minres.cpp

@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <cmath>
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_minres_T()
+{
+  // Identity preconditioner
+  MINRES<SparseMatrix<T>, Lower, IdentityPreconditioner    > minres_colmajor_lower_I;
+  MINRES<SparseMatrix<T>, Upper, IdentityPreconditioner    > minres_colmajor_upper_I;
+
+  // Diagonal preconditioner
+  MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_lower_diag;
+  MINRES<SparseMatrix<T>, Upper, DiagonalPreconditioner<T> > minres_colmajor_upper_diag;
+  MINRES<SparseMatrix<T>, Lower|Upper, DiagonalPreconditioner<T> > minres_colmajor_uplo_diag;
+  
+  // call tests for SPD matrix
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_I) );
+    
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag)  );
+    
+  // TO DO: symmetric semi-definite matrix
+  // TO DO: symmetric indefinite matrix
+
+}
+
+EIGEN_DECLARE_TEST(minres)
+{
+  CALL_SUBTEST_1(test_minres_T<double>());
+//  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
+
+}

diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
new file mode 100644
index 0000000..10beb07
--- /dev/null
+++ b/unsupported/test/mpreal_support.cpp

@@ -0,0 +1,66 @@
+#include <mpreal.h>  // Must be included before main.h.
+#include "main.h"
+#include <Eigen/MPRealSupport>
+#include <Eigen/LU>
+#include <Eigen/Eigenvalues>
+#include <sstream>
+
+using namespace mpfr;
+using namespace Eigen;
+
+EIGEN_DECLARE_TEST(mpreal_support)
+{
+  // set precision to 256 bits (double has only 53 bits)
+  mpreal::set_default_prec(256);
+  typedef Matrix<mpreal,Eigen::Dynamic,Eigen::Dynamic> MatrixXmp;
+  typedef Matrix<std::complex<mpreal>,Eigen::Dynamic,Eigen::Dynamic> MatrixXcmp;
+
+  std::cerr << "epsilon =         " << NumTraits<mpreal>::epsilon() << "\n";
+  std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n";
+  std::cerr << "highest =         " << NumTraits<mpreal>::highest() << "\n";
+  std::cerr << "lowest =          " << NumTraits<mpreal>::lowest() << "\n";
+  std::cerr << "digits10 =        " << NumTraits<mpreal>::digits10() << "\n";
+
+  for(int i = 0; i < g_repeat; i++) {
+    int s = Eigen::internal::random<int>(1,100);
+    MatrixXmp A = MatrixXmp::Random(s,s);
+    MatrixXmp B = MatrixXmp::Random(s,s);
+    MatrixXmp S = A.adjoint() * A;
+    MatrixXmp X;
+    MatrixXcmp Ac = MatrixXcmp::Random(s,s);
+    MatrixXcmp Bc = MatrixXcmp::Random(s,s);
+    MatrixXcmp Sc = Ac.adjoint() * Ac;
+    MatrixXcmp Xc;
+    
+    // Basic stuffs
+    VERIFY_IS_APPROX(A.real(), A);
+    VERIFY(Eigen::internal::isApprox(A.array().abs2().sum(), A.squaredNorm()));
+    VERIFY_IS_APPROX(A.array().exp(),         exp(A.array()));
+    VERIFY_IS_APPROX(A.array().abs2().sqrt(), A.array().abs());
+    VERIFY_IS_APPROX(A.array().sin(),         sin(A.array()));
+    VERIFY_IS_APPROX(A.array().cos(),         cos(A.array()));
+
+    // Cholesky
+    X = S.selfadjointView<Lower>().llt().solve(B);
+    VERIFY_IS_APPROX((S.selfadjointView<Lower>()*X).eval(),B);
+
+    Xc = Sc.selfadjointView<Lower>().llt().solve(Bc);
+    VERIFY_IS_APPROX((Sc.selfadjointView<Lower>()*Xc).eval(),Bc);
+    
+    // partial LU
+    X = A.lu().solve(B);
+    VERIFY_IS_APPROX((A*X).eval(),B);
+
+    // symmetric eigenvalues
+    SelfAdjointEigenSolver<MatrixXmp> eig(S);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    VERIFY( (S.selfadjointView<Lower>() * eig.eigenvectors()).isApprox(eig.eigenvectors() * eig.eigenvalues().asDiagonal(), NumTraits<mpreal>::dummy_precision()*1e3) );
+  }
+  
+  {
+    MatrixXmp A(8,3); A.setRandom();
+    // test output (interesting things happen in this code)
+    std::stringstream stream;
+    stream << A;
+  }
+}

diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp
new file mode 100644
index 0000000..1c44381
--- /dev/null
+++ b/unsupported/test/openglsupport.cpp

@@ -0,0 +1,600 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <main.h>
+#include <iostream>
+#include <string>
+
+#if defined(__APPLE_CC__)
+  // Prevent deprecation warnings caused by GLEW on MacOS.
+  #define GL_SILENCE_DEPRECATION 1
+#endif
+#include <GL/glew.h>
+#include <Eigen/OpenGLSupport>
+#if defined(__APPLE_CC__)
+  #include <GLUT/glut.h>
+#else
+  #include <GL/freeglut.h>
+#endif
+
+using namespace Eigen;
+
+#define VERIFY_MATRIX(CODE,REF) { \
+    glMatrixMode(GL_MODELVIEW); \
+    glLoadIdentity(); \
+    CODE; \
+    Matrix<float,4,4,ColMajor> m; m.setZero(); \
+    glGet(GL_MODELVIEW_MATRIX, m); \
+    if(!(REF).cast<float>().isApprox(m)) { \
+      std::cerr << "Expected:\n" << ((REF).cast<float>()) << "\n" << "got\n" << m << "\n\n"; \
+    } \
+    VERIFY_IS_APPROX((REF).cast<float>(), m); \
+  }
+
+#define VERIFY_UNIFORM(SUFFIX,NAME,TYPE) { \
+    TYPE value; value.setRandom(); \
+    TYPE data; \
+    int loc = glGetUniformLocation(prg_id, #NAME); \
+    VERIFY((loc!=-1) && "uniform not found"); \
+    glUniform(loc,value); \
+    EIGEN_CAT(glGetUniform,SUFFIX)(prg_id,loc,data.data()); \
+    if(!value.isApprox(data)) { \
+      std::cerr << "Expected:\n" << value << "\n" << "got\n" << data << "\n\n"; \
+    } \
+    VERIFY_IS_APPROX(value, data); \
+  }
+
+#define VERIFY_UNIFORMi(NAME,TYPE) { \
+    TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \
+    TYPE data; \
+    int loc = glGetUniformLocation(prg_id, #NAME); \
+    VERIFY((loc!=-1) && "uniform not found"); \
+    glUniform(loc,value); \
+    glGetUniformiv(prg_id,loc,(GLint*)data.data()); \
+    if(!value.isApprox(data)) { \
+      std::cerr << "Expected:\n" << value << "\n" << "got\n" << data << "\n\n"; \
+    } \
+    VERIFY_IS_APPROX(value, data); \
+  }
+
+void printProgramInfoLog(GLuint objectID)
+{
+    int infologLength, charsWritten;
+    GLchar *infoLog;
+    glGetProgramiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
+    if(infologLength > 0)
+    {
+        infoLog = new GLchar[infologLength];
+        glGetProgramInfoLog(objectID, infologLength, &charsWritten, infoLog);
+        if (charsWritten > 0)
+          std::cerr << "Program info : \n" << infoLog << std::endl;
+        delete[] infoLog;
+    }
+}
+
+void printShaderInfoLog(GLuint objectID)
+{
+    int infologLength, charsWritten;
+    GLchar *infoLog;
+    glGetShaderiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
+    if(infologLength > 0)
+    {
+        infoLog = new GLchar[infologLength];
+        glGetShaderInfoLog(objectID, infologLength, &charsWritten, infoLog);
+        if (charsWritten > 0)
+          std::cerr << "Shader info : \n" << infoLog << std::endl;
+        delete[] infoLog;
+    }
+}
+
+GLint createProgram(const char* vtx, const char* frg, bool print_errors = true)
+{
+  GLint prg_id = glCreateProgram();
+  GLint vtx_id = glCreateShader(GL_VERTEX_SHADER);
+  GLint frg_id = glCreateShader(GL_FRAGMENT_SHADER);
+  GLint ok;
+
+  glShaderSource(vtx_id, 1, &vtx, 0);
+  glCompileShader(vtx_id);
+  glGetShaderiv(vtx_id, GL_COMPILE_STATUS, &ok);
+  if(!ok)
+  {
+    if (print_errors)
+    {
+      std::cerr << "vtx compilation failed\n";
+      std::cerr << "Source:\n" << vtx << "\n";
+      printShaderInfoLog(vtx_id);
+    }
+    glDeleteShader(vtx_id);
+    return GL_ZERO;
+  }
+
+  glShaderSource(frg_id, 1, &frg, 0);
+  glCompileShader(frg_id);
+  glGetShaderiv(frg_id, GL_COMPILE_STATUS, &ok);
+  if(!ok)
+  {
+    if (print_errors)
+    {
+      std::cerr << "frg compilation failed.\n";
+      std::cerr << "Source:\n" << frg << "\n";
+      printShaderInfoLog(frg_id);
+    }
+    glDeleteShader(vtx_id);
+    glDeleteShader(frg_id);
+    return GL_ZERO;
+  }
+
+  glAttachShader(prg_id, vtx_id);
+  glAttachShader(prg_id, frg_id);
+  glLinkProgram(prg_id);
+
+  // Delete shaders once linked.
+  glDeleteShader(vtx_id);
+  glDeleteShader(frg_id);
+  glGetProgramiv(prg_id, GL_LINK_STATUS, &ok);
+  if(!ok)
+  {
+    if (print_errors)
+    {
+      std::cerr << "linking failed.\n";
+      printProgramInfoLog(prg_id);
+    }
+    glDeleteProgram(prg_id);
+    return GL_ZERO;
+  }
+
+  glUseProgram(prg_id);
+  return prg_id;
+}
+
+GLint createProgram(const std::string& vtx, const std::string& frg, bool print_errors = true)
+{
+  return createProgram(vtx.c_str(), frg.c_str(), print_errors);
+}
+
+std::string getGlslVersionString(int gl_major_version, int gl_minor_version)
+{
+  switch (gl_major_version)
+  {
+    case 2:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 110";
+        case 1:
+          return "#version 120";
+      }
+      break;
+    case 3:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 130";
+        case 1:
+          return "#version 140";
+        case 2:
+          return "#version 150";
+        case 3:
+          return "#version 330";
+      }
+      break;
+    case 4:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 400";
+        case 1:
+          return "#version 410";
+        case 2:
+          return "#version 420";
+        case 3:
+          return "#version 430";
+        case 4:
+          return "#version 440";
+        case 5:
+          return "#version 450";
+        case 6:
+          return "#version 460";
+      }
+      break;
+  }
+  return "";
+}
+
+void find_and_replace(
+  std::string& str,
+  const std::string& find,
+  const std::string& replace)
+{
+  size_t loc = 0;
+  size_t flen = find.length();
+  size_t rlen = replace.length();
+  while ( (loc = str.find(find, loc)) != std::string::npos) {
+    str.replace(loc, flen, replace);
+    loc += rlen;
+  }
+}
+
+// Finds and replaces a set of substrings in a string.
+std::string format(
+  const std::string& str,
+  const std::vector<std::string>& find,
+  const std::vector<std::string>& replace)
+{
+  std::string out = str;
+  for (std::size_t i=0; i<find.size(); ++i) {
+    find_and_replace(out, find[i], replace[i]);
+  }
+  return out;
+}
+
+// GLUT display function that runs test.  Must be run within the display loop
+// in order to properly destroy resources.
+void openglsupport_test_loop()
+{
+  // Get context info.
+  const GLubyte* gl_version_string = glGetString(GL_VERSION);
+  std::cerr << "GL version: " << gl_version_string << std::endl;
+  std::cerr << "GLSL version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl;
+  // Parse version from string since GL_MAJOR_VERSION is only supported in GL 3.0+.
+  // Version string guaranteed to be <major>.<minor><vender extension>.
+  GLint gl_major_version = gl_version_string[0] - '0';
+  GLint gl_minor_version = gl_version_string[2] - '0';
+  bool legacy_gl = gl_major_version < 3 || (gl_major_version == 3 && gl_minor_version < 2);
+
+  // Fixed-function pipeline removed in OpenGL 3.2.
+  if (legacy_gl)
+  {
+    // Draw a basic triangle.
+    Vector3f v3f;
+    Matrix3f rot;
+    glBegin(GL_POINTS);
+    {
+      glVertex(v3f);
+      glVertex(2*v3f+v3f);
+      glVertex(rot*v3f);
+    }
+    glEnd();
+
+    // 4x4 matrices
+    Matrix4f mf44; mf44.setRandom();
+    VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
+    VERIFY_MATRIX(glMultMatrix(mf44), mf44);
+    Matrix4d md44; md44.setRandom();
+    VERIFY_MATRIX(glLoadMatrix(md44), md44);
+    VERIFY_MATRIX(glMultMatrix(md44), md44);
+
+    // Quaternion
+    Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
+    VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
+
+    Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
+    VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
+
+    // 3D Transform
+    Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
+    VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
+
+    Transform<float,3,Affine> af3(acf3);
+    VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
+    VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
+
+    Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
+    VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
+
+    Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
+    VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
+
+    Transform<double,3,Affine> ad3(acd3);
+    VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
+    VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
+
+    Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
+    VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
+
+    // translations (2D and 3D)
+    {
+      Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
+      VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
+      Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
+      VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
+
+      Vector3f vf3; vf3.setRandom();
+      VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
+      Vector3d vd3; vd3.setRandom();
+      VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
+
+      Translation<float,3> tf3; tf3.vector().setRandom();
+      VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
+
+      Translation<double,3> td3;  td3.vector().setRandom();
+      VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+    }
+
+    // scaling (2D and 3D)
+    {
+      Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
+      VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
+      Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
+      VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
+
+      Vector3f vf3; vf3.setRandom();
+      VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
+      Vector3d vd3; vd3.setRandom();
+      VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
+
+      UniformScaling<float> usf(internal::random<float>());
+      VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
+
+      UniformScaling<double> usd(internal::random<double>());
+      VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+    }
+  } else {
+    std::cerr << "Warning: fixed-function pipeline was not tested.\n";
+  }
+
+  // Dynamic shader substitution variables.
+  // Modern shaders require a version string, and newer runtimes fail to
+  // compile old GLSL versions. Thus, we dynamically set the GLSL version
+  // string based on runtime. Also, pre OpenGL 3.0, the output gl_FragColor was
+  // built-in. This was deprecated in OpenGL 3.0, requiring us to explicitly
+  // define the output variable.
+  std::vector<std::string> glsl_vars;
+  glsl_vars.push_back("${GLSL_VERSION}");
+  glsl_vars.push_back("${FRAG_OUTPUT_DECLARATION}");
+  glsl_vars.push_back("${FRAG_OUTPUT_VARIABLE}");
+
+  std::vector<std::string> glsl_vals;
+  glsl_vals.push_back(getGlslVersionString(gl_major_version, gl_minor_version));
+  if (gl_major_version >= 3) {
+    glsl_vals.push_back("out vec4 fragColor;");
+    glsl_vals.push_back("fragColor");
+  } else {
+    glsl_vals.push_back("");
+    glsl_vals.push_back("gl_FragColor");
+  }
+
+  // uniform
+  {
+    // vertex shader.
+    std::string vtx = format(
+      "${GLSL_VERSION}\n"
+      "void main(void) {\n"
+      "  gl_Position = vec4(0,0,0,1);\n"
+      "}\n",
+      glsl_vars, glsl_vals);
+
+#ifdef GL_VERSION_2_0
+    if(GLEW_VERSION_2_0 && GL_VERSION_2_0)
+    {
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
+        "uniform vec2 v2f;\n"
+        "uniform vec3 v3f;\n"
+        "uniform vec4 v4f;\n"
+        "uniform ivec2 v2i;\n"
+        "uniform ivec3 v3i;\n"
+        "uniform ivec4 v4i;\n"
+        "uniform mat2 m2f;\n"
+        "uniform mat3 m3f;\n"
+        "uniform mat4 m4f;\n"
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) { \n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      VERIFY_UNIFORM(fv, v2f, Vector2f);
+      VERIFY_UNIFORM(fv, v3f, Vector3f);
+      VERIFY_UNIFORM(fv, v4f, Vector4f);
+      VERIFY_UNIFORMi(v2i, Vector2i);
+      VERIFY_UNIFORMi(v3i, Vector3i);
+      VERIFY_UNIFORMi(v4i, Vector4i);
+      VERIFY_UNIFORM(fv, m2f, Matrix2f);
+      VERIFY_UNIFORM(fv, m3f, Matrix3f);
+      VERIFY_UNIFORM(fv, m4f, Matrix4f);
+      glDeleteProgram(prg_id);
+    }
+    else
+#endif
+      std::cerr << "Warning: opengl 2.0 was not tested.\n";
+
+#ifdef GL_VERSION_2_1
+    if(GLEW_VERSION_2_1 && GL_VERSION_2_1 &&
+        (gl_major_version > 2 || (gl_major_version == 2 && gl_minor_version >= 1)))
+    {
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
+        "uniform mat2x3 m23f;\n"
+        "uniform mat3x2 m32f;\n"
+        "uniform mat2x4 m24f;\n"
+        "uniform mat4x2 m42f;\n"
+        "uniform mat3x4 m34f;\n"
+        "uniform mat4x3 m43f;\n"
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      typedef Matrix<float,2,3> Matrix23f;
+      typedef Matrix<float,3,2> Matrix32f;
+      typedef Matrix<float,2,4> Matrix24f;
+      typedef Matrix<float,4,2> Matrix42f;
+      typedef Matrix<float,3,4> Matrix34f;
+      typedef Matrix<float,4,3> Matrix43f;
+
+      VERIFY_UNIFORM(fv, m23f, Matrix23f);
+      VERIFY_UNIFORM(fv, m32f, Matrix32f);
+      VERIFY_UNIFORM(fv, m24f, Matrix24f);
+      VERIFY_UNIFORM(fv, m42f, Matrix42f);
+      VERIFY_UNIFORM(fv, m34f, Matrix34f);
+      VERIFY_UNIFORM(fv, m43f, Matrix43f);
+      glDeleteProgram(prg_id);
+    }
+    else
+#endif
+      std::cerr << "Warning: opengl 2.1 was not tested.\n";
+
+#ifdef GL_VERSION_3_0
+    if(GLEW_VERSION_3_0 && GL_VERSION_3_0 && gl_major_version >= 3)
+    {
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
+        "uniform uvec2 v2ui;\n"
+        "uniform uvec3 v3ui;\n"
+        "uniform uvec4 v4ui;\n"
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2ui[0]+v3ui[0]+v4ui[0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      typedef Matrix<unsigned int,2,1> Vector2ui;
+      typedef Matrix<unsigned int,3,1> Vector3ui;
+      typedef Matrix<unsigned int,4,1> Vector4ui;
+
+      VERIFY_UNIFORMi(v2ui, Vector2ui);
+      VERIFY_UNIFORMi(v3ui, Vector3ui);
+      VERIFY_UNIFORMi(v4ui, Vector4ui);
+      glDeleteProgram(prg_id);
+    }
+    else
+#endif
+      std::cerr << "Warning: opengl 3.0 was not tested.\n";
+
+    // dvecn supported if >= 4.1 or ARB_vertex_attrib_64bit
+    bool has_fp64_native = (gl_major_version == 4 && gl_minor_version >= 1);
+    bool has_fp64_extension = false;
+#ifdef GLEW_ARB_gpu_shader_fp64
+    if(GLEW_ARB_gpu_shader_fp64)
+    {
+      // Check that extension can actually be compiled.
+      if (has_fp64_extension)
+      {
+        std::string frg = format(
+          "${GLSL_VERSION}\n"
+          "#extension GL_ARB_gpu_shader_fp64 : enable\n"
+          "uniform dvec2 dv2;\n"
+          "${FRAG_OUTPUT_DECLARATION}\n"
+          "void main(void) {\n"
+          "  ${FRAG_OUTPUT_VARIABLE} = vec4(dv2.x, dv2.y, dv2.x, dv2.y);\n"
+          "}\n",
+          glsl_vars, glsl_vals);
+        GLint prg_id = createProgram(vtx, frg, /*print_errors=*/false);
+        if (prg_id)
+        {
+          has_fp64_extension = true;
+          glDeleteProgram(prg_id);
+        }
+      }
+    }
+#endif
+
+    if( has_fp64_native || has_fp64_extension )
+    {
+      std::vector<std::string> glsl_vars_with_extension = glsl_vars;
+      glsl_vars_with_extension.push_back("${GLSL_EXTENSIONS}");
+      std::vector<std::string> glsl_vals_with_extension = glsl_vals;
+      if (has_fp64_extension)
+      {
+        glsl_vals_with_extension.push_back("#extension GL_ARB_gpu_shader_fp64 : enable");
+      }
+      else
+      {
+        glsl_vals_with_extension.push_back("");
+      }
+
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
+        "${GLSL_EXTENSIONS}\n"
+        "uniform dvec2 v2d;\n"
+        "uniform dvec3 v3d;\n"
+        "uniform dvec4 v4d;\n"
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2d[0]+v3d[0]+v4d[0]);\n"
+        "}\n",
+        glsl_vars_with_extension, glsl_vals_with_extension);
+
+      GLint prg_id = createProgram(vtx,frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      VERIFY_UNIFORM(dv, v2d, Vector2d);
+      VERIFY_UNIFORM(dv, v3d, Vector3d);
+      VERIFY_UNIFORM(dv, v4d, Vector4d);
+      glDeleteProgram(prg_id);
+    }
+    else
+      std::cerr << "Warning: dvec (fp64) was not tested.\n";
+  }
+
+  // Exit loop - Leaving main loop is supported by freeglut, otherwise we
+  // are forced to exit.
+#ifdef FREEGLUT
+  glutLeaveMainLoop();
+  // Trigger another display loop iteration. Otherwise, it just hangs.
+  glutPostRedisplay();
+#else
+  exit(0);
+#endif
+}
+
+EIGEN_DECLARE_TEST(openglsupport)
+{
+  int argc = 0;
+  glutInit(&argc, 0);
+
+  GLint glut_display_mode = GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH;
+
+#ifndef EIGEN_LEGACY_OPENGL
+  // Initialize 3.2+ OpenGL context.
+#if defined(__APPLE_CC__)
+  glut_display_mode |= GLUT_3_2_CORE_PROFILE;
+#elif defined(FREEGLUT)
+  glutInitContextVersion(3, 2);
+  glutInitContextFlags(GLUT_FORWARD_COMPATIBLE);
+  glutInitContextProfile(GLUT_CORE_PROFILE);
+#endif
+#endif
+
+  glutInitDisplayMode(glut_display_mode);
+  glutInitWindowPosition(0, 0);
+  glutInitWindowSize(10, 10);
+
+  int window = glutCreateWindow("Eigen");
+  if(window <= 0)
+  {
+    std::cerr << "Error: Unable to create GLUT Window.\n";
+    exit(1);
+  }
+
+  glewExperimental = GL_TRUE;
+  if(glewInit() != GLEW_OK)
+  {
+    std::cerr << "Warning: Failed to initialize GLEW.\n";
+    exit(1);
+  }
+
+  // Run test in display, otherwise GLUT fails to clean up and leads to memory
+  // access errors on exit.
+  glutDisplayFunc(openglsupport_test_loop);
+  glutMainLoop();
+  glutDestroyWindow(window);
+}

diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp
new file mode 100644
index 0000000..4ff9bda
--- /dev/null
+++ b/unsupported/test/polynomialsolver.cpp

@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/Polynomials>
+#include <iostream>
+#include <algorithm>
+
+using namespace std;
+
+namespace Eigen {
+namespace internal {
+template<int Size>
+struct increment_if_fixed_size
+{
+  enum {
+    ret = (Size == Dynamic) ? Dynamic : Size+1
+  };
+};
+}
+}
+
+template<typename PolynomialType>
+PolynomialType polyder(const PolynomialType& p)
+{
+  typedef typename PolynomialType::Scalar Scalar;
+  PolynomialType res(p.size());
+  for(Index i=1; i<p.size(); ++i)
+    res[i-1] = p[i]*Scalar(i);
+  res[p.size()-1] = 0.;
+  return res;
+}
+
+template<int Deg, typename POLYNOMIAL, typename SOLVER>
+bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
+{
+  typedef typename POLYNOMIAL::Scalar Scalar;
+  typedef typename POLYNOMIAL::RealScalar RealScalar;
+
+  typedef typename SOLVER::RootsType    RootsType;
+  typedef Matrix<RealScalar,Deg,1>      EvalRootsType;
+
+  const Index deg = pols.size()-1;
+
+  // Test template constructor from coefficient vector
+  SOLVER solve_constr (pols);
+
+  psolve.compute( pols );
+  const RootsType& roots( psolve.roots() );
+  EvalRootsType evr( deg );
+  POLYNOMIAL pols_der = polyder(pols);
+  EvalRootsType der( deg );
+  for( int i=0; i<roots.size(); ++i ){
+    evr[i] = std::abs( poly_eval( pols, roots[i] ) );
+    der[i] = numext::maxi(RealScalar(1.), std::abs( poly_eval( pols_der, roots[i] ) ));
+  }
+
+  // we need to divide by the magnitude of the derivative because
+  // with a high derivative is very small error in the value of the root
+  // yiels a very large error in the polynomial evaluation.
+  bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision<Scalar>() );
+  if( !evalToZero )
+  {
+    cerr << "WRONG root: " << endl;
+    cerr << "Polynomial: " << pols.transpose() << endl;
+    cerr << "Roots found: " << roots.transpose() << endl;
+    cerr << "Abs value of the polynomial at the roots: " << evr.transpose() << endl;
+    cerr << endl;
+  }
+
+  std::vector<RealScalar> rootModuli( roots.size() );
+  Map< EvalRootsType > aux( &rootModuli[0], roots.size() );
+  aux = roots.array().abs();
+  std::sort( rootModuli.begin(), rootModuli.end() );
+  bool distinctModuli=true;
+  for( size_t i=1; i<rootModuli.size() && distinctModuli; ++i )
+  {
+    if( internal::isApprox( rootModuli[i], rootModuli[i-1] ) ){
+      distinctModuli = false; }
+  }
+  VERIFY( evalToZero || !distinctModuli );
+
+  return distinctModuli;
+}
+
+
+
+
+
+
+
+template<int Deg, typename POLYNOMIAL>
+void evalSolver( const POLYNOMIAL& pols )
+{
+  typedef typename POLYNOMIAL::Scalar Scalar;
+
+  typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
+
+  PolynomialSolverType psolve;
+  aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve );
+}
+
+
+
+
+template< int Deg, typename POLYNOMIAL, typename ROOTS, typename REAL_ROOTS >
+void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const REAL_ROOTS& real_roots )
+{
+  using std::sqrt;
+  typedef typename POLYNOMIAL::Scalar Scalar;
+  typedef typename POLYNOMIAL::RealScalar RealScalar;
+
+  typedef PolynomialSolver<Scalar, Deg >              PolynomialSolverType;
+
+  PolynomialSolverType psolve;
+  if( aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve ) )
+  {
+    //It is supposed that
+    // 1) the roots found are correct
+    // 2) the roots have distinct moduli
+
+    //Test realRoots
+    std::vector< RealScalar > calc_realRoots;
+    psolve.realRoots( calc_realRoots,  test_precision<RealScalar>());
+    VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() );
+
+    const RealScalar psPrec = sqrt( test_precision<RealScalar>() );
+
+    for( size_t i=0; i<calc_realRoots.size(); ++i )
+    {
+      bool found = false;
+      for( size_t j=0; j<calc_realRoots.size()&& !found; ++j )
+      {
+        if( internal::isApprox( calc_realRoots[i], real_roots[j], psPrec ) ){
+          found = true; }
+      }
+      VERIFY( found );
+    }
+
+    //Test greatestRoot
+    VERIFY( internal::isApprox( roots.array().abs().maxCoeff(),
+          abs( psolve.greatestRoot() ), psPrec ) );
+
+    //Test smallestRoot
+    VERIFY( internal::isApprox( roots.array().abs().minCoeff(),
+          abs( psolve.smallestRoot() ), psPrec ) );
+
+    bool hasRealRoot;
+    //Test absGreatestRealRoot
+    RealScalar r = psolve.absGreatestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) );  }
+
+    //Test absSmallestRealRoot
+    r = psolve.absSmallestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().abs().minCoeff(), abs( r ), psPrec ) ); }
+
+    //Test greatestRealRoot
+    r = psolve.greatestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().maxCoeff(), r, psPrec ) ); }
+
+    //Test smallestRealRoot
+    r = psolve.smallestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+    VERIFY( internal::isApprox( real_roots.array().minCoeff(), r, psPrec ) ); }
+  }
+}
+
+
+template<typename _Scalar, int _Deg>
+void polynomialsolver(int deg)
+{
+  typedef typename NumTraits<_Scalar>::Real RealScalar;
+  typedef internal::increment_if_fixed_size<_Deg>     Dim;
+  typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;
+  typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+  typedef Matrix<RealScalar,_Deg,1>                   RealRootsType;
+
+  cout << "Standard cases" << endl;
+  PolynomialType pols = PolynomialType::Random(deg+1);
+  evalSolver<_Deg,PolynomialType>( pols );
+
+  cout << "Hard cases" << endl;
+  _Scalar multipleRoot = internal::random<_Scalar>();
+  EvalRootsType allRoots = EvalRootsType::Constant(deg,multipleRoot);
+  roots_to_monicPolynomial( allRoots, pols );
+  evalSolver<_Deg,PolynomialType>( pols );
+
+  cout << "Test sugar" << endl;
+  RealRootsType realRoots = RealRootsType::Random(deg);
+  roots_to_monicPolynomial( realRoots, pols );
+  evalSolverSugarFunction<_Deg>(
+      pols,
+      realRoots.template cast <std::complex<RealScalar> >().eval(),
+      realRoots );
+}
+
+EIGEN_DECLARE_TEST(polynomialsolver)
+{
+  for(int i = 0; i < g_repeat; i++)
+  {
+    CALL_SUBTEST_1( (polynomialsolver<float,1>(1)) );
+    CALL_SUBTEST_2( (polynomialsolver<double,2>(2)) );
+    CALL_SUBTEST_3( (polynomialsolver<double,3>(3)) );
+    CALL_SUBTEST_4( (polynomialsolver<float,4>(4)) );
+    CALL_SUBTEST_5( (polynomialsolver<double,5>(5)) );
+    CALL_SUBTEST_6( (polynomialsolver<float,6>(6)) );
+    CALL_SUBTEST_7( (polynomialsolver<float,7>(7)) );
+    CALL_SUBTEST_8( (polynomialsolver<double,8>(8)) );
+
+    CALL_SUBTEST_9( (polynomialsolver<float,Dynamic>(
+            internal::random<int>(9,13)
+            )) );
+    CALL_SUBTEST_10((polynomialsolver<double,Dynamic>(
+            internal::random<int>(9,13)
+            )) );
+    CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) );
+    CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) );
+  }
+}

diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp
new file mode 100644
index 0000000..8ff4519
--- /dev/null
+++ b/unsupported/test/polynomialutils.cpp

@@ -0,0 +1,113 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/Polynomials>
+#include <iostream>
+
+using namespace std;
+
+namespace Eigen {
+namespace internal {
+template<int Size>
+struct increment_if_fixed_size
+{
+  enum {
+    ret = (Size == Dynamic) ? Dynamic : Size+1
+  };
+};
+}
+}
+
+template<typename _Scalar, int _Deg>
+void realRoots_to_monicPolynomial_test(int deg)
+{
+  typedef internal::increment_if_fixed_size<_Deg>            Dim;
+  typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;
+  typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+
+  PolynomialType pols(deg+1);
+  EvalRootsType roots = EvalRootsType::Random(deg);
+  roots_to_monicPolynomial( roots, pols );
+
+  EvalRootsType evr( deg );
+  for( int i=0; i<roots.size(); ++i ){
+    evr[i] = std::abs( poly_eval( pols, roots[i] ) ); }
+
+  bool evalToZero = evr.isZero( test_precision<_Scalar>() );
+  if( !evalToZero ){
+    cerr << evr.transpose() << endl; }
+  VERIFY( evalToZero );
+}
+
+template<typename _Scalar> void realRoots_to_monicPolynomial_scalar()
+{
+  CALL_SUBTEST_2( (realRoots_to_monicPolynomial_test<_Scalar,2>(2)) );
+  CALL_SUBTEST_3( (realRoots_to_monicPolynomial_test<_Scalar,3>(3)) );
+  CALL_SUBTEST_4( (realRoots_to_monicPolynomial_test<_Scalar,4>(4)) );
+  CALL_SUBTEST_5( (realRoots_to_monicPolynomial_test<_Scalar,5>(5)) );
+  CALL_SUBTEST_6( (realRoots_to_monicPolynomial_test<_Scalar,6>(6)) );
+  CALL_SUBTEST_7( (realRoots_to_monicPolynomial_test<_Scalar,7>(7)) );
+  CALL_SUBTEST_8( (realRoots_to_monicPolynomial_test<_Scalar,17>(17)) );
+
+  CALL_SUBTEST_9( (realRoots_to_monicPolynomial_test<_Scalar,Dynamic>(
+          internal::random<int>(18,26) )) );
+}
+
+
+
+
+template<typename _Scalar, int _Deg>
+void CauchyBounds(int deg)
+{
+  typedef internal::increment_if_fixed_size<_Deg>            Dim;
+  typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;
+  typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+
+  PolynomialType pols(deg+1);
+  EvalRootsType roots = EvalRootsType::Random(deg);
+  roots_to_monicPolynomial( roots, pols );
+  _Scalar M = cauchy_max_bound( pols );
+  _Scalar m = cauchy_min_bound( pols );
+  _Scalar Max = roots.array().abs().maxCoeff();
+  _Scalar min = roots.array().abs().minCoeff();
+  bool eval = (M >= Max) && (m <= min);
+  if( !eval )
+  {
+    cerr << "Roots: " << roots << endl;
+    cerr << "Bounds: (" << m << ", " << M << ")" << endl;
+    cerr << "Min,Max: (" << min << ", " << Max << ")" << endl;
+  }
+  VERIFY( eval );
+}
+
+template<typename _Scalar> void CauchyBounds_scalar()
+{
+  CALL_SUBTEST_2( (CauchyBounds<_Scalar,2>(2)) );
+  CALL_SUBTEST_3( (CauchyBounds<_Scalar,3>(3)) );
+  CALL_SUBTEST_4( (CauchyBounds<_Scalar,4>(4)) );
+  CALL_SUBTEST_5( (CauchyBounds<_Scalar,5>(5)) );
+  CALL_SUBTEST_6( (CauchyBounds<_Scalar,6>(6)) );
+  CALL_SUBTEST_7( (CauchyBounds<_Scalar,7>(7)) );
+  CALL_SUBTEST_8( (CauchyBounds<_Scalar,17>(17)) );
+
+  CALL_SUBTEST_9( (CauchyBounds<_Scalar,Dynamic>(
+          internal::random<int>(18,26) )) );
+}
+
+EIGEN_DECLARE_TEST(polynomialutils)
+{
+  for(int i = 0; i < g_repeat; i++)
+  {
+    realRoots_to_monicPolynomial_scalar<double>();
+    realRoots_to_monicPolynomial_scalar<float>();
+    CauchyBounds_scalar<double>();
+    CauchyBounds_scalar<float>();
+  }
+}

diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
new file mode 100644
index 0000000..602c2cb
--- /dev/null
+++ b/unsupported/test/sparse_extra.cpp

@@ -0,0 +1,226 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+// import basic and product tests for deprecated DynamicSparseMatrix
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
+static long g_dense_op_sparse_count = 0;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20;
+
+#define EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA 1
+#endif
+
+#define EIGEN_NO_DEPRECATED_WARNING
+// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix)
+// has an extra copy-assignment.
+#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
+#include "sparse_product.cpp"
+
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+#include "sparse_basic.cpp"
+#endif
+
+#if EIGEN_HAS_CXX11
+
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#include <unordered_map>
+#define EIGEN_UNORDERED_MAP_SUPPORT
+
+#endif
+
+
+#include <Eigen/SparseExtra>
+
+template<typename SetterType,typename DenseType, typename Scalar, int Options>
+bool test_random_setter(SparseMatrix<Scalar,Options>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords)
+{
+  {
+    sm.setZero();
+    SetterType w(sm);
+    std::vector<Vector2i> remaining = nonzeroCoords;
+    while(!remaining.empty())
+    {
+      int i = internal::random<int>(0,static_cast<int>(remaining.size())-1);
+      w(remaining[i].x(),remaining[i].y()) = ref.coeff(remaining[i].x(),remaining[i].y());
+      remaining[i] = remaining.back();
+      remaining.pop_back();
+    }
+  }
+  return sm.isApprox(ref);
+}
+
+template<typename SetterType,typename DenseType, typename T>
+bool test_random_setter(DynamicSparseMatrix<T>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords)
+{
+  sm.setZero();
+  std::vector<Vector2i> remaining = nonzeroCoords;
+  while(!remaining.empty())
+  {
+    int i = internal::random<int>(0,static_cast<int>(remaining.size())-1);
+    sm.coeffRef(remaining[i].x(),remaining[i].y()) = ref.coeff(remaining[i].x(),remaining[i].y());
+    remaining[i] = remaining.back();
+    remaining.pop_back();
+  }
+  return sm.isApprox(ref);
+}
+
+template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& ref)
+{
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  typedef typename SparseMatrixType::Scalar Scalar;
+  enum { Flags = SparseMatrixType::Flags };
+
+  double density = (std::max)(8./(rows*cols), 0.01);
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  Scalar eps = 1e-6;
+
+  SparseMatrixType m(rows, cols);
+  DenseMatrix refMat = DenseMatrix::Zero(rows, cols);
+  DenseVector vec1 = DenseVector::Random(rows);
+
+  std::vector<Vector2i> zeroCoords;
+  std::vector<Vector2i> nonzeroCoords;
+  initSparse<Scalar>(density, refMat, m, 0, &zeroCoords, &nonzeroCoords);
+
+  if (zeroCoords.size()==0 || nonzeroCoords.size()==0)
+    return;
+
+  // test coeff and coeffRef
+  for (int i=0; i<(int)zeroCoords.size(); ++i)
+  {
+    VERIFY_IS_MUCH_SMALLER_THAN( m.coeff(zeroCoords[i].x(),zeroCoords[i].y()), eps );
+    if(internal::is_same<SparseMatrixType,SparseMatrix<Scalar,Flags> >::value)
+      VERIFY_RAISES_ASSERT( m.coeffRef(zeroCoords[0].x(),zeroCoords[0].y()) = 5 );
+  }
+  VERIFY_IS_APPROX(m, refMat);
+
+  m.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+  refMat.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+
+  VERIFY_IS_APPROX(m, refMat);
+
+  // random setter
+//   {
+//     m.setZero();
+//     VERIFY_IS_NOT_APPROX(m, refMat);
+//     SparseSetter<SparseMatrixType, RandomAccessPattern> w(m);
+//     std::vector<Vector2i> remaining = nonzeroCoords;
+//     while(!remaining.empty())
+//     {
+//       int i = internal::random<int>(0,remaining.size()-1);
+//       w->coeffRef(remaining[i].x(),remaining[i].y()) = refMat.coeff(remaining[i].x(),remaining[i].y());
+//       remaining[i] = remaining.back();
+//       remaining.pop_back();
+//     }
+//   }
+//   VERIFY_IS_APPROX(m, refMat);
+
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdMapTraits> >(m,refMat,nonzeroCoords) ));
+    #ifdef EIGEN_UNORDERED_MAP_SUPPORT
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
+    #endif
+    #ifdef EIGEN_GOOGLEHASH_SUPPORT
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
+    #endif
+
+
+  // test RandomSetter
+  /*{
+    SparseMatrixType m1(rows,cols), m2(rows,cols);
+    DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
+    initSparse<Scalar>(density, refM1, m1);
+    {
+      Eigen::RandomSetter<SparseMatrixType > setter(m2);
+      for (int j=0; j<m1.outerSize(); ++j)
+        for (typename SparseMatrixType::InnerIterator i(m1,j); i; ++i)
+          setter(i.index(), j) = i.value();
+    }
+    VERIFY_IS_APPROX(m1, m2);
+  }*/
+
+
+}
+
+
+template<typename SparseMatrixType>
+void check_marketio()
+{
+  typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix;
+  Index rows = internal::random<Index>(1,100);
+  Index cols = internal::random<Index>(1,100);
+  SparseMatrixType m1, m2;
+  m1 = DenseMatrix::Random(rows, cols).sparseView();
+  saveMarket(m1, "sparse_extra.mtx");
+  loadMarket(m2, "sparse_extra.mtx");
+  VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
+}
+
+template<typename VectorType>
+void check_marketio_vector()
+{
+  Index size = internal::random<Index>(1,100);
+  VectorType v1, v2;
+  v1 = VectorType::Random(size);
+  saveMarketVector(v1, "vector_extra.mtx");
+  loadMarketVector(v2, "vector_extra.mtx");
+  VERIFY_IS_EQUAL(v1,v2);
+}
+
+EIGEN_DECLARE_TEST(sparse_extra)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int s = Eigen::internal::random<int>(1,50);
+    CALL_SUBTEST_1( sparse_extra(SparseMatrix<double>(8, 8)) );
+    CALL_SUBTEST_2( sparse_extra(SparseMatrix<std::complex<double> >(s, s)) );
+    CALL_SUBTEST_1( sparse_extra(SparseMatrix<double>(s, s)) );
+
+    CALL_SUBTEST_3( sparse_extra(DynamicSparseMatrix<double>(s, s)) );
+//    CALL_SUBTEST_3(( sparse_basic(DynamicSparseMatrix<double>(s, s)) ));
+//    CALL_SUBTEST_3(( sparse_basic(DynamicSparseMatrix<double,ColMajor,long int>(s, s)) ));
+
+    CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) );
+    CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) );
+
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) );
+
+
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) );
+
+    TEST_SET_BUT_UNUSED_VARIABLE(s);
+  }
+}

diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
new file mode 100644
index 0000000..589bb76
--- /dev/null
+++ b/unsupported/test/special_functions.cpp

@@ -0,0 +1,497 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <limits.h>
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+// Hack to allow "implicit" conversions from double to Scalar via comma-initialization.
+template<typename Derived>
+Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) {
+  return (dense << static_cast<typename Derived::Scalar>(v));
+}
+
+template<typename XprType>
+Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) {
+  return (ci, static_cast<typename XprType::Scalar>(v));
+}
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
+  for(Index i=0; i<x.size(); ++i)
+  {
+    if((numext::isfinite)(y(i)))
+      VERIFY_IS_APPROX( x(i), y(i) );
+    else if((numext::isnan)(y(i)))
+      VERIFY((numext::isnan)(x(i)));
+    else
+      VERIFY_IS_EQUAL( x(i), y(i) );
+  }
+}
+
+template<typename ArrayType> void array_special_functions()
+{
+  using std::abs;
+  using std::sqrt;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Scalar plusinf = std::numeric_limits<Scalar>::infinity();
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Index rows = internal::random<Index>(1,30);
+  Index cols = 1;
+
+  // API
+  {
+    ArrayType m1 = ArrayType::Random(rows,cols);
+#if EIGEN_HAS_C99_MATH
+    VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
+    VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
+    VERIFY_IS_APPROX(m1.erf(), erf(m1));
+    VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
+#endif  // EIGEN_HAS_C99_MATH
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  // check special functions (comparing against numpy implementation)
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+
+    {
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+
+      // Test various propreties of igamma & igammac.  These are normalized
+      // gamma integrals where
+      //   igammac(a, x) = Gamma(a, x) / Gamma(a)
+      //   igamma(a, x) = gamma(a, x) / Gamma(a)
+      // where Gamma and gamma are considered the standard unnormalized
+      // upper and lower incomplete gamma functions, respectively.
+      ArrayType a = m1.abs() + Scalar(2);
+      ArrayType x = m2.abs() + Scalar(2);
+      ArrayType zero = ArrayType::Zero(rows, cols);
+      ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
+      ArrayType a_m1 = a - one;
+      ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp();
+      ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp();
+      ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
+      ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
+
+
+      // Gamma(a, 0) == Gamma(a)
+      VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
+
+      // Gamma(a, x) + gamma(a, x) == Gamma(a)
+      VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
+
+      // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp());
+
+      // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp());
+    }
+    {
+      // Verify for large a and x that values are between 0 and 1.
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+      int max_exponent = std::numeric_limits<Scalar>::max_exponent10;
+      ArrayType a = m1.abs() *  Scalar(pow(10., max_exponent - 1));
+      ArrayType x = m2.abs() *  Scalar(pow(10., max_exponent - 1));
+      for (int i = 0; i < a.size(); ++i) {
+        Scalar igam = numext::igamma(a(i), x(i));
+        VERIFY(0 <= igam);
+        VERIFY(igam <= 1);
+      }
+    }
+
+    {
+      // Check exact values of igamma and igammac against a third party calculation.
+      Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+      Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+      // location i*6+j corresponds to a_s[i], x_s[j].
+      Scalar igamma_s[][6] = {
+          {Scalar(0.0), nan, nan, nan, nan, nan},
+          {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702),
+           Scalar(0.9816843611112658), Scalar(9.999500016666262e-05),
+           Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911),
+           Scalar(0.9539882943107686), Scalar(7.522076445089201e-07),
+           Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.01898815687615381),
+           Scalar(0.06564245437845008), Scalar(0.5665298796332909),
+           Scalar(4.166333347221828e-18), Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838),
+           Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0),
+           Scalar(0.5042041932513908)}};
+      Scalar igammac_s[][6] = {
+          {nan, nan, nan, nan, nan, nan},
+          {Scalar(1.0), Scalar(0.36787944117144233),
+           Scalar(0.22313016014842982), Scalar(0.018315638888734182),
+           Scalar(0.9999000049998333), Scalar(0.0)},
+          {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878),
+           Scalar(0.04601170568923136), Scalar(0.9999992477923555),
+           Scalar(0.0)},
+          {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499),
+           Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)},
+          {Scalar(1.0), Scalar(2.1940638138146658e-05),
+           Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07),
+           Scalar(0.0008629581310054535), Scalar(0.0)},
+          {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0),
+           Scalar(0.49579580674813944)}};
+
+      for (int i = 0; i < 6; ++i) {
+        for (int j = 0; j < 6; ++j) {
+          if ((std::isnan)(igamma_s[i][j])) {
+            VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+          }
+
+          if ((std::isnan)(igammac_s[i][j])) {
+            VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+          }
+        }
+      }
+    }
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // Check the ndtri function against scipy.special.ndtri
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01;
+    ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); );
+
+    // ndtri(normal_cdf(x)) ~= x
+    CALL_SUBTEST(
+        ArrayType m1 = ArrayType::Random(32);
+        using std::sqrt;
+
+        ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf();
+        cdf_val = (cdf_val + Scalar(1)) / Scalar(2);
+        verify_component_wise(cdf_val.ndtri(), m1););
+
+  }
+
+  // Check the zeta function against scipy.special.zeta
+  {
+    ArrayType x(10), q(10), res(10), ref(10);
+    x << 1.5,   4, 10.5, 10000.5,    3,      1,    0.9,  2,  3,  4;
+    q <<   2, 1.5,    3,  1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3;
+    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
+  }
+
+  // digamma
+  {
+    ArrayType x(9), res(9), ref(9);
+    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1, -2, -3;
+    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, nan, nan, nan, nan;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = digamma(x);  verify_component_wise(res, ref); );
+  }
+
+#if EIGEN_HAS_C99_MATH
+  {
+    ArrayType n(16), x(16), res(16), ref(16);
+    n << 1, 1,    1, 1.5,   17,   31,   28,    8,   42,  147, 170, -1,  0,  1,  2,  3;
+    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1,  64, -1, -2, -3, -4, -5;
+    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927, nan, nan, plusinf, nan, plusinf;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    if(sizeof(RealScalar)>=8) {  // double
+      // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); );
+      CALL_SUBTEST( res = polygamma(n,x);  verify_component_wise(res, ref); );
+    }
+    else {
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); );
+      CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); );
+    }
+  }
+#endif
+
+#if EIGEN_HAS_C99_MATH
+  {
+    // Inputs and ground truth generated with scipy via:
+    //   a = np.logspace(-3, 3, 5) - 1e-3
+    //   b = np.logspace(-3, 3, 5) - 1e-3
+    //   x = np.linspace(-0.1, 1.1, 5)
+    //   (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: (a, b, x))(*np.ix_(a, b, x))
+    //   full_a = full_a.flatten().tolist()  # same for full_b, full_x
+    //   v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist()
+    //
+    // Note in Eigen, we call betainc with arguments in the order (x, a, b).
+    ArrayType a(125);
+    ArrayType b(125);
+    ArrayType x(125);
+    ArrayType v(125);
+    ArrayType res(125);
+
+    a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999;
+
+    b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+        0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999,
+        999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999,
+        0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999;
+
+    x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+        -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+        1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1;
+
+    v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+        0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+        0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+        0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan,
+        nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+        0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+        0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+        0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+        0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+        1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06,
+        nan, nan, 7.864342668429763e-23, 3.015969667594166e-10,
+        0.0008598571564165444, nan, nan, 6.031987710123844e-08,
+        0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999,
+        0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan,
+        nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan,
+        0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0,
+        3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
+        2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
+
+    CALL_SUBTEST(res = betainc(a, b, x);
+                 verify_component_wise(res, v););
+  }
+
+  // Test various properties of betainc
+  {
+    ArrayType m1 = ArrayType::Random(32);
+    ArrayType m2 = ArrayType::Random(32);
+    ArrayType m3 = ArrayType::Random(32);
+    ArrayType one = ArrayType::Constant(32, Scalar(1.0));
+    const Scalar eps = std::numeric_limits<Scalar>::epsilon();
+    ArrayType a = (m1 * Scalar(4)).exp();
+    ArrayType b = (m2 * Scalar(4)).exp();
+    ArrayType x = m3.abs();
+
+    // betainc(a, 1, x) == x**a
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, one, x);
+        ArrayType expected = x.pow(a);
+        verify_component_wise(test, expected););
+
+    // betainc(1, b, x) == 1 - (1 - x)**b
+    CALL_SUBTEST(
+        ArrayType test = betainc(one, b, x);
+        ArrayType expected = one - (one - x).pow(b);
+        verify_component_wise(test, expected););
+
+    // betainc(a, b, x) == 1 - betainc(b, a, 1-x)
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, b, x) + betainc(b, a, one - x);
+        ArrayType expected = one;
+        verify_component_wise(test, expected););
+
+    // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b))
+    CALL_SUBTEST(
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType expected = betainc(a, b, x) - num / denom + eps;
+        ArrayType test = betainc(a + one, b, x) + eps;
+        if (sizeof(Scalar) >= 8) { // double
+          verify_component_wise(test, expected);
+        } else {
+          // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+          verify_component_wise(test.head(8), expected.head(8));
+        });
+
+    // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b))
+    CALL_SUBTEST(
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        ArrayType expected = betainc(a, b, x) + num / denom + eps;
+        ArrayType test = betainc(a, b + one, x) + eps;
+        verify_component_wise(test, expected););
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+    /* Code to generate the data for the following two test cases.
+    N = 5
+    np.random.seed(3)
+
+    a = np.logspace(-2, 3, 6)
+    a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N]))
+    x = np.random.gamma(a, 1.0)
+    x = np.maximum(x, np.finfo(np.float32).tiny)
+
+    def igamma(a, x):
+      return mpmath.gammainc(a, 0, x, regularized=True)
+
+    def igamma_der_a(a, x):
+      res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a)
+      return np.float64(res)
+
+    def gamma_sample_der_alpha(a, x):
+      igamma_x = igamma(a, x)
+      def igammainv_of_igamma(a_prime):
+        return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) -
+            igamma_x, x, solver='newton')
+      return np.float64(mpmath.diff(igammainv_of_igamma, a))
+
+    v_igamma_der_a = np.vectorize(igamma_der_a)(a, x)
+    v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x)
+  */
+
+#if EIGEN_HAS_C99_MATH
+  // Test igamma_der_a
+  {
+    ArrayType a(30);
+    ArrayType x(30);
+    ArrayType res(30);
+    ArrayType v(30);
+
+    a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0,
+        1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+        100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+    x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+        1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+        0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+        0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+        0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+        10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+        86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+        969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+    v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514,
+        -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596,
+        -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323,
+        -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522,
+        -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294,
+        -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921,
+        -0.00794899153653, -0.00777303219211, -0.00796085782042,
+        -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+    CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v););
+  }
+
+  // Test gamma_sample_der_alpha
+  {
+    ArrayType alpha(30);
+    ArrayType sample(30);
+    ArrayType res(30);
+    ArrayType v(30);
+
+    alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+        1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+        100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+    sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+        1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+        0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+        0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+        0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+        10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+        86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+        969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+    v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+        1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+        0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+        1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+        0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+        1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+        0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+        1.00106492525, 0.97734200649, 1.02198794179;
+
+    CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample);
+                 verify_component_wise(res, v););
+  }
+#endif  // EIGEN_HAS_C99_MATH
+}
+
+EIGEN_DECLARE_TEST(special_functions)
+{
+  CALL_SUBTEST_1(array_special_functions<ArrayXf>());
+  CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+  // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above.
+  // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>());
+  // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>());
+}

diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp
new file mode 100644
index 0000000..31233f1
--- /dev/null
+++ b/unsupported/test/special_packetmath.cpp

@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <limits>
+#include "packetmath_test_shared.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename Scalar,typename Packet> void packetmath_real()
+{
+  using std::abs;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  const int size = PacketSize*4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize*4];
+
+#if EIGEN_HAS_C99_MATH
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
+    h.store(data2, internal::plgamma(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  if (internal::packet_traits<Scalar>::HasErf) {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h;
+    h.store(data2, internal::perf(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h;
+    h.store(data2, internal::perfc(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  {
+    for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1));
+    }
+    CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // For bessel_i*e and bessel_j*, the valid range is negative reals.
+  {
+    const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6);
+    for (int i=0; i<size; ++i)
+    {
+      data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+      data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+    }
+
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
+  }
+
+  // Use a smaller data range for the bessel_i* as these can become very large.
+  // Following #1693, we also restrict this range further to avoid inf's due to
+  // differences in pexp and exp.
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1);
+
+
+  // y_i, and k_i are valid for x > 0.
+  {
+    const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5);
+    for (int i=0; i<size; ++i)
+    {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+    }
+  }
+
+  // TODO(srvasude): Re-enable this test once properly investigated why the
+  // scalar and vector paths differ.
+  // CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y0, internal::pbessel_y0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y1, internal::pbessel_y1);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0e, internal::pbessel_k0e);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1e, internal::pbessel_k1e);
+
+  // Following #1693, we restrict the range for exp to avoid zeroing out too
+  // fast.
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1);
+
+
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+
+#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
+#endif
+
+}
+
+namespace Eigen {
+namespace test {
+
+template<typename Scalar,typename PacketType, bool IsComplex, bool IsInteger>
+struct runall {
+  static void run() {
+    packetmath_real<Scalar,PacketType>();
+  }
+};
+
+}
+}
+
+EIGEN_DECLARE_TEST(special_packetmath)
+{
+  g_first_pass = true;
+  for(int i = 0; i < g_repeat; i++) {
+
+    CALL_SUBTEST_1( test::runner<float>::run() );
+    CALL_SUBTEST_2( test::runner<double>::run() );
+    CALL_SUBTEST_3( test::runner<Eigen::half>::run() );
+    CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() );
+    g_first_pass = false;
+  }
+}

diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
new file mode 100644
index 0000000..88ec87b
--- /dev/null
+++ b/unsupported/test/splines.cpp

@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Hauke Heibel <heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/Splines>
+
+namespace Eigen {
+  
+  // lets do some explicit instantiations and thus
+  // force the compilation of all spline functions...
+  template class Spline<double, 2, Dynamic>;
+  template class Spline<double, 3, Dynamic>;
+
+  template class Spline<double, 2, 2>;
+  template class Spline<double, 2, 3>;
+  template class Spline<double, 2, 4>;
+  template class Spline<double, 2, 5>;
+
+  template class Spline<float, 2, Dynamic>;
+  template class Spline<float, 3, Dynamic>;
+
+  template class Spline<float, 3, 2>;
+  template class Spline<float, 3, 3>;
+  template class Spline<float, 3, 4>;
+  template class Spline<float, 3, 5>;
+
+}
+
+Spline<double, 2, Dynamic> closed_spline2d()
+{
+  RowVectorXd knots(12);
+  knots << 0,
+    0,
+    0,
+    0,
+    0.867193179093898,
+    1.660330955342408,
+    2.605084834823134,
+    3.484154586374428,
+    4.252699478956276,
+    4.252699478956276,
+    4.252699478956276,
+    4.252699478956276;
+
+  MatrixXd ctrls(8,2);
+  ctrls << -0.370967741935484,   0.236842105263158,
+    -0.231401860693277,   0.442245185027632,
+    0.344361228532831,   0.773369994120753,
+    0.828990216203802,   0.106550882647595,
+    0.407270163678382,  -1.043452922172848,
+    -0.488467813584053,  -0.390098582530090,
+    -0.494657189446427,   0.054804824897884,
+    -0.370967741935484,   0.236842105263158;
+  ctrls.transposeInPlace();
+
+  return Spline<double, 2, Dynamic>(knots, ctrls);
+}
+
+/* create a reference spline */
+Spline<double, 3, Dynamic> spline3d()
+{
+  RowVectorXd knots(11);
+  knots << 0,
+    0,
+    0,
+    0.118997681558377,
+    0.162611735194631,
+    0.498364051982143,
+    0.655098003973841,
+    0.679702676853675,
+    1.000000000000000,
+    1.000000000000000,
+    1.000000000000000;
+
+  MatrixXd ctrls(8,3);
+  ctrls <<    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.223811939491137,   0.751267059305653,   0.255095115459269,
+    0.505957051665142,   0.699076722656686,   0.890903252535799,
+    0.959291425205444,   0.547215529963803,   0.138624442828679,
+    0.149294005559057,   0.257508254123736,   0.840717255983663,
+    0.254282178971531,   0.814284826068816,   0.243524968724989,
+    0.929263623187228,   0.349983765984809,   0.196595250431208,
+    0.251083857976031,   0.616044676146639,   0.473288848902729;
+  ctrls.transposeInPlace();
+
+  return Spline<double, 3, Dynamic>(knots, ctrls);
+}
+
+/* compares evaluations against known results */
+void eval_spline3d()
+{
+  Spline3d spline = spline3d();
+
+  RowVectorXd u(10);
+  u << 0.351659507062997,
+    0.830828627896291,
+    0.585264091152724,
+    0.549723608291140,
+    0.917193663829810,
+    0.285839018820374,
+    0.757200229110721,
+    0.753729094278495,
+    0.380445846975357,
+    0.567821640725221;
+
+  MatrixXd pts(10,3);
+  pts << 0.707620811535916,   0.510258911240815,   0.417485437023409,
+    0.603422256426978,   0.529498282727551,   0.270351549348981,
+    0.228364197569334,   0.423745615677815,   0.637687289287490,
+    0.275556796335168,   0.350856706427970,   0.684295784598905,
+    0.514519311047655,   0.525077224890754,   0.351628308305896,
+    0.724152914315666,   0.574461155457304,   0.469860285484058,
+    0.529365063753288,   0.613328702656816,   0.237837040141739,
+    0.522469395136878,   0.619099658652895,   0.237139665242069,
+    0.677357023849552,   0.480655768435853,   0.422227610314397,
+    0.247046593173758,   0.380604672404750,   0.670065791405019;
+  pts.transposeInPlace();
+
+  for (int i=0; i<u.size(); ++i)
+  {
+    Vector3d pt = spline(u(i));
+    VERIFY( (pt - pts.col(i)).norm() < 1e-14 );
+  }
+}
+
+/* compares evaluations on corner cases */
+void eval_spline3d_onbrks()
+{
+  Spline3d spline = spline3d();
+
+  RowVectorXd u = spline.knots();
+
+  MatrixXd pts(11,3);
+  pts <<    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.430282980289940,   0.713074680056118,   0.720373307943349,
+    0.558074875553060,   0.681617921034459,   0.804417124839942,
+    0.407076008291750,   0.349707710518163,   0.617275937419545,
+    0.240037008286602,   0.738739390398014,   0.324554153129411,
+    0.302434111480572,   0.781162443963899,   0.240177089094644,
+    0.251083857976031,   0.616044676146639,   0.473288848902729,
+    0.251083857976031,   0.616044676146639,   0.473288848902729,
+    0.251083857976031,   0.616044676146639,   0.473288848902729;
+  pts.transposeInPlace();
+
+  for (int i=0; i<u.size(); ++i)
+  {
+    Vector3d pt = spline(u(i));
+    VERIFY( (pt - pts.col(i)).norm() < 1e-14 );
+  }
+}
+
+void eval_closed_spline2d()
+{
+  Spline2d spline = closed_spline2d();
+
+  RowVectorXd u(12);
+  u << 0,
+    0.332457030395796,
+    0.356467130532952,
+    0.453562180176215,
+    0.648017921874804,
+    0.973770235555003,
+    1.882577647219307,
+    2.289408593930498,
+    3.511951429883045,
+    3.884149321369450,
+    4.236261590369414,
+    4.252699478956276;
+
+  MatrixXd pts(12,2);
+  pts << -0.370967741935484,   0.236842105263158,
+    -0.152576775123250,   0.448975001279334,
+    -0.133417538277668,   0.461615613865667,
+    -0.053199060826740,   0.507630360006299,
+    0.114249591147281,   0.570414135097409,
+    0.377810316891987,   0.560497102875315,
+    0.665052120135908,  -0.157557441109611,
+    0.516006487053228,  -0.559763292174825,
+    -0.379486035348887,  -0.331959640488223,
+    -0.462034726249078,  -0.039105670080824,
+    -0.378730600917982,   0.225127015099919,
+    -0.370967741935484,   0.236842105263158;
+  pts.transposeInPlace();
+
+  for (int i=0; i<u.size(); ++i)
+  {
+    Vector2d pt = spline(u(i));
+    VERIFY( (pt - pts.col(i)).norm() < 1e-14 );
+  }
+}
+
+void check_global_interpolation2d()
+{
+  typedef Spline2d::PointType PointType;
+  typedef Spline2d::KnotVectorType KnotVectorType;
+  typedef Spline2d::ControlPointVectorType ControlPointVectorType;
+
+  ControlPointVectorType points = ControlPointVectorType::Random(2,100);
+
+  KnotVectorType chord_lengths; // knot parameters
+  Eigen::ChordLengths(points, chord_lengths);
+
+  // interpolation without knot parameters
+  {
+    const Spline2d spline = SplineFitting<Spline2d>::Interpolate(points,3);  
+
+    for (Eigen::DenseIndex i=0; i<points.cols(); ++i)
+    {
+      PointType pt = spline( chord_lengths(i) );
+      PointType ref = points.col(i);
+      VERIFY( (pt - ref).matrix().norm() < 1e-14 );
+    }
+  }
+
+  // interpolation with given knot parameters
+  {
+    const Spline2d spline = SplineFitting<Spline2d>::Interpolate(points,3,chord_lengths);  
+
+    for (Eigen::DenseIndex i=0; i<points.cols(); ++i)
+    {
+      PointType pt = spline( chord_lengths(i) );
+      PointType ref = points.col(i);
+      VERIFY( (pt - ref).matrix().norm() < 1e-14 );
+    }
+  }
+}
+
+void check_global_interpolation_with_derivatives2d()
+{
+  typedef Spline2d::PointType PointType;
+  typedef Spline2d::KnotVectorType KnotVectorType;
+
+  const Eigen::DenseIndex numPoints = 100;
+  const unsigned int dimension = 2;
+  const unsigned int degree = 3;
+
+  ArrayXXd points = ArrayXXd::Random(dimension, numPoints);
+
+  KnotVectorType knots;
+  Eigen::ChordLengths(points, knots);
+
+  ArrayXXd derivatives = ArrayXXd::Random(dimension, numPoints);
+  VectorXd derivativeIndices(numPoints);
+
+  for (Eigen::DenseIndex i = 0; i < numPoints; ++i)
+      derivativeIndices(i) = static_cast<double>(i);
+
+  const Spline2d spline = SplineFitting<Spline2d>::InterpolateWithDerivatives(
+    points, derivatives, derivativeIndices, degree);  
+    
+  for (Eigen::DenseIndex i = 0; i < points.cols(); ++i)
+  {
+    PointType point = spline(knots(i));
+    PointType referencePoint = points.col(i);
+    VERIFY_IS_APPROX(point, referencePoint);
+    PointType derivative = spline.derivatives(knots(i), 1).col(1);
+    PointType referenceDerivative = derivatives.col(i);
+    VERIFY_IS_APPROX(derivative, referenceDerivative);
+  }
+}
+
+EIGEN_DECLARE_TEST(splines)
+{
+  for (int i = 0; i < g_repeat; ++i)
+  {
+    CALL_SUBTEST( eval_spline3d() );
+    CALL_SUBTEST( eval_spline3d_onbrks() );
+    CALL_SUBTEST( eval_closed_spline2d() );
+    CALL_SUBTEST( check_global_interpolation2d() );
+    CALL_SUBTEST( check_global_interpolation_with_derivatives2d() );
+  }
+}